author    Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:10:09 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:10:09 +0000
commit    d1e8c371581041f403dcdcff4ab8a88e970d221e (patch)
tree      621cf3eea9401b6fc19ce2a6dc5aa7579ecc8c70
parent    81f619d3e99a3a218e6318d06c2bc1a36052e75d (diff)
Import Mesa 19.2.8
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c                       83
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_private.h                            2165
-rw-r--r--  lib/mesa/src/freedreno/vulkan/vk_format.h                              546
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp    14
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h       3
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp     1612
-rw-r--r--  lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c                   151
-rw-r--r--  lib/mesa/src/gallium/drivers/panfrost/pan_compute.c                    125
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c               722
9 files changed, 2742 insertions, 2679 deletions
diff --git a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
index 5cfc79fbf..b8b2ceda2 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
@@ -27,7 +27,6 @@
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
-#include "vulkan/util/vk_util.h"
struct cache_entry_variant_info
{
@@ -44,7 +43,7 @@ struct cache_entry
char code[0];
};
-static void
+void
tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
struct tu_device *device)
{
@@ -67,7 +66,7 @@ tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
memset(cache->hash_table, 0, byte_size);
}
-static void
+void
tu_pipeline_cache_finish(struct tu_pipeline_cache *cache)
{
for (unsigned i = 0; i < cache->table_size; ++i)
@@ -89,6 +88,41 @@ entry_size(struct cache_entry *entry)
return ret;
}
+void
+tu_hash_shaders(unsigned char *hash,
+ const VkPipelineShaderStageCreateInfo **stages,
+ const struct tu_pipeline_layout *layout,
+ const struct tu_pipeline_key *key,
+ uint32_t flags)
+{
+ struct mesa_sha1 ctx;
+
+ _mesa_sha1_init(&ctx);
+ if (key)
+ _mesa_sha1_update(&ctx, key, sizeof(*key));
+ if (layout)
+ _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+
+ for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+ if (stages[i]) {
+ TU_FROM_HANDLE(tu_shader_module, module, stages[i]->module);
+ const VkSpecializationInfo *spec_info =
+ stages[i]->pSpecializationInfo;
+
+ _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
+ _mesa_sha1_update(&ctx, stages[i]->pName, strlen(stages[i]->pName));
+ if (spec_info) {
+ _mesa_sha1_update(
+ &ctx, spec_info->pMapEntries,
+ spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
+ _mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize);
+ }
+ }
+ }
+ _mesa_sha1_update(&ctx, &flags, 4);
+ _mesa_sha1_final(&ctx, hash);
+}
+
static struct cache_entry *
tu_pipeline_cache_search_unlocked(struct tu_pipeline_cache *cache,
const unsigned char *sha1)
@@ -162,7 +196,7 @@ tu_pipeline_cache_grow(struct tu_pipeline_cache *cache)
table = malloc(byte_size);
if (table == NULL)
- return vk_error(cache, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
cache->hash_table = table;
cache->table_size = table_size;
@@ -197,13 +231,22 @@ tu_pipeline_cache_add_entry(struct tu_pipeline_cache *cache,
tu_pipeline_cache_set_entry(cache, entry);
}
-static void
+struct cache_header
+{
+ uint32_t header_size;
+ uint32_t header_version;
+ uint32_t vendor_id;
+ uint32_t device_id;
+ uint8_t uuid[VK_UUID_SIZE];
+};
+
+void
tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
const void *data,
size_t size)
{
struct tu_device *device = cache->device;
- struct vk_pipeline_cache_header header;
+ struct cache_header header;
if (size < sizeof(header))
return;
@@ -212,9 +255,9 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
return;
if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
return;
- if (header.vendor_id != 0x5143)
+ if (header.vendor_id != 0 /* TODO */)
return;
- if (header.device_id != device->physical_device->dev_id.chip_id)
+ if (header.device_id != 0 /* TODO */)
return;
if (memcmp(header.uuid, device->physical_device->cache_uuid,
VK_UUID_SIZE) != 0)
@@ -242,7 +285,7 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
}
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreatePipelineCache(VkDevice _device,
const VkPipelineCacheCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -254,15 +297,15 @@ tu_CreatePipelineCache(VkDevice _device,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
assert(pCreateInfo->flags == 0);
- cache = vk_object_alloc(&device->vk, pAllocator, sizeof(*cache),
- VK_OBJECT_TYPE_PIPELINE_CACHE);
+ cache = vk_alloc2(&device->alloc, pAllocator, sizeof(*cache), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cache == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
if (pAllocator)
cache->alloc = *pAllocator;
else
- cache->alloc = device->vk.alloc;
+ cache->alloc = device->alloc;
tu_pipeline_cache_init(cache, device);
@@ -276,7 +319,7 @@ tu_CreatePipelineCache(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyPipelineCache(VkDevice _device,
VkPipelineCache _cache,
const VkAllocationCallbacks *pAllocator)
@@ -288,10 +331,10 @@ tu_DestroyPipelineCache(VkDevice _device,
return;
tu_pipeline_cache_finish(cache);
- vk_object_free(&device->vk, pAllocator, cache);
+ vk_free2(&device->alloc, pAllocator, cache);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetPipelineCacheData(VkDevice _device,
VkPipelineCache _cache,
size_t *pDataSize,
@@ -299,7 +342,7 @@ tu_GetPipelineCacheData(VkDevice _device,
{
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_pipeline_cache, cache, _cache);
- struct vk_pipeline_cache_header *header;
+ struct cache_header *header;
VkResult result = VK_SUCCESS;
pthread_mutex_lock(&cache->mutex);
@@ -319,8 +362,8 @@ tu_GetPipelineCacheData(VkDevice _device,
header = p;
header->header_size = sizeof(*header);
header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
- header->vendor_id = 0x5143;
- header->device_id = device->physical_device->dev_id.chip_id;
+ header->vendor_id = 0 /* TODO */;
+ header->device_id = 0 /* TODO */;
memcpy(header->uuid, device->physical_device->cache_uuid, VK_UUID_SIZE);
p += header->header_size;
@@ -361,7 +404,7 @@ tu_pipeline_cache_merge(struct tu_pipeline_cache *dst,
}
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_MergePipelineCaches(VkDevice _device,
VkPipelineCache destCache,
uint32_t srcCacheCount,
diff --git a/lib/mesa/src/freedreno/vulkan/tu_private.h b/lib/mesa/src/freedreno/vulkan/tu_private.h
index 862d507c9..c2440471f 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_private.h
+++ b/lib/mesa/src/freedreno/vulkan/tu_private.h
@@ -40,47 +40,28 @@
#include <valgrind.h>
#define VG(x) x
#else
-#define VG(x) ((void)0)
+#define VG(x)
#endif
-#define MESA_LOG_TAG "TU"
-
#include "c11/threads.h"
-#include "util/rounding.h"
-#include "util/bitscan.h"
+#include "compiler/shader_enums.h"
+#include "main/macros.h"
#include "util/list.h"
-#include "util/log.h"
#include "util/macros.h"
-#include "util/sparse_array.h"
-#include "util/u_atomic.h"
-#include "util/u_dynarray.h"
-#include "util/xmlconfig.h"
-#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
-#include "vk_device.h"
-#include "vk_dispatch_table.h"
-#include "vk_extensions.h"
-#include "vk_instance.h"
-#include "vk_log.h"
-#include "vk_physical_device.h"
-#include "vk_shader_module.h"
#include "wsi_common.h"
+#include "drm-uapi/msm_drm.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
-#include "fdl/freedreno_layout.h"
-#include "common/freedreno_dev_info.h"
-#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
-#include "tu_autotune.h"
-#include "tu_util.h"
-#include "tu_perfetto.h"
+#include "tu_extensions.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@@ -92,54 +73,143 @@ typedef uint32_t xcb_window_t;
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>
+#include <vulkan/vulkan_intel.h>
#include "tu_entrypoints.h"
-#include "vk_format.h"
-#include "vk_image.h"
-#include "vk_command_buffer.h"
-#include "vk_command_pool.h"
-#include "vk_queue.h"
-#include "vk_object.h"
-#include "vk_sync.h"
-#include "vk_fence.h"
-#include "vk_semaphore.h"
-#include "vk_drm_syncobj.h"
-#include "vk_sync_timeline.h"
-
#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
-#define MAX_VIEWPORTS 16
-#define MAX_VIEWPORT_SIZE (1 << 14)
+#define MAX_VIEWPORTS 1
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
-#define MAX_DYNAMIC_BUFFERS_SIZE \
- (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
- A6XX_TEX_CONST_DWORDS
-
+#define MAX_DYNAMIC_BUFFERS \
+ (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS)
+#define MAX_SAMPLES_LOG2 4
+#define NUM_META_FS_KEYS 13
#define TU_MAX_DRM_DEVICES 8
-#define MAX_VIEWS 16
-#define MAX_BIND_POINTS 2 /* compute + graphics */
-/* The Qualcomm driver exposes 0x20000058 */
-#define MAX_STORAGE_BUFFER_RANGE 0x20000000
-/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
- * expose the same maximum range.
- * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
- * range might be higher.
+#define MAX_VIEWS 8
+
+#define NUM_DEPTH_CLEAR_PIPELINES 3
+
+/*
+ * This is the point we switch from using CP to compute shader
+ * for certain buffer operations.
*/
-#define MAX_UNIFORM_BUFFER_RANGE 0x10000
+#define TU_BUFFER_OPS_CS_THRESHOLD 4096
+
+enum tu_mem_heap
+{
+ TU_MEM_HEAP_VRAM,
+ TU_MEM_HEAP_VRAM_CPU_ACCESS,
+ TU_MEM_HEAP_GTT,
+ TU_MEM_HEAP_COUNT
+};
+
+enum tu_mem_type
+{
+ TU_MEM_TYPE_VRAM,
+ TU_MEM_TYPE_GTT_WRITE_COMBINE,
+ TU_MEM_TYPE_VRAM_CPU_ACCESS,
+ TU_MEM_TYPE_GTT_CACHED,
+ TU_MEM_TYPE_COUNT
+};
+
+#define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
+
+static inline uint32_t
+align_u32(uint32_t v, uint32_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
+
+static inline uint32_t
+align_u32_npot(uint32_t v, uint32_t a)
+{
+ return (v + a - 1) / a * a;
+}
+
+static inline uint64_t
+align_u64(uint64_t v, uint64_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
+
+static inline int32_t
+align_i32(int32_t v, int32_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
-#define A6XX_TEX_CONST_DWORDS 16
-#define A6XX_TEX_SAMP_DWORDS 4
+/** Alignment must be a power of 2. */
+static inline bool
+tu_is_aligned(uintmax_t n, uintmax_t a)
+{
+ assert(a == (a & -a));
+ return (n & (a - 1)) == 0;
+}
-#define COND(bool, val) ((bool) ? (val) : 0)
-#define BIT(bit) (1u << (bit))
+static inline uint32_t
+round_up_u32(uint32_t v, uint32_t a)
+{
+ return (v + a - 1) / a;
+}
+
+static inline uint64_t
+round_up_u64(uint64_t v, uint64_t a)
+{
+ return (v + a - 1) / a;
+}
+
+static inline uint32_t
+tu_minify(uint32_t n, uint32_t levels)
+{
+ if (unlikely(n == 0))
+ return 0;
+ else
+ return MAX2(n >> levels, 1);
+}
+static inline float
+tu_clamp_f(float f, float min, float max)
+{
+ assert(min < max);
+
+ if (f > max)
+ return max;
+ else if (f < min)
+ return min;
+ else
+ return f;
+}
+
+static inline bool
+tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
+{
+ if (*inout_mask & clear_mask) {
+ *inout_mask &= ~clear_mask;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+#define for_each_bit(b, dword) \
+ for (uint32_t __dword = (dword); \
+ (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1 << (b)))
+
+#define typed_memcpy(dest, src, count) \
+ ({ \
+ STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \
+ memcpy((dest), (src), (count) * sizeof(*(src))); \
+ })
/* Whenever we generate an error, pass it through this function. Useful for
* debugging, where we can break on it. Only call at error site, not when
@@ -149,25 +219,29 @@ typedef uint32_t xcb_window_t;
struct tu_instance;
VkResult
-__vk_startup_errorf(struct tu_instance *instance,
- VkResult error,
- bool force_print,
- const char *file,
- int line,
- const char *format,
- ...) PRINTFLIKE(6, 7);
-
-/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
- * build.
- */
-#define vk_startup_errorf(instance, error, format, ...) \
- __vk_startup_errorf(instance, error, \
- instance->debug_flags & TU_DEBUG_STARTUP, \
- __FILE__, __LINE__, format, ##__VA_ARGS__)
+__vk_errorf(struct tu_instance *instance,
+ VkResult error,
+ const char *file,
+ int line,
+ const char *format,
+ ...);
+
+#define vk_error(instance, error) \
+ __vk_errorf(instance, error, __FILE__, __LINE__, NULL);
+#define vk_errorf(instance, error, format, ...) \
+ __vk_errorf(instance, error, __FILE__, __LINE__, format, ##__VA_ARGS__);
void
__tu_finishme(const char *file, int line, const char *format, ...)
- PRINTFLIKE(3, 4);
+ tu_printflike(3, 4);
+void
+tu_loge(const char *format, ...) tu_printflike(1, 2);
+void
+tu_loge_v(const char *format, va_list va);
+void
+tu_logi(const char *format, ...) tu_printflike(1, 2);
+void
+tu_logi_v(const char *format, va_list va);
/**
* Print a FINISHME message, including its source location.
@@ -181,35 +255,46 @@ __tu_finishme(const char *file, int line, const char *format, ...)
} \
} while (0)
+/* A non-fatal assert. Useful for debugging. */
+#ifdef DEBUG
+#define tu_assert(x) \
+ ({ \
+ if (unlikely(!(x))) \
+ fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \
+ })
+#else
+#define tu_assert(x)
+#endif
+
+/* Suppress -Wunused in stub functions */
+#define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__)
+static inline void
+__tu_use_args(int ignore, ...)
+{
+}
+
#define tu_stub() \
do { \
tu_finishme("stub %s", __func__); \
} while (0)
-struct tu_memory_heap {
- /* Standard bits passed on to the client */
- VkDeviceSize size;
- VkMemoryHeapFlags flags;
-
- /** Copied from ANV:
- *
- * Driver-internal book-keeping.
- *
- * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
- */
- VkDeviceSize used __attribute__ ((aligned (8)));
-};
-
-uint64_t
-tu_get_system_heap_size(void);
+void *
+tu_lookup_entrypoint_unchecked(const char *name);
+void *
+tu_lookup_entrypoint_checked(
+ const char *name,
+ uint32_t core_version,
+ const struct tu_instance_extension_table *instance,
+ const struct tu_device_extension_table *device);
struct tu_physical_device
{
- struct vk_physical_device vk;
+ VK_LOADER_DATA _loader_data;
struct tu_instance *instance;
- const char *name;
+ char path[20];
+ char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t cache_uuid[VK_UUID_SIZE];
@@ -217,71 +302,43 @@ struct tu_physical_device
struct wsi_device wsi_device;
int local_fd;
- bool has_local;
- int64_t local_major;
- int64_t local_minor;
int master_fd;
- bool has_master;
- int64_t master_major;
- int64_t master_minor;
+ unsigned gpu_id;
uint32_t gmem_size;
- uint64_t gmem_base;
- uint32_t ccu_offset_gmem;
- uint32_t ccu_offset_bypass;
-
- struct fd_dev_id dev_id;
- const struct fd_dev_info *info;
-
- int msm_major_version;
- int msm_minor_version;
-
- /* Address space and global fault count for this local_fd with DRM backend */
- uint64_t fault_count;
+ uint32_t tile_align_w;
+ uint32_t tile_align_h;
/* This is the drivers on-disk cache used as a fallback as opposed to
* the pipeline cache defined by apps.
*/
struct disk_cache *disk_cache;
- struct tu_memory_heap heap;
-
- struct vk_sync_type syncobj_type;
- struct vk_sync_timeline_type timeline_type;
- const struct vk_sync_type *sync_types[3];
+ struct tu_device_extension_table supported_extensions;
};
enum tu_debug_flags
{
TU_DEBUG_STARTUP = 1 << 0,
TU_DEBUG_NIR = 1 << 1,
- TU_DEBUG_NOBIN = 1 << 3,
- TU_DEBUG_SYSMEM = 1 << 4,
- TU_DEBUG_FORCEBIN = 1 << 5,
- TU_DEBUG_NOUBWC = 1 << 6,
- TU_DEBUG_NOMULTIPOS = 1 << 7,
- TU_DEBUG_NOLRZ = 1 << 8,
- TU_DEBUG_PERFC = 1 << 9,
- TU_DEBUG_FLUSHALL = 1 << 10,
- TU_DEBUG_SYNCDRAW = 1 << 11,
- TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
- TU_DEBUG_GMEM = 1 << 13,
- TU_DEBUG_RAST_ORDER = 1 << 14,
- TU_DEBUG_UNALIGNED_STORE = 1 << 15,
+ TU_DEBUG_IR3 = 1 << 2,
};
struct tu_instance
{
- struct vk_instance vk;
+ VK_LOADER_DATA _loader_data;
+
+ VkAllocationCallbacks alloc;
uint32_t api_version;
int physical_device_count;
struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
- struct driOptionCache dri_options;
- struct driOptionCache available_dri_options;
-
enum tu_debug_flags debug_flags;
+
+ struct vk_debug_report_instance debug_report_callbacks;
+
+ struct tu_instance_extension_table enabled_extensions;
};
VkResult
@@ -297,19 +354,10 @@ bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
-enum tu_bo_alloc_flags
-{
- TU_BO_ALLOC_NO_FLAGS = 0,
- TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
- TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
-};
-
struct cache_entry;
struct tu_pipeline_cache
{
- struct vk_object_base base;
-
struct tu_device *device;
pthread_mutex_t mutex;
@@ -326,313 +374,115 @@ struct tu_pipeline_key
{
};
+void
+tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
+ struct tu_device *device);
+void
+tu_pipeline_cache_finish(struct tu_pipeline_cache *cache);
+void
+tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
+ const void *data,
+ size_t size);
-/* queue types */
-#define TU_QUEUE_GENERAL 0
-
-#define TU_MAX_QUEUE_FAMILIES 1
-
-/* Keep tu_syncobj until porting to common code for kgsl too */
-#ifdef TU_USE_KGSL
-struct tu_syncobj;
-#endif
-struct tu_u_trace_syncobj;
-
-/* Define tu_timeline_sync type based on drm syncobj for a point type
- * for vk_sync_timeline, and the logic to handle is mostly copied from
- * anv_bo_sync since it seems it can be used by similar way to anv.
- */
-enum tu_timeline_sync_state {
- /** Indicates that this is a new (or newly reset fence) */
- TU_TIMELINE_SYNC_STATE_RESET,
-
- /** Indicates that this fence has been submitted to the GPU but is still
- * (as far as we know) in use by the GPU.
- */
- TU_TIMELINE_SYNC_STATE_SUBMITTED,
-
- TU_TIMELINE_SYNC_STATE_SIGNALED,
-};
-
-struct tu_timeline_sync {
- struct vk_sync base;
-
- enum tu_timeline_sync_state state;
- uint32_t syncobj;
-};
-
-struct tu_queue
-{
- struct vk_queue vk;
-
- struct tu_device *device;
+struct tu_shader_variant;
- uint32_t msm_queue_id;
- int fence;
-};
+bool
+tu_create_shader_variants_from_pipeline_cache(
+ struct tu_device *device,
+ struct tu_pipeline_cache *cache,
+ const unsigned char *sha1,
+ struct tu_shader_variant **variants);
-struct tu_bo
+void
+tu_pipeline_cache_insert_shaders(struct tu_device *device,
+ struct tu_pipeline_cache *cache,
+ const unsigned char *sha1,
+ struct tu_shader_variant **variants,
+ const void *const *codes,
+ const unsigned *code_sizes);
+
+struct tu_meta_state
{
- uint32_t gem_handle;
- uint64_t size;
- uint64_t iova;
- void *map;
- int32_t refcnt;
-
-#ifndef TU_USE_KGSL
- uint32_t bo_list_idx;
-#endif
+ VkAllocationCallbacks alloc;
- bool implicit_sync : 1;
+ struct tu_pipeline_cache cache;
};
-/* externally-synchronized BO suballocator. */
-struct tu_suballocator
-{
- struct tu_device *dev;
-
- uint32_t default_size;
- enum tu_bo_alloc_flags flags;
-
- /** Current BO we're suballocating out of. */
- struct tu_bo *bo;
- uint32_t next_offset;
+/* queue types */
+#define TU_QUEUE_GENERAL 0
- /** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
- struct tu_bo *cached_bo;
-};
+#define TU_MAX_QUEUE_FAMILIES 1
-struct tu_suballoc_bo
+struct tu_fence
{
- struct tu_bo *bo;
- uint64_t iova;
- uint32_t size; /* bytes */
+ bool signaled;
+ int fd;
};
void
-tu_bo_suballocator_init(struct tu_suballocator *suballoc,
- struct tu_device *dev,
- uint32_t default_size,
- uint32_t flags);
+tu_fence_init(struct tu_fence *fence, bool signaled);
void
-tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
-
-VkResult
-tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
- uint32_t size, uint32_t align);
-
-void *
-tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
-
+tu_fence_finish(struct tu_fence *fence);
void
-tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
-
-enum global_shader {
- GLOBAL_SH_VS_BLIT,
- GLOBAL_SH_VS_CLEAR,
- GLOBAL_SH_FS_BLIT,
- GLOBAL_SH_FS_BLIT_ZSCALE,
- GLOBAL_SH_FS_COPY_MS,
- GLOBAL_SH_FS_CLEAR0,
- GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
- GLOBAL_SH_COUNT,
-};
-
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
- /* Points into GPU memory */
- struct tu_renderpass_samples* samples;
-
- struct tu_suballoc_bo bo;
-
- /*
- * Below here, only used internally within autotune
- */
- uint64_t rp_key;
- struct tu_renderpass_history *history;
- struct list_head node;
- uint32_t fence;
- uint64_t samples_passed;
-};
-
-#define TU_BORDER_COLOR_COUNT 4096
-#define TU_BORDER_COLOR_BUILTIN 6
-
-#define TU_BLIT_SHADER_SIZE 1024
+tu_fence_update_fd(struct tu_fence *fence, int fd);
+void
+tu_fence_copy(struct tu_fence *fence, const struct tu_fence *src);
+void
+tu_fence_signal(struct tu_fence *fence);
+void
+tu_fence_wait_idle(struct tu_fence *fence);
-/* This struct defines the layout of the global_bo */
-struct tu6_global
+struct tu_queue
{
- /* clear/blit shaders */
- uint32_t shaders[TU_BLIT_SHADER_SIZE];
-
- uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
- uint32_t _pad0;
- volatile uint32_t vsc_draw_overflow;
- uint32_t _pad1;
- volatile uint32_t vsc_prim_overflow;
- uint32_t _pad2;
- uint64_t predicate;
-
- /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
- struct {
- uint32_t offset;
- uint32_t pad[7];
- } flush_base[4];
-
- ALIGN16 uint32_t cs_indirect_xyz[3];
-
- /* To know when renderpass stats for autotune are valid */
- volatile uint32_t autotune_fence;
+ VK_LOADER_DATA _loader_data;
+ struct tu_device *device;
+ uint32_t queue_family_index;
+ int queue_idx;
+ VkDeviceQueueCreateFlags flags;
- /* note: larger global bo will be used for customBorderColors */
- struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
+ uint32_t msm_queue_id;
+ struct tu_fence submit_fence;
};
-#define gb_offset(member) offsetof(struct tu6_global, member)
-#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
-
-/* extra space in vsc draw/prim streams */
-#define VSC_PAD 0x40
struct tu_device
{
- struct vk_device vk;
+ VK_LOADER_DATA _loader_data;
+
+ VkAllocationCallbacks alloc;
+
struct tu_instance *instance;
+ struct tu_meta_state meta_state;
+
struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
int queue_count[TU_MAX_QUEUE_FAMILIES];
struct tu_physical_device *physical_device;
- int fd;
struct ir3_compiler *compiler;
/* Backup in-memory cache to be used if the app doesn't provide one */
struct tu_pipeline_cache *mem_cache;
-#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
-
- /* Currently the kernel driver uses a 32-bit GPU address space, but it
- * should be impossible to go beyond 48 bits.
- */
- struct {
- struct tu_bo *bo;
- mtx_t construct_mtx;
- bool initialized;
- } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
-
- struct tu_bo *global_bo;
-
- uint32_t implicit_sync_bo_count;
-
- /* Device-global BO suballocator for reducing BO management overhead for
- * (read-only) pipeline state. Synchronized by pipeline_mutex.
- */
- struct tu_suballocator pipeline_suballoc;
- mtx_t pipeline_mutex;
-
- /* Device-global BO suballocator for reducing BO management for small
- * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
- */
- struct tu_suballocator autotune_suballoc;
- mtx_t autotune_mutex;
-
- /* the blob seems to always use 8K factor and 128K param sizes, copy them */
-#define TU_TESS_FACTOR_SIZE (8 * 1024)
-#define TU_TESS_PARAM_SIZE (128 * 1024)
-#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
- /* Lazily allocated, protected by the device mutex. */
- struct tu_bo *tess_bo;
-
- struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
- uint64_t global_shader_va[GLOBAL_SH_COUNT];
-
- uint32_t vsc_draw_strm_pitch;
- uint32_t vsc_prim_strm_pitch;
- BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
- mtx_t mutex;
-
- /* bo list for submits: */
- struct drm_msm_gem_submit_bo *bo_list;
- /* map bo handles to bo list index: */
- uint32_t bo_count, bo_list_size;
- mtx_t bo_mutex;
- /* protects imported BOs creation/freeing */
- struct u_rwlock dma_bo_lock;
-
- /* This array holds all our 'struct tu_bo' allocations. We use this
- * so we can add a refcount to our BOs and check if a particular BO
- * was already allocated in this device using its GEM handle. This is
- * necessary to properly manage BO imports, because the kernel doesn't
- * refcount the underlying BO memory.
- *
- * Specifically, when self-importing (i.e. importing a BO into the same
- * device that created it), the kernel will give us the same BO handle
- * for both BOs and we must only free it once when both references are
- * freed. Otherwise, if we are not self-importing, we get two different BO
- * handles, and we want to free each one individually.
- *
- * The refcount is also useful for being able to maintain BOs across
- * VK object lifetimes, such as pipelines suballocating out of BOs
- * allocated on the device.
- */
- struct util_sparse_array bo_map;
-
- /* Command streams to set pass index to a scratch reg */
- struct tu_cs *perfcntrs_pass_cs;
- struct tu_cs_entry *perfcntrs_pass_cs_entries;
-
- /* Condition variable for timeline semaphore to notify waiters when a
- * new submit is executed. */
- pthread_cond_t timeline_cond;
- pthread_mutex_t submit_mutex;
-
- struct tu_autotune autotune;
-
-#ifdef ANDROID
- const void *gralloc;
- enum {
- TU_GRALLOC_UNKNOWN,
- TU_GRALLOC_CROS,
- TU_GRALLOC_OTHER,
- } gralloc_type;
-#endif
+ struct list_head shader_slabs;
+ mtx_t shader_slab_mutex;
- uint32_t submit_count;
-
- struct u_trace_context trace_context;
-
- #ifdef HAVE_PERFETTO
- struct tu_perfetto_state perfetto;
- #endif
+ struct tu_device_extension_table enabled_extensions;
};
-void tu_init_clear_blit_shaders(struct tu_device *dev);
-
-void tu_destroy_clear_blit_shaders(struct tu_device *dev);
-
-VkResult
-tu_device_submit_deferred_locked(struct tu_device *dev);
-
-VkResult
-tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
-
-uint64_t
-tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
-
-VkResult
-tu_device_check_status(struct vk_device *vk_device);
+struct tu_bo
+{
+ uint32_t gem_handle;
+ uint64_t size;
+ uint64_t iova;
+ void *map;
+};
VkResult
-tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
- enum tu_bo_alloc_flags flags);
+tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
- struct tu_bo **bo,
+ struct tu_bo *bo,
uint64_t size,
int fd);
int
@@ -642,28 +492,6 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
-static inline struct tu_bo *
-tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
-{
- return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
-}
-
-static inline struct tu_bo *
-tu_bo_get_ref(struct tu_bo *bo)
-{
- p_atomic_inc(&bo->refcnt);
- return bo;
-}
-
-/* Get a scratch bo for use inside a command buffer. This will always return
- * the same bo given the same size or similar sizes, so only one scratch bo
- * can be used at the same time. It's meant for short-lived things where we
- * need to write to some piece of memory, read from it, and then immediately
- * discard it.
- */
-VkResult
-tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
-
struct tu_cs_entry
{
/* No ownership */
@@ -673,58 +501,6 @@ struct tu_cs_entry
uint32_t offset;
};
-struct tu_cs_memory {
- uint32_t *map;
- uint64_t iova;
-};
-
-struct tu_draw_state {
- uint64_t iova : 48;
- uint32_t size : 16;
-};
-
-enum tu_dynamic_state
-{
- /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
- TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
- TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
- TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
- TU_DYNAMIC_STATE_VB_STRIDE,
- TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
- TU_DYNAMIC_STATE_COUNT,
- /* no associated draw state: */
- TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
- TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
- /* re-use the line width enum as it uses GRAS_SU_CNTL: */
- TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
-};
-
-enum tu_draw_state_group_id
-{
- TU_DRAW_STATE_PROGRAM_CONFIG,
- TU_DRAW_STATE_PROGRAM,
- TU_DRAW_STATE_PROGRAM_BINNING,
- TU_DRAW_STATE_VB,
- TU_DRAW_STATE_VI,
- TU_DRAW_STATE_VI_BINNING,
- TU_DRAW_STATE_RAST,
- TU_DRAW_STATE_BLEND,
- TU_DRAW_STATE_SHADER_GEOM_CONST,
- TU_DRAW_STATE_FS_CONST,
- TU_DRAW_STATE_DESC_SETS,
- TU_DRAW_STATE_DESC_SETS_LOAD,
- TU_DRAW_STATE_VS_PARAMS,
- TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
- TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
- TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
- TU_DRAW_STATE_PRIM_MODE_GMEM,
- TU_DRAW_STATE_PRIM_MODE_SYSMEM,
-
- /* dynamic state related draw states */
- TU_DRAW_STATE_DYNAMIC,
- TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
-};
-
enum tu_cs_mode
{
@@ -765,7 +541,6 @@ struct tu_cs
uint32_t *reserved_end;
uint32_t *end;
- struct tu_device *device;
enum tu_cs_mode mode;
uint32_t next_bo_size;
@@ -776,20 +551,20 @@ struct tu_cs
struct tu_bo **bos;
uint32_t bo_count;
uint32_t bo_capacity;
-
- /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
- struct tu_bo *refcount_bo;
-
- /* state for cond_exec_start/cond_exec_end */
- uint32_t cond_flags;
- uint32_t *cond_dwords;
};
struct tu_device_memory
{
- struct vk_object_base base;
+ struct tu_bo bo;
+ VkDeviceSize size;
- struct tu_bo *bo;
+ /* for dedicated allocations */
+ struct tu_image *image;
+ struct tu_buffer *buffer;
+
+ uint32_t type_index;
+ void *map;
+ void *user_ptr;
};
struct tu_descriptor_range
@@ -800,19 +575,18 @@ struct tu_descriptor_range
struct tu_descriptor_set
{
- struct vk_object_base base;
-
- /* Link to descriptor pool's desc_sets list . */
- struct list_head pool_link;
-
- struct tu_descriptor_set_layout *layout;
- struct tu_descriptor_pool *pool;
+ const struct tu_descriptor_set_layout *layout;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
+ struct tu_descriptor_range *dynamic_descriptors;
+};
- uint32_t *dynamic_descriptors;
+struct tu_push_descriptor_set
+{
+ struct tu_descriptor_set set;
+ uint32_t capacity;
};
struct tu_descriptor_pool_entry
@@ -824,18 +598,13 @@ struct tu_descriptor_pool_entry
struct tu_descriptor_pool
{
- struct vk_object_base base;
-
- struct tu_bo *bo;
+ uint8_t *mapped_ptr;
uint64_t current_offset;
uint64_t size;
uint8_t *host_memory_base;
uint8_t *host_memory_ptr;
uint8_t *host_memory_end;
- uint8_t *host_bo;
-
- struct list_head desc_sets;
uint32_t entry_count;
uint32_t max_entry_count;
@@ -866,13 +635,11 @@ struct tu_descriptor_update_template_entry
size_t src_stride;
/* For push descriptors */
- const struct tu_sampler *immutable_samplers;
+ const uint32_t *immutable_samplers;
};
struct tu_descriptor_update_template
{
- struct vk_object_base base;
-
uint32_t entry_count;
VkPipelineBindPoint bind_point;
struct tu_descriptor_update_template_entry entry[0];
@@ -880,257 +647,175 @@ struct tu_descriptor_update_template
struct tu_buffer
{
- struct vk_object_base base;
-
VkDeviceSize size;
VkBufferUsageFlags usage;
VkBufferCreateFlags flags;
struct tu_bo *bo;
- uint64_t iova;
+ VkDeviceSize bo_offset;
};
-const char *
-tu_get_debug_option_name(int id);
-
-const char *
-tu_get_perftest_option_name(int id);
+enum tu_dynamic_state_bits
+{
+ TU_DYNAMIC_VIEWPORT = 1 << 0,
+ TU_DYNAMIC_SCISSOR = 1 << 1,
+ TU_DYNAMIC_LINE_WIDTH = 1 << 2,
+ TU_DYNAMIC_DEPTH_BIAS = 1 << 3,
+ TU_DYNAMIC_BLEND_CONSTANTS = 1 << 4,
+ TU_DYNAMIC_DEPTH_BOUNDS = 1 << 5,
+ TU_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6,
+ TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7,
+ TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8,
+ TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9,
+ TU_DYNAMIC_ALL = (1 << 10) - 1,
+};
+
+struct tu_vertex_binding
+{
+ struct tu_buffer *buffer;
+ VkDeviceSize offset;
+};
-struct tu_descriptor_state
+struct tu_viewport_state
{
- struct tu_descriptor_set *sets[MAX_SETS];
- struct tu_descriptor_set push_set;
- uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
+ uint32_t count;
+ VkViewport viewports[MAX_VIEWPORTS];
};
-enum tu_cmd_dirty_bits
+struct tu_scissor_state
{
- TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
- TU_CMD_DIRTY_VB_STRIDE = BIT(1),
- TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
- TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
- TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
- TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
- TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
- TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
- TU_CMD_DIRTY_LRZ = BIT(8),
- TU_CMD_DIRTY_VS_PARAMS = BIT(9),
- TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
- TU_CMD_DIRTY_VIEWPORTS = BIT(11),
- /* all draw states were disabled and need to be re-enabled: */
- TU_CMD_DIRTY_DRAW_STATE = BIT(12)
+ uint32_t count;
+ VkRect2D scissors[MAX_SCISSORS];
};
-/* There are only three cache domains we have to care about: the CCU, or
- * color cache unit, which is used for color and depth/stencil attachments
- * and copy/blit destinations, and is split conceptually into color and depth,
- * and the universal cache or UCHE which is used for pretty much everything
- * else, except for the CP (uncached) and host. We need to flush whenever data
- * crosses these boundaries.
- */
+struct tu_discard_rectangle_state
+{
+ uint32_t count;
+ VkRect2D rectangles[MAX_DISCARD_RECTANGLES];
+};
-enum tu_cmd_access_mask {
- TU_ACCESS_UCHE_READ = 1 << 0,
- TU_ACCESS_UCHE_WRITE = 1 << 1,
- TU_ACCESS_CCU_COLOR_READ = 1 << 2,
- TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
- TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
- TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
-
- /* Experiments have shown that while it's safe to avoid flushing the CCU
- * after each blit/renderpass, it's not safe to assume that subsequent
- * lookups with a different attachment state will hit unflushed cache
- * entries. That is, the CCU needs to be flushed and possibly invalidated
- * when accessing memory with a different attachment state. Writing to an
- * attachment under the following conditions after clearing using the
- * normal 2d engine path is known to have issues:
- *
- * - It isn't the 0'th layer.
- * - There are more than one attachment, and this isn't the 0'th attachment
- * (this seems to also depend on the cpp of the attachments).
- *
- * Our best guess is that the layer/MRT state is used when computing
- * the location of a cache entry in CCU, to avoid conflicts. We assume that
- * any access in a renderpass after or before an access by a transfer needs
- * a flush/invalidate, and use the _INCOHERENT variants to represent access
- * by a renderpass.
+struct tu_dynamic_state
+{
+ /**
+ * Bitmask of (1 << VK_DYNAMIC_STATE_*).
+ * Defines the set of saved dynamic state.
*/
- TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
- TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
- TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
- TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
+ uint32_t mask;
- /* Accesses which bypasses any cache. e.g. writes via the host,
- * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
- */
- TU_ACCESS_SYSMEM_READ = 1 << 10,
- TU_ACCESS_SYSMEM_WRITE = 1 << 11,
+ struct tu_viewport_state viewport;
- /* Memory writes from the CP start in-order with draws and event writes,
- * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
- */
- TU_ACCESS_CP_WRITE = 1 << 12,
-
- TU_ACCESS_READ =
- TU_ACCESS_UCHE_READ |
- TU_ACCESS_CCU_COLOR_READ |
- TU_ACCESS_CCU_DEPTH_READ |
- TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
- TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
- TU_ACCESS_SYSMEM_READ,
-
- TU_ACCESS_WRITE =
- TU_ACCESS_UCHE_WRITE |
- TU_ACCESS_CCU_COLOR_WRITE |
- TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
- TU_ACCESS_CCU_DEPTH_WRITE |
- TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
- TU_ACCESS_SYSMEM_WRITE |
- TU_ACCESS_CP_WRITE,
-
- TU_ACCESS_ALL =
- TU_ACCESS_READ |
- TU_ACCESS_WRITE,
-};
+ struct tu_scissor_state scissor;
-/* Starting with a6xx, the pipeline is split into several "clusters" (really
- * pipeline stages). Each stage has its own pair of register banks and can
- * switch them independently, so that earlier stages can run ahead of later
- * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
- * the same time.
- *
- * As a result of this, we need to insert a WFI when an earlier stage depends
- * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
- * pending WFI's to complete before starting, and usually before reading
- * indirect params even, so a WFI also acts as a full "pipeline stall".
- *
- * Note, the names of the stages come from CLUSTER_* in devcoredump. We
- * include all the stages for completeness, even ones which do not read/write
- * anything.
- */
+ float line_width;
-enum tu_stage {
- /* This doesn't correspond to a cluster, but we need it for tracking
- * indirect draw parameter reads etc.
- */
- TU_STAGE_CP,
+ struct
+ {
+ float bias;
+ float clamp;
+ float slope;
+ } depth_bias;
- /* - Fetch index buffer
- * - Fetch vertex attributes, dispatch VS
- */
- TU_STAGE_FE,
+ float blend_constants[4];
- /* Execute all geometry stages (VS thru GS) */
- TU_STAGE_SP_VS,
+ struct
+ {
+ float min;
+ float max;
+ } depth_bounds;
- /* Write to VPC, do primitive assembly. */
- TU_STAGE_PC_VS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_compare_mask;
- /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
- * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
- * early depth testing is enabled before dispatching fragments? However
- * GRAS reads and writes LRZ directly.
- */
- TU_STAGE_GRAS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_write_mask;
- /* Execute FS */
- TU_STAGE_SP_PS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_reference;
- /* - Fragment tests
- * - Write color/depth
- * - Streamout writes (???)
- * - Varying interpolation (???)
- */
- TU_STAGE_PS,
+ struct tu_discard_rectangle_state discard_rectangle;
};
-enum tu_cmd_flush_bits {
- TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
- TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
- TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
- TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
- TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
- TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
- TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
-
- TU_CMD_FLAG_ALL_FLUSH =
- TU_CMD_FLAG_CCU_FLUSH_DEPTH |
- TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CACHE_FLUSH |
- /* Treat the CP as a sort of "cache" which may need to be "flushed" via
- * waiting for writes to land with WAIT_FOR_MEM_WRITES.
- */
- TU_CMD_FLAG_WAIT_MEM_WRITES,
-
- TU_CMD_FLAG_ALL_INVALIDATE =
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
- TU_CMD_FLAG_CACHE_INVALIDATE |
- /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
- * a command that needs CP_WAIT_FOR_ME is executed. This means we may
- * insert an extra WAIT_FOR_ME before an indirect command requiring it
- * in case there was another command before the current command buffer
- * that it needs to wait for.
- */
- TU_CMD_FLAG_WAIT_FOR_ME,
-};
+extern const struct tu_dynamic_state default_dynamic_state;
-/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
- * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
- * which part of the gmem is used by the CCU. Here we keep track of what the
- * state of the CCU.
- */
-enum tu_cmd_ccu_state {
- TU_CMD_CCU_SYSMEM,
- TU_CMD_CCU_GMEM,
- TU_CMD_CCU_UNKNOWN,
-};
+const char *
+tu_get_debug_option_name(int id);
-struct tu_cache_state {
- /* Caches which must be made available (flushed) eventually if there are
- * any users outside that cache domain, and caches which must be
- * invalidated eventually if there are any reads.
- */
- enum tu_cmd_flush_bits pending_flush_bits;
- /* Pending flushes */
- enum tu_cmd_flush_bits flush_bits;
-};
+const char *
+tu_get_perftest_option_name(int id);
-enum tu_lrz_force_disable_mask {
- TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
- TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
+/**
+ * Attachment state when recording a renderpass instance.
+ *
+ * The clear value is valid only if there exists a pending clear.
+ */
+struct tu_attachment_state
+{
+ VkImageAspectFlags pending_clear_aspects;
+ uint32_t cleared_views;
+ VkClearValue clear_value;
+ VkImageLayout current_layout;
};
-enum tu_lrz_direction {
- TU_LRZ_UNKNOWN,
- /* Depth func less/less-than: */
- TU_LRZ_LESS,
- /* Depth func greater/greater-than: */
- TU_LRZ_GREATER,
+struct tu_descriptor_state
+{
+ struct tu_descriptor_set *sets[MAX_SETS];
+ uint32_t dirty;
+ uint32_t valid;
+ struct tu_push_descriptor_set push_set;
+ bool push_dirty;
+ uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS];
};
-struct tu_lrz_pipeline
+struct tu_tile
{
- uint32_t force_disable_mask;
- bool fs_has_kill;
- bool force_late_z;
- bool early_fragment_tests;
+ uint8_t pipe;
+ uint8_t slot;
+ VkOffset2D begin;
+ VkOffset2D end;
};
-struct tu_lrz_state
+struct tu_tiling_config
{
- /* Depth/Stencil image currently on use to do LRZ */
- struct tu_image *image;
- bool valid : 1;
- enum tu_lrz_direction prev_direction;
+ VkRect2D render_area;
+ uint32_t buffer_cpp[MAX_RTS + 2];
+ uint32_t buffer_count;
+
+ /* position and size of the first tile */
+ VkRect2D tile0;
+ /* number of tiles */
+ VkExtent2D tile_count;
+
+ uint32_t gmem_offsets[MAX_RTS + 2];
+
+ /* size of the first VSC pipe */
+ VkExtent2D pipe0;
+ /* number of VSC pipes */
+ VkExtent2D pipe_count;
+
+ /* pipe register values */
+ uint32_t pipe_config[MAX_VSC_PIPES];
+ uint32_t pipe_sizes[MAX_VSC_PIPES];
};
-struct tu_vs_params {
- uint32_t vertex_offset;
- uint32_t first_instance;
+enum tu_cmd_dirty_bits
+{
+ TU_CMD_DIRTY_PIPELINE = 1 << 0,
+ TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1,
+
+ TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19,
};
struct tu_cmd_state
@@ -1138,119 +823,48 @@ struct tu_cmd_state
uint32_t dirty;
struct tu_pipeline *pipeline;
- struct tu_pipeline *compute_pipeline;
- /* Vertex buffers, viewports, and scissors
- * the states for these can be updated partially, so we need to save these
- * to be able to emit a complete draw state
- */
- struct {
- uint64_t base;
- uint32_t size;
- uint32_t stride;
- } vb[MAX_VBS];
- VkViewport viewport[MAX_VIEWPORTS];
- VkRect2D scissor[MAX_SCISSORS];
- uint32_t max_viewport, max_scissor;
-
- /* for dynamic states that can't be emitted directly */
- uint32_t dynamic_stencil_mask;
- uint32_t dynamic_stencil_wrmask;
- uint32_t dynamic_stencil_ref;
-
- uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
- uint32_t pc_raster_cntl, vpc_unknown_9107;
- enum pc_di_primtype primtype;
- bool primitive_restart_enable;
-
- /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
- struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
- struct tu_draw_state vertex_buffers;
- struct tu_draw_state shader_const[2];
- struct tu_draw_state desc_sets;
-
- struct tu_draw_state vs_params;
+ /* Vertex buffers */
+ struct
+ {
+ struct tu_buffer *buffers[MAX_VBS];
+ VkDeviceSize offsets[MAX_VBS];
+ } vb;
+
+ struct tu_dynamic_state dynamic;
/* Index buffer */
- uint64_t index_va;
+ struct tu_buffer *index_buffer;
+ uint64_t index_offset;
+ uint32_t index_type;
uint32_t max_index_count;
- uint8_t index_size;
-
- /* because streamout base has to be 32-byte aligned
- * there is an extra offset to deal with when it is
- * unaligned
- */
- uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
-
- /* Renderpasses are tricky, because we may need to flush differently if
- * using sysmem vs. gmem and therefore we have to delay any flushing that
- * happens before a renderpass. So we have to have two copies of the flush
- * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
- * and one for outside a renderpass.
- */
- struct tu_cache_state cache;
- struct tu_cache_state renderpass_cache;
-
- enum tu_cmd_ccu_state ccu_state;
+ uint64_t index_va;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
- VkRect2D render_area;
-
- const struct tu_image_view **attachments;
+ struct tu_attachment_state *attachments;
- bool xfb_used;
- bool has_tess;
- bool tessfactor_addr_set;
- bool has_subpass_predication;
- bool predication_active;
- bool disable_gmem;
- enum a5xx_line_mode line_mode;
- bool z_negative_one_to_one;
+ struct tu_tiling_config tiling_config;
- uint32_t drawcall_count;
-
- /* A calculated "draw cost" value for renderpass, which tries to
- * estimate the bandwidth-per-sample of all the draws according
- * to:
- *
- * foreach_draw (...) {
- * cost += num_frag_outputs;
- * if (blend_enabled)
- * cost += num_blend_enabled;
- * if (depth_test_enabled)
- * cost++;
- * if (depth_write_enabled)
- * cost++;
- * }
- *
- * The idea is that each sample-passed minimally does one write
- * per MRT. If blend is enabled, the hw will additionally do
- * a framebuffer read per sample-passed (for each MRT with blend
- * enabled). If depth-test is enabled, the hw will additionally
- * a depth buffer read. If depth-write is enable, the hw will
- * additionally do a depth buffer write.
- *
- * This does ignore depth buffer traffic for samples which do not
- * pass do to depth-test fail, and some other details. But it is
- * just intended to be a rough estimate that is easy to calculate.
- */
- uint32_t total_drawcalls_cost;
-
- struct tu_lrz_state lrz;
-
- struct tu_draw_state lrz_and_depth_plane_state;
-
- struct tu_vs_params last_vs_params;
+ struct tu_cs_entry tile_load_ib;
+ struct tu_cs_entry tile_store_ib;
};
struct tu_cmd_pool
{
- struct vk_command_pool vk;
-
+ VkAllocationCallbacks alloc;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
+ uint32_t queue_family_index;
+};
+
+struct tu_cmd_buffer_upload
+{
+ uint8_t *map;
+ unsigned offset;
+ uint64_t size;
+ struct list_head list;
};
enum tu_cmd_buffer_status
@@ -1262,116 +876,165 @@ enum tu_cmd_buffer_status
TU_CMD_BUFFER_STATUS_PENDING,
};
+struct tu_bo_list
+{
+ uint32_t count;
+ uint32_t capacity;
+ struct drm_msm_gem_submit_bo *bo_infos;
+};
+
+#define TU_BO_LIST_FAILED (~0)
+
+void
+tu_bo_list_init(struct tu_bo_list *list);
+void
+tu_bo_list_destroy(struct tu_bo_list *list);
+void
+tu_bo_list_reset(struct tu_bo_list *list);
+uint32_t
+tu_bo_list_add(struct tu_bo_list *list,
+ const struct tu_bo *bo,
+ uint32_t flags);
+VkResult
+tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other);
+
struct tu_cmd_buffer
{
- struct vk_command_buffer vk;
+ VK_LOADER_DATA _loader_data;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
- struct u_trace trace;
- struct u_trace_iterator trace_renderpass_start;
- struct u_trace_iterator trace_renderpass_end;
-
- struct list_head renderpass_autotune_results;
- struct tu_autotune_results_buffer* autotune_buffer;
-
VkCommandBufferUsageFlags usage_flags;
+ VkCommandBufferLevel level;
enum tu_cmd_buffer_status status;
struct tu_cmd_state state;
+ struct tu_vertex_binding vertex_bindings[MAX_VBS];
uint32_t queue_family_index;
- uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
+ uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
- struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
+ struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE];
+
+ struct tu_cmd_buffer_upload upload;
VkResult record_result;
+ struct tu_bo_list bo_list;
struct tu_cs cs;
struct tu_cs draw_cs;
- struct tu_cs tile_store_cs;
- struct tu_cs draw_epilogue_cs;
- struct tu_cs sub_cs;
+ struct tu_cs tile_cs;
- uint32_t vsc_draw_strm_pitch;
- uint32_t vsc_prim_strm_pitch;
-};
+ uint16_t marker_reg;
+ uint32_t marker_seqno;
-/* Temporary struct for tracking a register state to be written, used by
- * a6xx-pack.h and tu_cs_emit_regs()
- */
-struct tu_reg_value {
- uint32_t reg;
- uint64_t value;
- bool is_address;
- struct tu_bo *bo;
- bool bo_write;
- uint32_t bo_offset;
- uint32_t bo_shift;
+ struct tu_bo scratch_bo;
+ uint32_t scratch_seqno;
+
+ bool wait_for_idle;
};
+void
+tu6_emit_event_write(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ enum vgt_event_type event,
+ bool need_seqno);
+
+bool
+tu_get_memory_fd(struct tu_device *device,
+ struct tu_device_memory *memory,
+ int *pFD);
-void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs);
+/*
+ * Takes x,y,z as exact numbers of invocations, instead of blocks.
+ *
+ * Limitations: Can't call normal dispatch functions without binding or
+ * rebinding
+ * the compute pipeline.
+ */
+void
+tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer,
+ uint32_t x,
+ uint32_t y,
+ uint32_t z);
+
+struct tu_event
+{
+ uint64_t *map;
+};
-void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs,
- enum tu_cmd_ccu_state ccu_state);
+struct tu_shader_module;
+#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0)
+#define TU_HASH_SHADER_SISCHED (1 << 1)
+#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2)
void
-tu6_emit_event_write(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- enum vgt_event_type event);
+tu_hash_shaders(unsigned char *hash,
+ const VkPipelineShaderStageCreateInfo **stages,
+ const struct tu_pipeline_layout *layout,
+ const struct tu_pipeline_key *key,
+ uint32_t flags);
+
+static inline gl_shader_stage
+vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
+{
+ assert(__builtin_popcount(vk_stage) == 1);
+ return ffs(vk_stage) - 1;
+}
-static inline struct tu_descriptor_state *
-tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
- VkPipelineBindPoint bind_point)
+static inline VkShaderStageFlagBits
+mesa_to_vk_shader_stage(gl_shader_stage mesa_stage)
{
- return &cmd_buffer->descriptors[bind_point];
+ return (1 << mesa_stage);
}
-struct tu_event
+#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
+
+#define tu_foreach_stage(stage, stage_bits) \
+ for (gl_shader_stage stage, \
+ __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \
+ stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage)))
+
+struct tu_shader_module
{
- struct vk_object_base base;
- struct tu_bo *bo;
+ unsigned char sha1[20];
+
+ uint32_t code_size;
+ const uint32_t *code[0];
};
-struct tu_push_constant_range
+struct tu_shader_compile_options
{
- uint32_t lo;
- uint32_t count;
+ struct ir3_shader_key key;
+
+ bool optimize;
+ bool include_binning_pass;
};
struct tu_shader
{
- struct ir3_shader *ir3_shader;
+ struct ir3_shader ir3_shader;
- struct tu_push_constant_range push_consts;
- uint8_t active_desc_sets;
- bool multi_pos_output;
-};
+ /* This may be true for vertex shaders. When true, variants[1] is the
+ * binning variant and binning_binary is non-NULL.
+ */
+ bool has_binning_pass;
-bool
-tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
- struct tu_device *dev);
+ void *binary;
+ void *binning_binary;
-nir_shader *
-tu_spirv_to_nir(struct tu_device *dev,
- void *mem_ctx,
- const VkPipelineShaderStageCreateInfo *stage_info,
- gl_shader_stage stage);
+ struct ir3_shader_variant variants[0];
+};
struct tu_shader *
tu_shader_create(struct tu_device *dev,
- nir_shader *nir,
+ gl_shader_stage stage,
const VkPipelineShaderStageCreateInfo *stage_info,
- unsigned multiview_mask,
- struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
void
@@ -1379,78 +1042,50 @@ tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
-struct tu_program_descriptor_linkage
-{
- struct ir3_const_state const_state;
-
- uint32_t constlen;
-
- struct tu_push_constant_range push_consts;
-};
-
-struct tu_pipeline_executable {
- gl_shader_stage stage;
-
- struct ir3_info stats;
- bool is_binning;
+void
+tu_shader_compile_options_init(
+ struct tu_shader_compile_options *options,
+ const VkGraphicsPipelineCreateInfo *pipeline_info);
- char *nir_from_spirv;
- char *nir_final;
- char *disasm;
-};
+VkResult
+tu_shader_compile(struct tu_device *dev,
+ struct tu_shader *shader,
+ const struct tu_shader *next_stage,
+ const struct tu_shader_compile_options *options,
+ const VkAllocationCallbacks *alloc);
struct tu_pipeline
{
- struct vk_object_base base;
-
struct tu_cs cs;
- struct tu_suballoc_bo bo;
- /* Separate BO for private memory since it should GPU writable */
- struct tu_bo *pvtmem_bo;
+ struct tu_dynamic_state dynamic_state;
+
+ struct tu_pipeline_layout *layout;
bool need_indirect_descriptor_sets;
VkShaderStageFlags active_stages;
- uint32_t active_desc_sets;
-
- /* mask of enabled dynamic states
- * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
- */
- uint32_t dynamic_state_mask;
- struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
-
- /* for dynamic states which use the same register: */
- uint32_t gras_su_cntl, gras_su_cntl_mask;
- uint32_t rb_depth_cntl, rb_depth_cntl_mask;
- uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
- uint32_t pc_raster_cntl, pc_raster_cntl_mask;
- uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
- uint32_t stencil_wrmask;
-
- bool rb_depth_cntl_disable;
-
- enum a5xx_line_mode line_mode;
-
- /* draw states for the pipeline */
- struct tu_draw_state load_state, rast_state, blend_state;
- struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
-
- /* for vertex buffers state */
- uint32_t num_vbs;
struct
{
- struct tu_draw_state config_state;
- struct tu_draw_state state;
- struct tu_draw_state binning_state;
-
- struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
+ struct tu_bo binary_bo;
+ struct tu_cs_entry state_ib;
+ struct tu_cs_entry binning_state_ib;
} program;
struct
{
- struct tu_draw_state state;
- struct tu_draw_state binning_state;
+ uint8_t bindings[MAX_VERTEX_ATTRIBS];
+ uint16_t strides[MAX_VERTEX_ATTRIBS];
+ uint16_t offsets[MAX_VERTEX_ATTRIBS];
+ uint32_t count;
+
+ uint8_t binning_bindings[MAX_VERTEX_ATTRIBS];
+ uint16_t binning_strides[MAX_VERTEX_ATTRIBS];
+ uint16_t binning_offsets[MAX_VERTEX_ATTRIBS];
+ uint32_t binning_count;
+
+ struct tu_cs_entry state_ib;
+ struct tu_cs_entry binning_state_ib;
} vi;
struct
@@ -1461,47 +1096,36 @@ struct tu_pipeline
struct
{
- uint32_t patch_type;
- uint32_t param_stride;
- bool upper_left_domain_origin;
- } tess;
+ struct tu_cs_entry state_ib;
+ } vp;
struct
{
- uint32_t local_size[3];
- uint32_t subgroup_size;
- } compute;
-
- bool provoking_vertex_last;
-
- struct tu_lrz_pipeline lrz;
+ uint32_t gras_su_cntl;
+ struct tu_cs_entry state_ib;
+ } rast;
- /* In other words - framebuffer fetch support */
- bool raster_order_attachment_access;
- bool subpass_feedback_loop_ds;
-
- bool z_negative_one_to_one;
-
- /* Base drawcall cost for sysmem vs gmem autotuner */
- uint8_t drawcall_base_cost;
+ struct
+ {
+ struct tu_cs_entry state_ib;
+ } ds;
- void *executables_mem_ctx;
- /* tu_pipeline_executable */
- struct util_dynarray executables;
+ struct
+ {
+ struct tu_cs_entry state_ib;
+ } blend;
};
void
-tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
- bool z_negative_one_to_one);
+tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport);
void
-tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
+tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor);
void
-tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
-
-void
-tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
+tu6_emit_gras_su_cntl(struct tu_cs *cs,
+ uint32_t gras_su_cntl,
+ float line_width);
void
tu6_emit_depth_bias(struct tu_cs *cs,
@@ -1509,143 +1133,106 @@ tu6_emit_depth_bias(struct tu_cs *cs,
float clamp,
float slope_factor);
-void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
- enum a5xx_line_mode line_mode);
-
-void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
-
-void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
-
-void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
-
-void tu6_apply_depth_bounds_workaround(struct tu_device *device,
- uint32_t *rb_depth_cntl);
-
-struct tu_pvtmem_config {
- uint64_t iova;
- uint32_t per_fiber_size;
- uint32_t per_sp_size;
- bool per_wave;
-};
-
-void
-tu6_emit_xs_config(struct tu_cs *cs,
- gl_shader_stage stage,
- const struct ir3_shader_variant *xs);
-
-void
-tu6_emit_xs(struct tu_cs *cs,
- gl_shader_stage stage,
- const struct ir3_shader_variant *xs,
- const struct tu_pvtmem_config *pvtmem,
- uint64_t binary_iova);
-
-void
-tu6_emit_vpc(struct tu_cs *cs,
- const struct ir3_shader_variant *vs,
- const struct ir3_shader_variant *hs,
- const struct ir3_shader_variant *ds,
- const struct ir3_shader_variant *gs,
- const struct ir3_shader_variant *fs,
- uint32_t patch_control_points);
-
void
-tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
-
-struct tu_image_view;
+tu6_emit_stencil_compare_mask(struct tu_cs *cs,
+ uint32_t front,
+ uint32_t back);
void
-tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *src,
- const struct tu_image_view *dst,
- uint32_t layer_mask,
- uint32_t layers,
- const VkRect2D *rect);
+tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back);
void
-tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info);
+tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back);
void
-tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info);
+tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]);
-void
-tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- bool force_load);
+struct tu_userdata_info *
+tu_lookup_user_sgpr(struct tu_pipeline *pipeline,
+ gl_shader_stage stage,
+ int idx);
-/* expose this function to be able to emit load without checking LOAD_OP */
-void
-tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
+struct tu_shader_variant *
+tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage);
-/* note: gmem store can also resolve */
-void
-tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- uint32_t gmem_a);
-
-enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
+struct tu_graphics_pipeline_create_info
+{
+ bool use_rectlist;
+ bool db_depth_clear;
+ bool db_stencil_clear;
+ bool db_depth_disable_expclear;
+ bool db_stencil_disable_expclear;
+ bool db_flush_depth_inplace;
+ bool db_flush_stencil_inplace;
+ bool db_resummarize;
+ uint32_t custom_blend_mode;
+};
struct tu_native_format
{
- enum a6xx_format fmt : 8;
- enum a3xx_color_swap swap : 8;
- enum a6xx_tile_mode tile_mode : 8;
+ int vtx; /* VFMTn_xxx or -1 */
+ int tex; /* TFMTn_xxx or -1 */
+ int rb; /* RBn_xxx or -1 */
+ int swap; /* enum a3xx_color_swap */
+ bool present; /* internal only; always true to external users */
};
-enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
-bool tu6_format_vtx_supported(VkFormat format);
-struct tu_native_format tu6_format_vtx(VkFormat format);
-bool tu6_format_color_supported(enum pipe_format format);
-struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
-bool tu6_format_texture_supported(enum pipe_format format);
-struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
+const struct tu_native_format *
+tu6_get_native_format(VkFormat format);
+
+int
+tu_pack_clear_value(const VkClearValue *val,
+ VkFormat format,
+ uint32_t buf[4]);
+enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt);
-static inline enum a6xx_format
-tu6_base_format(enum pipe_format format)
+struct tu_image_level
{
- /* note: tu6_format_color doesn't care about tiling for .fmt field */
- return tu6_format_color(format, TILE6_LINEAR).fmt;
-}
+ VkDeviceSize offset;
+ VkDeviceSize size;
+ uint32_t pitch;
+};
struct tu_image
{
- struct vk_object_base base;
-
+ VkImageType type;
/* The original VkFormat provided by the client. This may not match any
* of the actual surface formats.
*/
VkFormat vk_format;
+ VkImageAspectFlags aspects;
+ VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. */
+ VkImageTiling tiling; /** VkImageCreateInfo::tiling */
+ VkImageCreateFlags flags; /** VkImageCreateInfo::flags */
+ VkExtent3D extent;
uint32_t level_count;
uint32_t layer_count;
- struct fdl_layout layout[3];
- uint32_t total_size;
+ VkDeviceSize size;
+ uint32_t alignment;
+
+ /* memory layout */
+ VkDeviceSize layer_size;
+ struct tu_image_level levels[15];
+ unsigned tile_mode;
+
+ unsigned queue_family_mask;
+ bool exclusive;
+ bool shareable;
-#ifdef ANDROID
/* For VK_ANDROID_native_buffer, the WSI image owns the memory, */
VkDeviceMemory owned_memory;
-#endif
/* Set when bound */
- struct tu_bo *bo;
- uint64_t iova;
-
- uint32_t lrz_height;
- uint32_t lrz_pitch;
- uint32_t lrz_offset;
-
- bool shareable;
+ const struct tu_bo *bo;
+ VkDeviceSize bo_offset;
};
+unsigned
+tu_image_queue_family_mask(const struct tu_image *image,
+ uint32_t family,
+ uint32_t queue_family);
+
static inline uint32_t
tu_get_layerCount(const struct tu_image *image,
const VkImageSubresourceRange *range)
@@ -1664,108 +1251,99 @@ tu_get_levelCount(const struct tu_image *image,
: range->levelCount;
}
-enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
-
-uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
-
-enum pipe_format tu_format_for_aspect(enum pipe_format format,
- VkImageAspectFlags aspect_mask);
-
struct tu_image_view
{
- struct vk_object_base base;
-
struct tu_image *image; /**< VkImageViewCreateInfo::image */
- struct fdl6_view view;
+ VkImageViewType type;
+ VkImageAspectFlags aspect_mask;
+ VkFormat vk_format;
+ uint32_t base_layer;
+ uint32_t layer_count;
+ uint32_t base_mip;
+ uint32_t level_count;
+ VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */
- /* for d32s8 separate depth */
- uint64_t depth_base_addr;
- uint32_t depth_layer_size;
- uint32_t depth_PITCH;
+ uint32_t descriptor[16];
- /* for d32s8 separate stencil */
- uint64_t stencil_base_addr;
- uint32_t stencil_layer_size;
- uint32_t stencil_PITCH;
+ /* Descriptor for use as a storage image as opposed to a sampled image.
+ * This has a few differences for cube maps (e.g. type).
+ */
+ uint32_t storage_descriptor[16];
};
-struct tu_sampler_ycbcr_conversion {
- struct vk_object_base base;
-
- VkFormat format;
- VkSamplerYcbcrModelConversion ycbcr_model;
- VkSamplerYcbcrRange ycbcr_range;
- VkComponentMapping components;
- VkChromaLocation chroma_offsets[2];
- VkFilter chroma_filter;
+struct tu_sampler
+{
};
-struct tu_sampler {
- struct vk_object_base base;
-
- uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
- struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
+struct tu_image_create_info
+{
+ const VkImageCreateInfo *vk_info;
+ bool scanout;
+ bool no_metadata_planes;
};
-void
-tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
-
-void
-tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
-
-void
-tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
-
-void
-tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
-
-void
-tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
-
-#define tu_image_view_stencil(iview, x) \
- ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
-
-#define tu_image_view_depth(iview, x) \
- ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT))
-
VkResult
-tu_gralloc_info(struct tu_device *device,
- const VkNativeBufferANDROID *gralloc_info,
- int *dma_buf,
- uint64_t *modifier);
+tu_image_create(VkDevice _device,
+ const struct tu_image_create_info *info,
+ const VkAllocationCallbacks *alloc,
+ VkImage *pImage);
VkResult
-tu_import_memory_from_gralloc_handle(VkDevice device_h,
- int dma_buf,
- const VkAllocationCallbacks *alloc,
- VkImage image_h);
+tu_image_from_gralloc(VkDevice device_h,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info,
+ const VkAllocationCallbacks *alloc,
+ VkImage *out_image_h);
void
-tu_image_view_init(struct tu_image_view *iview,
- const VkImageViewCreateInfo *pCreateInfo,
- bool limited_z24s8);
-
-bool
-tiling_possible(VkFormat format);
-
-bool
-ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
- const struct fd_dev_info *info, VkSampleCountFlagBits samples);
+tu_image_view_init(struct tu_image_view *view,
+ struct tu_device *device,
+ const VkImageViewCreateInfo *pCreateInfo);
struct tu_buffer_view
{
- struct vk_object_base base;
-
- uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
-
- struct tu_buffer *buffer;
+ VkFormat vk_format;
+ uint64_t range; /**< VkBufferViewCreateInfo::range */
+ uint32_t state[4];
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
struct tu_device *device,
const VkBufferViewCreateInfo *pCreateInfo);
+static inline struct VkExtent3D
+tu_sanitize_image_extent(const VkImageType imageType,
+ const struct VkExtent3D imageExtent)
+{
+ switch (imageType) {
+ case VK_IMAGE_TYPE_1D:
+ return (VkExtent3D) { imageExtent.width, 1, 1 };
+ case VK_IMAGE_TYPE_2D:
+ return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 };
+ case VK_IMAGE_TYPE_3D:
+ return imageExtent;
+ default:
+ unreachable("invalid image type");
+ }
+}
+
+static inline struct VkOffset3D
+tu_sanitize_image_offset(const VkImageType imageType,
+ const struct VkOffset3D imageOffset)
+{
+ switch (imageType) {
+ case VK_IMAGE_TYPE_1D:
+ return (VkOffset3D) { imageOffset.x, 0, 0 };
+ case VK_IMAGE_TYPE_2D:
+ return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 };
+ case VK_IMAGE_TYPE_3D:
+ return imageOffset;
+ default:
+ unreachable("invalid image type");
+ }
+}
+
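/* Illustrative sketch, not part of the import: how a caller might use the two
 * helpers above to normalize a copy region. example_normalize_region() is a
 * hypothetical name; only the tu_sanitize_* helpers are real. For
 * VK_IMAGE_TYPE_1D the offset collapses to {x,0,0} and the extent to {w,1,1};
 * for VK_IMAGE_TYPE_2D only the z/depth components collapse. */
static inline void
example_normalize_region(VkImageType type, VkOffset3D *offset, VkExtent3D *extent)
{
   *offset = tu_sanitize_image_offset(type, *offset);
   *extent = tu_sanitize_image_extent(type, *extent);
}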
struct tu_attachment_info
{
struct tu_image_view *attachment;
@@ -1773,146 +1351,100 @@ struct tu_attachment_info
struct tu_framebuffer
{
- struct vk_object_base base;
-
uint32_t width;
uint32_t height;
uint32_t layers;
- /* size of the first tile */
- VkExtent2D tile0;
- /* number of tiles */
- VkExtent2D tile_count;
-
- /* size of the first VSC pipe */
- VkExtent2D pipe0;
- /* number of VSC pipes */
- VkExtent2D pipe_count;
-
- /* pipe register values */
- uint32_t pipe_config[MAX_VSC_PIPES];
- uint32_t pipe_sizes[MAX_VSC_PIPES];
-
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
};
-void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
- const struct tu_device *device,
- const struct tu_render_pass *pass);
-
-struct tu_subpass_barrier {
+struct tu_subpass_barrier
+{
VkPipelineStageFlags src_stage_mask;
- VkPipelineStageFlags dst_stage_mask;
VkAccessFlags src_access_mask;
VkAccessFlags dst_access_mask;
- bool incoherent_ccu_color, incoherent_ccu_depth;
};
+void
+tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
+ const struct tu_subpass_barrier *barrier);
+
struct tu_subpass_attachment
{
uint32_t attachment;
-
- /* For input attachments, true if it needs to be patched to refer to GMEM
- * in GMEM mode. This is false if it hasn't already been written as an
- * attachment.
- */
- bool patch_input_gmem;
+ VkImageLayout layout;
};
struct tu_subpass
{
uint32_t input_count;
uint32_t color_count;
- uint32_t resolve_count;
- bool resolve_depth_stencil;
-
- bool feedback_loop_color;
- bool feedback_loop_ds;
-
- /* True if we must invalidate UCHE thanks to a feedback loop. */
- bool feedback_invalidate;
-
- /* In other words - framebuffer fetch support */
- bool raster_order_attachment_access;
-
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
- VkSampleCountFlagBits samples;
-
- uint32_t srgb_cntl;
- uint32_t multiview_mask;
+ /** Subpass has at least one resolve attachment */
+ bool has_resolve;
struct tu_subpass_barrier start_barrier;
+
+ uint32_t view_mask;
+ VkSampleCountFlagBits max_sample_count;
};
struct tu_render_pass_attachment
{
VkFormat format;
uint32_t samples;
- uint32_t cpp;
- VkImageAspectFlags clear_mask;
- uint32_t clear_views;
- bool load;
- bool store;
- int32_t gmem_offset;
- /* for D32S8 separate stencil: */
- bool load_stencil;
- bool store_stencil;
- int32_t gmem_offset_stencil;
+ VkAttachmentLoadOp load_op;
+ VkAttachmentLoadOp stencil_load_op;
+ VkImageLayout initial_layout;
+ VkImageLayout final_layout;
+ uint32_t view_mask;
};
struct tu_render_pass
{
- struct vk_object_base base;
-
uint32_t attachment_count;
uint32_t subpass_count;
- uint32_t gmem_pixels;
- uint32_t tile_align_w;
struct tu_subpass_attachment *subpass_attachments;
struct tu_render_pass_attachment *attachments;
struct tu_subpass_barrier end_barrier;
struct tu_subpass subpasses[0];
};
-#define PERF_CNTRS_REG 4
-
-struct tu_perf_query_data
-{
- uint32_t gid; /* group-id */
- uint32_t cid; /* countable-id within the group */
- uint32_t cntr_reg; /* counter register within the group */
- uint32_t pass; /* pass index that countables can be requested */
- uint32_t app_idx; /* index provided by apps */
-};
+VkResult
+tu_device_init_meta(struct tu_device *device);
+void
+tu_device_finish_meta(struct tu_device *device);
struct tu_query_pool
{
- struct vk_object_base base;
-
- VkQueryType type;
uint32_t stride;
+ uint32_t availability_offset;
uint64_t size;
- uint32_t pipeline_statistics;
- struct tu_bo *bo;
+ char *ptr;
+ VkQueryType type;
+ uint32_t pipeline_stats_mask;
+};
- /* For performance query */
- const struct fd_perfcntr_group *perf_group;
- uint32_t perf_group_count;
- uint32_t counter_index_count;
- struct tu_perf_query_data perf_query_data[0];
+struct tu_semaphore
+{
+ uint32_t syncobj;
+ uint32_t temp_syncobj;
};
-uint32_t
-tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
+void
+tu_set_descriptor_set(struct tu_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ struct tu_descriptor_set *set,
+ unsigned idx);
void
-tu_update_descriptor_sets(const struct tu_device *device,
+tu_update_descriptor_sets(struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
VkDescriptorSet overrideSet,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
@@ -1921,24 +1453,25 @@ tu_update_descriptor_sets(const struct tu_device *device,
void
tu_update_descriptor_set_with_template(
- const struct tu_device *device,
+ struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData);
-VkResult
-tu_physical_device_init(struct tu_physical_device *device,
- struct tu_instance *instance);
-VkResult
-tu_enumerate_devices(struct tu_instance *instance);
+void
+tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint pipelineBindPoint,
+ VkPipelineLayout _layout,
+ uint32_t set,
+ uint32_t descriptorWriteCount,
+ const VkWriteDescriptorSet *pDescriptorWrites);
int
-tu_device_get_gpu_timestamp(struct tu_device *dev,
- uint64_t *ts);
+tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id);
int
-tu_device_get_suspend_count(struct tu_device *dev,
- uint64_t *suspend_count);
+tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
@@ -1948,116 +1481,76 @@ tu_drm_submitqueue_new(const struct tu_device *dev,
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
+uint32_t
+tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags);
+uint32_t
+tu_gem_import_dmabuf(const struct tu_device *dev,
+ int prime_fd,
+ uint64_t size);
int
-tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2);
-
-int
-tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);
-
-VkResult
-tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);
-
+tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle);
void
-tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
- void *ts_from, uint32_t from_offset,
- void *ts_to, uint32_t to_offset,
- uint32_t count);
-
-
-VkResult
-tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
- struct u_trace **trace_copy);
-
-/* If we copy trace and timestamps we will have to free them. */
-struct tu_u_trace_cmd_data
-{
- struct tu_cs *timestamp_copy_cs;
- struct u_trace *trace;
-};
-
-/* Data necessary to retrieve timestamps and clean all
- * associated resources afterwards.
- */
-struct tu_u_trace_submission_data
-{
- uint32_t submission_id;
- /* We have to know when timestamps are available,
- * this sync object indicates it.
- */
- struct tu_u_trace_syncobj *syncobj;
-
- uint32_t cmd_buffer_count;
- uint32_t last_buffer_with_tracepoints;
- struct tu_u_trace_cmd_data *cmd_trace_data;
-};
-
-VkResult
-tu_u_trace_submission_data_create(
- struct tu_device *device,
- struct tu_cmd_buffer **cmd_buffers,
- uint32_t cmd_buffer_count,
- struct tu_u_trace_submission_data **submission_data);
-
-void
-tu_u_trace_submission_data_finish(
- struct tu_device *device,
- struct tu_u_trace_submission_data *submission_data);
+tu_gem_close(const struct tu_device *dev, uint32_t gem_handle);
+uint64_t
+tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle);
+uint64_t
+tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle);
+
+#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \
+ \
+ static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
+ { \
+ return (struct __tu_type *) _handle; \
+ } \
+ \
+ static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \
+ { \
+ return (__VkType) _obj; \
+ }
+
+#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \
+ \
+ static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
+ { \
+ return (struct __tu_type *) (uintptr_t) _handle; \
+ } \
+ \
+ static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \
+ { \
+ return (__VkType)(uintptr_t) _obj; \
+ }
#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
- VK_FROM_HANDLE(__tu_type, __name, __handle)
-
-VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
- VK_OBJECT_TYPE_COMMAND_BUFFER)
-VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
-VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
- VK_OBJECT_TYPE_INSTANCE)
-VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
- VK_OBJECT_TYPE_PHYSICAL_DEVICE)
-VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
- VK_OBJECT_TYPE_COMMAND_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
- VK_OBJECT_TYPE_BUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
- VK_OBJECT_TYPE_BUFFER_VIEW)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
- VK_OBJECT_TYPE_DESCRIPTOR_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
- VK_OBJECT_TYPE_DESCRIPTOR_SET)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
- VkDescriptorSetLayout,
- VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
- VkDescriptorUpdateTemplate,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
- VK_OBJECT_TYPE_DEVICE_MEMORY)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
- VK_OBJECT_TYPE_FRAMEBUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
- VK_OBJECT_TYPE_IMAGE_VIEW);
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
- VK_OBJECT_TYPE_PIPELINE_CACHE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
- VK_OBJECT_TYPE_PIPELINE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
- VK_OBJECT_TYPE_PIPELINE_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
- VK_OBJECT_TYPE_QUERY_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
- VK_OBJECT_TYPE_RENDER_PASS)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
- VK_OBJECT_TYPE_SAMPLER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
-
-/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
-#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
-
-void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
+ struct __tu_type *__name = __tu_type##_from_handle(__handle)
+
+TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer)
+TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice)
+TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance)
+TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice)
+TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue)
+
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout,
+ VkDescriptorSetLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template,
+ VkDescriptorUpdateTemplate)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_fence, VkFence)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView);
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore)
#endif /* TU_PRIVATE_H */
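/* Illustrative sketch, not part of the import: what the cast macros above
 * generate. TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice) emits
 * tu_device_from_handle()/tu_device_to_handle(), and TU_FROM_HANDLE() declares
 * the unwrapped pointer in one line. example_entry_point() is a hypothetical
 * caller, not a real driver entry point. */
VkResult
example_entry_point(VkDevice _device)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   /* equivalent to: struct tu_device *device = tu_device_from_handle(_device); */
   (void) device;
   return VK_SUCCESS;
}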
diff --git a/lib/mesa/src/freedreno/vulkan/vk_format.h b/lib/mesa/src/freedreno/vulkan/vk_format.h
index c335a12ae..4e13bc9c0 100644
--- a/lib/mesa/src/freedreno/vulkan/vk_format.h
+++ b/lib/mesa/src/freedreno/vulkan/vk_format.h
@@ -3,7 +3,7 @@
* Copyright © 2016 Bas Nieuwenhuizen
*
* Based on u_format.h which is:
- * Copyright 2009-2010 VMware, Inc.
+ * Copyright 2009-2010 Vmware, Inc.
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
@@ -29,29 +29,549 @@
#include <assert.h>
#include <util/macros.h>
-#include <util/format/u_format.h>
-#include <vulkan/util/vk_format.h>
#include <vulkan/vulkan.h>
+enum vk_format_layout
+{
+ /**
+ * Formats with vk_format_block::width == vk_format_block::height == 1
+ * that can be described as an ordinary data structure.
+ */
+ VK_FORMAT_LAYOUT_PLAIN = 0,
+
+ /**
+ * Formats with sub-sampled channels.
+ *
+ * This is for formats like YVYU where there is less than one sample per
+ * pixel.
+ */
+ VK_FORMAT_LAYOUT_SUBSAMPLED = 3,
+
+ /**
+ * S3 Texture Compression formats.
+ */
+ VK_FORMAT_LAYOUT_S3TC = 4,
+
+ /**
+ * Red-Green Texture Compression formats.
+ */
+ VK_FORMAT_LAYOUT_RGTC = 5,
+
+ /**
+ * Ericsson Texture Compression
+ */
+ VK_FORMAT_LAYOUT_ETC = 6,
+
+ /**
+ * BC6/7 Texture Compression
+ */
+ VK_FORMAT_LAYOUT_BPTC = 7,
+
+ /**
+ * ASTC
+ */
+ VK_FORMAT_LAYOUT_ASTC = 8,
+
+ /**
+ * Everything else that doesn't fit in any of the above layouts.
+ */
+ VK_FORMAT_LAYOUT_OTHER = 9
+};
+
+struct vk_format_block
+{
+ /** Block width in pixels */
+ unsigned width;
+
+ /** Block height in pixels */
+ unsigned height;
+
+ /** Block size in bits */
+ unsigned bits;
+};
+
+enum vk_format_type
+{
+ VK_FORMAT_TYPE_VOID = 0,
+ VK_FORMAT_TYPE_UNSIGNED = 1,
+ VK_FORMAT_TYPE_SIGNED = 2,
+ VK_FORMAT_TYPE_FIXED = 3,
+ VK_FORMAT_TYPE_FLOAT = 4
+};
+
+enum vk_format_colorspace
+{
+ VK_FORMAT_COLORSPACE_RGB = 0,
+ VK_FORMAT_COLORSPACE_SRGB = 1,
+ VK_FORMAT_COLORSPACE_YUV = 2,
+ VK_FORMAT_COLORSPACE_ZS = 3
+};
+
+struct vk_format_channel_description
+{
+ unsigned type : 5;
+ unsigned normalized : 1;
+ unsigned pure_integer : 1;
+ unsigned scaled : 1;
+ unsigned size : 8;
+ unsigned shift : 16;
+};
+
+struct vk_format_description
+{
+ VkFormat format;
+ const char *name;
+ const char *short_name;
+
+ struct vk_format_block block;
+ enum vk_format_layout layout;
+
+ unsigned nr_channels : 3;
+ unsigned is_array : 1;
+ unsigned is_bitmask : 1;
+ unsigned is_mixed : 1;
+
+ struct vk_format_channel_description channel[4];
+
+ unsigned char swizzle[4];
+
+ enum vk_format_colorspace colorspace;
+};
+
+extern const struct vk_format_description vk_format_description_table[];
+
+const struct vk_format_description *
+vk_format_description(VkFormat format);
+
+/**
+ * Return total bits needed for the pixel format per block.
+ */
+static inline unsigned
+vk_format_get_blocksizebits(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 0;
+ }
+
+ return desc->block.bits;
+}
+
+/**
+ * Return bytes per block (not pixel) for the given format.
+ */
+static inline unsigned
+vk_format_get_blocksize(VkFormat format)
+{
+ unsigned bits = vk_format_get_blocksizebits(format);
+ unsigned bytes = bits / 8;
+
+ assert(bits % 8 == 0);
+ assert(bytes > 0);
+ if (bytes == 0) {
+ bytes = 1;
+ }
+
+ return bytes;
+}
+
+static inline unsigned
+vk_format_get_blockwidth(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 1;
+ }
+
+ return desc->block.width;
+}
+
+static inline unsigned
+vk_format_get_blockheight(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 1;
+ }
+
+ return desc->block.height;
+}
+
+static inline unsigned
+vk_format_get_block_count_width(VkFormat format, unsigned width)
+{
+ unsigned blockwidth = vk_format_get_blockwidth(format);
+ return (width + blockwidth - 1) / blockwidth;
+}
+
+static inline unsigned
+vk_format_get_block_count_height(VkFormat format, unsigned height)
+{
+ unsigned blockheight = vk_format_get_blockheight(format);
+ return (height + blockheight - 1) / blockheight;
+}
+
+static inline unsigned
+vk_format_get_block_count(VkFormat format, unsigned width, unsigned height)
+{
+ return vk_format_get_block_count_width(format, width) *
+ vk_format_get_block_count_height(format, height);
+}
+
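/* Illustrative sketch, not part of the import: sizing one mip level with the
 * block helpers above. example_level_size_bytes() is a hypothetical name; for
 * a block-compressed format the dimensions round up to whole blocks first,
 * e.g. BC1 at 64x64 gives 16x16 blocks of 8 bytes each. */
static inline unsigned
example_level_size_bytes(VkFormat format, unsigned width, unsigned height)
{
   return vk_format_get_block_count(format, width, height) *
          vk_format_get_blocksize(format);
}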
+/**
+ * Return the index of the first non-void channel
+ * -1 if no non-void channels
+ */
+static inline int
+vk_format_get_first_non_void_channel(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (desc->channel[i].type != VK_FORMAT_TYPE_VOID)
+ break;
+
+ if (i == 4)
+ return -1;
+
+ return i;
+}
+
+enum vk_swizzle
+{
+ VK_SWIZZLE_X,
+ VK_SWIZZLE_Y,
+ VK_SWIZZLE_Z,
+ VK_SWIZZLE_W,
+ VK_SWIZZLE_0,
+ VK_SWIZZLE_1,
+ VK_SWIZZLE_NONE,
+ VK_SWIZZLE_MAX, /**< Number of enums counter (must be last) */
+};
+
+static inline VkImageAspectFlags
+vk_format_aspects(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_UNDEFINED:
+ return 0;
+
+ case VK_FORMAT_S8_UINT:
+ return VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ case VK_FORMAT_D16_UNORM_S8_UINT:
+ case VK_FORMAT_D24_UNORM_S8_UINT:
+ case VK_FORMAT_D32_SFLOAT_S8_UINT:
+ return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ case VK_FORMAT_D16_UNORM:
+ case VK_FORMAT_X8_D24_UNORM_PACK32:
+ case VK_FORMAT_D32_SFLOAT:
+ return VK_IMAGE_ASPECT_DEPTH_BIT;
+
+ default:
+ return VK_IMAGE_ASPECT_COLOR_BIT;
+ }
+}
+
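/* Illustrative sketch, not part of the import: a typical use of
 * vk_format_aspects() above when checking a requested aspect mask against an
 * image's format. example_aspects_valid() is a hypothetical name. */
static inline bool
example_aspects_valid(VkFormat format, VkImageAspectFlags requested)
{
   /* e.g. VK_FORMAT_D24_UNORM_S8_UINT allows DEPTH and/or STENCIL,
    * VK_FORMAT_S8_UINT only STENCIL, color formats only COLOR. */
   return (requested & ~vk_format_aspects(format)) == 0;
}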
+static inline enum vk_swizzle
+tu_swizzle_conv(VkComponentSwizzle component,
+ const unsigned char chan[4],
+ VkComponentSwizzle vk_swiz)
+{
+ int x;
+
+ if (vk_swiz == VK_COMPONENT_SWIZZLE_IDENTITY)
+ vk_swiz = component;
+ switch (vk_swiz) {
+ case VK_COMPONENT_SWIZZLE_ZERO:
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_ONE:
+ return VK_SWIZZLE_1;
+ case VK_COMPONENT_SWIZZLE_R:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 0)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_G:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 1)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_B:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 2)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_A:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 3)
+ return x;
+ return VK_SWIZZLE_1;
+ default:
+ unreachable("Illegal swizzle");
+ }
+}
+
+static inline void
+vk_format_compose_swizzles(const VkComponentMapping *mapping,
+ const unsigned char swz[4],
+ enum vk_swizzle dst[4])
+{
+ dst[0] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_R, swz, mapping->r);
+ dst[1] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_G, swz, mapping->g);
+ dst[2] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_B, swz, mapping->b);
+ dst[3] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_A, swz, mapping->a);
+}
+
+static inline bool
+vk_format_is_compressed(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ switch (desc->layout) {
+ case VK_FORMAT_LAYOUT_S3TC:
+ case VK_FORMAT_LAYOUT_RGTC:
+ case VK_FORMAT_LAYOUT_ETC:
+ case VK_FORMAT_LAYOUT_BPTC:
+ case VK_FORMAT_LAYOUT_ASTC:
+ /* XXX add other formats in the future */
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool
+vk_format_has_depth(const struct vk_format_description *desc)
+{
+ return desc->colorspace == VK_FORMAT_COLORSPACE_ZS &&
+ desc->swizzle[0] != VK_SWIZZLE_NONE;
+}
+
+static inline bool
+vk_format_has_stencil(const struct vk_format_description *desc)
+{
+ return desc->colorspace == VK_FORMAT_COLORSPACE_ZS &&
+ desc->swizzle[1] != VK_SWIZZLE_NONE;
+}
+
+static inline bool
+vk_format_is_depth_or_stencil(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_depth(desc) || vk_format_has_stencil(desc);
+}
+
+static inline bool
+vk_format_is_depth(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_depth(desc);
+}
+
+static inline bool
+vk_format_is_stencil(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_stencil(desc);
+}
+
+static inline bool
+vk_format_is_color(VkFormat format)
+{
+ return !vk_format_is_depth_or_stencil(format);
+}
+
+static inline bool
+vk_format_has_alpha(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ return (desc->colorspace == VK_FORMAT_COLORSPACE_RGB ||
+ desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) &&
+ desc->swizzle[3] != VK_SWIZZLE_1;
+}
+
+static inline VkFormat
+vk_format_depth_only(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_D16_UNORM_S8_UINT:
+ return VK_FORMAT_D16_UNORM;
+ case VK_FORMAT_D24_UNORM_S8_UINT:
+ return VK_FORMAT_X8_D24_UNORM_PACK32;
+ case VK_FORMAT_D32_SFLOAT_S8_UINT:
+ return VK_FORMAT_D32_SFLOAT;
+ default:
+ return format;
+ }
+}
+
+static inline bool
+vk_format_is_int(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ int channel = vk_format_get_first_non_void_channel(format);
+
+ return channel >= 0 && desc->channel[channel].pure_integer;
+}
+
+static inline bool
+vk_format_is_srgb(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ return desc->colorspace == VK_FORMAT_COLORSPACE_SRGB;
+}
+
+static inline VkFormat
+vk_format_no_srgb(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_R8_SRGB:
+ return VK_FORMAT_R8_UNORM;
+ case VK_FORMAT_R8G8_SRGB:
+ return VK_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8_SRGB:
+ return VK_FORMAT_R8G8B8_UNORM;
+ case VK_FORMAT_B8G8R8_SRGB:
+ return VK_FORMAT_B8G8R8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SRGB:
+ return VK_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_B8G8R8A8_SRGB:
+ return VK_FORMAT_B8G8R8A8_UNORM;
+ case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+ return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+ case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
+ return VK_FORMAT_BC1_RGB_UNORM_BLOCK;
+ case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
+ return VK_FORMAT_BC1_RGBA_UNORM_BLOCK;
+ case VK_FORMAT_BC2_SRGB_BLOCK:
+ return VK_FORMAT_BC2_UNORM_BLOCK;
+ case VK_FORMAT_BC3_SRGB_BLOCK:
+ return VK_FORMAT_BC3_UNORM_BLOCK;
+ case VK_FORMAT_BC7_SRGB_BLOCK:
+ return VK_FORMAT_BC7_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK;
+ default:
+ assert(!vk_format_is_srgb(format));
+ return format;
+ }
+}
+
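/* Illustrative sketch, not part of the import: using vk_format_is_srgb() and
 * vk_format_no_srgb() above to obtain the linear counterpart of an sRGB
 * format. example_linear_format() is a hypothetical name. */
static inline VkFormat
example_linear_format(VkFormat format)
{
   return vk_format_is_srgb(format) ? vk_format_no_srgb(format) : format;
}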
+static inline VkFormat
+vk_format_stencil_only(VkFormat format)
+{
+ return VK_FORMAT_S8_UINT;
+}
+
static inline unsigned
vk_format_get_component_bits(VkFormat format,
- enum util_format_colorspace colorspace,
+ enum vk_format_colorspace colorspace,
unsigned component)
{
+ const struct vk_format_description *desc = vk_format_description(format);
+ enum vk_format_colorspace desc_colorspace;
+
+ assert(format);
+ if (!format) {
+ return 0;
+ }
+
+ assert(component < 4);
+
+ /* Treat RGB and SRGB as equivalent. */
+ if (colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+ colorspace = VK_FORMAT_COLORSPACE_RGB;
+ }
+ if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+ desc_colorspace = VK_FORMAT_COLORSPACE_RGB;
+ } else {
+ desc_colorspace = desc->colorspace;
+ }
+
+ if (desc_colorspace != colorspace) {
+ return 0;
+ }
+
+ switch (desc->swizzle[component]) {
+ case VK_SWIZZLE_X:
+ return desc->channel[0].size;
+ case VK_SWIZZLE_Y:
+ return desc->channel[1].size;
+ case VK_SWIZZLE_Z:
+ return desc->channel[2].size;
+ case VK_SWIZZLE_W:
+ return desc->channel[3].size;
+ default:
+ return 0;
+ }
+}
+
+static inline VkFormat
+vk_to_non_srgb_format(VkFormat format)
+{
switch (format) {
- case VK_FORMAT_G8B8G8R8_422_UNORM:
- case VK_FORMAT_B8G8R8G8_422_UNORM:
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- /* util_format_get_component_bits doesn't return what we want */
- return 8;
+ case VK_FORMAT_R8_SRGB:
+ return VK_FORMAT_R8_UNORM;
+ case VK_FORMAT_R8G8_SRGB:
+ return VK_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8_SRGB:
+ return VK_FORMAT_R8G8B8_UNORM;
+ case VK_FORMAT_B8G8R8_SRGB:
+ return VK_FORMAT_B8G8R8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SRGB:
+ return VK_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_B8G8R8A8_SRGB:
+ return VK_FORMAT_B8G8R8A8_UNORM;
+ case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+ return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
default:
- break;
+ return format;
}
+}
- return util_format_get_component_bits(vk_format_to_pipe_format(format),
- colorspace, component);
+static inline unsigned
+vk_format_get_nr_components(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ return desc->nr_channels;
}
#endif /* VK_FORMAT_H */
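/* Illustrative sketch, not part of the import: combining the depth/stencil
 * helpers defined above, e.g. to pick the format for a depth-only view of a
 * packed depth/stencil image. example_depth_view_format() is a hypothetical
 * name. */
static inline VkFormat
example_depth_view_format(VkFormat image_format)
{
   if (vk_format_is_depth(image_format))
      return vk_format_depth_only(image_format); /* D32_SFLOAT_S8_UINT -> D32_SFLOAT */
   return image_format;
}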
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
index 0a70c6881..0ad6087e5 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
@@ -24,11 +24,9 @@
namespace nv50_ir {
-ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info,
- nv50_ir_prog_info_out *info_out)
+ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info)
: BuildUtil(prog),
- info(info),
- info_out(info_out) {}
+ info(info) {}
ConverterCommon::Subroutine *
ConverterCommon::getSubroutine(unsigned ip)
@@ -84,7 +82,7 @@ ConverterCommon::handleUserClipPlanes()
int n, i, c;
for (c = 0; c < 4; ++c) {
- for (i = 0; i < info_out->io.genUserClip; ++i) {
+ for (i = 0; i < info->io.genUserClip; ++i) {
Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
@@ -95,13 +93,13 @@ ConverterCommon::handleUserClipPlanes()
}
}
- const int first = info_out->numOutputs - (info_out->io.genUserClip + 3) / 4;
+ const int first = info->numOutputs - (info->io.genUserClip + 3) / 4;
- for (i = 0; i < info_out->io.genUserClip; ++i) {
+ for (i = 0; i < info->io.genUserClip; ++i) {
n = i / 4 + first;
c = i % 4;
Symbol *sym =
- mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info_out->out[n].slot[c] * 4);
+ mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4);
mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]);
}
}
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
index a144ca23a..e44eea86a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
@@ -28,7 +28,7 @@ namespace nv50_ir {
class ConverterCommon : public BuildUtil
{
public:
- ConverterCommon(Program *, nv50_ir_prog_info *, nv50_ir_prog_info_out *);
+ ConverterCommon(Program *, nv50_ir_prog_info *);
protected:
struct Subroutine
{
@@ -50,7 +50,6 @@ protected:
} sub;
struct nv50_ir_prog_info *info;
- struct nv50_ir_prog_info_out *info_out;
Value *fragCoord[4];
Value *clipVtx[4];
Value *outBase; // base address of vertex out patch (for TCP)
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
index 5df3f3168..950923a0d 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -25,13 +25,11 @@
#include "compiler/nir/nir.h"
#include "util/u_debug.h"
-#include "util/u_prim.h"
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_from_common.h"
#include "codegen/nv50_ir_lowering_helper.h"
#include "codegen/nv50_ir_util.h"
-#include "tgsi/tgsi_from_mesa.h"
#if __cplusplus >= 201103L
#include <unordered_map>
@@ -60,32 +58,17 @@ type_size(const struct glsl_type *type, bool bindless)
return glsl_count_attribute_slots(type, false);
}
-static void
-function_temp_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
-{
- assert(glsl_type_is_vector_or_scalar(type));
-
- if (glsl_type_is_scalar(type)) {
- glsl_get_natural_size_align_bytes(type, size, align);
- } else {
- unsigned comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
- unsigned length = glsl_get_vector_elements(type);
-
- *size = comp_size * length;
- *align = 0x10;
- }
-}
-
class Converter : public ConverterCommon
{
public:
- Converter(Program *, nir_shader *, nv50_ir_prog_info *, nv50_ir_prog_info_out *);
+ Converter(Program *, nir_shader *, nv50_ir_prog_info *);
bool run();
private:
typedef std::vector<LValue*> LValues;
typedef unordered_map<unsigned, LValues> NirDefMap;
typedef unordered_map<unsigned, nir_load_const_instr*> ImmediateMap;
+ typedef unordered_map<unsigned, uint32_t> NirArrayLMemOffsets;
typedef unordered_map<unsigned, BasicBlock*> NirBlockMap;
CacheMode convert(enum gl_access_qualifier);
@@ -98,6 +81,8 @@ private:
LValues& convert(nir_register *);
LValues& convert(nir_ssa_def *);
+ ImgFormat convertGLImgFormat(GLuint);
+
Value* getSrc(nir_alu_src *, uint8_t component = 0);
Value* getSrc(nir_register *, uint8_t);
Value* getSrc(nir_src *, uint8_t, bool indirect = false);
@@ -137,10 +122,9 @@ private:
DataType getDType(nir_alu_instr *);
DataType getDType(nir_intrinsic_instr *);
+ DataType getDType(nir_intrinsic_instr *, bool isSigned);
DataType getDType(nir_op, uint8_t);
- DataFile getFile(nir_intrinsic_op);
-
std::vector<DataType> getSTypes(nir_alu_instr *);
DataType getSType(nir_src &, bool isFloat, bool isSigned);
@@ -160,6 +144,7 @@ private:
bool visit(nir_alu_instr *);
bool visit(nir_block *);
bool visit(nir_cf_node *);
+ bool visit(nir_deref_instr *);
bool visit(nir_function *);
bool visit(nir_if *);
bool visit(nir_instr *);
@@ -171,16 +156,21 @@ private:
bool visit(nir_tex_instr *);
// tex stuff
+ Value* applyProjection(Value *src, Value *proj);
unsigned int getNIRArgCount(TexInstruction::Target&);
+ // image stuff
+ uint16_t handleDeref(nir_deref_instr *, Value * & indirect, const nir_variable * &);
+ CacheMode getCacheModeFromVar(const nir_variable *);
+
nir_shader *nir;
NirDefMap ssaDefs;
NirDefMap regDefs;
ImmediateMap immediates;
+ NirArrayLMemOffsets regToLmemOffset;
NirBlockMap blocks;
unsigned int curLoopDepth;
- unsigned int curIfDepth;
BasicBlock *exit;
Value *zero;
@@ -195,14 +185,10 @@ private:
};
};
-Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info,
- nv50_ir_prog_info_out *info_out)
- : ConverterCommon(prog, info, info_out),
+Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info)
+ : ConverterCommon(prog, info),
nir(nir),
curLoopDepth(0),
- curIfDepth(0),
- exit(NULL),
- immInsertPos(NULL),
clipVertexOutput(-1)
{
zero = mkImm((uint32_t)0);
@@ -274,33 +260,29 @@ Converter::getDType(nir_alu_instr *insn)
DataType
Converter::getDType(nir_intrinsic_instr *insn)
{
- bool isFloat, isSigned;
+ bool isSigned;
switch (insn->intrinsic) {
- case nir_intrinsic_bindless_image_atomic_fadd:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_shared_atomic_fadd:
- case nir_intrinsic_ssbo_atomic_fadd:
- isFloat = true;
- isSigned = false;
- break;
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_imin:
- isFloat = false;
isSigned = true;
break;
default:
- isFloat = false;
isSigned = false;
break;
}
+ return getDType(insn, isSigned);
+}
+
+DataType
+Converter::getDType(nir_intrinsic_instr *insn, bool isSigned)
+{
if (insn->dest.is_ssa)
- return typeOfSize(insn->dest.ssa.bit_size / 8, isFloat, isSigned);
+ return typeOfSize(insn->dest.ssa.bit_size / 8, false, isSigned);
else
- return typeOfSize(insn->dest.reg.reg->bit_size / 8, isFloat, isSigned);
+ return typeOfSize(insn->dest.reg.reg->bit_size / 8, false, isSigned);
}
DataType
@@ -358,29 +340,6 @@ Converter::getSType(nir_src &src, bool isFloat, bool isSigned)
return ty;
}
-DataFile
-Converter::getFile(nir_intrinsic_op op)
-{
- switch (op) {
- case nir_intrinsic_load_global:
- case nir_intrinsic_store_global:
- case nir_intrinsic_load_global_constant:
- return FILE_MEMORY_GLOBAL;
- case nir_intrinsic_load_scratch:
- case nir_intrinsic_store_scratch:
- return FILE_MEMORY_LOCAL;
- case nir_intrinsic_load_shared:
- case nir_intrinsic_store_shared:
- return FILE_MEMORY_SHARED;
- case nir_intrinsic_load_kernel_input:
- return FILE_SHADER_INPUT;
- default:
- ERROR("couldn't get DateFile for op %s\n", nir_intrinsic_infos[op].name);
- assert(false);
- }
- return FILE_NULL;
-}
-
operation
Converter::getOperation(nir_op op)
{
@@ -480,7 +439,7 @@ Converter::getOperation(nir_op op)
case nir_op_flt32:
case nir_op_ilt32:
case nir_op_ult32:
- case nir_op_fneu32:
+ case nir_op_fne32:
case nir_op_ine32:
return OP_SET;
case nir_op_ishl:
@@ -492,6 +451,9 @@ Converter::getOperation(nir_op op)
return OP_SIN;
case nir_op_fsqrt:
return OP_SQRT;
+ case nir_op_fsub:
+ case nir_op_isub:
+ return OP_SUB;
case nir_op_ftrunc:
return OP_TRUNC;
case nir_op_ixor:
@@ -543,39 +505,43 @@ Converter::getOperation(nir_intrinsic_op op)
return OP_RESTART;
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
- case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_deref_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
+ case nir_intrinsic_image_deref_atomic_xor:
return OP_SUREDP;
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_image_load:
+ case nir_intrinsic_image_deref_load:
return OP_SULDP;
case nir_intrinsic_bindless_image_samples:
case nir_intrinsic_image_samples:
+ case nir_intrinsic_image_deref_samples:
case nir_intrinsic_bindless_image_size:
case nir_intrinsic_image_size:
+ case nir_intrinsic_image_deref_size:
return OP_SUQ;
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_image_store:
+ case nir_intrinsic_image_deref_store:
return OP_SUSTP;
default:
ERROR("couldn't get operation for nir_intrinsic_op %u\n", op);
@@ -603,10 +569,6 @@ Converter::getSubOp(nir_op op)
case nir_op_imul_high:
case nir_op_umul_high:
return NV50_IR_SUBOP_MUL_HIGH;
- case nir_op_ishl:
- case nir_op_ishr:
- case nir_op_ushr:
- return NV50_IR_SUBOP_SHIFT_WRAP;
default:
return 0;
}
@@ -617,78 +579,61 @@ Converter::getSubOp(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_global_atomic_add:
case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_ssbo_atomic_add:
return NV50_IR_SUBOP_ATOM_ADD;
- case nir_intrinsic_bindless_image_atomic_fadd:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_shared_atomic_fadd:
- case nir_intrinsic_ssbo_atomic_fadd:
- return NV50_IR_SUBOP_ATOM_ADD;
case nir_intrinsic_bindless_image_atomic_and:
- case nir_intrinsic_global_atomic_and:
case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_ssbo_atomic_and:
return NV50_IR_SUBOP_ATOM_AND;
case nir_intrinsic_bindless_image_atomic_comp_swap:
- case nir_intrinsic_global_atomic_comp_swap:
case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_ssbo_atomic_comp_swap:
return NV50_IR_SUBOP_ATOM_CAS;
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_global_atomic_exchange:
case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_ssbo_atomic_exchange:
return NV50_IR_SUBOP_ATOM_EXCH;
case nir_intrinsic_bindless_image_atomic_or:
- case nir_intrinsic_global_atomic_or:
case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_ssbo_atomic_or:
return NV50_IR_SUBOP_ATOM_OR;
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_deref_atomic_max:
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_umax:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_umax:
return NV50_IR_SUBOP_ATOM_MAX;
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_min:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_deref_atomic_min:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_shared_atomic_umin:
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_ssbo_atomic_umin:
return NV50_IR_SUBOP_ATOM_MIN;
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_global_atomic_xor:
case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_ssbo_atomic_xor:
return NV50_IR_SUBOP_ATOM_XOR;
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_inc_wrap:
- return NV50_IR_SUBOP_ATOM_INC;
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- return NV50_IR_SUBOP_ATOM_DEC;
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
return NV50_IR_SUBOP_MEMBAR(M, GL);
@@ -721,7 +666,7 @@ Converter::getCondCode(nir_op op)
case nir_op_ilt32:
case nir_op_ult32:
return CC_LT;
- case nir_op_fneu32:
+ case nir_op_fne32:
return CC_NEU;
case nir_op_ine32:
return CC_NE;
@@ -753,8 +698,6 @@ Converter::convert(nir_dest *dest)
Converter::LValues&
Converter::convert(nir_register *reg)
{
- assert(!reg->num_array_elems);
-
NirDefMap::iterator it = regDefs.find(reg->index);
if (it != regDefs.end())
return it->second;
@@ -914,6 +857,256 @@ vert_attrib_to_tgsi_semantic(gl_vert_attrib slot, unsigned *name, unsigned *inde
}
}
+static void
+varying_slot_to_tgsi_semantic(gl_varying_slot slot, unsigned *name, unsigned *index)
+{
+ assert(name && index);
+
+ if (slot >= VARYING_SLOT_TESS_MAX) {
+ ERROR("invalid varying slot %u\n", slot);
+ assert(false);
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_PATCH0) {
+ *name = TGSI_SEMANTIC_PATCH;
+ *index = slot - VARYING_SLOT_PATCH0;
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_VAR0) {
+ *name = TGSI_SEMANTIC_GENERIC;
+ *index = slot - VARYING_SLOT_VAR0;
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7) {
+ *name = TGSI_SEMANTIC_TEXCOORD;
+ *index = slot - VARYING_SLOT_TEX0;
+ return;
+ }
+
+ switch (slot) {
+ case VARYING_SLOT_BFC0:
+ *name = TGSI_SEMANTIC_BCOLOR;
+ *index = 0;
+ break;
+ case VARYING_SLOT_BFC1:
+ *name = TGSI_SEMANTIC_BCOLOR;
+ *index = 1;
+ break;
+ case VARYING_SLOT_CLIP_DIST0:
+ *name = TGSI_SEMANTIC_CLIPDIST;
+ *index = 0;
+ break;
+ case VARYING_SLOT_CLIP_DIST1:
+ *name = TGSI_SEMANTIC_CLIPDIST;
+ *index = 1;
+ break;
+ case VARYING_SLOT_CLIP_VERTEX:
+ *name = TGSI_SEMANTIC_CLIPVERTEX;
+ *index = 0;
+ break;
+ case VARYING_SLOT_COL0:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 0;
+ break;
+ case VARYING_SLOT_COL1:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 1;
+ break;
+ case VARYING_SLOT_EDGE:
+ *name = TGSI_SEMANTIC_EDGEFLAG;
+ *index = 0;
+ break;
+ case VARYING_SLOT_FACE:
+ *name = TGSI_SEMANTIC_FACE;
+ *index = 0;
+ break;
+ case VARYING_SLOT_FOGC:
+ *name = TGSI_SEMANTIC_FOG;
+ *index = 0;
+ break;
+ case VARYING_SLOT_LAYER:
+ *name = TGSI_SEMANTIC_LAYER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PNTC:
+ *name = TGSI_SEMANTIC_PCOORD;
+ *index = 0;
+ break;
+ case VARYING_SLOT_POS:
+ *name = TGSI_SEMANTIC_POSITION;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PRIMITIVE_ID:
+ *name = TGSI_SEMANTIC_PRIMID;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PSIZ:
+ *name = TGSI_SEMANTIC_PSIZE;
+ *index = 0;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_INNER:
+ *name = TGSI_SEMANTIC_TESSINNER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_OUTER:
+ *name = TGSI_SEMANTIC_TESSOUTER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_VIEWPORT:
+ *name = TGSI_SEMANTIC_VIEWPORT_INDEX;
+ *index = 0;
+ break;
+ default:
+ ERROR("unknown varying slot %u\n", slot);
+ assert(false);
+ break;
+ }
+}
+
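// Illustrative sketch, not part of the import: exercising the mapping above on
// two concrete slots. example_varying_mapping() is a hypothetical name; the
// expected results follow directly from the VAR0 and COL1 cases below.
static void example_varying_mapping()
{
   unsigned name, index;
   varying_slot_to_tgsi_semantic(VARYING_SLOT_VAR3, &name, &index);
   // name == TGSI_SEMANTIC_GENERIC, index == 3
   varying_slot_to_tgsi_semantic(VARYING_SLOT_COL1, &name, &index);
   // name == TGSI_SEMANTIC_COLOR, index == 1
}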
+static void
+frag_result_to_tgsi_semantic(unsigned slot, unsigned *name, unsigned *index)
+{
+ if (slot >= FRAG_RESULT_DATA0) {
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = slot - FRAG_RESULT_COLOR - 2; // intentional
+ return;
+ }
+
+ switch (slot) {
+ case FRAG_RESULT_COLOR:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 0;
+ break;
+ case FRAG_RESULT_DEPTH:
+ *name = TGSI_SEMANTIC_POSITION;
+ *index = 0;
+ break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ *name = TGSI_SEMANTIC_SAMPLEMASK;
+ *index = 0;
+ break;
+ default:
+ ERROR("unknown frag result slot %u\n", slot);
+ assert(false);
+ break;
+ }
+}
+
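// Illustrative note, not part of the import: the "intentional" offset in the
// FRAG_RESULT_DATA0 branch above appears to rely on FRAG_RESULT_DATA0 being
// FRAG_RESULT_COLOR + 2 in gl_frag_result, so FRAG_RESULT_DATAn maps to
// TGSI_SEMANTIC_COLOR with index n, matching the index 0 used for
// FRAG_RESULT_COLOR itself.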
+// copy of _mesa_sysval_to_semantic
+static void
+system_val_to_tgsi_semantic(unsigned val, unsigned *name, unsigned *index)
+{
+ *index = 0;
+ switch (val) {
+ // Vertex shader
+ case SYSTEM_VALUE_VERTEX_ID:
+ *name = TGSI_SEMANTIC_VERTEXID;
+ break;
+ case SYSTEM_VALUE_INSTANCE_ID:
+ *name = TGSI_SEMANTIC_INSTANCEID;
+ break;
+ case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+ *name = TGSI_SEMANTIC_VERTEXID_NOBASE;
+ break;
+ case SYSTEM_VALUE_BASE_VERTEX:
+ *name = TGSI_SEMANTIC_BASEVERTEX;
+ break;
+ case SYSTEM_VALUE_BASE_INSTANCE:
+ *name = TGSI_SEMANTIC_BASEINSTANCE;
+ break;
+ case SYSTEM_VALUE_DRAW_ID:
+ *name = TGSI_SEMANTIC_DRAWID;
+ break;
+
+ // Geometry shader
+ case SYSTEM_VALUE_INVOCATION_ID:
+ *name = TGSI_SEMANTIC_INVOCATIONID;
+ break;
+
+ // Fragment shader
+ case SYSTEM_VALUE_FRAG_COORD:
+ *name = TGSI_SEMANTIC_POSITION;
+ break;
+ case SYSTEM_VALUE_FRONT_FACE:
+ *name = TGSI_SEMANTIC_FACE;
+ break;
+ case SYSTEM_VALUE_SAMPLE_ID:
+ *name = TGSI_SEMANTIC_SAMPLEID;
+ break;
+ case SYSTEM_VALUE_SAMPLE_POS:
+ *name = TGSI_SEMANTIC_SAMPLEPOS;
+ break;
+ case SYSTEM_VALUE_SAMPLE_MASK_IN:
+ *name = TGSI_SEMANTIC_SAMPLEMASK;
+ break;
+ case SYSTEM_VALUE_HELPER_INVOCATION:
+ *name = TGSI_SEMANTIC_HELPER_INVOCATION;
+ break;
+
+ // Tessellation shader
+ case SYSTEM_VALUE_TESS_COORD:
+ *name = TGSI_SEMANTIC_TESSCOORD;
+ break;
+ case SYSTEM_VALUE_VERTICES_IN:
+ *name = TGSI_SEMANTIC_VERTICESIN;
+ break;
+ case SYSTEM_VALUE_PRIMITIVE_ID:
+ *name = TGSI_SEMANTIC_PRIMID;
+ break;
+ case SYSTEM_VALUE_TESS_LEVEL_OUTER:
+ *name = TGSI_SEMANTIC_TESSOUTER;
+ break;
+ case SYSTEM_VALUE_TESS_LEVEL_INNER:
+ *name = TGSI_SEMANTIC_TESSINNER;
+ break;
+
+ // Compute shader
+ case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
+ *name = TGSI_SEMANTIC_THREAD_ID;
+ break;
+ case SYSTEM_VALUE_WORK_GROUP_ID:
+ *name = TGSI_SEMANTIC_BLOCK_ID;
+ break;
+ case SYSTEM_VALUE_NUM_WORK_GROUPS:
+ *name = TGSI_SEMANTIC_GRID_SIZE;
+ break;
+ case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
+ *name = TGSI_SEMANTIC_BLOCK_SIZE;
+ break;
+
+ // ARB_shader_ballot
+ case SYSTEM_VALUE_SUBGROUP_SIZE:
+ *name = TGSI_SEMANTIC_SUBGROUP_SIZE;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_INVOCATION:
+ *name = TGSI_SEMANTIC_SUBGROUP_INVOCATION;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_EQ_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_GE_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_GE_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_GT_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_GT_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_LE_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_LE_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_LT_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_LT_MASK;
+ break;
+
+ default:
+ ERROR("unknown system value %u\n", val);
+ assert(false);
+ break;
+ }
+}
+
void
Converter::setInterpolate(nv50_ir_varying *var,
uint8_t mode,
@@ -949,7 +1142,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
uint16_t slots;
switch (stage) {
case Program::TYPE_GEOMETRY:
- slots = type->count_attribute_slots(false);
+ slots = type->uniform_locations();
if (input)
slots /= info.gs.vertices_in;
break;
@@ -957,9 +1150,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
case Program::TYPE_TESSELLATION_EVAL:
// remove first dimension
if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL))
- slots = type->count_attribute_slots(false);
+ slots = type->uniform_locations();
else
- slots = type->fields.array->count_attribute_slots(false);
+ slots = type->fields.array->uniform_locations();
break;
default:
slots = type->count_attribute_slots(false);
@@ -969,94 +1162,93 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
return slots;
}
-static uint8_t
-getMaskForType(const glsl_type *type, uint8_t slot) {
- uint16_t comp = type->without_array()->components();
- comp = comp ? comp : 4;
-
- if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
- comp *= 2;
- if (comp > 4) {
- if (slot % 2)
- comp -= 4;
- else
- comp = 4;
- }
- }
-
- return (1 << comp) - 1;
-}
-
bool Converter::assignSlots() {
unsigned name;
unsigned index;
info->io.viewportId = -1;
- info_out->numInputs = 0;
- info_out->numOutputs = 0;
- info_out->numSysVals = 0;
+ info->numInputs = 0;
+ info->numOutputs = 0;
- uint8_t i;
- BITSET_FOREACH_SET(i, nir->info.system_values_read, SYSTEM_VALUE_MAX) {
- info_out->sv[info_out->numSysVals].sn = tgsi_get_sysval_semantic(i);
- info_out->sv[info_out->numSysVals].si = 0;
- info_out->sv[info_out->numSysVals].input = 0; // TODO inferSysValDirection(sn);
+   // fix up the driver locations of image uniforms; arrays take one slot per element
+ unsigned numImages = 0;
+ nir_foreach_variable(var, &nir->uniforms) {
+ const glsl_type *type = var->type;
+ if (!type->without_array()->is_image())
+ continue;
+ var->data.driver_location = numImages;
+ numImages += type->is_array() ? type->arrays_of_arrays_size() : 1;
+ }
+
+ info->numSysVals = 0;
+ for (uint8_t i = 0; i < SYSTEM_VALUE_MAX; ++i) {
+ if (!(nir->info.system_values_read & 1ull << i))
+ continue;
+
+ system_val_to_tgsi_semantic(i, &name, &index);
+ info->sv[info->numSysVals].sn = name;
+ info->sv[info->numSysVals].si = index;
+ info->sv[info->numSysVals].input = 0; // TODO inferSysValDirection(sn);
switch (i) {
case SYSTEM_VALUE_INSTANCE_ID:
- info_out->io.instanceId = info_out->numSysVals;
+ info->io.instanceId = info->numSysVals;
break;
case SYSTEM_VALUE_TESS_LEVEL_INNER:
case SYSTEM_VALUE_TESS_LEVEL_OUTER:
- info_out->sv[info_out->numSysVals].patch = 1;
+ info->sv[info->numSysVals].patch = 1;
break;
case SYSTEM_VALUE_VERTEX_ID:
- info_out->io.vertexId = info_out->numSysVals;
+ info->io.vertexId = info->numSysVals;
break;
default:
break;
}
- info_out->numSysVals += 1;
+ info->numSysVals += 1;
}
if (prog->getType() == Program::TYPE_COMPUTE)
return true;
- nir_foreach_shader_in_variable(var, nir) {
+ nir_foreach_variable(var, &nir->inputs) {
const glsl_type *type = var->type;
int slot = var->data.location;
uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var);
+ uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+ : type->component_slots();
+ uint32_t frac = var->data.location_frac;
uint32_t vary = var->data.driver_location;
+
+ if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+ if (comp > 2)
+ slots *= 2;
+ }
+
assert(vary + slots <= PIPE_MAX_SHADER_INPUTS);
switch(prog->getType()) {
case Program::TYPE_FRAGMENT:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
for (uint16_t i = 0; i < slots; ++i) {
- setInterpolate(&info_out->in[vary + i], var->data.interpolation,
+ setInterpolate(&info->in[vary + i], var->data.interpolation,
var->data.centroid | var->data.sample, name);
}
break;
case Program::TYPE_GEOMETRY:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
break;
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
if (var->data.patch && name == TGSI_SEMANTIC_PATCH)
- info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots);
+ info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
break;
case Program::TYPE_VERTEX:
- if (slot >= VERT_ATTRIB_GENERIC0)
- slot = VERT_ATTRIB_GENERIC0 + vary;
vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index);
switch (name) {
case TGSI_SEMANTIC_EDGEFLAG:
- info_out->io.edgeFlagIn = vary;
+ info->io.edgeFlagIn = vary;
break;
default:
break;
@@ -1068,44 +1260,55 @@ bool Converter::assignSlots() {
}
for (uint16_t i = 0u; i < slots; ++i, ++vary) {
- nv50_ir_varying *v = &info_out->in[vary];
-
- v->patch = var->data.patch;
- v->sn = name;
- v->si = index + i;
- v->mask |= getMaskForType(type, i) << var->data.location_frac;
+ info->in[vary].id = vary;
+ info->in[vary].patch = var->data.patch;
+ info->in[vary].sn = name;
+ info->in[vary].si = index + i;
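+         // 64-bit inputs occupy two 32-bit slots; split the component mask so
+         // the even slot gets the low half and the odd slot the high half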
+ if (glsl_base_type_is_64bit(type->without_array()->base_type))
+ if (i & 0x1)
+ info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+ else
+ info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+ else
+ info->in[vary].mask |= ((1 << comp) - 1) << frac;
}
- info_out->numInputs = std::max<uint8_t>(info_out->numInputs, vary);
+ info->numInputs = std::max<uint8_t>(info->numInputs, vary);
}
- nir_foreach_shader_out_variable(var, nir) {
+ nir_foreach_variable(var, &nir->outputs) {
const glsl_type *type = var->type;
int slot = var->data.location;
uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var);
+ uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+ : type->component_slots();
+ uint32_t frac = var->data.location_frac;
uint32_t vary = var->data.driver_location;
+ if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+ if (comp > 2)
+ slots *= 2;
+ }
+
assert(vary < PIPE_MAX_SHADER_OUTPUTS);
switch(prog->getType()) {
case Program::TYPE_FRAGMENT:
- tgsi_get_gl_frag_result_semantic((gl_frag_result)slot, &name, &index);
+ frag_result_to_tgsi_semantic((gl_frag_result)slot, &name, &index);
switch (name) {
case TGSI_SEMANTIC_COLOR:
if (!var->data.fb_fetch_output)
- info_out->prop.fp.numColourResults++;
- if (var->data.location == FRAG_RESULT_COLOR &&
- nir->info.outputs_written & BITFIELD64_BIT(var->data.location))
- info_out->prop.fp.separateFragData = true;
+ info->prop.fp.numColourResults++;
+ info->prop.fp.separateFragData = true;
// sometimes we get FRAG_RESULT_DATAX with data.index 0
// sometimes we get FRAG_RESULT_DATA0 with data.index X
index = index == 0 ? var->data.index : index;
break;
case TGSI_SEMANTIC_POSITION:
- info_out->io.fragDepth = vary;
- info_out->prop.fp.writesDepth = true;
+ info->io.fragDepth = vary;
+ info->prop.fp.writesDepth = true;
break;
case TGSI_SEMANTIC_SAMPLEMASK:
- info_out->io.sampleMask = vary;
+ info->io.sampleMask = vary;
break;
default:
break;
@@ -1115,22 +1318,21 @@ bool Converter::assignSlots() {
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
case Program::TYPE_VERTEX:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
if (var->data.patch && name != TGSI_SEMANTIC_TESSINNER &&
name != TGSI_SEMANTIC_TESSOUTER)
- info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots);
+ info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
switch (name) {
case TGSI_SEMANTIC_CLIPDIST:
- info_out->io.genUserClip = -1;
+ info->io.genUserClip = -1;
break;
case TGSI_SEMANTIC_CLIPVERTEX:
clipVertexOutput = vary;
break;
case TGSI_SEMANTIC_EDGEFLAG:
- info_out->io.edgeFlagOut = vary;
+ info->io.edgeFlagOut = vary;
break;
case TGSI_SEMANTIC_POSITION:
if (clipVertexOutput < 0)
@@ -1146,33 +1348,39 @@ bool Converter::assignSlots() {
}
for (uint16_t i = 0u; i < slots; ++i, ++vary) {
- nv50_ir_varying *v = &info_out->out[vary];
- v->patch = var->data.patch;
- v->sn = name;
- v->si = index + i;
- v->mask |= getMaskForType(type, i) << var->data.location_frac;
+ info->out[vary].id = vary;
+ info->out[vary].patch = var->data.patch;
+ info->out[vary].sn = name;
+ info->out[vary].si = index + i;
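+         // same 64-bit mask splitting as for the inputs above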
+ if (glsl_base_type_is_64bit(type->without_array()->base_type))
+ if (i & 0x1)
+ info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+ else
+ info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+ else
+ info->out[vary].mask |= ((1 << comp) - 1) << frac;
if (nir->info.outputs_read & 1ull << slot)
- v->oread = 1;
+ info->out[vary].oread = 1;
}
- info_out->numOutputs = std::max<uint8_t>(info_out->numOutputs, vary);
+ info->numOutputs = std::max<uint8_t>(info->numOutputs, vary);
}
- if (info_out->io.genUserClip > 0) {
- info_out->io.clipDistances = info_out->io.genUserClip;
+ if (info->io.genUserClip > 0) {
+ info->io.clipDistances = info->io.genUserClip;
- const unsigned int nOut = (info_out->io.genUserClip + 3) / 4;
+ const unsigned int nOut = (info->io.genUserClip + 3) / 4;
for (unsigned int n = 0; n < nOut; ++n) {
- unsigned int i = info_out->numOutputs++;
- info_out->out[i].id = i;
- info_out->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
- info_out->out[i].si = n;
- info_out->out[i].mask = ((1 << info_out->io.clipDistances) - 1) >> (n * 4);
+ unsigned int i = info->numOutputs++;
+ info->out[i].id = i;
+ info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
+ info->out[i].si = n;
+ info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4);
}
}
- return info->assignSlots(info_out) == 0;
+ return info->assignSlots(info) == 0;
}
uint32_t
@@ -1222,7 +1430,7 @@ Converter::getSlotAddress(nir_intrinsic_instr *insn, uint8_t idx, uint8_t slot)
assert(!input || idx < PIPE_MAX_SHADER_INPUTS);
assert(input || idx < PIPE_MAX_SHADER_OUTPUTS);
- const nv50_ir_varying *vary = input ? info_out->in : info_out->out;
+ const nv50_ir_varying *vary = input ? info->in : info->out;
return vary[idx].slot[slot] * 4;
}
@@ -1280,63 +1488,66 @@ Converter::storeTo(nir_intrinsic_instr *insn, DataFile file, operation op,
}
mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address), indirect0,
- split[0])->perPatch = info_out->out[idx].patch;
+ split[0])->perPatch = info->out[idx].patch;
mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address + 4), indirect0,
- split[1])->perPatch = info_out->out[idx].patch;
+ split[1])->perPatch = info->out[idx].patch;
} else {
if (op == OP_EXPORT)
src = mkMov(getSSA(size), src, ty)->getDef(0);
mkStore(op, ty, mkSymbol(file, 0, ty, address), indirect0,
- src)->perPatch = info_out->out[idx].patch;
+ src)->perPatch = info->out[idx].patch;
}
}
bool
Converter::parseNIR()
{
- info_out->bin.tlsSpace = nir->scratch_size;
- info_out->io.clipDistances = nir->info.clip_distance_array_size;
- info_out->io.cullDistances = nir->info.cull_distance_array_size;
- info_out->io.layer_viewport_relative = nir->info.layer_viewport_relative;
+ info->bin.tlsSpace = 0;
+ info->io.clipDistances = nir->info.clip_distance_array_size;
+ info->io.cullDistances = nir->info.cull_distance_array_size;
switch(prog->getType()) {
case Program::TYPE_COMPUTE:
- info->prop.cp.numThreads[0] = nir->info.workgroup_size[0];
- info->prop.cp.numThreads[1] = nir->info.workgroup_size[1];
- info->prop.cp.numThreads[2] = nir->info.workgroup_size[2];
- info_out->bin.smemSize = std::max(info_out->bin.smemSize, nir->info.shared_size);
+ info->prop.cp.numThreads[0] = nir->info.cs.local_size[0];
+ info->prop.cp.numThreads[1] = nir->info.cs.local_size[1];
+ info->prop.cp.numThreads[2] = nir->info.cs.local_size[2];
+ info->bin.smemSize = nir->info.cs.shared_size;
break;
case Program::TYPE_FRAGMENT:
- info_out->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests;
- prog->persampleInvocation =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
- info_out->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
- info_out->prop.fp.readsSampleLocations =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
- info_out->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote;
- info_out->prop.fp.usesSampleMaskIn =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
+ info->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests;
+ info->prop.fp.persampleInvocation =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_ID) ||
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+ info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
+ info->prop.fp.readsSampleLocations =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+ info->prop.fp.usesDiscard = nir->info.fs.uses_discard;
+ info->prop.fp.usesSampleMaskIn =
+ !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN);
break;
case Program::TYPE_GEOMETRY:
- info_out->prop.gp.instanceCount = nir->info.gs.invocations;
- info_out->prop.gp.maxVertices = nir->info.gs.vertices_out;
- info_out->prop.gp.outputPrim = nir->info.gs.output_primitive;
+ info->prop.gp.inputPrim = nir->info.gs.input_primitive;
+ info->prop.gp.instanceCount = nir->info.gs.invocations;
+ info->prop.gp.maxVertices = nir->info.gs.vertices_out;
+ info->prop.gp.outputPrim = nir->info.gs.output_primitive;
break;
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
- info_out->prop.tp.domain = u_tess_prim_from_shader(nir->info.tess._primitive_mode);
- info_out->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
- info_out->prop.tp.outputPrim =
+ if (nir->info.tess.primitive_mode == GL_ISOLINES)
+ info->prop.tp.domain = GL_LINES;
+ else
+ info->prop.tp.domain = nir->info.tess.primitive_mode;
+ info->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
+ info->prop.tp.outputPrim =
nir->info.tess.point_mode ? PIPE_PRIM_POINTS : PIPE_PRIM_TRIANGLES;
- info_out->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
- info_out->prop.tp.winding = !nir->info.tess.ccw;
+ info->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
+ info->prop.tp.winding = !nir->info.tess.ccw;
break;
case Program::TYPE_VERTEX:
- info_out->prop.vp.usesDrawParameters =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
+ info->prop.vp.usesDrawParameters =
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX)) ||
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) ||
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID));
break;
default:
break;
@@ -1359,7 +1570,7 @@ Converter::visit(nir_function *function)
setPosition(entry, true);
- if (info_out->io.genUserClip > 0) {
+ if (info->io.genUserClip > 0) {
for (int c = 0; c < 4; ++c)
clipVtx[c] = getScratch();
}
@@ -1381,6 +1592,16 @@ Converter::visit(nir_function *function)
break;
}
+ nir_foreach_register(reg, &function->impl->registers) {
+ if (reg->num_array_elems) {
+ // TODO: packed variables would be nice, but MemoryOpt fails
+ // replace 4 with reg->num_components
+ uint32_t size = 4 * reg->num_array_elems * (reg->bit_size / 8);
+ regToLmemOffset[reg->index] = info->bin.tlsSpace;
+ info->bin.tlsSpace += size;
+ }
+ }
+
nir_index_ssa_defs(function->impl);
foreach_list_typed(nir_cf_node, node, node, &function->impl->body) {
if (!visit(node))
@@ -1392,7 +1613,7 @@ Converter::visit(nir_function *function)
if ((prog->getType() == Program::TYPE_VERTEX ||
prog->getType() == Program::TYPE_TESSELLATION_EVAL)
- && info_out->io.genUserClip > 0)
+ && info->io.genUserClip > 0)
handleUserClipPlanes();
// TODO: for non main function this needs to be a OP_RETURN
@@ -1435,69 +1656,64 @@ Converter::visit(nir_block *block)
bool
Converter::visit(nir_if *nif)
{
- curIfDepth++;
-
DataType sType = getSType(nif->condition, false, false);
Value *src = getSrc(&nif->condition, 0);
nir_block *lastThen = nir_if_last_then_block(nif);
nir_block *lastElse = nir_if_last_else_block(nif);
- BasicBlock *headBB = bb;
+ assert(!lastThen->successors[1]);
+ assert(!lastElse->successors[1]);
+
BasicBlock *ifBB = convert(nir_if_first_then_block(nif));
BasicBlock *elseBB = convert(nir_if_first_else_block(nif));
bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
- bool insertJoins = lastThen->successors[0] == lastElse->successors[0];
+   // we only insert joinats if both branches end up at the end of the if again.
+   // the reasons for this not to happen are breaks/continues/ret/... which
+   // have their own handling
+ if (lastThen->successors[0] == lastElse->successors[0])
+ bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]),
+ CC_ALWAYS, NULL);
+
mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType);
foreach_list_typed(nir_cf_node, node, node, &nif->then_list) {
if (!visit(node))
return false;
}
-
setPosition(convert(lastThen), true);
- if (!bb->isTerminated()) {
+ if (!bb->getExit() ||
+ !bb->getExit()->asFlow() ||
+ bb->getExit()->asFlow()->op == OP_JOIN) {
BasicBlock *tailBB = convert(lastThen->successors[0]);
mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
- } else {
- insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
}
foreach_list_typed(nir_cf_node, node, node, &nif->else_list) {
if (!visit(node))
return false;
}
-
setPosition(convert(lastElse), true);
- if (!bb->isTerminated()) {
+ if (!bb->getExit() ||
+ !bb->getExit()->asFlow() ||
+ bb->getExit()->asFlow()->op == OP_JOIN) {
BasicBlock *tailBB = convert(lastElse->successors[0]);
mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
- } else {
- insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
}
- /* only insert joins for the most outer if */
- if (--curIfDepth)
- insertJoins = false;
-
- /* we made sure that all threads would converge at the same block */
- if (insertJoins) {
- BasicBlock *conv = convert(lastThen->successors[0]);
- setPosition(headBB->getExit(), false);
- headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL);
- setPosition(conv, false);
+ if (lastThen->successors[0] == lastElse->successors[0]) {
+ setPosition(convert(lastThen->successors[0]), true);
mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}
return true;
}
-// TODO: add convergency
bool
Converter::visit(nir_loop *loop)
{
@@ -1505,8 +1721,8 @@ Converter::visit(nir_loop *loop)
func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth);
BasicBlock *loopBB = convert(nir_loop_first_block(loop));
- BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
-
+ BasicBlock *tailBB =
+ convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE);
mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL);
@@ -1517,15 +1733,19 @@ Converter::visit(nir_loop *loop)
if (!visit(node))
return false;
}
-
- if (!bb->isTerminated()) {
- mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
- bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ Instruction *insn = bb->getExit();
+ if (bb->cfg.incidentCount() != 0) {
+ if (!insn || !insn->asFlow()) {
+ mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ } else if (insn && insn->op == OP_BRA && !insn->getPredicate() &&
+ tailBB->cfg.incidentCount() == 0) {
+ // RA doesn't like having blocks around with no incident edge,
+ // so we create a fake one to make it happy
+ bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
+ }
}
- if (tailBB->cfg.incidentCount() == 0)
- loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
-
curLoopDepth -= 1;
return true;
@@ -1539,6 +1759,8 @@ Converter::visit(nir_instr *insn)
switch (insn->type) {
case nir_instr_type_alu:
return visit(nir_instr_as_alu(insn));
+ case nir_instr_type_deref:
+ return visit(nir_instr_as_deref(insn));
case nir_instr_type_intrinsic:
return visit(nir_instr_as_intrinsic(insn));
case nir_instr_type_jump:
@@ -1568,18 +1790,17 @@ Converter::convert(nir_intrinsic_op intr)
return SV_DRAWID;
case nir_intrinsic_load_front_face:
return SV_FACE;
- case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_helper_invocation:
return SV_THREAD_KILL;
case nir_intrinsic_load_instance_id:
return SV_INSTANCE_ID;
case nir_intrinsic_load_invocation_id:
return SV_INVOCATION_ID;
- case nir_intrinsic_load_workgroup_size:
+ case nir_intrinsic_load_local_group_size:
return SV_NTID;
case nir_intrinsic_load_local_invocation_id:
return SV_TID;
- case nir_intrinsic_load_num_workgroups:
+ case nir_intrinsic_load_num_work_groups:
return SV_NCTAID;
case nir_intrinsic_load_patch_vertices_in:
return SV_VERTEX_COUNT;
@@ -1611,10 +1832,8 @@ Converter::convert(nir_intrinsic_op intr)
return SV_TESS_OUTER;
case nir_intrinsic_load_vertex_id:
return SV_VERTEX_ID;
- case nir_intrinsic_load_workgroup_id:
+ case nir_intrinsic_load_work_group_id:
return SV_CTAID;
- case nir_intrinsic_load_work_dim:
- return SV_WORK_DIM;
default:
ERROR("unknown SVSemantic for nir_intrinsic_op %s\n",
nir_intrinsic_infos[intr].name);
@@ -1623,12 +1842,73 @@ Converter::convert(nir_intrinsic_op intr)
}
}
+ImgFormat
+Converter::convertGLImgFormat(GLuint format)
+{
+#define FMT_CASE(a, b) \
+ case GL_ ## a: return nv50_ir::FMT_ ## b
+
+ switch (format) {
+ FMT_CASE(NONE, NONE);
+
+ FMT_CASE(RGBA32F, RGBA32F);
+ FMT_CASE(RGBA16F, RGBA16F);
+ FMT_CASE(RG32F, RG32F);
+ FMT_CASE(RG16F, RG16F);
+ FMT_CASE(R11F_G11F_B10F, R11G11B10F);
+ FMT_CASE(R32F, R32F);
+ FMT_CASE(R16F, R16F);
+
+ FMT_CASE(RGBA32UI, RGBA32UI);
+ FMT_CASE(RGBA16UI, RGBA16UI);
+ FMT_CASE(RGB10_A2UI, RGB10A2UI);
+ FMT_CASE(RGBA8UI, RGBA8UI);
+ FMT_CASE(RG32UI, RG32UI);
+ FMT_CASE(RG16UI, RG16UI);
+ FMT_CASE(RG8UI, RG8UI);
+ FMT_CASE(R32UI, R32UI);
+ FMT_CASE(R16UI, R16UI);
+ FMT_CASE(R8UI, R8UI);
+
+ FMT_CASE(RGBA32I, RGBA32I);
+ FMT_CASE(RGBA16I, RGBA16I);
+ FMT_CASE(RGBA8I, RGBA8I);
+ FMT_CASE(RG32I, RG32I);
+ FMT_CASE(RG16I, RG16I);
+ FMT_CASE(RG8I, RG8I);
+ FMT_CASE(R32I, R32I);
+ FMT_CASE(R16I, R16I);
+ FMT_CASE(R8I, R8I);
+
+ FMT_CASE(RGBA16, RGBA16);
+ FMT_CASE(RGB10_A2, RGB10A2);
+ FMT_CASE(RGBA8, RGBA8);
+ FMT_CASE(RG16, RG16);
+ FMT_CASE(RG8, RG8);
+ FMT_CASE(R16, R16);
+ FMT_CASE(R8, R8);
+
+ FMT_CASE(RGBA16_SNORM, RGBA16_SNORM);
+ FMT_CASE(RGBA8_SNORM, RGBA8_SNORM);
+ FMT_CASE(RG16_SNORM, RG16_SNORM);
+ FMT_CASE(RG8_SNORM, RG8_SNORM);
+ FMT_CASE(R16_SNORM, R16_SNORM);
+ FMT_CASE(R8_SNORM, R8_SNORM);
+
+ FMT_CASE(BGRA_INTEGER, BGRA8);
+ default:
+ ERROR("unknown format %x\n", format);
+ assert(false);
+ return nv50_ir::FMT_NONE;
+ }
+#undef FMT_CASE
+}
+
bool
Converter::visit(nir_intrinsic_instr *insn)
{
nir_intrinsic_op op = insn->intrinsic;
const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op];
- unsigned dest_components = nir_intrinsic_dest_components(insn);
switch (op) {
case nir_intrinsic_load_uniform: {
@@ -1636,7 +1916,7 @@ Converter::visit(nir_intrinsic_instr *insn)
const DataType dType = getDType(insn);
Value *indirect;
uint32_t coffset = getIndirect(insn, 0, 0, indirect);
- for (uint8_t i = 0; i < dest_components; ++i) {
+ for (uint8_t i = 0; i < insn->num_components; ++i) {
loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect);
}
break;
@@ -1647,7 +1927,7 @@ Converter::visit(nir_intrinsic_instr *insn)
DataType dType = getSType(insn->src[0], false, false);
uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
@@ -1655,7 +1935,7 @@ Converter::visit(nir_intrinsic_instr *insn)
Value *src = getSrc(&insn->src[0], i);
switch (prog->getType()) {
case Program::TYPE_FRAGMENT: {
- if (info_out->out[idx].sn == TGSI_SEMANTIC_POSITION) {
+ if (info->out[idx].sn == TGSI_SEMANTIC_POSITION) {
             // TGSI uses a different interface than NIR: TGSI stores the
             // value in the z component, NIR in x
offset += 2;
@@ -1664,9 +1944,8 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
case Program::TYPE_GEOMETRY:
- case Program::TYPE_TESSELLATION_EVAL:
case Program::TYPE_VERTEX: {
- if (info_out->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) {
+ if (info->io.genUserClip > 0 && idx == clipVertexOutput) {
mkMov(clipVtx[i], src);
src = clipVtx[i];
}
@@ -1701,19 +1980,19 @@ Converter::visit(nir_intrinsic_instr *insn)
srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0)));
srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0)));
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
defs.push_back(newDefs[i]);
mask |= 1 << i;
}
TexInstruction *texi = mkTex(OP_TXF, TEX_TARGET_2D_MS_ARRAY, 0, 0, defs, srcs);
- texi->tex.levelZero = true;
+ texi->tex.levelZero = 1;
texi->tex.mask = mask;
texi->tex.useOffsets = 0;
texi->tex.r = 0xffff;
texi->tex.s = 0xffff;
- info_out->prop.fp.readsFramebuffer = true;
+ info->prop.fp.readsFramebuffer = true;
break;
}
@@ -1724,29 +2003,19 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t mode = 0;
uint32_t idx = getIndirect(insn, op == nir_intrinsic_load_interpolated_input ? 1 : 0, 0, indirect);
- nv50_ir_varying& vary = input ? info_out->in[idx] : info_out->out[idx];
+ nv50_ir_varying& vary = input ? info->in[idx] : info->out[idx];
// see load_barycentric_* handling
if (prog->getType() == Program::TYPE_FRAGMENT) {
+ mode = translateInterpMode(&vary, nvirOp);
if (op == nir_intrinsic_load_interpolated_input) {
ImmediateValue immMode;
if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode))
- mode = immMode.reg.data.u32;
- }
- if (mode == NV50_IR_INTERP_DEFAULT)
- mode |= translateInterpMode(&vary, nvirOp);
- else {
- if (vary.linear) {
- nvirOp = OP_LINTERP;
- mode |= NV50_IR_INTERP_LINEAR;
- } else {
- nvirOp = OP_PINTERP;
- mode |= NV50_IR_INTERP_PERSPECTIVE;
- }
+ mode |= immMode.reg.data.u32;
}
}
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address);
if (prog->getType() == Program::TYPE_FRAGMENT) {
@@ -1789,6 +2058,18 @@ Converter::visit(nir_intrinsic_instr *insn)
}
break;
}
+ case nir_intrinsic_load_kernel_input: {
+ assert(prog->getType() == Program::TYPE_COMPUTE);
+ assert(insn->num_components == 1);
+
+ LValues &newDefs = convert(&insn->dest);
+ const DataType dType = getDType(insn);
+ Value *indirect;
+ uint32_t idx = getIndirect(insn, 0, 0, indirect, true);
+
+ mkLoad(dType, newDefs[0], mkSymbol(FILE_SHADER_INPUT, 0, dType, idx), indirect);
+ break;
+ }
case nir_intrinsic_load_barycentric_at_offset:
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_centroid:
@@ -1815,11 +2096,8 @@ Converter::visit(nir_intrinsic_instr *insn)
} else if (op == nir_intrinsic_load_barycentric_pixel) {
mode = NV50_IR_INTERP_DEFAULT;
} else if (op == nir_intrinsic_load_barycentric_at_sample) {
- info_out->prop.fp.readsSampleLocations = true;
- Value *sample = getSSA();
- mkOp3(OP_SELP, TYPE_U32, sample, mkImm(0), getSrc(&insn->src[0], 0), mkImm(0))
- ->subOp = 2;
- mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], sample)->subOp = NV50_IR_SUBOP_PIXLD_OFFSET;
+ info->prop.fp.readsSampleLocations = true;
+ mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], getSrc(&insn->src[0], 0))->subOp = NV50_IR_SUBOP_PIXLD_OFFSET;
mode = NV50_IR_INTERP_OFFSET;
} else {
unreachable("all intrinsics already handled above");
@@ -1828,11 +2106,9 @@ Converter::visit(nir_intrinsic_instr *insn)
loadImm(newDefs[1], mode);
break;
}
- case nir_intrinsic_demote:
case nir_intrinsic_discard:
mkOp(OP_DISCARD, TYPE_NONE, NULL);
break;
- case nir_intrinsic_demote_if:
case nir_intrinsic_discard_if: {
Value *pred = getSSA(1, FILE_PREDICATE);
if (insn->num_components > 1) {
@@ -1848,13 +2124,12 @@ Converter::visit(nir_intrinsic_instr *insn)
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_draw_id:
case nir_intrinsic_load_front_face:
- case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_invocation_id:
- case nir_intrinsic_load_workgroup_size:
+ case nir_intrinsic_load_local_group_size:
case nir_intrinsic_load_local_invocation_id:
- case nir_intrinsic_load_num_workgroups:
+ case nir_intrinsic_load_num_work_groups:
case nir_intrinsic_load_patch_vertices_in:
case nir_intrinsic_load_primitive_id:
case nir_intrinsic_load_sample_id:
@@ -1870,13 +2145,12 @@ Converter::visit(nir_intrinsic_instr *insn)
case nir_intrinsic_load_tess_level_inner:
case nir_intrinsic_load_tess_level_outer:
case nir_intrinsic_load_vertex_id:
- case nir_intrinsic_load_workgroup_id:
- case nir_intrinsic_load_work_dim: {
+ case nir_intrinsic_load_work_group_id: {
const DataType dType = getDType(insn);
SVSemantic sv = convert(op);
LValues &newDefs = convert(&insn->dest);
- for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
Value *def;
if (typeSizeof(dType) == 8)
def = getSSA();
@@ -1928,12 +2202,12 @@ Converter::visit(nir_intrinsic_instr *insn)
if (op == nir_intrinsic_read_first_invocation) {
mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY;
- mkOp1(OP_BREV, TYPE_U32, tmp, tmp);
+ mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
} else
tmp = getSrc(&insn->src[1], 0);
- for (uint8_t i = 0; i < dest_components; ++i) {
+ for (uint8_t i = 0; i < insn->num_components; ++i) {
mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f))
->subOp = NV50_IR_SUBOP_SHFL_IDX;
}
@@ -1949,10 +2223,10 @@ Converter::visit(nir_intrinsic_instr *insn)
Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS),
mkImm(baseVertex), indirectVertex);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0,
- indirectOffset, vtxBase, info_out->in[idx].patch);
+ indirectOffset, vtxBase, info->in[idx].patch);
}
break;
}
@@ -1972,24 +2246,19 @@ Converter::visit(nir_intrinsic_instr *insn)
vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0,
- indirectOffset, vtxBase, info_out->in[idx].patch);
+ indirectOffset, vtxBase, info->in[idx].patch);
}
break;
}
- case nir_intrinsic_emit_vertex: {
- if (info_out->io.genUserClip > 0)
+ case nir_intrinsic_emit_vertex:
+ if (info->io.genUserClip > 0)
handleUserClipPlanes();
- uint32_t idx = nir_intrinsic_stream_id(insn);
- mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
- break;
- }
+ // fallthrough
case nir_intrinsic_end_primitive: {
uint32_t idx = nir_intrinsic_stream_id(insn);
- if (idx)
- break;
mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
break;
}
@@ -2001,13 +2270,13 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1;
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i,
indirectOffset, indirectIndex);
}
break;
}
- case nir_intrinsic_get_ssbo_size: {
+ case nir_intrinsic_get_buffer_size: {
LValues &newDefs = convert(&insn->dest);
const DataType dType = getDType(insn);
Value *indirectBuffer;
@@ -2024,7 +2293,7 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer);
uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType,
@@ -2032,7 +2301,7 @@ Converter::visit(nir_intrinsic_instr *insn)
mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i))
->setIndirect(0, 1, indirectBuffer);
}
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
case nir_intrinsic_load_ssbo: {
@@ -2043,15 +2312,14 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer);
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i)
+ for (uint8_t i = 0u; i < insn->num_components; ++i)
loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i,
indirectOffset, indirectBuffer);
- info_out->io.globalAccess |= 0x1;
+ info->io.globalAccess |= 0x1;
break;
}
case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_fadd:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_shared_atomic_exchange:
@@ -2074,7 +2342,6 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_fadd:
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_ssbo_atomic_exchange:
@@ -2100,72 +2367,23 @@ Converter::visit(nir_intrinsic_instr *insn)
atom->setIndirect(0, 1, indirectBuffer);
atom->subOp = getSubOp(op);
- info_out->io.globalAccess |= 0x2;
- break;
- }
- case nir_intrinsic_global_atomic_add:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_global_atomic_and:
- case nir_intrinsic_global_atomic_comp_swap:
- case nir_intrinsic_global_atomic_exchange:
- case nir_intrinsic_global_atomic_or:
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_global_atomic_xor: {
- const DataType dType = getDType(insn);
- LValues &newDefs = convert(&insn->dest);
- Value *address;
- uint32_t offset = getIndirect(&insn->src[0], 0, address);
-
- Symbol *sym = mkSymbol(FILE_MEMORY_GLOBAL, 0, dType, offset);
- Instruction *atom =
- mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0));
- if (op == nir_intrinsic_global_atomic_comp_swap)
- atom->setSrc(2, getSrc(&insn->src[2], 0));
- atom->setIndirect(0, 0, address);
- atom->subOp = getSubOp(op);
-
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_bindless_image_atomic_fadd:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_samples:
case nir_intrinsic_bindless_image_size:
- case nir_intrinsic_bindless_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_comp_swap:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- case nir_intrinsic_image_load:
- case nir_intrinsic_image_samples:
- case nir_intrinsic_image_size:
- case nir_intrinsic_image_store: {
+ case nir_intrinsic_bindless_image_store: {
std::vector<Value*> srcs, defs;
- Value *indirect;
+ Value *indirect = getSrc(&insn->src[0], 0);
DataType ty;
uint32_t mask = 0;
@@ -2182,84 +2400,40 @@ Converter::visit(nir_intrinsic_instr *insn)
}
}
- int lod_src = -1;
- bool bindless = false;
switch (op) {
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_bindless_image_atomic_fadd:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
ty = getDType(insn);
- bindless = true;
- info_out->io.globalAccess |= 0x2;
- mask = 0x1;
- break;
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_comp_swap:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- ty = getDType(insn);
- bindless = false;
- info_out->io.globalAccess |= 0x2;
mask = 0x1;
+ info->io.globalAccess |= 0x2;
break;
case nir_intrinsic_bindless_image_load:
- case nir_intrinsic_image_load:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_load;
- info_out->io.globalAccess |= 0x1;
- lod_src = 4;
+ info->io.globalAccess |= 0x1;
break;
case nir_intrinsic_bindless_image_store:
- case nir_intrinsic_image_store:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_store;
- info_out->io.globalAccess |= 0x2;
- lod_src = 5;
mask = 0xf;
+ info->io.globalAccess |= 0x2;
break;
case nir_intrinsic_bindless_image_samples:
mask = 0x8;
- FALLTHROUGH;
- case nir_intrinsic_image_samples:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_samples;
- mask = 0x8;
break;
case nir_intrinsic_bindless_image_size:
- case nir_intrinsic_image_size:
- assert(nir_src_as_uint(insn->src[1]) == 0);
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_size;
break;
default:
unreachable("unhandled image opcode");
break;
}
- if (bindless)
- indirect = getSrc(&insn->src[0], 0);
- else
- location = getIndirect(&insn->src[0], 0, indirect);
-
// coords
if (opInfo.num_srcs >= 2)
for (unsigned int i = 0u; i < argCount; ++i)
@@ -2269,21 +2443,22 @@ Converter::visit(nir_intrinsic_instr *insn)
if (opInfo.num_srcs >= 3 && target.isMS())
srcs.push_back(getSrc(&insn->src[2], 0));
- if (opInfo.num_srcs >= 4 && lod_src != 4) {
+ if (opInfo.num_srcs >= 4) {
unsigned components = opInfo.src_components[3] ? opInfo.src_components[3] : insn->num_components;
for (uint8_t i = 0u; i < components; ++i)
srcs.push_back(getSrc(&insn->src[3], i));
}
- if (opInfo.num_srcs >= 5 && lod_src != 5)
+ if (opInfo.num_srcs >= 5)
          // 1 for atomic swap
for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i)
srcs.push_back(getSrc(&insn->src[4], i));
TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs);
- texi->tex.bindless = bindless;
- texi->tex.format = nv50_ir::TexInstruction::translateImgFormat(nir_intrinsic_format(insn));
+ texi->tex.bindless = false;
+ texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(nir_intrinsic_format(insn))];
texi->tex.mask = mask;
+ texi->tex.bindless = true;
texi->cache = convert(nir_intrinsic_access(insn));
texi->setType(ty);
texi->subOp = getSubOp(op);
@@ -2293,40 +2468,134 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
- case nir_intrinsic_store_scratch:
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_samples:
+ case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_store: {
+ const nir_variable *tex;
+ std::vector<Value*> srcs, defs;
+ Value *indirect;
+ DataType ty;
+
+ uint32_t mask = 0;
+ nir_deref_instr *deref = nir_src_as_deref(insn->src[0]);
+ const glsl_type *type = deref->type;
+ TexInstruction::Target target =
+ convert((glsl_sampler_dim)type->sampler_dimensionality,
+ type->sampler_array, type->sampler_shadow);
+ unsigned int argCount = getNIRArgCount(target);
+ uint16_t location = handleDeref(deref, indirect, tex);
+
+ if (opInfo.has_dest) {
+ LValues &newDefs = convert(&insn->dest);
+ for (uint8_t i = 0u; i < newDefs.size(); ++i) {
+ defs.push_back(newDefs[i]);
+ mask |= 1 << i;
+ }
+ }
+
+ switch (op) {
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ ty = getDType(insn);
+ mask = 0x1;
+ info->io.globalAccess |= 0x2;
+ break;
+ case nir_intrinsic_image_deref_load:
+ ty = TYPE_U32;
+ info->io.globalAccess |= 0x1;
+ break;
+ case nir_intrinsic_image_deref_store:
+ ty = TYPE_U32;
+ mask = 0xf;
+ info->io.globalAccess |= 0x2;
+ break;
+ case nir_intrinsic_image_deref_samples:
+ mask = 0x8;
+ ty = TYPE_U32;
+ break;
+ case nir_intrinsic_image_deref_size:
+ ty = TYPE_U32;
+ break;
+ default:
+ unreachable("unhandled image opcode");
+ break;
+ }
+
+ // coords
+ if (opInfo.num_srcs >= 2)
+ for (unsigned int i = 0u; i < argCount; ++i)
+ srcs.push_back(getSrc(&insn->src[1], i));
+
+ // the sampler is just another src added after coords
+ if (opInfo.num_srcs >= 3 && target.isMS())
+ srcs.push_back(getSrc(&insn->src[2], 0));
+
+ if (opInfo.num_srcs >= 4) {
+ unsigned components = opInfo.src_components[3] ? opInfo.src_components[3] : insn->num_components;
+ for (uint8_t i = 0u; i < components; ++i)
+ srcs.push_back(getSrc(&insn->src[3], i));
+ }
+
+ if (opInfo.num_srcs >= 5)
+      // 1 for atomic swap
+ for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i)
+ srcs.push_back(getSrc(&insn->src[4], i));
+
+ TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs);
+ texi->tex.bindless = false;
+ texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(tex->data.image.format)];
+ texi->tex.mask = mask;
+ texi->cache = getCacheModeFromVar(tex);
+ texi->setType(ty);
+ texi->subOp = getSubOp(op);
+
+ if (indirect)
+ texi->setIndirectR(indirect);
+
+ break;
+ }
case nir_intrinsic_store_shared: {
DataType sType = getSType(insn->src[0], false, false);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- if (indirectOffset)
- indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
- Symbol *sym = mkSymbol(getFile(op), 0, sType, offset + i * typeSizeof(sType));
+ Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType));
mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i));
}
break;
}
- case nir_intrinsic_load_kernel_input:
- case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shared: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- if (indirectOffset)
- indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i)
- loadFrom(getFile(op), 0, dType, newDefs[i], offset, i, indirectOffset);
+ for (uint8_t i = 0u; i < insn->num_components; ++i)
+ loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset);
break;
}
- case nir_intrinsic_control_barrier: {
+ case nir_intrinsic_barrier: {
// TODO: add flag to shader_info
- info_out->numBarriers = 1;
+ info->numBarriers = 1;
Instruction *bar = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0));
bar->fixed = 1;
bar->subOp = NV50_IR_SUBOP_BAR_SYNC;
@@ -2334,6 +2603,7 @@ Converter::visit(nir_intrinsic_instr *insn)
}
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared: {
@@ -2342,8 +2612,6 @@ Converter::visit(nir_intrinsic_instr *insn)
bar->subOp = getSubOp(op);
break;
}
- case nir_intrinsic_memory_barrier_tcs_patch:
- break;
case nir_intrinsic_shader_clock: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
@@ -2352,23 +2620,22 @@ Converter::visit(nir_intrinsic_instr *insn)
mkOp1(OP_RDSV, dType, newDefs[1], mkSysVal(SV_CLOCK, 0))->fixed = 1;
break;
}
- case nir_intrinsic_load_global:
- case nir_intrinsic_load_global_constant: {
+ case nir_intrinsic_load_global: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- for (auto i = 0u; i < dest_components; ++i)
+ for (auto i = 0u; i < insn->num_components; ++i)
loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset);
- info_out->io.globalAccess |= 0x1;
+ info->io.globalAccess |= 0x1;
break;
}
case nir_intrinsic_store_global: {
DataType sType = getSType(insn->src[0], false, false);
- for (auto i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (auto i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
if (typeSizeof(sType) == 8) {
@@ -2386,7 +2653,7 @@ Converter::visit(nir_intrinsic_instr *insn)
}
}
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
default:
@@ -2410,6 +2677,7 @@ Converter::visit(nir_jump_instr *insn)
case nir_jump_continue: {
bool isBreak = insn->type == nir_jump_break;
nir_block *block = insn->instr.block;
+ assert(!block->successors[1]);
BasicBlock *target = convert(block->successors[0]);
mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL);
bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK);
@@ -2529,6 +2797,8 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_ushr:
case nir_op_fsin:
case nir_op_fsqrt:
+ case nir_op_fsub:
+ case nir_op_isub:
case nir_op_ftrunc:
case nir_op_ishl:
case nir_op_ixor: {
@@ -2600,7 +2870,7 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_flt32:
case nir_op_ilt32:
case nir_op_ult32:
- case nir_op_fneu32:
+ case nir_op_fne32:
case nir_op_ine32: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
@@ -2616,12 +2886,58 @@ Converter::visit(nir_alu_instr *insn)
i->sType = sTypes[0];
break;
}
+ // those are weird ALU ops and need special handling, because
+   // 1. they are always component based
+ // 2. they basically just merge multiple values into one data type
case nir_op_mov:
+ if (!insn->dest.dest.is_ssa && insn->dest.dest.reg.reg->num_array_elems) {
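+         // movs into array registers are lowered to local-memory stores at the
+         // offset reserved in regToLmemOffset when visiting the function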
+ nir_reg_dest& reg = insn->dest.dest.reg;
+ uint32_t goffset = regToLmemOffset[reg.reg->index];
+ uint8_t comps = reg.reg->num_components;
+ uint8_t size = reg.reg->bit_size / 8;
+ uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size;
+ uint32_t aoffset = csize * reg.base_offset;
+ Value *indirect = NULL;
+
+ if (reg.indirect)
+ indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS),
+ getSrc(reg.indirect, 0), mkImm(csize));
+
+ for (uint8_t i = 0u; i < comps; ++i) {
+ if (!((1u << i) & insn->dest.write_mask))
+ continue;
+
+ Symbol *sym = mkSymbol(FILE_MEMORY_LOCAL, 0, dType, goffset + aoffset + i * size);
+ mkStore(OP_STORE, dType, sym, indirect, getSrc(&insn->src[0], i));
+ }
+ break;
+ } else if (!insn->src[0].src.is_ssa && insn->src[0].src.reg.reg->num_array_elems) {
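+         // reads from array registers become loads from the same local-memory slot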
+ LValues &newDefs = convert(&insn->dest);
+ nir_reg_src& reg = insn->src[0].src.reg;
+ uint32_t goffset = regToLmemOffset[reg.reg->index];
+ // uint8_t comps = reg.reg->num_components;
+ uint8_t size = reg.reg->bit_size / 8;
+ uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size;
+ uint32_t aoffset = csize * reg.base_offset;
+ Value *indirect = NULL;
+
+ if (reg.indirect)
+ indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS), getSrc(reg.indirect, 0), mkImm(csize));
+
+ for (uint8_t i = 0u; i < newDefs.size(); ++i)
+ loadFrom(FILE_MEMORY_LOCAL, 0, dType, newDefs[i], goffset + aoffset, i, indirect);
+
+ break;
+ } else {
+ LValues &newDefs = convert(&insn->dest);
+ for (LValues::size_type c = 0u; c < newDefs.size(); ++c) {
+ mkMov(newDefs[c], getSrc(&insn->src[0], c), dType);
+ }
+ }
+ break;
case nir_op_vec2:
case nir_op_vec3:
- case nir_op_vec4:
- case nir_op_vec8:
- case nir_op_vec16: {
+ case nir_op_vec4: {
LValues &newDefs = convert(&insn->dest);
for (LValues::size_type c = 0u; c < newDefs.size(); ++c) {
mkMov(newDefs[c], getSrc(&insn->src[c]), dType);
@@ -2717,7 +3033,7 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_bfm: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
- mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W;
+ mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1]));
break;
}
case nir_op_bitfield_insert: {
@@ -2737,69 +3053,17 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_bitfield_reverse: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
- mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0]));
+ mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
break;
}
case nir_op_find_lsb: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
Value *tmp = getSSA();
- mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0]));
+ mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
break;
}
- case nir_op_extract_u8: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_i8: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_u16: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_i16: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_urol: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
- getSrc(&insn->src[1]), getSrc(&insn->src[0]))
- ->subOp = NV50_IR_SUBOP_SHF_L |
- NV50_IR_SUBOP_SHF_W |
- NV50_IR_SUBOP_SHF_HI;
- break;
- }
- case nir_op_uror: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
- getSrc(&insn->src[1]), getSrc(&insn->src[0]))
- ->subOp = NV50_IR_SUBOP_SHF_R |
- NV50_IR_SUBOP_SHF_W |
- NV50_IR_SUBOP_SHF_LO;
- break;
- }
// boolean conversions
case nir_op_b2f32: {
DEFAULT_CHECKS;
@@ -2845,7 +3109,6 @@ Converter::visit(nir_alu_instr *insn)
}
default:
ERROR("unknown nir_op %s\n", info.name);
- assert(false);
return false;
}
@@ -2917,6 +3180,14 @@ Converter::convert(glsl_sampler_dim dim, bool isArray, bool isShadow)
}
#undef CASE_SAMPLER
+Value*
+Converter::applyProjection(Value *src, Value *proj)
+{
+ if (!proj)
+ return src;
+ return mkOp2v(OP_MUL, TYPE_F32, getScratch(), src, proj);
+}
+
unsigned int
Converter::getNIRArgCount(TexInstruction::Target& target)
{
@@ -2928,14 +3199,70 @@ Converter::getNIRArgCount(TexInstruction::Target& target)
return result;
}
+uint16_t
+Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_variable * &tex)
+{
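+   // walk the deref chain back to the variable, adding constant array/struct
+   // offsets into the result and collecting stride/index pairs that are folded
+   // into a single indirect value afterwards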
+ typedef std::pair<uint32_t,Value*> DerefPair;
+ std::list<DerefPair> derefs;
+
+ uint16_t result = 0;
+ while (deref->deref_type != nir_deref_type_var) {
+ switch (deref->deref_type) {
+ case nir_deref_type_array: {
+ Value *indirect;
+ uint8_t size = type_size(deref->type, true);
+ result += size * getIndirect(&deref->arr.index, 0, indirect);
+
+ if (indirect) {
+ derefs.push_front(std::make_pair(size, indirect));
+ }
+
+ break;
+ }
+ case nir_deref_type_struct: {
+ result += nir_deref_instr_parent(deref)->type->struct_location_offset(deref->strct.index);
+ break;
+ }
+ case nir_deref_type_var:
+ default:
+ unreachable("nir_deref_type_var reached in handleDeref!");
+ break;
+ }
+ deref = nir_deref_instr_parent(deref);
+ }
+
+ indirect = NULL;
+ for (std::list<DerefPair>::const_iterator it = derefs.begin(); it != derefs.end(); ++it) {
+ Value *offset = mkOp2v(OP_MUL, TYPE_U32, getSSA(), loadImm(getSSA(), it->first), it->second);
+ if (indirect)
+ indirect = mkOp2v(OP_ADD, TYPE_U32, getSSA(), indirect, offset);
+ else
+ indirect = offset;
+ }
+
+ tex = nir_deref_instr_get_variable(deref);
+ assert(tex);
+
+ return result + tex->data.driver_location;
+}
+
CacheMode
Converter::convert(enum gl_access_qualifier access)
{
- if (access & ACCESS_VOLATILE)
+ switch (access) {
+ case ACCESS_VOLATILE:
return CACHE_CV;
- if (access & ACCESS_COHERENT)
+ case ACCESS_COHERENT:
return CACHE_CG;
- return CACHE_CA;
+ default:
+ return CACHE_CA;
+ }
+}
+
+CacheMode
+Converter::getCacheModeFromVar(const nir_variable *var)
+{
+ return convert(var->data.image.access);
}
bool
@@ -2959,6 +3286,7 @@ Converter::visit(nir_tex_instr *insn)
std::vector<nir_src*> offsets;
uint8_t mask = 0;
bool lz = false;
+ Value *proj = NULL;
TexInstruction::Target target = convert(insn->sampler_dim, insn->is_array, insn->is_shadow);
operation op = getOperation(insn->op);
@@ -2971,6 +3299,7 @@ Converter::visit(nir_tex_instr *insn)
int msIdx = nir_tex_instr_src_index(insn, nir_tex_src_ms_index);
int lodIdx = nir_tex_instr_src_index(insn, nir_tex_src_lod);
int offsetIdx = nir_tex_instr_src_index(insn, nir_tex_src_offset);
+ int projIdx = nir_tex_instr_src_index(insn, nir_tex_src_projector);
int sampOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_offset);
int texOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_texture_offset);
int sampHandleIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_handle);
@@ -2979,9 +3308,12 @@ Converter::visit(nir_tex_instr *insn)
bool bindless = sampHandleIdx != -1 || texHandleIdx != -1;
assert((sampHandleIdx != -1) == (texHandleIdx != -1));
+ if (projIdx != -1)
+ proj = mkOp1v(OP_RCP, TYPE_F32, getScratch(), getSrc(&insn->src[projIdx].src, 0));
+
srcs.resize(insn->coord_components);
for (uint8_t i = 0u; i < insn->coord_components; ++i)
- srcs[i] = getSrc(&insn->src[coordsIdx].src, i);
+ srcs[i] = applyProjection(getSrc(&insn->src[coordsIdx].src, i), proj);
   // sometimes we get fewer args than target.getArgCount, but codegen expects the latter
if (insn->coord_components) {
@@ -3009,7 +3341,7 @@ Converter::visit(nir_tex_instr *insn)
if (offsetIdx != -1)
offsets.push_back(&insn->src[offsetIdx].src);
if (compIdx != -1)
- srcs.push_back(getSrc(&insn->src[compIdx].src, 0));
+ srcs.push_back(applyProjection(getSrc(&insn->src[compIdx].src, 0), proj));
if (texOffIdx != -1) {
srcs.push_back(getSrc(&insn->src[texOffIdx].src, 0));
texOffIdx = srcs.size() - 1;
@@ -3111,34 +3443,23 @@ Converter::visit(nir_tex_instr *insn)
return true;
}
-/* nouveau's RA doesn't track the liveness of exported registers in the fragment
- * shader, so we need all the store_outputs to appear at the end of the shader
- * with no other instructions that might generate a temp value in between them.
- */
-static void
-nv_nir_move_stores_to_end(nir_shader *s)
+bool
+Converter::visit(nir_deref_instr *deref)
{
- nir_function_impl *impl = nir_shader_get_entrypoint(s);
- nir_block *block = nir_impl_last_block(impl);
- nir_instr *first_store = NULL;
-
- nir_foreach_instr_safe(instr, block) {
- if (instr == first_store)
- break;
- if (instr->type != nir_instr_type_intrinsic)
- continue;
- nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic == nir_intrinsic_store_output) {
- nir_instr_remove(instr);
- nir_instr_insert(nir_after_block(block), instr);
-
- if (!first_store)
- first_store = instr;
- }
+   // We just ignore these, because image intrinsics are the only place where
+   // we should end up with deref sources, and those have to backtrack anyway
+   // to get the nir_variable. This code just exists to handle some special
+   // cases.
+ switch (deref->deref_type) {
+ case nir_deref_type_array:
+ case nir_deref_type_struct:
+ case nir_deref_type_var:
+ break;
+ default:
+ ERROR("unknown nir_deref_instr %u\n", deref->deref_type);
+ return false;
}
- nir_metadata_preserve(impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ return true;
}
bool
@@ -3149,44 +3470,18 @@ Converter::run()
if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
nir_print_shader(nir, stderr);
- struct nir_lower_subgroups_options subgroup_options = {};
- subgroup_options.subgroup_size = 32;
- subgroup_options.ballot_bit_size = 32;
- subgroup_options.ballot_components = 1;
- subgroup_options.lower_elect = true;
-
- /* prepare for IO lowering */
- NIR_PASS_V(nir, nir_opt_deref);
- NIR_PASS_V(nir, nir_lower_regs_to_ssa);
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- /* codegen assumes vec4 alignment for memory */
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, function_temp_type_info);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset);
- NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
-
- NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size, (nir_lower_io_options)0);
+ struct nir_lower_subgroups_options subgroup_options = {
+ .subgroup_size = 32,
+ .ballot_bit_size = 32,
+ };
+ NIR_PASS_V(nir, nir_lower_io, nir_var_all, type_size, (nir_lower_io_options)0);
NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options);
-
- struct nir_lower_tex_options tex_options = {};
- tex_options.lower_txp = ~0;
-
- NIR_PASS_V(nir, nir_lower_tex, &tex_options);
-
+ NIR_PASS_V(nir, nir_lower_regs_to_ssa);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
- NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
- NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
-
- /*TODO: improve this lowering/optimisation loop so that we can use
- * nir_opt_idiv_const effectively before this.
- */
- nir_lower_idiv_options idiv_options = {
- .imprecise_32bit_lowering = false,
- .allow_fp16 = true,
- };
- NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL);
+ NIR_PASS_V(nir, nir_lower_phis_to_scalar);
do {
progress = false;
@@ -3199,21 +3494,11 @@ Converter::run()
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_lower_64bit_phis);
} while (progress);
- nir_move_options move_options =
- (nir_move_options)(nir_move_const_undef |
- nir_move_load_ubo |
- nir_move_load_uniform |
- nir_move_load_input);
- NIR_PASS_V(nir, nir_opt_sink, move_options);
- NIR_PASS_V(nir, nir_opt_move, move_options);
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT)
- NIR_PASS_V(nir, nv_nir_move_stores_to_end);
-
NIR_PASS_V(nir, nir_lower_bool_to_int32);
+ NIR_PASS_V(nir, nir_lower_locals_to_regs);
+ NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
NIR_PASS_V(nir, nir_convert_from_ssa, true);
// Garbage collect dead instructions
@@ -3245,150 +3530,17 @@ Converter::run()
namespace nv50_ir {
bool
-Program::makeFromNIR(struct nv50_ir_prog_info *info,
- struct nv50_ir_prog_info_out *info_out)
+Program::makeFromNIR(struct nv50_ir_prog_info *info)
{
nir_shader *nir = (nir_shader*)info->bin.source;
- Converter converter(this, nir, info, info_out);
+ Converter converter(this, nir, info);
bool result = converter.run();
if (!result)
return result;
LoweringHelper lowering;
lowering.run(this);
- tlsSize = info_out->bin.tlsSpace;
+ tlsSize = info->bin.tlsSpace;
return result;
}
} // namespace nv50_ir
-
-static nir_shader_compiler_options
-nvir_nir_shader_compiler_options(int chipset)
-{
- nir_shader_compiler_options op = {};
- op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_ffma16 = false;
- op.lower_ffma32 = false;
- op.lower_ffma64 = false;
- op.fuse_ffma16 = false; /* nir doesn't track mad vs fma */
- op.fuse_ffma32 = false; /* nir doesn't track mad vs fma */
- op.fuse_ffma64 = false; /* nir doesn't track mad vs fma */
- op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_flrp32 = true;
- op.lower_flrp64 = true;
- op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it
- op.lower_fsat = false;
- op.lower_fsqrt = false; // TODO: only before gm200
- op.lower_sincos = false;
- op.lower_fmod = true;
- op.lower_bitfield_extract = false;
- op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_bitfield_insert = false;
- op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_bitfield_insert_to_bitfield_select = false;
- op.lower_bitfield_reverse = false;
- op.lower_bit_count = false;
- op.lower_ifind_msb = false;
- op.lower_find_lsb = false;
- op.lower_uadd_carry = true; // TODO
- op.lower_usub_borrow = true; // TODO
- op.lower_mul_high = false;
- op.lower_fneg = false;
- op.lower_ineg = false;
- op.lower_scmp = true; // TODO: not implemented yet
- op.lower_vector_cmp = false;
- op.lower_bitops = false;
- op.lower_isign = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_fdph = false;
- op.lower_fdot = false;
- op.fdot_replicates = false; // TODO
- op.lower_ffloor = false; // TODO
- op.lower_ffract = true;
- op.lower_fceil = false; // TODO
- op.lower_ftrunc = false;
- op.lower_ldexp = true;
- op.lower_pack_half_2x16 = true;
- op.lower_pack_unorm_2x16 = true;
- op.lower_pack_snorm_2x16 = true;
- op.lower_pack_unorm_4x8 = true;
- op.lower_pack_snorm_4x8 = true;
- op.lower_unpack_half_2x16 = true;
- op.lower_unpack_unorm_2x16 = true;
- op.lower_unpack_snorm_2x16 = true;
- op.lower_unpack_unorm_4x8 = true;
- op.lower_unpack_snorm_4x8 = true;
- op.lower_pack_split = false;
- op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET);
- op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET);
- op.lower_insert_byte = true;
- op.lower_insert_word = true;
- op.lower_all_io_to_temps = false;
- op.lower_all_io_to_elements = false;
- op.vertex_id_zero_based = false;
- op.lower_base_vertex = false;
- op.lower_helper_invocation = false;
- op.optimize_sample_mask_in = false;
- op.lower_cs_local_index_to_id = true;
- op.lower_cs_local_id_to_index = false;
- op.lower_device_index_to_zero = false; // TODO
- op.lower_wpos_pntc = false; // TODO
- op.lower_hadd = true; // TODO
- op.lower_uadd_sat = true; // TODO
- op.lower_iadd_sat = true; // TODO
- op.vectorize_io = false;
- op.lower_to_scalar = false;
- op.unify_interfaces = false;
- op.use_interpolated_input_intrinsics = true;
- op.lower_mul_2x32_64 = true; // TODO
- op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
- op.has_imul24 = false;
- op.intel_vec4 = false;
- op.max_unroll_iterations = 32;
- op.lower_int64_options = (nir_lower_int64_options) (
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
- nir_lower_divmod64 |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
- ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
- nir_lower_ufind_msb64
- );
- op.lower_doubles_options = (nir_lower_doubles_options) (
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
- nir_lower_dmod |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
- );
- return op;
-}
-
-static const nir_shader_compiler_options g80_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_G80_CHIPSET);
-static const nir_shader_compiler_options gf100_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET);
-static const nir_shader_compiler_options gm107_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET);
-static const nir_shader_compiler_options gv100_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET);
-
-const nir_shader_compiler_options *
-nv50_ir_nir_shader_compiler_options(int chipset)
-{
- if (chipset >= NVISA_GV100_CHIPSET)
- return &gv100_nir_shader_compiler_options;
- if (chipset >= NVISA_GM107_CHIPSET)
- return &gm107_nir_shader_compiler_options;
- if (chipset >= NVISA_GF100_CHIPSET)
- return &gf100_nir_shader_compiler_options;
- return &g80_nir_shader_compiler_options;
-}
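Two lowerings in the nouveau hunk above are worth a standalone illustration: urol/uror are emitted as wrapping funnel shifts (OP_SHF with the SHF_W and SHF_HI/SHF_LO subops), and projective texture coordinates (nir_tex_src_projector) are unprojected with one reciprocal followed by a per-component multiply (applyProjection). The following is a minimal C sketch of both ideas, with hypothetical helper names and no dependency on the converter's IR builders:

#include <stdint.h>

/* Rotating x left by s is a funnel shift of the pair (x:x) by s, keeping the
 * high word; that is roughly what OP_SHF + SHF_L + SHF_W + SHF_HI expresses
 * above (uror keeps the low word instead). */
static uint32_t rotate_left32(uint32_t x, uint32_t s)
{
   s &= 31;
   return s ? (x << s) | (x >> (32 - s)) : x;
}

/* nir_tex_src_projector: divide every coordinate by the projector, done as a
 * single reciprocal plus one multiply per component, mirroring the OP_RCP +
 * OP_MUL sequence applyProjection() emits. */
static void apply_projection(float *coords, unsigned n, float proj)
{
   float inv = 1.0f / proj;
   for (unsigned i = 0; i < n; ++i)
      coords[i] *= inv;
}

Computing the reciprocal once, before the coordinate loop, is the same hoisting the converter does by building proj ahead of the srcs[] copy.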
diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
index ea9d7be0d..47f6c1e53 100644
--- a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
@@ -1,6 +1,5 @@
/*
* © Copyright 2018 Alyssa Rosenzweig
- * Copyright (C) 2019-2020 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,29 +25,27 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "pan_bo.h"
#include "pan_context.h"
-#include "pan_shader.h"
-#include "pan_util.h"
#include "compiler/nir/nir.h"
#include "nir/tgsi_to_nir.h"
+#include "midgard/midgard_compile.h"
#include "util/u_dynarray.h"
-#include "util/u_upload_mgr.h"
#include "tgsi/tgsi_dump.h"
void
-panfrost_shader_compile(struct pipe_screen *pscreen,
- struct panfrost_pool *shader_pool,
- struct panfrost_pool *desc_pool,
- enum pipe_shader_ir ir_type,
- const void *ir,
- gl_shader_stage stage,
- struct panfrost_shader_state *state)
+panfrost_shader_compile(
+ struct panfrost_context *ctx,
+ struct mali_shader_meta *meta,
+ enum pipe_shader_ir ir_type,
+ const void *ir,
+ gl_shader_stage stage,
+ struct panfrost_shader_state *state,
+ uint64_t *outputs_written)
{
- struct panfrost_screen *screen = pan_screen(pscreen);
- struct panfrost_device *dev = pan_device(pscreen);
+ struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+ uint8_t *dst;
nir_shader *s;
@@ -56,47 +53,117 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
s = nir_shader_clone(NULL, ir);
} else {
assert (ir_type == PIPE_SHADER_IR_TGSI);
- s = tgsi_to_nir(ir, pscreen, false);
+ s = tgsi_to_nir(ir, ctx->base.screen);
}
- /* Lower this early so the backends don't have to worry about it */
- if (stage == MESA_SHADER_FRAGMENT)
- NIR_PASS_V(s, nir_lower_fragcolor, state->nr_cbufs);
-
s->info.stage = stage;
+ if (stage == MESA_SHADER_FRAGMENT) {
+ /* Inject the alpha test now if we need to */
+
+ if (state->alpha_state.enabled) {
+ NIR_PASS_V(s, nir_lower_alpha_test, state->alpha_state.func, false);
+ }
+ }
+
/* Call out to Midgard compiler given the above NIR */
- struct panfrost_compile_inputs inputs = {
- .gpu_id = dev->gpu_id,
- .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+
+ midgard_program program = {
+ .alpha_ref = state->alpha_state.ref_value
};
- memcpy(inputs.rt_formats, state->rt_formats, sizeof(inputs.rt_formats));
+ midgard_compile_shader_nir(&ctx->compiler, s, &program, false);
+
+ /* Prepare the compiled binary for upload */
+ int size = program.compiled.size;
+ dst = program.compiled.data;
+
+ /* Upload the shader. The lookahead tag is ORed on as a tagged pointer.
+ * I bet someone just thought that would be a cute pun. At least,
+ * that's how I'd do it. */
+
+ state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE);
+ memcpy(state->bo->cpu, dst, size);
+ meta->shader = state->bo->gpu | program.first_tag;
+
+ util_dynarray_fini(&program.compiled);
+
+ /* Sysvals are prepended */
+ program.uniform_count += program.sysval_count;
+ state->sysval_count = program.sysval_count;
+ memcpy(state->sysval, program.sysvals, sizeof(state->sysval[0]) * state->sysval_count);
+
+ meta->midgard1.uniform_count = MIN2(program.uniform_count, program.uniform_cutoff);
+ meta->midgard1.work_count = program.work_register_count;
+
+ switch (stage) {
+ case MESA_SHADER_VERTEX:
+ meta->attribute_count = util_bitcount64(s->info.inputs_read);
+ meta->varying_count = util_bitcount64(s->info.outputs_written);
+ break;
+ case MESA_SHADER_FRAGMENT:
+ meta->attribute_count = 0;
+ meta->varying_count = util_bitcount64(s->info.inputs_read);
+ break;
+ case MESA_SHADER_COMPUTE:
+ /* TODO: images */
+ meta->attribute_count = 0;
+ meta->varying_count = 0;
+ break;
+ default:
+ unreachable("Unknown shader state");
+ }
- struct util_dynarray binary;
+ state->can_discard = s->info.fs.uses_discard;
+ state->writes_point_size = program.writes_point_size;
+ state->reads_point_coord = false;
+ state->helper_invocations = s->info.fs.needs_helper_invocations;
- util_dynarray_init(&binary, NULL);
- screen->vtbl.compile_shader(s, &inputs, &binary, &state->info);
+ if (outputs_written)
+ *outputs_written = s->info.outputs_written;
- if (binary.size) {
- state->bin = panfrost_pool_take_ref(shader_pool,
- pan_pool_upload_aligned(&shader_pool->base,
- binary.data, binary.size, 128));
- }
+ /* Separate as primary uniform count is truncated */
+ state->uniform_count = program.uniform_count;
+ meta->midgard1.unknown2 = 8; /* XXX */
- /* Don't upload RSD for fragment shaders since they need draw-time
- * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler
- * shader program descriptors on Valhall, which can be preuploaded even
- * for fragment shaders. */
- bool upload = !(stage == MESA_SHADER_FRAGMENT && dev->arch <= 7);
- screen->vtbl.prepare_shader(state, desc_pool, upload);
+ unsigned default_vec1_swizzle = panfrost_get_default_swizzle(1);
+ unsigned default_vec2_swizzle = panfrost_get_default_swizzle(2);
+ unsigned default_vec4_swizzle = panfrost_get_default_swizzle(4);
- panfrost_analyze_sysvals(state);
+ /* Iterate the varyings and emit the corresponding descriptor */
+ for (unsigned i = 0; i < meta->varying_count; ++i) {
+ unsigned location = program.varyings[i];
- util_dynarray_fini(&binary);
+ /* Default to a vec4 varying */
+ struct mali_attr_meta v = {
+ .format = MALI_RGBA32F,
+ .swizzle = default_vec4_swizzle,
+ .unknown1 = 0x2,
+ };
- /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
- * a NULL context */
- ralloc_free(s);
+ /* Check for special cases, otherwise assume general varying */
+
+ if (location == VARYING_SLOT_POS) {
+ v.format = MALI_VARYING_POS;
+ } else if (location == VARYING_SLOT_PSIZ) {
+ v.format = MALI_R16F;
+ v.swizzle = default_vec1_swizzle;
+
+ state->writes_point_size = true;
+ } else if (location == VARYING_SLOT_PNTC) {
+ v.format = MALI_RG16F;
+ v.swizzle = default_vec2_swizzle;
+
+ state->reads_point_coord = true;
+ } else if (location == VARYING_SLOT_FACE) {
+ v.format = MALI_R32I;
+ v.swizzle = default_vec1_swizzle;
+
+ state->reads_face = true;
+ }
+
+ state->varyings[i] = v;
+ state->varyings_loc[i] = location;
+ }
}
diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
index 844016ee9..d0b2e1322 100644
--- a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
+++ b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
@@ -1,6 +1,5 @@
/*
* Copyright (C) 2019 Collabora, Ltd.
- * Copyright (C) 2019 Red Hat Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,10 +26,7 @@
*/
#include "pan_context.h"
-#include "pan_bo.h"
-#include "pan_shader.h"
#include "util/u_memory.h"
-#include "nir_serialize.h"
/* Compute CSOs are tracked like graphics shader CSOs, but are
* considerably simpler. We do not implement multiple
@@ -43,38 +39,23 @@ panfrost_create_compute_state(
const struct pipe_compute_state *cso)
{
struct panfrost_context *ctx = pan_context(pctx);
- struct panfrost_screen *screen = pan_screen(pctx->screen);
struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants);
so->cbase = *cso;
so->is_compute = true;
- struct panfrost_shader_state *v = calloc(1, sizeof(*v));
- so->variants = v;
+ struct panfrost_shader_state *v = &so->variants[0];
so->variant_count = 1;
so->active_variant = 0;
- if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
- struct blob_reader reader;
- const struct pipe_binary_program_header *hdr = cso->prog;
+ v->tripipe = malloc(sizeof(struct mali_shader_meta));
- blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
+ panfrost_shader_compile(ctx, v->tripipe,
+ cso->ir_type, cso->prog,
+ MESA_SHADER_COMPUTE, v, NULL);
- const struct nir_shader_compiler_options *options =
- screen->vtbl.get_compiler_options();
- so->cbase.prog = nir_deserialize(NULL, options, &reader);
- so->cbase.ir_type = PIPE_SHADER_IR_NIR;
- }
-
- panfrost_shader_compile(pctx->screen, &ctx->shaders, &ctx->descs,
- so->cbase.ir_type, so->cbase.prog, MESA_SHADER_COMPUTE,
- v);
-
- /* There are no variants so we won't need the NIR again */
- ralloc_free((void *)so->cbase.prog);
- so->cbase.prog = NULL;
return so;
}
@@ -83,57 +64,76 @@ static void
panfrost_bind_compute_state(struct pipe_context *pipe, void *cso)
{
struct panfrost_context *ctx = pan_context(pipe);
- ctx->shader[PIPE_SHADER_COMPUTE] = cso;
+
+ struct panfrost_shader_variants *variants =
+ (struct panfrost_shader_variants *) cso;
+
+ ctx->shader[PIPE_SHADER_COMPUTE] = variants;
}
static void
panfrost_delete_compute_state(struct pipe_context *pipe, void *cso)
{
- struct panfrost_shader_variants *so =
- (struct panfrost_shader_variants *)cso;
-
- free(so->variants);
free(cso);
}
-static void
-panfrost_set_compute_resources(struct pipe_context *pctx,
- unsigned start, unsigned count,
- struct pipe_surface **resources)
-{
- /* TODO */
-}
+/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
+ * construct the COMPUTE job and some of its payload.
+ */
static void
-panfrost_set_global_binding(struct pipe_context *pctx,
- unsigned first, unsigned count,
- struct pipe_resource **resources,
- uint32_t **handles)
+panfrost_launch_grid(struct pipe_context *pipe,
+ const struct pipe_grid_info *info)
{
- if (!resources)
- return;
+ struct panfrost_context *ctx = pan_context(pipe);
- struct panfrost_context *ctx = pan_context(pctx);
- struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+ ctx->compute_grid = info;
- for (unsigned i = first; i < first + count; ++i) {
- struct panfrost_resource *rsrc = pan_resource(resources[i]);
- panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_COMPUTE);
+ struct mali_job_descriptor_header job = {
+ .job_type = JOB_TYPE_COMPUTE,
+ .job_descriptor_size = 1,
+ .job_barrier = 1
+ };
- util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
- 0, rsrc->base.width0);
+ /* TODO: Stub */
+ struct midgard_payload_vertex_tiler *payload = &ctx->payloads[PIPE_SHADER_COMPUTE];
- /* The handle points to uint32_t, but space is allocated for 64 bits */
- memcpy(handles[i], &rsrc->image.data.bo->ptr.gpu, sizeof(mali_ptr));
- }
-}
+ panfrost_emit_for_draw(ctx, false);
-static void
-panfrost_memory_barrier(struct pipe_context *pctx, unsigned flags)
-{
- /* TODO: Be smart and only flush the minimum needed, maybe emitting a
- * cache flush job if that would help */
- panfrost_flush_all_batches(pan_context(pctx), "Memory barrier");
+ /* Compute jobs have a "compute FBD". It's not a real framebuffer
+ * descriptor - there is no framebuffer - but it takes the place of
+ * one. As far as I can tell, it's actually the beginning of a
+ * single-render-target framebuffer descriptor with almost everything
+ * zeroed out.
+ */
+ struct mali_compute_fbd compute_fbd = {
+ .unknown1 = {
+ 0, 0x1F, 0, 0, 0, 0, 0, 0
+ }
+ };
+
+ payload->postfix.framebuffer =
+ panfrost_upload_transient(ctx, &compute_fbd, sizeof(compute_fbd));
+
+ /* Invoke according to the grid info */
+
+ panfrost_pack_work_groups_compute(&payload->prefix,
+ info->grid[0], info->grid[1], info->grid[2],
+ info->block[0], info->block[1], info->block[2]);
+
+ /* Upload the payload */
+
+ struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload));
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
+
+ /* TODO: Do we want a special compute-only batch? */
+ struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+
+ /* Queue the job */
+ panfrost_scoreboard_queue_compute_job(batch, transfer);
+
+ panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
}
void
@@ -143,8 +143,7 @@ panfrost_compute_context_init(struct pipe_context *pctx)
pctx->bind_compute_state = panfrost_bind_compute_state;
pctx->delete_compute_state = panfrost_delete_compute_state;
- pctx->set_compute_resources = panfrost_set_compute_resources;
- pctx->set_global_binding = panfrost_set_global_binding;
-
- pctx->memory_barrier = panfrost_memory_barrier;
+ pctx->launch_grid = panfrost_launch_grid;
}
+
+
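panfrost_launch_grid above writes the job header and the vertex/tiler payload back to back into a single transient allocation before queueing the job. A small self-contained sketch of that packing step, using placeholder structs instead of the real Mali descriptors:

#include <stdlib.h>
#include <string.h>

struct fake_job_header  { unsigned job_type, job_descriptor_size, job_barrier; };
struct fake_job_payload { unsigned grid[3], block[3]; };

/* Allocate one staging buffer and copy header + payload contiguously, the
 * same layout memcpy'd into transfer.cpu in the hunk above. */
static void *pack_compute_job(const struct fake_job_header *hdr,
                              const struct fake_job_payload *payload,
                              size_t *out_size)
{
   *out_size = sizeof(*hdr) + sizeof(*payload);
   char *cpu = malloc(*out_size);
   if (!cpu)
      return NULL;
   memcpy(cpu, hdr, sizeof(*hdr));
   memcpy(cpu + sizeof(*hdr), payload, sizeof(*payload));
   return cpu;
}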
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
index d4637ce72..d2c6378a0 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
@@ -25,502 +25,294 @@
*
**************************************************************************/
+#include <stdio.h>
+
#include "pipe/p_video_codec.h"
-#include "radeon_vcn_enc.h"
-#include "radeon_video.h"
-#include "si_pipe.h"
-#include "util/u_video.h"
-#include "util/u_memory.h"
-#include "radeon_efc.h"
-#include <stdio.h>
+#include "util/u_video.h"
-#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1
-#define RENCODE_FW_INTERFACE_MINOR_VERSION 1
-
-#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001
-#define RENCODE_IB_PARAM_TASK_INFO 0x00000002
-#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
-#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004
-#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005
-#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006
-#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007
-#define RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008
-#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009
-#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a
-#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b
-#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c
-#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d
-#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f
-#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010
-#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011
-#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012
-#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015
-#define RENCODE_IB_PARAM_EFC_CONFIG 0x0000000e
-
-#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001
-#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002
-#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003
-
-#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001
-#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002
-#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003
-#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004
-
-static void radeon_enc_op_balance(struct radeon_encoder *enc)
-{
- RADEON_ENC_BEGIN(RENCODE_IB_OP_SET_BALANCE_ENCODING_MODE);
- RADEON_ENC_END();
-}
+#include "si_pipe.h"
+#include "radeon_video.h"
+#include "radeon_vcn_enc.h"
-static void radeon_enc_slice_header_hevc(struct radeon_encoder *enc)
+#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1
+#define RENCODE_FW_INTERFACE_MINOR_VERSION 1
+
+#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001
+#define RENCODE_IB_PARAM_TASK_INFO 0x00000002
+#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
+#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004
+#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005
+#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006
+#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007
+#define RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008
+#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009
+#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a
+#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b
+#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c
+#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d
+#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f
+#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010
+#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011
+#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012
+#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015
+
+#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001
+#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002
+#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003
+
+#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001
+#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002
+#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003
+#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004
+
+static void radeon_enc_quality_params(struct radeon_encoder *enc)
{
- uint32_t instruction[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
- uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
- unsigned int inst_index = 0;
- unsigned int cdw_start = 0;
- unsigned int cdw_filled = 0;
- unsigned int bits_copied = 0;
- RADEON_ENC_BEGIN(enc->cmd.slice_header);
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
-
- cdw_start = enc->cs.current.cdw;
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.nal_unit_type, 6);
- radeon_enc_code_fixed_bits(enc, 0x0, 6);
- radeon_enc_code_fixed_bits(enc, 0x1, 3);
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_FIRST_SLICE;
- inst_index++;
-
- if ((enc->enc_pic.nal_unit_type >= 16) && (enc->enc_pic.nal_unit_type <= 23))
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_ue(enc, 0x0);
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_SEGMENT;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_DEPENDENT_SLICE_END;
- inst_index++;
-
- switch (enc->enc_pic.picture_type) {
- case PIPE_H2645_ENC_PICTURE_TYPE_I:
- case PIPE_H2645_ENC_PICTURE_TYPE_IDR:
- radeon_enc_code_ue(enc, 0x2);
- break;
- case PIPE_H2645_ENC_PICTURE_TYPE_P:
- case PIPE_H2645_ENC_PICTURE_TYPE_SKIP:
- radeon_enc_code_ue(enc, 0x1);
- break;
- case PIPE_H2645_ENC_PICTURE_TYPE_B:
- radeon_enc_code_ue(enc, 0x0);
- break;
- default:
- radeon_enc_code_ue(enc, 0x1);
- }
-
- if ((enc->enc_pic.nal_unit_type != 19) && (enc->enc_pic.nal_unit_type != 20)) {
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.pic_order_cnt, enc->enc_pic.log2_max_poc);
- if (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P)
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- else {
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- }
- }
-
- if (enc->enc_pic.sample_adaptive_offset_enabled_flag) {
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SAO_ENABLE;
- inst_index++;
- }
-
- if ((enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P) ||
- (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_B)) {
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.cabac_init_flag, 1);
- radeon_enc_code_ue(enc, 5 - enc->enc_pic.max_num_merge_cand);
- }
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_QP_DELTA;
- inst_index++;
-
- if ((enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled) &&
- (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled ||
- enc->enc_pic.sample_adaptive_offset_enabled_flag)) {
- if (enc->enc_pic.sample_adaptive_offset_enabled_flag) {
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_LOOP_FILTER_ACROSS_SLICES_ENABLE;
- inst_index++;
- }
- else
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
- }
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_END;
-
- cdw_filled = enc->cs.current.cdw - cdw_start;
- for (int i = 0; i < RENCODE_SLICE_HEADER_TEMPLATE_MAX_TEMPLATE_SIZE_IN_DWORDS - cdw_filled; i++)
- RADEON_ENC_CS(0x00000000);
-
- for (int j = 0; j < RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS; j++) {
- RADEON_ENC_CS(instruction[j]);
- RADEON_ENC_CS(num_bits[j]);
- }
-
- RADEON_ENC_END();
+ enc->enc_pic.quality_params.vbaq_mode = 0;
+ enc->enc_pic.quality_params.scene_change_sensitivity = 0;
+ enc->enc_pic.quality_params.scene_change_min_idr_interval = 0;
+ enc->enc_pic.quality_params.two_pass_search_center_map_mode = 0;
+
+ RADEON_ENC_BEGIN(enc->cmd.quality_params);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.vbaq_mode);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_sensitivity);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_min_idr_interval);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.two_pass_search_center_map_mode);
+ RADEON_ENC_END();
}
static void radeon_enc_loop_filter_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset);
- RADEON_ENC_CS(!enc->enc_pic.sample_adaptive_offset_enabled_flag);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset);
+ RADEON_ENC_CS(1);
+ RADEON_ENC_END();
}
static void radeon_enc_nalu_sps_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.nalu);
- RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS);
- uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++];
- int i;
-
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
- radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
- radeon_enc_code_fixed_bits(enc, 0x4201, 16);
- radeon_enc_byte_align(enc);
- radeon_enc_set_emulation_prevention(enc, true);
- radeon_enc_code_fixed_bits(enc, 0x0, 4);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5);
-
- if (enc->enc_pic.general_profile_idc == 2)
- radeon_enc_code_fixed_bits(enc, 0x20000000, 32);
- else
- radeon_enc_code_fixed_bits(enc, 0x60000000, 32);
-
- radeon_enc_code_fixed_bits(enc, 0xb0000000, 32);
- radeon_enc_code_fixed_bits(enc, 0x0, 16);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8);
-
- for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i++)
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
-
- if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) {
- for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++)
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- }
-
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height);
-
- if ((enc->enc_pic.crop_left != 0) || (enc->enc_pic.crop_right != 0) ||
- (enc->enc_pic.crop_top != 0) || (enc->enc_pic.crop_bottom != 0)) {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_left);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_right);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_top);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_bottom);
- } else if (enc->enc_pic.session_init.padding_width != 0 ||
- enc->enc_pic.session_init.padding_height != 0) {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2);
- } else
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8);
- radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3);
- // Only support CTBSize 64
- radeon_enc_code_ue(enc,
- 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3));
- radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size);
- radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter);
- radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1);
-
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 0);
- radeon_enc_code_ue(enc, 0);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_byte_align(enc);
- radeon_enc_flush_headers(enc);
- *size_in_bytes = (enc->bits_output + 7) / 8;
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.nalu);
+ RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS);
+ uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++];
+ int i;
+
+ radeon_enc_reset(enc);
+ radeon_enc_set_emulation_prevention(enc, false);
+ radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
+ radeon_enc_code_fixed_bits(enc, 0x4201, 16);
+ radeon_enc_byte_align(enc);
+ radeon_enc_set_emulation_prevention(enc, true);
+ radeon_enc_code_fixed_bits(enc, 0x0, 4);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5);
+ radeon_enc_code_fixed_bits(enc, 0x60000000, 32);
+ radeon_enc_code_fixed_bits(enc, 0xb0000000, 32);
+ radeon_enc_code_fixed_bits(enc, 0x0, 16);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8);
+
+ for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) ; i++)
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+
+ if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) {
+ for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++)
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ }
+
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc);
+ radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width);
+ radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8);
+ radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3);
+ //Only support CTBSize 64
+ radeon_enc_code_ue(enc, 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3));
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size);
+ radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter);
+ radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1);
+
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 0);
+ radeon_enc_code_ue(enc, 0);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_byte_align(enc);
+ radeon_enc_flush_headers(enc);
+ *size_in_bytes = (enc->bits_output + 7) / 8;
+ RADEON_ENC_END();
}
static void radeon_enc_nalu_pps_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.nalu);
- RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS);
- uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++];
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
- radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
- radeon_enc_code_fixed_bits(enc, 0x4401, 16);
- radeon_enc_byte_align(enc);
- radeon_enc_set_emulation_prevention(enc, true);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 4);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_se(enc, 0x0);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- if (enc->enc_pic.rc_session_init.rate_control_method == RENCODE_RATE_CONTROL_METHOD_NONE)
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- else {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, 0x0);
- }
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset);
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1);
-
- if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) {
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2);
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2);
- }
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
-
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_byte_align(enc);
- radeon_enc_flush_headers(enc);
- *size_in_bytes = (enc->bits_output + 7) / 8;
- RADEON_ENC_END();
-}
-
-static void radeon_enc_session_init(struct radeon_encoder *enc)
-{
- enc->enc_pic.session_init.encode_standard = RENCODE_ENCODE_STANDARD_H264;
- enc->enc_pic.session_init.aligned_picture_width = align(enc->base.width, 16);
- enc->enc_pic.session_init.aligned_picture_height = align(enc->base.height, 16);
- enc->enc_pic.session_init.padding_width = enc->enc_pic.session_init.aligned_picture_width - enc->base.width;
- enc->enc_pic.session_init.padding_height = enc->enc_pic.session_init.aligned_picture_height - enc->base.height;
- enc->enc_pic.session_init.pre_encode_mode = RENCODE_PREENCODE_MODE_NONE;
- enc->enc_pic.session_init.pre_encode_chroma_enabled = FALSE;
-
- RADEON_ENC_BEGIN(enc->cmd.session_init);
- RADEON_ENC_CS(enc->enc_pic.session_init.encode_standard);
- RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_width);
- RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_height);
- RADEON_ENC_CS(enc->enc_pic.session_init.padding_width);
- RADEON_ENC_CS(enc->enc_pic.session_init.padding_height);
- RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_mode);
- RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_chroma_enabled);
- RADEON_ENC_END();
-}
-
-static void radeon_enc_efc_config(struct radeon_encoder *enc)
-{
- if (enc->efc == NULL) {
- enc->efc = CALLOC_STRUCT(rvid_buffer);
- int buffer_size = 46 * 1024;
- if (!si_vid_create_buffer(enc->screen, enc->efc, buffer_size, PIPE_USAGE_DYNAMIC)) {
- RVID_ERR("Can't create EFC conversion table buffer.\n");
- FREE(enc->efc);
- return;
- }
-
- uint32_t *ptr = enc->ws->buffer_map(enc->ws, enc->efc->res->buf, &enc->cs, PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
- memcpy(ptr, Yuv_st2084_rec2020_st2084_rec2020_2000, 46817);
- enc->ws->buffer_unmap(enc->ws, enc->efc->res->buf);
- }
-
- enc->enc_pic.efc_params.coef_buffer_size = 46817;
- enc->enc_pic.efc_params.cm_program_register_data_size = 1728;
-
- assert(enc->efc);
-
- RADEON_ENC_BEGIN(enc->cmd.efc_params);
- RADEON_ENC_WRITE(enc->efc->res->buf, enc->efc->res->domains, 0x0);
- RADEON_ENC_CS(enc->enc_pic.efc_params.coef_buffer_size);
- RADEON_ENC_CS(enc->enc_pic.efc_params.cm_program_register_data_size);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.nalu);
+ RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS);
+ uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++];
+ radeon_enc_reset(enc);
+ radeon_enc_set_emulation_prevention(enc, false);
+ radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
+ radeon_enc_code_fixed_bits(enc, 0x4401, 16);
+ radeon_enc_byte_align(enc);
+ radeon_enc_set_emulation_prevention(enc, true);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 4);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_se(enc, 0x0);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1);
+
+ if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) {
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2);
+ }
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_byte_align(enc);
+ radeon_enc_flush_headers(enc);
+ *size_in_bytes = (enc->bits_output + 7) / 8;
+ RADEON_ENC_END();
}
static void radeon_enc_input_format(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.input_format);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_volume);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_space);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_range);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_subsampling);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_location);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_bit_depth);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_packing_format);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.input_format);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_END();
}
static void radeon_enc_output_format(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.output_format);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_volume);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_range);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_chroma_location);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_bit_depth);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.output_format);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_END();
}
static void encode(struct radeon_encoder *enc)
{
- enc->before_encode(enc);
- enc->session_info(enc);
- enc->total_task_size = 0;
- enc->task_info(enc, enc->need_feedback);
- enc->efc_params(enc);
- enc->encode_headers(enc);
- enc->ctx(enc);
- enc->bitstream(enc);
- enc->feedback(enc);
- enc->intra_refresh(enc);
- enc->input_format(enc);
- enc->output_format(enc);
-
- enc->op_preset(enc);
- enc->op_enc(enc);
- *enc->p_task_size = (enc->total_task_size);
+ enc->session_info(enc);
+ enc->total_task_size = 0;
+ enc->task_info(enc, enc->need_feedback);
+
+ enc->encode_headers(enc);
+ enc->ctx(enc);
+ enc->bitstream(enc);
+ enc->feedback(enc);
+ enc->intra_refresh(enc);
+ enc->input_format(enc);
+ enc->output_format(enc);
+
+ enc->op_speed(enc);
+ enc->op_enc(enc);
+ *enc->p_task_size = (enc->total_task_size);
}
void radeon_enc_2_0_init(struct radeon_encoder *enc)
{
- radeon_enc_1_2_init(enc);
- enc->encode = encode;
- enc->input_format = radeon_enc_input_format;
- enc->output_format = radeon_enc_output_format;
- enc->efc_params = radeon_enc_efc_config;
-
- if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
- enc->session_init = radeon_enc_session_init;
- }
- if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
- enc->deblocking_filter = radeon_enc_loop_filter_hevc;
- enc->nalu_sps = radeon_enc_nalu_sps_hevc;
- enc->nalu_pps = radeon_enc_nalu_pps_hevc;
- enc->slice_header = radeon_enc_slice_header_hevc;
- enc->op_preset = radeon_enc_op_balance;
- }
-
- enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO;
- enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO;
- enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT;
- enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL;
- enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT;
- enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT;
- enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT;
- enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE;
- enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS;
- enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU;
- enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER;
- enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT;
- enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT;
- enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS;
- enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH;
- enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER;
- enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER;
- enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER;
- enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL;
- enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC;
- enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER;
- enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL;
- enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC;
- enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS;
- enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER;
- enc->cmd.efc_params = RENCODE_IB_PARAM_EFC_CONFIG;
-
- enc->enc_pic.session_info.interface_version =
- ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) |
- (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT));
+ radeon_enc_1_2_init(enc);
+ enc->encode = encode;
+ enc->quality_params = radeon_enc_quality_params;
+ enc->input_format = radeon_enc_input_format;
+ enc->output_format = radeon_enc_output_format;
+
+ if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
+ enc->deblocking_filter = radeon_enc_loop_filter_hevc;
+ enc->nalu_sps = radeon_enc_nalu_sps_hevc;
+ enc->nalu_pps = radeon_enc_nalu_pps_hevc;
+ }
+
+ enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO;
+ enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO;
+ enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT;
+ enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL;
+ enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT;
+ enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT;
+ enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT;
+ enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE;
+ enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS;
+ enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU;
+ enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER;
+ enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT;
+ enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT;
+ enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS;
+ enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH;
+ enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER;
+ enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER;
+ enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER;
+ enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL;
+ enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC;
+ enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER;
+ enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL;
+ enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC;
+ enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS;
+ enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER;
+
+ enc->enc_pic.session_info.interface_version =
+ ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) |
+ (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT));
}
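The HEVC SPS/PPS writers above build their syntax elements almost entirely out of radeon_enc_code_ue and radeon_enc_code_se, i.e. Exp-Golomb coded fields. For readers checking those headers by hand, here is a short, driver-independent reference encoder for ue(v) and se(v); it emits the bits as a string instead of going through the encoder's bit writer:

#include <stdint.h>
#include <stdio.h>

/* ue(v): write floor(log2(value+1)) zero bits, then value+1 in binary. */
static unsigned exp_golomb_ue(uint32_t value, char *out)
{
   uint32_t v = value + 1;
   unsigned bits = 0;
   while (bits < 31 && (1u << (bits + 1)) <= v)
      bits++;                          /* bits = floor(log2(v)) */
   unsigned len = 2 * bits + 1;
   for (unsigned i = 0; i < len; ++i)
      out[i] = ((v >> (len - 1 - i)) & 1) ? '1' : '0';
   out[len] = '\0';
   return len;
}

/* se(v): map the signed value to an unsigned code number first:
 * k > 0 -> 2k - 1, k <= 0 -> -2k, then encode it as ue(v). */
static unsigned exp_golomb_se(int32_t value, char *out)
{
   uint32_t mapped = value > 0 ? 2u * (uint32_t)value - 1
                               : (uint32_t)(-2 * (int64_t)value);
   return exp_golomb_ue(mapped, out);
}

int main(void)
{
   char buf[80];
   exp_golomb_ue(3, buf);
   printf("ue(3)  = %s\n", buf);   /* 00100 */
   exp_golomb_se(-2, buf);
   printf("se(-2) = %s\n", buf);   /* maps to ue(4) -> 00101 */
   return 0;
}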