| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000 |
| --- | --- | --- |
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000 |
| commit | d1e8c371581041f403dcdcff4ab8a88e970d221e (patch) | |
| tree | 621cf3eea9401b6fc19ce2a6dc5aa7579ecc8c70 | |
| parent | 81f619d3e99a3a218e6318d06c2bc1a36052e75d (diff) | |
Import Mesa 19.2.8
| -rw-r--r-- | lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c | 83 |
| -rw-r--r-- | lib/mesa/src/freedreno/vulkan/tu_private.h | 2165 |
| -rw-r--r-- | lib/mesa/src/freedreno/vulkan/vk_format.h | 546 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp | 14 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h | 3 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp | 1612 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c | 151 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/panfrost/pan_compute.c | 125 |
| -rw-r--r-- | lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c | 722 |

9 files changed, 2742 insertions(+), 2679 deletions(-)
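The tu_pipeline_cache.c hunk below replaces the shared `vk_pipeline_cache_header` with a locally defined `cache_header`, whose layout is the one the Vulkan specification mandates for the blob returned by `vkGetPipelineCacheData()`. As a rough illustration of that layout only (not code from this import), the sketch below shows how a client could check whether a previously saved blob still matches the current driver and device; the helper name and the use of `VkPhysicalDeviceProperties` are assumptions for the example.

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

/* Header that prefixes vkGetPipelineCacheData() blobs
 * (VK_PIPELINE_CACHE_HEADER_VERSION_ONE); mirrors the cache_header
 * struct added in the hunk below. */
struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t  uuid[VK_UUID_SIZE];
};

/* Hypothetical helper: reject a saved cache blob that was produced by a
 * different driver or device, the same checks tu_pipeline_cache_load()
 * performs before trusting the data. */
static bool
cache_blob_usable(const void *data, size_t size,
                  const VkPhysicalDeviceProperties *props)
{
   struct cache_header header;

   if (size < sizeof(header))
      return false;
   memcpy(&header, data, sizeof(header));

   return header.header_size >= sizeof(header) &&
          header.header_version == VK_PIPELINE_CACHE_HEADER_VERSION_ONE &&
          header.vendor_id == props->vendorID &&
          header.device_id == props->deviceID &&
          memcmp(header.uuid, props->pipelineCacheUUID, VK_UUID_SIZE) == 0;
}
```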
diff --git a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c index 5cfc79fbf..b8b2ceda2 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c +++ b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c @@ -27,7 +27,6 @@ #include "util/disk_cache.h" #include "util/mesa-sha1.h" #include "util/u_atomic.h" -#include "vulkan/util/vk_util.h" struct cache_entry_variant_info { @@ -44,7 +43,7 @@ struct cache_entry char code[0]; }; -static void +void tu_pipeline_cache_init(struct tu_pipeline_cache *cache, struct tu_device *device) { @@ -67,7 +66,7 @@ tu_pipeline_cache_init(struct tu_pipeline_cache *cache, memset(cache->hash_table, 0, byte_size); } -static void +void tu_pipeline_cache_finish(struct tu_pipeline_cache *cache) { for (unsigned i = 0; i < cache->table_size; ++i) @@ -89,6 +88,41 @@ entry_size(struct cache_entry *entry) return ret; } +void +tu_hash_shaders(unsigned char *hash, + const VkPipelineShaderStageCreateInfo **stages, + const struct tu_pipeline_layout *layout, + const struct tu_pipeline_key *key, + uint32_t flags) +{ + struct mesa_sha1 ctx; + + _mesa_sha1_init(&ctx); + if (key) + _mesa_sha1_update(&ctx, key, sizeof(*key)); + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + for (int i = 0; i < MESA_SHADER_STAGES; ++i) { + if (stages[i]) { + TU_FROM_HANDLE(tu_shader_module, module, stages[i]->module); + const VkSpecializationInfo *spec_info = + stages[i]->pSpecializationInfo; + + _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1)); + _mesa_sha1_update(&ctx, stages[i]->pName, strlen(stages[i]->pName)); + if (spec_info) { + _mesa_sha1_update( + &ctx, spec_info->pMapEntries, + spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]); + _mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize); + } + } + } + _mesa_sha1_update(&ctx, &flags, 4); + _mesa_sha1_final(&ctx, hash); +} + static struct cache_entry * tu_pipeline_cache_search_unlocked(struct tu_pipeline_cache *cache, const unsigned char *sha1) @@ -162,7 +196,7 @@ tu_pipeline_cache_grow(struct tu_pipeline_cache *cache) table = malloc(byte_size); if (table == NULL) - return vk_error(cache, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); cache->hash_table = table; cache->table_size = table_size; @@ -197,13 +231,22 @@ tu_pipeline_cache_add_entry(struct tu_pipeline_cache *cache, tu_pipeline_cache_set_entry(cache, entry); } -static void +struct cache_header +{ + uint32_t header_size; + uint32_t header_version; + uint32_t vendor_id; + uint32_t device_id; + uint8_t uuid[VK_UUID_SIZE]; +}; + +void tu_pipeline_cache_load(struct tu_pipeline_cache *cache, const void *data, size_t size) { struct tu_device *device = cache->device; - struct vk_pipeline_cache_header header; + struct cache_header header; if (size < sizeof(header)) return; @@ -212,9 +255,9 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache, return; if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE) return; - if (header.vendor_id != 0x5143) + if (header.vendor_id != 0 /* TODO */) return; - if (header.device_id != device->physical_device->dev_id.chip_id) + if (header.device_id != 0 /* TODO */) return; if (memcmp(header.uuid, device->physical_device->cache_uuid, VK_UUID_SIZE) != 0) @@ -242,7 +285,7 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache, } } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreatePipelineCache(VkDevice _device, const VkPipelineCacheCreateInfo *pCreateInfo, const 
VkAllocationCallbacks *pAllocator, @@ -254,15 +297,15 @@ tu_CreatePipelineCache(VkDevice _device, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO); assert(pCreateInfo->flags == 0); - cache = vk_object_alloc(&device->vk, pAllocator, sizeof(*cache), - VK_OBJECT_TYPE_PIPELINE_CACHE); + cache = vk_alloc2(&device->alloc, pAllocator, sizeof(*cache), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cache == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) cache->alloc = *pAllocator; else - cache->alloc = device->vk.alloc; + cache->alloc = device->alloc; tu_pipeline_cache_init(cache, device); @@ -276,7 +319,7 @@ tu_CreatePipelineCache(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyPipelineCache(VkDevice _device, VkPipelineCache _cache, const VkAllocationCallbacks *pAllocator) @@ -288,10 +331,10 @@ tu_DestroyPipelineCache(VkDevice _device, return; tu_pipeline_cache_finish(cache); - vk_object_free(&device->vk, pAllocator, cache); + vk_free2(&device->alloc, pAllocator, cache); } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetPipelineCacheData(VkDevice _device, VkPipelineCache _cache, size_t *pDataSize, @@ -299,7 +342,7 @@ tu_GetPipelineCacheData(VkDevice _device, { TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_pipeline_cache, cache, _cache); - struct vk_pipeline_cache_header *header; + struct cache_header *header; VkResult result = VK_SUCCESS; pthread_mutex_lock(&cache->mutex); @@ -319,8 +362,8 @@ tu_GetPipelineCacheData(VkDevice _device, header = p; header->header_size = sizeof(*header); header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE; - header->vendor_id = 0x5143; - header->device_id = device->physical_device->dev_id.chip_id; + header->vendor_id = 0 /* TODO */; + header->device_id = 0 /* TODO */; memcpy(header->uuid, device->physical_device->cache_uuid, VK_UUID_SIZE); p += header->header_size; @@ -361,7 +404,7 @@ tu_pipeline_cache_merge(struct tu_pipeline_cache *dst, } } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_MergePipelineCaches(VkDevice _device, VkPipelineCache destCache, uint32_t srcCacheCount, diff --git a/lib/mesa/src/freedreno/vulkan/tu_private.h b/lib/mesa/src/freedreno/vulkan/tu_private.h index 862d507c9..c2440471f 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_private.h +++ b/lib/mesa/src/freedreno/vulkan/tu_private.h @@ -40,47 +40,28 @@ #include <valgrind.h> #define VG(x) x #else -#define VG(x) ((void)0) +#define VG(x) #endif -#define MESA_LOG_TAG "TU" - #include "c11/threads.h" -#include "util/rounding.h" -#include "util/bitscan.h" +#include "compiler/shader_enums.h" +#include "main/macros.h" #include "util/list.h" -#include "util/log.h" #include "util/macros.h" -#include "util/sparse_array.h" -#include "util/u_atomic.h" -#include "util/u_dynarray.h" -#include "util/xmlconfig.h" -#include "util/perf/u_trace.h" #include "vk_alloc.h" #include "vk_debug_report.h" -#include "vk_device.h" -#include "vk_dispatch_table.h" -#include "vk_extensions.h" -#include "vk_instance.h" -#include "vk_log.h" -#include "vk_physical_device.h" -#include "vk_shader_module.h" #include "wsi_common.h" +#include "drm-uapi/msm_drm.h" #include "ir3/ir3_compiler.h" #include "ir3/ir3_shader.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" #include "a6xx.xml.h" -#include "fdl/freedreno_layout.h" -#include "common/freedreno_dev_info.h" -#include "perfcntrs/freedreno_perfcntr.h" #include "tu_descriptor_set.h" -#include 
"tu_autotune.h" -#include "tu_util.h" -#include "tu_perfetto.h" +#include "tu_extensions.h" /* Pre-declarations needed for WSI entrypoints */ struct wl_surface; @@ -92,54 +73,143 @@ typedef uint32_t xcb_window_t; #include <vulkan/vk_android_native_buffer.h> #include <vulkan/vk_icd.h> #include <vulkan/vulkan.h> +#include <vulkan/vulkan_intel.h> #include "tu_entrypoints.h" -#include "vk_format.h" -#include "vk_image.h" -#include "vk_command_buffer.h" -#include "vk_command_pool.h" -#include "vk_queue.h" -#include "vk_object.h" -#include "vk_sync.h" -#include "vk_fence.h" -#include "vk_semaphore.h" -#include "vk_drm_syncobj.h" -#include "vk_sync_timeline.h" - #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 #define MAX_VSC_PIPES 32 -#define MAX_VIEWPORTS 16 -#define MAX_VIEWPORT_SIZE (1 << 14) +#define MAX_VIEWPORTS 1 #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_PUSH_DESCRIPTORS 32 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 #define MAX_DYNAMIC_STORAGE_BUFFERS 8 -#define MAX_DYNAMIC_BUFFERS_SIZE \ - (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \ - A6XX_TEX_CONST_DWORDS - +#define MAX_DYNAMIC_BUFFERS \ + (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) +#define MAX_SAMPLES_LOG2 4 +#define NUM_META_FS_KEYS 13 #define TU_MAX_DRM_DEVICES 8 -#define MAX_VIEWS 16 -#define MAX_BIND_POINTS 2 /* compute + graphics */ -/* The Qualcomm driver exposes 0x20000058 */ -#define MAX_STORAGE_BUFFER_RANGE 0x20000000 -/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so - * expose the same maximum range. - * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual - * range might be higher. +#define MAX_VIEWS 8 + +#define NUM_DEPTH_CLEAR_PIPELINES 3 + +/* + * This is the point we switch from using CP to compute shader + * for certain buffer operations. */ -#define MAX_UNIFORM_BUFFER_RANGE 0x10000 +#define TU_BUFFER_OPS_CS_THRESHOLD 4096 + +enum tu_mem_heap +{ + TU_MEM_HEAP_VRAM, + TU_MEM_HEAP_VRAM_CPU_ACCESS, + TU_MEM_HEAP_GTT, + TU_MEM_HEAP_COUNT +}; + +enum tu_mem_type +{ + TU_MEM_TYPE_VRAM, + TU_MEM_TYPE_GTT_WRITE_COMBINE, + TU_MEM_TYPE_VRAM_CPU_ACCESS, + TU_MEM_TYPE_GTT_CACHED, + TU_MEM_TYPE_COUNT +}; + +#define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline uint32_t +align_u32_npot(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a * a; +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} -#define A6XX_TEX_CONST_DWORDS 16 -#define A6XX_TEX_SAMP_DWORDS 4 +/** Alignment must be a power of 2. */ +static inline bool +tu_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} -#define COND(bool, val) ((bool) ? 
(val) : 0) -#define BIT(bit) (1u << (bit)) +static inline uint32_t +round_up_u32(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a; +} + +static inline uint64_t +round_up_u64(uint64_t v, uint64_t a) +{ + return (v + a - 1) / a; +} + +static inline uint32_t +tu_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX2(n >> levels, 1); +} +static inline float +tu_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +#define for_each_bit(b, dword) \ + for (uint32_t __dword = (dword); \ + (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1 << (b))) + +#define typed_memcpy(dest, src, count) \ + ({ \ + STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \ + memcpy((dest), (src), (count) * sizeof(*(src))); \ + }) /* Whenever we generate an error, pass it through this function. Useful for * debugging, where we can break on it. Only call at error site, not when @@ -149,25 +219,29 @@ typedef uint32_t xcb_window_t; struct tu_instance; VkResult -__vk_startup_errorf(struct tu_instance *instance, - VkResult error, - bool force_print, - const char *file, - int line, - const char *format, - ...) PRINTFLIKE(6, 7); - -/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver - * build. - */ -#define vk_startup_errorf(instance, error, format, ...) \ - __vk_startup_errorf(instance, error, \ - instance->debug_flags & TU_DEBUG_STARTUP, \ - __FILE__, __LINE__, format, ##__VA_ARGS__) +__vk_errorf(struct tu_instance *instance, + VkResult error, + const char *file, + int line, + const char *format, + ...); + +#define vk_error(instance, error) \ + __vk_errorf(instance, error, __FILE__, __LINE__, NULL); +#define vk_errorf(instance, error, format, ...) \ + __vk_errorf(instance, error, __FILE__, __LINE__, format, ##__VA_ARGS__); void __tu_finishme(const char *file, int line, const char *format, ...) - PRINTFLIKE(3, 4); + tu_printflike(3, 4); +void +tu_loge(const char *format, ...) tu_printflike(1, 2); +void +tu_loge_v(const char *format, va_list va); +void +tu_logi(const char *format, ...) tu_printflike(1, 2); +void +tu_logi_v(const char *format, va_list va); /** * Print a FINISHME message, including its source location. @@ -181,35 +255,46 @@ __tu_finishme(const char *file, int line, const char *format, ...) } \ } while (0) +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define tu_assert(x) \ + ({ \ + if (unlikely(!(x))) \ + fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ + }) +#else +#define tu_assert(x) +#endif + +/* Suppress -Wunused in stub functions */ +#define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__) +static inline void +__tu_use_args(int ignore, ...) +{ +} + #define tu_stub() \ do { \ tu_finishme("stub %s", __func__); \ } while (0) -struct tu_memory_heap { - /* Standard bits passed on to the client */ - VkDeviceSize size; - VkMemoryHeapFlags flags; - - /** Copied from ANV: - * - * Driver-internal book-keeping. - * - * Align it to 64 bits to make atomic operations faster on 32 bit platforms. 
- */ - VkDeviceSize used __attribute__ ((aligned (8))); -}; - -uint64_t -tu_get_system_heap_size(void); +void * +tu_lookup_entrypoint_unchecked(const char *name); +void * +tu_lookup_entrypoint_checked( + const char *name, + uint32_t core_version, + const struct tu_instance_extension_table *instance, + const struct tu_device_extension_table *device); struct tu_physical_device { - struct vk_physical_device vk; + VK_LOADER_DATA _loader_data; struct tu_instance *instance; - const char *name; + char path[20]; + char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; uint8_t cache_uuid[VK_UUID_SIZE]; @@ -217,71 +302,43 @@ struct tu_physical_device struct wsi_device wsi_device; int local_fd; - bool has_local; - int64_t local_major; - int64_t local_minor; int master_fd; - bool has_master; - int64_t master_major; - int64_t master_minor; + unsigned gpu_id; uint32_t gmem_size; - uint64_t gmem_base; - uint32_t ccu_offset_gmem; - uint32_t ccu_offset_bypass; - - struct fd_dev_id dev_id; - const struct fd_dev_info *info; - - int msm_major_version; - int msm_minor_version; - - /* Address space and global fault count for this local_fd with DRM backend */ - uint64_t fault_count; + uint32_t tile_align_w; + uint32_t tile_align_h; /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ struct disk_cache *disk_cache; - struct tu_memory_heap heap; - - struct vk_sync_type syncobj_type; - struct vk_sync_timeline_type timeline_type; - const struct vk_sync_type *sync_types[3]; + struct tu_device_extension_table supported_extensions; }; enum tu_debug_flags { TU_DEBUG_STARTUP = 1 << 0, TU_DEBUG_NIR = 1 << 1, - TU_DEBUG_NOBIN = 1 << 3, - TU_DEBUG_SYSMEM = 1 << 4, - TU_DEBUG_FORCEBIN = 1 << 5, - TU_DEBUG_NOUBWC = 1 << 6, - TU_DEBUG_NOMULTIPOS = 1 << 7, - TU_DEBUG_NOLRZ = 1 << 8, - TU_DEBUG_PERFC = 1 << 9, - TU_DEBUG_FLUSHALL = 1 << 10, - TU_DEBUG_SYNCDRAW = 1 << 11, - TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12, - TU_DEBUG_GMEM = 1 << 13, - TU_DEBUG_RAST_ORDER = 1 << 14, - TU_DEBUG_UNALIGNED_STORE = 1 << 15, + TU_DEBUG_IR3 = 1 << 2, }; struct tu_instance { - struct vk_instance vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; uint32_t api_version; int physical_device_count; struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; - struct driOptionCache dri_options; - struct driOptionCache available_dri_options; - enum tu_debug_flags debug_flags; + + struct vk_debug_report_instance debug_report_callbacks; + + struct tu_instance_extension_table enabled_extensions; }; VkResult @@ -297,19 +354,10 @@ bool tu_physical_device_extension_supported(struct tu_physical_device *dev, const char *name); -enum tu_bo_alloc_flags -{ - TU_BO_ALLOC_NO_FLAGS = 0, - TU_BO_ALLOC_ALLOW_DUMP = 1 << 0, - TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1, -}; - struct cache_entry; struct tu_pipeline_cache { - struct vk_object_base base; - struct tu_device *device; pthread_mutex_t mutex; @@ -326,313 +374,115 @@ struct tu_pipeline_key { }; +void +tu_pipeline_cache_init(struct tu_pipeline_cache *cache, + struct tu_device *device); +void +tu_pipeline_cache_finish(struct tu_pipeline_cache *cache); +void +tu_pipeline_cache_load(struct tu_pipeline_cache *cache, + const void *data, + size_t size); -/* queue types */ -#define TU_QUEUE_GENERAL 0 - -#define TU_MAX_QUEUE_FAMILIES 1 - -/* Keep tu_syncobj until porting to common code for kgsl too */ -#ifdef TU_USE_KGSL -struct tu_syncobj; -#endif -struct tu_u_trace_syncobj; - -/* Define 
tu_timeline_sync type based on drm syncobj for a point type - * for vk_sync_timeline, and the logic to handle is mostly copied from - * anv_bo_sync since it seems it can be used by similar way to anv. - */ -enum tu_timeline_sync_state { - /** Indicates that this is a new (or newly reset fence) */ - TU_TIMELINE_SYNC_STATE_RESET, - - /** Indicates that this fence has been submitted to the GPU but is still - * (as far as we know) in use by the GPU. - */ - TU_TIMELINE_SYNC_STATE_SUBMITTED, - - TU_TIMELINE_SYNC_STATE_SIGNALED, -}; - -struct tu_timeline_sync { - struct vk_sync base; - - enum tu_timeline_sync_state state; - uint32_t syncobj; -}; - -struct tu_queue -{ - struct vk_queue vk; - - struct tu_device *device; +struct tu_shader_variant; - uint32_t msm_queue_id; - int fence; -}; +bool +tu_create_shader_variants_from_pipeline_cache( + struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants); -struct tu_bo +void +tu_pipeline_cache_insert_shaders(struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants, + const void *const *codes, + const unsigned *code_sizes); + +struct tu_meta_state { - uint32_t gem_handle; - uint64_t size; - uint64_t iova; - void *map; - int32_t refcnt; - -#ifndef TU_USE_KGSL - uint32_t bo_list_idx; -#endif + VkAllocationCallbacks alloc; - bool implicit_sync : 1; + struct tu_pipeline_cache cache; }; -/* externally-synchronized BO suballocator. */ -struct tu_suballocator -{ - struct tu_device *dev; - - uint32_t default_size; - enum tu_bo_alloc_flags flags; - - /** Current BO we're suballocating out of. */ - struct tu_bo *bo; - uint32_t next_offset; +/* queue types */ +#define TU_QUEUE_GENERAL 0 - /** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */ - struct tu_bo *cached_bo; -}; +#define TU_MAX_QUEUE_FAMILIES 1 -struct tu_suballoc_bo +struct tu_fence { - struct tu_bo *bo; - uint64_t iova; - uint32_t size; /* bytes */ + bool signaled; + int fd; }; void -tu_bo_suballocator_init(struct tu_suballocator *suballoc, - struct tu_device *dev, - uint32_t default_size, - uint32_t flags); +tu_fence_init(struct tu_fence *fence, bool signaled); void -tu_bo_suballocator_finish(struct tu_suballocator *suballoc); - -VkResult -tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc, - uint32_t size, uint32_t align); - -void * -tu_suballoc_bo_map(struct tu_suballoc_bo *bo); - +tu_fence_finish(struct tu_fence *fence); void -tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo); - -enum global_shader { - GLOBAL_SH_VS_BLIT, - GLOBAL_SH_VS_CLEAR, - GLOBAL_SH_FS_BLIT, - GLOBAL_SH_FS_BLIT_ZSCALE, - GLOBAL_SH_FS_COPY_MS, - GLOBAL_SH_FS_CLEAR0, - GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS, - GLOBAL_SH_COUNT, -}; - -/** - * Tracks the results from an individual renderpass. Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. 
- */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -#define TU_BORDER_COLOR_COUNT 4096 -#define TU_BORDER_COLOR_BUILTIN 6 - -#define TU_BLIT_SHADER_SIZE 1024 +tu_fence_update_fd(struct tu_fence *fence, int fd); +void +tu_fence_copy(struct tu_fence *fence, const struct tu_fence *src); +void +tu_fence_signal(struct tu_fence *fence); +void +tu_fence_wait_idle(struct tu_fence *fence); -/* This struct defines the layout of the global_bo */ -struct tu6_global +struct tu_queue { - /* clear/blit shaders */ - uint32_t shaders[TU_BLIT_SHADER_SIZE]; - - uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ - uint32_t _pad0; - volatile uint32_t vsc_draw_overflow; - uint32_t _pad1; - volatile uint32_t vsc_prim_overflow; - uint32_t _pad2; - uint64_t predicate; - - /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ - struct { - uint32_t offset; - uint32_t pad[7]; - } flush_base[4]; - - ALIGN16 uint32_t cs_indirect_xyz[3]; - - /* To know when renderpass stats for autotune are valid */ - volatile uint32_t autotune_fence; + VK_LOADER_DATA _loader_data; + struct tu_device *device; + uint32_t queue_family_index; + int queue_idx; + VkDeviceQueueCreateFlags flags; - /* note: larger global bo will be used for customBorderColors */ - struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; + uint32_t msm_queue_id; + struct tu_fence submit_fence; }; -#define gb_offset(member) offsetof(struct tu6_global, member) -#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member)) - -/* extra space in vsc draw/prim streams */ -#define VSC_PAD 0x40 struct tu_device { - struct vk_device vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; + struct tu_instance *instance; + struct tu_meta_state meta_state; + struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; int queue_count[TU_MAX_QUEUE_FAMILIES]; struct tu_physical_device *physical_device; - int fd; struct ir3_compiler *compiler; /* Backup in-memory cache to be used if the app doesn't provide one */ struct tu_pipeline_cache *mem_cache; -#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ - - /* Currently the kernel driver uses a 32-bit GPU address space, but it - * should be impossible to go beyond 48 bits. - */ - struct { - struct tu_bo *bo; - mtx_t construct_mtx; - bool initialized; - } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; - - struct tu_bo *global_bo; - - uint32_t implicit_sync_bo_count; - - /* Device-global BO suballocator for reducing BO management overhead for - * (read-only) pipeline state. Synchronized by pipeline_mutex. - */ - struct tu_suballocator pipeline_suballoc; - mtx_t pipeline_mutex; - - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. - */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - - /* the blob seems to always use 8K factor and 128K param sizes, copy them */ -#define TU_TESS_FACTOR_SIZE (8 * 1024) -#define TU_TESS_PARAM_SIZE (128 * 1024) -#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE) - /* Lazily allocated, protected by the device mutex. 
*/ - struct tu_bo *tess_bo; - - struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT]; - uint64_t global_shader_va[GLOBAL_SH_COUNT]; - - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; - BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT); - mtx_t mutex; - - /* bo list for submits: */ - struct drm_msm_gem_submit_bo *bo_list; - /* map bo handles to bo list index: */ - uint32_t bo_count, bo_list_size; - mtx_t bo_mutex; - /* protects imported BOs creation/freeing */ - struct u_rwlock dma_bo_lock; - - /* This array holds all our 'struct tu_bo' allocations. We use this - * so we can add a refcount to our BOs and check if a particular BO - * was already allocated in this device using its GEM handle. This is - * necessary to properly manage BO imports, because the kernel doesn't - * refcount the underlying BO memory. - * - * Specifically, when self-importing (i.e. importing a BO into the same - * device that created it), the kernel will give us the same BO handle - * for both BOs and we must only free it once when both references are - * freed. Otherwise, if we are not self-importing, we get two different BO - * handles, and we want to free each one individually. - * - * The refcount is also useful for being able to maintain BOs across - * VK object lifetimes, such as pipelines suballocating out of BOs - * allocated on the device. - */ - struct util_sparse_array bo_map; - - /* Command streams to set pass index to a scratch reg */ - struct tu_cs *perfcntrs_pass_cs; - struct tu_cs_entry *perfcntrs_pass_cs_entries; - - /* Condition variable for timeline semaphore to notify waiters when a - * new submit is executed. */ - pthread_cond_t timeline_cond; - pthread_mutex_t submit_mutex; - - struct tu_autotune autotune; - -#ifdef ANDROID - const void *gralloc; - enum { - TU_GRALLOC_UNKNOWN, - TU_GRALLOC_CROS, - TU_GRALLOC_OTHER, - } gralloc_type; -#endif + struct list_head shader_slabs; + mtx_t shader_slab_mutex; - uint32_t submit_count; - - struct u_trace_context trace_context; - - #ifdef HAVE_PERFETTO - struct tu_perfetto_state perfetto; - #endif + struct tu_device_extension_table enabled_extensions; }; -void tu_init_clear_blit_shaders(struct tu_device *dev); - -void tu_destroy_clear_blit_shaders(struct tu_device *dev); - -VkResult -tu_device_submit_deferred_locked(struct tu_device *dev); - -VkResult -tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); - -uint64_t -tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); - -VkResult -tu_device_check_status(struct vk_device *vk_device); +struct tu_bo +{ + uint32_t gem_handle; + uint64_t size; + uint64_t iova; + void *map; +}; VkResult -tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size, - enum tu_bo_alloc_flags flags); +tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size); VkResult tu_bo_init_dmabuf(struct tu_device *dev, - struct tu_bo **bo, + struct tu_bo *bo, uint64_t size, int fd); int @@ -642,28 +492,6 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); VkResult tu_bo_map(struct tu_device *dev, struct tu_bo *bo); -static inline struct tu_bo * -tu_device_lookup_bo(struct tu_device *device, uint32_t handle) -{ - return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle); -} - -static inline struct tu_bo * -tu_bo_get_ref(struct tu_bo *bo) -{ - p_atomic_inc(&bo->refcnt); - return bo; -} - -/* Get a scratch bo for use inside a command buffer. 
This will always return - * the same bo given the same size or similar sizes, so only one scratch bo - * can be used at the same time. It's meant for short-lived things where we - * need to write to some piece of memory, read from it, and then immediately - * discard it. - */ -VkResult -tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); - struct tu_cs_entry { /* No ownership */ @@ -673,58 +501,6 @@ struct tu_cs_entry uint32_t offset; }; -struct tu_cs_memory { - uint32_t *map; - uint64_t iova; -}; - -struct tu_draw_state { - uint64_t iova : 48; - uint32_t size : 16; -}; - -enum tu_dynamic_state -{ - /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */ - TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1, - TU_DYNAMIC_STATE_RB_DEPTH_CNTL, - TU_DYNAMIC_STATE_RB_STENCIL_CNTL, - TU_DYNAMIC_STATE_VB_STRIDE, - TU_DYNAMIC_STATE_RASTERIZER_DISCARD, - TU_DYNAMIC_STATE_COUNT, - /* no associated draw state: */ - TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT, - TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, - /* re-use the line width enum as it uses GRAS_SU_CNTL: */ - TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH, -}; - -enum tu_draw_state_group_id -{ - TU_DRAW_STATE_PROGRAM_CONFIG, - TU_DRAW_STATE_PROGRAM, - TU_DRAW_STATE_PROGRAM_BINNING, - TU_DRAW_STATE_VB, - TU_DRAW_STATE_VI, - TU_DRAW_STATE_VI_BINNING, - TU_DRAW_STATE_RAST, - TU_DRAW_STATE_BLEND, - TU_DRAW_STATE_SHADER_GEOM_CONST, - TU_DRAW_STATE_FS_CONST, - TU_DRAW_STATE_DESC_SETS, - TU_DRAW_STATE_DESC_SETS_LOAD, - TU_DRAW_STATE_VS_PARAMS, - TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, - TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, - TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, - TU_DRAW_STATE_PRIM_MODE_GMEM, - TU_DRAW_STATE_PRIM_MODE_SYSMEM, - - /* dynamic state related draw states */ - TU_DRAW_STATE_DYNAMIC, - TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, -}; - enum tu_cs_mode { @@ -765,7 +541,6 @@ struct tu_cs uint32_t *reserved_end; uint32_t *end; - struct tu_device *device; enum tu_cs_mode mode; uint32_t next_bo_size; @@ -776,20 +551,20 @@ struct tu_cs struct tu_bo **bos; uint32_t bo_count; uint32_t bo_capacity; - - /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */ - struct tu_bo *refcount_bo; - - /* state for cond_exec_start/cond_exec_end */ - uint32_t cond_flags; - uint32_t *cond_dwords; }; struct tu_device_memory { - struct vk_object_base base; + struct tu_bo bo; + VkDeviceSize size; - struct tu_bo *bo; + /* for dedicated allocations */ + struct tu_image *image; + struct tu_buffer *buffer; + + uint32_t type_index; + void *map; + void *user_ptr; }; struct tu_descriptor_range @@ -800,19 +575,18 @@ struct tu_descriptor_range struct tu_descriptor_set { - struct vk_object_base base; - - /* Link to descriptor pool's desc_sets list . 
*/ - struct list_head pool_link; - - struct tu_descriptor_set_layout *layout; - struct tu_descriptor_pool *pool; + const struct tu_descriptor_set_layout *layout; uint32_t size; uint64_t va; uint32_t *mapped_ptr; + struct tu_descriptor_range *dynamic_descriptors; +}; - uint32_t *dynamic_descriptors; +struct tu_push_descriptor_set +{ + struct tu_descriptor_set set; + uint32_t capacity; }; struct tu_descriptor_pool_entry @@ -824,18 +598,13 @@ struct tu_descriptor_pool_entry struct tu_descriptor_pool { - struct vk_object_base base; - - struct tu_bo *bo; + uint8_t *mapped_ptr; uint64_t current_offset; uint64_t size; uint8_t *host_memory_base; uint8_t *host_memory_ptr; uint8_t *host_memory_end; - uint8_t *host_bo; - - struct list_head desc_sets; uint32_t entry_count; uint32_t max_entry_count; @@ -866,13 +635,11 @@ struct tu_descriptor_update_template_entry size_t src_stride; /* For push descriptors */ - const struct tu_sampler *immutable_samplers; + const uint32_t *immutable_samplers; }; struct tu_descriptor_update_template { - struct vk_object_base base; - uint32_t entry_count; VkPipelineBindPoint bind_point; struct tu_descriptor_update_template_entry entry[0]; @@ -880,257 +647,175 @@ struct tu_descriptor_update_template struct tu_buffer { - struct vk_object_base base; - VkDeviceSize size; VkBufferUsageFlags usage; VkBufferCreateFlags flags; struct tu_bo *bo; - uint64_t iova; + VkDeviceSize bo_offset; }; -const char * -tu_get_debug_option_name(int id); - -const char * -tu_get_perftest_option_name(int id); +enum tu_dynamic_state_bits +{ + TU_DYNAMIC_VIEWPORT = 1 << 0, + TU_DYNAMIC_SCISSOR = 1 << 1, + TU_DYNAMIC_LINE_WIDTH = 1 << 2, + TU_DYNAMIC_DEPTH_BIAS = 1 << 3, + TU_DYNAMIC_BLEND_CONSTANTS = 1 << 4, + TU_DYNAMIC_DEPTH_BOUNDS = 1 << 5, + TU_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, + TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, + TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8, + TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, + TU_DYNAMIC_ALL = (1 << 10) - 1, +}; + +struct tu_vertex_binding +{ + struct tu_buffer *buffer; + VkDeviceSize offset; +}; -struct tu_descriptor_state +struct tu_viewport_state { - struct tu_descriptor_set *sets[MAX_SETS]; - struct tu_descriptor_set push_set; - uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; + uint32_t count; + VkViewport viewports[MAX_VIEWPORTS]; }; -enum tu_cmd_dirty_bits +struct tu_scissor_state { - TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), - TU_CMD_DIRTY_VB_STRIDE = BIT(1), - TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), - TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), - TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), - TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), - TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), - TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), - TU_CMD_DIRTY_LRZ = BIT(8), - TU_CMD_DIRTY_VS_PARAMS = BIT(9), - TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), - TU_CMD_DIRTY_VIEWPORTS = BIT(11), - /* all draw states were disabled and need to be re-enabled: */ - TU_CMD_DIRTY_DRAW_STATE = BIT(12) + uint32_t count; + VkRect2D scissors[MAX_SCISSORS]; }; -/* There are only three cache domains we have to care about: the CCU, or - * color cache unit, which is used for color and depth/stencil attachments - * and copy/blit destinations, and is split conceptually into color and depth, - * and the universal cache or UCHE which is used for pretty much everything - * else, except for the CP (uncached) and host. We need to flush whenever data - * crosses these boundaries. 
- */ +struct tu_discard_rectangle_state +{ + uint32_t count; + VkRect2D rectangles[MAX_DISCARD_RECTANGLES]; +}; -enum tu_cmd_access_mask { - TU_ACCESS_UCHE_READ = 1 << 0, - TU_ACCESS_UCHE_WRITE = 1 << 1, - TU_ACCESS_CCU_COLOR_READ = 1 << 2, - TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, - TU_ACCESS_CCU_DEPTH_READ = 1 << 4, - TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, - - /* Experiments have shown that while it's safe to avoid flushing the CCU - * after each blit/renderpass, it's not safe to assume that subsequent - * lookups with a different attachment state will hit unflushed cache - * entries. That is, the CCU needs to be flushed and possibly invalidated - * when accessing memory with a different attachment state. Writing to an - * attachment under the following conditions after clearing using the - * normal 2d engine path is known to have issues: - * - * - It isn't the 0'th layer. - * - There are more than one attachment, and this isn't the 0'th attachment - * (this seems to also depend on the cpp of the attachments). - * - * Our best guess is that the layer/MRT state is used when computing - * the location of a cache entry in CCU, to avoid conflicts. We assume that - * any access in a renderpass after or before an access by a transfer needs - * a flush/invalidate, and use the _INCOHERENT variants to represent access - * by a renderpass. +struct tu_dynamic_state +{ + /** + * Bitmask of (1 << VK_DYNAMIC_STATE_*). + * Defines the set of saved dynamic state. */ - TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, + uint32_t mask; - /* Accesses which bypasses any cache. e.g. writes via the host, - * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. - */ - TU_ACCESS_SYSMEM_READ = 1 << 10, - TU_ACCESS_SYSMEM_WRITE = 1 << 11, + struct tu_viewport_state viewport; - /* Memory writes from the CP start in-order with draws and event writes, - * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. - */ - TU_ACCESS_CP_WRITE = 1 << 12, - - TU_ACCESS_READ = - TU_ACCESS_UCHE_READ | - TU_ACCESS_CCU_COLOR_READ | - TU_ACCESS_CCU_DEPTH_READ | - TU_ACCESS_CCU_COLOR_INCOHERENT_READ | - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_SYSMEM_READ, - - TU_ACCESS_WRITE = - TU_ACCESS_UCHE_WRITE | - TU_ACCESS_CCU_COLOR_WRITE | - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | - TU_ACCESS_CCU_DEPTH_WRITE | - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | - TU_ACCESS_SYSMEM_WRITE | - TU_ACCESS_CP_WRITE, - - TU_ACCESS_ALL = - TU_ACCESS_READ | - TU_ACCESS_WRITE, -}; + struct tu_scissor_state scissor; -/* Starting with a6xx, the pipeline is split into several "clusters" (really - * pipeline stages). Each stage has its own pair of register banks and can - * switch them independently, so that earlier stages can run ahead of later - * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at - * the same time. - * - * As a result of this, we need to insert a WFI when an earlier stage depends - * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any - * pending WFI's to complete before starting, and usually before reading - * indirect params even, so a WFI also acts as a full "pipeline stall". - * - * Note, the names of the stages come from CLUSTER_* in devcoredump. We - * include all the stages for completeness, even ones which do not read/write - * anything. 
- */ + float line_width; -enum tu_stage { - /* This doesn't correspond to a cluster, but we need it for tracking - * indirect draw parameter reads etc. - */ - TU_STAGE_CP, + struct + { + float bias; + float clamp; + float slope; + } depth_bias; - /* - Fetch index buffer - * - Fetch vertex attributes, dispatch VS - */ - TU_STAGE_FE, + float blend_constants[4]; - /* Execute all geometry stages (VS thru GS) */ - TU_STAGE_SP_VS, + struct + { + float min; + float max; + } depth_bounds; - /* Write to VPC, do primitive assembly. */ - TU_STAGE_PC_VS, + struct + { + uint32_t front; + uint32_t back; + } stencil_compare_mask; - /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according - * to devcoredump so presumably this stage stalls for TU_STAGE_PS when - * early depth testing is enabled before dispatching fragments? However - * GRAS reads and writes LRZ directly. - */ - TU_STAGE_GRAS, + struct + { + uint32_t front; + uint32_t back; + } stencil_write_mask; - /* Execute FS */ - TU_STAGE_SP_PS, + struct + { + uint32_t front; + uint32_t back; + } stencil_reference; - /* - Fragment tests - * - Write color/depth - * - Streamout writes (???) - * - Varying interpolation (???) - */ - TU_STAGE_PS, + struct tu_discard_rectangle_state discard_rectangle; }; -enum tu_cmd_flush_bits { - TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, - TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, - TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, - TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, - TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, - TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, - TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, - TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, - - TU_CMD_FLAG_ALL_FLUSH = - TU_CMD_FLAG_CCU_FLUSH_DEPTH | - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CACHE_FLUSH | - /* Treat the CP as a sort of "cache" which may need to be "flushed" via - * waiting for writes to land with WAIT_FOR_MEM_WRITES. - */ - TU_CMD_FLAG_WAIT_MEM_WRITES, - - TU_CMD_FLAG_ALL_INVALIDATE = - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CACHE_INVALIDATE | - /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a - * a command that needs CP_WAIT_FOR_ME is executed. This means we may - * insert an extra WAIT_FOR_ME before an indirect command requiring it - * in case there was another command before the current command buffer - * that it needs to wait for. - */ - TU_CMD_FLAG_WAIT_FOR_ME, -}; +extern const struct tu_dynamic_state default_dynamic_state; -/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty - * heavy, involving a CCU cache flush/invalidate and a WFI in order to change - * which part of the gmem is used by the CCU. Here we keep track of what the - * state of the CCU. - */ -enum tu_cmd_ccu_state { - TU_CMD_CCU_SYSMEM, - TU_CMD_CCU_GMEM, - TU_CMD_CCU_UNKNOWN, -}; +const char * +tu_get_debug_option_name(int id); -struct tu_cache_state { - /* Caches which must be made available (flushed) eventually if there are - * any users outside that cache domain, and caches which must be - * invalidated eventually if there are any reads. - */ - enum tu_cmd_flush_bits pending_flush_bits; - /* Pending flushes */ - enum tu_cmd_flush_bits flush_bits; -}; +const char * +tu_get_perftest_option_name(int id); -enum tu_lrz_force_disable_mask { - TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0, - TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1, +/** + * Attachment state when recording a renderpass instance. + * + * The clear value is valid only if there exists a pending clear. 
+ */ +struct tu_attachment_state +{ + VkImageAspectFlags pending_clear_aspects; + uint32_t cleared_views; + VkClearValue clear_value; + VkImageLayout current_layout; }; -enum tu_lrz_direction { - TU_LRZ_UNKNOWN, - /* Depth func less/less-than: */ - TU_LRZ_LESS, - /* Depth func greater/greater-than: */ - TU_LRZ_GREATER, +struct tu_descriptor_state +{ + struct tu_descriptor_set *sets[MAX_SETS]; + uint32_t dirty; + uint32_t valid; + struct tu_push_descriptor_set push_set; + bool push_dirty; + uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; }; -struct tu_lrz_pipeline +struct tu_tile { - uint32_t force_disable_mask; - bool fs_has_kill; - bool force_late_z; - bool early_fragment_tests; + uint8_t pipe; + uint8_t slot; + VkOffset2D begin; + VkOffset2D end; }; -struct tu_lrz_state +struct tu_tiling_config { - /* Depth/Stencil image currently on use to do LRZ */ - struct tu_image *image; - bool valid : 1; - enum tu_lrz_direction prev_direction; + VkRect2D render_area; + uint32_t buffer_cpp[MAX_RTS + 2]; + uint32_t buffer_count; + + /* position and size of the first tile */ + VkRect2D tile0; + /* number of tiles */ + VkExtent2D tile_count; + + uint32_t gmem_offsets[MAX_RTS + 2]; + + /* size of the first VSC pipe */ + VkExtent2D pipe0; + /* number of VSC pipes */ + VkExtent2D pipe_count; + + /* pipe register values */ + uint32_t pipe_config[MAX_VSC_PIPES]; + uint32_t pipe_sizes[MAX_VSC_PIPES]; }; -struct tu_vs_params { - uint32_t vertex_offset; - uint32_t first_instance; +enum tu_cmd_dirty_bits +{ + TU_CMD_DIRTY_PIPELINE = 1 << 0, + TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1, + + TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, + TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, + TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18, + TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19, }; struct tu_cmd_state @@ -1138,119 +823,48 @@ struct tu_cmd_state uint32_t dirty; struct tu_pipeline *pipeline; - struct tu_pipeline *compute_pipeline; - /* Vertex buffers, viewports, and scissors - * the states for these can be updated partially, so we need to save these - * to be able to emit a complete draw state - */ - struct { - uint64_t base; - uint32_t size; - uint32_t stride; - } vb[MAX_VBS]; - VkViewport viewport[MAX_VIEWPORTS]; - VkRect2D scissor[MAX_SCISSORS]; - uint32_t max_viewport, max_scissor; - - /* for dynamic states that can't be emitted directly */ - uint32_t dynamic_stencil_mask; - uint32_t dynamic_stencil_wrmask; - uint32_t dynamic_stencil_ref; - - uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl; - uint32_t pc_raster_cntl, vpc_unknown_9107; - enum pc_di_primtype primtype; - bool primitive_restart_enable; - - /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */ - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - struct tu_draw_state vertex_buffers; - struct tu_draw_state shader_const[2]; - struct tu_draw_state desc_sets; - - struct tu_draw_state vs_params; + /* Vertex buffers */ + struct + { + struct tu_buffer *buffers[MAX_VBS]; + VkDeviceSize offsets[MAX_VBS]; + } vb; + + struct tu_dynamic_state dynamic; /* Index buffer */ - uint64_t index_va; + struct tu_buffer *index_buffer; + uint64_t index_offset; + uint32_t index_type; uint32_t max_index_count; - uint8_t index_size; - - /* because streamout base has to be 32-byte aligned - * there is an extra offset to deal with when it is - * unaligned - */ - uint8_t streamout_offset[IR3_MAX_SO_BUFFERS]; - - /* Renderpasses are tricky, because we may need to flush differently if - * using sysmem vs. 
gmem and therefore we have to delay any flushing that - * happens before a renderpass. So we have to have two copies of the flush - * state, one for intra-renderpass flushes (i.e. renderpass dependencies) - * and one for outside a renderpass. - */ - struct tu_cache_state cache; - struct tu_cache_state renderpass_cache; - - enum tu_cmd_ccu_state ccu_state; + uint64_t index_va; const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; - VkRect2D render_area; - - const struct tu_image_view **attachments; + struct tu_attachment_state *attachments; - bool xfb_used; - bool has_tess; - bool tessfactor_addr_set; - bool has_subpass_predication; - bool predication_active; - bool disable_gmem; - enum a5xx_line_mode line_mode; - bool z_negative_one_to_one; + struct tu_tiling_config tiling_config; - uint32_t drawcall_count; - - /* A calculated "draw cost" value for renderpass, which tries to - * estimate the bandwidth-per-sample of all the draws according - * to: - * - * foreach_draw (...) { - * cost += num_frag_outputs; - * if (blend_enabled) - * cost += num_blend_enabled; - * if (depth_test_enabled) - * cost++; - * if (depth_write_enabled) - * cost++; - * } - * - * The idea is that each sample-passed minimally does one write - * per MRT. If blend is enabled, the hw will additionally do - * a framebuffer read per sample-passed (for each MRT with blend - * enabled). If depth-test is enabled, the hw will additionally - * a depth buffer read. If depth-write is enable, the hw will - * additionally do a depth buffer write. - * - * This does ignore depth buffer traffic for samples which do not - * pass do to depth-test fail, and some other details. But it is - * just intended to be a rough estimate that is easy to calculate. 
- */ - uint32_t total_drawcalls_cost; - - struct tu_lrz_state lrz; - - struct tu_draw_state lrz_and_depth_plane_state; - - struct tu_vs_params last_vs_params; + struct tu_cs_entry tile_load_ib; + struct tu_cs_entry tile_store_ib; }; struct tu_cmd_pool { - struct vk_command_pool vk; - + VkAllocationCallbacks alloc; struct list_head cmd_buffers; struct list_head free_cmd_buffers; + uint32_t queue_family_index; +}; + +struct tu_cmd_buffer_upload +{ + uint8_t *map; + unsigned offset; + uint64_t size; + struct list_head list; }; enum tu_cmd_buffer_status @@ -1262,116 +876,165 @@ enum tu_cmd_buffer_status TU_CMD_BUFFER_STATUS_PENDING, }; +struct tu_bo_list +{ + uint32_t count; + uint32_t capacity; + struct drm_msm_gem_submit_bo *bo_infos; +}; + +#define TU_BO_LIST_FAILED (~0) + +void +tu_bo_list_init(struct tu_bo_list *list); +void +tu_bo_list_destroy(struct tu_bo_list *list); +void +tu_bo_list_reset(struct tu_bo_list *list); +uint32_t +tu_bo_list_add(struct tu_bo_list *list, + const struct tu_bo *bo, + uint32_t flags); +VkResult +tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other); + struct tu_cmd_buffer { - struct vk_command_buffer vk; + VK_LOADER_DATA _loader_data; struct tu_device *device; struct tu_cmd_pool *pool; struct list_head pool_link; - struct u_trace trace; - struct u_trace_iterator trace_renderpass_start; - struct u_trace_iterator trace_renderpass_end; - - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; - VkCommandBufferUsageFlags usage_flags; + VkCommandBufferLevel level; enum tu_cmd_buffer_status status; struct tu_cmd_state state; + struct tu_vertex_binding vertex_bindings[MAX_VBS]; uint32_t queue_family_index; - uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; VkShaderStageFlags push_constant_stages; struct tu_descriptor_set meta_push_descriptors; - struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; + struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE]; + + struct tu_cmd_buffer_upload upload; VkResult record_result; + struct tu_bo_list bo_list; struct tu_cs cs; struct tu_cs draw_cs; - struct tu_cs tile_store_cs; - struct tu_cs draw_epilogue_cs; - struct tu_cs sub_cs; + struct tu_cs tile_cs; - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; -}; + uint16_t marker_reg; + uint32_t marker_seqno; -/* Temporary struct for tracking a register state to be written, used by - * a6xx-pack.h and tu_cs_emit_regs() - */ -struct tu_reg_value { - uint32_t reg; - uint64_t value; - bool is_address; - struct tu_bo *bo; - bool bo_write; - uint32_t bo_offset; - uint32_t bo_shift; + struct tu_bo scratch_bo; + uint32_t scratch_seqno; + + bool wait_for_idle; }; +void +tu6_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum vgt_event_type event, + bool need_seqno); + +bool +tu_get_memory_fd(struct tu_device *device, + struct tu_device_memory *memory, + int *pFD); -void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs); +/* + * Takes x,y,z as exact numbers of invocations, instead of blocks. + * + * Limitations: Can't call normal dispatch functions without binding or + * rebinding + * the compute pipeline. 
+ */ +void +tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer, + uint32_t x, + uint32_t y, + uint32_t z); + +struct tu_event +{ + uint64_t *map; +}; -void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs, - enum tu_cmd_ccu_state ccu_state); +struct tu_shader_module; +#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0) +#define TU_HASH_SHADER_SISCHED (1 << 1) +#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2) void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event); +tu_hash_shaders(unsigned char *hash, + const VkPipelineShaderStageCreateInfo **stages, + const struct tu_pipeline_layout *layout, + const struct tu_pipeline_key *key, + uint32_t flags); + +static inline gl_shader_stage +vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) +{ + assert(__builtin_popcount(vk_stage) == 1); + return ffs(vk_stage) - 1; +} -static inline struct tu_descriptor_state * -tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, - VkPipelineBindPoint bind_point) +static inline VkShaderStageFlagBits +mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) { - return &cmd_buffer->descriptors[bind_point]; + return (1 << mesa_stage); } -struct tu_event +#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) + +#define tu_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage))) + +struct tu_shader_module { - struct vk_object_base base; - struct tu_bo *bo; + unsigned char sha1[20]; + + uint32_t code_size; + const uint32_t *code[0]; }; -struct tu_push_constant_range +struct tu_shader_compile_options { - uint32_t lo; - uint32_t count; + struct ir3_shader_key key; + + bool optimize; + bool include_binning_pass; }; struct tu_shader { - struct ir3_shader *ir3_shader; + struct ir3_shader ir3_shader; - struct tu_push_constant_range push_consts; - uint8_t active_desc_sets; - bool multi_pos_output; -}; + /* This may be true for vertex shaders. When true, variants[1] is the + * binning variant and binning_binary is non-NULL. 
+ */ + bool has_binning_pass; -bool -tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output, - struct tu_device *dev); + void *binary; + void *binning_binary; -nir_shader * -tu_spirv_to_nir(struct tu_device *dev, - void *mem_ctx, - const VkPipelineShaderStageCreateInfo *stage_info, - gl_shader_stage stage); + struct ir3_shader_variant variants[0]; +}; struct tu_shader * tu_shader_create(struct tu_device *dev, - nir_shader *nir, + gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, - unsigned multiview_mask, - struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc); void @@ -1379,78 +1042,50 @@ tu_shader_destroy(struct tu_device *dev, struct tu_shader *shader, const VkAllocationCallbacks *alloc); -struct tu_program_descriptor_linkage -{ - struct ir3_const_state const_state; - - uint32_t constlen; - - struct tu_push_constant_range push_consts; -}; - -struct tu_pipeline_executable { - gl_shader_stage stage; - - struct ir3_info stats; - bool is_binning; +void +tu_shader_compile_options_init( + struct tu_shader_compile_options *options, + const VkGraphicsPipelineCreateInfo *pipeline_info); - char *nir_from_spirv; - char *nir_final; - char *disasm; -}; +VkResult +tu_shader_compile(struct tu_device *dev, + struct tu_shader *shader, + const struct tu_shader *next_stage, + const struct tu_shader_compile_options *options, + const VkAllocationCallbacks *alloc); struct tu_pipeline { - struct vk_object_base base; - struct tu_cs cs; - struct tu_suballoc_bo bo; - /* Separate BO for private memory since it should GPU writable */ - struct tu_bo *pvtmem_bo; + struct tu_dynamic_state dynamic_state; + + struct tu_pipeline_layout *layout; bool need_indirect_descriptor_sets; VkShaderStageFlags active_stages; - uint32_t active_desc_sets; - - /* mask of enabled dynamic states - * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used - */ - uint32_t dynamic_state_mask; - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - - /* for dynamic states which use the same register: */ - uint32_t gras_su_cntl, gras_su_cntl_mask; - uint32_t rb_depth_cntl, rb_depth_cntl_mask; - uint32_t rb_stencil_cntl, rb_stencil_cntl_mask; - uint32_t pc_raster_cntl, pc_raster_cntl_mask; - uint32_t vpc_unknown_9107, vpc_unknown_9107_mask; - uint32_t stencil_wrmask; - - bool rb_depth_cntl_disable; - - enum a5xx_line_mode line_mode; - - /* draw states for the pipeline */ - struct tu_draw_state load_state, rast_state, blend_state; - struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem; - - /* for vertex buffers state */ - uint32_t num_vbs; struct { - struct tu_draw_state config_state; - struct tu_draw_state state; - struct tu_draw_state binning_state; - - struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; + struct tu_bo binary_bo; + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } program; struct { - struct tu_draw_state state; - struct tu_draw_state binning_state; + uint8_t bindings[MAX_VERTEX_ATTRIBS]; + uint16_t strides[MAX_VERTEX_ATTRIBS]; + uint16_t offsets[MAX_VERTEX_ATTRIBS]; + uint32_t count; + + uint8_t binning_bindings[MAX_VERTEX_ATTRIBS]; + uint16_t binning_strides[MAX_VERTEX_ATTRIBS]; + uint16_t binning_offsets[MAX_VERTEX_ATTRIBS]; + uint32_t binning_count; + + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } vi; struct @@ -1461,47 +1096,36 @@ struct tu_pipeline struct { - uint32_t patch_type; - uint32_t param_stride; - bool upper_left_domain_origin; - } tess; + struct tu_cs_entry state_ib; 
+ } vp; struct { - uint32_t local_size[3]; - uint32_t subgroup_size; - } compute; - - bool provoking_vertex_last; - - struct tu_lrz_pipeline lrz; + uint32_t gras_su_cntl; + struct tu_cs_entry state_ib; + } rast; - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - bool subpass_feedback_loop_ds; - - bool z_negative_one_to_one; - - /* Base drawcall cost for sysmem vs gmem autotuner */ - uint8_t drawcall_base_cost; + struct + { + struct tu_cs_entry state_ib; + } ds; - void *executables_mem_ctx; - /* tu_pipeline_executable */ - struct util_dynarray executables; + struct + { + struct tu_cs_entry state_ib; + } blend; }; void -tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport, - bool z_negative_one_to_one); +tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport); void -tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); +tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor); void -tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); - -void -tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); +tu6_emit_gras_su_cntl(struct tu_cs *cs, + uint32_t gras_su_cntl, + float line_width); void tu6_emit_depth_bias(struct tu_cs *cs, @@ -1509,143 +1133,106 @@ tu6_emit_depth_bias(struct tu_cs *cs, float clamp, float slope_factor); -void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, - enum a5xx_line_mode line_mode); - -void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); - -void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); - -void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); - -void tu6_apply_depth_bounds_workaround(struct tu_device *device, - uint32_t *rb_depth_cntl); - -struct tu_pvtmem_config { - uint64_t iova; - uint32_t per_fiber_size; - uint32_t per_sp_size; - bool per_wave; -}; - -void -tu6_emit_xs_config(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs); - -void -tu6_emit_xs(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs, - const struct tu_pvtmem_config *pvtmem, - uint64_t binary_iova); - -void -tu6_emit_vpc(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs, - const struct ir3_shader_variant *fs, - uint32_t patch_control_points); - void -tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); - -struct tu_image_view; +tu6_emit_stencil_compare_mask(struct tu_cs *cs, + uint32_t front, + uint32_t back); void -tu_resolve_sysmem(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_image_view *src, - const struct tu_image_view *dst, - uint32_t layer_mask, - uint32_t layers, - const VkRect2D *rect); +tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]); -void -tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - bool 
force_load); +struct tu_userdata_info * +tu_lookup_user_sgpr(struct tu_pipeline *pipeline, + gl_shader_stage stage, + int idx); -/* expose this function to be able to emit load without checking LOAD_OP */ -void -tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); +struct tu_shader_variant * +tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage); -/* note: gmem store can also resolve */ -void -tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a); - -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); +struct tu_graphics_pipeline_create_info +{ + bool use_rectlist; + bool db_depth_clear; + bool db_stencil_clear; + bool db_depth_disable_expclear; + bool db_stencil_disable_expclear; + bool db_flush_depth_inplace; + bool db_flush_stencil_inplace; + bool db_resummarize; + uint32_t custom_blend_mode; +}; struct tu_native_format { - enum a6xx_format fmt : 8; - enum a3xx_color_swap swap : 8; - enum a6xx_tile_mode tile_mode : 8; + int vtx; /* VFMTn_xxx or -1 */ + int tex; /* TFMTn_xxx or -1 */ + int rb; /* RBn_xxx or -1 */ + int swap; /* enum a3xx_color_swap */ + bool present; /* internal only; always true to external users */ }; -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); -bool tu6_format_vtx_supported(VkFormat format); -struct tu_native_format tu6_format_vtx(VkFormat format); -bool tu6_format_color_supported(enum pipe_format format); -struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode); -bool tu6_format_texture_supported(enum pipe_format format); -struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode); +const struct tu_native_format * +tu6_get_native_format(VkFormat format); + +int +tu_pack_clear_value(const VkClearValue *val, + VkFormat format, + uint32_t buf[4]); +enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt); -static inline enum a6xx_format -tu6_base_format(enum pipe_format format) +struct tu_image_level { - /* note: tu6_format_color doesn't care about tiling for .fmt field */ - return tu6_format_color(format, TILE6_LINEAR).fmt; -} + VkDeviceSize offset; + VkDeviceSize size; + uint32_t pitch; +}; struct tu_image { - struct vk_object_base base; - + VkImageType type; /* The original VkFormat provided by the client. This may not match any * of the actual surface formats. */ VkFormat vk_format; + VkImageAspectFlags aspects; + VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. 
*/ + VkImageTiling tiling; /** VkImageCreateInfo::tiling */ + VkImageCreateFlags flags; /** VkImageCreateInfo::flags */ + VkExtent3D extent; uint32_t level_count; uint32_t layer_count; - struct fdl_layout layout[3]; - uint32_t total_size; + VkDeviceSize size; + uint32_t alignment; + + /* memory layout */ + VkDeviceSize layer_size; + struct tu_image_level levels[15]; + unsigned tile_mode; + + unsigned queue_family_mask; + bool exclusive; + bool shareable; -#ifdef ANDROID /* For VK_ANDROID_native_buffer, the WSI image owns the memory, */ VkDeviceMemory owned_memory; -#endif /* Set when bound */ - struct tu_bo *bo; - uint64_t iova; - - uint32_t lrz_height; - uint32_t lrz_pitch; - uint32_t lrz_offset; - - bool shareable; + const struct tu_bo *bo; + VkDeviceSize bo_offset; }; +unsigned +tu_image_queue_family_mask(const struct tu_image *image, + uint32_t family, + uint32_t queue_family); + static inline uint32_t tu_get_layerCount(const struct tu_image *image, const VkImageSubresourceRange *range) @@ -1664,108 +1251,99 @@ tu_get_levelCount(const struct tu_image *image, : range->levelCount; } -enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane); - -uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask); - -enum pipe_format tu_format_for_aspect(enum pipe_format format, - VkImageAspectFlags aspect_mask); - struct tu_image_view { - struct vk_object_base base; - struct tu_image *image; /**< VkImageViewCreateInfo::image */ - struct fdl6_view view; + VkImageViewType type; + VkImageAspectFlags aspect_mask; + VkFormat vk_format; + uint32_t base_layer; + uint32_t layer_count; + uint32_t base_mip; + uint32_t level_count; + VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ - /* for d32s8 separate depth */ - uint64_t depth_base_addr; - uint32_t depth_layer_size; - uint32_t depth_PITCH; + uint32_t descriptor[16]; - /* for d32s8 separate stencil */ - uint64_t stencil_base_addr; - uint32_t stencil_layer_size; - uint32_t stencil_PITCH; + /* Descriptor for use as a storage image as opposed to a sampled image. + * This has a few differences for cube maps (e.g. type). 
+ */ + uint32_t storage_descriptor[16]; }; -struct tu_sampler_ycbcr_conversion { - struct vk_object_base base; - - VkFormat format; - VkSamplerYcbcrModelConversion ycbcr_model; - VkSamplerYcbcrRange ycbcr_range; - VkComponentMapping components; - VkChromaLocation chroma_offsets[2]; - VkFilter chroma_filter; +struct tu_sampler +{ }; -struct tu_sampler { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; - struct tu_sampler_ycbcr_conversion *ycbcr_sampler; +struct tu_image_create_info +{ + const VkImageCreateInfo *vk_info; + bool scanout; + bool no_metadata_planes; }; -void -tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); - -void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -void -tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -#define tu_image_view_stencil(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) - -#define tu_image_view_depth(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT)) - VkResult -tu_gralloc_info(struct tu_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier); +tu_image_create(VkDevice _device, + const struct tu_image_create_info *info, + const VkAllocationCallbacks *alloc, + VkImage *pImage); VkResult -tu_import_memory_from_gralloc_handle(VkDevice device_h, - int dma_buf, - const VkAllocationCallbacks *alloc, - VkImage image_h); +tu_image_from_gralloc(VkDevice device_h, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info, + const VkAllocationCallbacks *alloc, + VkImage *out_image_h); void -tu_image_view_init(struct tu_image_view *iview, - const VkImageViewCreateInfo *pCreateInfo, - bool limited_z24s8); - -bool -tiling_possible(VkFormat format); - -bool -ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, - const struct fd_dev_info *info, VkSampleCountFlagBits samples); +tu_image_view_init(struct tu_image_view *view, + struct tu_device *device, + const VkImageViewCreateInfo *pCreateInfo); struct tu_buffer_view { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; - - struct tu_buffer *buffer; + VkFormat vk_format; + uint64_t range; /**< VkBufferViewCreateInfo::range */ + uint32_t state[4]; }; void tu_buffer_view_init(struct tu_buffer_view *view, struct tu_device *device, const VkBufferViewCreateInfo *pCreateInfo); +static inline struct VkExtent3D +tu_sanitize_image_extent(const VkImageType imageType, + const struct VkExtent3D imageExtent) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkExtent3D) { imageExtent.width, 1, 1 }; + case VK_IMAGE_TYPE_2D: + return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 }; + case VK_IMAGE_TYPE_3D: + return imageExtent; + default: + unreachable("invalid image type"); + } +} + +static inline struct VkOffset3D +tu_sanitize_image_offset(const VkImageType imageType, + const struct VkOffset3D imageOffset) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkOffset3D) { imageOffset.x, 0, 0 }; + case VK_IMAGE_TYPE_2D: + return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 }; + case VK_IMAGE_TYPE_3D: + 
return imageOffset; + default: + unreachable("invalid image type"); + } +} + struct tu_attachment_info { struct tu_image_view *attachment; @@ -1773,146 +1351,100 @@ struct tu_attachment_info struct tu_framebuffer { - struct vk_object_base base; - uint32_t width; uint32_t height; uint32_t layers; - /* size of the first tile */ - VkExtent2D tile0; - /* number of tiles */ - VkExtent2D tile_count; - - /* size of the first VSC pipe */ - VkExtent2D pipe0; - /* number of VSC pipes */ - VkExtent2D pipe_count; - - /* pipe register values */ - uint32_t pipe_config[MAX_VSC_PIPES]; - uint32_t pipe_sizes[MAX_VSC_PIPES]; - uint32_t attachment_count; struct tu_attachment_info attachments[0]; }; -void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); - -struct tu_subpass_barrier { +struct tu_subpass_barrier +{ VkPipelineStageFlags src_stage_mask; - VkPipelineStageFlags dst_stage_mask; VkAccessFlags src_access_mask; VkAccessFlags dst_access_mask; - bool incoherent_ccu_color, incoherent_ccu_depth; }; +void +tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, + const struct tu_subpass_barrier *barrier); + struct tu_subpass_attachment { uint32_t attachment; - - /* For input attachments, true if it needs to be patched to refer to GMEM - * in GMEM mode. This is false if it hasn't already been written as an - * attachment. - */ - bool patch_input_gmem; + VkImageLayout layout; }; struct tu_subpass { uint32_t input_count; uint32_t color_count; - uint32_t resolve_count; - bool resolve_depth_stencil; - - bool feedback_loop_color; - bool feedback_loop_ds; - - /* True if we must invalidate UCHE thanks to a feedback loop. */ - bool feedback_invalidate; - - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - struct tu_subpass_attachment *input_attachments; struct tu_subpass_attachment *color_attachments; struct tu_subpass_attachment *resolve_attachments; struct tu_subpass_attachment depth_stencil_attachment; - VkSampleCountFlagBits samples; - - uint32_t srgb_cntl; - uint32_t multiview_mask; + /** Subpass has at least one resolve attachment */ + bool has_resolve; struct tu_subpass_barrier start_barrier; + + uint32_t view_mask; + VkSampleCountFlagBits max_sample_count; }; struct tu_render_pass_attachment { VkFormat format; uint32_t samples; - uint32_t cpp; - VkImageAspectFlags clear_mask; - uint32_t clear_views; - bool load; - bool store; - int32_t gmem_offset; - /* for D32S8 separate stencil: */ - bool load_stencil; - bool store_stencil; - int32_t gmem_offset_stencil; + VkAttachmentLoadOp load_op; + VkAttachmentLoadOp stencil_load_op; + VkImageLayout initial_layout; + VkImageLayout final_layout; + uint32_t view_mask; }; struct tu_render_pass { - struct vk_object_base base; - uint32_t attachment_count; uint32_t subpass_count; - uint32_t gmem_pixels; - uint32_t tile_align_w; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; }; -#define PERF_CNTRS_REG 4 - -struct tu_perf_query_data -{ - uint32_t gid; /* group-id */ - uint32_t cid; /* countable-id within the group */ - uint32_t cntr_reg; /* counter register within the group */ - uint32_t pass; /* pass index that countables can be requested */ - uint32_t app_idx; /* index provided by apps */ -}; +VkResult +tu_device_init_meta(struct tu_device *device); +void +tu_device_finish_meta(struct tu_device *device); struct tu_query_pool { - struct 
vk_object_base base; - - VkQueryType type; uint32_t stride; + uint32_t availability_offset; uint64_t size; - uint32_t pipeline_statistics; - struct tu_bo *bo; + char *ptr; + VkQueryType type; + uint32_t pipeline_stats_mask; +}; - /* For performance query */ - const struct fd_perfcntr_group *perf_group; - uint32_t perf_group_count; - uint32_t counter_index_count; - struct tu_perf_query_data perf_query_data[0]; +struct tu_semaphore +{ + uint32_t syncobj; + uint32_t temp_syncobj; }; -uint32_t -tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index); +void +tu_set_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct tu_descriptor_set *set, + unsigned idx); void -tu_update_descriptor_sets(const struct tu_device *device, +tu_update_descriptor_sets(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, VkDescriptorSet overrideSet, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, @@ -1921,24 +1453,25 @@ tu_update_descriptor_sets(const struct tu_device *device, void tu_update_descriptor_set_with_template( - const struct tu_device *device, + struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, struct tu_descriptor_set *set, VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData); -VkResult -tu_physical_device_init(struct tu_physical_device *device, - struct tu_instance *instance); -VkResult -tu_enumerate_devices(struct tu_instance *instance); +void +tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites); int -tu_device_get_gpu_timestamp(struct tu_device *dev, - uint64_t *ts); +tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id); int -tu_device_get_suspend_count(struct tu_device *dev, - uint64_t *suspend_count); +tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size); int tu_drm_submitqueue_new(const struct tu_device *dev, @@ -1948,116 +1481,76 @@ tu_drm_submitqueue_new(const struct tu_device *dev, void tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id); +uint32_t +tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags); +uint32_t +tu_gem_import_dmabuf(const struct tu_device *dev, + int prime_fd, + uint64_t size); int -tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2); - -int -tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync); - -VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit); - +tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle); void -tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, - void *ts_from, uint32_t from_offset, - void *ts_to, uint32_t to_offset, - uint32_t count); - - -VkResult -tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, - struct u_trace **trace_copy); - -/* If we copy trace and timestamps we will have to free them. */ -struct tu_u_trace_cmd_data -{ - struct tu_cs *timestamp_copy_cs; - struct u_trace *trace; -}; - -/* Data necessary to retrieve timestamps and clean all - * associated resources afterwards. - */ -struct tu_u_trace_submission_data -{ - uint32_t submission_id; - /* We have to know when timestamps are available, - * this sync object indicates it. 
- */ - struct tu_u_trace_syncobj *syncobj; - - uint32_t cmd_buffer_count; - uint32_t last_buffer_with_tracepoints; - struct tu_u_trace_cmd_data *cmd_trace_data; -}; - -VkResult -tu_u_trace_submission_data_create( - struct tu_device *device, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count, - struct tu_u_trace_submission_data **submission_data); - -void -tu_u_trace_submission_data_finish( - struct tu_device *device, - struct tu_u_trace_submission_data *submission_data); +tu_gem_close(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle); + +#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType) _obj; \ + } + +#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) (uintptr_t) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType)(uintptr_t) _obj; \ + } #define TU_FROM_HANDLE(__tu_type, __name, __handle) \ - VK_FROM_HANDLE(__tu_type, __name, __handle) - -VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, - VK_OBJECT_TYPE_COMMAND_BUFFER) -VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, - VK_OBJECT_TYPE_INSTANCE) -VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, - VK_OBJECT_TYPE_PHYSICAL_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) - -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool, - VK_OBJECT_TYPE_COMMAND_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, - VK_OBJECT_TYPE_BUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, - VK_OBJECT_TYPE_BUFFER_VIEW) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, - VK_OBJECT_TYPE_DESCRIPTOR_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, - VK_OBJECT_TYPE_DESCRIPTOR_SET) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, - VkDescriptorSetLayout, - VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, - VkDescriptorUpdateTemplate, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, - VK_OBJECT_TYPE_DEVICE_MEMORY) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, - VK_OBJECT_TYPE_FRAMEBUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, - VK_OBJECT_TYPE_IMAGE_VIEW); -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, - VK_OBJECT_TYPE_PIPELINE_CACHE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, - VK_OBJECT_TYPE_PIPELINE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, - VK_OBJECT_TYPE_PIPELINE_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, - VK_OBJECT_TYPE_QUERY_POOL) 
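/*
 * Aside on the handle-cast helpers above: dispatchable handles (VkDevice,
 * VkQueue, ...) are required by the Vulkan ABI to be pointers, so the cast
 * is direct, while non-dispatchable handles may be 64-bit integers and go
 * through uintptr_t.  A minimal self-contained sketch of the same pattern,
 * using a hypothetical tu_widget type rather than any real driver struct:
 */
#include <stdint.h>

typedef uint64_t VkWidget;            /* stand-in non-dispatchable handle */

struct tu_widget { int dummy; };      /* hypothetical driver-private object */

static inline struct tu_widget *
tu_widget_from_handle(VkWidget handle)
{
   return (struct tu_widget *) (uintptr_t) handle;
}

static inline VkWidget
tu_widget_to_handle(struct tu_widget *obj)
{
   return (VkWidget) (uintptr_t) obj;
}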
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, - VK_OBJECT_TYPE_RENDER_PASS) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, - VK_OBJECT_TYPE_SAMPLER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, - VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) - -/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */ -#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x)) - -void -update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask); + struct __tu_type *__name = __tu_type##_from_handle(__handle) + +TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer) +TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice) +TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance) +TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice) +TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue) + +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, + VkDescriptorSetLayout) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, + VkDescriptorUpdateTemplate) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_fence, VkFence) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView); +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore) #endif /* TU_PRIVATE_H */ diff --git a/lib/mesa/src/freedreno/vulkan/vk_format.h b/lib/mesa/src/freedreno/vulkan/vk_format.h index c335a12ae..4e13bc9c0 100644 --- a/lib/mesa/src/freedreno/vulkan/vk_format.h +++ b/lib/mesa/src/freedreno/vulkan/vk_format.h @@ -3,7 +3,7 @@ * Copyright © 2016 Bas Nieuwenhuizen * * Based on u_format.h which is: - * Copyright 2009-2010 VMware, Inc. + * Copyright 2009-2010 Vmware, Inc. * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation @@ -29,29 +29,549 @@ #include <assert.h> #include <util/macros.h> -#include <util/format/u_format.h> -#include <vulkan/util/vk_format.h> #include <vulkan/vulkan.h> +enum vk_format_layout +{ + /** + * Formats with vk_format_block::width == vk_format_block::height == 1 + * that can be described as an ordinary data structure. + */ + VK_FORMAT_LAYOUT_PLAIN = 0, + + /** + * Formats with sub-sampled channels. + * + * This is for formats like YVYU where there is less than one sample per + * pixel. + */ + VK_FORMAT_LAYOUT_SUBSAMPLED = 3, + + /** + * S3 Texture Compression formats. + */ + VK_FORMAT_LAYOUT_S3TC = 4, + + /** + * Red-Green Texture Compression formats. 
+ */ + VK_FORMAT_LAYOUT_RGTC = 5, + + /** + * Ericsson Texture Compression + */ + VK_FORMAT_LAYOUT_ETC = 6, + + /** + * BC6/7 Texture Compression + */ + VK_FORMAT_LAYOUT_BPTC = 7, + + /** + * ASTC + */ + VK_FORMAT_LAYOUT_ASTC = 8, + + /** + * Everything else that doesn't fit in any of the above layouts. + */ + VK_FORMAT_LAYOUT_OTHER = 9 +}; + +struct vk_format_block +{ + /** Block width in pixels */ + unsigned width; + + /** Block height in pixels */ + unsigned height; + + /** Block size in bits */ + unsigned bits; +}; + +enum vk_format_type +{ + VK_FORMAT_TYPE_VOID = 0, + VK_FORMAT_TYPE_UNSIGNED = 1, + VK_FORMAT_TYPE_SIGNED = 2, + VK_FORMAT_TYPE_FIXED = 3, + VK_FORMAT_TYPE_FLOAT = 4 +}; + +enum vk_format_colorspace +{ + VK_FORMAT_COLORSPACE_RGB = 0, + VK_FORMAT_COLORSPACE_SRGB = 1, + VK_FORMAT_COLORSPACE_YUV = 2, + VK_FORMAT_COLORSPACE_ZS = 3 +}; + +struct vk_format_channel_description +{ + unsigned type : 5; + unsigned normalized : 1; + unsigned pure_integer : 1; + unsigned scaled : 1; + unsigned size : 8; + unsigned shift : 16; +}; + +struct vk_format_description +{ + VkFormat format; + const char *name; + const char *short_name; + + struct vk_format_block block; + enum vk_format_layout layout; + + unsigned nr_channels : 3; + unsigned is_array : 1; + unsigned is_bitmask : 1; + unsigned is_mixed : 1; + + struct vk_format_channel_description channel[4]; + + unsigned char swizzle[4]; + + enum vk_format_colorspace colorspace; +}; + +extern const struct vk_format_description vk_format_description_table[]; + +const struct vk_format_description * +vk_format_description(VkFormat format); + +/** + * Return total bits needed for the pixel format per block. + */ +static inline unsigned +vk_format_get_blocksizebits(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 0; + } + + return desc->block.bits; +} + +/** + * Return bytes per block (not pixel) for the given format. 
+ */ +static inline unsigned +vk_format_get_blocksize(VkFormat format) +{ + unsigned bits = vk_format_get_blocksizebits(format); + unsigned bytes = bits / 8; + + assert(bits % 8 == 0); + assert(bytes > 0); + if (bytes == 0) { + bytes = 1; + } + + return bytes; +} + +static inline unsigned +vk_format_get_blockwidth(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 1; + } + + return desc->block.width; +} + +static inline unsigned +vk_format_get_blockheight(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 1; + } + + return desc->block.height; +} + +static inline unsigned +vk_format_get_block_count_width(VkFormat format, unsigned width) +{ + unsigned blockwidth = vk_format_get_blockwidth(format); + return (width + blockwidth - 1) / blockwidth; +} + +static inline unsigned +vk_format_get_block_count_height(VkFormat format, unsigned height) +{ + unsigned blockheight = vk_format_get_blockheight(format); + return (height + blockheight - 1) / blockheight; +} + +static inline unsigned +vk_format_get_block_count(VkFormat format, unsigned width, unsigned height) +{ + return vk_format_get_block_count_width(format, width) * + vk_format_get_block_count_height(format, height); +} + +/** + * Return the index of the first non-void channel + * -1 if no non-void channels + */ +static inline int +vk_format_get_first_non_void_channel(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + int i; + + for (i = 0; i < 4; i++) + if (desc->channel[i].type != VK_FORMAT_TYPE_VOID) + break; + + if (i == 4) + return -1; + + return i; +} + +enum vk_swizzle +{ + VK_SWIZZLE_X, + VK_SWIZZLE_Y, + VK_SWIZZLE_Z, + VK_SWIZZLE_W, + VK_SWIZZLE_0, + VK_SWIZZLE_1, + VK_SWIZZLE_NONE, + VK_SWIZZLE_MAX, /**< Number of enums counter (must be last) */ +}; + +static inline VkImageAspectFlags +vk_format_aspects(VkFormat format) +{ + switch (format) { + case VK_FORMAT_UNDEFINED: + return 0; + + case VK_FORMAT_S8_UINT: + return VK_IMAGE_ASPECT_STENCIL_BIT; + + case VK_FORMAT_D16_UNORM_S8_UINT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D32_SFLOAT: + return VK_IMAGE_ASPECT_DEPTH_BIT; + + default: + return VK_IMAGE_ASPECT_COLOR_BIT; + } +} + +static inline enum vk_swizzle +tu_swizzle_conv(VkComponentSwizzle component, + const unsigned char chan[4], + VkComponentSwizzle vk_swiz) +{ + int x; + + if (vk_swiz == VK_COMPONENT_SWIZZLE_IDENTITY) + vk_swiz = component; + switch (vk_swiz) { + case VK_COMPONENT_SWIZZLE_ZERO: + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_ONE: + return VK_SWIZZLE_1; + case VK_COMPONENT_SWIZZLE_R: + for (x = 0; x < 4; x++) + if (chan[x] == 0) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_G: + for (x = 0; x < 4; x++) + if (chan[x] == 1) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_B: + for (x = 0; x < 4; x++) + if (chan[x] == 2) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_A: + for (x = 0; x < 4; x++) + if (chan[x] == 3) + return x; + return VK_SWIZZLE_1; + default: + unreachable("Illegal swizzle"); + } +} + +static inline void +vk_format_compose_swizzles(const VkComponentMapping *mapping, + const unsigned char swz[4], + enum vk_swizzle dst[4]) +{ + dst[0] = 
tu_swizzle_conv(VK_COMPONENT_SWIZZLE_R, swz, mapping->r); + dst[1] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_G, swz, mapping->g); + dst[2] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_B, swz, mapping->b); + dst[3] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_A, swz, mapping->a); +} + +static inline bool +vk_format_is_compressed(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + switch (desc->layout) { + case VK_FORMAT_LAYOUT_S3TC: + case VK_FORMAT_LAYOUT_RGTC: + case VK_FORMAT_LAYOUT_ETC: + case VK_FORMAT_LAYOUT_BPTC: + case VK_FORMAT_LAYOUT_ASTC: + /* XXX add other formats in the future */ + return true; + default: + return false; + } +} + +static inline bool +vk_format_has_depth(const struct vk_format_description *desc) +{ + return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && + desc->swizzle[0] != VK_SWIZZLE_NONE; +} + +static inline bool +vk_format_has_stencil(const struct vk_format_description *desc) +{ + return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && + desc->swizzle[1] != VK_SWIZZLE_NONE; +} + +static inline bool +vk_format_is_depth_or_stencil(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + return vk_format_has_depth(desc) || vk_format_has_stencil(desc); +} + +static inline bool +vk_format_is_depth(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + return vk_format_has_depth(desc); +} + +static inline bool +vk_format_is_stencil(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + return vk_format_has_stencil(desc); +} + +static inline bool +vk_format_is_color(VkFormat format) +{ + return !vk_format_is_depth_or_stencil(format); +} + +static inline bool +vk_format_has_alpha(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + return (desc->colorspace == VK_FORMAT_COLORSPACE_RGB || + desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) && + desc->swizzle[3] != VK_SWIZZLE_1; +} + +static inline VkFormat +vk_format_depth_only(VkFormat format) +{ + switch (format) { + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_D16_UNORM; + case VK_FORMAT_D24_UNORM_S8_UINT: + return VK_FORMAT_X8_D24_UNORM_PACK32; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_FORMAT_D32_SFLOAT; + default: + return format; + } +} + +static inline bool +vk_format_is_int(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + int channel = vk_format_get_first_non_void_channel(format); + + return channel >= 0 && desc->channel[channel].pure_integer; +} + +static inline bool +vk_format_is_srgb(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + return desc->colorspace == VK_FORMAT_COLORSPACE_SRGB; +} + +static inline VkFormat +vk_format_no_srgb(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R8_SRGB: + return VK_FORMAT_R8_UNORM; + case VK_FORMAT_R8G8_SRGB: + return VK_FORMAT_R8G8_UNORM; + case VK_FORMAT_R8G8B8_SRGB: + return VK_FORMAT_R8G8B8_UNORM; + case VK_FORMAT_B8G8R8_SRGB: + return VK_FORMAT_B8G8R8_UNORM; + case VK_FORMAT_R8G8B8A8_SRGB: + return VK_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_B8G8R8A8_SRGB: + return VK_FORMAT_B8G8R8A8_UNORM; + case VK_FORMAT_A8B8G8R8_SRGB_PACK32: + return 
VK_FORMAT_A8B8G8R8_UNORM_PACK32; + case VK_FORMAT_BC1_RGB_SRGB_BLOCK: + return VK_FORMAT_BC1_RGB_UNORM_BLOCK; + case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: + return VK_FORMAT_BC1_RGBA_UNORM_BLOCK; + case VK_FORMAT_BC2_SRGB_BLOCK: + return VK_FORMAT_BC2_UNORM_BLOCK; + case VK_FORMAT_BC3_SRGB_BLOCK: + return VK_FORMAT_BC3_UNORM_BLOCK; + case VK_FORMAT_BC7_SRGB_BLOCK: + return VK_FORMAT_BC7_UNORM_BLOCK; + case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: + return VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK; + case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: + return VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK; + case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: + return VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK; + default: + assert(!vk_format_is_srgb(format)); + return format; + } +} + +static inline VkFormat +vk_format_stencil_only(VkFormat format) +{ + return VK_FORMAT_S8_UINT; +} + static inline unsigned vk_format_get_component_bits(VkFormat format, - enum util_format_colorspace colorspace, + enum vk_format_colorspace colorspace, unsigned component) { + const struct vk_format_description *desc = vk_format_description(format); + enum vk_format_colorspace desc_colorspace; + + assert(format); + if (!format) { + return 0; + } + + assert(component < 4); + + /* Treat RGB and SRGB as equivalent. */ + if (colorspace == VK_FORMAT_COLORSPACE_SRGB) { + colorspace = VK_FORMAT_COLORSPACE_RGB; + } + if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) { + desc_colorspace = VK_FORMAT_COLORSPACE_RGB; + } else { + desc_colorspace = desc->colorspace; + } + + if (desc_colorspace != colorspace) { + return 0; + } + + switch (desc->swizzle[component]) { + case VK_SWIZZLE_X: + return desc->channel[0].size; + case VK_SWIZZLE_Y: + return desc->channel[1].size; + case VK_SWIZZLE_Z: + return desc->channel[2].size; + case VK_SWIZZLE_W: + return desc->channel[3].size; + default: + return 0; + } +} + +static inline VkFormat +vk_to_non_srgb_format(VkFormat format) +{ switch (format) { - case VK_FORMAT_G8B8G8R8_422_UNORM: - case VK_FORMAT_B8G8R8G8_422_UNORM: - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - /* util_format_get_component_bits doesn't return what we want */ - return 8; + case VK_FORMAT_R8_SRGB: + return VK_FORMAT_R8_UNORM; + case VK_FORMAT_R8G8_SRGB: + return VK_FORMAT_R8G8_UNORM; + case VK_FORMAT_R8G8B8_SRGB: + return VK_FORMAT_R8G8B8_UNORM; + case VK_FORMAT_B8G8R8_SRGB: + return VK_FORMAT_B8G8R8_UNORM; + case VK_FORMAT_R8G8B8A8_SRGB: + return VK_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_B8G8R8A8_SRGB: + return VK_FORMAT_B8G8R8A8_UNORM; + case VK_FORMAT_A8B8G8R8_SRGB_PACK32: + return VK_FORMAT_A8B8G8R8_UNORM_PACK32; default: - break; + return format; } +} - return util_format_get_component_bits(vk_format_to_pipe_format(format), - colorspace, component); +static inline unsigned +vk_format_get_nr_components(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + return desc->nr_channels; } #endif /* VK_FORMAT_H */ diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp index 0a70c6881..0ad6087e5 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp @@ -24,11 +24,9 @@ namespace nv50_ir { -ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info, - nv50_ir_prog_info_out *info_out) +ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info) : BuildUtil(prog), - 
info(info), - info_out(info_out) {} + info(info) {} ConverterCommon::Subroutine * ConverterCommon::getSubroutine(unsigned ip) @@ -84,7 +82,7 @@ ConverterCommon::handleUserClipPlanes() int n, i, c; for (c = 0; c < 4; ++c) { - for (i = 0; i < info_out->io.genUserClip; ++i) { + for (i = 0; i < info->io.genUserClip; ++i) { Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot, TYPE_F32, info->io.ucpBase + i * 16 + c * 4); Value *ucp = mkLoadv(TYPE_F32, sym, NULL); @@ -95,13 +93,13 @@ ConverterCommon::handleUserClipPlanes() } } - const int first = info_out->numOutputs - (info_out->io.genUserClip + 3) / 4; + const int first = info->numOutputs - (info->io.genUserClip + 3) / 4; - for (i = 0; i < info_out->io.genUserClip; ++i) { + for (i = 0; i < info->io.genUserClip; ++i) { n = i / 4 + first; c = i % 4; Symbol *sym = - mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info_out->out[n].slot[c] * 4); + mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4); mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]); } } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h index a144ca23a..e44eea86a 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h @@ -28,7 +28,7 @@ namespace nv50_ir { class ConverterCommon : public BuildUtil { public: - ConverterCommon(Program *, nv50_ir_prog_info *, nv50_ir_prog_info_out *); + ConverterCommon(Program *, nv50_ir_prog_info *); protected: struct Subroutine { @@ -50,7 +50,6 @@ protected: } sub; struct nv50_ir_prog_info *info; - struct nv50_ir_prog_info_out *info_out; Value *fragCoord[4]; Value *clipVtx[4]; Value *outBase; // base address of vertex out patch (for TCP) diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp index 5df3f3168..950923a0d 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp @@ -25,13 +25,11 @@ #include "compiler/nir/nir.h" #include "util/u_debug.h" -#include "util/u_prim.h" #include "codegen/nv50_ir.h" #include "codegen/nv50_ir_from_common.h" #include "codegen/nv50_ir_lowering_helper.h" #include "codegen/nv50_ir_util.h" -#include "tgsi/tgsi_from_mesa.h" #if __cplusplus >= 201103L #include <unordered_map> @@ -60,32 +58,17 @@ type_size(const struct glsl_type *type, bool bindless) return glsl_count_attribute_slots(type, false); } -static void -function_temp_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) -{ - assert(glsl_type_is_vector_or_scalar(type)); - - if (glsl_type_is_scalar(type)) { - glsl_get_natural_size_align_bytes(type, size, align); - } else { - unsigned comp_size = glsl_type_is_boolean(type) ? 
4 : glsl_get_bit_size(type) / 8; - unsigned length = glsl_get_vector_elements(type); - - *size = comp_size * length; - *align = 0x10; - } -} - class Converter : public ConverterCommon { public: - Converter(Program *, nir_shader *, nv50_ir_prog_info *, nv50_ir_prog_info_out *); + Converter(Program *, nir_shader *, nv50_ir_prog_info *); bool run(); private: typedef std::vector<LValue*> LValues; typedef unordered_map<unsigned, LValues> NirDefMap; typedef unordered_map<unsigned, nir_load_const_instr*> ImmediateMap; + typedef unordered_map<unsigned, uint32_t> NirArrayLMemOffsets; typedef unordered_map<unsigned, BasicBlock*> NirBlockMap; CacheMode convert(enum gl_access_qualifier); @@ -98,6 +81,8 @@ private: LValues& convert(nir_register *); LValues& convert(nir_ssa_def *); + ImgFormat convertGLImgFormat(GLuint); + Value* getSrc(nir_alu_src *, uint8_t component = 0); Value* getSrc(nir_register *, uint8_t); Value* getSrc(nir_src *, uint8_t, bool indirect = false); @@ -137,10 +122,9 @@ private: DataType getDType(nir_alu_instr *); DataType getDType(nir_intrinsic_instr *); + DataType getDType(nir_intrinsic_instr *, bool isSigned); DataType getDType(nir_op, uint8_t); - DataFile getFile(nir_intrinsic_op); - std::vector<DataType> getSTypes(nir_alu_instr *); DataType getSType(nir_src &, bool isFloat, bool isSigned); @@ -160,6 +144,7 @@ private: bool visit(nir_alu_instr *); bool visit(nir_block *); bool visit(nir_cf_node *); + bool visit(nir_deref_instr *); bool visit(nir_function *); bool visit(nir_if *); bool visit(nir_instr *); @@ -171,16 +156,21 @@ private: bool visit(nir_tex_instr *); // tex stuff + Value* applyProjection(Value *src, Value *proj); unsigned int getNIRArgCount(TexInstruction::Target&); + // image stuff + uint16_t handleDeref(nir_deref_instr *, Value * & indirect, const nir_variable * &); + CacheMode getCacheModeFromVar(const nir_variable *); + nir_shader *nir; NirDefMap ssaDefs; NirDefMap regDefs; ImmediateMap immediates; + NirArrayLMemOffsets regToLmemOffset; NirBlockMap blocks; unsigned int curLoopDepth; - unsigned int curIfDepth; BasicBlock *exit; Value *zero; @@ -195,14 +185,10 @@ private: }; }; -Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info, - nv50_ir_prog_info_out *info_out) - : ConverterCommon(prog, info, info_out), +Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info) + : ConverterCommon(prog, info), nir(nir), curLoopDepth(0), - curIfDepth(0), - exit(NULL), - immInsertPos(NULL), clipVertexOutput(-1) { zero = mkImm((uint32_t)0); @@ -274,33 +260,29 @@ Converter::getDType(nir_alu_instr *insn) DataType Converter::getDType(nir_intrinsic_instr *insn) { - bool isFloat, isSigned; + bool isSigned; switch (insn->intrinsic) { - case nir_intrinsic_bindless_image_atomic_fadd: - case nir_intrinsic_global_atomic_fadd: - case nir_intrinsic_image_atomic_fadd: - case nir_intrinsic_shared_atomic_fadd: - case nir_intrinsic_ssbo_atomic_fadd: - isFloat = true; - isSigned = false; - break; case nir_intrinsic_shared_atomic_imax: case nir_intrinsic_shared_atomic_imin: case nir_intrinsic_ssbo_atomic_imax: case nir_intrinsic_ssbo_atomic_imin: - isFloat = false; isSigned = true; break; default: - isFloat = false; isSigned = false; break; } + return getDType(insn, isSigned); +} + +DataType +Converter::getDType(nir_intrinsic_instr *insn, bool isSigned) +{ if (insn->dest.is_ssa) - return typeOfSize(insn->dest.ssa.bit_size / 8, isFloat, isSigned); + return typeOfSize(insn->dest.ssa.bit_size / 8, false, isSigned); else - return 
typeOfSize(insn->dest.reg.reg->bit_size / 8, isFloat, isSigned); + return typeOfSize(insn->dest.reg.reg->bit_size / 8, false, isSigned); } DataType @@ -358,29 +340,6 @@ Converter::getSType(nir_src &src, bool isFloat, bool isSigned) return ty; } -DataFile -Converter::getFile(nir_intrinsic_op op) -{ - switch (op) { - case nir_intrinsic_load_global: - case nir_intrinsic_store_global: - case nir_intrinsic_load_global_constant: - return FILE_MEMORY_GLOBAL; - case nir_intrinsic_load_scratch: - case nir_intrinsic_store_scratch: - return FILE_MEMORY_LOCAL; - case nir_intrinsic_load_shared: - case nir_intrinsic_store_shared: - return FILE_MEMORY_SHARED; - case nir_intrinsic_load_kernel_input: - return FILE_SHADER_INPUT; - default: - ERROR("couldn't get DateFile for op %s\n", nir_intrinsic_infos[op].name); - assert(false); - } - return FILE_NULL; -} - operation Converter::getOperation(nir_op op) { @@ -480,7 +439,7 @@ Converter::getOperation(nir_op op) case nir_op_flt32: case nir_op_ilt32: case nir_op_ult32: - case nir_op_fneu32: + case nir_op_fne32: case nir_op_ine32: return OP_SET; case nir_op_ishl: @@ -492,6 +451,9 @@ Converter::getOperation(nir_op op) return OP_SIN; case nir_op_fsqrt: return OP_SQRT; + case nir_op_fsub: + case nir_op_isub: + return OP_SUB; case nir_op_ftrunc: return OP_TRUNC; case nir_op_ixor: @@ -543,39 +505,43 @@ Converter::getOperation(nir_intrinsic_op op) return OP_RESTART; case nir_intrinsic_bindless_image_atomic_add: case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_bindless_image_atomic_comp_swap: case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_bindless_image_atomic_exchange: case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_bindless_image_atomic_min: + case nir_intrinsic_image_atomic_min: + case nir_intrinsic_image_deref_atomic_min: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_image_atomic_inc_wrap: - case nir_intrinsic_bindless_image_atomic_dec_wrap: - case nir_intrinsic_image_atomic_dec_wrap: + case nir_intrinsic_image_deref_atomic_xor: return OP_SUREDP; case nir_intrinsic_bindless_image_load: case nir_intrinsic_image_load: + case nir_intrinsic_image_deref_load: return OP_SULDP; case nir_intrinsic_bindless_image_samples: case nir_intrinsic_image_samples: + case nir_intrinsic_image_deref_samples: case nir_intrinsic_bindless_image_size: case nir_intrinsic_image_size: + case nir_intrinsic_image_deref_size: return OP_SUQ; case nir_intrinsic_bindless_image_store: case nir_intrinsic_image_store: + case nir_intrinsic_image_deref_store: return OP_SUSTP; default: 
ERROR("couldn't get operation for nir_intrinsic_op %u\n", op); @@ -603,10 +569,6 @@ Converter::getSubOp(nir_op op) case nir_op_imul_high: case nir_op_umul_high: return NV50_IR_SUBOP_MUL_HIGH; - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - return NV50_IR_SUBOP_SHIFT_WRAP; default: return 0; } @@ -617,78 +579,61 @@ Converter::getSubOp(nir_intrinsic_op op) { switch (op) { case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_global_atomic_add: case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_shared_atomic_add: case nir_intrinsic_ssbo_atomic_add: return NV50_IR_SUBOP_ATOM_ADD; - case nir_intrinsic_bindless_image_atomic_fadd: - case nir_intrinsic_global_atomic_fadd: - case nir_intrinsic_image_atomic_fadd: - case nir_intrinsic_shared_atomic_fadd: - case nir_intrinsic_ssbo_atomic_fadd: - return NV50_IR_SUBOP_ATOM_ADD; case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_global_atomic_and: case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_shared_atomic_and: case nir_intrinsic_ssbo_atomic_and: return NV50_IR_SUBOP_ATOM_AND; case nir_intrinsic_bindless_image_atomic_comp_swap: - case nir_intrinsic_global_atomic_comp_swap: case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_shared_atomic_comp_swap: case nir_intrinsic_ssbo_atomic_comp_swap: return NV50_IR_SUBOP_ATOM_CAS; case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_global_atomic_exchange: case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_deref_atomic_exchange: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_ssbo_atomic_exchange: return NV50_IR_SUBOP_ATOM_EXCH; case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_global_atomic_or: case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_shared_atomic_or: case nir_intrinsic_ssbo_atomic_or: return NV50_IR_SUBOP_ATOM_OR; - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_deref_atomic_max: case nir_intrinsic_shared_atomic_imax: case nir_intrinsic_shared_atomic_umax: case nir_intrinsic_ssbo_atomic_imax: case nir_intrinsic_ssbo_atomic_umax: return NV50_IR_SUBOP_ATOM_MAX; - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_min: + case nir_intrinsic_image_atomic_min: + case nir_intrinsic_image_deref_atomic_min: case nir_intrinsic_shared_atomic_imin: case nir_intrinsic_shared_atomic_umin: case nir_intrinsic_ssbo_atomic_imin: case nir_intrinsic_ssbo_atomic_umin: return NV50_IR_SUBOP_ATOM_MIN; case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_global_atomic_xor: case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_ssbo_atomic_xor: return NV50_IR_SUBOP_ATOM_XOR; - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case 
nir_intrinsic_image_atomic_inc_wrap: - return NV50_IR_SUBOP_ATOM_INC; - case nir_intrinsic_bindless_image_atomic_dec_wrap: - case nir_intrinsic_image_atomic_dec_wrap: - return NV50_IR_SUBOP_ATOM_DEC; case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: return NV50_IR_SUBOP_MEMBAR(M, GL); @@ -721,7 +666,7 @@ Converter::getCondCode(nir_op op) case nir_op_ilt32: case nir_op_ult32: return CC_LT; - case nir_op_fneu32: + case nir_op_fne32: return CC_NEU; case nir_op_ine32: return CC_NE; @@ -753,8 +698,6 @@ Converter::convert(nir_dest *dest) Converter::LValues& Converter::convert(nir_register *reg) { - assert(!reg->num_array_elems); - NirDefMap::iterator it = regDefs.find(reg->index); if (it != regDefs.end()) return it->second; @@ -914,6 +857,256 @@ vert_attrib_to_tgsi_semantic(gl_vert_attrib slot, unsigned *name, unsigned *inde } } +static void +varying_slot_to_tgsi_semantic(gl_varying_slot slot, unsigned *name, unsigned *index) +{ + assert(name && index); + + if (slot >= VARYING_SLOT_TESS_MAX) { + ERROR("invalid varying slot %u\n", slot); + assert(false); + return; + } + + if (slot >= VARYING_SLOT_PATCH0) { + *name = TGSI_SEMANTIC_PATCH; + *index = slot - VARYING_SLOT_PATCH0; + return; + } + + if (slot >= VARYING_SLOT_VAR0) { + *name = TGSI_SEMANTIC_GENERIC; + *index = slot - VARYING_SLOT_VAR0; + return; + } + + if (slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7) { + *name = TGSI_SEMANTIC_TEXCOORD; + *index = slot - VARYING_SLOT_TEX0; + return; + } + + switch (slot) { + case VARYING_SLOT_BFC0: + *name = TGSI_SEMANTIC_BCOLOR; + *index = 0; + break; + case VARYING_SLOT_BFC1: + *name = TGSI_SEMANTIC_BCOLOR; + *index = 1; + break; + case VARYING_SLOT_CLIP_DIST0: + *name = TGSI_SEMANTIC_CLIPDIST; + *index = 0; + break; + case VARYING_SLOT_CLIP_DIST1: + *name = TGSI_SEMANTIC_CLIPDIST; + *index = 1; + break; + case VARYING_SLOT_CLIP_VERTEX: + *name = TGSI_SEMANTIC_CLIPVERTEX; + *index = 0; + break; + case VARYING_SLOT_COL0: + *name = TGSI_SEMANTIC_COLOR; + *index = 0; + break; + case VARYING_SLOT_COL1: + *name = TGSI_SEMANTIC_COLOR; + *index = 1; + break; + case VARYING_SLOT_EDGE: + *name = TGSI_SEMANTIC_EDGEFLAG; + *index = 0; + break; + case VARYING_SLOT_FACE: + *name = TGSI_SEMANTIC_FACE; + *index = 0; + break; + case VARYING_SLOT_FOGC: + *name = TGSI_SEMANTIC_FOG; + *index = 0; + break; + case VARYING_SLOT_LAYER: + *name = TGSI_SEMANTIC_LAYER; + *index = 0; + break; + case VARYING_SLOT_PNTC: + *name = TGSI_SEMANTIC_PCOORD; + *index = 0; + break; + case VARYING_SLOT_POS: + *name = TGSI_SEMANTIC_POSITION; + *index = 0; + break; + case VARYING_SLOT_PRIMITIVE_ID: + *name = TGSI_SEMANTIC_PRIMID; + *index = 0; + break; + case VARYING_SLOT_PSIZ: + *name = TGSI_SEMANTIC_PSIZE; + *index = 0; + break; + case VARYING_SLOT_TESS_LEVEL_INNER: + *name = TGSI_SEMANTIC_TESSINNER; + *index = 0; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + *name = TGSI_SEMANTIC_TESSOUTER; + *index = 0; + break; + case VARYING_SLOT_VIEWPORT: + *name = TGSI_SEMANTIC_VIEWPORT_INDEX; + *index = 0; + break; + default: + ERROR("unknown varying slot %u\n", slot); + assert(false); + break; + } +} + +static void +frag_result_to_tgsi_semantic(unsigned slot, unsigned *name, unsigned *index) +{ + if (slot >= FRAG_RESULT_DATA0) { + *name = TGSI_SEMANTIC_COLOR; + *index = slot - FRAG_RESULT_COLOR - 2; // intentional + return; + } + + switch (slot) { + case FRAG_RESULT_COLOR: + 
*name = TGSI_SEMANTIC_COLOR; + *index = 0; + break; + case FRAG_RESULT_DEPTH: + *name = TGSI_SEMANTIC_POSITION; + *index = 0; + break; + case FRAG_RESULT_SAMPLE_MASK: + *name = TGSI_SEMANTIC_SAMPLEMASK; + *index = 0; + break; + default: + ERROR("unknown frag result slot %u\n", slot); + assert(false); + break; + } +} + +// copy of _mesa_sysval_to_semantic +static void +system_val_to_tgsi_semantic(unsigned val, unsigned *name, unsigned *index) +{ + *index = 0; + switch (val) { + // Vertex shader + case SYSTEM_VALUE_VERTEX_ID: + *name = TGSI_SEMANTIC_VERTEXID; + break; + case SYSTEM_VALUE_INSTANCE_ID: + *name = TGSI_SEMANTIC_INSTANCEID; + break; + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + *name = TGSI_SEMANTIC_VERTEXID_NOBASE; + break; + case SYSTEM_VALUE_BASE_VERTEX: + *name = TGSI_SEMANTIC_BASEVERTEX; + break; + case SYSTEM_VALUE_BASE_INSTANCE: + *name = TGSI_SEMANTIC_BASEINSTANCE; + break; + case SYSTEM_VALUE_DRAW_ID: + *name = TGSI_SEMANTIC_DRAWID; + break; + + // Geometry shader + case SYSTEM_VALUE_INVOCATION_ID: + *name = TGSI_SEMANTIC_INVOCATIONID; + break; + + // Fragment shader + case SYSTEM_VALUE_FRAG_COORD: + *name = TGSI_SEMANTIC_POSITION; + break; + case SYSTEM_VALUE_FRONT_FACE: + *name = TGSI_SEMANTIC_FACE; + break; + case SYSTEM_VALUE_SAMPLE_ID: + *name = TGSI_SEMANTIC_SAMPLEID; + break; + case SYSTEM_VALUE_SAMPLE_POS: + *name = TGSI_SEMANTIC_SAMPLEPOS; + break; + case SYSTEM_VALUE_SAMPLE_MASK_IN: + *name = TGSI_SEMANTIC_SAMPLEMASK; + break; + case SYSTEM_VALUE_HELPER_INVOCATION: + *name = TGSI_SEMANTIC_HELPER_INVOCATION; + break; + + // Tessellation shader + case SYSTEM_VALUE_TESS_COORD: + *name = TGSI_SEMANTIC_TESSCOORD; + break; + case SYSTEM_VALUE_VERTICES_IN: + *name = TGSI_SEMANTIC_VERTICESIN; + break; + case SYSTEM_VALUE_PRIMITIVE_ID: + *name = TGSI_SEMANTIC_PRIMID; + break; + case SYSTEM_VALUE_TESS_LEVEL_OUTER: + *name = TGSI_SEMANTIC_TESSOUTER; + break; + case SYSTEM_VALUE_TESS_LEVEL_INNER: + *name = TGSI_SEMANTIC_TESSINNER; + break; + + // Compute shader + case SYSTEM_VALUE_LOCAL_INVOCATION_ID: + *name = TGSI_SEMANTIC_THREAD_ID; + break; + case SYSTEM_VALUE_WORK_GROUP_ID: + *name = TGSI_SEMANTIC_BLOCK_ID; + break; + case SYSTEM_VALUE_NUM_WORK_GROUPS: + *name = TGSI_SEMANTIC_GRID_SIZE; + break; + case SYSTEM_VALUE_LOCAL_GROUP_SIZE: + *name = TGSI_SEMANTIC_BLOCK_SIZE; + break; + + // ARB_shader_ballot + case SYSTEM_VALUE_SUBGROUP_SIZE: + *name = TGSI_SEMANTIC_SUBGROUP_SIZE; + break; + case SYSTEM_VALUE_SUBGROUP_INVOCATION: + *name = TGSI_SEMANTIC_SUBGROUP_INVOCATION; + break; + case SYSTEM_VALUE_SUBGROUP_EQ_MASK: + *name = TGSI_SEMANTIC_SUBGROUP_EQ_MASK; + break; + case SYSTEM_VALUE_SUBGROUP_GE_MASK: + *name = TGSI_SEMANTIC_SUBGROUP_GE_MASK; + break; + case SYSTEM_VALUE_SUBGROUP_GT_MASK: + *name = TGSI_SEMANTIC_SUBGROUP_GT_MASK; + break; + case SYSTEM_VALUE_SUBGROUP_LE_MASK: + *name = TGSI_SEMANTIC_SUBGROUP_LE_MASK; + break; + case SYSTEM_VALUE_SUBGROUP_LT_MASK: + *name = TGSI_SEMANTIC_SUBGROUP_LT_MASK; + break; + + default: + ERROR("unknown system value %u\n", val); + assert(false); + break; + } +} + void Converter::setInterpolate(nv50_ir_varying *var, uint8_t mode, @@ -949,7 +1142,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, uint16_t slots; switch (stage) { case Program::TYPE_GEOMETRY: - slots = type->count_attribute_slots(false); + slots = type->uniform_locations(); if (input) slots /= info.gs.vertices_in; break; @@ -957,9 +1150,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, case 
Program::TYPE_TESSELLATION_EVAL: // remove first dimension if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL)) - slots = type->count_attribute_slots(false); + slots = type->uniform_locations(); else - slots = type->fields.array->count_attribute_slots(false); + slots = type->fields.array->uniform_locations(); break; default: slots = type->count_attribute_slots(false); @@ -969,94 +1162,93 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, return slots; } -static uint8_t -getMaskForType(const glsl_type *type, uint8_t slot) { - uint16_t comp = type->without_array()->components(); - comp = comp ? comp : 4; - - if (glsl_base_type_is_64bit(type->without_array()->base_type)) { - comp *= 2; - if (comp > 4) { - if (slot % 2) - comp -= 4; - else - comp = 4; - } - } - - return (1 << comp) - 1; -} - bool Converter::assignSlots() { unsigned name; unsigned index; info->io.viewportId = -1; - info_out->numInputs = 0; - info_out->numOutputs = 0; - info_out->numSysVals = 0; + info->numInputs = 0; + info->numOutputs = 0; - uint8_t i; - BITSET_FOREACH_SET(i, nir->info.system_values_read, SYSTEM_VALUE_MAX) { - info_out->sv[info_out->numSysVals].sn = tgsi_get_sysval_semantic(i); - info_out->sv[info_out->numSysVals].si = 0; - info_out->sv[info_out->numSysVals].input = 0; // TODO inferSysValDirection(sn); + // we have to fixup the uniform locations for arrays + unsigned numImages = 0; + nir_foreach_variable(var, &nir->uniforms) { + const glsl_type *type = var->type; + if (!type->without_array()->is_image()) + continue; + var->data.driver_location = numImages; + numImages += type->is_array() ? type->arrays_of_arrays_size() : 1; + } + + info->numSysVals = 0; + for (uint8_t i = 0; i < SYSTEM_VALUE_MAX; ++i) { + if (!(nir->info.system_values_read & 1ull << i)) + continue; + + system_val_to_tgsi_semantic(i, &name, &index); + info->sv[info->numSysVals].sn = name; + info->sv[info->numSysVals].si = index; + info->sv[info->numSysVals].input = 0; // TODO inferSysValDirection(sn); switch (i) { case SYSTEM_VALUE_INSTANCE_ID: - info_out->io.instanceId = info_out->numSysVals; + info->io.instanceId = info->numSysVals; break; case SYSTEM_VALUE_TESS_LEVEL_INNER: case SYSTEM_VALUE_TESS_LEVEL_OUTER: - info_out->sv[info_out->numSysVals].patch = 1; + info->sv[info->numSysVals].patch = 1; break; case SYSTEM_VALUE_VERTEX_ID: - info_out->io.vertexId = info_out->numSysVals; + info->io.vertexId = info->numSysVals; break; default: break; } - info_out->numSysVals += 1; + info->numSysVals += 1; } if (prog->getType() == Program::TYPE_COMPUTE) return true; - nir_foreach_shader_in_variable(var, nir) { + nir_foreach_variable(var, &nir->inputs) { const glsl_type *type = var->type; int slot = var->data.location; uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var); + uint32_t comp = type->is_array() ? 
type->without_array()->component_slots() + : type->component_slots(); + uint32_t frac = var->data.location_frac; uint32_t vary = var->data.driver_location; + + if (glsl_base_type_is_64bit(type->without_array()->base_type)) { + if (comp > 2) + slots *= 2; + } + assert(vary + slots <= PIPE_MAX_SHADER_INPUTS); switch(prog->getType()) { case Program::TYPE_FRAGMENT: - tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, - &name, &index); + varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); for (uint16_t i = 0; i < slots; ++i) { - setInterpolate(&info_out->in[vary + i], var->data.interpolation, + setInterpolate(&info->in[vary + i], var->data.interpolation, var->data.centroid | var->data.sample, name); } break; case Program::TYPE_GEOMETRY: - tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, - &name, &index); + varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); break; case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: - tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, - &name, &index); + varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); if (var->data.patch && name == TGSI_SEMANTIC_PATCH) - info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots); + info->numPatchConstants = MAX2(info->numPatchConstants, index + slots); break; case Program::TYPE_VERTEX: - if (slot >= VERT_ATTRIB_GENERIC0) - slot = VERT_ATTRIB_GENERIC0 + vary; vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index); switch (name) { case TGSI_SEMANTIC_EDGEFLAG: - info_out->io.edgeFlagIn = vary; + info->io.edgeFlagIn = vary; break; default: break; @@ -1068,44 +1260,55 @@ bool Converter::assignSlots() { } for (uint16_t i = 0u; i < slots; ++i, ++vary) { - nv50_ir_varying *v = &info_out->in[vary]; - - v->patch = var->data.patch; - v->sn = name; - v->si = index + i; - v->mask |= getMaskForType(type, i) << var->data.location_frac; + info->in[vary].id = vary; + info->in[vary].patch = var->data.patch; + info->in[vary].sn = name; + info->in[vary].si = index + i; + if (glsl_base_type_is_64bit(type->without_array()->base_type)) + if (i & 0x1) + info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); + else + info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); + else + info->in[vary].mask |= ((1 << comp) - 1) << frac; } - info_out->numInputs = std::max<uint8_t>(info_out->numInputs, vary); + info->numInputs = std::max<uint8_t>(info->numInputs, vary); } - nir_foreach_shader_out_variable(var, nir) { + nir_foreach_variable(var, &nir->outputs) { const glsl_type *type = var->type; int slot = var->data.location; uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var); + uint32_t comp = type->is_array() ? 
type->without_array()->component_slots() + : type->component_slots(); + uint32_t frac = var->data.location_frac; uint32_t vary = var->data.driver_location; + if (glsl_base_type_is_64bit(type->without_array()->base_type)) { + if (comp > 2) + slots *= 2; + } + assert(vary < PIPE_MAX_SHADER_OUTPUTS); switch(prog->getType()) { case Program::TYPE_FRAGMENT: - tgsi_get_gl_frag_result_semantic((gl_frag_result)slot, &name, &index); + frag_result_to_tgsi_semantic((gl_frag_result)slot, &name, &index); switch (name) { case TGSI_SEMANTIC_COLOR: if (!var->data.fb_fetch_output) - info_out->prop.fp.numColourResults++; - if (var->data.location == FRAG_RESULT_COLOR && - nir->info.outputs_written & BITFIELD64_BIT(var->data.location)) - info_out->prop.fp.separateFragData = true; + info->prop.fp.numColourResults++; + info->prop.fp.separateFragData = true; // sometimes we get FRAG_RESULT_DATAX with data.index 0 // sometimes we get FRAG_RESULT_DATA0 with data.index X index = index == 0 ? var->data.index : index; break; case TGSI_SEMANTIC_POSITION: - info_out->io.fragDepth = vary; - info_out->prop.fp.writesDepth = true; + info->io.fragDepth = vary; + info->prop.fp.writesDepth = true; break; case TGSI_SEMANTIC_SAMPLEMASK: - info_out->io.sampleMask = vary; + info->io.sampleMask = vary; break; default: break; @@ -1115,22 +1318,21 @@ bool Converter::assignSlots() { case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: case Program::TYPE_VERTEX: - tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, - &name, &index); + varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); if (var->data.patch && name != TGSI_SEMANTIC_TESSINNER && name != TGSI_SEMANTIC_TESSOUTER) - info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots); + info->numPatchConstants = MAX2(info->numPatchConstants, index + slots); switch (name) { case TGSI_SEMANTIC_CLIPDIST: - info_out->io.genUserClip = -1; + info->io.genUserClip = -1; break; case TGSI_SEMANTIC_CLIPVERTEX: clipVertexOutput = vary; break; case TGSI_SEMANTIC_EDGEFLAG: - info_out->io.edgeFlagOut = vary; + info->io.edgeFlagOut = vary; break; case TGSI_SEMANTIC_POSITION: if (clipVertexOutput < 0) @@ -1146,33 +1348,39 @@ bool Converter::assignSlots() { } for (uint16_t i = 0u; i < slots; ++i, ++vary) { - nv50_ir_varying *v = &info_out->out[vary]; - v->patch = var->data.patch; - v->sn = name; - v->si = index + i; - v->mask |= getMaskForType(type, i) << var->data.location_frac; + info->out[vary].id = vary; + info->out[vary].patch = var->data.patch; + info->out[vary].sn = name; + info->out[vary].si = index + i; + if (glsl_base_type_is_64bit(type->without_array()->base_type)) + if (i & 0x1) + info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); + else + info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); + else + info->out[vary].mask |= ((1 << comp) - 1) << frac; if (nir->info.outputs_read & 1ull << slot) - v->oread = 1; + info->out[vary].oread = 1; } - info_out->numOutputs = std::max<uint8_t>(info_out->numOutputs, vary); + info->numOutputs = std::max<uint8_t>(info->numOutputs, vary); } - if (info_out->io.genUserClip > 0) { - info_out->io.clipDistances = info_out->io.genUserClip; + if (info->io.genUserClip > 0) { + info->io.clipDistances = info->io.genUserClip; - const unsigned int nOut = (info_out->io.genUserClip + 3) / 4; + const unsigned int nOut = (info->io.genUserClip + 3) / 4; for (unsigned int n = 0; n < nOut; ++n) { - unsigned int i = info_out->numOutputs++; - info_out->out[i].id = i; - 
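// A worked example for the 64-bit mask expression used above when filling
// info->in[] / info->out[] (a sketch; plugging comp = 3 and frac = 0 into the
// formula as written): (((1 << (comp * 2)) - 1) << (frac * 2)) == 0x3f, so the
// even slot receives 0x3f & 0xf == 0xf and the odd slot 0x3f >> 0x4 == 0x3,
// i.e. the value covers one full vec4 slot plus half of the next one.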
info_out->out[i].sn = TGSI_SEMANTIC_CLIPDIST; - info_out->out[i].si = n; - info_out->out[i].mask = ((1 << info_out->io.clipDistances) - 1) >> (n * 4); + unsigned int i = info->numOutputs++; + info->out[i].id = i; + info->out[i].sn = TGSI_SEMANTIC_CLIPDIST; + info->out[i].si = n; + info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4); } } - return info->assignSlots(info_out) == 0; + return info->assignSlots(info) == 0; } uint32_t @@ -1222,7 +1430,7 @@ Converter::getSlotAddress(nir_intrinsic_instr *insn, uint8_t idx, uint8_t slot) assert(!input || idx < PIPE_MAX_SHADER_INPUTS); assert(input || idx < PIPE_MAX_SHADER_OUTPUTS); - const nv50_ir_varying *vary = input ? info_out->in : info_out->out; + const nv50_ir_varying *vary = input ? info->in : info->out; return vary[idx].slot[slot] * 4; } @@ -1280,63 +1488,66 @@ Converter::storeTo(nir_intrinsic_instr *insn, DataFile file, operation op, } mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address), indirect0, - split[0])->perPatch = info_out->out[idx].patch; + split[0])->perPatch = info->out[idx].patch; mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address + 4), indirect0, - split[1])->perPatch = info_out->out[idx].patch; + split[1])->perPatch = info->out[idx].patch; } else { if (op == OP_EXPORT) src = mkMov(getSSA(size), src, ty)->getDef(0); mkStore(op, ty, mkSymbol(file, 0, ty, address), indirect0, - src)->perPatch = info_out->out[idx].patch; + src)->perPatch = info->out[idx].patch; } } bool Converter::parseNIR() { - info_out->bin.tlsSpace = nir->scratch_size; - info_out->io.clipDistances = nir->info.clip_distance_array_size; - info_out->io.cullDistances = nir->info.cull_distance_array_size; - info_out->io.layer_viewport_relative = nir->info.layer_viewport_relative; + info->bin.tlsSpace = 0; + info->io.clipDistances = nir->info.clip_distance_array_size; + info->io.cullDistances = nir->info.cull_distance_array_size; switch(prog->getType()) { case Program::TYPE_COMPUTE: - info->prop.cp.numThreads[0] = nir->info.workgroup_size[0]; - info->prop.cp.numThreads[1] = nir->info.workgroup_size[1]; - info->prop.cp.numThreads[2] = nir->info.workgroup_size[2]; - info_out->bin.smemSize = std::max(info_out->bin.smemSize, nir->info.shared_size); + info->prop.cp.numThreads[0] = nir->info.cs.local_size[0]; + info->prop.cp.numThreads[1] = nir->info.cs.local_size[1]; + info->prop.cp.numThreads[2] = nir->info.cs.local_size[2]; + info->bin.smemSize = nir->info.cs.shared_size; break; case Program::TYPE_FRAGMENT: - info_out->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests; - prog->persampleInvocation = - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS); - info_out->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage; - info_out->prop.fp.readsSampleLocations = - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS); - info_out->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote; - info_out->prop.fp.usesSampleMaskIn = - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); + info->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests; + info->prop.fp.persampleInvocation = + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_ID) || + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); + info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage; + info->prop.fp.readsSampleLocations = + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); + info->prop.fp.usesDiscard = 
nir->info.fs.uses_discard; + info->prop.fp.usesSampleMaskIn = + !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN); break; case Program::TYPE_GEOMETRY: - info_out->prop.gp.instanceCount = nir->info.gs.invocations; - info_out->prop.gp.maxVertices = nir->info.gs.vertices_out; - info_out->prop.gp.outputPrim = nir->info.gs.output_primitive; + info->prop.gp.inputPrim = nir->info.gs.input_primitive; + info->prop.gp.instanceCount = nir->info.gs.invocations; + info->prop.gp.maxVertices = nir->info.gs.vertices_out; + info->prop.gp.outputPrim = nir->info.gs.output_primitive; break; case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: - info_out->prop.tp.domain = u_tess_prim_from_shader(nir->info.tess._primitive_mode); - info_out->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out; - info_out->prop.tp.outputPrim = + if (nir->info.tess.primitive_mode == GL_ISOLINES) + info->prop.tp.domain = GL_LINES; + else + info->prop.tp.domain = nir->info.tess.primitive_mode; + info->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out; + info->prop.tp.outputPrim = nir->info.tess.point_mode ? PIPE_PRIM_POINTS : PIPE_PRIM_TRIANGLES; - info_out->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3; - info_out->prop.tp.winding = !nir->info.tess.ccw; + info->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3; + info->prop.tp.winding = !nir->info.tess.ccw; break; case Program::TYPE_VERTEX: - info_out->prop.vp.usesDrawParameters = - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); + info->prop.vp.usesDrawParameters = + (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX)) || + (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) || + (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)); break; default: break; @@ -1359,7 +1570,7 @@ Converter::visit(nir_function *function) setPosition(entry, true); - if (info_out->io.genUserClip > 0) { + if (info->io.genUserClip > 0) { for (int c = 0; c < 4; ++c) clipVtx[c] = getScratch(); } @@ -1381,6 +1592,16 @@ Converter::visit(nir_function *function) break; } + nir_foreach_register(reg, &function->impl->registers) { + if (reg->num_array_elems) { + // TODO: packed variables would be nice, but MemoryOpt fails + // replace 4 with reg->num_components + uint32_t size = 4 * reg->num_array_elems * (reg->bit_size / 8); + regToLmemOffset[reg->index] = info->bin.tlsSpace; + info->bin.tlsSpace += size; + } + } + nir_index_ssa_defs(function->impl); foreach_list_typed(nir_cf_node, node, node, &function->impl->body) { if (!visit(node)) @@ -1392,7 +1613,7 @@ Converter::visit(nir_function *function) if ((prog->getType() == Program::TYPE_VERTEX || prog->getType() == Program::TYPE_TESSELLATION_EVAL) - && info_out->io.genUserClip > 0) + && info->io.genUserClip > 0) handleUserClipPlanes(); // TODO: for non main function this needs to be a OP_RETURN @@ -1435,69 +1656,64 @@ Converter::visit(nir_block *block) bool Converter::visit(nir_if *nif) { - curIfDepth++; - DataType sType = getSType(nif->condition, false, false); Value *src = getSrc(&nif->condition, 0); nir_block *lastThen = nir_if_last_then_block(nif); nir_block *lastElse = nir_if_last_else_block(nif); - BasicBlock *headBB = bb; + assert(!lastThen->successors[1]); + assert(!lastElse->successors[1]); + BasicBlock *ifBB = convert(nir_if_first_then_block(nif)); BasicBlock *elseBB 
= convert(nir_if_first_else_block(nif)); bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE); bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE); - bool insertJoins = lastThen->successors[0] == lastElse->successors[0]; + // we only insert joinats, if both nodes end up at the end of the if again. + // the reason for this to not happens are breaks/continues/ret/... which + // have their own handling + if (lastThen->successors[0] == lastElse->successors[0]) + bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]), + CC_ALWAYS, NULL); + mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType); foreach_list_typed(nir_cf_node, node, node, &nif->then_list) { if (!visit(node)) return false; } - setPosition(convert(lastThen), true); - if (!bb->isTerminated()) { + if (!bb->getExit() || + !bb->getExit()->asFlow() || + bb->getExit()->asFlow()->op == OP_JOIN) { BasicBlock *tailBB = convert(lastThen->successors[0]); mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL); bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD); - } else { - insertJoins = insertJoins && bb->getExit()->op == OP_BRA; } foreach_list_typed(nir_cf_node, node, node, &nif->else_list) { if (!visit(node)) return false; } - setPosition(convert(lastElse), true); - if (!bb->isTerminated()) { + if (!bb->getExit() || + !bb->getExit()->asFlow() || + bb->getExit()->asFlow()->op == OP_JOIN) { BasicBlock *tailBB = convert(lastElse->successors[0]); mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL); bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD); - } else { - insertJoins = insertJoins && bb->getExit()->op == OP_BRA; } - /* only insert joins for the most outer if */ - if (--curIfDepth) - insertJoins = false; - - /* we made sure that all threads would converge at the same block */ - if (insertJoins) { - BasicBlock *conv = convert(lastThen->successors[0]); - setPosition(headBB->getExit(), false); - headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL); - setPosition(conv, false); + if (lastThen->successors[0] == lastElse->successors[0]) { + setPosition(convert(lastThen->successors[0]), true); mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; } return true; } -// TODO: add convergency bool Converter::visit(nir_loop *loop) { @@ -1505,8 +1721,8 @@ Converter::visit(nir_loop *loop) func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth); BasicBlock *loopBB = convert(nir_loop_first_block(loop)); - BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); - + BasicBlock *tailBB = + convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE); mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL); @@ -1517,15 +1733,19 @@ Converter::visit(nir_loop *loop) if (!visit(node)) return false; } - - if (!bb->isTerminated()) { - mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); - bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); + Instruction *insn = bb->getExit(); + if (bb->cfg.incidentCount() != 0) { + if (!insn || !insn->asFlow()) { + mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); + bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); + } else if (insn && insn->op == OP_BRA && !insn->getPredicate() && + tailBB->cfg.incidentCount() == 0) { + // RA doesn't like having blocks around with no incident edge, + // so we create a fake one to make it happy + bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE); + } } - if (tailBB->cfg.incidentCount() == 0) - loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE); - curLoopDepth -= 1; return true; @@ -1539,6 +1759,8 @@ Converter::visit(nir_instr *insn) switch (insn->type) { case 
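// Rough sketch of the flow emitted by visit(nir_if) above for
// "if (c) { A } else { B } C", assuming both arms fall through to C:
//   head:  joinat C            (only when then/else reconverge at C)
//          bra else, CC_EQ c   (taken when c == 0)
//   then:  A
//          bra C
//   else:  B
//          bra C
//   C:     join (fixed)
// Breaks, continues and returns inside an arm make the arms' successors
// differ, which suppresses the joinat/join pair as the code comment explains.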
nir_instr_type_alu: return visit(nir_instr_as_alu(insn)); + case nir_instr_type_deref: + return visit(nir_instr_as_deref(insn)); case nir_instr_type_intrinsic: return visit(nir_instr_as_intrinsic(insn)); case nir_instr_type_jump: @@ -1568,18 +1790,17 @@ Converter::convert(nir_intrinsic_op intr) return SV_DRAWID; case nir_intrinsic_load_front_face: return SV_FACE; - case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_helper_invocation: return SV_THREAD_KILL; case nir_intrinsic_load_instance_id: return SV_INSTANCE_ID; case nir_intrinsic_load_invocation_id: return SV_INVOCATION_ID; - case nir_intrinsic_load_workgroup_size: + case nir_intrinsic_load_local_group_size: return SV_NTID; case nir_intrinsic_load_local_invocation_id: return SV_TID; - case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_num_work_groups: return SV_NCTAID; case nir_intrinsic_load_patch_vertices_in: return SV_VERTEX_COUNT; @@ -1611,10 +1832,8 @@ Converter::convert(nir_intrinsic_op intr) return SV_TESS_OUTER; case nir_intrinsic_load_vertex_id: return SV_VERTEX_ID; - case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_work_group_id: return SV_CTAID; - case nir_intrinsic_load_work_dim: - return SV_WORK_DIM; default: ERROR("unknown SVSemantic for nir_intrinsic_op %s\n", nir_intrinsic_infos[intr].name); @@ -1623,12 +1842,73 @@ Converter::convert(nir_intrinsic_op intr) } } +ImgFormat +Converter::convertGLImgFormat(GLuint format) +{ +#define FMT_CASE(a, b) \ + case GL_ ## a: return nv50_ir::FMT_ ## b + + switch (format) { + FMT_CASE(NONE, NONE); + + FMT_CASE(RGBA32F, RGBA32F); + FMT_CASE(RGBA16F, RGBA16F); + FMT_CASE(RG32F, RG32F); + FMT_CASE(RG16F, RG16F); + FMT_CASE(R11F_G11F_B10F, R11G11B10F); + FMT_CASE(R32F, R32F); + FMT_CASE(R16F, R16F); + + FMT_CASE(RGBA32UI, RGBA32UI); + FMT_CASE(RGBA16UI, RGBA16UI); + FMT_CASE(RGB10_A2UI, RGB10A2UI); + FMT_CASE(RGBA8UI, RGBA8UI); + FMT_CASE(RG32UI, RG32UI); + FMT_CASE(RG16UI, RG16UI); + FMT_CASE(RG8UI, RG8UI); + FMT_CASE(R32UI, R32UI); + FMT_CASE(R16UI, R16UI); + FMT_CASE(R8UI, R8UI); + + FMT_CASE(RGBA32I, RGBA32I); + FMT_CASE(RGBA16I, RGBA16I); + FMT_CASE(RGBA8I, RGBA8I); + FMT_CASE(RG32I, RG32I); + FMT_CASE(RG16I, RG16I); + FMT_CASE(RG8I, RG8I); + FMT_CASE(R32I, R32I); + FMT_CASE(R16I, R16I); + FMT_CASE(R8I, R8I); + + FMT_CASE(RGBA16, RGBA16); + FMT_CASE(RGB10_A2, RGB10A2); + FMT_CASE(RGBA8, RGBA8); + FMT_CASE(RG16, RG16); + FMT_CASE(RG8, RG8); + FMT_CASE(R16, R16); + FMT_CASE(R8, R8); + + FMT_CASE(RGBA16_SNORM, RGBA16_SNORM); + FMT_CASE(RGBA8_SNORM, RGBA8_SNORM); + FMT_CASE(RG16_SNORM, RG16_SNORM); + FMT_CASE(RG8_SNORM, RG8_SNORM); + FMT_CASE(R16_SNORM, R16_SNORM); + FMT_CASE(R8_SNORM, R8_SNORM); + + FMT_CASE(BGRA_INTEGER, BGRA8); + default: + ERROR("unknown format %x\n", format); + assert(false); + return nv50_ir::FMT_NONE; + } +#undef FMT_CASE +} + bool Converter::visit(nir_intrinsic_instr *insn) { nir_intrinsic_op op = insn->intrinsic; const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op]; - unsigned dest_components = nir_intrinsic_dest_components(insn); switch (op) { case nir_intrinsic_load_uniform: { @@ -1636,7 +1916,7 @@ Converter::visit(nir_intrinsic_instr *insn) const DataType dType = getDType(insn); Value *indirect; uint32_t coffset = getIndirect(insn, 0, 0, indirect); - for (uint8_t i = 0; i < dest_components; ++i) { + for (uint8_t i = 0; i < insn->num_components; ++i) { loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect); } break; @@ -1647,7 +1927,7 @@ Converter::visit(nir_intrinsic_instr *insn) DataType 
dType = getSType(insn->src[0], false, false); uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect); - for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { if (!((1u << i) & nir_intrinsic_write_mask(insn))) continue; @@ -1655,7 +1935,7 @@ Converter::visit(nir_intrinsic_instr *insn) Value *src = getSrc(&insn->src[0], i); switch (prog->getType()) { case Program::TYPE_FRAGMENT: { - if (info_out->out[idx].sn == TGSI_SEMANTIC_POSITION) { + if (info->out[idx].sn == TGSI_SEMANTIC_POSITION) { // TGSI uses a different interface than NIR, TGSI stores that // value in the z component, NIR in X offset += 2; @@ -1664,9 +1944,8 @@ Converter::visit(nir_intrinsic_instr *insn) break; } case Program::TYPE_GEOMETRY: - case Program::TYPE_TESSELLATION_EVAL: case Program::TYPE_VERTEX: { - if (info_out->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) { + if (info->io.genUserClip > 0 && idx == clipVertexOutput) { mkMov(clipVtx[i], src); src = clipVtx[i]; } @@ -1701,19 +1980,19 @@ Converter::visit(nir_intrinsic_instr *insn) srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0))); srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0))); - for (uint8_t i = 0u; i < dest_components; ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { defs.push_back(newDefs[i]); mask |= 1 << i; } TexInstruction *texi = mkTex(OP_TXF, TEX_TARGET_2D_MS_ARRAY, 0, 0, defs, srcs); - texi->tex.levelZero = true; + texi->tex.levelZero = 1; texi->tex.mask = mask; texi->tex.useOffsets = 0; texi->tex.r = 0xffff; texi->tex.s = 0xffff; - info_out->prop.fp.readsFramebuffer = true; + info->prop.fp.readsFramebuffer = true; break; } @@ -1724,29 +2003,19 @@ Converter::visit(nir_intrinsic_instr *insn) uint32_t mode = 0; uint32_t idx = getIndirect(insn, op == nir_intrinsic_load_interpolated_input ? 1 : 0, 0, indirect); - nv50_ir_varying& vary = input ? info_out->in[idx] : info_out->out[idx]; + nv50_ir_varying& vary = input ? info->in[idx] : info->out[idx]; // see load_barycentric_* handling if (prog->getType() == Program::TYPE_FRAGMENT) { + mode = translateInterpMode(&vary, nvirOp); if (op == nir_intrinsic_load_interpolated_input) { ImmediateValue immMode; if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode)) - mode = immMode.reg.data.u32; - } - if (mode == NV50_IR_INTERP_DEFAULT) - mode |= translateInterpMode(&vary, nvirOp); - else { - if (vary.linear) { - nvirOp = OP_LINTERP; - mode |= NV50_IR_INTERP_LINEAR; - } else { - nvirOp = OP_PINTERP; - mode |= NV50_IR_INTERP_PERSPECTIVE; - } + mode |= immMode.reg.data.u32; } } - for (uint8_t i = 0u; i < dest_components; ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { uint32_t address = getSlotAddress(insn, idx, i); Symbol *sym = mkSymbol(input ? 
FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address); if (prog->getType() == Program::TYPE_FRAGMENT) { @@ -1789,6 +2058,18 @@ Converter::visit(nir_intrinsic_instr *insn) } break; } + case nir_intrinsic_load_kernel_input: { + assert(prog->getType() == Program::TYPE_COMPUTE); + assert(insn->num_components == 1); + + LValues &newDefs = convert(&insn->dest); + const DataType dType = getDType(insn); + Value *indirect; + uint32_t idx = getIndirect(insn, 0, 0, indirect, true); + + mkLoad(dType, newDefs[0], mkSymbol(FILE_SHADER_INPUT, 0, dType, idx), indirect); + break; + } case nir_intrinsic_load_barycentric_at_offset: case nir_intrinsic_load_barycentric_at_sample: case nir_intrinsic_load_barycentric_centroid: @@ -1815,11 +2096,8 @@ Converter::visit(nir_intrinsic_instr *insn) } else if (op == nir_intrinsic_load_barycentric_pixel) { mode = NV50_IR_INTERP_DEFAULT; } else if (op == nir_intrinsic_load_barycentric_at_sample) { - info_out->prop.fp.readsSampleLocations = true; - Value *sample = getSSA(); - mkOp3(OP_SELP, TYPE_U32, sample, mkImm(0), getSrc(&insn->src[0], 0), mkImm(0)) - ->subOp = 2; - mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], sample)->subOp = NV50_IR_SUBOP_PIXLD_OFFSET; + info->prop.fp.readsSampleLocations = true; + mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], getSrc(&insn->src[0], 0))->subOp = NV50_IR_SUBOP_PIXLD_OFFSET; mode = NV50_IR_INTERP_OFFSET; } else { unreachable("all intrinsics already handled above"); @@ -1828,11 +2106,9 @@ Converter::visit(nir_intrinsic_instr *insn) loadImm(newDefs[1], mode); break; } - case nir_intrinsic_demote: case nir_intrinsic_discard: mkOp(OP_DISCARD, TYPE_NONE, NULL); break; - case nir_intrinsic_demote_if: case nir_intrinsic_discard_if: { Value *pred = getSSA(1, FILE_PREDICATE); if (insn->num_components > 1) { @@ -1848,13 +2124,12 @@ Converter::visit(nir_intrinsic_instr *insn) case nir_intrinsic_load_base_instance: case nir_intrinsic_load_draw_id: case nir_intrinsic_load_front_face: - case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_helper_invocation: case nir_intrinsic_load_instance_id: case nir_intrinsic_load_invocation_id: - case nir_intrinsic_load_workgroup_size: + case nir_intrinsic_load_local_group_size: case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_num_work_groups: case nir_intrinsic_load_patch_vertices_in: case nir_intrinsic_load_primitive_id: case nir_intrinsic_load_sample_id: @@ -1870,13 +2145,12 @@ Converter::visit(nir_intrinsic_instr *insn) case nir_intrinsic_load_tess_level_inner: case nir_intrinsic_load_tess_level_outer: case nir_intrinsic_load_vertex_id: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_work_dim: { + case nir_intrinsic_load_work_group_id: { const DataType dType = getDType(insn); SVSemantic sv = convert(op); LValues &newDefs = convert(&insn->dest); - for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { Value *def; if (typeSizeof(dType) == 8) def = getSSA(); @@ -1928,12 +2202,12 @@ Converter::visit(nir_intrinsic_instr *insn) if (op == nir_intrinsic_read_first_invocation) { mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY; - mkOp1(OP_BREV, TYPE_U32, tmp, tmp); + mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT; } else tmp = getSrc(&insn->src[1], 0); - for (uint8_t i = 0; i < dest_components; ++i) { + for (uint8_t i = 0; i < 
insn->num_components; ++i) { mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f)) ->subOp = NV50_IR_SUBOP_SHFL_IDX; } @@ -1949,10 +2223,10 @@ Converter::visit(nir_intrinsic_instr *insn) Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS), mkImm(baseVertex), indirectVertex); - for (uint8_t i = 0u; i < dest_components; ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { uint32_t address = getSlotAddress(insn, idx, i); loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0, - indirectOffset, vtxBase, info_out->in[idx].patch); + indirectOffset, vtxBase, info->in[idx].patch); } break; } @@ -1972,24 +2246,19 @@ Converter::visit(nir_intrinsic_instr *insn) vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase); - for (uint8_t i = 0u; i < dest_components; ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { uint32_t address = getSlotAddress(insn, idx, i); loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0, - indirectOffset, vtxBase, info_out->in[idx].patch); + indirectOffset, vtxBase, info->in[idx].patch); } break; } - case nir_intrinsic_emit_vertex: { - if (info_out->io.genUserClip > 0) + case nir_intrinsic_emit_vertex: + if (info->io.genUserClip > 0) handleUserClipPlanes(); - uint32_t idx = nir_intrinsic_stream_id(insn); - mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; - break; - } + // fallthrough case nir_intrinsic_end_primitive: { uint32_t idx = nir_intrinsic_stream_id(insn); - if (idx) - break; mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; break; } @@ -2001,13 +2270,13 @@ Converter::visit(nir_intrinsic_instr *insn) uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1; uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - for (uint8_t i = 0u; i < dest_components; ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i, indirectOffset, indirectIndex); } break; } - case nir_intrinsic_get_ssbo_size: { + case nir_intrinsic_get_buffer_size: { LValues &newDefs = convert(&insn->dest); const DataType dType = getDType(insn); Value *indirectBuffer; @@ -2024,7 +2293,7 @@ Converter::visit(nir_intrinsic_instr *insn) uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer); uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset); - for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { if (!((1u << i) & nir_intrinsic_write_mask(insn))) continue; Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType, @@ -2032,7 +2301,7 @@ Converter::visit(nir_intrinsic_instr *insn) mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i)) ->setIndirect(0, 1, indirectBuffer); } - info_out->io.globalAccess |= 0x2; + info->io.globalAccess |= 0x2; break; } case nir_intrinsic_load_ssbo: { @@ -2043,15 +2312,14 @@ Converter::visit(nir_intrinsic_instr *insn) uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer); uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - for (uint8_t i = 0u; i < dest_components; ++i) + for (uint8_t i = 0u; i < insn->num_components; ++i) loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i, indirectOffset, indirectBuffer); - info_out->io.globalAccess |= 0x1; + info->io.globalAccess |= 0x1; break; } case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_fadd: case nir_intrinsic_shared_atomic_and: case 
nir_intrinsic_shared_atomic_comp_swap: case nir_intrinsic_shared_atomic_exchange: @@ -2074,7 +2342,6 @@ Converter::visit(nir_intrinsic_instr *insn) break; } case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_fadd: case nir_intrinsic_ssbo_atomic_and: case nir_intrinsic_ssbo_atomic_comp_swap: case nir_intrinsic_ssbo_atomic_exchange: @@ -2100,72 +2367,23 @@ Converter::visit(nir_intrinsic_instr *insn) atom->setIndirect(0, 1, indirectBuffer); atom->subOp = getSubOp(op); - info_out->io.globalAccess |= 0x2; - break; - } - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_fadd: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_comp_swap: - case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_xor: { - const DataType dType = getDType(insn); - LValues &newDefs = convert(&insn->dest); - Value *address; - uint32_t offset = getIndirect(&insn->src[0], 0, address); - - Symbol *sym = mkSymbol(FILE_MEMORY_GLOBAL, 0, dType, offset); - Instruction *atom = - mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0)); - if (op == nir_intrinsic_global_atomic_comp_swap) - atom->setSrc(2, getSrc(&insn->src[2], 0)); - atom->setIndirect(0, 0, address); - atom->subOp = getSubOp(op); - - info_out->io.globalAccess |= 0x2; + info->io.globalAccess |= 0x2; break; } case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_fadd: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_comp_swap: case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_min: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_bindless_image_atomic_dec_wrap: case nir_intrinsic_bindless_image_load: case nir_intrinsic_bindless_image_samples: case nir_intrinsic_bindless_image_size: - case nir_intrinsic_bindless_image_store: - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_fadd: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_inc_wrap: - case nir_intrinsic_image_atomic_dec_wrap: - case nir_intrinsic_image_load: - case nir_intrinsic_image_samples: - case nir_intrinsic_image_size: - case nir_intrinsic_image_store: { + case nir_intrinsic_bindless_image_store: { std::vector<Value*> srcs, defs; - Value *indirect; + Value *indirect = getSrc(&insn->src[0], 0); DataType ty; uint32_t mask = 0; @@ -2182,84 +2400,40 @@ Converter::visit(nir_intrinsic_instr *insn) } } - int lod_src = -1; - bool bindless = false; switch (op) { case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_fadd: case 
nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_comp_swap: case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_min: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_bindless_image_atomic_dec_wrap: ty = getDType(insn); - bindless = true; - info_out->io.globalAccess |= 0x2; - mask = 0x1; - break; - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_fadd: - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_image_atomic_imax: - case nir_intrinsic_image_atomic_umax: - case nir_intrinsic_image_atomic_imin: - case nir_intrinsic_image_atomic_umin: - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_image_atomic_inc_wrap: - case nir_intrinsic_image_atomic_dec_wrap: - ty = getDType(insn); - bindless = false; - info_out->io.globalAccess |= 0x2; mask = 0x1; + info->io.globalAccess |= 0x2; break; case nir_intrinsic_bindless_image_load: - case nir_intrinsic_image_load: ty = TYPE_U32; - bindless = op == nir_intrinsic_bindless_image_load; - info_out->io.globalAccess |= 0x1; - lod_src = 4; + info->io.globalAccess |= 0x1; break; case nir_intrinsic_bindless_image_store: - case nir_intrinsic_image_store: ty = TYPE_U32; - bindless = op == nir_intrinsic_bindless_image_store; - info_out->io.globalAccess |= 0x2; - lod_src = 5; mask = 0xf; + info->io.globalAccess |= 0x2; break; case nir_intrinsic_bindless_image_samples: mask = 0x8; - FALLTHROUGH; - case nir_intrinsic_image_samples: ty = TYPE_U32; - bindless = op == nir_intrinsic_bindless_image_samples; - mask = 0x8; break; case nir_intrinsic_bindless_image_size: - case nir_intrinsic_image_size: - assert(nir_src_as_uint(insn->src[1]) == 0); ty = TYPE_U32; - bindless = op == nir_intrinsic_bindless_image_size; break; default: unreachable("unhandled image opcode"); break; } - if (bindless) - indirect = getSrc(&insn->src[0], 0); - else - location = getIndirect(&insn->src[0], 0, indirect); - // coords if (opInfo.num_srcs >= 2) for (unsigned int i = 0u; i < argCount; ++i) @@ -2269,21 +2443,22 @@ Converter::visit(nir_intrinsic_instr *insn) if (opInfo.num_srcs >= 3 && target.isMS()) srcs.push_back(getSrc(&insn->src[2], 0)); - if (opInfo.num_srcs >= 4 && lod_src != 4) { + if (opInfo.num_srcs >= 4) { unsigned components = opInfo.src_components[3] ? 
opInfo.src_components[3] : insn->num_components; for (uint8_t i = 0u; i < components; ++i) srcs.push_back(getSrc(&insn->src[3], i)); } - if (opInfo.num_srcs >= 5 && lod_src != 5) + if (opInfo.num_srcs >= 5) // 1 for aotmic swap for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i) srcs.push_back(getSrc(&insn->src[4], i)); TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs); - texi->tex.bindless = bindless; - texi->tex.format = nv50_ir::TexInstruction::translateImgFormat(nir_intrinsic_format(insn)); + texi->tex.bindless = false; + texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(nir_intrinsic_format(insn))]; texi->tex.mask = mask; + texi->tex.bindless = true; texi->cache = convert(nir_intrinsic_access(insn)); texi->setType(ty); texi->subOp = getSubOp(op); @@ -2293,40 +2468,134 @@ Converter::visit(nir_intrinsic_instr *insn) break; } - case nir_intrinsic_store_scratch: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_store: { + const nir_variable *tex; + std::vector<Value*> srcs, defs; + Value *indirect; + DataType ty; + + uint32_t mask = 0; + nir_deref_instr *deref = nir_src_as_deref(insn->src[0]); + const glsl_type *type = deref->type; + TexInstruction::Target target = + convert((glsl_sampler_dim)type->sampler_dimensionality, + type->sampler_array, type->sampler_shadow); + unsigned int argCount = getNIRArgCount(target); + uint16_t location = handleDeref(deref, indirect, tex); + + if (opInfo.has_dest) { + LValues &newDefs = convert(&insn->dest); + for (uint8_t i = 0u; i < newDefs.size(); ++i) { + defs.push_back(newDefs[i]); + mask |= 1 << i; + } + } + + switch (op) { + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + ty = getDType(insn); + mask = 0x1; + info->io.globalAccess |= 0x2; + break; + case nir_intrinsic_image_deref_load: + ty = TYPE_U32; + info->io.globalAccess |= 0x1; + break; + case nir_intrinsic_image_deref_store: + ty = TYPE_U32; + mask = 0xf; + info->io.globalAccess |= 0x2; + break; + case nir_intrinsic_image_deref_samples: + mask = 0x8; + ty = TYPE_U32; + break; + case nir_intrinsic_image_deref_size: + ty = TYPE_U32; + break; + default: + unreachable("unhandled image opcode"); + break; + } + + // coords + if (opInfo.num_srcs >= 2) + for (unsigned int i = 0u; i < argCount; ++i) + srcs.push_back(getSrc(&insn->src[1], i)); + + // the sampler is just another src added after coords + if (opInfo.num_srcs >= 3 && target.isMS()) + srcs.push_back(getSrc(&insn->src[2], 0)); + + if (opInfo.num_srcs >= 4) { + unsigned components = opInfo.src_components[3] ? 
opInfo.src_components[3] : insn->num_components; + for (uint8_t i = 0u; i < components; ++i) + srcs.push_back(getSrc(&insn->src[3], i)); + } + + if (opInfo.num_srcs >= 5) + // 1 for aotmic swap + for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i) + srcs.push_back(getSrc(&insn->src[4], i)); + + TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs); + texi->tex.bindless = false; + texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(tex->data.image.format)]; + texi->tex.mask = mask; + texi->cache = getCacheModeFromVar(tex); + texi->setType(ty); + texi->subOp = getSubOp(op); + + if (indirect) + texi->setIndirectR(indirect); + + break; + } case nir_intrinsic_store_shared: { DataType sType = getSType(insn->src[0], false, false); Value *indirectOffset; uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); - if (indirectOffset) - indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset); - for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { + for (uint8_t i = 0u; i < insn->num_components; ++i) { if (!((1u << i) & nir_intrinsic_write_mask(insn))) continue; - Symbol *sym = mkSymbol(getFile(op), 0, sType, offset + i * typeSizeof(sType)); + Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType)); mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i)); } break; } - case nir_intrinsic_load_kernel_input: - case nir_intrinsic_load_scratch: case nir_intrinsic_load_shared: { const DataType dType = getDType(insn); LValues &newDefs = convert(&insn->dest); Value *indirectOffset; uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset); - if (indirectOffset) - indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset); - for (uint8_t i = 0u; i < dest_components; ++i) - loadFrom(getFile(op), 0, dType, newDefs[i], offset, i, indirectOffset); + for (uint8_t i = 0u; i < insn->num_components; ++i) + loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset); break; } - case nir_intrinsic_control_barrier: { + case nir_intrinsic_barrier: { // TODO: add flag to shader_info - info_out->numBarriers = 1; + info->numBarriers = 1; Instruction *bar = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0)); bar->fixed = 1; bar->subOp = NV50_IR_SUBOP_BAR_SYNC; @@ -2334,6 +2603,7 @@ Converter::visit(nir_intrinsic_instr *insn) } case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: { @@ -2342,8 +2612,6 @@ Converter::visit(nir_intrinsic_instr *insn) bar->subOp = getSubOp(op); break; } - case nir_intrinsic_memory_barrier_tcs_patch: - break; case nir_intrinsic_shader_clock: { const DataType dType = getDType(insn); LValues &newDefs = convert(&insn->dest); @@ -2352,23 +2620,22 @@ Converter::visit(nir_intrinsic_instr *insn) mkOp1(OP_RDSV, dType, newDefs[1], mkSysVal(SV_CLOCK, 0))->fixed = 1; break; } - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: { + case nir_intrinsic_load_global: { const DataType dType = getDType(insn); LValues &newDefs = convert(&insn->dest); Value *indirectOffset; uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset); - for (auto i = 0u; i < dest_components; ++i) + for (auto i = 0u; i < insn->num_components; ++i) loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, 
indirectOffset); - info_out->io.globalAccess |= 0x1; + info->io.globalAccess |= 0x1; break; } case nir_intrinsic_store_global: { DataType sType = getSType(insn->src[0], false, false); - for (auto i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { + for (auto i = 0u; i < insn->num_components; ++i) { if (!((1u << i) & nir_intrinsic_write_mask(insn))) continue; if (typeSizeof(sType) == 8) { @@ -2386,7 +2653,7 @@ Converter::visit(nir_intrinsic_instr *insn) } } - info_out->io.globalAccess |= 0x2; + info->io.globalAccess |= 0x2; break; } default: @@ -2410,6 +2677,7 @@ Converter::visit(nir_jump_instr *insn) case nir_jump_continue: { bool isBreak = insn->type == nir_jump_break; nir_block *block = insn->instr.block; + assert(!block->successors[1]); BasicBlock *target = convert(block->successors[0]); mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL); bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK); @@ -2529,6 +2797,8 @@ Converter::visit(nir_alu_instr *insn) case nir_op_ushr: case nir_op_fsin: case nir_op_fsqrt: + case nir_op_fsub: + case nir_op_isub: case nir_op_ftrunc: case nir_op_ishl: case nir_op_ixor: { @@ -2600,7 +2870,7 @@ Converter::visit(nir_alu_instr *insn) case nir_op_flt32: case nir_op_ilt32: case nir_op_ult32: - case nir_op_fneu32: + case nir_op_fne32: case nir_op_ine32: { DEFAULT_CHECKS; LValues &newDefs = convert(&insn->dest); @@ -2616,12 +2886,58 @@ Converter::visit(nir_alu_instr *insn) i->sType = sTypes[0]; break; } + // those are weird ALU ops and need special handling, because + // 1. they are always componend based + // 2. they basically just merge multiple values into one data type case nir_op_mov: + if (!insn->dest.dest.is_ssa && insn->dest.dest.reg.reg->num_array_elems) { + nir_reg_dest& reg = insn->dest.dest.reg; + uint32_t goffset = regToLmemOffset[reg.reg->index]; + uint8_t comps = reg.reg->num_components; + uint8_t size = reg.reg->bit_size / 8; + uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size; + uint32_t aoffset = csize * reg.base_offset; + Value *indirect = NULL; + + if (reg.indirect) + indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS), + getSrc(reg.indirect, 0), mkImm(csize)); + + for (uint8_t i = 0u; i < comps; ++i) { + if (!((1u << i) & insn->dest.write_mask)) + continue; + + Symbol *sym = mkSymbol(FILE_MEMORY_LOCAL, 0, dType, goffset + aoffset + i * size); + mkStore(OP_STORE, dType, sym, indirect, getSrc(&insn->src[0], i)); + } + break; + } else if (!insn->src[0].src.is_ssa && insn->src[0].src.reg.reg->num_array_elems) { + LValues &newDefs = convert(&insn->dest); + nir_reg_src& reg = insn->src[0].src.reg; + uint32_t goffset = regToLmemOffset[reg.reg->index]; + // uint8_t comps = reg.reg->num_components; + uint8_t size = reg.reg->bit_size / 8; + uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size; + uint32_t aoffset = csize * reg.base_offset; + Value *indirect = NULL; + + if (reg.indirect) + indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS), getSrc(reg.indirect, 0), mkImm(csize)); + + for (uint8_t i = 0u; i < newDefs.size(); ++i) + loadFrom(FILE_MEMORY_LOCAL, 0, dType, newDefs[i], goffset + aoffset, i, indirect); + + break; + } else { + LValues &newDefs = convert(&insn->dest); + for (LValues::size_type c = 0u; c < newDefs.size(); ++c) { + mkMov(newDefs[c], getSrc(&insn->src[0], c), dType); + } + } + break; case nir_op_vec2: case nir_op_vec3: - case nir_op_vec4: - case nir_op_vec8: - case nir_op_vec16: { + case nir_op_vec4: { LValues &newDefs = 
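// Worked numbers for the array-register spilling handled in nir_op_mov above
// (a sketch, assuming a NIR register with num_array_elems == 4 and
// bit_size == 32): visit(nir_function) reserves 4 * 4 * (32 / 8) == 64 bytes
// of tlsSpace for it, each element is addressed with csize == 4 * (32 / 8) ==
// 16 bytes (4 components are always assumed, see the MemoryOpt TODO), and an
// indirect access at index i goes to FILE_MEMORY_LOCAL at
// regToLmemOffset + i * 16, plus 4 bytes per written/read component.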
convert(&insn->dest); for (LValues::size_type c = 0u; c < newDefs.size(); ++c) { mkMov(newDefs[c], getSrc(&insn->src[c]), dType); @@ -2717,7 +3033,7 @@ Converter::visit(nir_alu_instr *insn) case nir_op_bfm: { DEFAULT_CHECKS; LValues &newDefs = convert(&insn->dest); - mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W; + mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1])); break; } case nir_op_bitfield_insert: { @@ -2737,69 +3053,17 @@ Converter::visit(nir_alu_instr *insn) case nir_op_bitfield_reverse: { DEFAULT_CHECKS; LValues &newDefs = convert(&insn->dest); - mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0])); + mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; break; } case nir_op_find_lsb: { DEFAULT_CHECKS; LValues &newDefs = convert(&insn->dest); Value *tmp = getSSA(); - mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0])); + mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT; break; } - case nir_op_extract_u8: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - Value *prmt = getSSA(); - mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440)); - mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); - break; - } - case nir_op_extract_i8: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - Value *prmt = getSSA(); - mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880)); - mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); - break; - } - case nir_op_extract_u16: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - Value *prmt = getSSA(); - mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410)); - mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); - break; - } - case nir_op_extract_i16: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - Value *prmt = getSSA(); - mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910)); - mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0)); - break; - } - case nir_op_urol: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), - getSrc(&insn->src[1]), getSrc(&insn->src[0])) - ->subOp = NV50_IR_SUBOP_SHF_L | - NV50_IR_SUBOP_SHF_W | - NV50_IR_SUBOP_SHF_HI; - break; - } - case nir_op_uror: { - DEFAULT_CHECKS; - LValues &newDefs = convert(&insn->dest); - mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), - getSrc(&insn->src[1]), getSrc(&insn->src[0])) - ->subOp = NV50_IR_SUBOP_SHF_R | - NV50_IR_SUBOP_SHF_W | - NV50_IR_SUBOP_SHF_LO; - break; - } // boolean conversions case nir_op_b2f32: { DEFAULT_CHECKS; @@ -2845,7 +3109,6 @@ Converter::visit(nir_alu_instr *insn) } default: ERROR("unknown nir_op %s\n", info.name); - assert(false); return false; } @@ -2917,6 +3180,14 @@ Converter::convert(glsl_sampler_dim dim, bool isArray, bool isShadow) } #undef CASE_SAMPLER +Value* +Converter::applyProjection(Value *src, Value *proj) +{ + if (!proj) + return src; + return mkOp2v(OP_MUL, TYPE_F32, getScratch(), src, proj); +} + unsigned int 
Converter::getNIRArgCount(TexInstruction::Target& target) { @@ -2928,14 +3199,70 @@ Converter::getNIRArgCount(TexInstruction::Target& target) return result; } +uint16_t +Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_variable * &tex) +{ + typedef std::pair<uint32_t,Value*> DerefPair; + std::list<DerefPair> derefs; + + uint16_t result = 0; + while (deref->deref_type != nir_deref_type_var) { + switch (deref->deref_type) { + case nir_deref_type_array: { + Value *indirect; + uint8_t size = type_size(deref->type, true); + result += size * getIndirect(&deref->arr.index, 0, indirect); + + if (indirect) { + derefs.push_front(std::make_pair(size, indirect)); + } + + break; + } + case nir_deref_type_struct: { + result += nir_deref_instr_parent(deref)->type->struct_location_offset(deref->strct.index); + break; + } + case nir_deref_type_var: + default: + unreachable("nir_deref_type_var reached in handleDeref!"); + break; + } + deref = nir_deref_instr_parent(deref); + } + + indirect = NULL; + for (std::list<DerefPair>::const_iterator it = derefs.begin(); it != derefs.end(); ++it) { + Value *offset = mkOp2v(OP_MUL, TYPE_U32, getSSA(), loadImm(getSSA(), it->first), it->second); + if (indirect) + indirect = mkOp2v(OP_ADD, TYPE_U32, getSSA(), indirect, offset); + else + indirect = offset; + } + + tex = nir_deref_instr_get_variable(deref); + assert(tex); + + return result + tex->data.driver_location; +} + CacheMode Converter::convert(enum gl_access_qualifier access) { - if (access & ACCESS_VOLATILE) + switch (access) { + case ACCESS_VOLATILE: return CACHE_CV; - if (access & ACCESS_COHERENT) + case ACCESS_COHERENT: return CACHE_CG; - return CACHE_CA; + default: + return CACHE_CA; + } +} + +CacheMode +Converter::getCacheModeFromVar(const nir_variable *var) +{ + return convert(var->data.image.access); } bool @@ -2959,6 +3286,7 @@ Converter::visit(nir_tex_instr *insn) std::vector<nir_src*> offsets; uint8_t mask = 0; bool lz = false; + Value *proj = NULL; TexInstruction::Target target = convert(insn->sampler_dim, insn->is_array, insn->is_shadow); operation op = getOperation(insn->op); @@ -2971,6 +3299,7 @@ Converter::visit(nir_tex_instr *insn) int msIdx = nir_tex_instr_src_index(insn, nir_tex_src_ms_index); int lodIdx = nir_tex_instr_src_index(insn, nir_tex_src_lod); int offsetIdx = nir_tex_instr_src_index(insn, nir_tex_src_offset); + int projIdx = nir_tex_instr_src_index(insn, nir_tex_src_projector); int sampOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_offset); int texOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_texture_offset); int sampHandleIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_handle); @@ -2979,9 +3308,12 @@ Converter::visit(nir_tex_instr *insn) bool bindless = sampHandleIdx != -1 || texHandleIdx != -1; assert((sampHandleIdx != -1) == (texHandleIdx != -1)); + if (projIdx != -1) + proj = mkOp1v(OP_RCP, TYPE_F32, getScratch(), getSrc(&insn->src[projIdx].src, 0)); + srcs.resize(insn->coord_components); for (uint8_t i = 0u; i < insn->coord_components; ++i) - srcs[i] = getSrc(&insn->src[coordsIdx].src, i); + srcs[i] = applyProjection(getSrc(&insn->src[coordsIdx].src, i), proj); // sometimes we get less args than target.getArgCount, but codegen expects the latter if (insn->coord_components) { @@ -3009,7 +3341,7 @@ Converter::visit(nir_tex_instr *insn) if (offsetIdx != -1) offsets.push_back(&insn->src[offsetIdx].src); if (compIdx != -1) - srcs.push_back(getSrc(&insn->src[compIdx].src, 0)); + 
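// handleDeref() above flattens an image deref chain into a constant slot
// offset plus an optional indirect value. A sketch, assuming
// "uniform image2D imgs[4]" where type_size() reports one slot per element:
// imgs[2] yields driver_location + 2 with no indirect, while imgs[i] yields
// driver_location with indirect == i * 1, which the image_deref_* paths then
// feed to setIndirectR().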
srcs.push_back(applyProjection(getSrc(&insn->src[compIdx].src, 0), proj)); if (texOffIdx != -1) { srcs.push_back(getSrc(&insn->src[texOffIdx].src, 0)); texOffIdx = srcs.size() - 1; @@ -3111,34 +3443,23 @@ Converter::visit(nir_tex_instr *insn) return true; } -/* nouveau's RA doesn't track the liveness of exported registers in the fragment - * shader, so we need all the store_outputs to appear at the end of the shader - * with no other instructions that might generate a temp value in between them. - */ -static void -nv_nir_move_stores_to_end(nir_shader *s) +bool +Converter::visit(nir_deref_instr *deref) { - nir_function_impl *impl = nir_shader_get_entrypoint(s); - nir_block *block = nir_impl_last_block(impl); - nir_instr *first_store = NULL; - - nir_foreach_instr_safe(instr, block) { - if (instr == first_store) - break; - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_store_output) { - nir_instr_remove(instr); - nir_instr_insert(nir_after_block(block), instr); - - if (!first_store) - first_store = instr; - } + // we just ignore those, because images intrinsics are the only place where + // we should end up with deref sources and those have to backtrack anyway + // to get the nir_variable. This code just exists to handle some special + // cases. + switch (deref->deref_type) { + case nir_deref_type_array: + case nir_deref_type_struct: + case nir_deref_type_var: + break; + default: + ERROR("unknown nir_deref_instr %u\n", deref->deref_type); + return false; } - nir_metadata_preserve(impl, - nir_metadata_block_index | - nir_metadata_dominance); + return true; } bool @@ -3149,44 +3470,18 @@ Converter::run() if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) nir_print_shader(nir, stderr); - struct nir_lower_subgroups_options subgroup_options = {}; - subgroup_options.subgroup_size = 32; - subgroup_options.ballot_bit_size = 32; - subgroup_options.ballot_components = 1; - subgroup_options.lower_elect = true; - - /* prepare for IO lowering */ - NIR_PASS_V(nir, nir_opt_deref); - NIR_PASS_V(nir, nir_lower_regs_to_ssa); - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - /* codegen assumes vec4 alignment for memory */ - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, function_temp_type_info); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL); - - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, - type_size, (nir_lower_io_options)0); + struct nir_lower_subgroups_options subgroup_options = { + .subgroup_size = 32, + .ballot_bit_size = 32, + }; + NIR_PASS_V(nir, nir_lower_io, nir_var_all, type_size, (nir_lower_io_options)0); NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options); - - struct nir_lower_tex_options tex_options = {}; - tex_options.lower_txp = ~0; - - NIR_PASS_V(nir, nir_lower_tex, &tex_options); - + NIR_PASS_V(nir, nir_lower_regs_to_ssa); NIR_PASS_V(nir, nir_lower_load_const_to_scalar); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar, false); - - /*TODO: improve this lowering/optimisation loop so that we can use - * nir_opt_idiv_const effectively before this. 
- */ - nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = false, - .allow_fp16 = true, - }; - NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + NIR_PASS_V(nir, nir_lower_phis_to_scalar); do { progress = false; @@ -3199,21 +3494,11 @@ Converter::run() NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_lower_64bit_phis); } while (progress); - nir_move_options move_options = - (nir_move_options)(nir_move_const_undef | - nir_move_load_ubo | - nir_move_load_uniform | - nir_move_load_input); - NIR_PASS_V(nir, nir_opt_sink, move_options); - NIR_PASS_V(nir, nir_opt_move, move_options); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS_V(nir, nv_nir_move_stores_to_end); - NIR_PASS_V(nir, nir_lower_bool_to_int32); + NIR_PASS_V(nir, nir_lower_locals_to_regs); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); NIR_PASS_V(nir, nir_convert_from_ssa, true); // Garbage collect dead instructions @@ -3245,150 +3530,17 @@ Converter::run() namespace nv50_ir { bool -Program::makeFromNIR(struct nv50_ir_prog_info *info, - struct nv50_ir_prog_info_out *info_out) +Program::makeFromNIR(struct nv50_ir_prog_info *info) { nir_shader *nir = (nir_shader*)info->bin.source; - Converter converter(this, nir, info, info_out); + Converter converter(this, nir, info); bool result = converter.run(); if (!result) return result; LoweringHelper lowering; lowering.run(this); - tlsSize = info_out->bin.tlsSpace; + tlsSize = info->bin.tlsSpace; return result; } } // namespace nv50_ir - -static nir_shader_compiler_options -nvir_nir_shader_compiler_options(int chipset) -{ - nir_shader_compiler_options op = {}; - op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET); - op.lower_ffma16 = false; - op.lower_ffma32 = false; - op.lower_ffma64 = false; - op.fuse_ffma16 = false; /* nir doesn't track mad vs fma */ - op.fuse_ffma32 = false; /* nir doesn't track mad vs fma */ - op.fuse_ffma64 = false; /* nir doesn't track mad vs fma */ - op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET); - op.lower_flrp32 = true; - op.lower_flrp64 = true; - op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it - op.lower_fsat = false; - op.lower_fsqrt = false; // TODO: only before gm200 - op.lower_sincos = false; - op.lower_fmod = true; - op.lower_bitfield_extract = false; - op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET); - op.lower_bitfield_insert = false; - op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET); - op.lower_bitfield_insert_to_bitfield_select = false; - op.lower_bitfield_reverse = false; - op.lower_bit_count = false; - op.lower_ifind_msb = false; - op.lower_find_lsb = false; - op.lower_uadd_carry = true; // TODO - op.lower_usub_borrow = true; // TODO - op.lower_mul_high = false; - op.lower_fneg = false; - op.lower_ineg = false; - op.lower_scmp = true; // TODO: not implemented yet - op.lower_vector_cmp = false; - op.lower_bitops = false; - op.lower_isign = (chipset >= NVISA_GV100_CHIPSET); - op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET); - op.lower_fdph = false; - op.lower_fdot = false; - op.fdot_replicates = false; // TODO - op.lower_ffloor = false; // TODO - op.lower_ffract = true; - op.lower_fceil = false; // TODO - op.lower_ftrunc = false; - op.lower_ldexp = true; - op.lower_pack_half_2x16 = true; - op.lower_pack_unorm_2x16 = true; - 
op.lower_pack_snorm_2x16 = true; - op.lower_pack_unorm_4x8 = true; - op.lower_pack_snorm_4x8 = true; - op.lower_unpack_half_2x16 = true; - op.lower_unpack_unorm_2x16 = true; - op.lower_unpack_snorm_2x16 = true; - op.lower_unpack_unorm_4x8 = true; - op.lower_unpack_snorm_4x8 = true; - op.lower_pack_split = false; - op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET); - op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET); - op.lower_insert_byte = true; - op.lower_insert_word = true; - op.lower_all_io_to_temps = false; - op.lower_all_io_to_elements = false; - op.vertex_id_zero_based = false; - op.lower_base_vertex = false; - op.lower_helper_invocation = false; - op.optimize_sample_mask_in = false; - op.lower_cs_local_index_to_id = true; - op.lower_cs_local_id_to_index = false; - op.lower_device_index_to_zero = false; // TODO - op.lower_wpos_pntc = false; // TODO - op.lower_hadd = true; // TODO - op.lower_uadd_sat = true; // TODO - op.lower_iadd_sat = true; // TODO - op.vectorize_io = false; - op.lower_to_scalar = false; - op.unify_interfaces = false; - op.use_interpolated_input_intrinsics = true; - op.lower_mul_2x32_64 = true; // TODO - op.lower_rotate = (chipset < NVISA_GV100_CHIPSET); - op.has_imul24 = false; - op.intel_vec4 = false; - op.max_unroll_iterations = 32; - op.lower_int64_options = (nir_lower_int64_options) ( - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) | - nir_lower_divmod64 | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) | - ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) | - nir_lower_ufind_msb64 - ); - op.lower_doubles_options = (nir_lower_doubles_options) ( - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) | - nir_lower_dmod | - ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) | - ((chipset >= NVISA_GV100_CHIPSET) ? 
nir_lower_ddiv : 0) - ); - return op; -} - -static const nir_shader_compiler_options g80_nir_shader_compiler_options = -nvir_nir_shader_compiler_options(NVISA_G80_CHIPSET); -static const nir_shader_compiler_options gf100_nir_shader_compiler_options = -nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET); -static const nir_shader_compiler_options gm107_nir_shader_compiler_options = -nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET); -static const nir_shader_compiler_options gv100_nir_shader_compiler_options = -nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET); - -const nir_shader_compiler_options * -nv50_ir_nir_shader_compiler_options(int chipset) -{ - if (chipset >= NVISA_GV100_CHIPSET) - return &gv100_nir_shader_compiler_options; - if (chipset >= NVISA_GM107_CHIPSET) - return &gm107_nir_shader_compiler_options; - if (chipset >= NVISA_GF100_CHIPSET) - return &gf100_nir_shader_compiler_options; - return &g80_nir_shader_compiler_options; -} diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c index ea9d7be0d..47f6c1e53 100644 --- a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c +++ b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c @@ -1,6 +1,5 @@ /* * © Copyright 2018 Alyssa Rosenzweig - * Copyright (C) 2019-2020 Collabora, Ltd. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,29 +25,27 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include "pan_bo.h" #include "pan_context.h" -#include "pan_shader.h" -#include "pan_util.h" #include "compiler/nir/nir.h" #include "nir/tgsi_to_nir.h" +#include "midgard/midgard_compile.h" #include "util/u_dynarray.h" -#include "util/u_upload_mgr.h" #include "tgsi/tgsi_dump.h" void -panfrost_shader_compile(struct pipe_screen *pscreen, - struct panfrost_pool *shader_pool, - struct panfrost_pool *desc_pool, - enum pipe_shader_ir ir_type, - const void *ir, - gl_shader_stage stage, - struct panfrost_shader_state *state) +panfrost_shader_compile( + struct panfrost_context *ctx, + struct mali_shader_meta *meta, + enum pipe_shader_ir ir_type, + const void *ir, + gl_shader_stage stage, + struct panfrost_shader_state *state, + uint64_t *outputs_written) { - struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_device *dev = pan_device(pscreen); + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + uint8_t *dst; nir_shader *s; @@ -56,47 +53,117 @@ panfrost_shader_compile(struct pipe_screen *pscreen, s = nir_shader_clone(NULL, ir); } else { assert (ir_type == PIPE_SHADER_IR_TGSI); - s = tgsi_to_nir(ir, pscreen, false); + s = tgsi_to_nir(ir, ctx->base.screen); } - /* Lower this early so the backends don't have to worry about it */ - if (stage == MESA_SHADER_FRAGMENT) - NIR_PASS_V(s, nir_lower_fragcolor, state->nr_cbufs); - s->info.stage = stage; + if (stage == MESA_SHADER_FRAGMENT) { + /* Inject the alpha test now if we need to */ + + if (state->alpha_state.enabled) { + NIR_PASS_V(s, nir_lower_alpha_test, state->alpha_state.func, false); + } + } + /* Call out to Midgard compiler given the above NIR */ - struct panfrost_compile_inputs inputs = { - .gpu_id = dev->gpu_id, - .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE), + + midgard_program program = { + .alpha_ref = state->alpha_state.ref_value }; - memcpy(inputs.rt_formats, state->rt_formats, sizeof(inputs.rt_formats)); + midgard_compile_shader_nir(&ctx->compiler, s, &program, 
false); + + /* Prepare the compiled binary for upload */ + int size = program.compiled.size; + dst = program.compiled.data; + + /* Upload the shader. The lookahead tag is ORed on as a tagged pointer. + * I bet someone just thought that would be a cute pun. At least, + * that's how I'd do it. */ + + state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE); + memcpy(state->bo->cpu, dst, size); + meta->shader = state->bo->gpu | program.first_tag; + + util_dynarray_fini(&program.compiled); + + /* Sysvals are prepended */ + program.uniform_count += program.sysval_count; + state->sysval_count = program.sysval_count; + memcpy(state->sysval, program.sysvals, sizeof(state->sysval[0]) * state->sysval_count); + + meta->midgard1.uniform_count = MIN2(program.uniform_count, program.uniform_cutoff); + meta->midgard1.work_count = program.work_register_count; + + switch (stage) { + case MESA_SHADER_VERTEX: + meta->attribute_count = util_bitcount64(s->info.inputs_read); + meta->varying_count = util_bitcount64(s->info.outputs_written); + break; + case MESA_SHADER_FRAGMENT: + meta->attribute_count = 0; + meta->varying_count = util_bitcount64(s->info.inputs_read); + break; + case MESA_SHADER_COMPUTE: + /* TODO: images */ + meta->attribute_count = 0; + meta->varying_count = 0; + break; + default: + unreachable("Unknown shader state"); + } - struct util_dynarray binary; + state->can_discard = s->info.fs.uses_discard; + state->writes_point_size = program.writes_point_size; + state->reads_point_coord = false; + state->helper_invocations = s->info.fs.needs_helper_invocations; - util_dynarray_init(&binary, NULL); - screen->vtbl.compile_shader(s, &inputs, &binary, &state->info); + if (outputs_written) + *outputs_written = s->info.outputs_written; - if (binary.size) { - state->bin = panfrost_pool_take_ref(shader_pool, - pan_pool_upload_aligned(&shader_pool->base, - binary.data, binary.size, 128)); - } + /* Separate as primary uniform count is truncated */ + state->uniform_count = program.uniform_count; + meta->midgard1.unknown2 = 8; /* XXX */ - /* Don't upload RSD for fragment shaders since they need draw-time - * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler - * shader program descriptors on Valhall, which can be preuploaded even - * for fragment shaders. 
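The upload path above ORs program.first_tag into the shader's GPU address ("meta->shader = state->bo->gpu | program.first_tag"), the tagged-pointer trick the comment mentions. Below is a small standalone sketch of that idea, assuming the allocation is aligned enough that its low bits are free; the mask width and helper names are examples only, not Panfrost's actual encoding.

/* Illustrative tagged pointer: an aligned address has zero low bits, so a
 * small tag can be ORed in and masked back out later. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_MASK 0xFULL                       /* low 4 bits carry the tag (example) */

static uint64_t make_tagged(uint64_t gpu_va, uint64_t tag)
{
        assert((gpu_va & TAG_MASK) == 0);     /* allocation must be aligned */
        assert(tag <= TAG_MASK);
        return gpu_va | tag;
}

int main(void)
{
        uint64_t va = 0x140000ULL;            /* pretend BO address */
        uint64_t tagged = make_tagged(va, 0x5);
        printf("address %#llx tag %llu\n",
               (unsigned long long)(tagged & ~TAG_MASK),
               (unsigned long long)(tagged & TAG_MASK));
        return 0;
}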
*/ - bool upload = !(stage == MESA_SHADER_FRAGMENT && dev->arch <= 7); - screen->vtbl.prepare_shader(state, desc_pool, upload); + unsigned default_vec1_swizzle = panfrost_get_default_swizzle(1); + unsigned default_vec2_swizzle = panfrost_get_default_swizzle(2); + unsigned default_vec4_swizzle = panfrost_get_default_swizzle(4); - panfrost_analyze_sysvals(state); + /* Iterate the varyings and emit the corresponding descriptor */ + for (unsigned i = 0; i < meta->varying_count; ++i) { + unsigned location = program.varyings[i]; - util_dynarray_fini(&binary); + /* Default to a vec4 varying */ + struct mali_attr_meta v = { + .format = MALI_RGBA32F, + .swizzle = default_vec4_swizzle, + .unknown1 = 0x2, + }; - /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against - * a NULL context */ - ralloc_free(s); + /* Check for special cases, otherwise assume general varying */ + + if (location == VARYING_SLOT_POS) { + v.format = MALI_VARYING_POS; + } else if (location == VARYING_SLOT_PSIZ) { + v.format = MALI_R16F; + v.swizzle = default_vec1_swizzle; + + state->writes_point_size = true; + } else if (location == VARYING_SLOT_PNTC) { + v.format = MALI_RG16F; + v.swizzle = default_vec2_swizzle; + + state->reads_point_coord = true; + } else if (location == VARYING_SLOT_FACE) { + v.format = MALI_R32I; + v.swizzle = default_vec1_swizzle; + + state->reads_face = true; + } + + state->varyings[i] = v; + state->varyings_loc[i] = location; + } } diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c index 844016ee9..d0b2e1322 100644 --- a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c +++ b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c @@ -1,6 +1,5 @@ /* * Copyright (C) 2019 Collabora, Ltd. - * Copyright (C) 2019 Red Hat Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,10 +26,7 @@ */ #include "pan_context.h" -#include "pan_bo.h" -#include "pan_shader.h" #include "util/u_memory.h" -#include "nir_serialize.h" /* Compute CSOs are tracked like graphics shader CSOs, but are * considerably simpler. 
We do not implement multiple @@ -43,38 +39,23 @@ panfrost_create_compute_state( const struct pipe_compute_state *cso) { struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_screen *screen = pan_screen(pctx->screen); struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants); so->cbase = *cso; so->is_compute = true; - struct panfrost_shader_state *v = calloc(1, sizeof(*v)); - so->variants = v; + struct panfrost_shader_state *v = &so->variants[0]; so->variant_count = 1; so->active_variant = 0; - if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) { - struct blob_reader reader; - const struct pipe_binary_program_header *hdr = cso->prog; + v->tripipe = malloc(sizeof(struct mali_shader_meta)); - blob_reader_init(&reader, hdr->blob, hdr->num_bytes); + panfrost_shader_compile(ctx, v->tripipe, + cso->ir_type, cso->prog, + MESA_SHADER_COMPUTE, v, NULL); - const struct nir_shader_compiler_options *options = - screen->vtbl.get_compiler_options(); - so->cbase.prog = nir_deserialize(NULL, options, &reader); - so->cbase.ir_type = PIPE_SHADER_IR_NIR; - } - - panfrost_shader_compile(pctx->screen, &ctx->shaders, &ctx->descs, - so->cbase.ir_type, so->cbase.prog, MESA_SHADER_COMPUTE, - v); - - /* There are no variants so we won't need the NIR again */ - ralloc_free((void *)so->cbase.prog); - so->cbase.prog = NULL; return so; } @@ -83,57 +64,76 @@ static void panfrost_bind_compute_state(struct pipe_context *pipe, void *cso) { struct panfrost_context *ctx = pan_context(pipe); - ctx->shader[PIPE_SHADER_COMPUTE] = cso; + + struct panfrost_shader_variants *variants = + (struct panfrost_shader_variants *) cso; + + ctx->shader[PIPE_SHADER_COMPUTE] = variants; } static void panfrost_delete_compute_state(struct pipe_context *pipe, void *cso) { - struct panfrost_shader_variants *so = - (struct panfrost_shader_variants *)cso; - - free(so->variants); free(cso); } -static void -panfrost_set_compute_resources(struct pipe_context *pctx, - unsigned start, unsigned count, - struct pipe_surface **resources) -{ - /* TODO */ -} +/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we + * construct the COMPUTE job and some of its payload. 
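For context on what launch_grid receives: grid[] gives the number of workgroups per axis and block[] the threads per workgroup per axis, so the total invocation count is their product. The struct below is a simplified, illustration-only stand-in for Gallium's pipe_grid_info.

/* Simplified stand-in for pipe_grid_info, used only to show the arithmetic. */
#include <stdint.h>
#include <stdio.h>

struct grid_info {
        uint32_t grid[3];    /* workgroup counts (x, y, z) */
        uint32_t block[3];   /* workgroup size   (x, y, z) */
};

static uint64_t total_invocations(const struct grid_info *info)
{
        uint64_t wg      = (uint64_t)info->grid[0]  * info->grid[1]  * info->grid[2];
        uint64_t threads = (uint64_t)info->block[0] * info->block[1] * info->block[2];
        return wg * threads;
}

int main(void)
{
        struct grid_info info = { .grid = { 8, 8, 1 }, .block = { 16, 16, 1 } };
        printf("%llu invocations\n", (unsigned long long)total_invocations(&info));
        return 0;
}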
+ */ static void -panfrost_set_global_binding(struct pipe_context *pctx, - unsigned first, unsigned count, - struct pipe_resource **resources, - uint32_t **handles) +panfrost_launch_grid(struct pipe_context *pipe, + const struct pipe_grid_info *info) { - if (!resources) - return; + struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + ctx->compute_grid = info; - for (unsigned i = first; i < first + count; ++i) { - struct panfrost_resource *rsrc = pan_resource(resources[i]); - panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_COMPUTE); + struct mali_job_descriptor_header job = { + .job_type = JOB_TYPE_COMPUTE, + .job_descriptor_size = 1, + .job_barrier = 1 + }; - util_range_add(&rsrc->base, &rsrc->valid_buffer_range, - 0, rsrc->base.width0); + /* TODO: Stub */ + struct midgard_payload_vertex_tiler *payload = &ctx->payloads[PIPE_SHADER_COMPUTE]; - /* The handle points to uint32_t, but space is allocated for 64 bits */ - memcpy(handles[i], &rsrc->image.data.bo->ptr.gpu, sizeof(mali_ptr)); - } -} + panfrost_emit_for_draw(ctx, false); -static void -panfrost_memory_barrier(struct pipe_context *pctx, unsigned flags) -{ - /* TODO: Be smart and only flush the minimum needed, maybe emitting a - * cache flush job if that would help */ - panfrost_flush_all_batches(pan_context(pctx), "Memory barrier"); + /* Compute jobs have a "compute FBD". It's not a real framebuffer + * descriptor - there is no framebuffer - but it takes the place of + * one. As far as I can tell, it's actually the beginning of a + * single-render-target framebuffer descriptor with almost everything + * zeroed out. + */ + struct mali_compute_fbd compute_fbd = { + .unknown1 = { + 0, 0x1F, 0, 0, 0, 0, 0, 0 + } + }; + + payload->postfix.framebuffer = + panfrost_upload_transient(ctx, &compute_fbd, sizeof(compute_fbd)); + + /* Invoke according to the grid info */ + + panfrost_pack_work_groups_compute(&payload->prefix, + info->grid[0], info->grid[1], info->grid[2], + info->block[0], info->block[1], info->block[2]); + + /* Upload the payload */ + + struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload)); + memcpy(transfer.cpu, &job, sizeof(job)); + memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload)); + + /* TODO: Do we want a special compute-only batch? 
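The two memcpy() calls above place the job header and its payload back to back in a single transient allocation before the job is queued. A minimal sketch of that layout follows, with illustrative stand-in structs and a plain malloc() rather than the real Mali job formats and transient allocator.

/* Header immediately followed by payload in one contiguous allocation. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct job_header  { uint32_t job_type, flags; };   /* stand-in, not the Mali format */
struct job_payload { uint32_t grid[3]; };

int main(void)
{
        struct job_header  hdr     = { .job_type = 2, .flags = 1 };
        struct job_payload payload = { .grid = { 8, 8, 1 } };

        size_t size = sizeof(hdr) + sizeof(payload);
        uint8_t *cpu = malloc(size);              /* stands in for the transient buffer */
        if (!cpu)
                return 1;

        memcpy(cpu, &hdr, sizeof(hdr));
        memcpy(cpu + sizeof(hdr), &payload, sizeof(payload));

        printf("packed %zu bytes, payload at offset %zu\n", size, sizeof(hdr));
        free(cpu);
        return 0;
}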
*/ + struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + + /* Queue the job */ + panfrost_scoreboard_queue_compute_job(batch, transfer); + + panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME); } void @@ -143,8 +143,7 @@ panfrost_compute_context_init(struct pipe_context *pctx) pctx->bind_compute_state = panfrost_bind_compute_state; pctx->delete_compute_state = panfrost_delete_compute_state; - pctx->set_compute_resources = panfrost_set_compute_resources; - pctx->set_global_binding = panfrost_set_global_binding; - - pctx->memory_barrier = panfrost_memory_barrier; + pctx->launch_grid = panfrost_launch_grid; } + + diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c index d4637ce72..d2c6378a0 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c @@ -25,502 +25,294 @@ * **************************************************************************/ +#include <stdio.h> + #include "pipe/p_video_codec.h" -#include "radeon_vcn_enc.h" -#include "radeon_video.h" -#include "si_pipe.h" -#include "util/u_video.h" -#include "util/u_memory.h" -#include "radeon_efc.h" -#include <stdio.h> +#include "util/u_video.h" -#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1 -#define RENCODE_FW_INTERFACE_MINOR_VERSION 1 - -#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001 -#define RENCODE_IB_PARAM_TASK_INFO 0x00000002 -#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003 -#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004 -#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005 -#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006 -#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007 -#define RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008 -#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009 -#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a -#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b -#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c -#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d -#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f -#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010 -#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011 -#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012 -#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015 -#define RENCODE_IB_PARAM_EFC_CONFIG 0x0000000e - -#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001 -#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002 -#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003 - -#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001 -#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002 -#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003 -#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004 - -static void radeon_enc_op_balance(struct radeon_encoder *enc) -{ - RADEON_ENC_BEGIN(RENCODE_IB_OP_SET_BALANCE_ENCODING_MODE); - RADEON_ENC_END(); -} +#include "si_pipe.h" +#include "radeon_video.h" +#include "radeon_vcn_enc.h" -static void radeon_enc_slice_header_hevc(struct radeon_encoder *enc) +#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1 +#define RENCODE_FW_INTERFACE_MINOR_VERSION 1 + +#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001 +#define RENCODE_IB_PARAM_TASK_INFO 0x00000002 +#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003 +#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004 +#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005 +#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006 +#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007 +#define 
RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008 +#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009 +#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a +#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b +#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c +#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d +#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f +#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010 +#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011 +#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012 +#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015 + +#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001 +#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002 +#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003 + +#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001 +#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002 +#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003 +#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004 + +static void radeon_enc_quality_params(struct radeon_encoder *enc) { - uint32_t instruction[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0}; - uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0}; - unsigned int inst_index = 0; - unsigned int cdw_start = 0; - unsigned int cdw_filled = 0; - unsigned int bits_copied = 0; - RADEON_ENC_BEGIN(enc->cmd.slice_header); - radeon_enc_reset(enc); - radeon_enc_set_emulation_prevention(enc, false); - - cdw_start = enc->cs.current.cdw; - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.nal_unit_type, 6); - radeon_enc_code_fixed_bits(enc, 0x0, 6); - radeon_enc_code_fixed_bits(enc, 0x1, 3); - - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_FIRST_SLICE; - inst_index++; - - if ((enc->enc_pic.nal_unit_type >= 16) && (enc->enc_pic.nal_unit_type <= 23)) - radeon_enc_code_fixed_bits(enc, 0x0, 1); - - radeon_enc_code_ue(enc, 0x0); - - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_SEGMENT; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_DEPENDENT_SLICE_END; - inst_index++; - - switch (enc->enc_pic.picture_type) { - case PIPE_H2645_ENC_PICTURE_TYPE_I: - case PIPE_H2645_ENC_PICTURE_TYPE_IDR: - radeon_enc_code_ue(enc, 0x2); - break; - case PIPE_H2645_ENC_PICTURE_TYPE_P: - case PIPE_H2645_ENC_PICTURE_TYPE_SKIP: - radeon_enc_code_ue(enc, 0x1); - break; - case PIPE_H2645_ENC_PICTURE_TYPE_B: - radeon_enc_code_ue(enc, 0x0); - break; - default: - radeon_enc_code_ue(enc, 0x1); - } - - if ((enc->enc_pic.nal_unit_type != 19) && (enc->enc_pic.nal_unit_type != 20)) { - radeon_enc_code_fixed_bits(enc, enc->enc_pic.pic_order_cnt, enc->enc_pic.log2_max_poc); - if (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P) - radeon_enc_code_fixed_bits(enc, 0x1, 1); - else { - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, 0x0); - } - } - - if (enc->enc_pic.sample_adaptive_offset_enabled_flag) { - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - 
num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SAO_ENABLE; - inst_index++; - } - - if ((enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P) || - (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_B)) { - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.cabac_init_flag, 1); - radeon_enc_code_ue(enc, 5 - enc->enc_pic.max_num_merge_cand); - } - - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_QP_DELTA; - inst_index++; - - if ((enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled) && - (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled || - enc->enc_pic.sample_adaptive_offset_enabled_flag)) { - if (enc->enc_pic.sample_adaptive_offset_enabled_flag) { - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - - instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_LOOP_FILTER_ACROSS_SLICES_ENABLE; - inst_index++; - } - else - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1); - } - - radeon_enc_flush_headers(enc); - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY; - num_bits[inst_index] = enc->bits_output - bits_copied; - bits_copied = enc->bits_output; - inst_index++; - instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_END; - - cdw_filled = enc->cs.current.cdw - cdw_start; - for (int i = 0; i < RENCODE_SLICE_HEADER_TEMPLATE_MAX_TEMPLATE_SIZE_IN_DWORDS - cdw_filled; i++) - RADEON_ENC_CS(0x00000000); - - for (int j = 0; j < RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS; j++) { - RADEON_ENC_CS(instruction[j]); - RADEON_ENC_CS(num_bits[j]); - } - - RADEON_ENC_END(); + enc->enc_pic.quality_params.vbaq_mode = 0; + enc->enc_pic.quality_params.scene_change_sensitivity = 0; + enc->enc_pic.quality_params.scene_change_min_idr_interval = 0; + enc->enc_pic.quality_params.two_pass_search_center_map_mode = 0; + + RADEON_ENC_BEGIN(enc->cmd.quality_params); + RADEON_ENC_CS(enc->enc_pic.quality_params.vbaq_mode); + RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_sensitivity); + RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_min_idr_interval); + RADEON_ENC_CS(enc->enc_pic.quality_params.two_pass_search_center_map_mode); + RADEON_ENC_END(); } static void radeon_enc_loop_filter_hevc(struct radeon_encoder *enc) { - RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset); - RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset); - RADEON_ENC_CS(!enc->enc_pic.sample_adaptive_offset_enabled_flag); - RADEON_ENC_END(); + RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc); + RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled); + RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled); + 
RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2); + RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2); + RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset); + RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset); + RADEON_ENC_CS(1); + RADEON_ENC_END(); } static void radeon_enc_nalu_sps_hevc(struct radeon_encoder *enc) { - RADEON_ENC_BEGIN(enc->cmd.nalu); - RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS); - uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++]; - int i; - - radeon_enc_reset(enc); - radeon_enc_set_emulation_prevention(enc, false); - radeon_enc_code_fixed_bits(enc, 0x00000001, 32); - radeon_enc_code_fixed_bits(enc, 0x4201, 16); - radeon_enc_byte_align(enc); - radeon_enc_set_emulation_prevention(enc, true); - radeon_enc_code_fixed_bits(enc, 0x0, 4); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3); - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 2); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5); - - if (enc->enc_pic.general_profile_idc == 2) - radeon_enc_code_fixed_bits(enc, 0x20000000, 32); - else - radeon_enc_code_fixed_bits(enc, 0x60000000, 32); - - radeon_enc_code_fixed_bits(enc, 0xb0000000, 32); - radeon_enc_code_fixed_bits(enc, 0x0, 16); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8); - - for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i++) - radeon_enc_code_fixed_bits(enc, 0x0, 2); - - if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) { - for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++) - radeon_enc_code_fixed_bits(enc, 0x0, 2); - } - - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height); - - if ((enc->enc_pic.crop_left != 0) || (enc->enc_pic.crop_right != 0) || - (enc->enc_pic.crop_top != 0) || (enc->enc_pic.crop_bottom != 0)) { - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_ue(enc, enc->enc_pic.crop_left); - radeon_enc_code_ue(enc, enc->enc_pic.crop_right); - radeon_enc_code_ue(enc, enc->enc_pic.crop_top); - radeon_enc_code_ue(enc, enc->enc_pic.crop_bottom); - } else if (enc->enc_pic.session_init.padding_width != 0 || - enc->enc_pic.session_init.padding_height != 0) { - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2); - radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2); - } else - radeon_enc_code_fixed_bits(enc, 0x0, 1); - - radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8); - radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8); - radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_ue(enc, 1); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3); - // Only support CTBSize 64 - radeon_enc_code_ue(enc, - 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3)); - radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2); - 
radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size); - radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter); - radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra); - - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1); - - radeon_enc_code_ue(enc, 1); - radeon_enc_code_ue(enc, 1); - radeon_enc_code_ue(enc, 0); - radeon_enc_code_ue(enc, 0); - radeon_enc_code_fixed_bits(enc, 0x1, 1); - - radeon_enc_code_fixed_bits(enc, 0x0, 1); - - radeon_enc_code_fixed_bits(enc, 0, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1); - - radeon_enc_code_fixed_bits(enc, 0x0, 1); - - radeon_enc_code_fixed_bits(enc, 0x0, 1); - - radeon_enc_code_fixed_bits(enc, 0x1, 1); - - radeon_enc_byte_align(enc); - radeon_enc_flush_headers(enc); - *size_in_bytes = (enc->bits_output + 7) / 8; - RADEON_ENC_END(); + RADEON_ENC_BEGIN(enc->cmd.nalu); + RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS); + uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++]; + int i; + + radeon_enc_reset(enc); + radeon_enc_set_emulation_prevention(enc, false); + radeon_enc_code_fixed_bits(enc, 0x00000001, 32); + radeon_enc_code_fixed_bits(enc, 0x4201, 16); + radeon_enc_byte_align(enc); + radeon_enc_set_emulation_prevention(enc, true); + radeon_enc_code_fixed_bits(enc, 0x0, 4); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3); + radeon_enc_code_fixed_bits(enc, 0x1, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 2); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5); + radeon_enc_code_fixed_bits(enc, 0x60000000, 32); + radeon_enc_code_fixed_bits(enc, 0xb0000000, 32); + radeon_enc_code_fixed_bits(enc, 0x0, 16); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8); + + for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) ; i++) + radeon_enc_code_fixed_bits(enc, 0x0, 2); + + if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) { + for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++) + radeon_enc_code_fixed_bits(enc, 0x0, 2); + } + + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc); + radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width); + radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8); + radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8); + radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_ue(enc, 1); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3); + //Only support CTBSize 64 + radeon_enc_code_ue(enc, 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3)); + radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2); + radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size); + radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter); + 
radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra); + + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1); + + radeon_enc_code_ue(enc, 1); + radeon_enc_code_ue(enc, 1); + radeon_enc_code_ue(enc, 0); + radeon_enc_code_ue(enc, 0); + radeon_enc_code_fixed_bits(enc, 0x1, 1); + + radeon_enc_code_fixed_bits(enc, 0x0, 1); + + radeon_enc_code_fixed_bits(enc, 0, 1); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1); + + radeon_enc_code_fixed_bits(enc, 0x0, 1); + + radeon_enc_code_fixed_bits(enc, 0x0, 1); + + radeon_enc_code_fixed_bits(enc, 0x1, 1); + + radeon_enc_byte_align(enc); + radeon_enc_flush_headers(enc); + *size_in_bytes = (enc->bits_output + 7) / 8; + RADEON_ENC_END(); } static void radeon_enc_nalu_pps_hevc(struct radeon_encoder *enc) { - RADEON_ENC_BEGIN(enc->cmd.nalu); - RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS); - uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++]; - radeon_enc_reset(enc); - radeon_enc_set_emulation_prevention(enc, false); - radeon_enc_code_fixed_bits(enc, 0x00000001, 32); - radeon_enc_code_fixed_bits(enc, 0x4401, 16); - radeon_enc_byte_align(enc); - radeon_enc_set_emulation_prevention(enc, true); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 4); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_ue(enc, 0x0); - radeon_enc_code_se(enc, 0x0); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - if (enc->enc_pic.rc_session_init.rate_control_method == RENCODE_RATE_CONTROL_METHOD_NONE) - radeon_enc_code_fixed_bits(enc, 0x0, 1); - else { - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_ue(enc, 0x0); - } - radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset); - radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 2); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1); - radeon_enc_code_fixed_bits(enc, 0x1, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1); - - if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) { - radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2); - radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2); - } - - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2); - radeon_enc_code_fixed_bits(enc, 0x0, 2); - - radeon_enc_code_fixed_bits(enc, 0x1, 1); - - radeon_enc_byte_align(enc); - radeon_enc_flush_headers(enc); - *size_in_bytes = (enc->bits_output + 7) / 8; - RADEON_ENC_END(); -} - -static void radeon_enc_session_init(struct radeon_encoder *enc) -{ - enc->enc_pic.session_init.encode_standard = RENCODE_ENCODE_STANDARD_H264; - 
enc->enc_pic.session_init.aligned_picture_width = align(enc->base.width, 16); - enc->enc_pic.session_init.aligned_picture_height = align(enc->base.height, 16); - enc->enc_pic.session_init.padding_width = enc->enc_pic.session_init.aligned_picture_width - enc->base.width; - enc->enc_pic.session_init.padding_height = enc->enc_pic.session_init.aligned_picture_height - enc->base.height; - enc->enc_pic.session_init.pre_encode_mode = RENCODE_PREENCODE_MODE_NONE; - enc->enc_pic.session_init.pre_encode_chroma_enabled = FALSE; - - RADEON_ENC_BEGIN(enc->cmd.session_init); - RADEON_ENC_CS(enc->enc_pic.session_init.encode_standard); - RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_width); - RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_height); - RADEON_ENC_CS(enc->enc_pic.session_init.padding_width); - RADEON_ENC_CS(enc->enc_pic.session_init.padding_height); - RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_mode); - RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_chroma_enabled); - RADEON_ENC_END(); -} - -static void radeon_enc_efc_config(struct radeon_encoder *enc) -{ - if (enc->efc == NULL) { - enc->efc = CALLOC_STRUCT(rvid_buffer); - int buffer_size = 46 * 1024; - if (!si_vid_create_buffer(enc->screen, enc->efc, buffer_size, PIPE_USAGE_DYNAMIC)) { - RVID_ERR("Can't create EFC conversion table buffer.\n"); - FREE(enc->efc); - return; - } - - uint32_t *ptr = enc->ws->buffer_map(enc->ws, enc->efc->res->buf, &enc->cs, PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); - memcpy(ptr, Yuv_st2084_rec2020_st2084_rec2020_2000, 46817); - enc->ws->buffer_unmap(enc->ws, enc->efc->res->buf); - } - - enc->enc_pic.efc_params.coef_buffer_size = 46817; - enc->enc_pic.efc_params.cm_program_register_data_size = 1728; - - assert(enc->efc); - - RADEON_ENC_BEGIN(enc->cmd.efc_params); - RADEON_ENC_WRITE(enc->efc->res->buf, enc->efc->res->domains, 0x0); - RADEON_ENC_CS(enc->enc_pic.efc_params.coef_buffer_size); - RADEON_ENC_CS(enc->enc_pic.efc_params.cm_program_register_data_size); - RADEON_ENC_END(); + RADEON_ENC_BEGIN(enc->cmd.nalu); + RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS); + uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++]; + radeon_enc_reset(enc); + radeon_enc_set_emulation_prevention(enc, false); + radeon_enc_code_fixed_bits(enc, 0x00000001, 32); + radeon_enc_code_fixed_bits(enc, 0x4401, 16); + radeon_enc_byte_align(enc); + radeon_enc_set_emulation_prevention(enc, true); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_fixed_bits(enc, 0x1, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 4); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x1, 1); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_ue(enc, 0x0); + radeon_enc_code_se(enc, 0x0); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset); + radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 2); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1); + radeon_enc_code_fixed_bits(enc, 0x1, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 
enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1); + + if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) { + radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2); + radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2); + } + + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_fixed_bits(enc, 0x0, 1); + radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2); + radeon_enc_code_fixed_bits(enc, 0x0, 2); + + radeon_enc_code_fixed_bits(enc, 0x1, 1); + + radeon_enc_byte_align(enc); + radeon_enc_flush_headers(enc); + *size_in_bytes = (enc->bits_output + 7) / 8; + RADEON_ENC_END(); } static void radeon_enc_input_format(struct radeon_encoder *enc) { - RADEON_ENC_BEGIN(enc->cmd.input_format); - RADEON_ENC_CS(enc->enc_pic.input_format.input_color_volume); - RADEON_ENC_CS(enc->enc_pic.input_format.input_color_space); - RADEON_ENC_CS(enc->enc_pic.input_format.input_color_range); - RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_subsampling); - RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_location); - RADEON_ENC_CS(enc->enc_pic.input_format.input_color_bit_depth); - RADEON_ENC_CS(enc->enc_pic.input_format.input_color_packing_format); - RADEON_ENC_END(); + RADEON_ENC_BEGIN(enc->cmd.input_format); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_END(); } static void radeon_enc_output_format(struct radeon_encoder *enc) { - RADEON_ENC_BEGIN(enc->cmd.output_format); - RADEON_ENC_CS(enc->enc_pic.output_format.output_color_volume); - RADEON_ENC_CS(enc->enc_pic.output_format.output_color_range); - RADEON_ENC_CS(enc->enc_pic.output_format.output_chroma_location); - RADEON_ENC_CS(enc->enc_pic.output_format.output_color_bit_depth); - RADEON_ENC_END(); + RADEON_ENC_BEGIN(enc->cmd.output_format); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_CS(0); + RADEON_ENC_END(); } static void encode(struct radeon_encoder *enc) { - enc->before_encode(enc); - enc->session_info(enc); - enc->total_task_size = 0; - enc->task_info(enc, enc->need_feedback); - enc->efc_params(enc); - enc->encode_headers(enc); - enc->ctx(enc); - enc->bitstream(enc); - enc->feedback(enc); - enc->intra_refresh(enc); - enc->input_format(enc); - enc->output_format(enc); - - enc->op_preset(enc); - enc->op_enc(enc); - *enc->p_task_size = (enc->total_task_size); + enc->session_info(enc); + enc->total_task_size = 0; + enc->task_info(enc, enc->need_feedback); + + enc->encode_headers(enc); + enc->ctx(enc); + enc->bitstream(enc); + enc->feedback(enc); + enc->intra_refresh(enc); + enc->input_format(enc); + enc->output_format(enc); + + enc->op_speed(enc); + enc->op_enc(enc); + *enc->p_task_size = (enc->total_task_size); } void radeon_enc_2_0_init(struct radeon_encoder *enc) { - radeon_enc_1_2_init(enc); - enc->encode = encode; - enc->input_format = radeon_enc_input_format; - enc->output_format = radeon_enc_output_format; - enc->efc_params = radeon_enc_efc_config; - - if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC) { - enc->session_init = radeon_enc_session_init; - } - if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { - enc->deblocking_filter = radeon_enc_loop_filter_hevc; - enc->nalu_sps = radeon_enc_nalu_sps_hevc; - enc->nalu_pps = radeon_enc_nalu_pps_hevc; - enc->slice_header = radeon_enc_slice_header_hevc; - enc->op_preset = 
radeon_enc_op_balance; - } - - enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO; - enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO; - enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT; - enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL; - enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT; - enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT; - enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT; - enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE; - enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS; - enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU; - enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER; - enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT; - enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT; - enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS; - enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH; - enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER; - enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER; - enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER; - enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL; - enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC; - enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER; - enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL; - enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC; - enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS; - enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER; - enc->cmd.efc_params = RENCODE_IB_PARAM_EFC_CONFIG; - - enc->enc_pic.session_info.interface_version = - ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) | - (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT)); + radeon_enc_1_2_init(enc); + enc->encode = encode; + enc->quality_params = radeon_enc_quality_params; + enc->input_format = radeon_enc_input_format; + enc->output_format = radeon_enc_output_format; + + if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { + enc->deblocking_filter = radeon_enc_loop_filter_hevc; + enc->nalu_sps = radeon_enc_nalu_sps_hevc; + enc->nalu_pps = radeon_enc_nalu_pps_hevc; + } + + enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO; + enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO; + enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT; + enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL; + enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT; + enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT; + enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT; + enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE; + enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS; + enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU; + enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER; + enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT; + enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT; + enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS; + enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH; + enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER; + enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER; + enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER; + enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL; + enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC; + enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER; + 
enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL; + enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC; + enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS; + enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER; + + enc->enc_pic.session_info.interface_version = + ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) | + (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT)); } |
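The HEVC SPS/PPS writers above emit most syntax elements through radeon_enc_code_ue() and radeon_enc_code_se(), which follow the Exp-Golomb ue(v)/se(v) definitions from the H.265 spec. Below is a standalone sketch of that coding; the tiny bit writer is illustrative and omits the driver's emulation-prevention and byte-alignment handling.

/* Exp-Golomb ue(v)/se(v) coding into a small MSB-first bit buffer. */
#include <stdint.h>
#include <stdio.h>

struct bitwriter { uint8_t buf[64]; unsigned bits; };

static void put_bits(struct bitwriter *bw, uint32_t val, unsigned n)
{
        /* write n bits of val, most significant bit first */
        for (int i = (int)n - 1; i >= 0; i--) {
                unsigned byte = bw->bits / 8, off = 7 - bw->bits % 8;
                if ((val >> i) & 1)
                        bw->buf[byte] |= 1u << off;
                bw->bits++;
        }
}

static void put_ue(struct bitwriter *bw, uint32_t v)
{
        uint32_t k = v + 1;
        unsigned n = 0;
        while ((k >> n) != 0)        /* n = bit length of k */
                n++;
        put_bits(bw, 0, n - 1);      /* n-1 leading zero bits */
        put_bits(bw, k, n);          /* then k itself */
}

static void put_se(struct bitwriter *bw, int32_t s)
{
        /* signed values map to 2s-1 (s > 0) or -2s (s <= 0) before ue() coding */
        put_ue(bw, s > 0 ? (uint32_t)(2 * s - 1) : (uint32_t)(-2 * s));
}

int main(void)
{
        struct bitwriter bw = {0};
        put_ue(&bw, 0);   /* emits "1" */
        put_ue(&bw, 3);   /* emits "00100" */
        put_se(&bw, -2);  /* se(-2) -> ue(4) -> "00101" */
        printf("%u bits: %02x %02x\n", bw.bits, bw.buf[0], bw.buf[1]);
        return 0;
}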