author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000
commit    | d1e8c371581041f403dcdcff4ab8a88e970d221e (patch)
tree      | 621cf3eea9401b6fc19ce2a6dc5aa7579ecc8c70 /lib/mesa/src/freedreno/vulkan/tu_private.h
parent    | 81f619d3e99a3a218e6318d06c2bc1a36052e75d (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/freedreno/vulkan/tu_private.h')
-rw-r--r-- | lib/mesa/src/freedreno/vulkan/tu_private.h | 2165
1 file changed, 829 insertions, 1336 deletions
diff --git a/lib/mesa/src/freedreno/vulkan/tu_private.h b/lib/mesa/src/freedreno/vulkan/tu_private.h index 862d507c9..c2440471f 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_private.h +++ b/lib/mesa/src/freedreno/vulkan/tu_private.h @@ -40,47 +40,28 @@ #include <valgrind.h> #define VG(x) x #else -#define VG(x) ((void)0) +#define VG(x) #endif -#define MESA_LOG_TAG "TU" - #include "c11/threads.h" -#include "util/rounding.h" -#include "util/bitscan.h" +#include "compiler/shader_enums.h" +#include "main/macros.h" #include "util/list.h" -#include "util/log.h" #include "util/macros.h" -#include "util/sparse_array.h" -#include "util/u_atomic.h" -#include "util/u_dynarray.h" -#include "util/xmlconfig.h" -#include "util/perf/u_trace.h" #include "vk_alloc.h" #include "vk_debug_report.h" -#include "vk_device.h" -#include "vk_dispatch_table.h" -#include "vk_extensions.h" -#include "vk_instance.h" -#include "vk_log.h" -#include "vk_physical_device.h" -#include "vk_shader_module.h" #include "wsi_common.h" +#include "drm-uapi/msm_drm.h" #include "ir3/ir3_compiler.h" #include "ir3/ir3_shader.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" #include "a6xx.xml.h" -#include "fdl/freedreno_layout.h" -#include "common/freedreno_dev_info.h" -#include "perfcntrs/freedreno_perfcntr.h" #include "tu_descriptor_set.h" -#include "tu_autotune.h" -#include "tu_util.h" -#include "tu_perfetto.h" +#include "tu_extensions.h" /* Pre-declarations needed for WSI entrypoints */ struct wl_surface; @@ -92,54 +73,143 @@ typedef uint32_t xcb_window_t; #include <vulkan/vk_android_native_buffer.h> #include <vulkan/vk_icd.h> #include <vulkan/vulkan.h> +#include <vulkan/vulkan_intel.h> #include "tu_entrypoints.h" -#include "vk_format.h" -#include "vk_image.h" -#include "vk_command_buffer.h" -#include "vk_command_pool.h" -#include "vk_queue.h" -#include "vk_object.h" -#include "vk_sync.h" -#include "vk_fence.h" -#include "vk_semaphore.h" -#include "vk_drm_syncobj.h" -#include "vk_sync_timeline.h" - #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 #define MAX_VSC_PIPES 32 -#define MAX_VIEWPORTS 16 -#define MAX_VIEWPORT_SIZE (1 << 14) +#define MAX_VIEWPORTS 1 #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_PUSH_DESCRIPTORS 32 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 #define MAX_DYNAMIC_STORAGE_BUFFERS 8 -#define MAX_DYNAMIC_BUFFERS_SIZE \ - (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \ - A6XX_TEX_CONST_DWORDS - +#define MAX_DYNAMIC_BUFFERS \ + (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) +#define MAX_SAMPLES_LOG2 4 +#define NUM_META_FS_KEYS 13 #define TU_MAX_DRM_DEVICES 8 -#define MAX_VIEWS 16 -#define MAX_BIND_POINTS 2 /* compute + graphics */ -/* The Qualcomm driver exposes 0x20000058 */ -#define MAX_STORAGE_BUFFER_RANGE 0x20000000 -/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so - * expose the same maximum range. - * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual - * range might be higher. +#define MAX_VIEWS 8 + +#define NUM_DEPTH_CLEAR_PIPELINES 3 + +/* + * This is the point we switch from using CP to compute shader + * for certain buffer operations. 
*/ -#define MAX_UNIFORM_BUFFER_RANGE 0x10000 +#define TU_BUFFER_OPS_CS_THRESHOLD 4096 + +enum tu_mem_heap +{ + TU_MEM_HEAP_VRAM, + TU_MEM_HEAP_VRAM_CPU_ACCESS, + TU_MEM_HEAP_GTT, + TU_MEM_HEAP_COUNT +}; + +enum tu_mem_type +{ + TU_MEM_TYPE_VRAM, + TU_MEM_TYPE_GTT_WRITE_COMBINE, + TU_MEM_TYPE_VRAM_CPU_ACCESS, + TU_MEM_TYPE_GTT_CACHED, + TU_MEM_TYPE_COUNT +}; + +#define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline uint32_t +align_u32_npot(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a * a; +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} -#define A6XX_TEX_CONST_DWORDS 16 -#define A6XX_TEX_SAMP_DWORDS 4 +/** Alignment must be a power of 2. */ +static inline bool +tu_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} -#define COND(bool, val) ((bool) ? (val) : 0) -#define BIT(bit) (1u << (bit)) +static inline uint32_t +round_up_u32(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a; +} + +static inline uint64_t +round_up_u64(uint64_t v, uint64_t a) +{ + return (v + a - 1) / a; +} + +static inline uint32_t +tu_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX2(n >> levels, 1); +} +static inline float +tu_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +#define for_each_bit(b, dword) \ + for (uint32_t __dword = (dword); \ + (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1 << (b))) + +#define typed_memcpy(dest, src, count) \ + ({ \ + STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \ + memcpy((dest), (src), (count) * sizeof(*(src))); \ + }) /* Whenever we generate an error, pass it through this function. Useful for * debugging, where we can break on it. Only call at error site, not when @@ -149,25 +219,29 @@ typedef uint32_t xcb_window_t; struct tu_instance; VkResult -__vk_startup_errorf(struct tu_instance *instance, - VkResult error, - bool force_print, - const char *file, - int line, - const char *format, - ...) PRINTFLIKE(6, 7); - -/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver - * build. - */ -#define vk_startup_errorf(instance, error, format, ...) \ - __vk_startup_errorf(instance, error, \ - instance->debug_flags & TU_DEBUG_STARTUP, \ - __FILE__, __LINE__, format, ##__VA_ARGS__) +__vk_errorf(struct tu_instance *instance, + VkResult error, + const char *file, + int line, + const char *format, + ...); + +#define vk_error(instance, error) \ + __vk_errorf(instance, error, __FILE__, __LINE__, NULL); +#define vk_errorf(instance, error, format, ...) \ + __vk_errorf(instance, error, __FILE__, __LINE__, format, ##__VA_ARGS__); void __tu_finishme(const char *file, int line, const char *format, ...) - PRINTFLIKE(3, 4); + tu_printflike(3, 4); +void +tu_loge(const char *format, ...) tu_printflike(1, 2); +void +tu_loge_v(const char *format, va_list va); +void +tu_logi(const char *format, ...) 
tu_printflike(1, 2); +void +tu_logi_v(const char *format, va_list va); /** * Print a FINISHME message, including its source location. @@ -181,35 +255,46 @@ __tu_finishme(const char *file, int line, const char *format, ...) } \ } while (0) +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define tu_assert(x) \ + ({ \ + if (unlikely(!(x))) \ + fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ + }) +#else +#define tu_assert(x) +#endif + +/* Suppress -Wunused in stub functions */ +#define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__) +static inline void +__tu_use_args(int ignore, ...) +{ +} + #define tu_stub() \ do { \ tu_finishme("stub %s", __func__); \ } while (0) -struct tu_memory_heap { - /* Standard bits passed on to the client */ - VkDeviceSize size; - VkMemoryHeapFlags flags; - - /** Copied from ANV: - * - * Driver-internal book-keeping. - * - * Align it to 64 bits to make atomic operations faster on 32 bit platforms. - */ - VkDeviceSize used __attribute__ ((aligned (8))); -}; - -uint64_t -tu_get_system_heap_size(void); +void * +tu_lookup_entrypoint_unchecked(const char *name); +void * +tu_lookup_entrypoint_checked( + const char *name, + uint32_t core_version, + const struct tu_instance_extension_table *instance, + const struct tu_device_extension_table *device); struct tu_physical_device { - struct vk_physical_device vk; + VK_LOADER_DATA _loader_data; struct tu_instance *instance; - const char *name; + char path[20]; + char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; uint8_t cache_uuid[VK_UUID_SIZE]; @@ -217,71 +302,43 @@ struct tu_physical_device struct wsi_device wsi_device; int local_fd; - bool has_local; - int64_t local_major; - int64_t local_minor; int master_fd; - bool has_master; - int64_t master_major; - int64_t master_minor; + unsigned gpu_id; uint32_t gmem_size; - uint64_t gmem_base; - uint32_t ccu_offset_gmem; - uint32_t ccu_offset_bypass; - - struct fd_dev_id dev_id; - const struct fd_dev_info *info; - - int msm_major_version; - int msm_minor_version; - - /* Address space and global fault count for this local_fd with DRM backend */ - uint64_t fault_count; + uint32_t tile_align_w; + uint32_t tile_align_h; /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. 
*/ struct disk_cache *disk_cache; - struct tu_memory_heap heap; - - struct vk_sync_type syncobj_type; - struct vk_sync_timeline_type timeline_type; - const struct vk_sync_type *sync_types[3]; + struct tu_device_extension_table supported_extensions; }; enum tu_debug_flags { TU_DEBUG_STARTUP = 1 << 0, TU_DEBUG_NIR = 1 << 1, - TU_DEBUG_NOBIN = 1 << 3, - TU_DEBUG_SYSMEM = 1 << 4, - TU_DEBUG_FORCEBIN = 1 << 5, - TU_DEBUG_NOUBWC = 1 << 6, - TU_DEBUG_NOMULTIPOS = 1 << 7, - TU_DEBUG_NOLRZ = 1 << 8, - TU_DEBUG_PERFC = 1 << 9, - TU_DEBUG_FLUSHALL = 1 << 10, - TU_DEBUG_SYNCDRAW = 1 << 11, - TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12, - TU_DEBUG_GMEM = 1 << 13, - TU_DEBUG_RAST_ORDER = 1 << 14, - TU_DEBUG_UNALIGNED_STORE = 1 << 15, + TU_DEBUG_IR3 = 1 << 2, }; struct tu_instance { - struct vk_instance vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; uint32_t api_version; int physical_device_count; struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; - struct driOptionCache dri_options; - struct driOptionCache available_dri_options; - enum tu_debug_flags debug_flags; + + struct vk_debug_report_instance debug_report_callbacks; + + struct tu_instance_extension_table enabled_extensions; }; VkResult @@ -297,19 +354,10 @@ bool tu_physical_device_extension_supported(struct tu_physical_device *dev, const char *name); -enum tu_bo_alloc_flags -{ - TU_BO_ALLOC_NO_FLAGS = 0, - TU_BO_ALLOC_ALLOW_DUMP = 1 << 0, - TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1, -}; - struct cache_entry; struct tu_pipeline_cache { - struct vk_object_base base; - struct tu_device *device; pthread_mutex_t mutex; @@ -326,313 +374,115 @@ struct tu_pipeline_key { }; +void +tu_pipeline_cache_init(struct tu_pipeline_cache *cache, + struct tu_device *device); +void +tu_pipeline_cache_finish(struct tu_pipeline_cache *cache); +void +tu_pipeline_cache_load(struct tu_pipeline_cache *cache, + const void *data, + size_t size); -/* queue types */ -#define TU_QUEUE_GENERAL 0 - -#define TU_MAX_QUEUE_FAMILIES 1 - -/* Keep tu_syncobj until porting to common code for kgsl too */ -#ifdef TU_USE_KGSL -struct tu_syncobj; -#endif -struct tu_u_trace_syncobj; - -/* Define tu_timeline_sync type based on drm syncobj for a point type - * for vk_sync_timeline, and the logic to handle is mostly copied from - * anv_bo_sync since it seems it can be used by similar way to anv. - */ -enum tu_timeline_sync_state { - /** Indicates that this is a new (or newly reset fence) */ - TU_TIMELINE_SYNC_STATE_RESET, - - /** Indicates that this fence has been submitted to the GPU but is still - * (as far as we know) in use by the GPU. 
- */ - TU_TIMELINE_SYNC_STATE_SUBMITTED, - - TU_TIMELINE_SYNC_STATE_SIGNALED, -}; - -struct tu_timeline_sync { - struct vk_sync base; - - enum tu_timeline_sync_state state; - uint32_t syncobj; -}; - -struct tu_queue -{ - struct vk_queue vk; - - struct tu_device *device; +struct tu_shader_variant; - uint32_t msm_queue_id; - int fence; -}; +bool +tu_create_shader_variants_from_pipeline_cache( + struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants); -struct tu_bo +void +tu_pipeline_cache_insert_shaders(struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants, + const void *const *codes, + const unsigned *code_sizes); + +struct tu_meta_state { - uint32_t gem_handle; - uint64_t size; - uint64_t iova; - void *map; - int32_t refcnt; - -#ifndef TU_USE_KGSL - uint32_t bo_list_idx; -#endif + VkAllocationCallbacks alloc; - bool implicit_sync : 1; + struct tu_pipeline_cache cache; }; -/* externally-synchronized BO suballocator. */ -struct tu_suballocator -{ - struct tu_device *dev; - - uint32_t default_size; - enum tu_bo_alloc_flags flags; - - /** Current BO we're suballocating out of. */ - struct tu_bo *bo; - uint32_t next_offset; +/* queue types */ +#define TU_QUEUE_GENERAL 0 - /** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */ - struct tu_bo *cached_bo; -}; +#define TU_MAX_QUEUE_FAMILIES 1 -struct tu_suballoc_bo +struct tu_fence { - struct tu_bo *bo; - uint64_t iova; - uint32_t size; /* bytes */ + bool signaled; + int fd; }; void -tu_bo_suballocator_init(struct tu_suballocator *suballoc, - struct tu_device *dev, - uint32_t default_size, - uint32_t flags); +tu_fence_init(struct tu_fence *fence, bool signaled); void -tu_bo_suballocator_finish(struct tu_suballocator *suballoc); - -VkResult -tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc, - uint32_t size, uint32_t align); - -void * -tu_suballoc_bo_map(struct tu_suballoc_bo *bo); - +tu_fence_finish(struct tu_fence *fence); void -tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo); - -enum global_shader { - GLOBAL_SH_VS_BLIT, - GLOBAL_SH_VS_CLEAR, - GLOBAL_SH_FS_BLIT, - GLOBAL_SH_FS_BLIT_ZSCALE, - GLOBAL_SH_FS_COPY_MS, - GLOBAL_SH_FS_CLEAR0, - GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS, - GLOBAL_SH_COUNT, -}; - -/** - * Tracks the results from an individual renderpass. Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. 
- */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -#define TU_BORDER_COLOR_COUNT 4096 -#define TU_BORDER_COLOR_BUILTIN 6 - -#define TU_BLIT_SHADER_SIZE 1024 +tu_fence_update_fd(struct tu_fence *fence, int fd); +void +tu_fence_copy(struct tu_fence *fence, const struct tu_fence *src); +void +tu_fence_signal(struct tu_fence *fence); +void +tu_fence_wait_idle(struct tu_fence *fence); -/* This struct defines the layout of the global_bo */ -struct tu6_global +struct tu_queue { - /* clear/blit shaders */ - uint32_t shaders[TU_BLIT_SHADER_SIZE]; - - uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ - uint32_t _pad0; - volatile uint32_t vsc_draw_overflow; - uint32_t _pad1; - volatile uint32_t vsc_prim_overflow; - uint32_t _pad2; - uint64_t predicate; - - /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ - struct { - uint32_t offset; - uint32_t pad[7]; - } flush_base[4]; - - ALIGN16 uint32_t cs_indirect_xyz[3]; - - /* To know when renderpass stats for autotune are valid */ - volatile uint32_t autotune_fence; + VK_LOADER_DATA _loader_data; + struct tu_device *device; + uint32_t queue_family_index; + int queue_idx; + VkDeviceQueueCreateFlags flags; - /* note: larger global bo will be used for customBorderColors */ - struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; + uint32_t msm_queue_id; + struct tu_fence submit_fence; }; -#define gb_offset(member) offsetof(struct tu6_global, member) -#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member)) - -/* extra space in vsc draw/prim streams */ -#define VSC_PAD 0x40 struct tu_device { - struct vk_device vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; + struct tu_instance *instance; + struct tu_meta_state meta_state; + struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; int queue_count[TU_MAX_QUEUE_FAMILIES]; struct tu_physical_device *physical_device; - int fd; struct ir3_compiler *compiler; /* Backup in-memory cache to be used if the app doesn't provide one */ struct tu_pipeline_cache *mem_cache; -#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ - - /* Currently the kernel driver uses a 32-bit GPU address space, but it - * should be impossible to go beyond 48 bits. - */ - struct { - struct tu_bo *bo; - mtx_t construct_mtx; - bool initialized; - } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; - - struct tu_bo *global_bo; - - uint32_t implicit_sync_bo_count; - - /* Device-global BO suballocator for reducing BO management overhead for - * (read-only) pipeline state. Synchronized by pipeline_mutex. - */ - struct tu_suballocator pipeline_suballoc; - mtx_t pipeline_mutex; - - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. - */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - - /* the blob seems to always use 8K factor and 128K param sizes, copy them */ -#define TU_TESS_FACTOR_SIZE (8 * 1024) -#define TU_TESS_PARAM_SIZE (128 * 1024) -#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE) - /* Lazily allocated, protected by the device mutex. 
*/ - struct tu_bo *tess_bo; - - struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT]; - uint64_t global_shader_va[GLOBAL_SH_COUNT]; - - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; - BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT); - mtx_t mutex; - - /* bo list for submits: */ - struct drm_msm_gem_submit_bo *bo_list; - /* map bo handles to bo list index: */ - uint32_t bo_count, bo_list_size; - mtx_t bo_mutex; - /* protects imported BOs creation/freeing */ - struct u_rwlock dma_bo_lock; - - /* This array holds all our 'struct tu_bo' allocations. We use this - * so we can add a refcount to our BOs and check if a particular BO - * was already allocated in this device using its GEM handle. This is - * necessary to properly manage BO imports, because the kernel doesn't - * refcount the underlying BO memory. - * - * Specifically, when self-importing (i.e. importing a BO into the same - * device that created it), the kernel will give us the same BO handle - * for both BOs and we must only free it once when both references are - * freed. Otherwise, if we are not self-importing, we get two different BO - * handles, and we want to free each one individually. - * - * The refcount is also useful for being able to maintain BOs across - * VK object lifetimes, such as pipelines suballocating out of BOs - * allocated on the device. - */ - struct util_sparse_array bo_map; - - /* Command streams to set pass index to a scratch reg */ - struct tu_cs *perfcntrs_pass_cs; - struct tu_cs_entry *perfcntrs_pass_cs_entries; - - /* Condition variable for timeline semaphore to notify waiters when a - * new submit is executed. */ - pthread_cond_t timeline_cond; - pthread_mutex_t submit_mutex; - - struct tu_autotune autotune; - -#ifdef ANDROID - const void *gralloc; - enum { - TU_GRALLOC_UNKNOWN, - TU_GRALLOC_CROS, - TU_GRALLOC_OTHER, - } gralloc_type; -#endif + struct list_head shader_slabs; + mtx_t shader_slab_mutex; - uint32_t submit_count; - - struct u_trace_context trace_context; - - #ifdef HAVE_PERFETTO - struct tu_perfetto_state perfetto; - #endif + struct tu_device_extension_table enabled_extensions; }; -void tu_init_clear_blit_shaders(struct tu_device *dev); - -void tu_destroy_clear_blit_shaders(struct tu_device *dev); - -VkResult -tu_device_submit_deferred_locked(struct tu_device *dev); - -VkResult -tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); - -uint64_t -tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); - -VkResult -tu_device_check_status(struct vk_device *vk_device); +struct tu_bo +{ + uint32_t gem_handle; + uint64_t size; + uint64_t iova; + void *map; +}; VkResult -tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size, - enum tu_bo_alloc_flags flags); +tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size); VkResult tu_bo_init_dmabuf(struct tu_device *dev, - struct tu_bo **bo, + struct tu_bo *bo, uint64_t size, int fd); int @@ -642,28 +492,6 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); VkResult tu_bo_map(struct tu_device *dev, struct tu_bo *bo); -static inline struct tu_bo * -tu_device_lookup_bo(struct tu_device *device, uint32_t handle) -{ - return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle); -} - -static inline struct tu_bo * -tu_bo_get_ref(struct tu_bo *bo) -{ - p_atomic_inc(&bo->refcnt); - return bo; -} - -/* Get a scratch bo for use inside a command buffer. 
This will always return - * the same bo given the same size or similar sizes, so only one scratch bo - * can be used at the same time. It's meant for short-lived things where we - * need to write to some piece of memory, read from it, and then immediately - * discard it. - */ -VkResult -tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); - struct tu_cs_entry { /* No ownership */ @@ -673,58 +501,6 @@ struct tu_cs_entry uint32_t offset; }; -struct tu_cs_memory { - uint32_t *map; - uint64_t iova; -}; - -struct tu_draw_state { - uint64_t iova : 48; - uint32_t size : 16; -}; - -enum tu_dynamic_state -{ - /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */ - TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1, - TU_DYNAMIC_STATE_RB_DEPTH_CNTL, - TU_DYNAMIC_STATE_RB_STENCIL_CNTL, - TU_DYNAMIC_STATE_VB_STRIDE, - TU_DYNAMIC_STATE_RASTERIZER_DISCARD, - TU_DYNAMIC_STATE_COUNT, - /* no associated draw state: */ - TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT, - TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, - /* re-use the line width enum as it uses GRAS_SU_CNTL: */ - TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH, -}; - -enum tu_draw_state_group_id -{ - TU_DRAW_STATE_PROGRAM_CONFIG, - TU_DRAW_STATE_PROGRAM, - TU_DRAW_STATE_PROGRAM_BINNING, - TU_DRAW_STATE_VB, - TU_DRAW_STATE_VI, - TU_DRAW_STATE_VI_BINNING, - TU_DRAW_STATE_RAST, - TU_DRAW_STATE_BLEND, - TU_DRAW_STATE_SHADER_GEOM_CONST, - TU_DRAW_STATE_FS_CONST, - TU_DRAW_STATE_DESC_SETS, - TU_DRAW_STATE_DESC_SETS_LOAD, - TU_DRAW_STATE_VS_PARAMS, - TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, - TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, - TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, - TU_DRAW_STATE_PRIM_MODE_GMEM, - TU_DRAW_STATE_PRIM_MODE_SYSMEM, - - /* dynamic state related draw states */ - TU_DRAW_STATE_DYNAMIC, - TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, -}; - enum tu_cs_mode { @@ -765,7 +541,6 @@ struct tu_cs uint32_t *reserved_end; uint32_t *end; - struct tu_device *device; enum tu_cs_mode mode; uint32_t next_bo_size; @@ -776,20 +551,20 @@ struct tu_cs struct tu_bo **bos; uint32_t bo_count; uint32_t bo_capacity; - - /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */ - struct tu_bo *refcount_bo; - - /* state for cond_exec_start/cond_exec_end */ - uint32_t cond_flags; - uint32_t *cond_dwords; }; struct tu_device_memory { - struct vk_object_base base; + struct tu_bo bo; + VkDeviceSize size; - struct tu_bo *bo; + /* for dedicated allocations */ + struct tu_image *image; + struct tu_buffer *buffer; + + uint32_t type_index; + void *map; + void *user_ptr; }; struct tu_descriptor_range @@ -800,19 +575,18 @@ struct tu_descriptor_range struct tu_descriptor_set { - struct vk_object_base base; - - /* Link to descriptor pool's desc_sets list . 
*/ - struct list_head pool_link; - - struct tu_descriptor_set_layout *layout; - struct tu_descriptor_pool *pool; + const struct tu_descriptor_set_layout *layout; uint32_t size; uint64_t va; uint32_t *mapped_ptr; + struct tu_descriptor_range *dynamic_descriptors; +}; - uint32_t *dynamic_descriptors; +struct tu_push_descriptor_set +{ + struct tu_descriptor_set set; + uint32_t capacity; }; struct tu_descriptor_pool_entry @@ -824,18 +598,13 @@ struct tu_descriptor_pool_entry struct tu_descriptor_pool { - struct vk_object_base base; - - struct tu_bo *bo; + uint8_t *mapped_ptr; uint64_t current_offset; uint64_t size; uint8_t *host_memory_base; uint8_t *host_memory_ptr; uint8_t *host_memory_end; - uint8_t *host_bo; - - struct list_head desc_sets; uint32_t entry_count; uint32_t max_entry_count; @@ -866,13 +635,11 @@ struct tu_descriptor_update_template_entry size_t src_stride; /* For push descriptors */ - const struct tu_sampler *immutable_samplers; + const uint32_t *immutable_samplers; }; struct tu_descriptor_update_template { - struct vk_object_base base; - uint32_t entry_count; VkPipelineBindPoint bind_point; struct tu_descriptor_update_template_entry entry[0]; @@ -880,257 +647,175 @@ struct tu_descriptor_update_template struct tu_buffer { - struct vk_object_base base; - VkDeviceSize size; VkBufferUsageFlags usage; VkBufferCreateFlags flags; struct tu_bo *bo; - uint64_t iova; + VkDeviceSize bo_offset; }; -const char * -tu_get_debug_option_name(int id); - -const char * -tu_get_perftest_option_name(int id); +enum tu_dynamic_state_bits +{ + TU_DYNAMIC_VIEWPORT = 1 << 0, + TU_DYNAMIC_SCISSOR = 1 << 1, + TU_DYNAMIC_LINE_WIDTH = 1 << 2, + TU_DYNAMIC_DEPTH_BIAS = 1 << 3, + TU_DYNAMIC_BLEND_CONSTANTS = 1 << 4, + TU_DYNAMIC_DEPTH_BOUNDS = 1 << 5, + TU_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, + TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, + TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8, + TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, + TU_DYNAMIC_ALL = (1 << 10) - 1, +}; + +struct tu_vertex_binding +{ + struct tu_buffer *buffer; + VkDeviceSize offset; +}; -struct tu_descriptor_state +struct tu_viewport_state { - struct tu_descriptor_set *sets[MAX_SETS]; - struct tu_descriptor_set push_set; - uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; + uint32_t count; + VkViewport viewports[MAX_VIEWPORTS]; }; -enum tu_cmd_dirty_bits +struct tu_scissor_state { - TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), - TU_CMD_DIRTY_VB_STRIDE = BIT(1), - TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), - TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), - TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), - TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), - TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), - TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), - TU_CMD_DIRTY_LRZ = BIT(8), - TU_CMD_DIRTY_VS_PARAMS = BIT(9), - TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), - TU_CMD_DIRTY_VIEWPORTS = BIT(11), - /* all draw states were disabled and need to be re-enabled: */ - TU_CMD_DIRTY_DRAW_STATE = BIT(12) + uint32_t count; + VkRect2D scissors[MAX_SCISSORS]; }; -/* There are only three cache domains we have to care about: the CCU, or - * color cache unit, which is used for color and depth/stencil attachments - * and copy/blit destinations, and is split conceptually into color and depth, - * and the universal cache or UCHE which is used for pretty much everything - * else, except for the CP (uncached) and host. We need to flush whenever data - * crosses these boundaries. 
- */ +struct tu_discard_rectangle_state +{ + uint32_t count; + VkRect2D rectangles[MAX_DISCARD_RECTANGLES]; +}; -enum tu_cmd_access_mask { - TU_ACCESS_UCHE_READ = 1 << 0, - TU_ACCESS_UCHE_WRITE = 1 << 1, - TU_ACCESS_CCU_COLOR_READ = 1 << 2, - TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, - TU_ACCESS_CCU_DEPTH_READ = 1 << 4, - TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, - - /* Experiments have shown that while it's safe to avoid flushing the CCU - * after each blit/renderpass, it's not safe to assume that subsequent - * lookups with a different attachment state will hit unflushed cache - * entries. That is, the CCU needs to be flushed and possibly invalidated - * when accessing memory with a different attachment state. Writing to an - * attachment under the following conditions after clearing using the - * normal 2d engine path is known to have issues: - * - * - It isn't the 0'th layer. - * - There are more than one attachment, and this isn't the 0'th attachment - * (this seems to also depend on the cpp of the attachments). - * - * Our best guess is that the layer/MRT state is used when computing - * the location of a cache entry in CCU, to avoid conflicts. We assume that - * any access in a renderpass after or before an access by a transfer needs - * a flush/invalidate, and use the _INCOHERENT variants to represent access - * by a renderpass. +struct tu_dynamic_state +{ + /** + * Bitmask of (1 << VK_DYNAMIC_STATE_*). + * Defines the set of saved dynamic state. */ - TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, + uint32_t mask; - /* Accesses which bypasses any cache. e.g. writes via the host, - * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. - */ - TU_ACCESS_SYSMEM_READ = 1 << 10, - TU_ACCESS_SYSMEM_WRITE = 1 << 11, + struct tu_viewport_state viewport; - /* Memory writes from the CP start in-order with draws and event writes, - * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. - */ - TU_ACCESS_CP_WRITE = 1 << 12, - - TU_ACCESS_READ = - TU_ACCESS_UCHE_READ | - TU_ACCESS_CCU_COLOR_READ | - TU_ACCESS_CCU_DEPTH_READ | - TU_ACCESS_CCU_COLOR_INCOHERENT_READ | - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_SYSMEM_READ, - - TU_ACCESS_WRITE = - TU_ACCESS_UCHE_WRITE | - TU_ACCESS_CCU_COLOR_WRITE | - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | - TU_ACCESS_CCU_DEPTH_WRITE | - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | - TU_ACCESS_SYSMEM_WRITE | - TU_ACCESS_CP_WRITE, - - TU_ACCESS_ALL = - TU_ACCESS_READ | - TU_ACCESS_WRITE, -}; + struct tu_scissor_state scissor; -/* Starting with a6xx, the pipeline is split into several "clusters" (really - * pipeline stages). Each stage has its own pair of register banks and can - * switch them independently, so that earlier stages can run ahead of later - * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at - * the same time. - * - * As a result of this, we need to insert a WFI when an earlier stage depends - * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any - * pending WFI's to complete before starting, and usually before reading - * indirect params even, so a WFI also acts as a full "pipeline stall". - * - * Note, the names of the stages come from CLUSTER_* in devcoredump. We - * include all the stages for completeness, even ones which do not read/write - * anything. 
- */ + float line_width; -enum tu_stage { - /* This doesn't correspond to a cluster, but we need it for tracking - * indirect draw parameter reads etc. - */ - TU_STAGE_CP, + struct + { + float bias; + float clamp; + float slope; + } depth_bias; - /* - Fetch index buffer - * - Fetch vertex attributes, dispatch VS - */ - TU_STAGE_FE, + float blend_constants[4]; - /* Execute all geometry stages (VS thru GS) */ - TU_STAGE_SP_VS, + struct + { + float min; + float max; + } depth_bounds; - /* Write to VPC, do primitive assembly. */ - TU_STAGE_PC_VS, + struct + { + uint32_t front; + uint32_t back; + } stencil_compare_mask; - /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according - * to devcoredump so presumably this stage stalls for TU_STAGE_PS when - * early depth testing is enabled before dispatching fragments? However - * GRAS reads and writes LRZ directly. - */ - TU_STAGE_GRAS, + struct + { + uint32_t front; + uint32_t back; + } stencil_write_mask; - /* Execute FS */ - TU_STAGE_SP_PS, + struct + { + uint32_t front; + uint32_t back; + } stencil_reference; - /* - Fragment tests - * - Write color/depth - * - Streamout writes (???) - * - Varying interpolation (???) - */ - TU_STAGE_PS, + struct tu_discard_rectangle_state discard_rectangle; }; -enum tu_cmd_flush_bits { - TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, - TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, - TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, - TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, - TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, - TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, - TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, - TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, - - TU_CMD_FLAG_ALL_FLUSH = - TU_CMD_FLAG_CCU_FLUSH_DEPTH | - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CACHE_FLUSH | - /* Treat the CP as a sort of "cache" which may need to be "flushed" via - * waiting for writes to land with WAIT_FOR_MEM_WRITES. - */ - TU_CMD_FLAG_WAIT_MEM_WRITES, - - TU_CMD_FLAG_ALL_INVALIDATE = - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CACHE_INVALIDATE | - /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a - * a command that needs CP_WAIT_FOR_ME is executed. This means we may - * insert an extra WAIT_FOR_ME before an indirect command requiring it - * in case there was another command before the current command buffer - * that it needs to wait for. - */ - TU_CMD_FLAG_WAIT_FOR_ME, -}; +extern const struct tu_dynamic_state default_dynamic_state; -/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty - * heavy, involving a CCU cache flush/invalidate and a WFI in order to change - * which part of the gmem is used by the CCU. Here we keep track of what the - * state of the CCU. - */ -enum tu_cmd_ccu_state { - TU_CMD_CCU_SYSMEM, - TU_CMD_CCU_GMEM, - TU_CMD_CCU_UNKNOWN, -}; +const char * +tu_get_debug_option_name(int id); -struct tu_cache_state { - /* Caches which must be made available (flushed) eventually if there are - * any users outside that cache domain, and caches which must be - * invalidated eventually if there are any reads. - */ - enum tu_cmd_flush_bits pending_flush_bits; - /* Pending flushes */ - enum tu_cmd_flush_bits flush_bits; -}; +const char * +tu_get_perftest_option_name(int id); -enum tu_lrz_force_disable_mask { - TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0, - TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1, +/** + * Attachment state when recording a renderpass instance. + * + * The clear value is valid only if there exists a pending clear. 
+ */ +struct tu_attachment_state +{ + VkImageAspectFlags pending_clear_aspects; + uint32_t cleared_views; + VkClearValue clear_value; + VkImageLayout current_layout; }; -enum tu_lrz_direction { - TU_LRZ_UNKNOWN, - /* Depth func less/less-than: */ - TU_LRZ_LESS, - /* Depth func greater/greater-than: */ - TU_LRZ_GREATER, +struct tu_descriptor_state +{ + struct tu_descriptor_set *sets[MAX_SETS]; + uint32_t dirty; + uint32_t valid; + struct tu_push_descriptor_set push_set; + bool push_dirty; + uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; }; -struct tu_lrz_pipeline +struct tu_tile { - uint32_t force_disable_mask; - bool fs_has_kill; - bool force_late_z; - bool early_fragment_tests; + uint8_t pipe; + uint8_t slot; + VkOffset2D begin; + VkOffset2D end; }; -struct tu_lrz_state +struct tu_tiling_config { - /* Depth/Stencil image currently on use to do LRZ */ - struct tu_image *image; - bool valid : 1; - enum tu_lrz_direction prev_direction; + VkRect2D render_area; + uint32_t buffer_cpp[MAX_RTS + 2]; + uint32_t buffer_count; + + /* position and size of the first tile */ + VkRect2D tile0; + /* number of tiles */ + VkExtent2D tile_count; + + uint32_t gmem_offsets[MAX_RTS + 2]; + + /* size of the first VSC pipe */ + VkExtent2D pipe0; + /* number of VSC pipes */ + VkExtent2D pipe_count; + + /* pipe register values */ + uint32_t pipe_config[MAX_VSC_PIPES]; + uint32_t pipe_sizes[MAX_VSC_PIPES]; }; -struct tu_vs_params { - uint32_t vertex_offset; - uint32_t first_instance; +enum tu_cmd_dirty_bits +{ + TU_CMD_DIRTY_PIPELINE = 1 << 0, + TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1, + + TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, + TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, + TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18, + TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19, }; struct tu_cmd_state @@ -1138,119 +823,48 @@ struct tu_cmd_state uint32_t dirty; struct tu_pipeline *pipeline; - struct tu_pipeline *compute_pipeline; - /* Vertex buffers, viewports, and scissors - * the states for these can be updated partially, so we need to save these - * to be able to emit a complete draw state - */ - struct { - uint64_t base; - uint32_t size; - uint32_t stride; - } vb[MAX_VBS]; - VkViewport viewport[MAX_VIEWPORTS]; - VkRect2D scissor[MAX_SCISSORS]; - uint32_t max_viewport, max_scissor; - - /* for dynamic states that can't be emitted directly */ - uint32_t dynamic_stencil_mask; - uint32_t dynamic_stencil_wrmask; - uint32_t dynamic_stencil_ref; - - uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl; - uint32_t pc_raster_cntl, vpc_unknown_9107; - enum pc_di_primtype primtype; - bool primitive_restart_enable; - - /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */ - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - struct tu_draw_state vertex_buffers; - struct tu_draw_state shader_const[2]; - struct tu_draw_state desc_sets; - - struct tu_draw_state vs_params; + /* Vertex buffers */ + struct + { + struct tu_buffer *buffers[MAX_VBS]; + VkDeviceSize offsets[MAX_VBS]; + } vb; + + struct tu_dynamic_state dynamic; /* Index buffer */ - uint64_t index_va; + struct tu_buffer *index_buffer; + uint64_t index_offset; + uint32_t index_type; uint32_t max_index_count; - uint8_t index_size; - - /* because streamout base has to be 32-byte aligned - * there is an extra offset to deal with when it is - * unaligned - */ - uint8_t streamout_offset[IR3_MAX_SO_BUFFERS]; - - /* Renderpasses are tricky, because we may need to flush differently if - * using sysmem vs. 
gmem and therefore we have to delay any flushing that - * happens before a renderpass. So we have to have two copies of the flush - * state, one for intra-renderpass flushes (i.e. renderpass dependencies) - * and one for outside a renderpass. - */ - struct tu_cache_state cache; - struct tu_cache_state renderpass_cache; - - enum tu_cmd_ccu_state ccu_state; + uint64_t index_va; const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; - VkRect2D render_area; - - const struct tu_image_view **attachments; + struct tu_attachment_state *attachments; - bool xfb_used; - bool has_tess; - bool tessfactor_addr_set; - bool has_subpass_predication; - bool predication_active; - bool disable_gmem; - enum a5xx_line_mode line_mode; - bool z_negative_one_to_one; + struct tu_tiling_config tiling_config; - uint32_t drawcall_count; - - /* A calculated "draw cost" value for renderpass, which tries to - * estimate the bandwidth-per-sample of all the draws according - * to: - * - * foreach_draw (...) { - * cost += num_frag_outputs; - * if (blend_enabled) - * cost += num_blend_enabled; - * if (depth_test_enabled) - * cost++; - * if (depth_write_enabled) - * cost++; - * } - * - * The idea is that each sample-passed minimally does one write - * per MRT. If blend is enabled, the hw will additionally do - * a framebuffer read per sample-passed (for each MRT with blend - * enabled). If depth-test is enabled, the hw will additionally - * a depth buffer read. If depth-write is enable, the hw will - * additionally do a depth buffer write. - * - * This does ignore depth buffer traffic for samples which do not - * pass do to depth-test fail, and some other details. But it is - * just intended to be a rough estimate that is easy to calculate. 
- */ - uint32_t total_drawcalls_cost; - - struct tu_lrz_state lrz; - - struct tu_draw_state lrz_and_depth_plane_state; - - struct tu_vs_params last_vs_params; + struct tu_cs_entry tile_load_ib; + struct tu_cs_entry tile_store_ib; }; struct tu_cmd_pool { - struct vk_command_pool vk; - + VkAllocationCallbacks alloc; struct list_head cmd_buffers; struct list_head free_cmd_buffers; + uint32_t queue_family_index; +}; + +struct tu_cmd_buffer_upload +{ + uint8_t *map; + unsigned offset; + uint64_t size; + struct list_head list; }; enum tu_cmd_buffer_status @@ -1262,116 +876,165 @@ enum tu_cmd_buffer_status TU_CMD_BUFFER_STATUS_PENDING, }; +struct tu_bo_list +{ + uint32_t count; + uint32_t capacity; + struct drm_msm_gem_submit_bo *bo_infos; +}; + +#define TU_BO_LIST_FAILED (~0) + +void +tu_bo_list_init(struct tu_bo_list *list); +void +tu_bo_list_destroy(struct tu_bo_list *list); +void +tu_bo_list_reset(struct tu_bo_list *list); +uint32_t +tu_bo_list_add(struct tu_bo_list *list, + const struct tu_bo *bo, + uint32_t flags); +VkResult +tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other); + struct tu_cmd_buffer { - struct vk_command_buffer vk; + VK_LOADER_DATA _loader_data; struct tu_device *device; struct tu_cmd_pool *pool; struct list_head pool_link; - struct u_trace trace; - struct u_trace_iterator trace_renderpass_start; - struct u_trace_iterator trace_renderpass_end; - - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; - VkCommandBufferUsageFlags usage_flags; + VkCommandBufferLevel level; enum tu_cmd_buffer_status status; struct tu_cmd_state state; + struct tu_vertex_binding vertex_bindings[MAX_VBS]; uint32_t queue_family_index; - uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; VkShaderStageFlags push_constant_stages; struct tu_descriptor_set meta_push_descriptors; - struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; + struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE]; + + struct tu_cmd_buffer_upload upload; VkResult record_result; + struct tu_bo_list bo_list; struct tu_cs cs; struct tu_cs draw_cs; - struct tu_cs tile_store_cs; - struct tu_cs draw_epilogue_cs; - struct tu_cs sub_cs; + struct tu_cs tile_cs; - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; -}; + uint16_t marker_reg; + uint32_t marker_seqno; -/* Temporary struct for tracking a register state to be written, used by - * a6xx-pack.h and tu_cs_emit_regs() - */ -struct tu_reg_value { - uint32_t reg; - uint64_t value; - bool is_address; - struct tu_bo *bo; - bool bo_write; - uint32_t bo_offset; - uint32_t bo_shift; + struct tu_bo scratch_bo; + uint32_t scratch_seqno; + + bool wait_for_idle; }; +void +tu6_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum vgt_event_type event, + bool need_seqno); + +bool +tu_get_memory_fd(struct tu_device *device, + struct tu_device_memory *memory, + int *pFD); -void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs); +/* + * Takes x,y,z as exact numbers of invocations, instead of blocks. + * + * Limitations: Can't call normal dispatch functions without binding or + * rebinding + * the compute pipeline. 
+ */ +void +tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer, + uint32_t x, + uint32_t y, + uint32_t z); + +struct tu_event +{ + uint64_t *map; +}; -void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs, - enum tu_cmd_ccu_state ccu_state); +struct tu_shader_module; +#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0) +#define TU_HASH_SHADER_SISCHED (1 << 1) +#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2) void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event); +tu_hash_shaders(unsigned char *hash, + const VkPipelineShaderStageCreateInfo **stages, + const struct tu_pipeline_layout *layout, + const struct tu_pipeline_key *key, + uint32_t flags); + +static inline gl_shader_stage +vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) +{ + assert(__builtin_popcount(vk_stage) == 1); + return ffs(vk_stage) - 1; +} -static inline struct tu_descriptor_state * -tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, - VkPipelineBindPoint bind_point) +static inline VkShaderStageFlagBits +mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) { - return &cmd_buffer->descriptors[bind_point]; + return (1 << mesa_stage); } -struct tu_event +#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) + +#define tu_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage))) + +struct tu_shader_module { - struct vk_object_base base; - struct tu_bo *bo; + unsigned char sha1[20]; + + uint32_t code_size; + const uint32_t *code[0]; }; -struct tu_push_constant_range +struct tu_shader_compile_options { - uint32_t lo; - uint32_t count; + struct ir3_shader_key key; + + bool optimize; + bool include_binning_pass; }; struct tu_shader { - struct ir3_shader *ir3_shader; + struct ir3_shader ir3_shader; - struct tu_push_constant_range push_consts; - uint8_t active_desc_sets; - bool multi_pos_output; -}; + /* This may be true for vertex shaders. When true, variants[1] is the + * binning variant and binning_binary is non-NULL. 
+ */ + bool has_binning_pass; -bool -tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output, - struct tu_device *dev); + void *binary; + void *binning_binary; -nir_shader * -tu_spirv_to_nir(struct tu_device *dev, - void *mem_ctx, - const VkPipelineShaderStageCreateInfo *stage_info, - gl_shader_stage stage); + struct ir3_shader_variant variants[0]; +}; struct tu_shader * tu_shader_create(struct tu_device *dev, - nir_shader *nir, + gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, - unsigned multiview_mask, - struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc); void @@ -1379,78 +1042,50 @@ tu_shader_destroy(struct tu_device *dev, struct tu_shader *shader, const VkAllocationCallbacks *alloc); -struct tu_program_descriptor_linkage -{ - struct ir3_const_state const_state; - - uint32_t constlen; - - struct tu_push_constant_range push_consts; -}; - -struct tu_pipeline_executable { - gl_shader_stage stage; - - struct ir3_info stats; - bool is_binning; +void +tu_shader_compile_options_init( + struct tu_shader_compile_options *options, + const VkGraphicsPipelineCreateInfo *pipeline_info); - char *nir_from_spirv; - char *nir_final; - char *disasm; -}; +VkResult +tu_shader_compile(struct tu_device *dev, + struct tu_shader *shader, + const struct tu_shader *next_stage, + const struct tu_shader_compile_options *options, + const VkAllocationCallbacks *alloc); struct tu_pipeline { - struct vk_object_base base; - struct tu_cs cs; - struct tu_suballoc_bo bo; - /* Separate BO for private memory since it should GPU writable */ - struct tu_bo *pvtmem_bo; + struct tu_dynamic_state dynamic_state; + + struct tu_pipeline_layout *layout; bool need_indirect_descriptor_sets; VkShaderStageFlags active_stages; - uint32_t active_desc_sets; - - /* mask of enabled dynamic states - * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used - */ - uint32_t dynamic_state_mask; - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - - /* for dynamic states which use the same register: */ - uint32_t gras_su_cntl, gras_su_cntl_mask; - uint32_t rb_depth_cntl, rb_depth_cntl_mask; - uint32_t rb_stencil_cntl, rb_stencil_cntl_mask; - uint32_t pc_raster_cntl, pc_raster_cntl_mask; - uint32_t vpc_unknown_9107, vpc_unknown_9107_mask; - uint32_t stencil_wrmask; - - bool rb_depth_cntl_disable; - - enum a5xx_line_mode line_mode; - - /* draw states for the pipeline */ - struct tu_draw_state load_state, rast_state, blend_state; - struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem; - - /* for vertex buffers state */ - uint32_t num_vbs; struct { - struct tu_draw_state config_state; - struct tu_draw_state state; - struct tu_draw_state binning_state; - - struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; + struct tu_bo binary_bo; + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } program; struct { - struct tu_draw_state state; - struct tu_draw_state binning_state; + uint8_t bindings[MAX_VERTEX_ATTRIBS]; + uint16_t strides[MAX_VERTEX_ATTRIBS]; + uint16_t offsets[MAX_VERTEX_ATTRIBS]; + uint32_t count; + + uint8_t binning_bindings[MAX_VERTEX_ATTRIBS]; + uint16_t binning_strides[MAX_VERTEX_ATTRIBS]; + uint16_t binning_offsets[MAX_VERTEX_ATTRIBS]; + uint32_t binning_count; + + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } vi; struct @@ -1461,47 +1096,36 @@ struct tu_pipeline struct { - uint32_t patch_type; - uint32_t param_stride; - bool upper_left_domain_origin; - } tess; + struct tu_cs_entry state_ib; 
+ } vp; struct { - uint32_t local_size[3]; - uint32_t subgroup_size; - } compute; - - bool provoking_vertex_last; - - struct tu_lrz_pipeline lrz; + uint32_t gras_su_cntl; + struct tu_cs_entry state_ib; + } rast; - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - bool subpass_feedback_loop_ds; - - bool z_negative_one_to_one; - - /* Base drawcall cost for sysmem vs gmem autotuner */ - uint8_t drawcall_base_cost; + struct + { + struct tu_cs_entry state_ib; + } ds; - void *executables_mem_ctx; - /* tu_pipeline_executable */ - struct util_dynarray executables; + struct + { + struct tu_cs_entry state_ib; + } blend; }; void -tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport, - bool z_negative_one_to_one); +tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport); void -tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); +tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor); void -tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); - -void -tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); +tu6_emit_gras_su_cntl(struct tu_cs *cs, + uint32_t gras_su_cntl, + float line_width); void tu6_emit_depth_bias(struct tu_cs *cs, @@ -1509,143 +1133,106 @@ tu6_emit_depth_bias(struct tu_cs *cs, float clamp, float slope_factor); -void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, - enum a5xx_line_mode line_mode); - -void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); - -void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); - -void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); - -void tu6_apply_depth_bounds_workaround(struct tu_device *device, - uint32_t *rb_depth_cntl); - -struct tu_pvtmem_config { - uint64_t iova; - uint32_t per_fiber_size; - uint32_t per_sp_size; - bool per_wave; -}; - -void -tu6_emit_xs_config(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs); - -void -tu6_emit_xs(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs, - const struct tu_pvtmem_config *pvtmem, - uint64_t binary_iova); - -void -tu6_emit_vpc(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs, - const struct ir3_shader_variant *fs, - uint32_t patch_control_points); - void -tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); - -struct tu_image_view; +tu6_emit_stencil_compare_mask(struct tu_cs *cs, + uint32_t front, + uint32_t back); void -tu_resolve_sysmem(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_image_view *src, - const struct tu_image_view *dst, - uint32_t layer_mask, - uint32_t layers, - const VkRect2D *rect); +tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]); -void -tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - bool 
force_load); +struct tu_userdata_info * +tu_lookup_user_sgpr(struct tu_pipeline *pipeline, + gl_shader_stage stage, + int idx); -/* expose this function to be able to emit load without checking LOAD_OP */ -void -tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); +struct tu_shader_variant * +tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage); -/* note: gmem store can also resolve */ -void -tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a); - -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); +struct tu_graphics_pipeline_create_info +{ + bool use_rectlist; + bool db_depth_clear; + bool db_stencil_clear; + bool db_depth_disable_expclear; + bool db_stencil_disable_expclear; + bool db_flush_depth_inplace; + bool db_flush_stencil_inplace; + bool db_resummarize; + uint32_t custom_blend_mode; +}; struct tu_native_format { - enum a6xx_format fmt : 8; - enum a3xx_color_swap swap : 8; - enum a6xx_tile_mode tile_mode : 8; + int vtx; /* VFMTn_xxx or -1 */ + int tex; /* TFMTn_xxx or -1 */ + int rb; /* RBn_xxx or -1 */ + int swap; /* enum a3xx_color_swap */ + bool present; /* internal only; always true to external users */ }; -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); -bool tu6_format_vtx_supported(VkFormat format); -struct tu_native_format tu6_format_vtx(VkFormat format); -bool tu6_format_color_supported(enum pipe_format format); -struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode); -bool tu6_format_texture_supported(enum pipe_format format); -struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode); +const struct tu_native_format * +tu6_get_native_format(VkFormat format); + +int +tu_pack_clear_value(const VkClearValue *val, + VkFormat format, + uint32_t buf[4]); +enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt); -static inline enum a6xx_format -tu6_base_format(enum pipe_format format) +struct tu_image_level { - /* note: tu6_format_color doesn't care about tiling for .fmt field */ - return tu6_format_color(format, TILE6_LINEAR).fmt; -} + VkDeviceSize offset; + VkDeviceSize size; + uint32_t pitch; +}; struct tu_image { - struct vk_object_base base; - + VkImageType type; /* The original VkFormat provided by the client. This may not match any * of the actual surface formats. */ VkFormat vk_format; + VkImageAspectFlags aspects; + VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. 
*/ + VkImageTiling tiling; /** VkImageCreateInfo::tiling */ + VkImageCreateFlags flags; /** VkImageCreateInfo::flags */ + VkExtent3D extent; uint32_t level_count; uint32_t layer_count; - struct fdl_layout layout[3]; - uint32_t total_size; + VkDeviceSize size; + uint32_t alignment; + + /* memory layout */ + VkDeviceSize layer_size; + struct tu_image_level levels[15]; + unsigned tile_mode; + + unsigned queue_family_mask; + bool exclusive; + bool shareable; -#ifdef ANDROID /* For VK_ANDROID_native_buffer, the WSI image owns the memory, */ VkDeviceMemory owned_memory; -#endif /* Set when bound */ - struct tu_bo *bo; - uint64_t iova; - - uint32_t lrz_height; - uint32_t lrz_pitch; - uint32_t lrz_offset; - - bool shareable; + const struct tu_bo *bo; + VkDeviceSize bo_offset; }; +unsigned +tu_image_queue_family_mask(const struct tu_image *image, + uint32_t family, + uint32_t queue_family); + static inline uint32_t tu_get_layerCount(const struct tu_image *image, const VkImageSubresourceRange *range) @@ -1664,108 +1251,99 @@ tu_get_levelCount(const struct tu_image *image, : range->levelCount; } -enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane); - -uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask); - -enum pipe_format tu_format_for_aspect(enum pipe_format format, - VkImageAspectFlags aspect_mask); - struct tu_image_view { - struct vk_object_base base; - struct tu_image *image; /**< VkImageViewCreateInfo::image */ - struct fdl6_view view; + VkImageViewType type; + VkImageAspectFlags aspect_mask; + VkFormat vk_format; + uint32_t base_layer; + uint32_t layer_count; + uint32_t base_mip; + uint32_t level_count; + VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ - /* for d32s8 separate depth */ - uint64_t depth_base_addr; - uint32_t depth_layer_size; - uint32_t depth_PITCH; + uint32_t descriptor[16]; - /* for d32s8 separate stencil */ - uint64_t stencil_base_addr; - uint32_t stencil_layer_size; - uint32_t stencil_PITCH; + /* Descriptor for use as a storage image as opposed to a sampled image. + * This has a few differences for cube maps (e.g. type). 
+ */ + uint32_t storage_descriptor[16]; }; -struct tu_sampler_ycbcr_conversion { - struct vk_object_base base; - - VkFormat format; - VkSamplerYcbcrModelConversion ycbcr_model; - VkSamplerYcbcrRange ycbcr_range; - VkComponentMapping components; - VkChromaLocation chroma_offsets[2]; - VkFilter chroma_filter; +struct tu_sampler +{ }; -struct tu_sampler { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; - struct tu_sampler_ycbcr_conversion *ycbcr_sampler; +struct tu_image_create_info +{ + const VkImageCreateInfo *vk_info; + bool scanout; + bool no_metadata_planes; }; -void -tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); - -void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -void -tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -#define tu_image_view_stencil(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) - -#define tu_image_view_depth(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT)) - VkResult -tu_gralloc_info(struct tu_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier); +tu_image_create(VkDevice _device, + const struct tu_image_create_info *info, + const VkAllocationCallbacks *alloc, + VkImage *pImage); VkResult -tu_import_memory_from_gralloc_handle(VkDevice device_h, - int dma_buf, - const VkAllocationCallbacks *alloc, - VkImage image_h); +tu_image_from_gralloc(VkDevice device_h, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info, + const VkAllocationCallbacks *alloc, + VkImage *out_image_h); void -tu_image_view_init(struct tu_image_view *iview, - const VkImageViewCreateInfo *pCreateInfo, - bool limited_z24s8); - -bool -tiling_possible(VkFormat format); - -bool -ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, - const struct fd_dev_info *info, VkSampleCountFlagBits samples); +tu_image_view_init(struct tu_image_view *view, + struct tu_device *device, + const VkImageViewCreateInfo *pCreateInfo); struct tu_buffer_view { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; - - struct tu_buffer *buffer; + VkFormat vk_format; + uint64_t range; /**< VkBufferViewCreateInfo::range */ + uint32_t state[4]; }; void tu_buffer_view_init(struct tu_buffer_view *view, struct tu_device *device, const VkBufferViewCreateInfo *pCreateInfo); +static inline struct VkExtent3D +tu_sanitize_image_extent(const VkImageType imageType, + const struct VkExtent3D imageExtent) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkExtent3D) { imageExtent.width, 1, 1 }; + case VK_IMAGE_TYPE_2D: + return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 }; + case VK_IMAGE_TYPE_3D: + return imageExtent; + default: + unreachable("invalid image type"); + } +} + +static inline struct VkOffset3D +tu_sanitize_image_offset(const VkImageType imageType, + const struct VkOffset3D imageOffset) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkOffset3D) { imageOffset.x, 0, 0 }; + case VK_IMAGE_TYPE_2D: + return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 }; + case VK_IMAGE_TYPE_3D: + 
return imageOffset; + default: + unreachable("invalid image type"); + } +} + struct tu_attachment_info { struct tu_image_view *attachment; @@ -1773,146 +1351,100 @@ struct tu_attachment_info struct tu_framebuffer { - struct vk_object_base base; - uint32_t width; uint32_t height; uint32_t layers; - /* size of the first tile */ - VkExtent2D tile0; - /* number of tiles */ - VkExtent2D tile_count; - - /* size of the first VSC pipe */ - VkExtent2D pipe0; - /* number of VSC pipes */ - VkExtent2D pipe_count; - - /* pipe register values */ - uint32_t pipe_config[MAX_VSC_PIPES]; - uint32_t pipe_sizes[MAX_VSC_PIPES]; - uint32_t attachment_count; struct tu_attachment_info attachments[0]; }; -void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); - -struct tu_subpass_barrier { +struct tu_subpass_barrier +{ VkPipelineStageFlags src_stage_mask; - VkPipelineStageFlags dst_stage_mask; VkAccessFlags src_access_mask; VkAccessFlags dst_access_mask; - bool incoherent_ccu_color, incoherent_ccu_depth; }; +void +tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, + const struct tu_subpass_barrier *barrier); + struct tu_subpass_attachment { uint32_t attachment; - - /* For input attachments, true if it needs to be patched to refer to GMEM - * in GMEM mode. This is false if it hasn't already been written as an - * attachment. - */ - bool patch_input_gmem; + VkImageLayout layout; }; struct tu_subpass { uint32_t input_count; uint32_t color_count; - uint32_t resolve_count; - bool resolve_depth_stencil; - - bool feedback_loop_color; - bool feedback_loop_ds; - - /* True if we must invalidate UCHE thanks to a feedback loop. */ - bool feedback_invalidate; - - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - struct tu_subpass_attachment *input_attachments; struct tu_subpass_attachment *color_attachments; struct tu_subpass_attachment *resolve_attachments; struct tu_subpass_attachment depth_stencil_attachment; - VkSampleCountFlagBits samples; - - uint32_t srgb_cntl; - uint32_t multiview_mask; + /** Subpass has at least one resolve attachment */ + bool has_resolve; struct tu_subpass_barrier start_barrier; + + uint32_t view_mask; + VkSampleCountFlagBits max_sample_count; }; struct tu_render_pass_attachment { VkFormat format; uint32_t samples; - uint32_t cpp; - VkImageAspectFlags clear_mask; - uint32_t clear_views; - bool load; - bool store; - int32_t gmem_offset; - /* for D32S8 separate stencil: */ - bool load_stencil; - bool store_stencil; - int32_t gmem_offset_stencil; + VkAttachmentLoadOp load_op; + VkAttachmentLoadOp stencil_load_op; + VkImageLayout initial_layout; + VkImageLayout final_layout; + uint32_t view_mask; }; struct tu_render_pass { - struct vk_object_base base; - uint32_t attachment_count; uint32_t subpass_count; - uint32_t gmem_pixels; - uint32_t tile_align_w; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; }; -#define PERF_CNTRS_REG 4 - -struct tu_perf_query_data -{ - uint32_t gid; /* group-id */ - uint32_t cid; /* countable-id within the group */ - uint32_t cntr_reg; /* counter register within the group */ - uint32_t pass; /* pass index that countables can be requested */ - uint32_t app_idx; /* index provided by apps */ -}; +VkResult +tu_device_init_meta(struct tu_device *device); +void +tu_device_finish_meta(struct tu_device *device); struct tu_query_pool { - struct 
vk_object_base base; - - VkQueryType type; uint32_t stride; + uint32_t availability_offset; uint64_t size; - uint32_t pipeline_statistics; - struct tu_bo *bo; + char *ptr; + VkQueryType type; + uint32_t pipeline_stats_mask; +}; - /* For performance query */ - const struct fd_perfcntr_group *perf_group; - uint32_t perf_group_count; - uint32_t counter_index_count; - struct tu_perf_query_data perf_query_data[0]; +struct tu_semaphore +{ + uint32_t syncobj; + uint32_t temp_syncobj; }; -uint32_t -tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index); +void +tu_set_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct tu_descriptor_set *set, + unsigned idx); void -tu_update_descriptor_sets(const struct tu_device *device, +tu_update_descriptor_sets(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, VkDescriptorSet overrideSet, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, @@ -1921,24 +1453,25 @@ tu_update_descriptor_sets(const struct tu_device *device, void tu_update_descriptor_set_with_template( - const struct tu_device *device, + struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, struct tu_descriptor_set *set, VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData); -VkResult -tu_physical_device_init(struct tu_physical_device *device, - struct tu_instance *instance); -VkResult -tu_enumerate_devices(struct tu_instance *instance); +void +tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites); int -tu_device_get_gpu_timestamp(struct tu_device *dev, - uint64_t *ts); +tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id); int -tu_device_get_suspend_count(struct tu_device *dev, - uint64_t *suspend_count); +tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size); int tu_drm_submitqueue_new(const struct tu_device *dev, @@ -1948,116 +1481,76 @@ tu_drm_submitqueue_new(const struct tu_device *dev, void tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id); +uint32_t +tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags); +uint32_t +tu_gem_import_dmabuf(const struct tu_device *dev, + int prime_fd, + uint64_t size); int -tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2); - -int -tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync); - -VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit); - +tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle); void -tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, - void *ts_from, uint32_t from_offset, - void *ts_to, uint32_t to_offset, - uint32_t count); - - -VkResult -tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, - struct u_trace **trace_copy); - -/* If we copy trace and timestamps we will have to free them. */ -struct tu_u_trace_cmd_data -{ - struct tu_cs *timestamp_copy_cs; - struct u_trace *trace; -}; - -/* Data necessary to retrieve timestamps and clean all - * associated resources afterwards. - */ -struct tu_u_trace_submission_data -{ - uint32_t submission_id; - /* We have to know when timestamps are available, - * this sync object indicates it. 
- */ - struct tu_u_trace_syncobj *syncobj; - - uint32_t cmd_buffer_count; - uint32_t last_buffer_with_tracepoints; - struct tu_u_trace_cmd_data *cmd_trace_data; -}; - -VkResult -tu_u_trace_submission_data_create( - struct tu_device *device, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count, - struct tu_u_trace_submission_data **submission_data); - -void -tu_u_trace_submission_data_finish( - struct tu_device *device, - struct tu_u_trace_submission_data *submission_data); +tu_gem_close(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle); + +#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType) _obj; \ + } + +#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) (uintptr_t) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType)(uintptr_t) _obj; \ + } #define TU_FROM_HANDLE(__tu_type, __name, __handle) \ - VK_FROM_HANDLE(__tu_type, __name, __handle) - -VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, - VK_OBJECT_TYPE_COMMAND_BUFFER) -VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, - VK_OBJECT_TYPE_INSTANCE) -VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, - VK_OBJECT_TYPE_PHYSICAL_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) - -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool, - VK_OBJECT_TYPE_COMMAND_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, - VK_OBJECT_TYPE_BUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, - VK_OBJECT_TYPE_BUFFER_VIEW) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, - VK_OBJECT_TYPE_DESCRIPTOR_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, - VK_OBJECT_TYPE_DESCRIPTOR_SET) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, - VkDescriptorSetLayout, - VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, - VkDescriptorUpdateTemplate, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, - VK_OBJECT_TYPE_DEVICE_MEMORY) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, - VK_OBJECT_TYPE_FRAMEBUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, - VK_OBJECT_TYPE_IMAGE_VIEW); -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, - VK_OBJECT_TYPE_PIPELINE_CACHE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, - VK_OBJECT_TYPE_PIPELINE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, - VK_OBJECT_TYPE_PIPELINE_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, - VK_OBJECT_TYPE_QUERY_POOL) 
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, - VK_OBJECT_TYPE_RENDER_PASS) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, - VK_OBJECT_TYPE_SAMPLER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, - VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) -
-/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
-#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
-
-void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
+   struct __tu_type *__name = __tu_type##_from_handle(__handle)
+
+TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer)
+TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice)
+TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance)
+TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice)
+TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue)
+
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout,
+                               VkDescriptorSetLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template,
+                               VkDescriptorUpdateTemplate)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_fence, VkFence)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView);
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore)
 #endif /* TU_PRIVATE_H */
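
The TU_DEFINE_HANDLE_CASTS / TU_DEFINE_NONDISP_HANDLE_CASTS macros at the end of the imported header generate the conversions between opaque Vulkan handles and the driver's private structs, and TU_FROM_HANDLE declares a typed local from a handle at the top of an entrypoint. The following is a minimal, self-contained sketch of the same pattern; it is not part of the patch, and the demo_* and vk_handle_t names are stand-ins chosen so the example compiles on its own.

/* Standalone illustration of the handle-cast pattern above. */
#include <stdint.h>
#include <stdio.h>

typedef struct vk_device_T *vk_handle_t;   /* stand-in for a dispatchable Vulkan handle */

struct demo_device {
   uint32_t gpu_id;
};

/* Same shape as TU_DEFINE_HANDLE_CASTS: emit _from_handle/_to_handle helpers. */
#define DEMO_DEFINE_HANDLE_CASTS(__type, __VkType)                         \
   static inline struct __type *__type##_from_handle(__VkType _handle)     \
   {                                                                       \
      return (struct __type *) _handle;                                    \
   }                                                                       \
   static inline __VkType __type##_to_handle(struct __type *_obj)          \
   {                                                                       \
      return (__VkType) _obj;                                              \
   }

DEMO_DEFINE_HANDLE_CASTS(demo_device, vk_handle_t)

/* Same shape as TU_FROM_HANDLE: declare a typed local from an opaque handle. */
#define DEMO_FROM_HANDLE(__type, __name, __handle)                         \
   struct __type *__name = __type##_from_handle(__handle)

int main(void)
{
   struct demo_device dev = { .gpu_id = 630 };

   /* An entrypoint receives the opaque handle... */
   vk_handle_t handle = demo_device_to_handle(&dev);

   /* ...and immediately recovers the driver struct, as the tu_* entrypoints do. */
   DEMO_FROM_HANDLE(demo_device, device, handle);
   printf("gpu id: %u\n", device->gpu_id);
   return 0;
}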
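
tu_get_layerCount()/tu_get_levelCount(), shown only in part in the hunks above, resolve the open-ended VK_REMAINING_ARRAY_LAYERS / VK_REMAINING_MIP_LEVELS sentinels of a VkImageSubresourceRange against the image's actual counts. The sketch below mirrors that resolution under the assumption that the sentinel is UINT32_MAX (the value Vulkan defines); the demo_* names are stand-ins, not driver code.

/* Standalone illustration of resolving an open-ended subresource range. */
#include <stdint.h>
#include <assert.h>

#define DEMO_REMAINING UINT32_MAX   /* same value as VK_REMAINING_ARRAY_LAYERS */

struct demo_range {
   uint32_t base_layer;
   uint32_t layer_count;
};

/* Mirror of what tu_get_layerCount() does with VkImageSubresourceRange:
 * an open-ended range covers everything from base_layer to the last layer. */
static uint32_t
demo_get_layer_count(uint32_t image_layers, const struct demo_range *r)
{
   return r->layer_count == DEMO_REMAINING ? image_layers - r->base_layer
                                           : r->layer_count;
}

int main(void)
{
   struct demo_range open_ended = { .base_layer = 2, .layer_count = DEMO_REMAINING };
   struct demo_range explicit_n = { .base_layer = 0, .layer_count = 3 };

   assert(demo_get_layer_count(6, &open_ended) == 4);  /* layers 2..5 */
   assert(demo_get_layer_count(6, &explicit_n) == 3);
   return 0;
}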