author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:10:09 +0000
commit    | d1e8c371581041f403dcdcff4ab8a88e970d221e (patch)
tree      | 621cf3eea9401b6fc19ce2a6dc5aa7579ecc8c70 /lib/mesa/src/freedreno/vulkan/tu_private.h
parent    | 81f619d3e99a3a218e6318d06c2bc1a36052e75d (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/freedreno/vulkan/tu_private.h')
-rw-r--r-- | lib/mesa/src/freedreno/vulkan/tu_private.h | 2165
1 file changed, 829 insertions, 1336 deletions
diff --git a/lib/mesa/src/freedreno/vulkan/tu_private.h b/lib/mesa/src/freedreno/vulkan/tu_private.h index 862d507c9..c2440471f 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_private.h +++ b/lib/mesa/src/freedreno/vulkan/tu_private.h @@ -40,47 +40,28 @@ #include <valgrind.h> #define VG(x) x #else -#define VG(x) ((void)0) +#define VG(x) #endif -#define MESA_LOG_TAG "TU" - #include "c11/threads.h" -#include "util/rounding.h" -#include "util/bitscan.h" +#include "compiler/shader_enums.h" +#include "main/macros.h" #include "util/list.h" -#include "util/log.h" #include "util/macros.h" -#include "util/sparse_array.h" -#include "util/u_atomic.h" -#include "util/u_dynarray.h" -#include "util/xmlconfig.h" -#include "util/perf/u_trace.h" #include "vk_alloc.h" #include "vk_debug_report.h" -#include "vk_device.h" -#include "vk_dispatch_table.h" -#include "vk_extensions.h" -#include "vk_instance.h" -#include "vk_log.h" -#include "vk_physical_device.h" -#include "vk_shader_module.h" #include "wsi_common.h" +#include "drm-uapi/msm_drm.h" #include "ir3/ir3_compiler.h" #include "ir3/ir3_shader.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" #include "a6xx.xml.h" -#include "fdl/freedreno_layout.h" -#include "common/freedreno_dev_info.h" -#include "perfcntrs/freedreno_perfcntr.h" #include "tu_descriptor_set.h" -#include "tu_autotune.h" -#include "tu_util.h" -#include "tu_perfetto.h" +#include "tu_extensions.h" /* Pre-declarations needed for WSI entrypoints */ struct wl_surface; @@ -92,54 +73,143 @@ typedef uint32_t xcb_window_t; #include <vulkan/vk_android_native_buffer.h> #include <vulkan/vk_icd.h> #include <vulkan/vulkan.h> +#include <vulkan/vulkan_intel.h> #include "tu_entrypoints.h" -#include "vk_format.h" -#include "vk_image.h" -#include "vk_command_buffer.h" -#include "vk_command_pool.h" -#include "vk_queue.h" -#include "vk_object.h" -#include "vk_sync.h" -#include "vk_fence.h" -#include "vk_semaphore.h" -#include "vk_drm_syncobj.h" -#include "vk_sync_timeline.h" - #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 #define MAX_VSC_PIPES 32 -#define MAX_VIEWPORTS 16 -#define MAX_VIEWPORT_SIZE (1 << 14) +#define MAX_VIEWPORTS 1 #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_PUSH_DESCRIPTORS 32 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 #define MAX_DYNAMIC_STORAGE_BUFFERS 8 -#define MAX_DYNAMIC_BUFFERS_SIZE \ - (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \ - A6XX_TEX_CONST_DWORDS - +#define MAX_DYNAMIC_BUFFERS \ + (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) +#define MAX_SAMPLES_LOG2 4 +#define NUM_META_FS_KEYS 13 #define TU_MAX_DRM_DEVICES 8 -#define MAX_VIEWS 16 -#define MAX_BIND_POINTS 2 /* compute + graphics */ -/* The Qualcomm driver exposes 0x20000058 */ -#define MAX_STORAGE_BUFFER_RANGE 0x20000000 -/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so - * expose the same maximum range. - * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual - * range might be higher. +#define MAX_VIEWS 8 + +#define NUM_DEPTH_CLEAR_PIPELINES 3 + +/* + * This is the point we switch from using CP to compute shader + * for certain buffer operations. 
*/ -#define MAX_UNIFORM_BUFFER_RANGE 0x10000 +#define TU_BUFFER_OPS_CS_THRESHOLD 4096 + +enum tu_mem_heap +{ + TU_MEM_HEAP_VRAM, + TU_MEM_HEAP_VRAM_CPU_ACCESS, + TU_MEM_HEAP_GTT, + TU_MEM_HEAP_COUNT +}; + +enum tu_mem_type +{ + TU_MEM_TYPE_VRAM, + TU_MEM_TYPE_GTT_WRITE_COMBINE, + TU_MEM_TYPE_VRAM_CPU_ACCESS, + TU_MEM_TYPE_GTT_CACHED, + TU_MEM_TYPE_COUNT +}; + +#define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline uint32_t +align_u32_npot(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a * a; +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} -#define A6XX_TEX_CONST_DWORDS 16 -#define A6XX_TEX_SAMP_DWORDS 4 +/** Alignment must be a power of 2. */ +static inline bool +tu_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} -#define COND(bool, val) ((bool) ? (val) : 0) -#define BIT(bit) (1u << (bit)) +static inline uint32_t +round_up_u32(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a; +} + +static inline uint64_t +round_up_u64(uint64_t v, uint64_t a) +{ + return (v + a - 1) / a; +} + +static inline uint32_t +tu_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX2(n >> levels, 1); +} +static inline float +tu_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +#define for_each_bit(b, dword) \ + for (uint32_t __dword = (dword); \ + (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1 << (b))) + +#define typed_memcpy(dest, src, count) \ + ({ \ + STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \ + memcpy((dest), (src), (count) * sizeof(*(src))); \ + }) /* Whenever we generate an error, pass it through this function. Useful for * debugging, where we can break on it. Only call at error site, not when @@ -149,25 +219,29 @@ typedef uint32_t xcb_window_t; struct tu_instance; VkResult -__vk_startup_errorf(struct tu_instance *instance, - VkResult error, - bool force_print, - const char *file, - int line, - const char *format, - ...) PRINTFLIKE(6, 7); - -/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver - * build. - */ -#define vk_startup_errorf(instance, error, format, ...) \ - __vk_startup_errorf(instance, error, \ - instance->debug_flags & TU_DEBUG_STARTUP, \ - __FILE__, __LINE__, format, ##__VA_ARGS__) +__vk_errorf(struct tu_instance *instance, + VkResult error, + const char *file, + int line, + const char *format, + ...); + +#define vk_error(instance, error) \ + __vk_errorf(instance, error, __FILE__, __LINE__, NULL); +#define vk_errorf(instance, error, format, ...) \ + __vk_errorf(instance, error, __FILE__, __LINE__, format, ##__VA_ARGS__); void __tu_finishme(const char *file, int line, const char *format, ...) - PRINTFLIKE(3, 4); + tu_printflike(3, 4); +void +tu_loge(const char *format, ...) tu_printflike(1, 2); +void +tu_loge_v(const char *format, va_list va); +void +tu_logi(const char *format, ...) 
tu_printflike(1, 2); +void +tu_logi_v(const char *format, va_list va); /** * Print a FINISHME message, including its source location. @@ -181,35 +255,46 @@ __tu_finishme(const char *file, int line, const char *format, ...) } \ } while (0) +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define tu_assert(x) \ + ({ \ + if (unlikely(!(x))) \ + fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ + }) +#else +#define tu_assert(x) +#endif + +/* Suppress -Wunused in stub functions */ +#define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__) +static inline void +__tu_use_args(int ignore, ...) +{ +} + #define tu_stub() \ do { \ tu_finishme("stub %s", __func__); \ } while (0) -struct tu_memory_heap { - /* Standard bits passed on to the client */ - VkDeviceSize size; - VkMemoryHeapFlags flags; - - /** Copied from ANV: - * - * Driver-internal book-keeping. - * - * Align it to 64 bits to make atomic operations faster on 32 bit platforms. - */ - VkDeviceSize used __attribute__ ((aligned (8))); -}; - -uint64_t -tu_get_system_heap_size(void); +void * +tu_lookup_entrypoint_unchecked(const char *name); +void * +tu_lookup_entrypoint_checked( + const char *name, + uint32_t core_version, + const struct tu_instance_extension_table *instance, + const struct tu_device_extension_table *device); struct tu_physical_device { - struct vk_physical_device vk; + VK_LOADER_DATA _loader_data; struct tu_instance *instance; - const char *name; + char path[20]; + char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; uint8_t cache_uuid[VK_UUID_SIZE]; @@ -217,71 +302,43 @@ struct tu_physical_device struct wsi_device wsi_device; int local_fd; - bool has_local; - int64_t local_major; - int64_t local_minor; int master_fd; - bool has_master; - int64_t master_major; - int64_t master_minor; + unsigned gpu_id; uint32_t gmem_size; - uint64_t gmem_base; - uint32_t ccu_offset_gmem; - uint32_t ccu_offset_bypass; - - struct fd_dev_id dev_id; - const struct fd_dev_info *info; - - int msm_major_version; - int msm_minor_version; - - /* Address space and global fault count for this local_fd with DRM backend */ - uint64_t fault_count; + uint32_t tile_align_w; + uint32_t tile_align_h; /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. 
*/ struct disk_cache *disk_cache; - struct tu_memory_heap heap; - - struct vk_sync_type syncobj_type; - struct vk_sync_timeline_type timeline_type; - const struct vk_sync_type *sync_types[3]; + struct tu_device_extension_table supported_extensions; }; enum tu_debug_flags { TU_DEBUG_STARTUP = 1 << 0, TU_DEBUG_NIR = 1 << 1, - TU_DEBUG_NOBIN = 1 << 3, - TU_DEBUG_SYSMEM = 1 << 4, - TU_DEBUG_FORCEBIN = 1 << 5, - TU_DEBUG_NOUBWC = 1 << 6, - TU_DEBUG_NOMULTIPOS = 1 << 7, - TU_DEBUG_NOLRZ = 1 << 8, - TU_DEBUG_PERFC = 1 << 9, - TU_DEBUG_FLUSHALL = 1 << 10, - TU_DEBUG_SYNCDRAW = 1 << 11, - TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12, - TU_DEBUG_GMEM = 1 << 13, - TU_DEBUG_RAST_ORDER = 1 << 14, - TU_DEBUG_UNALIGNED_STORE = 1 << 15, + TU_DEBUG_IR3 = 1 << 2, }; struct tu_instance { - struct vk_instance vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; uint32_t api_version; int physical_device_count; struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; - struct driOptionCache dri_options; - struct driOptionCache available_dri_options; - enum tu_debug_flags debug_flags; + + struct vk_debug_report_instance debug_report_callbacks; + + struct tu_instance_extension_table enabled_extensions; }; VkResult @@ -297,19 +354,10 @@ bool tu_physical_device_extension_supported(struct tu_physical_device *dev, const char *name); -enum tu_bo_alloc_flags -{ - TU_BO_ALLOC_NO_FLAGS = 0, - TU_BO_ALLOC_ALLOW_DUMP = 1 << 0, - TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1, -}; - struct cache_entry; struct tu_pipeline_cache { - struct vk_object_base base; - struct tu_device *device; pthread_mutex_t mutex; @@ -326,313 +374,115 @@ struct tu_pipeline_key { }; +void +tu_pipeline_cache_init(struct tu_pipeline_cache *cache, + struct tu_device *device); +void +tu_pipeline_cache_finish(struct tu_pipeline_cache *cache); +void +tu_pipeline_cache_load(struct tu_pipeline_cache *cache, + const void *data, + size_t size); -/* queue types */ -#define TU_QUEUE_GENERAL 0 - -#define TU_MAX_QUEUE_FAMILIES 1 - -/* Keep tu_syncobj until porting to common code for kgsl too */ -#ifdef TU_USE_KGSL -struct tu_syncobj; -#endif -struct tu_u_trace_syncobj; - -/* Define tu_timeline_sync type based on drm syncobj for a point type - * for vk_sync_timeline, and the logic to handle is mostly copied from - * anv_bo_sync since it seems it can be used by similar way to anv. - */ -enum tu_timeline_sync_state { - /** Indicates that this is a new (or newly reset fence) */ - TU_TIMELINE_SYNC_STATE_RESET, - - /** Indicates that this fence has been submitted to the GPU but is still - * (as far as we know) in use by the GPU. 
- */ - TU_TIMELINE_SYNC_STATE_SUBMITTED, - - TU_TIMELINE_SYNC_STATE_SIGNALED, -}; - -struct tu_timeline_sync { - struct vk_sync base; - - enum tu_timeline_sync_state state; - uint32_t syncobj; -}; - -struct tu_queue -{ - struct vk_queue vk; - - struct tu_device *device; +struct tu_shader_variant; - uint32_t msm_queue_id; - int fence; -}; +bool +tu_create_shader_variants_from_pipeline_cache( + struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants); -struct tu_bo +void +tu_pipeline_cache_insert_shaders(struct tu_device *device, + struct tu_pipeline_cache *cache, + const unsigned char *sha1, + struct tu_shader_variant **variants, + const void *const *codes, + const unsigned *code_sizes); + +struct tu_meta_state { - uint32_t gem_handle; - uint64_t size; - uint64_t iova; - void *map; - int32_t refcnt; - -#ifndef TU_USE_KGSL - uint32_t bo_list_idx; -#endif + VkAllocationCallbacks alloc; - bool implicit_sync : 1; + struct tu_pipeline_cache cache; }; -/* externally-synchronized BO suballocator. */ -struct tu_suballocator -{ - struct tu_device *dev; - - uint32_t default_size; - enum tu_bo_alloc_flags flags; - - /** Current BO we're suballocating out of. */ - struct tu_bo *bo; - uint32_t next_offset; +/* queue types */ +#define TU_QUEUE_GENERAL 0 - /** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */ - struct tu_bo *cached_bo; -}; +#define TU_MAX_QUEUE_FAMILIES 1 -struct tu_suballoc_bo +struct tu_fence { - struct tu_bo *bo; - uint64_t iova; - uint32_t size; /* bytes */ + bool signaled; + int fd; }; void -tu_bo_suballocator_init(struct tu_suballocator *suballoc, - struct tu_device *dev, - uint32_t default_size, - uint32_t flags); +tu_fence_init(struct tu_fence *fence, bool signaled); void -tu_bo_suballocator_finish(struct tu_suballocator *suballoc); - -VkResult -tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc, - uint32_t size, uint32_t align); - -void * -tu_suballoc_bo_map(struct tu_suballoc_bo *bo); - +tu_fence_finish(struct tu_fence *fence); void -tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo); - -enum global_shader { - GLOBAL_SH_VS_BLIT, - GLOBAL_SH_VS_CLEAR, - GLOBAL_SH_FS_BLIT, - GLOBAL_SH_FS_BLIT_ZSCALE, - GLOBAL_SH_FS_COPY_MS, - GLOBAL_SH_FS_CLEAR0, - GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS, - GLOBAL_SH_COUNT, -}; - -/** - * Tracks the results from an individual renderpass. Initially created - * per renderpass, and appended to the tail of at->pending_results. At a later - * time, when the GPU has finished writing the results, we fill samples_passed. 
- */ -struct tu_renderpass_result { - /* Points into GPU memory */ - struct tu_renderpass_samples* samples; - - struct tu_suballoc_bo bo; - - /* - * Below here, only used internally within autotune - */ - uint64_t rp_key; - struct tu_renderpass_history *history; - struct list_head node; - uint32_t fence; - uint64_t samples_passed; -}; - -#define TU_BORDER_COLOR_COUNT 4096 -#define TU_BORDER_COLOR_BUILTIN 6 - -#define TU_BLIT_SHADER_SIZE 1024 +tu_fence_update_fd(struct tu_fence *fence, int fd); +void +tu_fence_copy(struct tu_fence *fence, const struct tu_fence *src); +void +tu_fence_signal(struct tu_fence *fence); +void +tu_fence_wait_idle(struct tu_fence *fence); -/* This struct defines the layout of the global_bo */ -struct tu6_global +struct tu_queue { - /* clear/blit shaders */ - uint32_t shaders[TU_BLIT_SHADER_SIZE]; - - uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ - uint32_t _pad0; - volatile uint32_t vsc_draw_overflow; - uint32_t _pad1; - volatile uint32_t vsc_prim_overflow; - uint32_t _pad2; - uint64_t predicate; - - /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ - struct { - uint32_t offset; - uint32_t pad[7]; - } flush_base[4]; - - ALIGN16 uint32_t cs_indirect_xyz[3]; - - /* To know when renderpass stats for autotune are valid */ - volatile uint32_t autotune_fence; + VK_LOADER_DATA _loader_data; + struct tu_device *device; + uint32_t queue_family_index; + int queue_idx; + VkDeviceQueueCreateFlags flags; - /* note: larger global bo will be used for customBorderColors */ - struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; + uint32_t msm_queue_id; + struct tu_fence submit_fence; }; -#define gb_offset(member) offsetof(struct tu6_global, member) -#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member)) - -/* extra space in vsc draw/prim streams */ -#define VSC_PAD 0x40 struct tu_device { - struct vk_device vk; + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; + struct tu_instance *instance; + struct tu_meta_state meta_state; + struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; int queue_count[TU_MAX_QUEUE_FAMILIES]; struct tu_physical_device *physical_device; - int fd; struct ir3_compiler *compiler; /* Backup in-memory cache to be used if the app doesn't provide one */ struct tu_pipeline_cache *mem_cache; -#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ - - /* Currently the kernel driver uses a 32-bit GPU address space, but it - * should be impossible to go beyond 48 bits. - */ - struct { - struct tu_bo *bo; - mtx_t construct_mtx; - bool initialized; - } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; - - struct tu_bo *global_bo; - - uint32_t implicit_sync_bo_count; - - /* Device-global BO suballocator for reducing BO management overhead for - * (read-only) pipeline state. Synchronized by pipeline_mutex. - */ - struct tu_suballocator pipeline_suballoc; - mtx_t pipeline_mutex; - - /* Device-global BO suballocator for reducing BO management for small - * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex. - */ - struct tu_suballocator autotune_suballoc; - mtx_t autotune_mutex; - - /* the blob seems to always use 8K factor and 128K param sizes, copy them */ -#define TU_TESS_FACTOR_SIZE (8 * 1024) -#define TU_TESS_PARAM_SIZE (128 * 1024) -#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE) - /* Lazily allocated, protected by the device mutex. 
*/ - struct tu_bo *tess_bo; - - struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT]; - uint64_t global_shader_va[GLOBAL_SH_COUNT]; - - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; - BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT); - mtx_t mutex; - - /* bo list for submits: */ - struct drm_msm_gem_submit_bo *bo_list; - /* map bo handles to bo list index: */ - uint32_t bo_count, bo_list_size; - mtx_t bo_mutex; - /* protects imported BOs creation/freeing */ - struct u_rwlock dma_bo_lock; - - /* This array holds all our 'struct tu_bo' allocations. We use this - * so we can add a refcount to our BOs and check if a particular BO - * was already allocated in this device using its GEM handle. This is - * necessary to properly manage BO imports, because the kernel doesn't - * refcount the underlying BO memory. - * - * Specifically, when self-importing (i.e. importing a BO into the same - * device that created it), the kernel will give us the same BO handle - * for both BOs and we must only free it once when both references are - * freed. Otherwise, if we are not self-importing, we get two different BO - * handles, and we want to free each one individually. - * - * The refcount is also useful for being able to maintain BOs across - * VK object lifetimes, such as pipelines suballocating out of BOs - * allocated on the device. - */ - struct util_sparse_array bo_map; - - /* Command streams to set pass index to a scratch reg */ - struct tu_cs *perfcntrs_pass_cs; - struct tu_cs_entry *perfcntrs_pass_cs_entries; - - /* Condition variable for timeline semaphore to notify waiters when a - * new submit is executed. */ - pthread_cond_t timeline_cond; - pthread_mutex_t submit_mutex; - - struct tu_autotune autotune; - -#ifdef ANDROID - const void *gralloc; - enum { - TU_GRALLOC_UNKNOWN, - TU_GRALLOC_CROS, - TU_GRALLOC_OTHER, - } gralloc_type; -#endif + struct list_head shader_slabs; + mtx_t shader_slab_mutex; - uint32_t submit_count; - - struct u_trace_context trace_context; - - #ifdef HAVE_PERFETTO - struct tu_perfetto_state perfetto; - #endif + struct tu_device_extension_table enabled_extensions; }; -void tu_init_clear_blit_shaders(struct tu_device *dev); - -void tu_destroy_clear_blit_shaders(struct tu_device *dev); - -VkResult -tu_device_submit_deferred_locked(struct tu_device *dev); - -VkResult -tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); - -uint64_t -tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); - -VkResult -tu_device_check_status(struct vk_device *vk_device); +struct tu_bo +{ + uint32_t gem_handle; + uint64_t size; + uint64_t iova; + void *map; +}; VkResult -tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size, - enum tu_bo_alloc_flags flags); +tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size); VkResult tu_bo_init_dmabuf(struct tu_device *dev, - struct tu_bo **bo, + struct tu_bo *bo, uint64_t size, int fd); int @@ -642,28 +492,6 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); VkResult tu_bo_map(struct tu_device *dev, struct tu_bo *bo); -static inline struct tu_bo * -tu_device_lookup_bo(struct tu_device *device, uint32_t handle) -{ - return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle); -} - -static inline struct tu_bo * -tu_bo_get_ref(struct tu_bo *bo) -{ - p_atomic_inc(&bo->refcnt); - return bo; -} - -/* Get a scratch bo for use inside a command buffer. 
This will always return - * the same bo given the same size or similar sizes, so only one scratch bo - * can be used at the same time. It's meant for short-lived things where we - * need to write to some piece of memory, read from it, and then immediately - * discard it. - */ -VkResult -tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); - struct tu_cs_entry { /* No ownership */ @@ -673,58 +501,6 @@ struct tu_cs_entry uint32_t offset; }; -struct tu_cs_memory { - uint32_t *map; - uint64_t iova; -}; - -struct tu_draw_state { - uint64_t iova : 48; - uint32_t size : 16; -}; - -enum tu_dynamic_state -{ - /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */ - TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1, - TU_DYNAMIC_STATE_RB_DEPTH_CNTL, - TU_DYNAMIC_STATE_RB_STENCIL_CNTL, - TU_DYNAMIC_STATE_VB_STRIDE, - TU_DYNAMIC_STATE_RASTERIZER_DISCARD, - TU_DYNAMIC_STATE_COUNT, - /* no associated draw state: */ - TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT, - TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, - /* re-use the line width enum as it uses GRAS_SU_CNTL: */ - TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH, -}; - -enum tu_draw_state_group_id -{ - TU_DRAW_STATE_PROGRAM_CONFIG, - TU_DRAW_STATE_PROGRAM, - TU_DRAW_STATE_PROGRAM_BINNING, - TU_DRAW_STATE_VB, - TU_DRAW_STATE_VI, - TU_DRAW_STATE_VI_BINNING, - TU_DRAW_STATE_RAST, - TU_DRAW_STATE_BLEND, - TU_DRAW_STATE_SHADER_GEOM_CONST, - TU_DRAW_STATE_FS_CONST, - TU_DRAW_STATE_DESC_SETS, - TU_DRAW_STATE_DESC_SETS_LOAD, - TU_DRAW_STATE_VS_PARAMS, - TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, - TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, - TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, - TU_DRAW_STATE_PRIM_MODE_GMEM, - TU_DRAW_STATE_PRIM_MODE_SYSMEM, - - /* dynamic state related draw states */ - TU_DRAW_STATE_DYNAMIC, - TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, -}; - enum tu_cs_mode { @@ -765,7 +541,6 @@ struct tu_cs uint32_t *reserved_end; uint32_t *end; - struct tu_device *device; enum tu_cs_mode mode; uint32_t next_bo_size; @@ -776,20 +551,20 @@ struct tu_cs struct tu_bo **bos; uint32_t bo_count; uint32_t bo_capacity; - - /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */ - struct tu_bo *refcount_bo; - - /* state for cond_exec_start/cond_exec_end */ - uint32_t cond_flags; - uint32_t *cond_dwords; }; struct tu_device_memory { - struct vk_object_base base; + struct tu_bo bo; + VkDeviceSize size; - struct tu_bo *bo; + /* for dedicated allocations */ + struct tu_image *image; + struct tu_buffer *buffer; + + uint32_t type_index; + void *map; + void *user_ptr; }; struct tu_descriptor_range @@ -800,19 +575,18 @@ struct tu_descriptor_range struct tu_descriptor_set { - struct vk_object_base base; - - /* Link to descriptor pool's desc_sets list . 
*/ - struct list_head pool_link; - - struct tu_descriptor_set_layout *layout; - struct tu_descriptor_pool *pool; + const struct tu_descriptor_set_layout *layout; uint32_t size; uint64_t va; uint32_t *mapped_ptr; + struct tu_descriptor_range *dynamic_descriptors; +}; - uint32_t *dynamic_descriptors; +struct tu_push_descriptor_set +{ + struct tu_descriptor_set set; + uint32_t capacity; }; struct tu_descriptor_pool_entry @@ -824,18 +598,13 @@ struct tu_descriptor_pool_entry struct tu_descriptor_pool { - struct vk_object_base base; - - struct tu_bo *bo; + uint8_t *mapped_ptr; uint64_t current_offset; uint64_t size; uint8_t *host_memory_base; uint8_t *host_memory_ptr; uint8_t *host_memory_end; - uint8_t *host_bo; - - struct list_head desc_sets; uint32_t entry_count; uint32_t max_entry_count; @@ -866,13 +635,11 @@ struct tu_descriptor_update_template_entry size_t src_stride; /* For push descriptors */ - const struct tu_sampler *immutable_samplers; + const uint32_t *immutable_samplers; }; struct tu_descriptor_update_template { - struct vk_object_base base; - uint32_t entry_count; VkPipelineBindPoint bind_point; struct tu_descriptor_update_template_entry entry[0]; @@ -880,257 +647,175 @@ struct tu_descriptor_update_template struct tu_buffer { - struct vk_object_base base; - VkDeviceSize size; VkBufferUsageFlags usage; VkBufferCreateFlags flags; struct tu_bo *bo; - uint64_t iova; + VkDeviceSize bo_offset; }; -const char * -tu_get_debug_option_name(int id); - -const char * -tu_get_perftest_option_name(int id); +enum tu_dynamic_state_bits +{ + TU_DYNAMIC_VIEWPORT = 1 << 0, + TU_DYNAMIC_SCISSOR = 1 << 1, + TU_DYNAMIC_LINE_WIDTH = 1 << 2, + TU_DYNAMIC_DEPTH_BIAS = 1 << 3, + TU_DYNAMIC_BLEND_CONSTANTS = 1 << 4, + TU_DYNAMIC_DEPTH_BOUNDS = 1 << 5, + TU_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, + TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, + TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8, + TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, + TU_DYNAMIC_ALL = (1 << 10) - 1, +}; + +struct tu_vertex_binding +{ + struct tu_buffer *buffer; + VkDeviceSize offset; +}; -struct tu_descriptor_state +struct tu_viewport_state { - struct tu_descriptor_set *sets[MAX_SETS]; - struct tu_descriptor_set push_set; - uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; + uint32_t count; + VkViewport viewports[MAX_VIEWPORTS]; }; -enum tu_cmd_dirty_bits +struct tu_scissor_state { - TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), - TU_CMD_DIRTY_VB_STRIDE = BIT(1), - TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), - TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), - TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), - TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), - TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), - TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), - TU_CMD_DIRTY_LRZ = BIT(8), - TU_CMD_DIRTY_VS_PARAMS = BIT(9), - TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), - TU_CMD_DIRTY_VIEWPORTS = BIT(11), - /* all draw states were disabled and need to be re-enabled: */ - TU_CMD_DIRTY_DRAW_STATE = BIT(12) + uint32_t count; + VkRect2D scissors[MAX_SCISSORS]; }; -/* There are only three cache domains we have to care about: the CCU, or - * color cache unit, which is used for color and depth/stencil attachments - * and copy/blit destinations, and is split conceptually into color and depth, - * and the universal cache or UCHE which is used for pretty much everything - * else, except for the CP (uncached) and host. We need to flush whenever data - * crosses these boundaries. 
- */ +struct tu_discard_rectangle_state +{ + uint32_t count; + VkRect2D rectangles[MAX_DISCARD_RECTANGLES]; +}; -enum tu_cmd_access_mask { - TU_ACCESS_UCHE_READ = 1 << 0, - TU_ACCESS_UCHE_WRITE = 1 << 1, - TU_ACCESS_CCU_COLOR_READ = 1 << 2, - TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, - TU_ACCESS_CCU_DEPTH_READ = 1 << 4, - TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, - - /* Experiments have shown that while it's safe to avoid flushing the CCU - * after each blit/renderpass, it's not safe to assume that subsequent - * lookups with a different attachment state will hit unflushed cache - * entries. That is, the CCU needs to be flushed and possibly invalidated - * when accessing memory with a different attachment state. Writing to an - * attachment under the following conditions after clearing using the - * normal 2d engine path is known to have issues: - * - * - It isn't the 0'th layer. - * - There are more than one attachment, and this isn't the 0'th attachment - * (this seems to also depend on the cpp of the attachments). - * - * Our best guess is that the layer/MRT state is used when computing - * the location of a cache entry in CCU, to avoid conflicts. We assume that - * any access in a renderpass after or before an access by a transfer needs - * a flush/invalidate, and use the _INCOHERENT variants to represent access - * by a renderpass. +struct tu_dynamic_state +{ + /** + * Bitmask of (1 << VK_DYNAMIC_STATE_*). + * Defines the set of saved dynamic state. */ - TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, + uint32_t mask; - /* Accesses which bypasses any cache. e.g. writes via the host, - * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. - */ - TU_ACCESS_SYSMEM_READ = 1 << 10, - TU_ACCESS_SYSMEM_WRITE = 1 << 11, + struct tu_viewport_state viewport; - /* Memory writes from the CP start in-order with draws and event writes, - * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. - */ - TU_ACCESS_CP_WRITE = 1 << 12, - - TU_ACCESS_READ = - TU_ACCESS_UCHE_READ | - TU_ACCESS_CCU_COLOR_READ | - TU_ACCESS_CCU_DEPTH_READ | - TU_ACCESS_CCU_COLOR_INCOHERENT_READ | - TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_SYSMEM_READ, - - TU_ACCESS_WRITE = - TU_ACCESS_UCHE_WRITE | - TU_ACCESS_CCU_COLOR_WRITE | - TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | - TU_ACCESS_CCU_DEPTH_WRITE | - TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | - TU_ACCESS_SYSMEM_WRITE | - TU_ACCESS_CP_WRITE, - - TU_ACCESS_ALL = - TU_ACCESS_READ | - TU_ACCESS_WRITE, -}; + struct tu_scissor_state scissor; -/* Starting with a6xx, the pipeline is split into several "clusters" (really - * pipeline stages). Each stage has its own pair of register banks and can - * switch them independently, so that earlier stages can run ahead of later - * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at - * the same time. - * - * As a result of this, we need to insert a WFI when an earlier stage depends - * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any - * pending WFI's to complete before starting, and usually before reading - * indirect params even, so a WFI also acts as a full "pipeline stall". - * - * Note, the names of the stages come from CLUSTER_* in devcoredump. We - * include all the stages for completeness, even ones which do not read/write - * anything. 
- */ + float line_width; -enum tu_stage { - /* This doesn't correspond to a cluster, but we need it for tracking - * indirect draw parameter reads etc. - */ - TU_STAGE_CP, + struct + { + float bias; + float clamp; + float slope; + } depth_bias; - /* - Fetch index buffer - * - Fetch vertex attributes, dispatch VS - */ - TU_STAGE_FE, + float blend_constants[4]; - /* Execute all geometry stages (VS thru GS) */ - TU_STAGE_SP_VS, + struct + { + float min; + float max; + } depth_bounds; - /* Write to VPC, do primitive assembly. */ - TU_STAGE_PC_VS, + struct + { + uint32_t front; + uint32_t back; + } stencil_compare_mask; - /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according - * to devcoredump so presumably this stage stalls for TU_STAGE_PS when - * early depth testing is enabled before dispatching fragments? However - * GRAS reads and writes LRZ directly. - */ - TU_STAGE_GRAS, + struct + { + uint32_t front; + uint32_t back; + } stencil_write_mask; - /* Execute FS */ - TU_STAGE_SP_PS, + struct + { + uint32_t front; + uint32_t back; + } stencil_reference; - /* - Fragment tests - * - Write color/depth - * - Streamout writes (???) - * - Varying interpolation (???) - */ - TU_STAGE_PS, + struct tu_discard_rectangle_state discard_rectangle; }; -enum tu_cmd_flush_bits { - TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, - TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, - TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, - TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, - TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, - TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, - TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, - TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, - - TU_CMD_FLAG_ALL_FLUSH = - TU_CMD_FLAG_CCU_FLUSH_DEPTH | - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CACHE_FLUSH | - /* Treat the CP as a sort of "cache" which may need to be "flushed" via - * waiting for writes to land with WAIT_FOR_MEM_WRITES. - */ - TU_CMD_FLAG_WAIT_MEM_WRITES, - - TU_CMD_FLAG_ALL_INVALIDATE = - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CACHE_INVALIDATE | - /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a - * a command that needs CP_WAIT_FOR_ME is executed. This means we may - * insert an extra WAIT_FOR_ME before an indirect command requiring it - * in case there was another command before the current command buffer - * that it needs to wait for. - */ - TU_CMD_FLAG_WAIT_FOR_ME, -}; +extern const struct tu_dynamic_state default_dynamic_state; -/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty - * heavy, involving a CCU cache flush/invalidate and a WFI in order to change - * which part of the gmem is used by the CCU. Here we keep track of what the - * state of the CCU. - */ -enum tu_cmd_ccu_state { - TU_CMD_CCU_SYSMEM, - TU_CMD_CCU_GMEM, - TU_CMD_CCU_UNKNOWN, -}; +const char * +tu_get_debug_option_name(int id); -struct tu_cache_state { - /* Caches which must be made available (flushed) eventually if there are - * any users outside that cache domain, and caches which must be - * invalidated eventually if there are any reads. - */ - enum tu_cmd_flush_bits pending_flush_bits; - /* Pending flushes */ - enum tu_cmd_flush_bits flush_bits; -}; +const char * +tu_get_perftest_option_name(int id); -enum tu_lrz_force_disable_mask { - TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0, - TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1, +/** + * Attachment state when recording a renderpass instance. + * + * The clear value is valid only if there exists a pending clear. 
+ */ +struct tu_attachment_state +{ + VkImageAspectFlags pending_clear_aspects; + uint32_t cleared_views; + VkClearValue clear_value; + VkImageLayout current_layout; }; -enum tu_lrz_direction { - TU_LRZ_UNKNOWN, - /* Depth func less/less-than: */ - TU_LRZ_LESS, - /* Depth func greater/greater-than: */ - TU_LRZ_GREATER, +struct tu_descriptor_state +{ + struct tu_descriptor_set *sets[MAX_SETS]; + uint32_t dirty; + uint32_t valid; + struct tu_push_descriptor_set push_set; + bool push_dirty; + uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; }; -struct tu_lrz_pipeline +struct tu_tile { - uint32_t force_disable_mask; - bool fs_has_kill; - bool force_late_z; - bool early_fragment_tests; + uint8_t pipe; + uint8_t slot; + VkOffset2D begin; + VkOffset2D end; }; -struct tu_lrz_state +struct tu_tiling_config { - /* Depth/Stencil image currently on use to do LRZ */ - struct tu_image *image; - bool valid : 1; - enum tu_lrz_direction prev_direction; + VkRect2D render_area; + uint32_t buffer_cpp[MAX_RTS + 2]; + uint32_t buffer_count; + + /* position and size of the first tile */ + VkRect2D tile0; + /* number of tiles */ + VkExtent2D tile_count; + + uint32_t gmem_offsets[MAX_RTS + 2]; + + /* size of the first VSC pipe */ + VkExtent2D pipe0; + /* number of VSC pipes */ + VkExtent2D pipe_count; + + /* pipe register values */ + uint32_t pipe_config[MAX_VSC_PIPES]; + uint32_t pipe_sizes[MAX_VSC_PIPES]; }; -struct tu_vs_params { - uint32_t vertex_offset; - uint32_t first_instance; +enum tu_cmd_dirty_bits +{ + TU_CMD_DIRTY_PIPELINE = 1 << 0, + TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1, + + TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, + TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, + TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18, + TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19, }; struct tu_cmd_state @@ -1138,119 +823,48 @@ struct tu_cmd_state uint32_t dirty; struct tu_pipeline *pipeline; - struct tu_pipeline *compute_pipeline; - /* Vertex buffers, viewports, and scissors - * the states for these can be updated partially, so we need to save these - * to be able to emit a complete draw state - */ - struct { - uint64_t base; - uint32_t size; - uint32_t stride; - } vb[MAX_VBS]; - VkViewport viewport[MAX_VIEWPORTS]; - VkRect2D scissor[MAX_SCISSORS]; - uint32_t max_viewport, max_scissor; - - /* for dynamic states that can't be emitted directly */ - uint32_t dynamic_stencil_mask; - uint32_t dynamic_stencil_wrmask; - uint32_t dynamic_stencil_ref; - - uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl; - uint32_t pc_raster_cntl, vpc_unknown_9107; - enum pc_di_primtype primtype; - bool primitive_restart_enable; - - /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */ - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - struct tu_draw_state vertex_buffers; - struct tu_draw_state shader_const[2]; - struct tu_draw_state desc_sets; - - struct tu_draw_state vs_params; + /* Vertex buffers */ + struct + { + struct tu_buffer *buffers[MAX_VBS]; + VkDeviceSize offsets[MAX_VBS]; + } vb; + + struct tu_dynamic_state dynamic; /* Index buffer */ - uint64_t index_va; + struct tu_buffer *index_buffer; + uint64_t index_offset; + uint32_t index_type; uint32_t max_index_count; - uint8_t index_size; - - /* because streamout base has to be 32-byte aligned - * there is an extra offset to deal with when it is - * unaligned - */ - uint8_t streamout_offset[IR3_MAX_SO_BUFFERS]; - - /* Renderpasses are tricky, because we may need to flush differently if - * using sysmem vs. 
gmem and therefore we have to delay any flushing that - * happens before a renderpass. So we have to have two copies of the flush - * state, one for intra-renderpass flushes (i.e. renderpass dependencies) - * and one for outside a renderpass. - */ - struct tu_cache_state cache; - struct tu_cache_state renderpass_cache; - - enum tu_cmd_ccu_state ccu_state; + uint64_t index_va; const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; - VkRect2D render_area; - - const struct tu_image_view **attachments; + struct tu_attachment_state *attachments; - bool xfb_used; - bool has_tess; - bool tessfactor_addr_set; - bool has_subpass_predication; - bool predication_active; - bool disable_gmem; - enum a5xx_line_mode line_mode; - bool z_negative_one_to_one; + struct tu_tiling_config tiling_config; - uint32_t drawcall_count; - - /* A calculated "draw cost" value for renderpass, which tries to - * estimate the bandwidth-per-sample of all the draws according - * to: - * - * foreach_draw (...) { - * cost += num_frag_outputs; - * if (blend_enabled) - * cost += num_blend_enabled; - * if (depth_test_enabled) - * cost++; - * if (depth_write_enabled) - * cost++; - * } - * - * The idea is that each sample-passed minimally does one write - * per MRT. If blend is enabled, the hw will additionally do - * a framebuffer read per sample-passed (for each MRT with blend - * enabled). If depth-test is enabled, the hw will additionally - * a depth buffer read. If depth-write is enable, the hw will - * additionally do a depth buffer write. - * - * This does ignore depth buffer traffic for samples which do not - * pass do to depth-test fail, and some other details. But it is - * just intended to be a rough estimate that is easy to calculate. 
- */ - uint32_t total_drawcalls_cost; - - struct tu_lrz_state lrz; - - struct tu_draw_state lrz_and_depth_plane_state; - - struct tu_vs_params last_vs_params; + struct tu_cs_entry tile_load_ib; + struct tu_cs_entry tile_store_ib; }; struct tu_cmd_pool { - struct vk_command_pool vk; - + VkAllocationCallbacks alloc; struct list_head cmd_buffers; struct list_head free_cmd_buffers; + uint32_t queue_family_index; +}; + +struct tu_cmd_buffer_upload +{ + uint8_t *map; + unsigned offset; + uint64_t size; + struct list_head list; }; enum tu_cmd_buffer_status @@ -1262,116 +876,165 @@ enum tu_cmd_buffer_status TU_CMD_BUFFER_STATUS_PENDING, }; +struct tu_bo_list +{ + uint32_t count; + uint32_t capacity; + struct drm_msm_gem_submit_bo *bo_infos; +}; + +#define TU_BO_LIST_FAILED (~0) + +void +tu_bo_list_init(struct tu_bo_list *list); +void +tu_bo_list_destroy(struct tu_bo_list *list); +void +tu_bo_list_reset(struct tu_bo_list *list); +uint32_t +tu_bo_list_add(struct tu_bo_list *list, + const struct tu_bo *bo, + uint32_t flags); +VkResult +tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other); + struct tu_cmd_buffer { - struct vk_command_buffer vk; + VK_LOADER_DATA _loader_data; struct tu_device *device; struct tu_cmd_pool *pool; struct list_head pool_link; - struct u_trace trace; - struct u_trace_iterator trace_renderpass_start; - struct u_trace_iterator trace_renderpass_end; - - struct list_head renderpass_autotune_results; - struct tu_autotune_results_buffer* autotune_buffer; - VkCommandBufferUsageFlags usage_flags; + VkCommandBufferLevel level; enum tu_cmd_buffer_status status; struct tu_cmd_state state; + struct tu_vertex_binding vertex_bindings[MAX_VBS]; uint32_t queue_family_index; - uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; VkShaderStageFlags push_constant_stages; struct tu_descriptor_set meta_push_descriptors; - struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; + struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE]; + + struct tu_cmd_buffer_upload upload; VkResult record_result; + struct tu_bo_list bo_list; struct tu_cs cs; struct tu_cs draw_cs; - struct tu_cs tile_store_cs; - struct tu_cs draw_epilogue_cs; - struct tu_cs sub_cs; + struct tu_cs tile_cs; - uint32_t vsc_draw_strm_pitch; - uint32_t vsc_prim_strm_pitch; -}; + uint16_t marker_reg; + uint32_t marker_seqno; -/* Temporary struct for tracking a register state to be written, used by - * a6xx-pack.h and tu_cs_emit_regs() - */ -struct tu_reg_value { - uint32_t reg; - uint64_t value; - bool is_address; - struct tu_bo *bo; - bool bo_write; - uint32_t bo_offset; - uint32_t bo_shift; + struct tu_bo scratch_bo; + uint32_t scratch_seqno; + + bool wait_for_idle; }; +void +tu6_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum vgt_event_type event, + bool need_seqno); + +bool +tu_get_memory_fd(struct tu_device *device, + struct tu_device_memory *memory, + int *pFD); -void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs); +/* + * Takes x,y,z as exact numbers of invocations, instead of blocks. + * + * Limitations: Can't call normal dispatch functions without binding or + * rebinding + * the compute pipeline. 
+ */ +void +tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer, + uint32_t x, + uint32_t y, + uint32_t z); + +struct tu_event +{ + uint64_t *map; +}; -void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs, - enum tu_cmd_ccu_state ccu_state); +struct tu_shader_module; +#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0) +#define TU_HASH_SHADER_SISCHED (1 << 1) +#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2) void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event); +tu_hash_shaders(unsigned char *hash, + const VkPipelineShaderStageCreateInfo **stages, + const struct tu_pipeline_layout *layout, + const struct tu_pipeline_key *key, + uint32_t flags); + +static inline gl_shader_stage +vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) +{ + assert(__builtin_popcount(vk_stage) == 1); + return ffs(vk_stage) - 1; +} -static inline struct tu_descriptor_state * -tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, - VkPipelineBindPoint bind_point) +static inline VkShaderStageFlagBits +mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) { - return &cmd_buffer->descriptors[bind_point]; + return (1 << mesa_stage); } -struct tu_event +#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) + +#define tu_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage))) + +struct tu_shader_module { - struct vk_object_base base; - struct tu_bo *bo; + unsigned char sha1[20]; + + uint32_t code_size; + const uint32_t *code[0]; }; -struct tu_push_constant_range +struct tu_shader_compile_options { - uint32_t lo; - uint32_t count; + struct ir3_shader_key key; + + bool optimize; + bool include_binning_pass; }; struct tu_shader { - struct ir3_shader *ir3_shader; + struct ir3_shader ir3_shader; - struct tu_push_constant_range push_consts; - uint8_t active_desc_sets; - bool multi_pos_output; -}; + /* This may be true for vertex shaders. When true, variants[1] is the + * binning variant and binning_binary is non-NULL. 
+ */ + bool has_binning_pass; -bool -tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output, - struct tu_device *dev); + void *binary; + void *binning_binary; -nir_shader * -tu_spirv_to_nir(struct tu_device *dev, - void *mem_ctx, - const VkPipelineShaderStageCreateInfo *stage_info, - gl_shader_stage stage); + struct ir3_shader_variant variants[0]; +}; struct tu_shader * tu_shader_create(struct tu_device *dev, - nir_shader *nir, + gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, - unsigned multiview_mask, - struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc); void @@ -1379,78 +1042,50 @@ tu_shader_destroy(struct tu_device *dev, struct tu_shader *shader, const VkAllocationCallbacks *alloc); -struct tu_program_descriptor_linkage -{ - struct ir3_const_state const_state; - - uint32_t constlen; - - struct tu_push_constant_range push_consts; -}; - -struct tu_pipeline_executable { - gl_shader_stage stage; - - struct ir3_info stats; - bool is_binning; +void +tu_shader_compile_options_init( + struct tu_shader_compile_options *options, + const VkGraphicsPipelineCreateInfo *pipeline_info); - char *nir_from_spirv; - char *nir_final; - char *disasm; -}; +VkResult +tu_shader_compile(struct tu_device *dev, + struct tu_shader *shader, + const struct tu_shader *next_stage, + const struct tu_shader_compile_options *options, + const VkAllocationCallbacks *alloc); struct tu_pipeline { - struct vk_object_base base; - struct tu_cs cs; - struct tu_suballoc_bo bo; - /* Separate BO for private memory since it should GPU writable */ - struct tu_bo *pvtmem_bo; + struct tu_dynamic_state dynamic_state; + + struct tu_pipeline_layout *layout; bool need_indirect_descriptor_sets; VkShaderStageFlags active_stages; - uint32_t active_desc_sets; - - /* mask of enabled dynamic states - * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used - */ - uint32_t dynamic_state_mask; - struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; - - /* for dynamic states which use the same register: */ - uint32_t gras_su_cntl, gras_su_cntl_mask; - uint32_t rb_depth_cntl, rb_depth_cntl_mask; - uint32_t rb_stencil_cntl, rb_stencil_cntl_mask; - uint32_t pc_raster_cntl, pc_raster_cntl_mask; - uint32_t vpc_unknown_9107, vpc_unknown_9107_mask; - uint32_t stencil_wrmask; - - bool rb_depth_cntl_disable; - - enum a5xx_line_mode line_mode; - - /* draw states for the pipeline */ - struct tu_draw_state load_state, rast_state, blend_state; - struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem; - - /* for vertex buffers state */ - uint32_t num_vbs; struct { - struct tu_draw_state config_state; - struct tu_draw_state state; - struct tu_draw_state binning_state; - - struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; + struct tu_bo binary_bo; + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } program; struct { - struct tu_draw_state state; - struct tu_draw_state binning_state; + uint8_t bindings[MAX_VERTEX_ATTRIBS]; + uint16_t strides[MAX_VERTEX_ATTRIBS]; + uint16_t offsets[MAX_VERTEX_ATTRIBS]; + uint32_t count; + + uint8_t binning_bindings[MAX_VERTEX_ATTRIBS]; + uint16_t binning_strides[MAX_VERTEX_ATTRIBS]; + uint16_t binning_offsets[MAX_VERTEX_ATTRIBS]; + uint32_t binning_count; + + struct tu_cs_entry state_ib; + struct tu_cs_entry binning_state_ib; } vi; struct @@ -1461,47 +1096,36 @@ struct tu_pipeline struct { - uint32_t patch_type; - uint32_t param_stride; - bool upper_left_domain_origin; - } tess; + struct tu_cs_entry state_ib; 
+ } vp; struct { - uint32_t local_size[3]; - uint32_t subgroup_size; - } compute; - - bool provoking_vertex_last; - - struct tu_lrz_pipeline lrz; + uint32_t gras_su_cntl; + struct tu_cs_entry state_ib; + } rast; - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - bool subpass_feedback_loop_ds; - - bool z_negative_one_to_one; - - /* Base drawcall cost for sysmem vs gmem autotuner */ - uint8_t drawcall_base_cost; + struct + { + struct tu_cs_entry state_ib; + } ds; - void *executables_mem_ctx; - /* tu_pipeline_executable */ - struct util_dynarray executables; + struct + { + struct tu_cs_entry state_ib; + } blend; }; void -tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport, - bool z_negative_one_to_one); +tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport); void -tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); +tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor); void -tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); - -void -tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); +tu6_emit_gras_su_cntl(struct tu_cs *cs, + uint32_t gras_su_cntl, + float line_width); void tu6_emit_depth_bias(struct tu_cs *cs, @@ -1509,143 +1133,106 @@ tu6_emit_depth_bias(struct tu_cs *cs, float clamp, float slope_factor); -void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, - enum a5xx_line_mode line_mode); - -void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); - -void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); - -void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); - -void tu6_apply_depth_bounds_workaround(struct tu_device *device, - uint32_t *rb_depth_cntl); - -struct tu_pvtmem_config { - uint64_t iova; - uint32_t per_fiber_size; - uint32_t per_sp_size; - bool per_wave; -}; - -void -tu6_emit_xs_config(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs); - -void -tu6_emit_xs(struct tu_cs *cs, - gl_shader_stage stage, - const struct ir3_shader_variant *xs, - const struct tu_pvtmem_config *pvtmem, - uint64_t binary_iova); - -void -tu6_emit_vpc(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs, - const struct ir3_shader_variant *fs, - uint32_t patch_control_points); - void -tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); - -struct tu_image_view; +tu6_emit_stencil_compare_mask(struct tu_cs *cs, + uint32_t front, + uint32_t back); void -tu_resolve_sysmem(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_image_view *src, - const struct tu_image_view *dst, - uint32_t layer_mask, - uint32_t layers, - const VkRect2D *rect); +tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back); void -tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - const VkRenderPassBeginInfo *info); +tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]); -void -tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - bool 
force_load); +struct tu_userdata_info * +tu_lookup_user_sgpr(struct tu_pipeline *pipeline, + gl_shader_stage stage, + int idx); -/* expose this function to be able to emit load without checking LOAD_OP */ -void -tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); +struct tu_shader_variant * +tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage); -/* note: gmem store can also resolve */ -void -tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t a, - uint32_t gmem_a); - -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); +struct tu_graphics_pipeline_create_info +{ + bool use_rectlist; + bool db_depth_clear; + bool db_stencil_clear; + bool db_depth_disable_expclear; + bool db_stencil_disable_expclear; + bool db_flush_depth_inplace; + bool db_flush_stencil_inplace; + bool db_resummarize; + uint32_t custom_blend_mode; +}; struct tu_native_format { - enum a6xx_format fmt : 8; - enum a3xx_color_swap swap : 8; - enum a6xx_tile_mode tile_mode : 8; + int vtx; /* VFMTn_xxx or -1 */ + int tex; /* TFMTn_xxx or -1 */ + int rb; /* RBn_xxx or -1 */ + int swap; /* enum a3xx_color_swap */ + bool present; /* internal only; always true to external users */ }; -enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); -bool tu6_format_vtx_supported(VkFormat format); -struct tu_native_format tu6_format_vtx(VkFormat format); -bool tu6_format_color_supported(enum pipe_format format); -struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode); -bool tu6_format_texture_supported(enum pipe_format format); -struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode); +const struct tu_native_format * +tu6_get_native_format(VkFormat format); + +int +tu_pack_clear_value(const VkClearValue *val, + VkFormat format, + uint32_t buf[4]); +enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt); -static inline enum a6xx_format -tu6_base_format(enum pipe_format format) +struct tu_image_level { - /* note: tu6_format_color doesn't care about tiling for .fmt field */ - return tu6_format_color(format, TILE6_LINEAR).fmt; -} + VkDeviceSize offset; + VkDeviceSize size; + uint32_t pitch; +}; struct tu_image { - struct vk_object_base base; - + VkImageType type; /* The original VkFormat provided by the client. This may not match any * of the actual surface formats. */ VkFormat vk_format; + VkImageAspectFlags aspects; + VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. 
*/ + VkImageTiling tiling; /** VkImageCreateInfo::tiling */ + VkImageCreateFlags flags; /** VkImageCreateInfo::flags */ + VkExtent3D extent; uint32_t level_count; uint32_t layer_count; - struct fdl_layout layout[3]; - uint32_t total_size; + VkDeviceSize size; + uint32_t alignment; + + /* memory layout */ + VkDeviceSize layer_size; + struct tu_image_level levels[15]; + unsigned tile_mode; + + unsigned queue_family_mask; + bool exclusive; + bool shareable; -#ifdef ANDROID /* For VK_ANDROID_native_buffer, the WSI image owns the memory, */ VkDeviceMemory owned_memory; -#endif /* Set when bound */ - struct tu_bo *bo; - uint64_t iova; - - uint32_t lrz_height; - uint32_t lrz_pitch; - uint32_t lrz_offset; - - bool shareable; + const struct tu_bo *bo; + VkDeviceSize bo_offset; }; +unsigned +tu_image_queue_family_mask(const struct tu_image *image, + uint32_t family, + uint32_t queue_family); + static inline uint32_t tu_get_layerCount(const struct tu_image *image, const VkImageSubresourceRange *range) @@ -1664,108 +1251,99 @@ tu_get_levelCount(const struct tu_image *image, : range->levelCount; } -enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane); - -uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask); - -enum pipe_format tu_format_for_aspect(enum pipe_format format, - VkImageAspectFlags aspect_mask); - struct tu_image_view { - struct vk_object_base base; - struct tu_image *image; /**< VkImageViewCreateInfo::image */ - struct fdl6_view view; + VkImageViewType type; + VkImageAspectFlags aspect_mask; + VkFormat vk_format; + uint32_t base_layer; + uint32_t layer_count; + uint32_t base_mip; + uint32_t level_count; + VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ - /* for d32s8 separate depth */ - uint64_t depth_base_addr; - uint32_t depth_layer_size; - uint32_t depth_PITCH; + uint32_t descriptor[16]; - /* for d32s8 separate stencil */ - uint64_t stencil_base_addr; - uint32_t stencil_layer_size; - uint32_t stencil_PITCH; + /* Descriptor for use as a storage image as opposed to a sampled image. + * This has a few differences for cube maps (e.g. type). 
+ */ + uint32_t storage_descriptor[16]; }; -struct tu_sampler_ycbcr_conversion { - struct vk_object_base base; - - VkFormat format; - VkSamplerYcbcrModelConversion ycbcr_model; - VkSamplerYcbcrRange ycbcr_range; - VkComponentMapping components; - VkChromaLocation chroma_offsets[2]; - VkFilter chroma_filter; +struct tu_sampler +{ }; -struct tu_sampler { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; - struct tu_sampler_ycbcr_conversion *ycbcr_sampler; +struct tu_image_create_info +{ + const VkImageCreateInfo *vk_info; + bool scanout; + bool no_metadata_planes; }; -void -tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); - -void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); - -void -tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -void -tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - -#define tu_image_view_stencil(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) - -#define tu_image_view_depth(iview, x) \ - ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT)) - VkResult -tu_gralloc_info(struct tu_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier); +tu_image_create(VkDevice _device, + const struct tu_image_create_info *info, + const VkAllocationCallbacks *alloc, + VkImage *pImage); VkResult -tu_import_memory_from_gralloc_handle(VkDevice device_h, - int dma_buf, - const VkAllocationCallbacks *alloc, - VkImage image_h); +tu_image_from_gralloc(VkDevice device_h, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info, + const VkAllocationCallbacks *alloc, + VkImage *out_image_h); void -tu_image_view_init(struct tu_image_view *iview, - const VkImageViewCreateInfo *pCreateInfo, - bool limited_z24s8); - -bool -tiling_possible(VkFormat format); - -bool -ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, - const struct fd_dev_info *info, VkSampleCountFlagBits samples); +tu_image_view_init(struct tu_image_view *view, + struct tu_device *device, + const VkImageViewCreateInfo *pCreateInfo); struct tu_buffer_view { - struct vk_object_base base; - - uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; - - struct tu_buffer *buffer; + VkFormat vk_format; + uint64_t range; /**< VkBufferViewCreateInfo::range */ + uint32_t state[4]; }; void tu_buffer_view_init(struct tu_buffer_view *view, struct tu_device *device, const VkBufferViewCreateInfo *pCreateInfo); +static inline struct VkExtent3D +tu_sanitize_image_extent(const VkImageType imageType, + const struct VkExtent3D imageExtent) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkExtent3D) { imageExtent.width, 1, 1 }; + case VK_IMAGE_TYPE_2D: + return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 }; + case VK_IMAGE_TYPE_3D: + return imageExtent; + default: + unreachable("invalid image type"); + } +} + +static inline struct VkOffset3D +tu_sanitize_image_offset(const VkImageType imageType, + const struct VkOffset3D imageOffset) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkOffset3D) { imageOffset.x, 0, 0 }; + case VK_IMAGE_TYPE_2D: + return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 }; + case VK_IMAGE_TYPE_3D: + 
return imageOffset; + default: + unreachable("invalid image type"); + } +} + struct tu_attachment_info { struct tu_image_view *attachment; @@ -1773,146 +1351,100 @@ struct tu_attachment_info struct tu_framebuffer { - struct vk_object_base base; - uint32_t width; uint32_t height; uint32_t layers; - /* size of the first tile */ - VkExtent2D tile0; - /* number of tiles */ - VkExtent2D tile_count; - - /* size of the first VSC pipe */ - VkExtent2D pipe0; - /* number of VSC pipes */ - VkExtent2D pipe_count; - - /* pipe register values */ - uint32_t pipe_config[MAX_VSC_PIPES]; - uint32_t pipe_sizes[MAX_VSC_PIPES]; - uint32_t attachment_count; struct tu_attachment_info attachments[0]; }; -void -tu_framebuffer_tiling_config(struct tu_framebuffer *fb, - const struct tu_device *device, - const struct tu_render_pass *pass); - -struct tu_subpass_barrier { +struct tu_subpass_barrier +{ VkPipelineStageFlags src_stage_mask; - VkPipelineStageFlags dst_stage_mask; VkAccessFlags src_access_mask; VkAccessFlags dst_access_mask; - bool incoherent_ccu_color, incoherent_ccu_depth; }; +void +tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, + const struct tu_subpass_barrier *barrier); + struct tu_subpass_attachment { uint32_t attachment; - - /* For input attachments, true if it needs to be patched to refer to GMEM - * in GMEM mode. This is false if it hasn't already been written as an - * attachment. - */ - bool patch_input_gmem; + VkImageLayout layout; }; struct tu_subpass { uint32_t input_count; uint32_t color_count; - uint32_t resolve_count; - bool resolve_depth_stencil; - - bool feedback_loop_color; - bool feedback_loop_ds; - - /* True if we must invalidate UCHE thanks to a feedback loop. */ - bool feedback_invalidate; - - /* In other words - framebuffer fetch support */ - bool raster_order_attachment_access; - struct tu_subpass_attachment *input_attachments; struct tu_subpass_attachment *color_attachments; struct tu_subpass_attachment *resolve_attachments; struct tu_subpass_attachment depth_stencil_attachment; - VkSampleCountFlagBits samples; - - uint32_t srgb_cntl; - uint32_t multiview_mask; + /** Subpass has at least one resolve attachment */ + bool has_resolve; struct tu_subpass_barrier start_barrier; + + uint32_t view_mask; + VkSampleCountFlagBits max_sample_count; }; struct tu_render_pass_attachment { VkFormat format; uint32_t samples; - uint32_t cpp; - VkImageAspectFlags clear_mask; - uint32_t clear_views; - bool load; - bool store; - int32_t gmem_offset; - /* for D32S8 separate stencil: */ - bool load_stencil; - bool store_stencil; - int32_t gmem_offset_stencil; + VkAttachmentLoadOp load_op; + VkAttachmentLoadOp stencil_load_op; + VkImageLayout initial_layout; + VkImageLayout final_layout; + uint32_t view_mask; }; struct tu_render_pass { - struct vk_object_base base; - uint32_t attachment_count; uint32_t subpass_count; - uint32_t gmem_pixels; - uint32_t tile_align_w; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; }; -#define PERF_CNTRS_REG 4 - -struct tu_perf_query_data -{ - uint32_t gid; /* group-id */ - uint32_t cid; /* countable-id within the group */ - uint32_t cntr_reg; /* counter register within the group */ - uint32_t pass; /* pass index that countables can be requested */ - uint32_t app_idx; /* index provided by apps */ -}; +VkResult +tu_device_init_meta(struct tu_device *device); +void +tu_device_finish_meta(struct tu_device *device); struct tu_query_pool { - struct 
vk_object_base base; - - VkQueryType type; uint32_t stride; + uint32_t availability_offset; uint64_t size; - uint32_t pipeline_statistics; - struct tu_bo *bo; + char *ptr; + VkQueryType type; + uint32_t pipeline_stats_mask; +}; - /* For performance query */ - const struct fd_perfcntr_group *perf_group; - uint32_t perf_group_count; - uint32_t counter_index_count; - struct tu_perf_query_data perf_query_data[0]; +struct tu_semaphore +{ + uint32_t syncobj; + uint32_t temp_syncobj; }; -uint32_t -tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index); +void +tu_set_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct tu_descriptor_set *set, + unsigned idx); void -tu_update_descriptor_sets(const struct tu_device *device, +tu_update_descriptor_sets(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, VkDescriptorSet overrideSet, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, @@ -1921,24 +1453,25 @@ tu_update_descriptor_sets(const struct tu_device *device, void tu_update_descriptor_set_with_template( - const struct tu_device *device, + struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, struct tu_descriptor_set *set, VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData); -VkResult -tu_physical_device_init(struct tu_physical_device *device, - struct tu_instance *instance); -VkResult -tu_enumerate_devices(struct tu_instance *instance); +void +tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites); int -tu_device_get_gpu_timestamp(struct tu_device *dev, - uint64_t *ts); +tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id); int -tu_device_get_suspend_count(struct tu_device *dev, - uint64_t *suspend_count); +tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size); int tu_drm_submitqueue_new(const struct tu_device *dev, @@ -1948,116 +1481,76 @@ tu_drm_submitqueue_new(const struct tu_device *dev, void tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id); +uint32_t +tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags); +uint32_t +tu_gem_import_dmabuf(const struct tu_device *dev, + int prime_fd, + uint64_t size); int -tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2); - -int -tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync); - -VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit); - +tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle); void -tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, - void *ts_from, uint32_t from_offset, - void *ts_to, uint32_t to_offset, - uint32_t count); - - -VkResult -tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, - struct u_trace **trace_copy); - -/* If we copy trace and timestamps we will have to free them. */ -struct tu_u_trace_cmd_data -{ - struct tu_cs *timestamp_copy_cs; - struct u_trace *trace; -}; - -/* Data necessary to retrieve timestamps and clean all - * associated resources afterwards. - */ -struct tu_u_trace_submission_data -{ - uint32_t submission_id; - /* We have to know when timestamps are available, - * this sync object indicates it. 
- */ - struct tu_u_trace_syncobj *syncobj; - - uint32_t cmd_buffer_count; - uint32_t last_buffer_with_tracepoints; - struct tu_u_trace_cmd_data *cmd_trace_data; -}; - -VkResult -tu_u_trace_submission_data_create( - struct tu_device *device, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count, - struct tu_u_trace_submission_data **submission_data); - -void -tu_u_trace_submission_data_finish( - struct tu_device *device, - struct tu_u_trace_submission_data *submission_data); +tu_gem_close(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle); +uint64_t +tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle); + +#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType) _obj; \ + } + +#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \ + \ + static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ + { \ + return (struct __tu_type *) (uintptr_t) _handle; \ + } \ + \ + static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ + { \ + return (__VkType)(uintptr_t) _obj; \ + } #define TU_FROM_HANDLE(__tu_type, __name, __handle) \ - VK_FROM_HANDLE(__tu_type, __name, __handle) - -VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, - VK_OBJECT_TYPE_COMMAND_BUFFER) -VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, - VK_OBJECT_TYPE_INSTANCE) -VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, - VK_OBJECT_TYPE_PHYSICAL_DEVICE) -VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) - -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool, - VK_OBJECT_TYPE_COMMAND_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, - VK_OBJECT_TYPE_BUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, - VK_OBJECT_TYPE_BUFFER_VIEW) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, - VK_OBJECT_TYPE_DESCRIPTOR_POOL) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, - VK_OBJECT_TYPE_DESCRIPTOR_SET) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, - VkDescriptorSetLayout, - VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, - VkDescriptorUpdateTemplate, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, - VK_OBJECT_TYPE_DEVICE_MEMORY) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, - VK_OBJECT_TYPE_FRAMEBUFFER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, - VK_OBJECT_TYPE_IMAGE_VIEW); -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, - VK_OBJECT_TYPE_PIPELINE_CACHE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, - VK_OBJECT_TYPE_PIPELINE) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, - VK_OBJECT_TYPE_PIPELINE_LAYOUT) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, - VK_OBJECT_TYPE_QUERY_POOL) 
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, - VK_OBJECT_TYPE_RENDER_PASS) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, - VK_OBJECT_TYPE_SAMPLER) -VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, - VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) -
-/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
-#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
-
-void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
+   struct __tu_type *__name = __tu_type##_from_handle(__handle)
+
+TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer)
+TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice)
+TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance)
+TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice)
+TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue)
+
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout,
+                               VkDescriptorSetLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template,
+                               VkDescriptorUpdateTemplate)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_fence, VkFence)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView);
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore)
 #endif /* TU_PRIVATE_H */
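
The TU_DEFINE_HANDLE_CASTS / TU_DEFINE_NONDISP_HANDLE_CASTS macros at the end of the imported header generate the conversions between opaque Vulkan handles and the driver's private structs, and TU_FROM_HANDLE declares a typed local from a handle at the top of an entrypoint. The following is a minimal, self-contained sketch of the same pattern; it is not part of the patch, and the demo_* and vk_handle_t names are stand-ins chosen so the example compiles on its own.

/* Standalone illustration of the handle-cast pattern above. */
#include <stdint.h>
#include <stdio.h>

typedef struct vk_device_T *vk_handle_t;   /* stand-in for a dispatchable Vulkan handle */

struct demo_device {
   uint32_t gpu_id;
};

/* Same shape as TU_DEFINE_HANDLE_CASTS: emit _from_handle/_to_handle helpers. */
#define DEMO_DEFINE_HANDLE_CASTS(__type, __VkType)                         \
   static inline struct __type *__type##_from_handle(__VkType _handle)     \
   {                                                                       \
      return (struct __type *) _handle;                                    \
   }                                                                       \
   static inline __VkType __type##_to_handle(struct __type *_obj)          \
   {                                                                       \
      return (__VkType) _obj;                                              \
   }

DEMO_DEFINE_HANDLE_CASTS(demo_device, vk_handle_t)

/* Same shape as TU_FROM_HANDLE: declare a typed local from an opaque handle. */
#define DEMO_FROM_HANDLE(__type, __name, __handle)                         \
   struct __type *__name = __type##_from_handle(__handle)

int main(void)
{
   struct demo_device dev = { .gpu_id = 630 };

   /* An entrypoint receives the opaque handle... */
   vk_handle_t handle = demo_device_to_handle(&dev);

   /* ...and immediately recovers the driver struct, as the tu_* entrypoints do. */
   DEMO_FROM_HANDLE(demo_device, device, handle);
   printf("gpu id: %u\n", device->gpu_id);
   return 0;
}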
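
tu_get_layerCount()/tu_get_levelCount(), shown only in part in the hunks above, resolve the open-ended VK_REMAINING_ARRAY_LAYERS / VK_REMAINING_MIP_LEVELS sentinels of a VkImageSubresourceRange against the image's actual counts. The sketch below mirrors that resolution under the assumption that the sentinel is UINT32_MAX (the value Vulkan defines); the demo_* names are stand-ins, not driver code.

/* Standalone illustration of resolving an open-ended subresource range. */
#include <stdint.h>
#include <assert.h>

#define DEMO_REMAINING UINT32_MAX   /* same value as VK_REMAINING_ARRAY_LAYERS */

struct demo_range {
   uint32_t base_layer;
   uint32_t layer_count;
};

/* Mirror of what tu_get_layerCount() does with VkImageSubresourceRange:
 * an open-ended range covers everything from base_layer to the last layer. */
static uint32_t
demo_get_layer_count(uint32_t image_layers, const struct demo_range *r)
{
   return r->layer_count == DEMO_REMAINING ? image_layers - r->base_layer
                                           : r->layer_count;
}

int main(void)
{
   struct demo_range open_ended = { .base_layer = 2, .layer_count = DEMO_REMAINING };
   struct demo_range explicit_n = { .base_layer = 0, .layer_count = 3 };

   assert(demo_get_layer_count(6, &open_ended) == 4);  /* layers 2..5 */
   assert(demo_get_layer_count(6, &explicit_n) == 3);
   return 0;
}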