author    Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:10:09 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:10:09 +0000
commit    d1e8c371581041f403dcdcff4ab8a88e970d221e (patch)
tree      621cf3eea9401b6fc19ce2a6dc5aa7579ecc8c70
parent    81f619d3e99a3a218e6318d06c2bc1a36052e75d (diff)
Import Mesa 19.2.8
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c                       83
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_private.h                            2165
-rw-r--r--  lib/mesa/src/freedreno/vulkan/vk_format.h                              546
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp    14
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h       3
-rw-r--r--  lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp     1612
-rw-r--r--  lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c                   151
-rw-r--r--  lib/mesa/src/gallium/drivers/panfrost/pan_compute.c                    125
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c               722
9 files changed, 2742 insertions, 2679 deletions
diff --git a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
index 5cfc79fbf..b8b2ceda2 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_pipeline_cache.c
@@ -27,7 +27,6 @@
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
-#include "vulkan/util/vk_util.h"
struct cache_entry_variant_info
{
@@ -44,7 +43,7 @@ struct cache_entry
char code[0];
};
-static void
+void
tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
struct tu_device *device)
{
@@ -67,7 +66,7 @@ tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
memset(cache->hash_table, 0, byte_size);
}
-static void
+void
tu_pipeline_cache_finish(struct tu_pipeline_cache *cache)
{
for (unsigned i = 0; i < cache->table_size; ++i)
@@ -89,6 +88,41 @@ entry_size(struct cache_entry *entry)
return ret;
}
+void
+tu_hash_shaders(unsigned char *hash,
+ const VkPipelineShaderStageCreateInfo **stages,
+ const struct tu_pipeline_layout *layout,
+ const struct tu_pipeline_key *key,
+ uint32_t flags)
+{
+ struct mesa_sha1 ctx;
+
+ _mesa_sha1_init(&ctx);
+ if (key)
+ _mesa_sha1_update(&ctx, key, sizeof(*key));
+ if (layout)
+ _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+
+ for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+ if (stages[i]) {
+ TU_FROM_HANDLE(tu_shader_module, module, stages[i]->module);
+ const VkSpecializationInfo *spec_info =
+ stages[i]->pSpecializationInfo;
+
+ _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
+ _mesa_sha1_update(&ctx, stages[i]->pName, strlen(stages[i]->pName));
+ if (spec_info) {
+ _mesa_sha1_update(
+ &ctx, spec_info->pMapEntries,
+ spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
+ _mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize);
+ }
+ }
+ }
+ _mesa_sha1_update(&ctx, &flags, 4);
+ _mesa_sha1_final(&ctx, hash);
+}
+
static struct cache_entry *
tu_pipeline_cache_search_unlocked(struct tu_pipeline_cache *cache,
const unsigned char *sha1)
@@ -162,7 +196,7 @@ tu_pipeline_cache_grow(struct tu_pipeline_cache *cache)
table = malloc(byte_size);
if (table == NULL)
- return vk_error(cache, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
cache->hash_table = table;
cache->table_size = table_size;
@@ -197,13 +231,22 @@ tu_pipeline_cache_add_entry(struct tu_pipeline_cache *cache,
tu_pipeline_cache_set_entry(cache, entry);
}
-static void
+struct cache_header
+{
+ uint32_t header_size;
+ uint32_t header_version;
+ uint32_t vendor_id;
+ uint32_t device_id;
+ uint8_t uuid[VK_UUID_SIZE];
+};
+
+void
tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
const void *data,
size_t size)
{
struct tu_device *device = cache->device;
- struct vk_pipeline_cache_header header;
+ struct cache_header header;
if (size < sizeof(header))
return;
@@ -212,9 +255,9 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
return;
if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
return;
- if (header.vendor_id != 0x5143)
+ if (header.vendor_id != 0 /* TODO */)
return;
- if (header.device_id != device->physical_device->dev_id.chip_id)
+ if (header.device_id != 0 /* TODO */)
return;
if (memcmp(header.uuid, device->physical_device->cache_uuid,
VK_UUID_SIZE) != 0)
@@ -242,7 +285,7 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
}
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreatePipelineCache(VkDevice _device,
const VkPipelineCacheCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -254,15 +297,15 @@ tu_CreatePipelineCache(VkDevice _device,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
assert(pCreateInfo->flags == 0);
- cache = vk_object_alloc(&device->vk, pAllocator, sizeof(*cache),
- VK_OBJECT_TYPE_PIPELINE_CACHE);
+ cache = vk_alloc2(&device->alloc, pAllocator, sizeof(*cache), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cache == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
if (pAllocator)
cache->alloc = *pAllocator;
else
- cache->alloc = device->vk.alloc;
+ cache->alloc = device->alloc;
tu_pipeline_cache_init(cache, device);
@@ -276,7 +319,7 @@ tu_CreatePipelineCache(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyPipelineCache(VkDevice _device,
VkPipelineCache _cache,
const VkAllocationCallbacks *pAllocator)
@@ -288,10 +331,10 @@ tu_DestroyPipelineCache(VkDevice _device,
return;
tu_pipeline_cache_finish(cache);
- vk_object_free(&device->vk, pAllocator, cache);
+ vk_free2(&device->alloc, pAllocator, cache);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetPipelineCacheData(VkDevice _device,
VkPipelineCache _cache,
size_t *pDataSize,
@@ -299,7 +342,7 @@ tu_GetPipelineCacheData(VkDevice _device,
{
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_pipeline_cache, cache, _cache);
- struct vk_pipeline_cache_header *header;
+ struct cache_header *header;
VkResult result = VK_SUCCESS;
pthread_mutex_lock(&cache->mutex);
@@ -319,8 +362,8 @@ tu_GetPipelineCacheData(VkDevice _device,
header = p;
header->header_size = sizeof(*header);
header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
- header->vendor_id = 0x5143;
- header->device_id = device->physical_device->dev_id.chip_id;
+ header->vendor_id = 0 /* TODO */;
+ header->device_id = 0 /* TODO */;
memcpy(header->uuid, device->physical_device->cache_uuid, VK_UUID_SIZE);
p += header->header_size;
@@ -361,7 +404,7 @@ tu_pipeline_cache_merge(struct tu_pipeline_cache *dst,
}
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_MergePipelineCaches(VkDevice _device,
VkPipelineCache destCache,
uint32_t srcCacheCount,
diff --git a/lib/mesa/src/freedreno/vulkan/tu_private.h b/lib/mesa/src/freedreno/vulkan/tu_private.h
index 862d507c9..c2440471f 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_private.h
+++ b/lib/mesa/src/freedreno/vulkan/tu_private.h
@@ -40,47 +40,28 @@
#include <valgrind.h>
#define VG(x) x
#else
-#define VG(x) ((void)0)
+#define VG(x)
#endif
-#define MESA_LOG_TAG "TU"
-
#include "c11/threads.h"
-#include "util/rounding.h"
-#include "util/bitscan.h"
+#include "compiler/shader_enums.h"
+#include "main/macros.h"
#include "util/list.h"
-#include "util/log.h"
#include "util/macros.h"
-#include "util/sparse_array.h"
-#include "util/u_atomic.h"
-#include "util/u_dynarray.h"
-#include "util/xmlconfig.h"
-#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
-#include "vk_device.h"
-#include "vk_dispatch_table.h"
-#include "vk_extensions.h"
-#include "vk_instance.h"
-#include "vk_log.h"
-#include "vk_physical_device.h"
-#include "vk_shader_module.h"
#include "wsi_common.h"
+#include "drm-uapi/msm_drm.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
-#include "fdl/freedreno_layout.h"
-#include "common/freedreno_dev_info.h"
-#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
-#include "tu_autotune.h"
-#include "tu_util.h"
-#include "tu_perfetto.h"
+#include "tu_extensions.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@@ -92,54 +73,143 @@ typedef uint32_t xcb_window_t;
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>
+#include <vulkan/vulkan_intel.h>
#include "tu_entrypoints.h"
-#include "vk_format.h"
-#include "vk_image.h"
-#include "vk_command_buffer.h"
-#include "vk_command_pool.h"
-#include "vk_queue.h"
-#include "vk_object.h"
-#include "vk_sync.h"
-#include "vk_fence.h"
-#include "vk_semaphore.h"
-#include "vk_drm_syncobj.h"
-#include "vk_sync_timeline.h"
-
#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
-#define MAX_VIEWPORTS 16
-#define MAX_VIEWPORT_SIZE (1 << 14)
+#define MAX_VIEWPORTS 1
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
-#define MAX_DYNAMIC_BUFFERS_SIZE \
- (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
- A6XX_TEX_CONST_DWORDS
-
+#define MAX_DYNAMIC_BUFFERS \
+ (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS)
+#define MAX_SAMPLES_LOG2 4
+#define NUM_META_FS_KEYS 13
#define TU_MAX_DRM_DEVICES 8
-#define MAX_VIEWS 16
-#define MAX_BIND_POINTS 2 /* compute + graphics */
-/* The Qualcomm driver exposes 0x20000058 */
-#define MAX_STORAGE_BUFFER_RANGE 0x20000000
-/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
- * expose the same maximum range.
- * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
- * range might be higher.
+#define MAX_VIEWS 8
+
+#define NUM_DEPTH_CLEAR_PIPELINES 3
+
+/*
+ * This is the point we switch from using CP to compute shader
+ * for certain buffer operations.
*/
-#define MAX_UNIFORM_BUFFER_RANGE 0x10000
+#define TU_BUFFER_OPS_CS_THRESHOLD 4096
+
+enum tu_mem_heap
+{
+ TU_MEM_HEAP_VRAM,
+ TU_MEM_HEAP_VRAM_CPU_ACCESS,
+ TU_MEM_HEAP_GTT,
+ TU_MEM_HEAP_COUNT
+};
+
+enum tu_mem_type
+{
+ TU_MEM_TYPE_VRAM,
+ TU_MEM_TYPE_GTT_WRITE_COMBINE,
+ TU_MEM_TYPE_VRAM_CPU_ACCESS,
+ TU_MEM_TYPE_GTT_CACHED,
+ TU_MEM_TYPE_COUNT
+};
+
+#define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
+
+static inline uint32_t
+align_u32(uint32_t v, uint32_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
+
+static inline uint32_t
+align_u32_npot(uint32_t v, uint32_t a)
+{
+ return (v + a - 1) / a * a;
+}
+
+static inline uint64_t
+align_u64(uint64_t v, uint64_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
+
+static inline int32_t
+align_i32(int32_t v, int32_t a)
+{
+ assert(a != 0 && a == (a & -a));
+ return (v + a - 1) & ~(a - 1);
+}
-#define A6XX_TEX_CONST_DWORDS 16
-#define A6XX_TEX_SAMP_DWORDS 4
+/** Alignment must be a power of 2. */
+static inline bool
+tu_is_aligned(uintmax_t n, uintmax_t a)
+{
+ assert(a == (a & -a));
+ return (n & (a - 1)) == 0;
+}
-#define COND(bool, val) ((bool) ? (val) : 0)
-#define BIT(bit) (1u << (bit))
+static inline uint32_t
+round_up_u32(uint32_t v, uint32_t a)
+{
+ return (v + a - 1) / a;
+}
+
+static inline uint64_t
+round_up_u64(uint64_t v, uint64_t a)
+{
+ return (v + a - 1) / a;
+}
+
+static inline uint32_t
+tu_minify(uint32_t n, uint32_t levels)
+{
+ if (unlikely(n == 0))
+ return 0;
+ else
+ return MAX2(n >> levels, 1);
+}
+static inline float
+tu_clamp_f(float f, float min, float max)
+{
+ assert(min < max);
+
+ if (f > max)
+ return max;
+ else if (f < min)
+ return min;
+ else
+ return f;
+}
+
+static inline bool
+tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
+{
+ if (*inout_mask & clear_mask) {
+ *inout_mask &= ~clear_mask;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+#define for_each_bit(b, dword) \
+ for (uint32_t __dword = (dword); \
+ (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1 << (b)))
+
+#define typed_memcpy(dest, src, count) \
+ ({ \
+ STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \
+ memcpy((dest), (src), (count) * sizeof(*(src))); \
+ })
/* Whenever we generate an error, pass it through this function. Useful for
* debugging, where we can break on it. Only call at error site, not when
@@ -149,25 +219,29 @@ typedef uint32_t xcb_window_t;
struct tu_instance;
VkResult
-__vk_startup_errorf(struct tu_instance *instance,
- VkResult error,
- bool force_print,
- const char *file,
- int line,
- const char *format,
- ...) PRINTFLIKE(6, 7);
-
-/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
- * build.
- */
-#define vk_startup_errorf(instance, error, format, ...) \
- __vk_startup_errorf(instance, error, \
- instance->debug_flags & TU_DEBUG_STARTUP, \
- __FILE__, __LINE__, format, ##__VA_ARGS__)
+__vk_errorf(struct tu_instance *instance,
+ VkResult error,
+ const char *file,
+ int line,
+ const char *format,
+ ...);
+
+#define vk_error(instance, error) \
+ __vk_errorf(instance, error, __FILE__, __LINE__, NULL);
+#define vk_errorf(instance, error, format, ...) \
+ __vk_errorf(instance, error, __FILE__, __LINE__, format, ##__VA_ARGS__);
void
__tu_finishme(const char *file, int line, const char *format, ...)
- PRINTFLIKE(3, 4);
+ tu_printflike(3, 4);
+void
+tu_loge(const char *format, ...) tu_printflike(1, 2);
+void
+tu_loge_v(const char *format, va_list va);
+void
+tu_logi(const char *format, ...) tu_printflike(1, 2);
+void
+tu_logi_v(const char *format, va_list va);
/**
* Print a FINISHME message, including its source location.
@@ -181,35 +255,46 @@ __tu_finishme(const char *file, int line, const char *format, ...)
} \
} while (0)
+/* A non-fatal assert. Useful for debugging. */
+#ifdef DEBUG
+#define tu_assert(x) \
+ ({ \
+ if (unlikely(!(x))) \
+ fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \
+ })
+#else
+#define tu_assert(x)
+#endif
+
+/* Suppress -Wunused in stub functions */
+#define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__)
+static inline void
+__tu_use_args(int ignore, ...)
+{
+}
+
#define tu_stub() \
do { \
tu_finishme("stub %s", __func__); \
} while (0)
-struct tu_memory_heap {
- /* Standard bits passed on to the client */
- VkDeviceSize size;
- VkMemoryHeapFlags flags;
-
- /** Copied from ANV:
- *
- * Driver-internal book-keeping.
- *
- * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
- */
- VkDeviceSize used __attribute__ ((aligned (8)));
-};
-
-uint64_t
-tu_get_system_heap_size(void);
+void *
+tu_lookup_entrypoint_unchecked(const char *name);
+void *
+tu_lookup_entrypoint_checked(
+ const char *name,
+ uint32_t core_version,
+ const struct tu_instance_extension_table *instance,
+ const struct tu_device_extension_table *device);
struct tu_physical_device
{
- struct vk_physical_device vk;
+ VK_LOADER_DATA _loader_data;
struct tu_instance *instance;
- const char *name;
+ char path[20];
+ char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t cache_uuid[VK_UUID_SIZE];
@@ -217,71 +302,43 @@ struct tu_physical_device
struct wsi_device wsi_device;
int local_fd;
- bool has_local;
- int64_t local_major;
- int64_t local_minor;
int master_fd;
- bool has_master;
- int64_t master_major;
- int64_t master_minor;
+ unsigned gpu_id;
uint32_t gmem_size;
- uint64_t gmem_base;
- uint32_t ccu_offset_gmem;
- uint32_t ccu_offset_bypass;
-
- struct fd_dev_id dev_id;
- const struct fd_dev_info *info;
-
- int msm_major_version;
- int msm_minor_version;
-
- /* Address space and global fault count for this local_fd with DRM backend */
- uint64_t fault_count;
+ uint32_t tile_align_w;
+ uint32_t tile_align_h;
/* This is the drivers on-disk cache used as a fallback as opposed to
* the pipeline cache defined by apps.
*/
struct disk_cache *disk_cache;
- struct tu_memory_heap heap;
-
- struct vk_sync_type syncobj_type;
- struct vk_sync_timeline_type timeline_type;
- const struct vk_sync_type *sync_types[3];
+ struct tu_device_extension_table supported_extensions;
};
enum tu_debug_flags
{
TU_DEBUG_STARTUP = 1 << 0,
TU_DEBUG_NIR = 1 << 1,
- TU_DEBUG_NOBIN = 1 << 3,
- TU_DEBUG_SYSMEM = 1 << 4,
- TU_DEBUG_FORCEBIN = 1 << 5,
- TU_DEBUG_NOUBWC = 1 << 6,
- TU_DEBUG_NOMULTIPOS = 1 << 7,
- TU_DEBUG_NOLRZ = 1 << 8,
- TU_DEBUG_PERFC = 1 << 9,
- TU_DEBUG_FLUSHALL = 1 << 10,
- TU_DEBUG_SYNCDRAW = 1 << 11,
- TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
- TU_DEBUG_GMEM = 1 << 13,
- TU_DEBUG_RAST_ORDER = 1 << 14,
- TU_DEBUG_UNALIGNED_STORE = 1 << 15,
+ TU_DEBUG_IR3 = 1 << 2,
};
struct tu_instance
{
- struct vk_instance vk;
+ VK_LOADER_DATA _loader_data;
+
+ VkAllocationCallbacks alloc;
uint32_t api_version;
int physical_device_count;
struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
- struct driOptionCache dri_options;
- struct driOptionCache available_dri_options;
-
enum tu_debug_flags debug_flags;
+
+ struct vk_debug_report_instance debug_report_callbacks;
+
+ struct tu_instance_extension_table enabled_extensions;
};
VkResult
@@ -297,19 +354,10 @@ bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
-enum tu_bo_alloc_flags
-{
- TU_BO_ALLOC_NO_FLAGS = 0,
- TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
- TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
-};
-
struct cache_entry;
struct tu_pipeline_cache
{
- struct vk_object_base base;
-
struct tu_device *device;
pthread_mutex_t mutex;
@@ -326,313 +374,115 @@ struct tu_pipeline_key
{
};
+void
+tu_pipeline_cache_init(struct tu_pipeline_cache *cache,
+ struct tu_device *device);
+void
+tu_pipeline_cache_finish(struct tu_pipeline_cache *cache);
+void
+tu_pipeline_cache_load(struct tu_pipeline_cache *cache,
+ const void *data,
+ size_t size);
-/* queue types */
-#define TU_QUEUE_GENERAL 0
-
-#define TU_MAX_QUEUE_FAMILIES 1
-
-/* Keep tu_syncobj until porting to common code for kgsl too */
-#ifdef TU_USE_KGSL
-struct tu_syncobj;
-#endif
-struct tu_u_trace_syncobj;
-
-/* Define tu_timeline_sync type based on drm syncobj for a point type
- * for vk_sync_timeline, and the logic to handle is mostly copied from
- * anv_bo_sync since it seems it can be used by similar way to anv.
- */
-enum tu_timeline_sync_state {
- /** Indicates that this is a new (or newly reset fence) */
- TU_TIMELINE_SYNC_STATE_RESET,
-
- /** Indicates that this fence has been submitted to the GPU but is still
- * (as far as we know) in use by the GPU.
- */
- TU_TIMELINE_SYNC_STATE_SUBMITTED,
-
- TU_TIMELINE_SYNC_STATE_SIGNALED,
-};
-
-struct tu_timeline_sync {
- struct vk_sync base;
-
- enum tu_timeline_sync_state state;
- uint32_t syncobj;
-};
-
-struct tu_queue
-{
- struct vk_queue vk;
-
- struct tu_device *device;
+struct tu_shader_variant;
- uint32_t msm_queue_id;
- int fence;
-};
+bool
+tu_create_shader_variants_from_pipeline_cache(
+ struct tu_device *device,
+ struct tu_pipeline_cache *cache,
+ const unsigned char *sha1,
+ struct tu_shader_variant **variants);
-struct tu_bo
+void
+tu_pipeline_cache_insert_shaders(struct tu_device *device,
+ struct tu_pipeline_cache *cache,
+ const unsigned char *sha1,
+ struct tu_shader_variant **variants,
+ const void *const *codes,
+ const unsigned *code_sizes);
+
+struct tu_meta_state
{
- uint32_t gem_handle;
- uint64_t size;
- uint64_t iova;
- void *map;
- int32_t refcnt;
-
-#ifndef TU_USE_KGSL
- uint32_t bo_list_idx;
-#endif
+ VkAllocationCallbacks alloc;
- bool implicit_sync : 1;
+ struct tu_pipeline_cache cache;
};
-/* externally-synchronized BO suballocator. */
-struct tu_suballocator
-{
- struct tu_device *dev;
-
- uint32_t default_size;
- enum tu_bo_alloc_flags flags;
-
- /** Current BO we're suballocating out of. */
- struct tu_bo *bo;
- uint32_t next_offset;
+/* queue types */
+#define TU_QUEUE_GENERAL 0
- /** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
- struct tu_bo *cached_bo;
-};
+#define TU_MAX_QUEUE_FAMILIES 1
-struct tu_suballoc_bo
+struct tu_fence
{
- struct tu_bo *bo;
- uint64_t iova;
- uint32_t size; /* bytes */
+ bool signaled;
+ int fd;
};
void
-tu_bo_suballocator_init(struct tu_suballocator *suballoc,
- struct tu_device *dev,
- uint32_t default_size,
- uint32_t flags);
+tu_fence_init(struct tu_fence *fence, bool signaled);
void
-tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
-
-VkResult
-tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
- uint32_t size, uint32_t align);
-
-void *
-tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
-
+tu_fence_finish(struct tu_fence *fence);
void
-tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
-
-enum global_shader {
- GLOBAL_SH_VS_BLIT,
- GLOBAL_SH_VS_CLEAR,
- GLOBAL_SH_FS_BLIT,
- GLOBAL_SH_FS_BLIT_ZSCALE,
- GLOBAL_SH_FS_COPY_MS,
- GLOBAL_SH_FS_CLEAR0,
- GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
- GLOBAL_SH_COUNT,
-};
-
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
- /* Points into GPU memory */
- struct tu_renderpass_samples* samples;
-
- struct tu_suballoc_bo bo;
-
- /*
- * Below here, only used internally within autotune
- */
- uint64_t rp_key;
- struct tu_renderpass_history *history;
- struct list_head node;
- uint32_t fence;
- uint64_t samples_passed;
-};
-
-#define TU_BORDER_COLOR_COUNT 4096
-#define TU_BORDER_COLOR_BUILTIN 6
-
-#define TU_BLIT_SHADER_SIZE 1024
+tu_fence_update_fd(struct tu_fence *fence, int fd);
+void
+tu_fence_copy(struct tu_fence *fence, const struct tu_fence *src);
+void
+tu_fence_signal(struct tu_fence *fence);
+void
+tu_fence_wait_idle(struct tu_fence *fence);
-/* This struct defines the layout of the global_bo */
-struct tu6_global
+struct tu_queue
{
- /* clear/blit shaders */
- uint32_t shaders[TU_BLIT_SHADER_SIZE];
-
- uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
- uint32_t _pad0;
- volatile uint32_t vsc_draw_overflow;
- uint32_t _pad1;
- volatile uint32_t vsc_prim_overflow;
- uint32_t _pad2;
- uint64_t predicate;
-
- /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
- struct {
- uint32_t offset;
- uint32_t pad[7];
- } flush_base[4];
-
- ALIGN16 uint32_t cs_indirect_xyz[3];
-
- /* To know when renderpass stats for autotune are valid */
- volatile uint32_t autotune_fence;
+ VK_LOADER_DATA _loader_data;
+ struct tu_device *device;
+ uint32_t queue_family_index;
+ int queue_idx;
+ VkDeviceQueueCreateFlags flags;
- /* note: larger global bo will be used for customBorderColors */
- struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
+ uint32_t msm_queue_id;
+ struct tu_fence submit_fence;
};
-#define gb_offset(member) offsetof(struct tu6_global, member)
-#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
-
-/* extra space in vsc draw/prim streams */
-#define VSC_PAD 0x40
struct tu_device
{
- struct vk_device vk;
+ VK_LOADER_DATA _loader_data;
+
+ VkAllocationCallbacks alloc;
+
struct tu_instance *instance;
+ struct tu_meta_state meta_state;
+
struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
int queue_count[TU_MAX_QUEUE_FAMILIES];
struct tu_physical_device *physical_device;
- int fd;
struct ir3_compiler *compiler;
/* Backup in-memory cache to be used if the app doesn't provide one */
struct tu_pipeline_cache *mem_cache;
-#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
-
- /* Currently the kernel driver uses a 32-bit GPU address space, but it
- * should be impossible to go beyond 48 bits.
- */
- struct {
- struct tu_bo *bo;
- mtx_t construct_mtx;
- bool initialized;
- } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
-
- struct tu_bo *global_bo;
-
- uint32_t implicit_sync_bo_count;
-
- /* Device-global BO suballocator for reducing BO management overhead for
- * (read-only) pipeline state. Synchronized by pipeline_mutex.
- */
- struct tu_suballocator pipeline_suballoc;
- mtx_t pipeline_mutex;
-
- /* Device-global BO suballocator for reducing BO management for small
- * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
- */
- struct tu_suballocator autotune_suballoc;
- mtx_t autotune_mutex;
-
- /* the blob seems to always use 8K factor and 128K param sizes, copy them */
-#define TU_TESS_FACTOR_SIZE (8 * 1024)
-#define TU_TESS_PARAM_SIZE (128 * 1024)
-#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
- /* Lazily allocated, protected by the device mutex. */
- struct tu_bo *tess_bo;
-
- struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
- uint64_t global_shader_va[GLOBAL_SH_COUNT];
-
- uint32_t vsc_draw_strm_pitch;
- uint32_t vsc_prim_strm_pitch;
- BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
- mtx_t mutex;
-
- /* bo list for submits: */
- struct drm_msm_gem_submit_bo *bo_list;
- /* map bo handles to bo list index: */
- uint32_t bo_count, bo_list_size;
- mtx_t bo_mutex;
- /* protects imported BOs creation/freeing */
- struct u_rwlock dma_bo_lock;
-
- /* This array holds all our 'struct tu_bo' allocations. We use this
- * so we can add a refcount to our BOs and check if a particular BO
- * was already allocated in this device using its GEM handle. This is
- * necessary to properly manage BO imports, because the kernel doesn't
- * refcount the underlying BO memory.
- *
- * Specifically, when self-importing (i.e. importing a BO into the same
- * device that created it), the kernel will give us the same BO handle
- * for both BOs and we must only free it once when both references are
- * freed. Otherwise, if we are not self-importing, we get two different BO
- * handles, and we want to free each one individually.
- *
- * The refcount is also useful for being able to maintain BOs across
- * VK object lifetimes, such as pipelines suballocating out of BOs
- * allocated on the device.
- */
- struct util_sparse_array bo_map;
-
- /* Command streams to set pass index to a scratch reg */
- struct tu_cs *perfcntrs_pass_cs;
- struct tu_cs_entry *perfcntrs_pass_cs_entries;
-
- /* Condition variable for timeline semaphore to notify waiters when a
- * new submit is executed. */
- pthread_cond_t timeline_cond;
- pthread_mutex_t submit_mutex;
-
- struct tu_autotune autotune;
-
-#ifdef ANDROID
- const void *gralloc;
- enum {
- TU_GRALLOC_UNKNOWN,
- TU_GRALLOC_CROS,
- TU_GRALLOC_OTHER,
- } gralloc_type;
-#endif
+ struct list_head shader_slabs;
+ mtx_t shader_slab_mutex;
- uint32_t submit_count;
-
- struct u_trace_context trace_context;
-
- #ifdef HAVE_PERFETTO
- struct tu_perfetto_state perfetto;
- #endif
+ struct tu_device_extension_table enabled_extensions;
};
-void tu_init_clear_blit_shaders(struct tu_device *dev);
-
-void tu_destroy_clear_blit_shaders(struct tu_device *dev);
-
-VkResult
-tu_device_submit_deferred_locked(struct tu_device *dev);
-
-VkResult
-tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
-
-uint64_t
-tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
-
-VkResult
-tu_device_check_status(struct vk_device *vk_device);
+struct tu_bo
+{
+ uint32_t gem_handle;
+ uint64_t size;
+ uint64_t iova;
+ void *map;
+};
VkResult
-tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
- enum tu_bo_alloc_flags flags);
+tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
- struct tu_bo **bo,
+ struct tu_bo *bo,
uint64_t size,
int fd);
int
@@ -642,28 +492,6 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
-static inline struct tu_bo *
-tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
-{
- return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
-}
-
-static inline struct tu_bo *
-tu_bo_get_ref(struct tu_bo *bo)
-{
- p_atomic_inc(&bo->refcnt);
- return bo;
-}
-
-/* Get a scratch bo for use inside a command buffer. This will always return
- * the same bo given the same size or similar sizes, so only one scratch bo
- * can be used at the same time. It's meant for short-lived things where we
- * need to write to some piece of memory, read from it, and then immediately
- * discard it.
- */
-VkResult
-tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
-
struct tu_cs_entry
{
/* No ownership */
@@ -673,58 +501,6 @@ struct tu_cs_entry
uint32_t offset;
};
-struct tu_cs_memory {
- uint32_t *map;
- uint64_t iova;
-};
-
-struct tu_draw_state {
- uint64_t iova : 48;
- uint32_t size : 16;
-};
-
-enum tu_dynamic_state
-{
- /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
- TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
- TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
- TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
- TU_DYNAMIC_STATE_VB_STRIDE,
- TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
- TU_DYNAMIC_STATE_COUNT,
- /* no associated draw state: */
- TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
- TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
- /* re-use the line width enum as it uses GRAS_SU_CNTL: */
- TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
-};
-
-enum tu_draw_state_group_id
-{
- TU_DRAW_STATE_PROGRAM_CONFIG,
- TU_DRAW_STATE_PROGRAM,
- TU_DRAW_STATE_PROGRAM_BINNING,
- TU_DRAW_STATE_VB,
- TU_DRAW_STATE_VI,
- TU_DRAW_STATE_VI_BINNING,
- TU_DRAW_STATE_RAST,
- TU_DRAW_STATE_BLEND,
- TU_DRAW_STATE_SHADER_GEOM_CONST,
- TU_DRAW_STATE_FS_CONST,
- TU_DRAW_STATE_DESC_SETS,
- TU_DRAW_STATE_DESC_SETS_LOAD,
- TU_DRAW_STATE_VS_PARAMS,
- TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
- TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
- TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
- TU_DRAW_STATE_PRIM_MODE_GMEM,
- TU_DRAW_STATE_PRIM_MODE_SYSMEM,
-
- /* dynamic state related draw states */
- TU_DRAW_STATE_DYNAMIC,
- TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
-};
-
enum tu_cs_mode
{
@@ -765,7 +541,6 @@ struct tu_cs
uint32_t *reserved_end;
uint32_t *end;
- struct tu_device *device;
enum tu_cs_mode mode;
uint32_t next_bo_size;
@@ -776,20 +551,20 @@ struct tu_cs
struct tu_bo **bos;
uint32_t bo_count;
uint32_t bo_capacity;
-
- /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
- struct tu_bo *refcount_bo;
-
- /* state for cond_exec_start/cond_exec_end */
- uint32_t cond_flags;
- uint32_t *cond_dwords;
};
struct tu_device_memory
{
- struct vk_object_base base;
+ struct tu_bo bo;
+ VkDeviceSize size;
- struct tu_bo *bo;
+ /* for dedicated allocations */
+ struct tu_image *image;
+ struct tu_buffer *buffer;
+
+ uint32_t type_index;
+ void *map;
+ void *user_ptr;
};
struct tu_descriptor_range
@@ -800,19 +575,18 @@ struct tu_descriptor_range
struct tu_descriptor_set
{
- struct vk_object_base base;
-
- /* Link to descriptor pool's desc_sets list . */
- struct list_head pool_link;
-
- struct tu_descriptor_set_layout *layout;
- struct tu_descriptor_pool *pool;
+ const struct tu_descriptor_set_layout *layout;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
+ struct tu_descriptor_range *dynamic_descriptors;
+};
- uint32_t *dynamic_descriptors;
+struct tu_push_descriptor_set
+{
+ struct tu_descriptor_set set;
+ uint32_t capacity;
};
struct tu_descriptor_pool_entry
@@ -824,18 +598,13 @@ struct tu_descriptor_pool_entry
struct tu_descriptor_pool
{
- struct vk_object_base base;
-
- struct tu_bo *bo;
+ uint8_t *mapped_ptr;
uint64_t current_offset;
uint64_t size;
uint8_t *host_memory_base;
uint8_t *host_memory_ptr;
uint8_t *host_memory_end;
- uint8_t *host_bo;
-
- struct list_head desc_sets;
uint32_t entry_count;
uint32_t max_entry_count;
@@ -866,13 +635,11 @@ struct tu_descriptor_update_template_entry
size_t src_stride;
/* For push descriptors */
- const struct tu_sampler *immutable_samplers;
+ const uint32_t *immutable_samplers;
};
struct tu_descriptor_update_template
{
- struct vk_object_base base;
-
uint32_t entry_count;
VkPipelineBindPoint bind_point;
struct tu_descriptor_update_template_entry entry[0];
@@ -880,257 +647,175 @@ struct tu_descriptor_update_template
struct tu_buffer
{
- struct vk_object_base base;
-
VkDeviceSize size;
VkBufferUsageFlags usage;
VkBufferCreateFlags flags;
struct tu_bo *bo;
- uint64_t iova;
+ VkDeviceSize bo_offset;
};
-const char *
-tu_get_debug_option_name(int id);
-
-const char *
-tu_get_perftest_option_name(int id);
+enum tu_dynamic_state_bits
+{
+ TU_DYNAMIC_VIEWPORT = 1 << 0,
+ TU_DYNAMIC_SCISSOR = 1 << 1,
+ TU_DYNAMIC_LINE_WIDTH = 1 << 2,
+ TU_DYNAMIC_DEPTH_BIAS = 1 << 3,
+ TU_DYNAMIC_BLEND_CONSTANTS = 1 << 4,
+ TU_DYNAMIC_DEPTH_BOUNDS = 1 << 5,
+ TU_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6,
+ TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7,
+ TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8,
+ TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9,
+ TU_DYNAMIC_ALL = (1 << 10) - 1,
+};
+
+struct tu_vertex_binding
+{
+ struct tu_buffer *buffer;
+ VkDeviceSize offset;
+};
-struct tu_descriptor_state
+struct tu_viewport_state
{
- struct tu_descriptor_set *sets[MAX_SETS];
- struct tu_descriptor_set push_set;
- uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
+ uint32_t count;
+ VkViewport viewports[MAX_VIEWPORTS];
};
-enum tu_cmd_dirty_bits
+struct tu_scissor_state
{
- TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
- TU_CMD_DIRTY_VB_STRIDE = BIT(1),
- TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
- TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
- TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
- TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
- TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
- TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
- TU_CMD_DIRTY_LRZ = BIT(8),
- TU_CMD_DIRTY_VS_PARAMS = BIT(9),
- TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
- TU_CMD_DIRTY_VIEWPORTS = BIT(11),
- /* all draw states were disabled and need to be re-enabled: */
- TU_CMD_DIRTY_DRAW_STATE = BIT(12)
+ uint32_t count;
+ VkRect2D scissors[MAX_SCISSORS];
};
-/* There are only three cache domains we have to care about: the CCU, or
- * color cache unit, which is used for color and depth/stencil attachments
- * and copy/blit destinations, and is split conceptually into color and depth,
- * and the universal cache or UCHE which is used for pretty much everything
- * else, except for the CP (uncached) and host. We need to flush whenever data
- * crosses these boundaries.
- */
+struct tu_discard_rectangle_state
+{
+ uint32_t count;
+ VkRect2D rectangles[MAX_DISCARD_RECTANGLES];
+};
-enum tu_cmd_access_mask {
- TU_ACCESS_UCHE_READ = 1 << 0,
- TU_ACCESS_UCHE_WRITE = 1 << 1,
- TU_ACCESS_CCU_COLOR_READ = 1 << 2,
- TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
- TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
- TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
-
- /* Experiments have shown that while it's safe to avoid flushing the CCU
- * after each blit/renderpass, it's not safe to assume that subsequent
- * lookups with a different attachment state will hit unflushed cache
- * entries. That is, the CCU needs to be flushed and possibly invalidated
- * when accessing memory with a different attachment state. Writing to an
- * attachment under the following conditions after clearing using the
- * normal 2d engine path is known to have issues:
- *
- * - It isn't the 0'th layer.
- * - There are more than one attachment, and this isn't the 0'th attachment
- * (this seems to also depend on the cpp of the attachments).
- *
- * Our best guess is that the layer/MRT state is used when computing
- * the location of a cache entry in CCU, to avoid conflicts. We assume that
- * any access in a renderpass after or before an access by a transfer needs
- * a flush/invalidate, and use the _INCOHERENT variants to represent access
- * by a renderpass.
+struct tu_dynamic_state
+{
+ /**
+ * Bitmask of (1 << VK_DYNAMIC_STATE_*).
+ * Defines the set of saved dynamic state.
*/
- TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
- TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
- TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
- TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
+ uint32_t mask;
- /* Accesses which bypasses any cache. e.g. writes via the host,
- * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
- */
- TU_ACCESS_SYSMEM_READ = 1 << 10,
- TU_ACCESS_SYSMEM_WRITE = 1 << 11,
+ struct tu_viewport_state viewport;
- /* Memory writes from the CP start in-order with draws and event writes,
- * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
- */
- TU_ACCESS_CP_WRITE = 1 << 12,
-
- TU_ACCESS_READ =
- TU_ACCESS_UCHE_READ |
- TU_ACCESS_CCU_COLOR_READ |
- TU_ACCESS_CCU_DEPTH_READ |
- TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
- TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
- TU_ACCESS_SYSMEM_READ,
-
- TU_ACCESS_WRITE =
- TU_ACCESS_UCHE_WRITE |
- TU_ACCESS_CCU_COLOR_WRITE |
- TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
- TU_ACCESS_CCU_DEPTH_WRITE |
- TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
- TU_ACCESS_SYSMEM_WRITE |
- TU_ACCESS_CP_WRITE,
-
- TU_ACCESS_ALL =
- TU_ACCESS_READ |
- TU_ACCESS_WRITE,
-};
+ struct tu_scissor_state scissor;
-/* Starting with a6xx, the pipeline is split into several "clusters" (really
- * pipeline stages). Each stage has its own pair of register banks and can
- * switch them independently, so that earlier stages can run ahead of later
- * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
- * the same time.
- *
- * As a result of this, we need to insert a WFI when an earlier stage depends
- * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
- * pending WFI's to complete before starting, and usually before reading
- * indirect params even, so a WFI also acts as a full "pipeline stall".
- *
- * Note, the names of the stages come from CLUSTER_* in devcoredump. We
- * include all the stages for completeness, even ones which do not read/write
- * anything.
- */
+ float line_width;
-enum tu_stage {
- /* This doesn't correspond to a cluster, but we need it for tracking
- * indirect draw parameter reads etc.
- */
- TU_STAGE_CP,
+ struct
+ {
+ float bias;
+ float clamp;
+ float slope;
+ } depth_bias;
- /* - Fetch index buffer
- * - Fetch vertex attributes, dispatch VS
- */
- TU_STAGE_FE,
+ float blend_constants[4];
- /* Execute all geometry stages (VS thru GS) */
- TU_STAGE_SP_VS,
+ struct
+ {
+ float min;
+ float max;
+ } depth_bounds;
- /* Write to VPC, do primitive assembly. */
- TU_STAGE_PC_VS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_compare_mask;
- /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
- * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
- * early depth testing is enabled before dispatching fragments? However
- * GRAS reads and writes LRZ directly.
- */
- TU_STAGE_GRAS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_write_mask;
- /* Execute FS */
- TU_STAGE_SP_PS,
+ struct
+ {
+ uint32_t front;
+ uint32_t back;
+ } stencil_reference;
- /* - Fragment tests
- * - Write color/depth
- * - Streamout writes (???)
- * - Varying interpolation (???)
- */
- TU_STAGE_PS,
+ struct tu_discard_rectangle_state discard_rectangle;
};
-enum tu_cmd_flush_bits {
- TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
- TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
- TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
- TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
- TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
- TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
- TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
-
- TU_CMD_FLAG_ALL_FLUSH =
- TU_CMD_FLAG_CCU_FLUSH_DEPTH |
- TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CACHE_FLUSH |
- /* Treat the CP as a sort of "cache" which may need to be "flushed" via
- * waiting for writes to land with WAIT_FOR_MEM_WRITES.
- */
- TU_CMD_FLAG_WAIT_MEM_WRITES,
-
- TU_CMD_FLAG_ALL_INVALIDATE =
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
- TU_CMD_FLAG_CACHE_INVALIDATE |
- /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
- * a command that needs CP_WAIT_FOR_ME is executed. This means we may
- * insert an extra WAIT_FOR_ME before an indirect command requiring it
- * in case there was another command before the current command buffer
- * that it needs to wait for.
- */
- TU_CMD_FLAG_WAIT_FOR_ME,
-};
+extern const struct tu_dynamic_state default_dynamic_state;
-/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
- * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
- * which part of the gmem is used by the CCU. Here we keep track of what the
- * state of the CCU.
- */
-enum tu_cmd_ccu_state {
- TU_CMD_CCU_SYSMEM,
- TU_CMD_CCU_GMEM,
- TU_CMD_CCU_UNKNOWN,
-};
+const char *
+tu_get_debug_option_name(int id);
-struct tu_cache_state {
- /* Caches which must be made available (flushed) eventually if there are
- * any users outside that cache domain, and caches which must be
- * invalidated eventually if there are any reads.
- */
- enum tu_cmd_flush_bits pending_flush_bits;
- /* Pending flushes */
- enum tu_cmd_flush_bits flush_bits;
-};
+const char *
+tu_get_perftest_option_name(int id);
-enum tu_lrz_force_disable_mask {
- TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
- TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
+/**
+ * Attachment state when recording a renderpass instance.
+ *
+ * The clear value is valid only if there exists a pending clear.
+ */
+struct tu_attachment_state
+{
+ VkImageAspectFlags pending_clear_aspects;
+ uint32_t cleared_views;
+ VkClearValue clear_value;
+ VkImageLayout current_layout;
};
-enum tu_lrz_direction {
- TU_LRZ_UNKNOWN,
- /* Depth func less/less-than: */
- TU_LRZ_LESS,
- /* Depth func greater/greater-than: */
- TU_LRZ_GREATER,
+struct tu_descriptor_state
+{
+ struct tu_descriptor_set *sets[MAX_SETS];
+ uint32_t dirty;
+ uint32_t valid;
+ struct tu_push_descriptor_set push_set;
+ bool push_dirty;
+ uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS];
};
-struct tu_lrz_pipeline
+struct tu_tile
{
- uint32_t force_disable_mask;
- bool fs_has_kill;
- bool force_late_z;
- bool early_fragment_tests;
+ uint8_t pipe;
+ uint8_t slot;
+ VkOffset2D begin;
+ VkOffset2D end;
};
-struct tu_lrz_state
+struct tu_tiling_config
{
- /* Depth/Stencil image currently on use to do LRZ */
- struct tu_image *image;
- bool valid : 1;
- enum tu_lrz_direction prev_direction;
+ VkRect2D render_area;
+ uint32_t buffer_cpp[MAX_RTS + 2];
+ uint32_t buffer_count;
+
+ /* position and size of the first tile */
+ VkRect2D tile0;
+ /* number of tiles */
+ VkExtent2D tile_count;
+
+ uint32_t gmem_offsets[MAX_RTS + 2];
+
+ /* size of the first VSC pipe */
+ VkExtent2D pipe0;
+ /* number of VSC pipes */
+ VkExtent2D pipe_count;
+
+ /* pipe register values */
+ uint32_t pipe_config[MAX_VSC_PIPES];
+ uint32_t pipe_sizes[MAX_VSC_PIPES];
};
-struct tu_vs_params {
- uint32_t vertex_offset;
- uint32_t first_instance;
+enum tu_cmd_dirty_bits
+{
+ TU_CMD_DIRTY_PIPELINE = 1 << 0,
+ TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1,
+
+ TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18,
+ TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19,
};
struct tu_cmd_state
@@ -1138,119 +823,48 @@ struct tu_cmd_state
uint32_t dirty;
struct tu_pipeline *pipeline;
- struct tu_pipeline *compute_pipeline;
- /* Vertex buffers, viewports, and scissors
- * the states for these can be updated partially, so we need to save these
- * to be able to emit a complete draw state
- */
- struct {
- uint64_t base;
- uint32_t size;
- uint32_t stride;
- } vb[MAX_VBS];
- VkViewport viewport[MAX_VIEWPORTS];
- VkRect2D scissor[MAX_SCISSORS];
- uint32_t max_viewport, max_scissor;
-
- /* for dynamic states that can't be emitted directly */
- uint32_t dynamic_stencil_mask;
- uint32_t dynamic_stencil_wrmask;
- uint32_t dynamic_stencil_ref;
-
- uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
- uint32_t pc_raster_cntl, vpc_unknown_9107;
- enum pc_di_primtype primtype;
- bool primitive_restart_enable;
-
- /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
- struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
- struct tu_draw_state vertex_buffers;
- struct tu_draw_state shader_const[2];
- struct tu_draw_state desc_sets;
-
- struct tu_draw_state vs_params;
+ /* Vertex buffers */
+ struct
+ {
+ struct tu_buffer *buffers[MAX_VBS];
+ VkDeviceSize offsets[MAX_VBS];
+ } vb;
+
+ struct tu_dynamic_state dynamic;
/* Index buffer */
- uint64_t index_va;
+ struct tu_buffer *index_buffer;
+ uint64_t index_offset;
+ uint32_t index_type;
uint32_t max_index_count;
- uint8_t index_size;
-
- /* because streamout base has to be 32-byte aligned
- * there is an extra offset to deal with when it is
- * unaligned
- */
- uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
-
- /* Renderpasses are tricky, because we may need to flush differently if
- * using sysmem vs. gmem and therefore we have to delay any flushing that
- * happens before a renderpass. So we have to have two copies of the flush
- * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
- * and one for outside a renderpass.
- */
- struct tu_cache_state cache;
- struct tu_cache_state renderpass_cache;
-
- enum tu_cmd_ccu_state ccu_state;
+ uint64_t index_va;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
- VkRect2D render_area;
-
- const struct tu_image_view **attachments;
+ struct tu_attachment_state *attachments;
- bool xfb_used;
- bool has_tess;
- bool tessfactor_addr_set;
- bool has_subpass_predication;
- bool predication_active;
- bool disable_gmem;
- enum a5xx_line_mode line_mode;
- bool z_negative_one_to_one;
+ struct tu_tiling_config tiling_config;
- uint32_t drawcall_count;
-
- /* A calculated "draw cost" value for renderpass, which tries to
- * estimate the bandwidth-per-sample of all the draws according
- * to:
- *
- * foreach_draw (...) {
- * cost += num_frag_outputs;
- * if (blend_enabled)
- * cost += num_blend_enabled;
- * if (depth_test_enabled)
- * cost++;
- * if (depth_write_enabled)
- * cost++;
- * }
- *
- * The idea is that each sample-passed minimally does one write
- * per MRT. If blend is enabled, the hw will additionally do
- * a framebuffer read per sample-passed (for each MRT with blend
- * enabled). If depth-test is enabled, the hw will additionally
- * a depth buffer read. If depth-write is enable, the hw will
- * additionally do a depth buffer write.
- *
- * This does ignore depth buffer traffic for samples which do not
- * pass do to depth-test fail, and some other details. But it is
- * just intended to be a rough estimate that is easy to calculate.
- */
- uint32_t total_drawcalls_cost;
-
- struct tu_lrz_state lrz;
-
- struct tu_draw_state lrz_and_depth_plane_state;
-
- struct tu_vs_params last_vs_params;
+ struct tu_cs_entry tile_load_ib;
+ struct tu_cs_entry tile_store_ib;
};
struct tu_cmd_pool
{
- struct vk_command_pool vk;
-
+ VkAllocationCallbacks alloc;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
+ uint32_t queue_family_index;
+};
+
+struct tu_cmd_buffer_upload
+{
+ uint8_t *map;
+ unsigned offset;
+ uint64_t size;
+ struct list_head list;
};
enum tu_cmd_buffer_status
@@ -1262,116 +876,165 @@ enum tu_cmd_buffer_status
TU_CMD_BUFFER_STATUS_PENDING,
};
+struct tu_bo_list
+{
+ uint32_t count;
+ uint32_t capacity;
+ struct drm_msm_gem_submit_bo *bo_infos;
+};
+
+#define TU_BO_LIST_FAILED (~0)
+
+void
+tu_bo_list_init(struct tu_bo_list *list);
+void
+tu_bo_list_destroy(struct tu_bo_list *list);
+void
+tu_bo_list_reset(struct tu_bo_list *list);
+uint32_t
+tu_bo_list_add(struct tu_bo_list *list,
+ const struct tu_bo *bo,
+ uint32_t flags);
+VkResult
+tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other);
+
struct tu_cmd_buffer
{
- struct vk_command_buffer vk;
+ VK_LOADER_DATA _loader_data;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
- struct u_trace trace;
- struct u_trace_iterator trace_renderpass_start;
- struct u_trace_iterator trace_renderpass_end;
-
- struct list_head renderpass_autotune_results;
- struct tu_autotune_results_buffer* autotune_buffer;
-
VkCommandBufferUsageFlags usage_flags;
+ VkCommandBufferLevel level;
enum tu_cmd_buffer_status status;
struct tu_cmd_state state;
+ struct tu_vertex_binding vertex_bindings[MAX_VBS];
uint32_t queue_family_index;
- uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
+ uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
- struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
+ struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE];
+
+ struct tu_cmd_buffer_upload upload;
VkResult record_result;
+ struct tu_bo_list bo_list;
struct tu_cs cs;
struct tu_cs draw_cs;
- struct tu_cs tile_store_cs;
- struct tu_cs draw_epilogue_cs;
- struct tu_cs sub_cs;
+ struct tu_cs tile_cs;
- uint32_t vsc_draw_strm_pitch;
- uint32_t vsc_prim_strm_pitch;
-};
+ uint16_t marker_reg;
+ uint32_t marker_seqno;
-/* Temporary struct for tracking a register state to be written, used by
- * a6xx-pack.h and tu_cs_emit_regs()
- */
-struct tu_reg_value {
- uint32_t reg;
- uint64_t value;
- bool is_address;
- struct tu_bo *bo;
- bool bo_write;
- uint32_t bo_offset;
- uint32_t bo_shift;
+ struct tu_bo scratch_bo;
+ uint32_t scratch_seqno;
+
+ bool wait_for_idle;
};
+void
+tu6_emit_event_write(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ enum vgt_event_type event,
+ bool need_seqno);
+
+bool
+tu_get_memory_fd(struct tu_device *device,
+ struct tu_device_memory *memory,
+ int *pFD);
-void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs);
+/*
+ * Takes x,y,z as exact numbers of invocations, instead of blocks.
+ *
+ * Limitations: Can't call normal dispatch functions without binding or
+ * rebinding
+ * the compute pipeline.
+ */
+void
+tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer,
+ uint32_t x,
+ uint32_t y,
+ uint32_t z);
+
+struct tu_event
+{
+ uint64_t *map;
+};
-void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs,
- enum tu_cmd_ccu_state ccu_state);
+struct tu_shader_module;
+#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0)
+#define TU_HASH_SHADER_SISCHED (1 << 1)
+#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2)
void
-tu6_emit_event_write(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- enum vgt_event_type event);
+tu_hash_shaders(unsigned char *hash,
+ const VkPipelineShaderStageCreateInfo **stages,
+ const struct tu_pipeline_layout *layout,
+ const struct tu_pipeline_key *key,
+ uint32_t flags);
+
+static inline gl_shader_stage
+vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
+{
+ assert(__builtin_popcount(vk_stage) == 1);
+ return ffs(vk_stage) - 1;
+}
-static inline struct tu_descriptor_state *
-tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
- VkPipelineBindPoint bind_point)
+static inline VkShaderStageFlagBits
+mesa_to_vk_shader_stage(gl_shader_stage mesa_stage)
{
- return &cmd_buffer->descriptors[bind_point];
+ return (1 << mesa_stage);
}
-struct tu_event
+#define TU_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
+
+#define tu_foreach_stage(stage, stage_bits) \
+ for (gl_shader_stage stage, \
+ __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \
+ stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage)))
+
+struct tu_shader_module
{
- struct vk_object_base base;
- struct tu_bo *bo;
+ unsigned char sha1[20];
+
+ uint32_t code_size;
+ const uint32_t *code[0];
};
-struct tu_push_constant_range
+struct tu_shader_compile_options
{
- uint32_t lo;
- uint32_t count;
+ struct ir3_shader_key key;
+
+ bool optimize;
+ bool include_binning_pass;
};
struct tu_shader
{
- struct ir3_shader *ir3_shader;
+ struct ir3_shader ir3_shader;
- struct tu_push_constant_range push_consts;
- uint8_t active_desc_sets;
- bool multi_pos_output;
-};
+ /* This may be true for vertex shaders. When true, variants[1] is the
+ * binning variant and binning_binary is non-NULL.
+ */
+ bool has_binning_pass;
-bool
-tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
- struct tu_device *dev);
+ void *binary;
+ void *binning_binary;
-nir_shader *
-tu_spirv_to_nir(struct tu_device *dev,
- void *mem_ctx,
- const VkPipelineShaderStageCreateInfo *stage_info,
- gl_shader_stage stage);
+ struct ir3_shader_variant variants[0];
+};
struct tu_shader *
tu_shader_create(struct tu_device *dev,
- nir_shader *nir,
+ gl_shader_stage stage,
const VkPipelineShaderStageCreateInfo *stage_info,
- unsigned multiview_mask,
- struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
void
@@ -1379,78 +1042,50 @@ tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
-struct tu_program_descriptor_linkage
-{
- struct ir3_const_state const_state;
-
- uint32_t constlen;
-
- struct tu_push_constant_range push_consts;
-};
-
-struct tu_pipeline_executable {
- gl_shader_stage stage;
-
- struct ir3_info stats;
- bool is_binning;
+void
+tu_shader_compile_options_init(
+ struct tu_shader_compile_options *options,
+ const VkGraphicsPipelineCreateInfo *pipeline_info);
- char *nir_from_spirv;
- char *nir_final;
- char *disasm;
-};
+VkResult
+tu_shader_compile(struct tu_device *dev,
+ struct tu_shader *shader,
+ const struct tu_shader *next_stage,
+ const struct tu_shader_compile_options *options,
+ const VkAllocationCallbacks *alloc);
struct tu_pipeline
{
- struct vk_object_base base;
-
struct tu_cs cs;
- struct tu_suballoc_bo bo;
- /* Separate BO for private memory since it should GPU writable */
- struct tu_bo *pvtmem_bo;
+ struct tu_dynamic_state dynamic_state;
+
+ struct tu_pipeline_layout *layout;
bool need_indirect_descriptor_sets;
VkShaderStageFlags active_stages;
- uint32_t active_desc_sets;
-
- /* mask of enabled dynamic states
- * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
- */
- uint32_t dynamic_state_mask;
- struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
-
- /* for dynamic states which use the same register: */
- uint32_t gras_su_cntl, gras_su_cntl_mask;
- uint32_t rb_depth_cntl, rb_depth_cntl_mask;
- uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
- uint32_t pc_raster_cntl, pc_raster_cntl_mask;
- uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
- uint32_t stencil_wrmask;
-
- bool rb_depth_cntl_disable;
-
- enum a5xx_line_mode line_mode;
-
- /* draw states for the pipeline */
- struct tu_draw_state load_state, rast_state, blend_state;
- struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
-
- /* for vertex buffers state */
- uint32_t num_vbs;
struct
{
- struct tu_draw_state config_state;
- struct tu_draw_state state;
- struct tu_draw_state binning_state;
-
- struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
+ struct tu_bo binary_bo;
+ struct tu_cs_entry state_ib;
+ struct tu_cs_entry binning_state_ib;
} program;
struct
{
- struct tu_draw_state state;
- struct tu_draw_state binning_state;
+ uint8_t bindings[MAX_VERTEX_ATTRIBS];
+ uint16_t strides[MAX_VERTEX_ATTRIBS];
+ uint16_t offsets[MAX_VERTEX_ATTRIBS];
+ uint32_t count;
+
+ uint8_t binning_bindings[MAX_VERTEX_ATTRIBS];
+ uint16_t binning_strides[MAX_VERTEX_ATTRIBS];
+ uint16_t binning_offsets[MAX_VERTEX_ATTRIBS];
+ uint32_t binning_count;
+
+ struct tu_cs_entry state_ib;
+ struct tu_cs_entry binning_state_ib;
} vi;
struct
@@ -1461,47 +1096,36 @@ struct tu_pipeline
struct
{
- uint32_t patch_type;
- uint32_t param_stride;
- bool upper_left_domain_origin;
- } tess;
+ struct tu_cs_entry state_ib;
+ } vp;
struct
{
- uint32_t local_size[3];
- uint32_t subgroup_size;
- } compute;
-
- bool provoking_vertex_last;
-
- struct tu_lrz_pipeline lrz;
+ uint32_t gras_su_cntl;
+ struct tu_cs_entry state_ib;
+ } rast;
- /* In other words - framebuffer fetch support */
- bool raster_order_attachment_access;
- bool subpass_feedback_loop_ds;
-
- bool z_negative_one_to_one;
-
- /* Base drawcall cost for sysmem vs gmem autotuner */
- uint8_t drawcall_base_cost;
+ struct
+ {
+ struct tu_cs_entry state_ib;
+ } ds;
- void *executables_mem_ctx;
- /* tu_pipeline_executable */
- struct util_dynarray executables;
+ struct
+ {
+ struct tu_cs_entry state_ib;
+ } blend;
};
void
-tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
- bool z_negative_one_to_one);
+tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport);
void
-tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
+tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor);
void
-tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
-
-void
-tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
+tu6_emit_gras_su_cntl(struct tu_cs *cs,
+ uint32_t gras_su_cntl,
+ float line_width);
void
tu6_emit_depth_bias(struct tu_cs *cs,
@@ -1509,143 +1133,106 @@ tu6_emit_depth_bias(struct tu_cs *cs,
float clamp,
float slope_factor);
-void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
- enum a5xx_line_mode line_mode);
-
-void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
-
-void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
-
-void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
-
-void tu6_apply_depth_bounds_workaround(struct tu_device *device,
- uint32_t *rb_depth_cntl);
-
-struct tu_pvtmem_config {
- uint64_t iova;
- uint32_t per_fiber_size;
- uint32_t per_sp_size;
- bool per_wave;
-};
-
-void
-tu6_emit_xs_config(struct tu_cs *cs,
- gl_shader_stage stage,
- const struct ir3_shader_variant *xs);
-
-void
-tu6_emit_xs(struct tu_cs *cs,
- gl_shader_stage stage,
- const struct ir3_shader_variant *xs,
- const struct tu_pvtmem_config *pvtmem,
- uint64_t binary_iova);
-
-void
-tu6_emit_vpc(struct tu_cs *cs,
- const struct ir3_shader_variant *vs,
- const struct ir3_shader_variant *hs,
- const struct ir3_shader_variant *ds,
- const struct ir3_shader_variant *gs,
- const struct ir3_shader_variant *fs,
- uint32_t patch_control_points);
-
void
-tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
-
-struct tu_image_view;
+tu6_emit_stencil_compare_mask(struct tu_cs *cs,
+ uint32_t front,
+ uint32_t back);
void
-tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *src,
- const struct tu_image_view *dst,
- uint32_t layer_mask,
- uint32_t layers,
- const VkRect2D *rect);
+tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back);
void
-tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info);
+tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back);
void
-tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info);
+tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]);
-void
-tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- bool force_load);
+struct tu_userdata_info *
+tu_lookup_user_sgpr(struct tu_pipeline *pipeline,
+ gl_shader_stage stage,
+ int idx);
-/* expose this function to be able to emit load without checking LOAD_OP */
-void
-tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
+struct tu_shader_variant *
+tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage);
-/* note: gmem store can also resolve */
-void
-tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- uint32_t gmem_a);
-
-enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
+struct tu_graphics_pipeline_create_info
+{
+ bool use_rectlist;
+ bool db_depth_clear;
+ bool db_stencil_clear;
+ bool db_depth_disable_expclear;
+ bool db_stencil_disable_expclear;
+ bool db_flush_depth_inplace;
+ bool db_flush_stencil_inplace;
+ bool db_resummarize;
+ uint32_t custom_blend_mode;
+};
struct tu_native_format
{
- enum a6xx_format fmt : 8;
- enum a3xx_color_swap swap : 8;
- enum a6xx_tile_mode tile_mode : 8;
+ int vtx; /* VFMTn_xxx or -1 */
+ int tex; /* TFMTn_xxx or -1 */
+ int rb; /* RBn_xxx or -1 */
+ int swap; /* enum a3xx_color_swap */
+ bool present; /* internal only; always true to external users */
};
-enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
-bool tu6_format_vtx_supported(VkFormat format);
-struct tu_native_format tu6_format_vtx(VkFormat format);
-bool tu6_format_color_supported(enum pipe_format format);
-struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
-bool tu6_format_texture_supported(enum pipe_format format);
-struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
+const struct tu_native_format *
+tu6_get_native_format(VkFormat format);
+
+int
+tu_pack_clear_value(const VkClearValue *val,
+ VkFormat format,
+ uint32_t buf[4]);
+enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt);
-static inline enum a6xx_format
-tu6_base_format(enum pipe_format format)
+struct tu_image_level
{
- /* note: tu6_format_color doesn't care about tiling for .fmt field */
- return tu6_format_color(format, TILE6_LINEAR).fmt;
-}
+ VkDeviceSize offset;
+ VkDeviceSize size;
+ uint32_t pitch;
+};
struct tu_image
{
- struct vk_object_base base;
-
+ VkImageType type;
/* The original VkFormat provided by the client. This may not match any
* of the actual surface formats.
*/
VkFormat vk_format;
+ VkImageAspectFlags aspects;
+ VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. */
+ VkImageTiling tiling; /** VkImageCreateInfo::tiling */
+ VkImageCreateFlags flags; /** VkImageCreateInfo::flags */
+ VkExtent3D extent;
uint32_t level_count;
uint32_t layer_count;
- struct fdl_layout layout[3];
- uint32_t total_size;
+ VkDeviceSize size;
+ uint32_t alignment;
+
+ /* memory layout */
+ VkDeviceSize layer_size;
+ struct tu_image_level levels[15];
+ unsigned tile_mode;
+
+ unsigned queue_family_mask;
+ bool exclusive;
+ bool shareable;
-#ifdef ANDROID
/* For VK_ANDROID_native_buffer, the WSI image owns the memory, */
VkDeviceMemory owned_memory;
-#endif
/* Set when bound */
- struct tu_bo *bo;
- uint64_t iova;
-
- uint32_t lrz_height;
- uint32_t lrz_pitch;
- uint32_t lrz_offset;
-
- bool shareable;
+ const struct tu_bo *bo;
+ VkDeviceSize bo_offset;
};
+unsigned
+tu_image_queue_family_mask(const struct tu_image *image,
+ uint32_t family,
+ uint32_t queue_family);
+
static inline uint32_t
tu_get_layerCount(const struct tu_image *image,
const VkImageSubresourceRange *range)
@@ -1664,108 +1251,99 @@ tu_get_levelCount(const struct tu_image *image,
: range->levelCount;
}
-enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
-
-uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
-
-enum pipe_format tu_format_for_aspect(enum pipe_format format,
- VkImageAspectFlags aspect_mask);
-
struct tu_image_view
{
- struct vk_object_base base;
-
struct tu_image *image; /**< VkImageViewCreateInfo::image */
- struct fdl6_view view;
+ VkImageViewType type;
+ VkImageAspectFlags aspect_mask;
+ VkFormat vk_format;
+ uint32_t base_layer;
+ uint32_t layer_count;
+ uint32_t base_mip;
+ uint32_t level_count;
+ VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */
- /* for d32s8 separate depth */
- uint64_t depth_base_addr;
- uint32_t depth_layer_size;
- uint32_t depth_PITCH;
+ uint32_t descriptor[16];
- /* for d32s8 separate stencil */
- uint64_t stencil_base_addr;
- uint32_t stencil_layer_size;
- uint32_t stencil_PITCH;
+ /* Descriptor for use as a storage image as opposed to a sampled image.
+ * This has a few differences for cube maps (e.g. type).
+ */
+ uint32_t storage_descriptor[16];
};
-struct tu_sampler_ycbcr_conversion {
- struct vk_object_base base;
-
- VkFormat format;
- VkSamplerYcbcrModelConversion ycbcr_model;
- VkSamplerYcbcrRange ycbcr_range;
- VkComponentMapping components;
- VkChromaLocation chroma_offsets[2];
- VkFilter chroma_filter;
+struct tu_sampler
+{
};
-struct tu_sampler {
- struct vk_object_base base;
-
- uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
- struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
+struct tu_image_create_info
+{
+ const VkImageCreateInfo *vk_info;
+ bool scanout;
+ bool no_metadata_planes;
};
-void
-tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
-
-void
-tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
-
-void
-tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
-
-void
-tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
-
-void
-tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
-
-#define tu_image_view_stencil(iview, x) \
- ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
-
-#define tu_image_view_depth(iview, x) \
- ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT))
-
VkResult
-tu_gralloc_info(struct tu_device *device,
- const VkNativeBufferANDROID *gralloc_info,
- int *dma_buf,
- uint64_t *modifier);
+tu_image_create(VkDevice _device,
+ const struct tu_image_create_info *info,
+ const VkAllocationCallbacks *alloc,
+ VkImage *pImage);
VkResult
-tu_import_memory_from_gralloc_handle(VkDevice device_h,
- int dma_buf,
- const VkAllocationCallbacks *alloc,
- VkImage image_h);
+tu_image_from_gralloc(VkDevice device_h,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info,
+ const VkAllocationCallbacks *alloc,
+ VkImage *out_image_h);
void
-tu_image_view_init(struct tu_image_view *iview,
- const VkImageViewCreateInfo *pCreateInfo,
- bool limited_z24s8);
-
-bool
-tiling_possible(VkFormat format);
-
-bool
-ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
- const struct fd_dev_info *info, VkSampleCountFlagBits samples);
+tu_image_view_init(struct tu_image_view *view,
+ struct tu_device *device,
+ const VkImageViewCreateInfo *pCreateInfo);
struct tu_buffer_view
{
- struct vk_object_base base;
-
- uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
-
- struct tu_buffer *buffer;
+ VkFormat vk_format;
+ uint64_t range; /**< VkBufferViewCreateInfo::range */
+ uint32_t state[4];
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
struct tu_device *device,
const VkBufferViewCreateInfo *pCreateInfo);
+static inline struct VkExtent3D
+tu_sanitize_image_extent(const VkImageType imageType,
+ const struct VkExtent3D imageExtent)
+{
+ switch (imageType) {
+ case VK_IMAGE_TYPE_1D:
+ return (VkExtent3D) { imageExtent.width, 1, 1 };
+ case VK_IMAGE_TYPE_2D:
+ return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 };
+ case VK_IMAGE_TYPE_3D:
+ return imageExtent;
+ default:
+ unreachable("invalid image type");
+ }
+}
+
+static inline struct VkOffset3D
+tu_sanitize_image_offset(const VkImageType imageType,
+ const struct VkOffset3D imageOffset)
+{
+ switch (imageType) {
+ case VK_IMAGE_TYPE_1D:
+ return (VkOffset3D) { imageOffset.x, 0, 0 };
+ case VK_IMAGE_TYPE_2D:
+ return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 };
+ case VK_IMAGE_TYPE_3D:
+ return imageOffset;
+ default:
+ unreachable("invalid image type");
+ }
+}
+
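/* Illustrative sketch, not part of the import: how a caller might use the two
 * helpers above to normalize a copy region. example_normalize_region() is a
 * hypothetical name; only the tu_sanitize_* helpers are real. For
 * VK_IMAGE_TYPE_1D the offset collapses to {x,0,0} and the extent to {w,1,1};
 * for VK_IMAGE_TYPE_2D only the z/depth components collapse. */
static inline void
example_normalize_region(VkImageType type, VkOffset3D *offset, VkExtent3D *extent)
{
   *offset = tu_sanitize_image_offset(type, *offset);
   *extent = tu_sanitize_image_extent(type, *extent);
}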
struct tu_attachment_info
{
struct tu_image_view *attachment;
@@ -1773,146 +1351,100 @@ struct tu_attachment_info
struct tu_framebuffer
{
- struct vk_object_base base;
-
uint32_t width;
uint32_t height;
uint32_t layers;
- /* size of the first tile */
- VkExtent2D tile0;
- /* number of tiles */
- VkExtent2D tile_count;
-
- /* size of the first VSC pipe */
- VkExtent2D pipe0;
- /* number of VSC pipes */
- VkExtent2D pipe_count;
-
- /* pipe register values */
- uint32_t pipe_config[MAX_VSC_PIPES];
- uint32_t pipe_sizes[MAX_VSC_PIPES];
-
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
};
-void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
- const struct tu_device *device,
- const struct tu_render_pass *pass);
-
-struct tu_subpass_barrier {
+struct tu_subpass_barrier
+{
VkPipelineStageFlags src_stage_mask;
- VkPipelineStageFlags dst_stage_mask;
VkAccessFlags src_access_mask;
VkAccessFlags dst_access_mask;
- bool incoherent_ccu_color, incoherent_ccu_depth;
};
+void
+tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
+ const struct tu_subpass_barrier *barrier);
+
struct tu_subpass_attachment
{
uint32_t attachment;
-
- /* For input attachments, true if it needs to be patched to refer to GMEM
- * in GMEM mode. This is false if it hasn't already been written as an
- * attachment.
- */
- bool patch_input_gmem;
+ VkImageLayout layout;
};
struct tu_subpass
{
uint32_t input_count;
uint32_t color_count;
- uint32_t resolve_count;
- bool resolve_depth_stencil;
-
- bool feedback_loop_color;
- bool feedback_loop_ds;
-
- /* True if we must invalidate UCHE thanks to a feedback loop. */
- bool feedback_invalidate;
-
- /* In other words - framebuffer fetch support */
- bool raster_order_attachment_access;
-
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
- VkSampleCountFlagBits samples;
-
- uint32_t srgb_cntl;
- uint32_t multiview_mask;
+ /** Subpass has at least one resolve attachment */
+ bool has_resolve;
struct tu_subpass_barrier start_barrier;
+
+ uint32_t view_mask;
+ VkSampleCountFlagBits max_sample_count;
};
struct tu_render_pass_attachment
{
VkFormat format;
uint32_t samples;
- uint32_t cpp;
- VkImageAspectFlags clear_mask;
- uint32_t clear_views;
- bool load;
- bool store;
- int32_t gmem_offset;
- /* for D32S8 separate stencil: */
- bool load_stencil;
- bool store_stencil;
- int32_t gmem_offset_stencil;
+ VkAttachmentLoadOp load_op;
+ VkAttachmentLoadOp stencil_load_op;
+ VkImageLayout initial_layout;
+ VkImageLayout final_layout;
+ uint32_t view_mask;
};
struct tu_render_pass
{
- struct vk_object_base base;
-
uint32_t attachment_count;
uint32_t subpass_count;
- uint32_t gmem_pixels;
- uint32_t tile_align_w;
struct tu_subpass_attachment *subpass_attachments;
struct tu_render_pass_attachment *attachments;
struct tu_subpass_barrier end_barrier;
struct tu_subpass subpasses[0];
};
-#define PERF_CNTRS_REG 4
-
-struct tu_perf_query_data
-{
- uint32_t gid; /* group-id */
- uint32_t cid; /* countable-id within the group */
- uint32_t cntr_reg; /* counter register within the group */
- uint32_t pass; /* pass index that countables can be requested */
- uint32_t app_idx; /* index provided by apps */
-};
+VkResult
+tu_device_init_meta(struct tu_device *device);
+void
+tu_device_finish_meta(struct tu_device *device);
struct tu_query_pool
{
- struct vk_object_base base;
-
- VkQueryType type;
uint32_t stride;
+ uint32_t availability_offset;
uint64_t size;
- uint32_t pipeline_statistics;
- struct tu_bo *bo;
+ char *ptr;
+ VkQueryType type;
+ uint32_t pipeline_stats_mask;
+};
- /* For performance query */
- const struct fd_perfcntr_group *perf_group;
- uint32_t perf_group_count;
- uint32_t counter_index_count;
- struct tu_perf_query_data perf_query_data[0];
+struct tu_semaphore
+{
+ uint32_t syncobj;
+ uint32_t temp_syncobj;
};
-uint32_t
-tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
+void
+tu_set_descriptor_set(struct tu_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ struct tu_descriptor_set *set,
+ unsigned idx);
void
-tu_update_descriptor_sets(const struct tu_device *device,
+tu_update_descriptor_sets(struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
VkDescriptorSet overrideSet,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
@@ -1921,24 +1453,25 @@ tu_update_descriptor_sets(const struct tu_device *device,
void
tu_update_descriptor_set_with_template(
- const struct tu_device *device,
+ struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData);
-VkResult
-tu_physical_device_init(struct tu_physical_device *device,
- struct tu_instance *instance);
-VkResult
-tu_enumerate_devices(struct tu_instance *instance);
+void
+tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint pipelineBindPoint,
+ VkPipelineLayout _layout,
+ uint32_t set,
+ uint32_t descriptorWriteCount,
+ const VkWriteDescriptorSet *pDescriptorWrites);
int
-tu_device_get_gpu_timestamp(struct tu_device *dev,
- uint64_t *ts);
+tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id);
int
-tu_device_get_suspend_count(struct tu_device *dev,
- uint64_t *suspend_count);
+tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
@@ -1948,116 +1481,76 @@ tu_drm_submitqueue_new(const struct tu_device *dev,
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
+uint32_t
+tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags);
+uint32_t
+tu_gem_import_dmabuf(const struct tu_device *dev,
+ int prime_fd,
+ uint64_t size);
int
-tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2);
-
-int
-tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);
-
-VkResult
-tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);
-
+tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle);
void
-tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
- void *ts_from, uint32_t from_offset,
- void *ts_to, uint32_t to_offset,
- uint32_t count);
-
-
-VkResult
-tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
- struct u_trace **trace_copy);
-
-/* If we copy trace and timestamps we will have to free them. */
-struct tu_u_trace_cmd_data
-{
- struct tu_cs *timestamp_copy_cs;
- struct u_trace *trace;
-};
-
-/* Data necessary to retrieve timestamps and clean all
- * associated resources afterwards.
- */
-struct tu_u_trace_submission_data
-{
- uint32_t submission_id;
- /* We have to know when timestamps are available,
- * this sync object indicates it.
- */
- struct tu_u_trace_syncobj *syncobj;
-
- uint32_t cmd_buffer_count;
- uint32_t last_buffer_with_tracepoints;
- struct tu_u_trace_cmd_data *cmd_trace_data;
-};
-
-VkResult
-tu_u_trace_submission_data_create(
- struct tu_device *device,
- struct tu_cmd_buffer **cmd_buffers,
- uint32_t cmd_buffer_count,
- struct tu_u_trace_submission_data **submission_data);
-
-void
-tu_u_trace_submission_data_finish(
- struct tu_device *device,
- struct tu_u_trace_submission_data *submission_data);
+tu_gem_close(const struct tu_device *dev, uint32_t gem_handle);
+uint64_t
+tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle);
+uint64_t
+tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle);
+
+#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \
+ \
+ static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
+ { \
+ return (struct __tu_type *) _handle; \
+ } \
+ \
+ static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \
+ { \
+ return (__VkType) _obj; \
+ }
+
+#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \
+ \
+ static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
+ { \
+ return (struct __tu_type *) (uintptr_t) _handle; \
+ } \
+ \
+ static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \
+ { \
+ return (__VkType)(uintptr_t) _obj; \
+ }
#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
- VK_FROM_HANDLE(__tu_type, __name, __handle)
-
-VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
- VK_OBJECT_TYPE_COMMAND_BUFFER)
-VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
-VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
- VK_OBJECT_TYPE_INSTANCE)
-VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
- VK_OBJECT_TYPE_PHYSICAL_DEVICE)
-VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
- VK_OBJECT_TYPE_COMMAND_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
- VK_OBJECT_TYPE_BUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
- VK_OBJECT_TYPE_BUFFER_VIEW)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
- VK_OBJECT_TYPE_DESCRIPTOR_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
- VK_OBJECT_TYPE_DESCRIPTOR_SET)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
- VkDescriptorSetLayout,
- VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
- VkDescriptorUpdateTemplate,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
- VK_OBJECT_TYPE_DEVICE_MEMORY)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
- VK_OBJECT_TYPE_FRAMEBUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
- VK_OBJECT_TYPE_IMAGE_VIEW);
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
- VK_OBJECT_TYPE_PIPELINE_CACHE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
- VK_OBJECT_TYPE_PIPELINE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
- VK_OBJECT_TYPE_PIPELINE_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
- VK_OBJECT_TYPE_QUERY_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
- VK_OBJECT_TYPE_RENDER_PASS)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
- VK_OBJECT_TYPE_SAMPLER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
-
-/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
-#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
-
-void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
+ struct __tu_type *__name = __tu_type##_from_handle(__handle)
+
+TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer)
+TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice)
+TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance)
+TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice)
+TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue)
+
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout,
+ VkDescriptorSetLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template,
+ VkDescriptorUpdateTemplate)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_fence, VkFence)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView);
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule)
+TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore)
#endif /* TU_PRIVATE_H */
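/* Illustrative sketch, not part of the import: what the cast macros above
 * generate. TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice) emits
 * tu_device_from_handle()/tu_device_to_handle(), and TU_FROM_HANDLE() declares
 * the unwrapped pointer in one line. example_entry_point() is a hypothetical
 * caller, not a real driver entry point. */
VkResult
example_entry_point(VkDevice _device)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   /* equivalent to: struct tu_device *device = tu_device_from_handle(_device); */
   (void) device;
   return VK_SUCCESS;
}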
diff --git a/lib/mesa/src/freedreno/vulkan/vk_format.h b/lib/mesa/src/freedreno/vulkan/vk_format.h
index c335a12ae..4e13bc9c0 100644
--- a/lib/mesa/src/freedreno/vulkan/vk_format.h
+++ b/lib/mesa/src/freedreno/vulkan/vk_format.h
@@ -3,7 +3,7 @@
* Copyright © 2016 Bas Nieuwenhuizen
*
* Based on u_format.h which is:
- * Copyright 2009-2010 VMware, Inc.
+ * Copyright 2009-2010 Vmware, Inc.
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
@@ -29,29 +29,549 @@
#include <assert.h>
#include <util/macros.h>
-#include <util/format/u_format.h>
-#include <vulkan/util/vk_format.h>
#include <vulkan/vulkan.h>
+enum vk_format_layout
+{
+ /**
+ * Formats with vk_format_block::width == vk_format_block::height == 1
+ * that can be described as an ordinary data structure.
+ */
+ VK_FORMAT_LAYOUT_PLAIN = 0,
+
+ /**
+ * Formats with sub-sampled channels.
+ *
+ * This is for formats like YVYU where there is less than one sample per
+ * pixel.
+ */
+ VK_FORMAT_LAYOUT_SUBSAMPLED = 3,
+
+ /**
+ * S3 Texture Compression formats.
+ */
+ VK_FORMAT_LAYOUT_S3TC = 4,
+
+ /**
+ * Red-Green Texture Compression formats.
+ */
+ VK_FORMAT_LAYOUT_RGTC = 5,
+
+ /**
+ * Ericsson Texture Compression
+ */
+ VK_FORMAT_LAYOUT_ETC = 6,
+
+ /**
+ * BC6/7 Texture Compression
+ */
+ VK_FORMAT_LAYOUT_BPTC = 7,
+
+ /**
+ * ASTC
+ */
+ VK_FORMAT_LAYOUT_ASTC = 8,
+
+ /**
+ * Everything else that doesn't fit in any of the above layouts.
+ */
+ VK_FORMAT_LAYOUT_OTHER = 9
+};
+
+struct vk_format_block
+{
+ /** Block width in pixels */
+ unsigned width;
+
+ /** Block height in pixels */
+ unsigned height;
+
+ /** Block size in bits */
+ unsigned bits;
+};
+
+enum vk_format_type
+{
+ VK_FORMAT_TYPE_VOID = 0,
+ VK_FORMAT_TYPE_UNSIGNED = 1,
+ VK_FORMAT_TYPE_SIGNED = 2,
+ VK_FORMAT_TYPE_FIXED = 3,
+ VK_FORMAT_TYPE_FLOAT = 4
+};
+
+enum vk_format_colorspace
+{
+ VK_FORMAT_COLORSPACE_RGB = 0,
+ VK_FORMAT_COLORSPACE_SRGB = 1,
+ VK_FORMAT_COLORSPACE_YUV = 2,
+ VK_FORMAT_COLORSPACE_ZS = 3
+};
+
+struct vk_format_channel_description
+{
+ unsigned type : 5;
+ unsigned normalized : 1;
+ unsigned pure_integer : 1;
+ unsigned scaled : 1;
+ unsigned size : 8;
+ unsigned shift : 16;
+};
+
+struct vk_format_description
+{
+ VkFormat format;
+ const char *name;
+ const char *short_name;
+
+ struct vk_format_block block;
+ enum vk_format_layout layout;
+
+ unsigned nr_channels : 3;
+ unsigned is_array : 1;
+ unsigned is_bitmask : 1;
+ unsigned is_mixed : 1;
+
+ struct vk_format_channel_description channel[4];
+
+ unsigned char swizzle[4];
+
+ enum vk_format_colorspace colorspace;
+};
+
+extern const struct vk_format_description vk_format_description_table[];
+
+const struct vk_format_description *
+vk_format_description(VkFormat format);
+
+/**
+ * Return total bits needed for the pixel format per block.
+ */
+static inline unsigned
+vk_format_get_blocksizebits(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 0;
+ }
+
+ return desc->block.bits;
+}
+
+/**
+ * Return bytes per block (not pixel) for the given format.
+ */
+static inline unsigned
+vk_format_get_blocksize(VkFormat format)
+{
+ unsigned bits = vk_format_get_blocksizebits(format);
+ unsigned bytes = bits / 8;
+
+ assert(bits % 8 == 0);
+ assert(bytes > 0);
+ if (bytes == 0) {
+ bytes = 1;
+ }
+
+ return bytes;
+}
+
+static inline unsigned
+vk_format_get_blockwidth(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 1;
+ }
+
+ return desc->block.width;
+}
+
+static inline unsigned
+vk_format_get_blockheight(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return 1;
+ }
+
+ return desc->block.height;
+}
+
+static inline unsigned
+vk_format_get_block_count_width(VkFormat format, unsigned width)
+{
+ unsigned blockwidth = vk_format_get_blockwidth(format);
+ return (width + blockwidth - 1) / blockwidth;
+}
+
+static inline unsigned
+vk_format_get_block_count_height(VkFormat format, unsigned height)
+{
+ unsigned blockheight = vk_format_get_blockheight(format);
+ return (height + blockheight - 1) / blockheight;
+}
+
+static inline unsigned
+vk_format_get_block_count(VkFormat format, unsigned width, unsigned height)
+{
+ return vk_format_get_block_count_width(format, width) *
+ vk_format_get_block_count_height(format, height);
+}
+
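/* Illustrative sketch, not part of the import: sizing one mip level with the
 * block helpers above. example_level_size_bytes() is a hypothetical name; for
 * a block-compressed format the dimensions round up to whole blocks first,
 * e.g. BC1 at 64x64 gives 16x16 blocks of 8 bytes each. */
static inline unsigned
example_level_size_bytes(VkFormat format, unsigned width, unsigned height)
{
   return vk_format_get_block_count(format, width, height) *
          vk_format_get_blocksize(format);
}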
+/**
+ * Return the index of the first non-void channel
+ * -1 if no non-void channels
+ */
+static inline int
+vk_format_get_first_non_void_channel(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (desc->channel[i].type != VK_FORMAT_TYPE_VOID)
+ break;
+
+ if (i == 4)
+ return -1;
+
+ return i;
+}
+
+enum vk_swizzle
+{
+ VK_SWIZZLE_X,
+ VK_SWIZZLE_Y,
+ VK_SWIZZLE_Z,
+ VK_SWIZZLE_W,
+ VK_SWIZZLE_0,
+ VK_SWIZZLE_1,
+ VK_SWIZZLE_NONE,
+ VK_SWIZZLE_MAX, /**< Number of enums counter (must be last) */
+};
+
+static inline VkImageAspectFlags
+vk_format_aspects(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_UNDEFINED:
+ return 0;
+
+ case VK_FORMAT_S8_UINT:
+ return VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ case VK_FORMAT_D16_UNORM_S8_UINT:
+ case VK_FORMAT_D24_UNORM_S8_UINT:
+ case VK_FORMAT_D32_SFLOAT_S8_UINT:
+ return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ case VK_FORMAT_D16_UNORM:
+ case VK_FORMAT_X8_D24_UNORM_PACK32:
+ case VK_FORMAT_D32_SFLOAT:
+ return VK_IMAGE_ASPECT_DEPTH_BIT;
+
+ default:
+ return VK_IMAGE_ASPECT_COLOR_BIT;
+ }
+}
+
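/* Illustrative sketch, not part of the import: a typical use of
 * vk_format_aspects() above when checking a requested aspect mask against an
 * image's format. example_aspects_valid() is a hypothetical name. */
static inline bool
example_aspects_valid(VkFormat format, VkImageAspectFlags requested)
{
   /* e.g. VK_FORMAT_D24_UNORM_S8_UINT allows DEPTH and/or STENCIL,
    * VK_FORMAT_S8_UINT only STENCIL, color formats only COLOR. */
   return (requested & ~vk_format_aspects(format)) == 0;
}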
+static inline enum vk_swizzle
+tu_swizzle_conv(VkComponentSwizzle component,
+ const unsigned char chan[4],
+ VkComponentSwizzle vk_swiz)
+{
+ int x;
+
+ if (vk_swiz == VK_COMPONENT_SWIZZLE_IDENTITY)
+ vk_swiz = component;
+ switch (vk_swiz) {
+ case VK_COMPONENT_SWIZZLE_ZERO:
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_ONE:
+ return VK_SWIZZLE_1;
+ case VK_COMPONENT_SWIZZLE_R:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 0)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_G:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 1)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_B:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 2)
+ return x;
+ return VK_SWIZZLE_0;
+ case VK_COMPONENT_SWIZZLE_A:
+ for (x = 0; x < 4; x++)
+ if (chan[x] == 3)
+ return x;
+ return VK_SWIZZLE_1;
+ default:
+ unreachable("Illegal swizzle");
+ }
+}
+
+static inline void
+vk_format_compose_swizzles(const VkComponentMapping *mapping,
+ const unsigned char swz[4],
+ enum vk_swizzle dst[4])
+{
+ dst[0] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_R, swz, mapping->r);
+ dst[1] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_G, swz, mapping->g);
+ dst[2] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_B, swz, mapping->b);
+ dst[3] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_A, swz, mapping->a);
+}
+
+static inline bool
+vk_format_is_compressed(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ switch (desc->layout) {
+ case VK_FORMAT_LAYOUT_S3TC:
+ case VK_FORMAT_LAYOUT_RGTC:
+ case VK_FORMAT_LAYOUT_ETC:
+ case VK_FORMAT_LAYOUT_BPTC:
+ case VK_FORMAT_LAYOUT_ASTC:
+ /* XXX add other formats in the future */
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool
+vk_format_has_depth(const struct vk_format_description *desc)
+{
+ return desc->colorspace == VK_FORMAT_COLORSPACE_ZS &&
+ desc->swizzle[0] != VK_SWIZZLE_NONE;
+}
+
+static inline bool
+vk_format_has_stencil(const struct vk_format_description *desc)
+{
+ return desc->colorspace == VK_FORMAT_COLORSPACE_ZS &&
+ desc->swizzle[1] != VK_SWIZZLE_NONE;
+}
+
+static inline bool
+vk_format_is_depth_or_stencil(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_depth(desc) || vk_format_has_stencil(desc);
+}
+
+static inline bool
+vk_format_is_depth(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_depth(desc);
+}
+
+static inline bool
+vk_format_is_stencil(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ assert(desc);
+ if (!desc) {
+ return false;
+ }
+
+ return vk_format_has_stencil(desc);
+}
+
+static inline bool
+vk_format_is_color(VkFormat format)
+{
+ return !vk_format_is_depth_or_stencil(format);
+}
+
+static inline bool
+vk_format_has_alpha(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+
+ return (desc->colorspace == VK_FORMAT_COLORSPACE_RGB ||
+ desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) &&
+ desc->swizzle[3] != VK_SWIZZLE_1;
+}
+
+static inline VkFormat
+vk_format_depth_only(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_D16_UNORM_S8_UINT:
+ return VK_FORMAT_D16_UNORM;
+ case VK_FORMAT_D24_UNORM_S8_UINT:
+ return VK_FORMAT_X8_D24_UNORM_PACK32;
+ case VK_FORMAT_D32_SFLOAT_S8_UINT:
+ return VK_FORMAT_D32_SFLOAT;
+ default:
+ return format;
+ }
+}
+
+static inline bool
+vk_format_is_int(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ int channel = vk_format_get_first_non_void_channel(format);
+
+ return channel >= 0 && desc->channel[channel].pure_integer;
+}
+
+static inline bool
+vk_format_is_srgb(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ return desc->colorspace == VK_FORMAT_COLORSPACE_SRGB;
+}
+
+static inline VkFormat
+vk_format_no_srgb(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_R8_SRGB:
+ return VK_FORMAT_R8_UNORM;
+ case VK_FORMAT_R8G8_SRGB:
+ return VK_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8_SRGB:
+ return VK_FORMAT_R8G8B8_UNORM;
+ case VK_FORMAT_B8G8R8_SRGB:
+ return VK_FORMAT_B8G8R8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SRGB:
+ return VK_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_B8G8R8A8_SRGB:
+ return VK_FORMAT_B8G8R8A8_UNORM;
+ case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+ return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+ case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
+ return VK_FORMAT_BC1_RGB_UNORM_BLOCK;
+ case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
+ return VK_FORMAT_BC1_RGBA_UNORM_BLOCK;
+ case VK_FORMAT_BC2_SRGB_BLOCK:
+ return VK_FORMAT_BC2_UNORM_BLOCK;
+ case VK_FORMAT_BC3_SRGB_BLOCK:
+ return VK_FORMAT_BC3_UNORM_BLOCK;
+ case VK_FORMAT_BC7_SRGB_BLOCK:
+ return VK_FORMAT_BC7_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK;
+ case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
+ return VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK;
+ default:
+ assert(!vk_format_is_srgb(format));
+ return format;
+ }
+}
+
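/* Illustrative sketch, not part of the import: using vk_format_is_srgb() and
 * vk_format_no_srgb() above to obtain the linear counterpart of an sRGB
 * format. example_linear_format() is a hypothetical name. */
static inline VkFormat
example_linear_format(VkFormat format)
{
   return vk_format_is_srgb(format) ? vk_format_no_srgb(format) : format;
}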
+static inline VkFormat
+vk_format_stencil_only(VkFormat format)
+{
+ return VK_FORMAT_S8_UINT;
+}
+
static inline unsigned
vk_format_get_component_bits(VkFormat format,
- enum util_format_colorspace colorspace,
+ enum vk_format_colorspace colorspace,
unsigned component)
{
+ const struct vk_format_description *desc = vk_format_description(format);
+ enum vk_format_colorspace desc_colorspace;
+
+ assert(format);
+ if (!format) {
+ return 0;
+ }
+
+ assert(component < 4);
+
+ /* Treat RGB and SRGB as equivalent. */
+ if (colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+ colorspace = VK_FORMAT_COLORSPACE_RGB;
+ }
+ if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+ desc_colorspace = VK_FORMAT_COLORSPACE_RGB;
+ } else {
+ desc_colorspace = desc->colorspace;
+ }
+
+ if (desc_colorspace != colorspace) {
+ return 0;
+ }
+
+ switch (desc->swizzle[component]) {
+ case VK_SWIZZLE_X:
+ return desc->channel[0].size;
+ case VK_SWIZZLE_Y:
+ return desc->channel[1].size;
+ case VK_SWIZZLE_Z:
+ return desc->channel[2].size;
+ case VK_SWIZZLE_W:
+ return desc->channel[3].size;
+ default:
+ return 0;
+ }
+}
+
+static inline VkFormat
+vk_to_non_srgb_format(VkFormat format)
+{
switch (format) {
- case VK_FORMAT_G8B8G8R8_422_UNORM:
- case VK_FORMAT_B8G8R8G8_422_UNORM:
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- /* util_format_get_component_bits doesn't return what we want */
- return 8;
+ case VK_FORMAT_R8_SRGB:
+ return VK_FORMAT_R8_UNORM;
+ case VK_FORMAT_R8G8_SRGB:
+ return VK_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8_SRGB:
+ return VK_FORMAT_R8G8B8_UNORM;
+ case VK_FORMAT_B8G8R8_SRGB:
+ return VK_FORMAT_B8G8R8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SRGB:
+ return VK_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_B8G8R8A8_SRGB:
+ return VK_FORMAT_B8G8R8A8_UNORM;
+ case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+ return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
default:
- break;
+ return format;
}
+}
- return util_format_get_component_bits(vk_format_to_pipe_format(format),
- colorspace, component);
+static inline unsigned
+vk_format_get_nr_components(VkFormat format)
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ return desc->nr_channels;
}
#endif /* VK_FORMAT_H */
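/* Illustrative sketch, not part of the import: combining the depth/stencil
 * helpers defined above, e.g. to pick the format for a depth-only view of a
 * packed depth/stencil image. example_depth_view_format() is a hypothetical
 * name. */
static inline VkFormat
example_depth_view_format(VkFormat image_format)
{
   if (vk_format_is_depth(image_format))
      return vk_format_depth_only(image_format); /* D32_SFLOAT_S8_UINT -> D32_SFLOAT */
   return image_format;
}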
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
index 0a70c6881..0ad6087e5 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.cpp
@@ -24,11 +24,9 @@
namespace nv50_ir {
-ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info,
- nv50_ir_prog_info_out *info_out)
+ConverterCommon::ConverterCommon(Program *prog, nv50_ir_prog_info *info)
: BuildUtil(prog),
- info(info),
- info_out(info_out) {}
+ info(info) {}
ConverterCommon::Subroutine *
ConverterCommon::getSubroutine(unsigned ip)
@@ -84,7 +82,7 @@ ConverterCommon::handleUserClipPlanes()
int n, i, c;
for (c = 0; c < 4; ++c) {
- for (i = 0; i < info_out->io.genUserClip; ++i) {
+ for (i = 0; i < info->io.genUserClip; ++i) {
Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
@@ -95,13 +93,13 @@ ConverterCommon::handleUserClipPlanes()
}
}
- const int first = info_out->numOutputs - (info_out->io.genUserClip + 3) / 4;
+ const int first = info->numOutputs - (info->io.genUserClip + 3) / 4;
- for (i = 0; i < info_out->io.genUserClip; ++i) {
+ for (i = 0; i < info->io.genUserClip; ++i) {
n = i / 4 + first;
c = i % 4;
Symbol *sym =
- mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info_out->out[n].slot[c] * 4);
+ mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4);
mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]);
}
}
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
index a144ca23a..e44eea86a 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_common.h
@@ -28,7 +28,7 @@ namespace nv50_ir {
class ConverterCommon : public BuildUtil
{
public:
- ConverterCommon(Program *, nv50_ir_prog_info *, nv50_ir_prog_info_out *);
+ ConverterCommon(Program *, nv50_ir_prog_info *);
protected:
struct Subroutine
{
@@ -50,7 +50,6 @@ protected:
} sub;
struct nv50_ir_prog_info *info;
- struct nv50_ir_prog_info_out *info_out;
Value *fragCoord[4];
Value *clipVtx[4];
Value *outBase; // base address of vertex out patch (for TCP)
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
index 5df3f3168..950923a0d 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -25,13 +25,11 @@
#include "compiler/nir/nir.h"
#include "util/u_debug.h"
-#include "util/u_prim.h"
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_from_common.h"
#include "codegen/nv50_ir_lowering_helper.h"
#include "codegen/nv50_ir_util.h"
-#include "tgsi/tgsi_from_mesa.h"
#if __cplusplus >= 201103L
#include <unordered_map>
@@ -60,32 +58,17 @@ type_size(const struct glsl_type *type, bool bindless)
return glsl_count_attribute_slots(type, false);
}
-static void
-function_temp_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
-{
- assert(glsl_type_is_vector_or_scalar(type));
-
- if (glsl_type_is_scalar(type)) {
- glsl_get_natural_size_align_bytes(type, size, align);
- } else {
- unsigned comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
- unsigned length = glsl_get_vector_elements(type);
-
- *size = comp_size * length;
- *align = 0x10;
- }
-}
-
class Converter : public ConverterCommon
{
public:
- Converter(Program *, nir_shader *, nv50_ir_prog_info *, nv50_ir_prog_info_out *);
+ Converter(Program *, nir_shader *, nv50_ir_prog_info *);
bool run();
private:
typedef std::vector<LValue*> LValues;
typedef unordered_map<unsigned, LValues> NirDefMap;
typedef unordered_map<unsigned, nir_load_const_instr*> ImmediateMap;
+ typedef unordered_map<unsigned, uint32_t> NirArrayLMemOffsets;
typedef unordered_map<unsigned, BasicBlock*> NirBlockMap;
CacheMode convert(enum gl_access_qualifier);
@@ -98,6 +81,8 @@ private:
LValues& convert(nir_register *);
LValues& convert(nir_ssa_def *);
+ ImgFormat convertGLImgFormat(GLuint);
+
Value* getSrc(nir_alu_src *, uint8_t component = 0);
Value* getSrc(nir_register *, uint8_t);
Value* getSrc(nir_src *, uint8_t, bool indirect = false);
@@ -137,10 +122,9 @@ private:
DataType getDType(nir_alu_instr *);
DataType getDType(nir_intrinsic_instr *);
+ DataType getDType(nir_intrinsic_instr *, bool isSigned);
DataType getDType(nir_op, uint8_t);
- DataFile getFile(nir_intrinsic_op);
-
std::vector<DataType> getSTypes(nir_alu_instr *);
DataType getSType(nir_src &, bool isFloat, bool isSigned);
@@ -160,6 +144,7 @@ private:
bool visit(nir_alu_instr *);
bool visit(nir_block *);
bool visit(nir_cf_node *);
+ bool visit(nir_deref_instr *);
bool visit(nir_function *);
bool visit(nir_if *);
bool visit(nir_instr *);
@@ -171,16 +156,21 @@ private:
bool visit(nir_tex_instr *);
// tex stuff
+ Value* applyProjection(Value *src, Value *proj);
unsigned int getNIRArgCount(TexInstruction::Target&);
+ // image stuff
+ uint16_t handleDeref(nir_deref_instr *, Value * & indirect, const nir_variable * &);
+ CacheMode getCacheModeFromVar(const nir_variable *);
+
nir_shader *nir;
NirDefMap ssaDefs;
NirDefMap regDefs;
ImmediateMap immediates;
+ NirArrayLMemOffsets regToLmemOffset;
NirBlockMap blocks;
unsigned int curLoopDepth;
- unsigned int curIfDepth;
BasicBlock *exit;
Value *zero;
@@ -195,14 +185,10 @@ private:
};
};
-Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info,
- nv50_ir_prog_info_out *info_out)
- : ConverterCommon(prog, info, info_out),
+Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info)
+ : ConverterCommon(prog, info),
nir(nir),
curLoopDepth(0),
- curIfDepth(0),
- exit(NULL),
- immInsertPos(NULL),
clipVertexOutput(-1)
{
zero = mkImm((uint32_t)0);
@@ -274,33 +260,29 @@ Converter::getDType(nir_alu_instr *insn)
DataType
Converter::getDType(nir_intrinsic_instr *insn)
{
- bool isFloat, isSigned;
+ bool isSigned;
switch (insn->intrinsic) {
- case nir_intrinsic_bindless_image_atomic_fadd:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_shared_atomic_fadd:
- case nir_intrinsic_ssbo_atomic_fadd:
- isFloat = true;
- isSigned = false;
- break;
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_imin:
- isFloat = false;
isSigned = true;
break;
default:
- isFloat = false;
isSigned = false;
break;
}
+ return getDType(insn, isSigned);
+}
+
+DataType
+Converter::getDType(nir_intrinsic_instr *insn, bool isSigned)
+{
if (insn->dest.is_ssa)
- return typeOfSize(insn->dest.ssa.bit_size / 8, isFloat, isSigned);
+ return typeOfSize(insn->dest.ssa.bit_size / 8, false, isSigned);
else
- return typeOfSize(insn->dest.reg.reg->bit_size / 8, isFloat, isSigned);
+ return typeOfSize(insn->dest.reg.reg->bit_size / 8, false, isSigned);
}
DataType
@@ -358,29 +340,6 @@ Converter::getSType(nir_src &src, bool isFloat, bool isSigned)
return ty;
}
-DataFile
-Converter::getFile(nir_intrinsic_op op)
-{
- switch (op) {
- case nir_intrinsic_load_global:
- case nir_intrinsic_store_global:
- case nir_intrinsic_load_global_constant:
- return FILE_MEMORY_GLOBAL;
- case nir_intrinsic_load_scratch:
- case nir_intrinsic_store_scratch:
- return FILE_MEMORY_LOCAL;
- case nir_intrinsic_load_shared:
- case nir_intrinsic_store_shared:
- return FILE_MEMORY_SHARED;
- case nir_intrinsic_load_kernel_input:
- return FILE_SHADER_INPUT;
- default:
- ERROR("couldn't get DateFile for op %s\n", nir_intrinsic_infos[op].name);
- assert(false);
- }
- return FILE_NULL;
-}
-
operation
Converter::getOperation(nir_op op)
{
@@ -480,7 +439,7 @@ Converter::getOperation(nir_op op)
case nir_op_flt32:
case nir_op_ilt32:
case nir_op_ult32:
- case nir_op_fneu32:
+ case nir_op_fne32:
case nir_op_ine32:
return OP_SET;
case nir_op_ishl:
@@ -492,6 +451,9 @@ Converter::getOperation(nir_op op)
return OP_SIN;
case nir_op_fsqrt:
return OP_SQRT;
+ case nir_op_fsub:
+ case nir_op_isub:
+ return OP_SUB;
case nir_op_ftrunc:
return OP_TRUNC;
case nir_op_ixor:
@@ -543,39 +505,43 @@ Converter::getOperation(nir_intrinsic_op op)
return OP_RESTART;
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
- case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_deref_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
+ case nir_intrinsic_image_deref_atomic_xor:
return OP_SUREDP;
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_image_load:
+ case nir_intrinsic_image_deref_load:
return OP_SULDP;
case nir_intrinsic_bindless_image_samples:
case nir_intrinsic_image_samples:
+ case nir_intrinsic_image_deref_samples:
case nir_intrinsic_bindless_image_size:
case nir_intrinsic_image_size:
+ case nir_intrinsic_image_deref_size:
return OP_SUQ;
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_image_store:
+ case nir_intrinsic_image_deref_store:
return OP_SUSTP;
default:
ERROR("couldn't get operation for nir_intrinsic_op %u\n", op);
@@ -603,10 +569,6 @@ Converter::getSubOp(nir_op op)
case nir_op_imul_high:
case nir_op_umul_high:
return NV50_IR_SUBOP_MUL_HIGH;
- case nir_op_ishl:
- case nir_op_ishr:
- case nir_op_ushr:
- return NV50_IR_SUBOP_SHIFT_WRAP;
default:
return 0;
}
@@ -617,78 +579,61 @@ Converter::getSubOp(nir_intrinsic_op op)
{
switch (op) {
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_global_atomic_add:
case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_ssbo_atomic_add:
return NV50_IR_SUBOP_ATOM_ADD;
- case nir_intrinsic_bindless_image_atomic_fadd:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_shared_atomic_fadd:
- case nir_intrinsic_ssbo_atomic_fadd:
- return NV50_IR_SUBOP_ATOM_ADD;
case nir_intrinsic_bindless_image_atomic_and:
- case nir_intrinsic_global_atomic_and:
case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_ssbo_atomic_and:
return NV50_IR_SUBOP_ATOM_AND;
case nir_intrinsic_bindless_image_atomic_comp_swap:
- case nir_intrinsic_global_atomic_comp_swap:
case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_ssbo_atomic_comp_swap:
return NV50_IR_SUBOP_ATOM_CAS;
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_global_atomic_exchange:
case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_ssbo_atomic_exchange:
return NV50_IR_SUBOP_ATOM_EXCH;
case nir_intrinsic_bindless_image_atomic_or:
- case nir_intrinsic_global_atomic_or:
case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_ssbo_atomic_or:
return NV50_IR_SUBOP_ATOM_OR;
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_deref_atomic_max:
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_umax:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_umax:
return NV50_IR_SUBOP_ATOM_MAX;
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_min:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_deref_atomic_min:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_shared_atomic_umin:
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_ssbo_atomic_umin:
return NV50_IR_SUBOP_ATOM_MIN;
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_global_atomic_xor:
case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_ssbo_atomic_xor:
return NV50_IR_SUBOP_ATOM_XOR;
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_inc_wrap:
- return NV50_IR_SUBOP_ATOM_INC;
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- return NV50_IR_SUBOP_ATOM_DEC;
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
return NV50_IR_SUBOP_MEMBAR(M, GL);
@@ -721,7 +666,7 @@ Converter::getCondCode(nir_op op)
case nir_op_ilt32:
case nir_op_ult32:
return CC_LT;
- case nir_op_fneu32:
+ case nir_op_fne32:
return CC_NEU;
case nir_op_ine32:
return CC_NE;
@@ -753,8 +698,6 @@ Converter::convert(nir_dest *dest)
Converter::LValues&
Converter::convert(nir_register *reg)
{
- assert(!reg->num_array_elems);
-
NirDefMap::iterator it = regDefs.find(reg->index);
if (it != regDefs.end())
return it->second;
@@ -914,6 +857,256 @@ vert_attrib_to_tgsi_semantic(gl_vert_attrib slot, unsigned *name, unsigned *inde
}
}
+static void
+varying_slot_to_tgsi_semantic(gl_varying_slot slot, unsigned *name, unsigned *index)
+{
+ assert(name && index);
+
+ if (slot >= VARYING_SLOT_TESS_MAX) {
+ ERROR("invalid varying slot %u\n", slot);
+ assert(false);
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_PATCH0) {
+ *name = TGSI_SEMANTIC_PATCH;
+ *index = slot - VARYING_SLOT_PATCH0;
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_VAR0) {
+ *name = TGSI_SEMANTIC_GENERIC;
+ *index = slot - VARYING_SLOT_VAR0;
+ return;
+ }
+
+ if (slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7) {
+ *name = TGSI_SEMANTIC_TEXCOORD;
+ *index = slot - VARYING_SLOT_TEX0;
+ return;
+ }
+
+ switch (slot) {
+ case VARYING_SLOT_BFC0:
+ *name = TGSI_SEMANTIC_BCOLOR;
+ *index = 0;
+ break;
+ case VARYING_SLOT_BFC1:
+ *name = TGSI_SEMANTIC_BCOLOR;
+ *index = 1;
+ break;
+ case VARYING_SLOT_CLIP_DIST0:
+ *name = TGSI_SEMANTIC_CLIPDIST;
+ *index = 0;
+ break;
+ case VARYING_SLOT_CLIP_DIST1:
+ *name = TGSI_SEMANTIC_CLIPDIST;
+ *index = 1;
+ break;
+ case VARYING_SLOT_CLIP_VERTEX:
+ *name = TGSI_SEMANTIC_CLIPVERTEX;
+ *index = 0;
+ break;
+ case VARYING_SLOT_COL0:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 0;
+ break;
+ case VARYING_SLOT_COL1:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 1;
+ break;
+ case VARYING_SLOT_EDGE:
+ *name = TGSI_SEMANTIC_EDGEFLAG;
+ *index = 0;
+ break;
+ case VARYING_SLOT_FACE:
+ *name = TGSI_SEMANTIC_FACE;
+ *index = 0;
+ break;
+ case VARYING_SLOT_FOGC:
+ *name = TGSI_SEMANTIC_FOG;
+ *index = 0;
+ break;
+ case VARYING_SLOT_LAYER:
+ *name = TGSI_SEMANTIC_LAYER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PNTC:
+ *name = TGSI_SEMANTIC_PCOORD;
+ *index = 0;
+ break;
+ case VARYING_SLOT_POS:
+ *name = TGSI_SEMANTIC_POSITION;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PRIMITIVE_ID:
+ *name = TGSI_SEMANTIC_PRIMID;
+ *index = 0;
+ break;
+ case VARYING_SLOT_PSIZ:
+ *name = TGSI_SEMANTIC_PSIZE;
+ *index = 0;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_INNER:
+ *name = TGSI_SEMANTIC_TESSINNER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_OUTER:
+ *name = TGSI_SEMANTIC_TESSOUTER;
+ *index = 0;
+ break;
+ case VARYING_SLOT_VIEWPORT:
+ *name = TGSI_SEMANTIC_VIEWPORT_INDEX;
+ *index = 0;
+ break;
+ default:
+ ERROR("unknown varying slot %u\n", slot);
+ assert(false);
+ break;
+ }
+}
+
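// Illustrative sketch, not part of the import: exercising the mapping above on
// two concrete slots. example_varying_mapping() is a hypothetical name; the
// expected results follow directly from the VAR0 and COL1 cases below.
static void example_varying_mapping()
{
   unsigned name, index;
   varying_slot_to_tgsi_semantic(VARYING_SLOT_VAR3, &name, &index);
   // name == TGSI_SEMANTIC_GENERIC, index == 3
   varying_slot_to_tgsi_semantic(VARYING_SLOT_COL1, &name, &index);
   // name == TGSI_SEMANTIC_COLOR, index == 1
}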
+static void
+frag_result_to_tgsi_semantic(unsigned slot, unsigned *name, unsigned *index)
+{
+ if (slot >= FRAG_RESULT_DATA0) {
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = slot - FRAG_RESULT_COLOR - 2; // intentional
+ return;
+ }
+
+ switch (slot) {
+ case FRAG_RESULT_COLOR:
+ *name = TGSI_SEMANTIC_COLOR;
+ *index = 0;
+ break;
+ case FRAG_RESULT_DEPTH:
+ *name = TGSI_SEMANTIC_POSITION;
+ *index = 0;
+ break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ *name = TGSI_SEMANTIC_SAMPLEMASK;
+ *index = 0;
+ break;
+ default:
+ ERROR("unknown frag result slot %u\n", slot);
+ assert(false);
+ break;
+ }
+}
+
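// Illustrative note, not part of the import: the "intentional" offset in the
// FRAG_RESULT_DATA0 branch above appears to rely on FRAG_RESULT_DATA0 being
// FRAG_RESULT_COLOR + 2 in gl_frag_result, so FRAG_RESULT_DATAn maps to
// TGSI_SEMANTIC_COLOR with index n, matching the index 0 used for
// FRAG_RESULT_COLOR itself.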
+// copy of _mesa_sysval_to_semantic
+static void
+system_val_to_tgsi_semantic(unsigned val, unsigned *name, unsigned *index)
+{
+ *index = 0;
+ switch (val) {
+ // Vertex shader
+ case SYSTEM_VALUE_VERTEX_ID:
+ *name = TGSI_SEMANTIC_VERTEXID;
+ break;
+ case SYSTEM_VALUE_INSTANCE_ID:
+ *name = TGSI_SEMANTIC_INSTANCEID;
+ break;
+ case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+ *name = TGSI_SEMANTIC_VERTEXID_NOBASE;
+ break;
+ case SYSTEM_VALUE_BASE_VERTEX:
+ *name = TGSI_SEMANTIC_BASEVERTEX;
+ break;
+ case SYSTEM_VALUE_BASE_INSTANCE:
+ *name = TGSI_SEMANTIC_BASEINSTANCE;
+ break;
+ case SYSTEM_VALUE_DRAW_ID:
+ *name = TGSI_SEMANTIC_DRAWID;
+ break;
+
+ // Geometry shader
+ case SYSTEM_VALUE_INVOCATION_ID:
+ *name = TGSI_SEMANTIC_INVOCATIONID;
+ break;
+
+ // Fragment shader
+ case SYSTEM_VALUE_FRAG_COORD:
+ *name = TGSI_SEMANTIC_POSITION;
+ break;
+ case SYSTEM_VALUE_FRONT_FACE:
+ *name = TGSI_SEMANTIC_FACE;
+ break;
+ case SYSTEM_VALUE_SAMPLE_ID:
+ *name = TGSI_SEMANTIC_SAMPLEID;
+ break;
+ case SYSTEM_VALUE_SAMPLE_POS:
+ *name = TGSI_SEMANTIC_SAMPLEPOS;
+ break;
+ case SYSTEM_VALUE_SAMPLE_MASK_IN:
+ *name = TGSI_SEMANTIC_SAMPLEMASK;
+ break;
+ case SYSTEM_VALUE_HELPER_INVOCATION:
+ *name = TGSI_SEMANTIC_HELPER_INVOCATION;
+ break;
+
+ // Tessellation shader
+ case SYSTEM_VALUE_TESS_COORD:
+ *name = TGSI_SEMANTIC_TESSCOORD;
+ break;
+ case SYSTEM_VALUE_VERTICES_IN:
+ *name = TGSI_SEMANTIC_VERTICESIN;
+ break;
+ case SYSTEM_VALUE_PRIMITIVE_ID:
+ *name = TGSI_SEMANTIC_PRIMID;
+ break;
+ case SYSTEM_VALUE_TESS_LEVEL_OUTER:
+ *name = TGSI_SEMANTIC_TESSOUTER;
+ break;
+ case SYSTEM_VALUE_TESS_LEVEL_INNER:
+ *name = TGSI_SEMANTIC_TESSINNER;
+ break;
+
+ // Compute shader
+ case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
+ *name = TGSI_SEMANTIC_THREAD_ID;
+ break;
+ case SYSTEM_VALUE_WORK_GROUP_ID:
+ *name = TGSI_SEMANTIC_BLOCK_ID;
+ break;
+ case SYSTEM_VALUE_NUM_WORK_GROUPS:
+ *name = TGSI_SEMANTIC_GRID_SIZE;
+ break;
+ case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
+ *name = TGSI_SEMANTIC_BLOCK_SIZE;
+ break;
+
+ // ARB_shader_ballot
+ case SYSTEM_VALUE_SUBGROUP_SIZE:
+ *name = TGSI_SEMANTIC_SUBGROUP_SIZE;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_INVOCATION:
+ *name = TGSI_SEMANTIC_SUBGROUP_INVOCATION;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_EQ_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_GE_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_GE_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_GT_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_GT_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_LE_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_LE_MASK;
+ break;
+ case SYSTEM_VALUE_SUBGROUP_LT_MASK:
+ *name = TGSI_SEMANTIC_SUBGROUP_LT_MASK;
+ break;
+
+ default:
+ ERROR("unknown system value %u\n", val);
+ assert(false);
+ break;
+ }
+}
+
void
Converter::setInterpolate(nv50_ir_varying *var,
uint8_t mode,
@@ -949,7 +1142,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
uint16_t slots;
switch (stage) {
case Program::TYPE_GEOMETRY:
- slots = type->count_attribute_slots(false);
+ slots = type->uniform_locations();
if (input)
slots /= info.gs.vertices_in;
break;
@@ -957,9 +1150,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
case Program::TYPE_TESSELLATION_EVAL:
// remove first dimension
if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL))
- slots = type->count_attribute_slots(false);
+ slots = type->uniform_locations();
else
- slots = type->fields.array->count_attribute_slots(false);
+ slots = type->fields.array->uniform_locations();
break;
default:
slots = type->count_attribute_slots(false);
@@ -969,94 +1162,93 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
return slots;
}
-static uint8_t
-getMaskForType(const glsl_type *type, uint8_t slot) {
- uint16_t comp = type->without_array()->components();
- comp = comp ? comp : 4;
-
- if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
- comp *= 2;
- if (comp > 4) {
- if (slot % 2)
- comp -= 4;
- else
- comp = 4;
- }
- }
-
- return (1 << comp) - 1;
-}
-
bool Converter::assignSlots() {
unsigned name;
unsigned index;
info->io.viewportId = -1;
- info_out->numInputs = 0;
- info_out->numOutputs = 0;
- info_out->numSysVals = 0;
+ info->numInputs = 0;
+ info->numOutputs = 0;
- uint8_t i;
- BITSET_FOREACH_SET(i, nir->info.system_values_read, SYSTEM_VALUE_MAX) {
- info_out->sv[info_out->numSysVals].sn = tgsi_get_sysval_semantic(i);
- info_out->sv[info_out->numSysVals].si = 0;
- info_out->sv[info_out->numSysVals].input = 0; // TODO inferSysValDirection(sn);
+   // fix up the driver locations of image uniforms; arrays take one slot per element
+ unsigned numImages = 0;
+ nir_foreach_variable(var, &nir->uniforms) {
+ const glsl_type *type = var->type;
+ if (!type->without_array()->is_image())
+ continue;
+ var->data.driver_location = numImages;
+ numImages += type->is_array() ? type->arrays_of_arrays_size() : 1;
+ }
+
+ info->numSysVals = 0;
+ for (uint8_t i = 0; i < SYSTEM_VALUE_MAX; ++i) {
+ if (!(nir->info.system_values_read & 1ull << i))
+ continue;
+
+ system_val_to_tgsi_semantic(i, &name, &index);
+ info->sv[info->numSysVals].sn = name;
+ info->sv[info->numSysVals].si = index;
+ info->sv[info->numSysVals].input = 0; // TODO inferSysValDirection(sn);
switch (i) {
case SYSTEM_VALUE_INSTANCE_ID:
- info_out->io.instanceId = info_out->numSysVals;
+ info->io.instanceId = info->numSysVals;
break;
case SYSTEM_VALUE_TESS_LEVEL_INNER:
case SYSTEM_VALUE_TESS_LEVEL_OUTER:
- info_out->sv[info_out->numSysVals].patch = 1;
+ info->sv[info->numSysVals].patch = 1;
break;
case SYSTEM_VALUE_VERTEX_ID:
- info_out->io.vertexId = info_out->numSysVals;
+ info->io.vertexId = info->numSysVals;
break;
default:
break;
}
- info_out->numSysVals += 1;
+ info->numSysVals += 1;
}
if (prog->getType() == Program::TYPE_COMPUTE)
return true;
- nir_foreach_shader_in_variable(var, nir) {
+ nir_foreach_variable(var, &nir->inputs) {
const glsl_type *type = var->type;
int slot = var->data.location;
uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var);
+ uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+ : type->component_slots();
+ uint32_t frac = var->data.location_frac;
uint32_t vary = var->data.driver_location;
+
+ if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+ if (comp > 2)
+ slots *= 2;
+ }
+
assert(vary + slots <= PIPE_MAX_SHADER_INPUTS);
switch(prog->getType()) {
case Program::TYPE_FRAGMENT:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
for (uint16_t i = 0; i < slots; ++i) {
- setInterpolate(&info_out->in[vary + i], var->data.interpolation,
+ setInterpolate(&info->in[vary + i], var->data.interpolation,
var->data.centroid | var->data.sample, name);
}
break;
case Program::TYPE_GEOMETRY:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
break;
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
if (var->data.patch && name == TGSI_SEMANTIC_PATCH)
- info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots);
+ info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
break;
case Program::TYPE_VERTEX:
- if (slot >= VERT_ATTRIB_GENERIC0)
- slot = VERT_ATTRIB_GENERIC0 + vary;
vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index);
switch (name) {
case TGSI_SEMANTIC_EDGEFLAG:
- info_out->io.edgeFlagIn = vary;
+ info->io.edgeFlagIn = vary;
break;
default:
break;
@@ -1068,44 +1260,55 @@ bool Converter::assignSlots() {
}
for (uint16_t i = 0u; i < slots; ++i, ++vary) {
- nv50_ir_varying *v = &info_out->in[vary];
-
- v->patch = var->data.patch;
- v->sn = name;
- v->si = index + i;
- v->mask |= getMaskForType(type, i) << var->data.location_frac;
+ info->in[vary].id = vary;
+ info->in[vary].patch = var->data.patch;
+ info->in[vary].sn = name;
+ info->in[vary].si = index + i;
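+         // 64-bit inputs occupy two 32-bit slots; split the component mask so
+         // the even slot gets the low half and the odd slot the high half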
+ if (glsl_base_type_is_64bit(type->without_array()->base_type))
+ if (i & 0x1)
+ info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+ else
+ info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+ else
+ info->in[vary].mask |= ((1 << comp) - 1) << frac;
}
- info_out->numInputs = std::max<uint8_t>(info_out->numInputs, vary);
+ info->numInputs = std::max<uint8_t>(info->numInputs, vary);
}
- nir_foreach_shader_out_variable(var, nir) {
+ nir_foreach_variable(var, &nir->outputs) {
const glsl_type *type = var->type;
int slot = var->data.location;
uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var);
+ uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+ : type->component_slots();
+ uint32_t frac = var->data.location_frac;
uint32_t vary = var->data.driver_location;
+ if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+ if (comp > 2)
+ slots *= 2;
+ }
+
assert(vary < PIPE_MAX_SHADER_OUTPUTS);
switch(prog->getType()) {
case Program::TYPE_FRAGMENT:
- tgsi_get_gl_frag_result_semantic((gl_frag_result)slot, &name, &index);
+ frag_result_to_tgsi_semantic((gl_frag_result)slot, &name, &index);
switch (name) {
case TGSI_SEMANTIC_COLOR:
if (!var->data.fb_fetch_output)
- info_out->prop.fp.numColourResults++;
- if (var->data.location == FRAG_RESULT_COLOR &&
- nir->info.outputs_written & BITFIELD64_BIT(var->data.location))
- info_out->prop.fp.separateFragData = true;
+ info->prop.fp.numColourResults++;
+ info->prop.fp.separateFragData = true;
// sometimes we get FRAG_RESULT_DATAX with data.index 0
// sometimes we get FRAG_RESULT_DATA0 with data.index X
index = index == 0 ? var->data.index : index;
break;
case TGSI_SEMANTIC_POSITION:
- info_out->io.fragDepth = vary;
- info_out->prop.fp.writesDepth = true;
+ info->io.fragDepth = vary;
+ info->prop.fp.writesDepth = true;
break;
case TGSI_SEMANTIC_SAMPLEMASK:
- info_out->io.sampleMask = vary;
+ info->io.sampleMask = vary;
break;
default:
break;
@@ -1115,22 +1318,21 @@ bool Converter::assignSlots() {
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
case Program::TYPE_VERTEX:
- tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true,
- &name, &index);
+ varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index);
if (var->data.patch && name != TGSI_SEMANTIC_TESSINNER &&
name != TGSI_SEMANTIC_TESSOUTER)
- info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots);
+ info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
switch (name) {
case TGSI_SEMANTIC_CLIPDIST:
- info_out->io.genUserClip = -1;
+ info->io.genUserClip = -1;
break;
case TGSI_SEMANTIC_CLIPVERTEX:
clipVertexOutput = vary;
break;
case TGSI_SEMANTIC_EDGEFLAG:
- info_out->io.edgeFlagOut = vary;
+ info->io.edgeFlagOut = vary;
break;
case TGSI_SEMANTIC_POSITION:
if (clipVertexOutput < 0)
@@ -1146,33 +1348,39 @@ bool Converter::assignSlots() {
}
for (uint16_t i = 0u; i < slots; ++i, ++vary) {
- nv50_ir_varying *v = &info_out->out[vary];
- v->patch = var->data.patch;
- v->sn = name;
- v->si = index + i;
- v->mask |= getMaskForType(type, i) << var->data.location_frac;
+ info->out[vary].id = vary;
+ info->out[vary].patch = var->data.patch;
+ info->out[vary].sn = name;
+ info->out[vary].si = index + i;
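+         // same 64-bit mask splitting as for the inputs above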
+ if (glsl_base_type_is_64bit(type->without_array()->base_type))
+ if (i & 0x1)
+ info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+ else
+ info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+ else
+ info->out[vary].mask |= ((1 << comp) - 1) << frac;
if (nir->info.outputs_read & 1ull << slot)
- v->oread = 1;
+ info->out[vary].oread = 1;
}
- info_out->numOutputs = std::max<uint8_t>(info_out->numOutputs, vary);
+ info->numOutputs = std::max<uint8_t>(info->numOutputs, vary);
}
- if (info_out->io.genUserClip > 0) {
- info_out->io.clipDistances = info_out->io.genUserClip;
+ if (info->io.genUserClip > 0) {
+ info->io.clipDistances = info->io.genUserClip;
- const unsigned int nOut = (info_out->io.genUserClip + 3) / 4;
+ const unsigned int nOut = (info->io.genUserClip + 3) / 4;
for (unsigned int n = 0; n < nOut; ++n) {
- unsigned int i = info_out->numOutputs++;
- info_out->out[i].id = i;
- info_out->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
- info_out->out[i].si = n;
- info_out->out[i].mask = ((1 << info_out->io.clipDistances) - 1) >> (n * 4);
+ unsigned int i = info->numOutputs++;
+ info->out[i].id = i;
+ info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
+ info->out[i].si = n;
+ info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4);
}
}
- return info->assignSlots(info_out) == 0;
+ return info->assignSlots(info) == 0;
}
uint32_t
@@ -1222,7 +1430,7 @@ Converter::getSlotAddress(nir_intrinsic_instr *insn, uint8_t idx, uint8_t slot)
assert(!input || idx < PIPE_MAX_SHADER_INPUTS);
assert(input || idx < PIPE_MAX_SHADER_OUTPUTS);
- const nv50_ir_varying *vary = input ? info_out->in : info_out->out;
+ const nv50_ir_varying *vary = input ? info->in : info->out;
return vary[idx].slot[slot] * 4;
}
@@ -1280,63 +1488,66 @@ Converter::storeTo(nir_intrinsic_instr *insn, DataFile file, operation op,
}
mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address), indirect0,
- split[0])->perPatch = info_out->out[idx].patch;
+ split[0])->perPatch = info->out[idx].patch;
mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address + 4), indirect0,
- split[1])->perPatch = info_out->out[idx].patch;
+ split[1])->perPatch = info->out[idx].patch;
} else {
if (op == OP_EXPORT)
src = mkMov(getSSA(size), src, ty)->getDef(0);
mkStore(op, ty, mkSymbol(file, 0, ty, address), indirect0,
- src)->perPatch = info_out->out[idx].patch;
+ src)->perPatch = info->out[idx].patch;
}
}
bool
Converter::parseNIR()
{
- info_out->bin.tlsSpace = nir->scratch_size;
- info_out->io.clipDistances = nir->info.clip_distance_array_size;
- info_out->io.cullDistances = nir->info.cull_distance_array_size;
- info_out->io.layer_viewport_relative = nir->info.layer_viewport_relative;
+ info->bin.tlsSpace = 0;
+ info->io.clipDistances = nir->info.clip_distance_array_size;
+ info->io.cullDistances = nir->info.cull_distance_array_size;
switch(prog->getType()) {
case Program::TYPE_COMPUTE:
- info->prop.cp.numThreads[0] = nir->info.workgroup_size[0];
- info->prop.cp.numThreads[1] = nir->info.workgroup_size[1];
- info->prop.cp.numThreads[2] = nir->info.workgroup_size[2];
- info_out->bin.smemSize = std::max(info_out->bin.smemSize, nir->info.shared_size);
+ info->prop.cp.numThreads[0] = nir->info.cs.local_size[0];
+ info->prop.cp.numThreads[1] = nir->info.cs.local_size[1];
+ info->prop.cp.numThreads[2] = nir->info.cs.local_size[2];
+ info->bin.smemSize = nir->info.cs.shared_size;
break;
case Program::TYPE_FRAGMENT:
- info_out->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests;
- prog->persampleInvocation =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
- info_out->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
- info_out->prop.fp.readsSampleLocations =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
- info_out->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote;
- info_out->prop.fp.usesSampleMaskIn =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
+ info->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests;
+ info->prop.fp.persampleInvocation =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_ID) ||
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+ info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
+ info->prop.fp.readsSampleLocations =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+ info->prop.fp.usesDiscard = nir->info.fs.uses_discard;
+ info->prop.fp.usesSampleMaskIn =
+ !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN);
break;
case Program::TYPE_GEOMETRY:
- info_out->prop.gp.instanceCount = nir->info.gs.invocations;
- info_out->prop.gp.maxVertices = nir->info.gs.vertices_out;
- info_out->prop.gp.outputPrim = nir->info.gs.output_primitive;
+ info->prop.gp.inputPrim = nir->info.gs.input_primitive;
+ info->prop.gp.instanceCount = nir->info.gs.invocations;
+ info->prop.gp.maxVertices = nir->info.gs.vertices_out;
+ info->prop.gp.outputPrim = nir->info.gs.output_primitive;
break;
case Program::TYPE_TESSELLATION_CONTROL:
case Program::TYPE_TESSELLATION_EVAL:
- info_out->prop.tp.domain = u_tess_prim_from_shader(nir->info.tess._primitive_mode);
- info_out->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
- info_out->prop.tp.outputPrim =
+ if (nir->info.tess.primitive_mode == GL_ISOLINES)
+ info->prop.tp.domain = GL_LINES;
+ else
+ info->prop.tp.domain = nir->info.tess.primitive_mode;
+ info->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
+ info->prop.tp.outputPrim =
nir->info.tess.point_mode ? PIPE_PRIM_POINTS : PIPE_PRIM_TRIANGLES;
- info_out->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
- info_out->prop.tp.winding = !nir->info.tess.ccw;
+ info->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
+ info->prop.tp.winding = !nir->info.tess.ccw;
break;
case Program::TYPE_VERTEX:
- info_out->prop.vp.usesDrawParameters =
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
- BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
+ info->prop.vp.usesDrawParameters =
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX)) ||
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) ||
+ (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID));
break;
default:
break;
@@ -1359,7 +1570,7 @@ Converter::visit(nir_function *function)
setPosition(entry, true);
- if (info_out->io.genUserClip > 0) {
+ if (info->io.genUserClip > 0) {
for (int c = 0; c < 4; ++c)
clipVtx[c] = getScratch();
}
@@ -1381,6 +1592,16 @@ Converter::visit(nir_function *function)
break;
}
+ nir_foreach_register(reg, &function->impl->registers) {
+ if (reg->num_array_elems) {
+ // TODO: packed variables would be nice, but MemoryOpt fails
+ // replace 4 with reg->num_components
+ uint32_t size = 4 * reg->num_array_elems * (reg->bit_size / 8);
+ regToLmemOffset[reg->index] = info->bin.tlsSpace;
+ info->bin.tlsSpace += size;
+ }
+ }
+
nir_index_ssa_defs(function->impl);
foreach_list_typed(nir_cf_node, node, node, &function->impl->body) {
if (!visit(node))
@@ -1392,7 +1613,7 @@ Converter::visit(nir_function *function)
if ((prog->getType() == Program::TYPE_VERTEX ||
prog->getType() == Program::TYPE_TESSELLATION_EVAL)
- && info_out->io.genUserClip > 0)
+ && info->io.genUserClip > 0)
handleUserClipPlanes();
// TODO: for non main function this needs to be a OP_RETURN
@@ -1435,69 +1656,64 @@ Converter::visit(nir_block *block)
bool
Converter::visit(nir_if *nif)
{
- curIfDepth++;
-
DataType sType = getSType(nif->condition, false, false);
Value *src = getSrc(&nif->condition, 0);
nir_block *lastThen = nir_if_last_then_block(nif);
nir_block *lastElse = nir_if_last_else_block(nif);
- BasicBlock *headBB = bb;
+ assert(!lastThen->successors[1]);
+ assert(!lastElse->successors[1]);
+
BasicBlock *ifBB = convert(nir_if_first_then_block(nif));
BasicBlock *elseBB = convert(nir_if_first_else_block(nif));
bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
- bool insertJoins = lastThen->successors[0] == lastElse->successors[0];
+   // we only insert joinats if both branches end up at the end of the if again.
+   // the reasons for this not to happen are breaks/continues/ret/... which
+   // have their own handling
+ if (lastThen->successors[0] == lastElse->successors[0])
+ bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]),
+ CC_ALWAYS, NULL);
+
mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType);
foreach_list_typed(nir_cf_node, node, node, &nif->then_list) {
if (!visit(node))
return false;
}
-
setPosition(convert(lastThen), true);
- if (!bb->isTerminated()) {
+ if (!bb->getExit() ||
+ !bb->getExit()->asFlow() ||
+ bb->getExit()->asFlow()->op == OP_JOIN) {
BasicBlock *tailBB = convert(lastThen->successors[0]);
mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
- } else {
- insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
}
foreach_list_typed(nir_cf_node, node, node, &nif->else_list) {
if (!visit(node))
return false;
}
-
setPosition(convert(lastElse), true);
- if (!bb->isTerminated()) {
+ if (!bb->getExit() ||
+ !bb->getExit()->asFlow() ||
+ bb->getExit()->asFlow()->op == OP_JOIN) {
BasicBlock *tailBB = convert(lastElse->successors[0]);
mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
- } else {
- insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
}
- /* only insert joins for the most outer if */
- if (--curIfDepth)
- insertJoins = false;
-
- /* we made sure that all threads would converge at the same block */
- if (insertJoins) {
- BasicBlock *conv = convert(lastThen->successors[0]);
- setPosition(headBB->getExit(), false);
- headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL);
- setPosition(conv, false);
+ if (lastThen->successors[0] == lastElse->successors[0]) {
+ setPosition(convert(lastThen->successors[0]), true);
mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}
return true;
}
-// TODO: add convergency
bool
Converter::visit(nir_loop *loop)
{
@@ -1505,8 +1721,8 @@ Converter::visit(nir_loop *loop)
func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth);
BasicBlock *loopBB = convert(nir_loop_first_block(loop));
- BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
-
+ BasicBlock *tailBB =
+ convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE);
mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL);
@@ -1517,15 +1733,19 @@ Converter::visit(nir_loop *loop)
if (!visit(node))
return false;
}
-
- if (!bb->isTerminated()) {
- mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
- bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ Instruction *insn = bb->getExit();
+ if (bb->cfg.incidentCount() != 0) {
+ if (!insn || !insn->asFlow()) {
+ mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ } else if (insn && insn->op == OP_BRA && !insn->getPredicate() &&
+ tailBB->cfg.incidentCount() == 0) {
+ // RA doesn't like having blocks around with no incident edge,
+ // so we create a fake one to make it happy
+ bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
+ }
}
- if (tailBB->cfg.incidentCount() == 0)
- loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
-
curLoopDepth -= 1;
return true;
@@ -1539,6 +1759,8 @@ Converter::visit(nir_instr *insn)
switch (insn->type) {
case nir_instr_type_alu:
return visit(nir_instr_as_alu(insn));
+ case nir_instr_type_deref:
+ return visit(nir_instr_as_deref(insn));
case nir_instr_type_intrinsic:
return visit(nir_instr_as_intrinsic(insn));
case nir_instr_type_jump:
@@ -1568,18 +1790,17 @@ Converter::convert(nir_intrinsic_op intr)
return SV_DRAWID;
case nir_intrinsic_load_front_face:
return SV_FACE;
- case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_helper_invocation:
return SV_THREAD_KILL;
case nir_intrinsic_load_instance_id:
return SV_INSTANCE_ID;
case nir_intrinsic_load_invocation_id:
return SV_INVOCATION_ID;
- case nir_intrinsic_load_workgroup_size:
+ case nir_intrinsic_load_local_group_size:
return SV_NTID;
case nir_intrinsic_load_local_invocation_id:
return SV_TID;
- case nir_intrinsic_load_num_workgroups:
+ case nir_intrinsic_load_num_work_groups:
return SV_NCTAID;
case nir_intrinsic_load_patch_vertices_in:
return SV_VERTEX_COUNT;
@@ -1611,10 +1832,8 @@ Converter::convert(nir_intrinsic_op intr)
return SV_TESS_OUTER;
case nir_intrinsic_load_vertex_id:
return SV_VERTEX_ID;
- case nir_intrinsic_load_workgroup_id:
+ case nir_intrinsic_load_work_group_id:
return SV_CTAID;
- case nir_intrinsic_load_work_dim:
- return SV_WORK_DIM;
default:
ERROR("unknown SVSemantic for nir_intrinsic_op %s\n",
nir_intrinsic_infos[intr].name);
@@ -1623,12 +1842,73 @@ Converter::convert(nir_intrinsic_op intr)
}
}
+ImgFormat
+Converter::convertGLImgFormat(GLuint format)
+{
+#define FMT_CASE(a, b) \
+ case GL_ ## a: return nv50_ir::FMT_ ## b
+
+ switch (format) {
+ FMT_CASE(NONE, NONE);
+
+ FMT_CASE(RGBA32F, RGBA32F);
+ FMT_CASE(RGBA16F, RGBA16F);
+ FMT_CASE(RG32F, RG32F);
+ FMT_CASE(RG16F, RG16F);
+ FMT_CASE(R11F_G11F_B10F, R11G11B10F);
+ FMT_CASE(R32F, R32F);
+ FMT_CASE(R16F, R16F);
+
+ FMT_CASE(RGBA32UI, RGBA32UI);
+ FMT_CASE(RGBA16UI, RGBA16UI);
+ FMT_CASE(RGB10_A2UI, RGB10A2UI);
+ FMT_CASE(RGBA8UI, RGBA8UI);
+ FMT_CASE(RG32UI, RG32UI);
+ FMT_CASE(RG16UI, RG16UI);
+ FMT_CASE(RG8UI, RG8UI);
+ FMT_CASE(R32UI, R32UI);
+ FMT_CASE(R16UI, R16UI);
+ FMT_CASE(R8UI, R8UI);
+
+ FMT_CASE(RGBA32I, RGBA32I);
+ FMT_CASE(RGBA16I, RGBA16I);
+ FMT_CASE(RGBA8I, RGBA8I);
+ FMT_CASE(RG32I, RG32I);
+ FMT_CASE(RG16I, RG16I);
+ FMT_CASE(RG8I, RG8I);
+ FMT_CASE(R32I, R32I);
+ FMT_CASE(R16I, R16I);
+ FMT_CASE(R8I, R8I);
+
+ FMT_CASE(RGBA16, RGBA16);
+ FMT_CASE(RGB10_A2, RGB10A2);
+ FMT_CASE(RGBA8, RGBA8);
+ FMT_CASE(RG16, RG16);
+ FMT_CASE(RG8, RG8);
+ FMT_CASE(R16, R16);
+ FMT_CASE(R8, R8);
+
+ FMT_CASE(RGBA16_SNORM, RGBA16_SNORM);
+ FMT_CASE(RGBA8_SNORM, RGBA8_SNORM);
+ FMT_CASE(RG16_SNORM, RG16_SNORM);
+ FMT_CASE(RG8_SNORM, RG8_SNORM);
+ FMT_CASE(R16_SNORM, R16_SNORM);
+ FMT_CASE(R8_SNORM, R8_SNORM);
+
+ FMT_CASE(BGRA_INTEGER, BGRA8);
+ default:
+ ERROR("unknown format %x\n", format);
+ assert(false);
+ return nv50_ir::FMT_NONE;
+ }
+#undef FMT_CASE
+}
+
bool
Converter::visit(nir_intrinsic_instr *insn)
{
nir_intrinsic_op op = insn->intrinsic;
const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op];
- unsigned dest_components = nir_intrinsic_dest_components(insn);
switch (op) {
case nir_intrinsic_load_uniform: {
@@ -1636,7 +1916,7 @@ Converter::visit(nir_intrinsic_instr *insn)
const DataType dType = getDType(insn);
Value *indirect;
uint32_t coffset = getIndirect(insn, 0, 0, indirect);
- for (uint8_t i = 0; i < dest_components; ++i) {
+ for (uint8_t i = 0; i < insn->num_components; ++i) {
loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect);
}
break;
@@ -1647,7 +1927,7 @@ Converter::visit(nir_intrinsic_instr *insn)
DataType dType = getSType(insn->src[0], false, false);
uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
@@ -1655,7 +1935,7 @@ Converter::visit(nir_intrinsic_instr *insn)
Value *src = getSrc(&insn->src[0], i);
switch (prog->getType()) {
case Program::TYPE_FRAGMENT: {
- if (info_out->out[idx].sn == TGSI_SEMANTIC_POSITION) {
+ if (info->out[idx].sn == TGSI_SEMANTIC_POSITION) {
             // TGSI uses a different interface than NIR: TGSI stores the
             // value in the z component, NIR in x
offset += 2;
@@ -1664,9 +1944,8 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
case Program::TYPE_GEOMETRY:
- case Program::TYPE_TESSELLATION_EVAL:
case Program::TYPE_VERTEX: {
- if (info_out->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) {
+ if (info->io.genUserClip > 0 && idx == clipVertexOutput) {
mkMov(clipVtx[i], src);
src = clipVtx[i];
}
@@ -1701,19 +1980,19 @@ Converter::visit(nir_intrinsic_instr *insn)
srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0)));
srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0)));
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
defs.push_back(newDefs[i]);
mask |= 1 << i;
}
TexInstruction *texi = mkTex(OP_TXF, TEX_TARGET_2D_MS_ARRAY, 0, 0, defs, srcs);
- texi->tex.levelZero = true;
+ texi->tex.levelZero = 1;
texi->tex.mask = mask;
texi->tex.useOffsets = 0;
texi->tex.r = 0xffff;
texi->tex.s = 0xffff;
- info_out->prop.fp.readsFramebuffer = true;
+ info->prop.fp.readsFramebuffer = true;
break;
}
@@ -1724,29 +2003,19 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t mode = 0;
uint32_t idx = getIndirect(insn, op == nir_intrinsic_load_interpolated_input ? 1 : 0, 0, indirect);
- nv50_ir_varying& vary = input ? info_out->in[idx] : info_out->out[idx];
+ nv50_ir_varying& vary = input ? info->in[idx] : info->out[idx];
// see load_barycentric_* handling
if (prog->getType() == Program::TYPE_FRAGMENT) {
+ mode = translateInterpMode(&vary, nvirOp);
if (op == nir_intrinsic_load_interpolated_input) {
ImmediateValue immMode;
if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode))
- mode = immMode.reg.data.u32;
- }
- if (mode == NV50_IR_INTERP_DEFAULT)
- mode |= translateInterpMode(&vary, nvirOp);
- else {
- if (vary.linear) {
- nvirOp = OP_LINTERP;
- mode |= NV50_IR_INTERP_LINEAR;
- } else {
- nvirOp = OP_PINTERP;
- mode |= NV50_IR_INTERP_PERSPECTIVE;
- }
+ mode |= immMode.reg.data.u32;
}
}
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address);
if (prog->getType() == Program::TYPE_FRAGMENT) {
@@ -1789,6 +2058,18 @@ Converter::visit(nir_intrinsic_instr *insn)
}
break;
}
+ case nir_intrinsic_load_kernel_input: {
+ assert(prog->getType() == Program::TYPE_COMPUTE);
+ assert(insn->num_components == 1);
+
+ LValues &newDefs = convert(&insn->dest);
+ const DataType dType = getDType(insn);
+ Value *indirect;
+ uint32_t idx = getIndirect(insn, 0, 0, indirect, true);
+
+ mkLoad(dType, newDefs[0], mkSymbol(FILE_SHADER_INPUT, 0, dType, idx), indirect);
+ break;
+ }
case nir_intrinsic_load_barycentric_at_offset:
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_centroid:
@@ -1815,11 +2096,8 @@ Converter::visit(nir_intrinsic_instr *insn)
} else if (op == nir_intrinsic_load_barycentric_pixel) {
mode = NV50_IR_INTERP_DEFAULT;
} else if (op == nir_intrinsic_load_barycentric_at_sample) {
- info_out->prop.fp.readsSampleLocations = true;
- Value *sample = getSSA();
- mkOp3(OP_SELP, TYPE_U32, sample, mkImm(0), getSrc(&insn->src[0], 0), mkImm(0))
- ->subOp = 2;
- mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], sample)->subOp = NV50_IR_SUBOP_PIXLD_OFFSET;
+ info->prop.fp.readsSampleLocations = true;
+ mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], getSrc(&insn->src[0], 0))->subOp = NV50_IR_SUBOP_PIXLD_OFFSET;
mode = NV50_IR_INTERP_OFFSET;
} else {
unreachable("all intrinsics already handled above");
@@ -1828,11 +2106,9 @@ Converter::visit(nir_intrinsic_instr *insn)
loadImm(newDefs[1], mode);
break;
}
- case nir_intrinsic_demote:
case nir_intrinsic_discard:
mkOp(OP_DISCARD, TYPE_NONE, NULL);
break;
- case nir_intrinsic_demote_if:
case nir_intrinsic_discard_if: {
Value *pred = getSSA(1, FILE_PREDICATE);
if (insn->num_components > 1) {
@@ -1848,13 +2124,12 @@ Converter::visit(nir_intrinsic_instr *insn)
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_draw_id:
case nir_intrinsic_load_front_face:
- case nir_intrinsic_is_helper_invocation:
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_invocation_id:
- case nir_intrinsic_load_workgroup_size:
+ case nir_intrinsic_load_local_group_size:
case nir_intrinsic_load_local_invocation_id:
- case nir_intrinsic_load_num_workgroups:
+ case nir_intrinsic_load_num_work_groups:
case nir_intrinsic_load_patch_vertices_in:
case nir_intrinsic_load_primitive_id:
case nir_intrinsic_load_sample_id:
@@ -1870,13 +2145,12 @@ Converter::visit(nir_intrinsic_instr *insn)
case nir_intrinsic_load_tess_level_inner:
case nir_intrinsic_load_tess_level_outer:
case nir_intrinsic_load_vertex_id:
- case nir_intrinsic_load_workgroup_id:
- case nir_intrinsic_load_work_dim: {
+ case nir_intrinsic_load_work_group_id: {
const DataType dType = getDType(insn);
SVSemantic sv = convert(op);
LValues &newDefs = convert(&insn->dest);
- for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
Value *def;
if (typeSizeof(dType) == 8)
def = getSSA();
@@ -1928,12 +2202,12 @@ Converter::visit(nir_intrinsic_instr *insn)
if (op == nir_intrinsic_read_first_invocation) {
mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY;
- mkOp1(OP_BREV, TYPE_U32, tmp, tmp);
+ mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
} else
tmp = getSrc(&insn->src[1], 0);
- for (uint8_t i = 0; i < dest_components; ++i) {
+ for (uint8_t i = 0; i < insn->num_components; ++i) {
mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f))
->subOp = NV50_IR_SUBOP_SHFL_IDX;
}
@@ -1949,10 +2223,10 @@ Converter::visit(nir_intrinsic_instr *insn)
Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS),
mkImm(baseVertex), indirectVertex);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0,
- indirectOffset, vtxBase, info_out->in[idx].patch);
+ indirectOffset, vtxBase, info->in[idx].patch);
}
break;
}
@@ -1972,24 +2246,19 @@ Converter::visit(nir_intrinsic_instr *insn)
vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
uint32_t address = getSlotAddress(insn, idx, i);
loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0,
- indirectOffset, vtxBase, info_out->in[idx].patch);
+ indirectOffset, vtxBase, info->in[idx].patch);
}
break;
}
- case nir_intrinsic_emit_vertex: {
- if (info_out->io.genUserClip > 0)
+ case nir_intrinsic_emit_vertex:
+ if (info->io.genUserClip > 0)
handleUserClipPlanes();
- uint32_t idx = nir_intrinsic_stream_id(insn);
- mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
- break;
- }
+ // fallthrough
case nir_intrinsic_end_primitive: {
uint32_t idx = nir_intrinsic_stream_id(insn);
- if (idx)
- break;
mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
break;
}
@@ -2001,13 +2270,13 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1;
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i,
indirectOffset, indirectIndex);
}
break;
}
- case nir_intrinsic_get_ssbo_size: {
+ case nir_intrinsic_get_buffer_size: {
LValues &newDefs = convert(&insn->dest);
const DataType dType = getDType(insn);
Value *indirectBuffer;
@@ -2024,7 +2293,7 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer);
uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType,
@@ -2032,7 +2301,7 @@ Converter::visit(nir_intrinsic_instr *insn)
mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i))
->setIndirect(0, 1, indirectBuffer);
}
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
case nir_intrinsic_load_ssbo: {
@@ -2043,15 +2312,14 @@ Converter::visit(nir_intrinsic_instr *insn)
uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer);
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i)
+ for (uint8_t i = 0u; i < insn->num_components; ++i)
loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i,
indirectOffset, indirectBuffer);
- info_out->io.globalAccess |= 0x1;
+ info->io.globalAccess |= 0x1;
break;
}
case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_fadd:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_shared_atomic_exchange:
@@ -2074,7 +2342,6 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_fadd:
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_ssbo_atomic_exchange:
@@ -2100,72 +2367,23 @@ Converter::visit(nir_intrinsic_instr *insn)
atom->setIndirect(0, 1, indirectBuffer);
atom->subOp = getSubOp(op);
- info_out->io.globalAccess |= 0x2;
- break;
- }
- case nir_intrinsic_global_atomic_add:
- case nir_intrinsic_global_atomic_fadd:
- case nir_intrinsic_global_atomic_and:
- case nir_intrinsic_global_atomic_comp_swap:
- case nir_intrinsic_global_atomic_exchange:
- case nir_intrinsic_global_atomic_or:
- case nir_intrinsic_global_atomic_imax:
- case nir_intrinsic_global_atomic_imin:
- case nir_intrinsic_global_atomic_umax:
- case nir_intrinsic_global_atomic_umin:
- case nir_intrinsic_global_atomic_xor: {
- const DataType dType = getDType(insn);
- LValues &newDefs = convert(&insn->dest);
- Value *address;
- uint32_t offset = getIndirect(&insn->src[0], 0, address);
-
- Symbol *sym = mkSymbol(FILE_MEMORY_GLOBAL, 0, dType, offset);
- Instruction *atom =
- mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0));
- if (op == nir_intrinsic_global_atomic_comp_swap)
- atom->setSrc(2, getSrc(&insn->src[2], 0));
- atom->setIndirect(0, 0, address);
- atom->subOp = getSubOp(op);
-
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_bindless_image_atomic_fadd:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_samples:
case nir_intrinsic_bindless_image_size:
- case nir_intrinsic_bindless_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_comp_swap:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- case nir_intrinsic_image_load:
- case nir_intrinsic_image_samples:
- case nir_intrinsic_image_size:
- case nir_intrinsic_image_store: {
+ case nir_intrinsic_bindless_image_store: {
std::vector<Value*> srcs, defs;
- Value *indirect;
+ Value *indirect = getSrc(&insn->src[0], 0);
DataType ty;
uint32_t mask = 0;
@@ -2182,84 +2400,40 @@ Converter::visit(nir_intrinsic_instr *insn)
}
}
- int lod_src = -1;
- bool bindless = false;
switch (op) {
case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_bindless_image_atomic_fadd:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_max:
+ case nir_intrinsic_bindless_image_atomic_min:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_inc_wrap:
- case nir_intrinsic_bindless_image_atomic_dec_wrap:
ty = getDType(insn);
- bindless = true;
- info_out->io.globalAccess |= 0x2;
- mask = 0x1;
- break;
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_fadd:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_comp_swap:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_inc_wrap:
- case nir_intrinsic_image_atomic_dec_wrap:
- ty = getDType(insn);
- bindless = false;
- info_out->io.globalAccess |= 0x2;
mask = 0x1;
+ info->io.globalAccess |= 0x2;
break;
case nir_intrinsic_bindless_image_load:
- case nir_intrinsic_image_load:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_load;
- info_out->io.globalAccess |= 0x1;
- lod_src = 4;
+ info->io.globalAccess |= 0x1;
break;
case nir_intrinsic_bindless_image_store:
- case nir_intrinsic_image_store:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_store;
- info_out->io.globalAccess |= 0x2;
- lod_src = 5;
mask = 0xf;
+ info->io.globalAccess |= 0x2;
break;
case nir_intrinsic_bindless_image_samples:
mask = 0x8;
- FALLTHROUGH;
- case nir_intrinsic_image_samples:
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_samples;
- mask = 0x8;
break;
case nir_intrinsic_bindless_image_size:
- case nir_intrinsic_image_size:
- assert(nir_src_as_uint(insn->src[1]) == 0);
ty = TYPE_U32;
- bindless = op == nir_intrinsic_bindless_image_size;
break;
default:
unreachable("unhandled image opcode");
break;
}
- if (bindless)
- indirect = getSrc(&insn->src[0], 0);
- else
- location = getIndirect(&insn->src[0], 0, indirect);
-
// coords
if (opInfo.num_srcs >= 2)
for (unsigned int i = 0u; i < argCount; ++i)
@@ -2269,21 +2443,22 @@ Converter::visit(nir_intrinsic_instr *insn)
if (opInfo.num_srcs >= 3 && target.isMS())
srcs.push_back(getSrc(&insn->src[2], 0));
- if (opInfo.num_srcs >= 4 && lod_src != 4) {
+ if (opInfo.num_srcs >= 4) {
unsigned components = opInfo.src_components[3] ? opInfo.src_components[3] : insn->num_components;
for (uint8_t i = 0u; i < components; ++i)
srcs.push_back(getSrc(&insn->src[3], i));
}
- if (opInfo.num_srcs >= 5 && lod_src != 5)
+ if (opInfo.num_srcs >= 5)
          // 1 for atomic swap
for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i)
srcs.push_back(getSrc(&insn->src[4], i));
TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs);
- texi->tex.bindless = bindless;
- texi->tex.format = nv50_ir::TexInstruction::translateImgFormat(nir_intrinsic_format(insn));
+ texi->tex.bindless = false;
+ texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(nir_intrinsic_format(insn))];
texi->tex.mask = mask;
+ texi->tex.bindless = true;
texi->cache = convert(nir_intrinsic_access(insn));
texi->setType(ty);
texi->subOp = getSubOp(op);
@@ -2293,40 +2468,134 @@ Converter::visit(nir_intrinsic_instr *insn)
break;
}
- case nir_intrinsic_store_scratch:
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_samples:
+ case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_store: {
+ const nir_variable *tex;
+ std::vector<Value*> srcs, defs;
+ Value *indirect;
+ DataType ty;
+
+ uint32_t mask = 0;
+ nir_deref_instr *deref = nir_src_as_deref(insn->src[0]);
+ const glsl_type *type = deref->type;
+ TexInstruction::Target target =
+ convert((glsl_sampler_dim)type->sampler_dimensionality,
+ type->sampler_array, type->sampler_shadow);
+ unsigned int argCount = getNIRArgCount(target);
+ uint16_t location = handleDeref(deref, indirect, tex);
+
+ if (opInfo.has_dest) {
+ LValues &newDefs = convert(&insn->dest);
+ for (uint8_t i = 0u; i < newDefs.size(); ++i) {
+ defs.push_back(newDefs[i]);
+ mask |= 1 << i;
+ }
+ }
+
+ switch (op) {
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ ty = getDType(insn);
+ mask = 0x1;
+ info->io.globalAccess |= 0x2;
+ break;
+ case nir_intrinsic_image_deref_load:
+ ty = TYPE_U32;
+ info->io.globalAccess |= 0x1;
+ break;
+ case nir_intrinsic_image_deref_store:
+ ty = TYPE_U32;
+ mask = 0xf;
+ info->io.globalAccess |= 0x2;
+ break;
+ case nir_intrinsic_image_deref_samples:
+ mask = 0x8;
+ ty = TYPE_U32;
+ break;
+ case nir_intrinsic_image_deref_size:
+ ty = TYPE_U32;
+ break;
+ default:
+ unreachable("unhandled image opcode");
+ break;
+ }
+
+ // coords
+ if (opInfo.num_srcs >= 2)
+ for (unsigned int i = 0u; i < argCount; ++i)
+ srcs.push_back(getSrc(&insn->src[1], i));
+
+ // the sampler is just another src added after coords
+ if (opInfo.num_srcs >= 3 && target.isMS())
+ srcs.push_back(getSrc(&insn->src[2], 0));
+
+ if (opInfo.num_srcs >= 4) {
+ unsigned components = opInfo.src_components[3] ? opInfo.src_components[3] : insn->num_components;
+ for (uint8_t i = 0u; i < components; ++i)
+ srcs.push_back(getSrc(&insn->src[3], i));
+ }
+
+ if (opInfo.num_srcs >= 5)
+      // 1 for atomic swap
+ for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i)
+ srcs.push_back(getSrc(&insn->src[4], i));
+
+ TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs);
+ texi->tex.bindless = false;
+ texi->tex.format = &nv50_ir::TexInstruction::formatTable[convertGLImgFormat(tex->data.image.format)];
+ texi->tex.mask = mask;
+ texi->cache = getCacheModeFromVar(tex);
+ texi->setType(ty);
+ texi->subOp = getSubOp(op);
+
+ if (indirect)
+ texi->setIndirectR(indirect);
+
+ break;
+ }
case nir_intrinsic_store_shared: {
DataType sType = getSType(insn->src[0], false, false);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- if (indirectOffset)
- indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset);
- for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (uint8_t i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
- Symbol *sym = mkSymbol(getFile(op), 0, sType, offset + i * typeSizeof(sType));
+ Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType));
mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i));
}
break;
}
- case nir_intrinsic_load_kernel_input:
- case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shared: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- if (indirectOffset)
- indirectOffset = mkOp1v(OP_MOV, TYPE_U32, getSSA(4, FILE_ADDRESS), indirectOffset);
- for (uint8_t i = 0u; i < dest_components; ++i)
- loadFrom(getFile(op), 0, dType, newDefs[i], offset, i, indirectOffset);
+ for (uint8_t i = 0u; i < insn->num_components; ++i)
+ loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset);
break;
}
- case nir_intrinsic_control_barrier: {
+ case nir_intrinsic_barrier: {
// TODO: add flag to shader_info
- info_out->numBarriers = 1;
+ info->numBarriers = 1;
Instruction *bar = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0));
bar->fixed = 1;
bar->subOp = NV50_IR_SUBOP_BAR_SYNC;
@@ -2334,6 +2603,7 @@ Converter::visit(nir_intrinsic_instr *insn)
}
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared: {
@@ -2342,8 +2612,6 @@ Converter::visit(nir_intrinsic_instr *insn)
bar->subOp = getSubOp(op);
break;
}
- case nir_intrinsic_memory_barrier_tcs_patch:
- break;
case nir_intrinsic_shader_clock: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
@@ -2352,23 +2620,22 @@ Converter::visit(nir_intrinsic_instr *insn)
mkOp1(OP_RDSV, dType, newDefs[1], mkSysVal(SV_CLOCK, 0))->fixed = 1;
break;
}
- case nir_intrinsic_load_global:
- case nir_intrinsic_load_global_constant: {
+ case nir_intrinsic_load_global: {
const DataType dType = getDType(insn);
LValues &newDefs = convert(&insn->dest);
Value *indirectOffset;
uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- for (auto i = 0u; i < dest_components; ++i)
+ for (auto i = 0u; i < insn->num_components; ++i)
loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset);
- info_out->io.globalAccess |= 0x1;
+ info->io.globalAccess |= 0x1;
break;
}
case nir_intrinsic_store_global: {
DataType sType = getSType(insn->src[0], false, false);
- for (auto i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+ for (auto i = 0u; i < insn->num_components; ++i) {
if (!((1u << i) & nir_intrinsic_write_mask(insn)))
continue;
if (typeSizeof(sType) == 8) {
@@ -2386,7 +2653,7 @@ Converter::visit(nir_intrinsic_instr *insn)
}
}
- info_out->io.globalAccess |= 0x2;
+ info->io.globalAccess |= 0x2;
break;
}
default:
@@ -2410,6 +2677,7 @@ Converter::visit(nir_jump_instr *insn)
case nir_jump_continue: {
bool isBreak = insn->type == nir_jump_break;
nir_block *block = insn->instr.block;
+ assert(!block->successors[1]);
BasicBlock *target = convert(block->successors[0]);
mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL);
bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK);
@@ -2529,6 +2797,8 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_ushr:
case nir_op_fsin:
case nir_op_fsqrt:
+ case nir_op_fsub:
+ case nir_op_isub:
case nir_op_ftrunc:
case nir_op_ishl:
case nir_op_ixor: {
@@ -2600,7 +2870,7 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_flt32:
case nir_op_ilt32:
case nir_op_ult32:
- case nir_op_fneu32:
+ case nir_op_fne32:
case nir_op_ine32: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
@@ -2616,12 +2886,58 @@ Converter::visit(nir_alu_instr *insn)
i->sType = sTypes[0];
break;
}
+ // those are weird ALU ops and need special handling, because
+   // 1. they are always component based
+ // 2. they basically just merge multiple values into one data type
case nir_op_mov:
+ if (!insn->dest.dest.is_ssa && insn->dest.dest.reg.reg->num_array_elems) {
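+         // movs into array registers are lowered to local-memory stores at the
+         // offset reserved in regToLmemOffset when visiting the function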
+ nir_reg_dest& reg = insn->dest.dest.reg;
+ uint32_t goffset = regToLmemOffset[reg.reg->index];
+ uint8_t comps = reg.reg->num_components;
+ uint8_t size = reg.reg->bit_size / 8;
+ uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size;
+ uint32_t aoffset = csize * reg.base_offset;
+ Value *indirect = NULL;
+
+ if (reg.indirect)
+ indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS),
+ getSrc(reg.indirect, 0), mkImm(csize));
+
+ for (uint8_t i = 0u; i < comps; ++i) {
+ if (!((1u << i) & insn->dest.write_mask))
+ continue;
+
+ Symbol *sym = mkSymbol(FILE_MEMORY_LOCAL, 0, dType, goffset + aoffset + i * size);
+ mkStore(OP_STORE, dType, sym, indirect, getSrc(&insn->src[0], i));
+ }
+ break;
+ } else if (!insn->src[0].src.is_ssa && insn->src[0].src.reg.reg->num_array_elems) {
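+         // reads from array registers become loads from the same local-memory slot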
+ LValues &newDefs = convert(&insn->dest);
+ nir_reg_src& reg = insn->src[0].src.reg;
+ uint32_t goffset = regToLmemOffset[reg.reg->index];
+ // uint8_t comps = reg.reg->num_components;
+ uint8_t size = reg.reg->bit_size / 8;
+ uint8_t csize = 4 * size; // TODO after fixing MemoryOpts: comps * size;
+ uint32_t aoffset = csize * reg.base_offset;
+ Value *indirect = NULL;
+
+ if (reg.indirect)
+ indirect = mkOp2v(OP_MUL, TYPE_U32, getSSA(4, FILE_ADDRESS), getSrc(reg.indirect, 0), mkImm(csize));
+
+ for (uint8_t i = 0u; i < newDefs.size(); ++i)
+ loadFrom(FILE_MEMORY_LOCAL, 0, dType, newDefs[i], goffset + aoffset, i, indirect);
+
+ break;
+ } else {
+ LValues &newDefs = convert(&insn->dest);
+ for (LValues::size_type c = 0u; c < newDefs.size(); ++c) {
+ mkMov(newDefs[c], getSrc(&insn->src[0], c), dType);
+ }
+ }
+ break;
case nir_op_vec2:
case nir_op_vec3:
- case nir_op_vec4:
- case nir_op_vec8:
- case nir_op_vec16: {
+ case nir_op_vec4: {
LValues &newDefs = convert(&insn->dest);
for (LValues::size_type c = 0u; c < newDefs.size(); ++c) {
mkMov(newDefs[c], getSrc(&insn->src[c]), dType);
@@ -2717,7 +3033,7 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_bfm: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
- mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W;
+ mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1]));
break;
}
case nir_op_bitfield_insert: {
@@ -2737,69 +3053,17 @@ Converter::visit(nir_alu_instr *insn)
case nir_op_bitfield_reverse: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
- mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0]));
+ mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
break;
}
case nir_op_find_lsb: {
DEFAULT_CHECKS;
LValues &newDefs = convert(&insn->dest);
Value *tmp = getSSA();
- mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0]));
+ mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
break;
}
- case nir_op_extract_u8: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_i8: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_u16: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_extract_i16: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- Value *prmt = getSSA();
- mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910));
- mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
- break;
- }
- case nir_op_urol: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
- getSrc(&insn->src[1]), getSrc(&insn->src[0]))
- ->subOp = NV50_IR_SUBOP_SHF_L |
- NV50_IR_SUBOP_SHF_W |
- NV50_IR_SUBOP_SHF_HI;
- break;
- }
- case nir_op_uror: {
- DEFAULT_CHECKS;
- LValues &newDefs = convert(&insn->dest);
- mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
- getSrc(&insn->src[1]), getSrc(&insn->src[0]))
- ->subOp = NV50_IR_SUBOP_SHF_R |
- NV50_IR_SUBOP_SHF_W |
- NV50_IR_SUBOP_SHF_LO;
- break;
- }
// boolean conversions
case nir_op_b2f32: {
DEFAULT_CHECKS;
@@ -2845,7 +3109,6 @@ Converter::visit(nir_alu_instr *insn)
}
default:
ERROR("unknown nir_op %s\n", info.name);
- assert(false);
return false;
}
@@ -2917,6 +3180,14 @@ Converter::convert(glsl_sampler_dim dim, bool isArray, bool isShadow)
}
#undef CASE_SAMPLER
+Value*
+Converter::applyProjection(Value *src, Value *proj)
+{
+ if (!proj)
+ return src;
+ return mkOp2v(OP_MUL, TYPE_F32, getScratch(), src, proj);
+}
+
unsigned int
Converter::getNIRArgCount(TexInstruction::Target& target)
{
@@ -2928,14 +3199,70 @@ Converter::getNIRArgCount(TexInstruction::Target& target)
return result;
}
+uint16_t
+Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_variable * &tex)
+{
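+   // walk the deref chain back to the variable, adding constant array/struct
+   // offsets into the result and collecting stride/index pairs that are folded
+   // into a single indirect value afterwards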
+ typedef std::pair<uint32_t,Value*> DerefPair;
+ std::list<DerefPair> derefs;
+
+ uint16_t result = 0;
+ while (deref->deref_type != nir_deref_type_var) {
+ switch (deref->deref_type) {
+ case nir_deref_type_array: {
+ Value *indirect;
+ uint8_t size = type_size(deref->type, true);
+ result += size * getIndirect(&deref->arr.index, 0, indirect);
+
+ if (indirect) {
+ derefs.push_front(std::make_pair(size, indirect));
+ }
+
+ break;
+ }
+ case nir_deref_type_struct: {
+ result += nir_deref_instr_parent(deref)->type->struct_location_offset(deref->strct.index);
+ break;
+ }
+ case nir_deref_type_var:
+ default:
+ unreachable("nir_deref_type_var reached in handleDeref!");
+ break;
+ }
+ deref = nir_deref_instr_parent(deref);
+ }
+
+ indirect = NULL;
+ for (std::list<DerefPair>::const_iterator it = derefs.begin(); it != derefs.end(); ++it) {
+ Value *offset = mkOp2v(OP_MUL, TYPE_U32, getSSA(), loadImm(getSSA(), it->first), it->second);
+ if (indirect)
+ indirect = mkOp2v(OP_ADD, TYPE_U32, getSSA(), indirect, offset);
+ else
+ indirect = offset;
+ }
+
+ tex = nir_deref_instr_get_variable(deref);
+ assert(tex);
+
+ return result + tex->data.driver_location;
+}
+
CacheMode
Converter::convert(enum gl_access_qualifier access)
{
- if (access & ACCESS_VOLATILE)
+ switch (access) {
+ case ACCESS_VOLATILE:
return CACHE_CV;
- if (access & ACCESS_COHERENT)
+ case ACCESS_COHERENT:
return CACHE_CG;
- return CACHE_CA;
+ default:
+ return CACHE_CA;
+ }
+}
+
+CacheMode
+Converter::getCacheModeFromVar(const nir_variable *var)
+{
+ return convert(var->data.image.access);
}
bool
@@ -2959,6 +3286,7 @@ Converter::visit(nir_tex_instr *insn)
std::vector<nir_src*> offsets;
uint8_t mask = 0;
bool lz = false;
+ Value *proj = NULL;
TexInstruction::Target target = convert(insn->sampler_dim, insn->is_array, insn->is_shadow);
operation op = getOperation(insn->op);
@@ -2971,6 +3299,7 @@ Converter::visit(nir_tex_instr *insn)
int msIdx = nir_tex_instr_src_index(insn, nir_tex_src_ms_index);
int lodIdx = nir_tex_instr_src_index(insn, nir_tex_src_lod);
int offsetIdx = nir_tex_instr_src_index(insn, nir_tex_src_offset);
+ int projIdx = nir_tex_instr_src_index(insn, nir_tex_src_projector);
int sampOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_offset);
int texOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_texture_offset);
int sampHandleIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_handle);
@@ -2979,9 +3308,12 @@ Converter::visit(nir_tex_instr *insn)
bool bindless = sampHandleIdx != -1 || texHandleIdx != -1;
assert((sampHandleIdx != -1) == (texHandleIdx != -1));
+ if (projIdx != -1)
+ proj = mkOp1v(OP_RCP, TYPE_F32, getScratch(), getSrc(&insn->src[projIdx].src, 0));
+
srcs.resize(insn->coord_components);
for (uint8_t i = 0u; i < insn->coord_components; ++i)
- srcs[i] = getSrc(&insn->src[coordsIdx].src, i);
+ srcs[i] = applyProjection(getSrc(&insn->src[coordsIdx].src, i), proj);
   // sometimes we get fewer args than target.getArgCount, but codegen expects the latter
if (insn->coord_components) {
@@ -3009,7 +3341,7 @@ Converter::visit(nir_tex_instr *insn)
if (offsetIdx != -1)
offsets.push_back(&insn->src[offsetIdx].src);
if (compIdx != -1)
- srcs.push_back(getSrc(&insn->src[compIdx].src, 0));
+ srcs.push_back(applyProjection(getSrc(&insn->src[compIdx].src, 0), proj));
if (texOffIdx != -1) {
srcs.push_back(getSrc(&insn->src[texOffIdx].src, 0));
texOffIdx = srcs.size() - 1;
@@ -3111,34 +3443,23 @@ Converter::visit(nir_tex_instr *insn)
return true;
}
-/* nouveau's RA doesn't track the liveness of exported registers in the fragment
- * shader, so we need all the store_outputs to appear at the end of the shader
- * with no other instructions that might generate a temp value in between them.
- */
-static void
-nv_nir_move_stores_to_end(nir_shader *s)
+bool
+Converter::visit(nir_deref_instr *deref)
{
- nir_function_impl *impl = nir_shader_get_entrypoint(s);
- nir_block *block = nir_impl_last_block(impl);
- nir_instr *first_store = NULL;
-
- nir_foreach_instr_safe(instr, block) {
- if (instr == first_store)
- break;
- if (instr->type != nir_instr_type_intrinsic)
- continue;
- nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic == nir_intrinsic_store_output) {
- nir_instr_remove(instr);
- nir_instr_insert(nir_after_block(block), instr);
-
- if (!first_store)
- first_store = instr;
- }
+   // We just ignore these, because image intrinsics are the only place where
+   // we should end up with deref sources, and those have to backtrack anyway
+   // to get the nir_variable. This code just exists to handle some special
+   // cases.
+ switch (deref->deref_type) {
+ case nir_deref_type_array:
+ case nir_deref_type_struct:
+ case nir_deref_type_var:
+ break;
+ default:
+ ERROR("unknown nir_deref_instr %u\n", deref->deref_type);
+ return false;
}
- nir_metadata_preserve(impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ return true;
}
bool
@@ -3149,44 +3470,18 @@ Converter::run()
if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
nir_print_shader(nir, stderr);
- struct nir_lower_subgroups_options subgroup_options = {};
- subgroup_options.subgroup_size = 32;
- subgroup_options.ballot_bit_size = 32;
- subgroup_options.ballot_components = 1;
- subgroup_options.lower_elect = true;
-
- /* prepare for IO lowering */
- NIR_PASS_V(nir, nir_opt_deref);
- NIR_PASS_V(nir, nir_lower_regs_to_ssa);
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- /* codegen assumes vec4 alignment for memory */
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, function_temp_type_info);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset);
- NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
-
- NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size, (nir_lower_io_options)0);
+ struct nir_lower_subgroups_options subgroup_options = {
+ .subgroup_size = 32,
+ .ballot_bit_size = 32,
+ };
+ NIR_PASS_V(nir, nir_lower_io, nir_var_all, type_size, (nir_lower_io_options)0);
NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options);
-
- struct nir_lower_tex_options tex_options = {};
- tex_options.lower_txp = ~0;
-
- NIR_PASS_V(nir, nir_lower_tex, &tex_options);
-
+ NIR_PASS_V(nir, nir_lower_regs_to_ssa);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
- NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
- NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
-
- /*TODO: improve this lowering/optimisation loop so that we can use
- * nir_opt_idiv_const effectively before this.
- */
- nir_lower_idiv_options idiv_options = {
- .imprecise_32bit_lowering = false,
- .allow_fp16 = true,
- };
- NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL);
+ NIR_PASS_V(nir, nir_lower_phis_to_scalar);
do {
progress = false;
@@ -3199,21 +3494,11 @@ Converter::run()
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_lower_64bit_phis);
} while (progress);
- nir_move_options move_options =
- (nir_move_options)(nir_move_const_undef |
- nir_move_load_ubo |
- nir_move_load_uniform |
- nir_move_load_input);
- NIR_PASS_V(nir, nir_opt_sink, move_options);
- NIR_PASS_V(nir, nir_opt_move, move_options);
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT)
- NIR_PASS_V(nir, nv_nir_move_stores_to_end);
-
NIR_PASS_V(nir, nir_lower_bool_to_int32);
+ NIR_PASS_V(nir, nir_lower_locals_to_regs);
+ NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
NIR_PASS_V(nir, nir_convert_from_ssa, true);
// Garbage collect dead instructions
@@ -3245,150 +3530,17 @@ Converter::run()
namespace nv50_ir {
bool
-Program::makeFromNIR(struct nv50_ir_prog_info *info,
- struct nv50_ir_prog_info_out *info_out)
+Program::makeFromNIR(struct nv50_ir_prog_info *info)
{
nir_shader *nir = (nir_shader*)info->bin.source;
- Converter converter(this, nir, info, info_out);
+ Converter converter(this, nir, info);
bool result = converter.run();
if (!result)
return result;
LoweringHelper lowering;
lowering.run(this);
- tlsSize = info_out->bin.tlsSpace;
+ tlsSize = info->bin.tlsSpace;
return result;
}
} // namespace nv50_ir
-
-static nir_shader_compiler_options
-nvir_nir_shader_compiler_options(int chipset)
-{
- nir_shader_compiler_options op = {};
- op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_ffma16 = false;
- op.lower_ffma32 = false;
- op.lower_ffma64 = false;
- op.fuse_ffma16 = false; /* nir doesn't track mad vs fma */
- op.fuse_ffma32 = false; /* nir doesn't track mad vs fma */
- op.fuse_ffma64 = false; /* nir doesn't track mad vs fma */
- op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_flrp32 = true;
- op.lower_flrp64 = true;
- op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it
- op.lower_fsat = false;
- op.lower_fsqrt = false; // TODO: only before gm200
- op.lower_sincos = false;
- op.lower_fmod = true;
- op.lower_bitfield_extract = false;
- op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_bitfield_insert = false;
- op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_bitfield_insert_to_bitfield_select = false;
- op.lower_bitfield_reverse = false;
- op.lower_bit_count = false;
- op.lower_ifind_msb = false;
- op.lower_find_lsb = false;
- op.lower_uadd_carry = true; // TODO
- op.lower_usub_borrow = true; // TODO
- op.lower_mul_high = false;
- op.lower_fneg = false;
- op.lower_ineg = false;
- op.lower_scmp = true; // TODO: not implemented yet
- op.lower_vector_cmp = false;
- op.lower_bitops = false;
- op.lower_isign = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET);
- op.lower_fdph = false;
- op.lower_fdot = false;
- op.fdot_replicates = false; // TODO
- op.lower_ffloor = false; // TODO
- op.lower_ffract = true;
- op.lower_fceil = false; // TODO
- op.lower_ftrunc = false;
- op.lower_ldexp = true;
- op.lower_pack_half_2x16 = true;
- op.lower_pack_unorm_2x16 = true;
- op.lower_pack_snorm_2x16 = true;
- op.lower_pack_unorm_4x8 = true;
- op.lower_pack_snorm_4x8 = true;
- op.lower_unpack_half_2x16 = true;
- op.lower_unpack_unorm_2x16 = true;
- op.lower_unpack_snorm_2x16 = true;
- op.lower_unpack_unorm_4x8 = true;
- op.lower_unpack_snorm_4x8 = true;
- op.lower_pack_split = false;
- op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET);
- op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET);
- op.lower_insert_byte = true;
- op.lower_insert_word = true;
- op.lower_all_io_to_temps = false;
- op.lower_all_io_to_elements = false;
- op.vertex_id_zero_based = false;
- op.lower_base_vertex = false;
- op.lower_helper_invocation = false;
- op.optimize_sample_mask_in = false;
- op.lower_cs_local_index_to_id = true;
- op.lower_cs_local_id_to_index = false;
- op.lower_device_index_to_zero = false; // TODO
- op.lower_wpos_pntc = false; // TODO
- op.lower_hadd = true; // TODO
- op.lower_uadd_sat = true; // TODO
- op.lower_iadd_sat = true; // TODO
- op.vectorize_io = false;
- op.lower_to_scalar = false;
- op.unify_interfaces = false;
- op.use_interpolated_input_intrinsics = true;
- op.lower_mul_2x32_64 = true; // TODO
- op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
- op.has_imul24 = false;
- op.intel_vec4 = false;
- op.max_unroll_iterations = 32;
- op.lower_int64_options = (nir_lower_int64_options) (
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
- nir_lower_divmod64 |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
- ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
- nir_lower_ufind_msb64
- );
- op.lower_doubles_options = (nir_lower_doubles_options) (
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
- nir_lower_dmod |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
- ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
- );
- return op;
-}
-
-static const nir_shader_compiler_options g80_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_G80_CHIPSET);
-static const nir_shader_compiler_options gf100_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET);
-static const nir_shader_compiler_options gm107_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET);
-static const nir_shader_compiler_options gv100_nir_shader_compiler_options =
-nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET);
-
-const nir_shader_compiler_options *
-nv50_ir_nir_shader_compiler_options(int chipset)
-{
- if (chipset >= NVISA_GV100_CHIPSET)
- return &gv100_nir_shader_compiler_options;
- if (chipset >= NVISA_GM107_CHIPSET)
- return &gm107_nir_shader_compiler_options;
- if (chipset >= NVISA_GF100_CHIPSET)
- return &gf100_nir_shader_compiler_options;
- return &g80_nir_shader_compiler_options;
-}
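Two lowerings in the nouveau hunk above are worth a standalone illustration: urol/uror are emitted as wrapping funnel shifts (OP_SHF with the SHF_W and SHF_HI/SHF_LO subops), and projective texture coordinates (nir_tex_src_projector) are unprojected with one reciprocal followed by a per-component multiply (applyProjection). The following is a minimal C sketch of both ideas, with hypothetical helper names and no dependency on the converter's IR builders:

#include <stdint.h>

/* Rotating x left by s is a funnel shift of the pair (x:x) by s, keeping the
 * high word; that is roughly what OP_SHF + SHF_L + SHF_W + SHF_HI expresses
 * above (uror keeps the low word instead). */
static uint32_t rotate_left32(uint32_t x, uint32_t s)
{
   s &= 31;
   return s ? (x << s) | (x >> (32 - s)) : x;
}

/* nir_tex_src_projector: divide every coordinate by the projector, done as a
 * single reciprocal plus one multiply per component, mirroring the OP_RCP +
 * OP_MUL sequence applyProjection() emits. */
static void apply_projection(float *coords, unsigned n, float proj)
{
   float inv = 1.0f / proj;
   for (unsigned i = 0; i < n; ++i)
      coords[i] *= inv;
}

Computing the reciprocal once, before the coordinate loop, is the same hoisting the converter does by building proj ahead of the srcs[] copy.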
diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
index ea9d7be0d..47f6c1e53 100644
--- a/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/lib/mesa/src/gallium/drivers/panfrost/pan_assemble.c
@@ -1,6 +1,5 @@
/*
* © Copyright 2018 Alyssa Rosenzweig
- * Copyright (C) 2019-2020 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,29 +25,27 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "pan_bo.h"
#include "pan_context.h"
-#include "pan_shader.h"
-#include "pan_util.h"
#include "compiler/nir/nir.h"
#include "nir/tgsi_to_nir.h"
+#include "midgard/midgard_compile.h"
#include "util/u_dynarray.h"
-#include "util/u_upload_mgr.h"
#include "tgsi/tgsi_dump.h"
void
-panfrost_shader_compile(struct pipe_screen *pscreen,
- struct panfrost_pool *shader_pool,
- struct panfrost_pool *desc_pool,
- enum pipe_shader_ir ir_type,
- const void *ir,
- gl_shader_stage stage,
- struct panfrost_shader_state *state)
+panfrost_shader_compile(
+ struct panfrost_context *ctx,
+ struct mali_shader_meta *meta,
+ enum pipe_shader_ir ir_type,
+ const void *ir,
+ gl_shader_stage stage,
+ struct panfrost_shader_state *state,
+ uint64_t *outputs_written)
{
- struct panfrost_screen *screen = pan_screen(pscreen);
- struct panfrost_device *dev = pan_device(pscreen);
+ struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+ uint8_t *dst;
nir_shader *s;
@@ -56,47 +53,117 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
s = nir_shader_clone(NULL, ir);
} else {
assert (ir_type == PIPE_SHADER_IR_TGSI);
- s = tgsi_to_nir(ir, pscreen, false);
+ s = tgsi_to_nir(ir, ctx->base.screen);
}
- /* Lower this early so the backends don't have to worry about it */
- if (stage == MESA_SHADER_FRAGMENT)
- NIR_PASS_V(s, nir_lower_fragcolor, state->nr_cbufs);
-
s->info.stage = stage;
+ if (stage == MESA_SHADER_FRAGMENT) {
+ /* Inject the alpha test now if we need to */
+
+ if (state->alpha_state.enabled) {
+ NIR_PASS_V(s, nir_lower_alpha_test, state->alpha_state.func, false);
+ }
+ }
+
/* Call out to Midgard compiler given the above NIR */
- struct panfrost_compile_inputs inputs = {
- .gpu_id = dev->gpu_id,
- .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+
+ midgard_program program = {
+ .alpha_ref = state->alpha_state.ref_value
};
- memcpy(inputs.rt_formats, state->rt_formats, sizeof(inputs.rt_formats));
+ midgard_compile_shader_nir(&ctx->compiler, s, &program, false);
+
+ /* Prepare the compiled binary for upload */
+ int size = program.compiled.size;
+ dst = program.compiled.data;
+
+ /* Upload the shader. The lookahead tag is ORed on as a tagged pointer.
+ * I bet someone just thought that would be a cute pun. At least,
+ * that's how I'd do it. */
+
+ state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE);
+ memcpy(state->bo->cpu, dst, size);
+ meta->shader = state->bo->gpu | program.first_tag;
+
+ util_dynarray_fini(&program.compiled);
+
+ /* Sysvals are prepended */
+ program.uniform_count += program.sysval_count;
+ state->sysval_count = program.sysval_count;
+ memcpy(state->sysval, program.sysvals, sizeof(state->sysval[0]) * state->sysval_count);
+
+ meta->midgard1.uniform_count = MIN2(program.uniform_count, program.uniform_cutoff);
+ meta->midgard1.work_count = program.work_register_count;
+
+ switch (stage) {
+ case MESA_SHADER_VERTEX:
+ meta->attribute_count = util_bitcount64(s->info.inputs_read);
+ meta->varying_count = util_bitcount64(s->info.outputs_written);
+ break;
+ case MESA_SHADER_FRAGMENT:
+ meta->attribute_count = 0;
+ meta->varying_count = util_bitcount64(s->info.inputs_read);
+ break;
+ case MESA_SHADER_COMPUTE:
+ /* TODO: images */
+ meta->attribute_count = 0;
+ meta->varying_count = 0;
+ break;
+ default:
+ unreachable("Unknown shader state");
+ }
- struct util_dynarray binary;
+ state->can_discard = s->info.fs.uses_discard;
+ state->writes_point_size = program.writes_point_size;
+ state->reads_point_coord = false;
+ state->helper_invocations = s->info.fs.needs_helper_invocations;
- util_dynarray_init(&binary, NULL);
- screen->vtbl.compile_shader(s, &inputs, &binary, &state->info);
+ if (outputs_written)
+ *outputs_written = s->info.outputs_written;
- if (binary.size) {
- state->bin = panfrost_pool_take_ref(shader_pool,
- pan_pool_upload_aligned(&shader_pool->base,
- binary.data, binary.size, 128));
- }
+ /* Separate as primary uniform count is truncated */
+ state->uniform_count = program.uniform_count;
+ meta->midgard1.unknown2 = 8; /* XXX */
- /* Don't upload RSD for fragment shaders since they need draw-time
- * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler
- * shader program descriptors on Valhall, which can be preuploaded even
- * for fragment shaders. */
- bool upload = !(stage == MESA_SHADER_FRAGMENT && dev->arch <= 7);
- screen->vtbl.prepare_shader(state, desc_pool, upload);
+ unsigned default_vec1_swizzle = panfrost_get_default_swizzle(1);
+ unsigned default_vec2_swizzle = panfrost_get_default_swizzle(2);
+ unsigned default_vec4_swizzle = panfrost_get_default_swizzle(4);
- panfrost_analyze_sysvals(state);
+ /* Iterate the varyings and emit the corresponding descriptor */
+ for (unsigned i = 0; i < meta->varying_count; ++i) {
+ unsigned location = program.varyings[i];
- util_dynarray_fini(&binary);
+ /* Default to a vec4 varying */
+ struct mali_attr_meta v = {
+ .format = MALI_RGBA32F,
+ .swizzle = default_vec4_swizzle,
+ .unknown1 = 0x2,
+ };
- /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
- * a NULL context */
- ralloc_free(s);
+ /* Check for special cases, otherwise assume general varying */
+
+ if (location == VARYING_SLOT_POS) {
+ v.format = MALI_VARYING_POS;
+ } else if (location == VARYING_SLOT_PSIZ) {
+ v.format = MALI_R16F;
+ v.swizzle = default_vec1_swizzle;
+
+ state->writes_point_size = true;
+ } else if (location == VARYING_SLOT_PNTC) {
+ v.format = MALI_RG16F;
+ v.swizzle = default_vec2_swizzle;
+
+ state->reads_point_coord = true;
+ } else if (location == VARYING_SLOT_FACE) {
+ v.format = MALI_R32I;
+ v.swizzle = default_vec1_swizzle;
+
+ state->reads_face = true;
+ }
+
+ state->varyings[i] = v;
+ state->varyings_loc[i] = location;
+ }
}
diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
index 844016ee9..d0b2e1322 100644
--- a/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
+++ b/lib/mesa/src/gallium/drivers/panfrost/pan_compute.c
@@ -1,6 +1,5 @@
/*
* Copyright (C) 2019 Collabora, Ltd.
- * Copyright (C) 2019 Red Hat Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,10 +26,7 @@
*/
#include "pan_context.h"
-#include "pan_bo.h"
-#include "pan_shader.h"
#include "util/u_memory.h"
-#include "nir_serialize.h"
/* Compute CSOs are tracked like graphics shader CSOs, but are
* considerably simpler. We do not implement multiple
@@ -43,38 +39,23 @@ panfrost_create_compute_state(
const struct pipe_compute_state *cso)
{
struct panfrost_context *ctx = pan_context(pctx);
- struct panfrost_screen *screen = pan_screen(pctx->screen);
struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants);
so->cbase = *cso;
so->is_compute = true;
- struct panfrost_shader_state *v = calloc(1, sizeof(*v));
- so->variants = v;
+ struct panfrost_shader_state *v = &so->variants[0];
so->variant_count = 1;
so->active_variant = 0;
- if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
- struct blob_reader reader;
- const struct pipe_binary_program_header *hdr = cso->prog;
+ v->tripipe = malloc(sizeof(struct mali_shader_meta));
- blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
+ panfrost_shader_compile(ctx, v->tripipe,
+ cso->ir_type, cso->prog,
+ MESA_SHADER_COMPUTE, v, NULL);
- const struct nir_shader_compiler_options *options =
- screen->vtbl.get_compiler_options();
- so->cbase.prog = nir_deserialize(NULL, options, &reader);
- so->cbase.ir_type = PIPE_SHADER_IR_NIR;
- }
-
- panfrost_shader_compile(pctx->screen, &ctx->shaders, &ctx->descs,
- so->cbase.ir_type, so->cbase.prog, MESA_SHADER_COMPUTE,
- v);
-
- /* There are no variants so we won't need the NIR again */
- ralloc_free((void *)so->cbase.prog);
- so->cbase.prog = NULL;
return so;
}
@@ -83,57 +64,76 @@ static void
panfrost_bind_compute_state(struct pipe_context *pipe, void *cso)
{
struct panfrost_context *ctx = pan_context(pipe);
- ctx->shader[PIPE_SHADER_COMPUTE] = cso;
+
+ struct panfrost_shader_variants *variants =
+ (struct panfrost_shader_variants *) cso;
+
+ ctx->shader[PIPE_SHADER_COMPUTE] = variants;
}
static void
panfrost_delete_compute_state(struct pipe_context *pipe, void *cso)
{
- struct panfrost_shader_variants *so =
- (struct panfrost_shader_variants *)cso;
-
- free(so->variants);
free(cso);
}
-static void
-panfrost_set_compute_resources(struct pipe_context *pctx,
- unsigned start, unsigned count,
- struct pipe_surface **resources)
-{
- /* TODO */
-}
+/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
+ * construct the COMPUTE job and some of its payload.
+ */
static void
-panfrost_set_global_binding(struct pipe_context *pctx,
- unsigned first, unsigned count,
- struct pipe_resource **resources,
- uint32_t **handles)
+panfrost_launch_grid(struct pipe_context *pipe,
+ const struct pipe_grid_info *info)
{
- if (!resources)
- return;
+ struct panfrost_context *ctx = pan_context(pipe);
- struct panfrost_context *ctx = pan_context(pctx);
- struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+ ctx->compute_grid = info;
- for (unsigned i = first; i < first + count; ++i) {
- struct panfrost_resource *rsrc = pan_resource(resources[i]);
- panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_COMPUTE);
+ struct mali_job_descriptor_header job = {
+ .job_type = JOB_TYPE_COMPUTE,
+ .job_descriptor_size = 1,
+ .job_barrier = 1
+ };
- util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
- 0, rsrc->base.width0);
+ /* TODO: Stub */
+ struct midgard_payload_vertex_tiler *payload = &ctx->payloads[PIPE_SHADER_COMPUTE];
- /* The handle points to uint32_t, but space is allocated for 64 bits */
- memcpy(handles[i], &rsrc->image.data.bo->ptr.gpu, sizeof(mali_ptr));
- }
-}
+ panfrost_emit_for_draw(ctx, false);
-static void
-panfrost_memory_barrier(struct pipe_context *pctx, unsigned flags)
-{
- /* TODO: Be smart and only flush the minimum needed, maybe emitting a
- * cache flush job if that would help */
- panfrost_flush_all_batches(pan_context(pctx), "Memory barrier");
+ /* Compute jobs have a "compute FBD". It's not a real framebuffer
+ * descriptor - there is no framebuffer - but it takes the place of
+ * one. As far as I can tell, it's actually the beginning of a
+ * single-render-target framebuffer descriptor with almost everything
+ * zeroed out.
+ */
+ struct mali_compute_fbd compute_fbd = {
+ .unknown1 = {
+ 0, 0x1F, 0, 0, 0, 0, 0, 0
+ }
+ };
+
+ payload->postfix.framebuffer =
+ panfrost_upload_transient(ctx, &compute_fbd, sizeof(compute_fbd));
+
+ /* Invoke according to the grid info */
+
+ panfrost_pack_work_groups_compute(&payload->prefix,
+ info->grid[0], info->grid[1], info->grid[2],
+ info->block[0], info->block[1], info->block[2]);
+
+ /* Upload the payload */
+
+ struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload));
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
+
+ /* TODO: Do we want a special compute-only batch? */
+ struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+
+ /* Queue the job */
+ panfrost_scoreboard_queue_compute_job(batch, transfer);
+
+ panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME);
}
void
@@ -143,8 +143,7 @@ panfrost_compute_context_init(struct pipe_context *pctx)
pctx->bind_compute_state = panfrost_bind_compute_state;
pctx->delete_compute_state = panfrost_delete_compute_state;
- pctx->set_compute_resources = panfrost_set_compute_resources;
- pctx->set_global_binding = panfrost_set_global_binding;
-
- pctx->memory_barrier = panfrost_memory_barrier;
+ pctx->launch_grid = panfrost_launch_grid;
}
+
+
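panfrost_launch_grid above writes the job header and the vertex/tiler payload back to back into a single transient allocation before queueing the job. A small self-contained sketch of that packing step, using placeholder structs instead of the real Mali descriptors:

#include <stdlib.h>
#include <string.h>

struct fake_job_header  { unsigned job_type, job_descriptor_size, job_barrier; };
struct fake_job_payload { unsigned grid[3], block[3]; };

/* Allocate one staging buffer and copy header + payload contiguously, the
 * same layout memcpy'd into transfer.cpu in the hunk above. */
static void *pack_compute_job(const struct fake_job_header *hdr,
                              const struct fake_job_payload *payload,
                              size_t *out_size)
{
   *out_size = sizeof(*hdr) + sizeof(*payload);
   char *cpu = malloc(*out_size);
   if (!cpu)
      return NULL;
   memcpy(cpu, hdr, sizeof(*hdr));
   memcpy(cpu + sizeof(*hdr), payload, sizeof(*payload));
   return cpu;
}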
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
index d4637ce72..d2c6378a0 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c
@@ -25,502 +25,294 @@
*
**************************************************************************/
+#include <stdio.h>
+
#include "pipe/p_video_codec.h"
-#include "radeon_vcn_enc.h"
-#include "radeon_video.h"
-#include "si_pipe.h"
-#include "util/u_video.h"
-#include "util/u_memory.h"
-#include "radeon_efc.h"
-#include <stdio.h>
+#include "util/u_video.h"
-#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1
-#define RENCODE_FW_INTERFACE_MINOR_VERSION 1
-
-#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001
-#define RENCODE_IB_PARAM_TASK_INFO 0x00000002
-#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
-#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004
-#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005
-#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006
-#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007
-#define RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008
-#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009
-#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a
-#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b
-#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c
-#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d
-#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f
-#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010
-#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011
-#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012
-#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015
-#define RENCODE_IB_PARAM_EFC_CONFIG 0x0000000e
-
-#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001
-#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002
-#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003
-
-#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001
-#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002
-#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003
-#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004
-
-static void radeon_enc_op_balance(struct radeon_encoder *enc)
-{
- RADEON_ENC_BEGIN(RENCODE_IB_OP_SET_BALANCE_ENCODING_MODE);
- RADEON_ENC_END();
-}
+#include "si_pipe.h"
+#include "radeon_video.h"
+#include "radeon_vcn_enc.h"
-static void radeon_enc_slice_header_hevc(struct radeon_encoder *enc)
+#define RENCODE_FW_INTERFACE_MAJOR_VERSION 1
+#define RENCODE_FW_INTERFACE_MINOR_VERSION 1
+
+#define RENCODE_IB_PARAM_SESSION_INFO 0x00000001
+#define RENCODE_IB_PARAM_TASK_INFO 0x00000002
+#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
+#define RENCODE_IB_PARAM_LAYER_CONTROL 0x00000004
+#define RENCODE_IB_PARAM_LAYER_SELECT 0x00000005
+#define RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT 0x00000006
+#define RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT 0x00000007
+#define RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE 0x00000008
+#define RENCODE_IB_PARAM_QUALITY_PARAMS 0x00000009
+#define RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU 0x0000000a
+#define RENCODE_IB_PARAM_SLICE_HEADER 0x0000000b
+#define RENCODE_IB_PARAM_INPUT_FORMAT 0x0000000c
+#define RENCODE_IB_PARAM_OUTPUT_FORMAT 0x0000000d
+#define RENCODE_IB_PARAM_ENCODE_PARAMS 0x0000000f
+#define RENCODE_IB_PARAM_INTRA_REFRESH 0x00000010
+#define RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER 0x00000011
+#define RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER 0x00000012
+#define RENCODE_IB_PARAM_FEEDBACK_BUFFER 0x00000015
+
+#define RENCODE_HEVC_IB_PARAM_SLICE_CONTROL 0x00100001
+#define RENCODE_HEVC_IB_PARAM_SPEC_MISC 0x00100002
+#define RENCODE_HEVC_IB_PARAM_LOOP_FILTER 0x00100003
+
+#define RENCODE_H264_IB_PARAM_SLICE_CONTROL 0x00200001
+#define RENCODE_H264_IB_PARAM_SPEC_MISC 0x00200002
+#define RENCODE_H264_IB_PARAM_ENCODE_PARAMS 0x00200003
+#define RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER 0x00200004
+
+static void radeon_enc_quality_params(struct radeon_encoder *enc)
{
- uint32_t instruction[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
- uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
- unsigned int inst_index = 0;
- unsigned int cdw_start = 0;
- unsigned int cdw_filled = 0;
- unsigned int bits_copied = 0;
- RADEON_ENC_BEGIN(enc->cmd.slice_header);
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
-
- cdw_start = enc->cs.current.cdw;
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.nal_unit_type, 6);
- radeon_enc_code_fixed_bits(enc, 0x0, 6);
- radeon_enc_code_fixed_bits(enc, 0x1, 3);
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_FIRST_SLICE;
- inst_index++;
-
- if ((enc->enc_pic.nal_unit_type >= 16) && (enc->enc_pic.nal_unit_type <= 23))
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_ue(enc, 0x0);
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_SEGMENT;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_DEPENDENT_SLICE_END;
- inst_index++;
-
- switch (enc->enc_pic.picture_type) {
- case PIPE_H2645_ENC_PICTURE_TYPE_I:
- case PIPE_H2645_ENC_PICTURE_TYPE_IDR:
- radeon_enc_code_ue(enc, 0x2);
- break;
- case PIPE_H2645_ENC_PICTURE_TYPE_P:
- case PIPE_H2645_ENC_PICTURE_TYPE_SKIP:
- radeon_enc_code_ue(enc, 0x1);
- break;
- case PIPE_H2645_ENC_PICTURE_TYPE_B:
- radeon_enc_code_ue(enc, 0x0);
- break;
- default:
- radeon_enc_code_ue(enc, 0x1);
- }
-
- if ((enc->enc_pic.nal_unit_type != 19) && (enc->enc_pic.nal_unit_type != 20)) {
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.pic_order_cnt, enc->enc_pic.log2_max_poc);
- if (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P)
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- else {
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- }
- }
-
- if (enc->enc_pic.sample_adaptive_offset_enabled_flag) {
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SAO_ENABLE;
- inst_index++;
- }
-
- if ((enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P) ||
- (enc->enc_pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_B)) {
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.cabac_init_flag, 1);
- radeon_enc_code_ue(enc, 5 - enc->enc_pic.max_num_merge_cand);
- }
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_SLICE_QP_DELTA;
- inst_index++;
-
- if ((enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled) &&
- (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled ||
- enc->enc_pic.sample_adaptive_offset_enabled_flag)) {
- if (enc->enc_pic.sample_adaptive_offset_enabled_flag) {
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
-
- instruction[inst_index] = RENCODE_HEVC_HEADER_INSTRUCTION_LOOP_FILTER_ACROSS_SLICES_ENABLE;
- inst_index++;
- }
- else
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
- }
-
- radeon_enc_flush_headers(enc);
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_COPY;
- num_bits[inst_index] = enc->bits_output - bits_copied;
- bits_copied = enc->bits_output;
- inst_index++;
- instruction[inst_index] = RENCODE_HEADER_INSTRUCTION_END;
-
- cdw_filled = enc->cs.current.cdw - cdw_start;
- for (int i = 0; i < RENCODE_SLICE_HEADER_TEMPLATE_MAX_TEMPLATE_SIZE_IN_DWORDS - cdw_filled; i++)
- RADEON_ENC_CS(0x00000000);
-
- for (int j = 0; j < RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS; j++) {
- RADEON_ENC_CS(instruction[j]);
- RADEON_ENC_CS(num_bits[j]);
- }
-
- RADEON_ENC_END();
+ enc->enc_pic.quality_params.vbaq_mode = 0;
+ enc->enc_pic.quality_params.scene_change_sensitivity = 0;
+ enc->enc_pic.quality_params.scene_change_min_idr_interval = 0;
+ enc->enc_pic.quality_params.two_pass_search_center_map_mode = 0;
+
+ RADEON_ENC_BEGIN(enc->cmd.quality_params);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.vbaq_mode);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_sensitivity);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.scene_change_min_idr_interval);
+ RADEON_ENC_CS(enc->enc_pic.quality_params.two_pass_search_center_map_mode);
+ RADEON_ENC_END();
}
static void radeon_enc_loop_filter_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset);
- RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset);
- RADEON_ENC_CS(!enc->enc_pic.sample_adaptive_offset_enabled_flag);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.deblocking_filter_hevc);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.deblocking_filter_disabled);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.beta_offset_div2);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.tc_offset_div2);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cb_qp_offset);
+ RADEON_ENC_CS(enc->enc_pic.hevc_deblock.cr_qp_offset);
+ RADEON_ENC_CS(1);
+ RADEON_ENC_END();
}
static void radeon_enc_nalu_sps_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.nalu);
- RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS);
- uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++];
- int i;
-
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
- radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
- radeon_enc_code_fixed_bits(enc, 0x4201, 16);
- radeon_enc_byte_align(enc);
- radeon_enc_set_emulation_prevention(enc, true);
- radeon_enc_code_fixed_bits(enc, 0x0, 4);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5);
-
- if (enc->enc_pic.general_profile_idc == 2)
- radeon_enc_code_fixed_bits(enc, 0x20000000, 32);
- else
- radeon_enc_code_fixed_bits(enc, 0x60000000, 32);
-
- radeon_enc_code_fixed_bits(enc, 0xb0000000, 32);
- radeon_enc_code_fixed_bits(enc, 0x0, 16);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8);
-
- for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i++)
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
-
- if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) {
- for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++)
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- }
-
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height);
-
- if ((enc->enc_pic.crop_left != 0) || (enc->enc_pic.crop_right != 0) ||
- (enc->enc_pic.crop_top != 0) || (enc->enc_pic.crop_bottom != 0)) {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_left);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_right);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_top);
- radeon_enc_code_ue(enc, enc->enc_pic.crop_bottom);
- } else if (enc->enc_pic.session_init.padding_width != 0 ||
- enc->enc_pic.session_init.padding_height != 0) {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_width / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2);
- radeon_enc_code_ue(enc, enc->enc_pic.session_init.padding_height / 2);
- } else
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8);
- radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3);
- // Only support CTBSize 64
- radeon_enc_code_ue(enc,
- 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3));
- radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size);
- radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter);
- radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1);
-
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 1);
- radeon_enc_code_ue(enc, 0);
- radeon_enc_code_ue(enc, 0);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
-
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_byte_align(enc);
- radeon_enc_flush_headers(enc);
- *size_in_bytes = (enc->bits_output + 7) / 8;
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.nalu);
+ RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_SPS);
+ uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++];
+ int i;
+
+ radeon_enc_reset(enc);
+ radeon_enc_set_emulation_prevention(enc, false);
+ radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
+ radeon_enc_code_fixed_bits(enc, 0x4201, 16);
+ radeon_enc_byte_align(enc);
+ radeon_enc_set_emulation_prevention(enc, true);
+ radeon_enc_code_fixed_bits(enc, 0x0, 4);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1, 3);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_tier_flag, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_profile_idc, 5);
+ radeon_enc_code_fixed_bits(enc, 0x60000000, 32);
+ radeon_enc_code_fixed_bits(enc, 0xb0000000, 32);
+ radeon_enc_code_fixed_bits(enc, 0x0, 16);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.general_level_idc, 8);
+
+ for (i = 0; i < (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) ; i++)
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+
+ if ((enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1) > 0) {
+ for (i = (enc->enc_pic.layer_ctrl.max_num_temporal_layers - 1); i < 8; i++)
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ }
+
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, enc->enc_pic.chroma_format_idc);
+ radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_width);
+ radeon_enc_code_ue(enc, enc->enc_pic.session_init.aligned_picture_height);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_luma_minus8);
+ radeon_enc_code_ue(enc, enc->enc_pic.bit_depth_chroma_minus8);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_max_poc - 4);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3);
+ //Only support CTBSize 64
+ radeon_enc_code_ue(enc, 6 - (enc->enc_pic.hevc_spec_misc.log2_min_luma_coding_block_size_minus3 + 3));
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_min_transform_block_size_minus2);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_diff_max_min_transform_block_size);
+ radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_inter);
+ radeon_enc_code_ue(enc, enc->enc_pic.max_transform_hierarchy_depth_intra);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, !enc->enc_pic.hevc_spec_misc.amp_disabled, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.sample_adaptive_offset_enabled_flag, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.pcm_enabled_flag, 1);
+
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 1);
+ radeon_enc_code_ue(enc, 0);
+ radeon_enc_code_ue(enc, 0);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.strong_intra_smoothing_enabled, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_byte_align(enc);
+ radeon_enc_flush_headers(enc);
+ *size_in_bytes = (enc->bits_output + 7) / 8;
+ RADEON_ENC_END();
}
static void radeon_enc_nalu_pps_hevc(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.nalu);
- RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS);
- uint32_t *size_in_bytes = &enc->cs.current.buf[enc->cs.current.cdw++];
- radeon_enc_reset(enc);
- radeon_enc_set_emulation_prevention(enc, false);
- radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
- radeon_enc_code_fixed_bits(enc, 0x4401, 16);
- radeon_enc_byte_align(enc);
- radeon_enc_set_emulation_prevention(enc, true);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 4);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_ue(enc, 0x0);
- radeon_enc_code_se(enc, 0x0);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- if (enc->enc_pic.rc_session_init.rate_control_method == RENCODE_RATE_CONTROL_METHOD_NONE)
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- else {
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_ue(enc, 0x0);
- }
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset);
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1);
-
- if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) {
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2);
- radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2);
- }
-
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_fixed_bits(enc, 0x0, 1);
- radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2);
- radeon_enc_code_fixed_bits(enc, 0x0, 2);
-
- radeon_enc_code_fixed_bits(enc, 0x1, 1);
-
- radeon_enc_byte_align(enc);
- radeon_enc_flush_headers(enc);
- *size_in_bytes = (enc->bits_output + 7) / 8;
- RADEON_ENC_END();
-}
-
-static void radeon_enc_session_init(struct radeon_encoder *enc)
-{
- enc->enc_pic.session_init.encode_standard = RENCODE_ENCODE_STANDARD_H264;
- enc->enc_pic.session_init.aligned_picture_width = align(enc->base.width, 16);
- enc->enc_pic.session_init.aligned_picture_height = align(enc->base.height, 16);
- enc->enc_pic.session_init.padding_width = enc->enc_pic.session_init.aligned_picture_width - enc->base.width;
- enc->enc_pic.session_init.padding_height = enc->enc_pic.session_init.aligned_picture_height - enc->base.height;
- enc->enc_pic.session_init.pre_encode_mode = RENCODE_PREENCODE_MODE_NONE;
- enc->enc_pic.session_init.pre_encode_chroma_enabled = FALSE;
-
- RADEON_ENC_BEGIN(enc->cmd.session_init);
- RADEON_ENC_CS(enc->enc_pic.session_init.encode_standard);
- RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_width);
- RADEON_ENC_CS(enc->enc_pic.session_init.aligned_picture_height);
- RADEON_ENC_CS(enc->enc_pic.session_init.padding_width);
- RADEON_ENC_CS(enc->enc_pic.session_init.padding_height);
- RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_mode);
- RADEON_ENC_CS(enc->enc_pic.session_init.pre_encode_chroma_enabled);
- RADEON_ENC_END();
-}
-
-static void radeon_enc_efc_config(struct radeon_encoder *enc)
-{
- if (enc->efc == NULL) {
- enc->efc = CALLOC_STRUCT(rvid_buffer);
- int buffer_size = 46 * 1024;
- if (!si_vid_create_buffer(enc->screen, enc->efc, buffer_size, PIPE_USAGE_DYNAMIC)) {
- RVID_ERR("Can't create EFC conversion table buffer.\n");
- FREE(enc->efc);
- return;
- }
-
- uint32_t *ptr = enc->ws->buffer_map(enc->ws, enc->efc->res->buf, &enc->cs, PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
- memcpy(ptr, Yuv_st2084_rec2020_st2084_rec2020_2000, 46817);
- enc->ws->buffer_unmap(enc->ws, enc->efc->res->buf);
- }
-
- enc->enc_pic.efc_params.coef_buffer_size = 46817;
- enc->enc_pic.efc_params.cm_program_register_data_size = 1728;
-
- assert(enc->efc);
-
- RADEON_ENC_BEGIN(enc->cmd.efc_params);
- RADEON_ENC_WRITE(enc->efc->res->buf, enc->efc->res->domains, 0x0);
- RADEON_ENC_CS(enc->enc_pic.efc_params.coef_buffer_size);
- RADEON_ENC_CS(enc->enc_pic.efc_params.cm_program_register_data_size);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.nalu);
+ RADEON_ENC_CS(RENCODE_DIRECT_OUTPUT_NALU_TYPE_PPS);
+ uint32_t *size_in_bytes = &enc->cs->current.buf[enc->cs->current.cdw++];
+ radeon_enc_reset(enc);
+ radeon_enc_set_emulation_prevention(enc, false);
+ radeon_enc_code_fixed_bits(enc, 0x00000001, 32);
+ radeon_enc_code_fixed_bits(enc, 0x4401, 16);
+ radeon_enc_byte_align(enc);
+ radeon_enc_set_emulation_prevention(enc, true);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 4);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_ue(enc, 0x0);
+ radeon_enc_code_se(enc, 0x0);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.loop_filter_across_slices_enabled, 1);
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_deblock.deblocking_filter_disabled, 1);
+
+ if (!enc->enc_pic.hevc_deblock.deblocking_filter_disabled) {
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.beta_offset_div2);
+ radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.tc_offset_div2);
+ }
+
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_fixed_bits(enc, 0x0, 1);
+ radeon_enc_code_ue(enc, enc->enc_pic.log2_parallel_merge_level_minus2);
+ radeon_enc_code_fixed_bits(enc, 0x0, 2);
+
+ radeon_enc_code_fixed_bits(enc, 0x1, 1);
+
+ radeon_enc_byte_align(enc);
+ radeon_enc_flush_headers(enc);
+ *size_in_bytes = (enc->bits_output + 7) / 8;
+ RADEON_ENC_END();
}
static void radeon_enc_input_format(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.input_format);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_volume);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_space);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_range);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_subsampling);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_chroma_location);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_bit_depth);
- RADEON_ENC_CS(enc->enc_pic.input_format.input_color_packing_format);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.input_format);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_END();
}
static void radeon_enc_output_format(struct radeon_encoder *enc)
{
- RADEON_ENC_BEGIN(enc->cmd.output_format);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_volume);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_range);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_chroma_location);
- RADEON_ENC_CS(enc->enc_pic.output_format.output_color_bit_depth);
- RADEON_ENC_END();
+ RADEON_ENC_BEGIN(enc->cmd.output_format);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_CS(0);
+ RADEON_ENC_END();
}
static void encode(struct radeon_encoder *enc)
{
- enc->before_encode(enc);
- enc->session_info(enc);
- enc->total_task_size = 0;
- enc->task_info(enc, enc->need_feedback);
- enc->efc_params(enc);
- enc->encode_headers(enc);
- enc->ctx(enc);
- enc->bitstream(enc);
- enc->feedback(enc);
- enc->intra_refresh(enc);
- enc->input_format(enc);
- enc->output_format(enc);
-
- enc->op_preset(enc);
- enc->op_enc(enc);
- *enc->p_task_size = (enc->total_task_size);
+ enc->session_info(enc);
+ enc->total_task_size = 0;
+ enc->task_info(enc, enc->need_feedback);
+
+ enc->encode_headers(enc);
+ enc->ctx(enc);
+ enc->bitstream(enc);
+ enc->feedback(enc);
+ enc->intra_refresh(enc);
+ enc->input_format(enc);
+ enc->output_format(enc);
+
+ enc->op_speed(enc);
+ enc->op_enc(enc);
+ *enc->p_task_size = (enc->total_task_size);
}
void radeon_enc_2_0_init(struct radeon_encoder *enc)
{
- radeon_enc_1_2_init(enc);
- enc->encode = encode;
- enc->input_format = radeon_enc_input_format;
- enc->output_format = radeon_enc_output_format;
- enc->efc_params = radeon_enc_efc_config;
-
- if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
- enc->session_init = radeon_enc_session_init;
- }
- if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
- enc->deblocking_filter = radeon_enc_loop_filter_hevc;
- enc->nalu_sps = radeon_enc_nalu_sps_hevc;
- enc->nalu_pps = radeon_enc_nalu_pps_hevc;
- enc->slice_header = radeon_enc_slice_header_hevc;
- enc->op_preset = radeon_enc_op_balance;
- }
-
- enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO;
- enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO;
- enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT;
- enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL;
- enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT;
- enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT;
- enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT;
- enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE;
- enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS;
- enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU;
- enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER;
- enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT;
- enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT;
- enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS;
- enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH;
- enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER;
- enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER;
- enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER;
- enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL;
- enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC;
- enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER;
- enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL;
- enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC;
- enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS;
- enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER;
- enc->cmd.efc_params = RENCODE_IB_PARAM_EFC_CONFIG;
-
- enc->enc_pic.session_info.interface_version =
- ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) |
- (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT));
+ radeon_enc_1_2_init(enc);
+ enc->encode = encode;
+ enc->quality_params = radeon_enc_quality_params;
+ enc->input_format = radeon_enc_input_format;
+ enc->output_format = radeon_enc_output_format;
+
+ if (u_reduce_video_profile(enc->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
+ enc->deblocking_filter = radeon_enc_loop_filter_hevc;
+ enc->nalu_sps = radeon_enc_nalu_sps_hevc;
+ enc->nalu_pps = radeon_enc_nalu_pps_hevc;
+ }
+
+ enc->cmd.session_info = RENCODE_IB_PARAM_SESSION_INFO;
+ enc->cmd.task_info = RENCODE_IB_PARAM_TASK_INFO;
+ enc->cmd.session_init = RENCODE_IB_PARAM_SESSION_INIT;
+ enc->cmd.layer_control = RENCODE_IB_PARAM_LAYER_CONTROL;
+ enc->cmd.layer_select = RENCODE_IB_PARAM_LAYER_SELECT;
+ enc->cmd.rc_session_init = RENCODE_IB_PARAM_RATE_CONTROL_SESSION_INIT;
+ enc->cmd.rc_layer_init = RENCODE_IB_PARAM_RATE_CONTROL_LAYER_INIT;
+ enc->cmd.rc_per_pic = RENCODE_IB_PARAM_RATE_CONTROL_PER_PICTURE;
+ enc->cmd.quality_params = RENCODE_IB_PARAM_QUALITY_PARAMS;
+ enc->cmd.nalu = RENCODE_IB_PARAM_DIRECT_OUTPUT_NALU;
+ enc->cmd.slice_header = RENCODE_IB_PARAM_SLICE_HEADER;
+ enc->cmd.input_format = RENCODE_IB_PARAM_INPUT_FORMAT;
+ enc->cmd.output_format = RENCODE_IB_PARAM_OUTPUT_FORMAT;
+ enc->cmd.enc_params = RENCODE_IB_PARAM_ENCODE_PARAMS;
+ enc->cmd.intra_refresh = RENCODE_IB_PARAM_INTRA_REFRESH;
+ enc->cmd.ctx = RENCODE_IB_PARAM_ENCODE_CONTEXT_BUFFER;
+ enc->cmd.bitstream = RENCODE_IB_PARAM_VIDEO_BITSTREAM_BUFFER;
+ enc->cmd.feedback = RENCODE_IB_PARAM_FEEDBACK_BUFFER;
+ enc->cmd.slice_control_hevc = RENCODE_HEVC_IB_PARAM_SLICE_CONTROL;
+ enc->cmd.spec_misc_hevc = RENCODE_HEVC_IB_PARAM_SPEC_MISC;
+ enc->cmd.deblocking_filter_hevc = RENCODE_HEVC_IB_PARAM_LOOP_FILTER;
+ enc->cmd.slice_control_h264 = RENCODE_H264_IB_PARAM_SLICE_CONTROL;
+ enc->cmd.spec_misc_h264 = RENCODE_H264_IB_PARAM_SPEC_MISC;
+ enc->cmd.enc_params_h264 = RENCODE_H264_IB_PARAM_ENCODE_PARAMS;
+ enc->cmd.deblocking_filter_h264 = RENCODE_H264_IB_PARAM_DEBLOCKING_FILTER;
+
+ enc->enc_pic.session_info.interface_version =
+ ((RENCODE_FW_INTERFACE_MAJOR_VERSION << RENCODE_IF_MAJOR_VERSION_SHIFT) |
+ (RENCODE_FW_INTERFACE_MINOR_VERSION << RENCODE_IF_MINOR_VERSION_SHIFT));
}
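The HEVC SPS/PPS writers above build their syntax elements almost entirely out of radeon_enc_code_ue and radeon_enc_code_se, i.e. Exp-Golomb coded fields. For readers checking those headers by hand, here is a short, driver-independent reference encoder for ue(v) and se(v); it emits the bits as a string instead of going through the encoder's bit writer:

#include <stdint.h>
#include <stdio.h>

/* ue(v): write floor(log2(value+1)) zero bits, then value+1 in binary. */
static unsigned exp_golomb_ue(uint32_t value, char *out)
{
   uint32_t v = value + 1;
   unsigned bits = 0;
   while (bits < 31 && (1u << (bits + 1)) <= v)
      bits++;                          /* bits = floor(log2(v)) */
   unsigned len = 2 * bits + 1;
   for (unsigned i = 0; i < len; ++i)
      out[i] = ((v >> (len - 1 - i)) & 1) ? '1' : '0';
   out[len] = '\0';
   return len;
}

/* se(v): map the signed value to an unsigned code number first:
 * k > 0 -> 2k - 1, k <= 0 -> -2k, then encode it as ue(v). */
static unsigned exp_golomb_se(int32_t value, char *out)
{
   uint32_t mapped = value > 0 ? 2u * (uint32_t)value - 1
                               : (uint32_t)(-2 * (int64_t)value);
   return exp_golomb_ue(mapped, out);
}

int main(void)
{
   char buf[80];
   exp_golomb_ue(3, buf);
   printf("ue(3)  = %s\n", buf);   /* 00100 */
   exp_golomb_se(-2, buf);
   printf("se(-2) = %s\n", buf);   /* maps to ue(4) -> 00101 */
   return 0;
}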