author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:34:57 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:34:57 +0000
commit    | 32aeb3c41fedbbd7b11aacfec48e8f699d16bff0 (patch)
tree      | fc5893a490729ebf6b87b83eebf5d4ebfdfccf27 /lib/mesa/src/intel/vulkan
parent    | 286ec9d289bada8abb84753c461cfa3432866e98 (diff)
Import Mesa 23.1.9
Diffstat (limited to 'lib/mesa/src/intel/vulkan')
47 files changed, 5966 insertions, 386 deletions
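Editor's note before the raw diff: the first hunk in anv_android.c drops the old trick of hiding a STATIC_ASSERT inside a never-called function and replaces it with a file-scope C11 static_assert. The following is a minimal standalone sketch of the two forms, not code from the commit; the magic values are placeholders (the real definitions come from the Android hwvulkan and Vulkan ICD loader headers), and a GCC/Clang toolchain is assumed for the unused attribute.

    #include <assert.h>   /* C11: static_assert is a macro for _Static_assert */

    /* Placeholder values for illustration only; the real constants are
     * defined in hardware/hwvulkan.h and vk_icd.h and must be equal for
     * the HAL dispatch trick to work. */
    #define HWVULKAN_DISPATCH_MAGIC 0x01CDC0DE
    #define ICD_LOADER_MAGIC        0x01CDC0DE

    /* Old pattern: wrap the assertion in an unused function so that it is
     * accepted at file scope even without C11 file-scope assertions. */
    static void __attribute__((unused))
    static_asserts(void)
    {
       _Static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, "");
    }

    /* New pattern, as in the hunk below: assert directly at file scope. */
    static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, "");

    int main(void) { return 0; }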
diff --git a/lib/mesa/src/intel/vulkan/anv_android.c b/lib/mesa/src/intel/vulkan/anv_android.c index 8a17f0a24..6e98763dd 100644 --- a/lib/mesa/src/intel/vulkan/anv_android.c +++ b/lib/mesa/src/intel/vulkan/anv_android.c @@ -34,17 +34,14 @@ #include <sync/sync.h> #include "anv_private.h" +#include "vk_android.h" #include "vk_common_entrypoints.h" #include "vk_util.h" static int anv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev); static int anv_hal_close(struct hw_device_t *dev); -static void UNUSED -static_asserts(void) -{ - STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC); -} +static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, ""); PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { .common = { @@ -142,8 +139,8 @@ vk_format_from_android(unsigned android_format, unsigned android_usage) } } -static inline unsigned -android_format_from_vk(unsigned vk_format) +unsigned +anv_ahb_format_for_vk_format(VkFormat vk_format) { switch (vk_format) { case VK_FORMAT_R8G8B8A8_UNORM: @@ -167,12 +164,6 @@ android_format_from_vk(unsigned vk_format) } } -static VkFormatFeatureFlags -features2_to_features(VkFormatFeatureFlags2 features2) -{ - return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; -} - static VkResult get_ahw_buffer_format_properties2( VkDevice device_h, @@ -201,9 +192,9 @@ get_ahw_buffer_format_properties2( VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; p->format = vk_format_from_android(desc.format, desc.usage); + p->externalFormat = p->format; const struct anv_format *anv_format = anv_get_format(p->format); - p->externalFormat = (uint64_t) (uintptr_t) anv_format; /* Default to OPTIMAL tiling but set to linear in case * of AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER usage. @@ -214,7 +205,7 @@ get_ahw_buffer_format_properties2( tiling = VK_IMAGE_TILING_LINEAR; p->formatFeatures = - anv_get_image_format_features2(device->info, p->format, anv_format, + anv_get_image_format_features2(device->physical, p->format, anv_format, tiling, NULL); /* "Images can be created with an external format even if the Android hardware @@ -274,7 +265,7 @@ anv_GetAndroidHardwareBufferPropertiesANDROID( format_prop->format = format_prop2.format; format_prop->externalFormat = format_prop2.externalFormat; format_prop->formatFeatures = - features2_to_features(format_prop2.formatFeatures); + vk_format_features2_to_features(format_prop2.formatFeatures); format_prop->samplerYcbcrConversionComponents = format_prop2.samplerYcbcrConversionComponents; format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel; @@ -309,81 +300,21 @@ anv_GetAndroidHardwareBufferPropertiesANDROID( return VK_SUCCESS; } -VkResult -anv_GetMemoryAndroidHardwareBufferANDROID( - VkDevice device_h, - const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo, - struct AHardwareBuffer **pBuffer) -{ - ANV_FROM_HANDLE(anv_device_memory, mem, pInfo->memory); - - /* Some quotes from Vulkan spec: - * - * "If the device memory was created by importing an Android hardware - * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same - * Android hardware buffer object." - * - * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must - * have been included in VkExportMemoryAllocateInfo::handleTypes when - * memory was created." - */ - if (mem->ahw) { - *pBuffer = mem->ahw; - /* Increase refcount. 
*/ - AHardwareBuffer_acquire(mem->ahw); - return VK_SUCCESS; - } - - return VK_ERROR_OUT_OF_HOST_MEMORY; -} - -#endif - -/* Construct ahw usage mask from image usage bits, see - * 'AHardwareBuffer Usage Equivalence' in Vulkan spec. - */ -uint64_t -anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, - const VkImageUsageFlags vk_usage) -{ - uint64_t ahw_usage = 0; -#if ANDROID_API_LEVEL >= 26 - if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) - ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; - - if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) - ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; - - if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) - ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT; - - if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) - ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP; - - if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT) - ahw_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT; - - /* No usage bits set - set at least one GPU usage. */ - if (ahw_usage == 0) - ahw_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; #endif - return ahw_usage; -} /* * Called from anv_AllocateMemory when import AHardwareBuffer. */ VkResult anv_import_ahw_memory(VkDevice device_h, - struct anv_device_memory *mem, - const VkImportAndroidHardwareBufferInfoANDROID *info) + struct anv_device_memory *mem) { #if ANDROID_API_LEVEL >= 26 ANV_FROM_HANDLE(anv_device, device, device_h); /* Import from AHardwareBuffer to anv_device_memory. */ const native_handle_t *handle = - AHardwareBuffer_getNativeHandle(info->buffer); + AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); /* NOTE - We support buffers with only one handle but do not error on * multiple handle case. Reason is that we want to support YUV formats @@ -399,14 +330,6 @@ anv_import_ahw_memory(VkDevice device_h, &mem->bo); assert(result == VK_SUCCESS); - /* "If the vkAllocateMemory command succeeds, the implementation must - * acquire a reference to the imported hardware buffer, which it must - * release when the device memory object is freed. If the command fails, - * the implementation must not retain a reference." - */ - AHardwareBuffer_acquire(info->buffer); - mem->ahw = info->buffer; - return VK_SUCCESS; #else return VK_ERROR_EXTENSION_NOT_PRESENT; @@ -414,70 +337,6 @@ anv_import_ahw_memory(VkDevice device_h, } VkResult -anv_create_ahw_memory(VkDevice device_h, - struct anv_device_memory *mem, - const VkMemoryAllocateInfo *pAllocateInfo) -{ -#if ANDROID_API_LEVEL >= 26 - const VkMemoryDedicatedAllocateInfo *dedicated_info = - vk_find_struct_const(pAllocateInfo->pNext, - MEMORY_DEDICATED_ALLOCATE_INFO); - - uint32_t w = 0; - uint32_t h = 1; - uint32_t layers = 1; - uint32_t format = 0; - uint64_t usage = 0; - - /* If caller passed dedicated information. 
*/ - if (dedicated_info && dedicated_info->image) { - ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); - w = image->vk.extent.width; - h = image->vk.extent.height; - layers = image->vk.array_layers; - format = android_format_from_vk(image->vk.format); - usage = anv_ahw_usage_from_vk_usage(image->vk.create_flags, image->vk.usage); - } else if (dedicated_info && dedicated_info->buffer) { - ANV_FROM_HANDLE(anv_buffer, buffer, dedicated_info->buffer); - w = buffer->vk.size; - format = AHARDWAREBUFFER_FORMAT_BLOB; - usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | - AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; - } else { - w = pAllocateInfo->allocationSize; - format = AHARDWAREBUFFER_FORMAT_BLOB; - usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | - AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; - } - - struct AHardwareBuffer *ahw = NULL; - struct AHardwareBuffer_Desc desc = { - .width = w, - .height = h, - .layers = layers, - .format = format, - .usage = usage, - }; - - if (AHardwareBuffer_allocate(&desc, &ahw) != 0) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - const VkImportAndroidHardwareBufferInfoANDROID import_info = { - .buffer = ahw, - }; - VkResult result = anv_import_ahw_memory(device_h, mem, &import_info); - - /* Release a reference to avoid leak for AHB allocation. */ - AHardwareBuffer_release(ahw); - - return result; -#else - return VK_ERROR_EXTENSION_NOT_PRESENT; -#endif - -} - -VkResult anv_image_init_from_gralloc(struct anv_device *device, struct anv_image *image, const VkImageCreateInfo *base_info, @@ -536,6 +395,8 @@ anv_image_init_from_gralloc(struct anv_device *device, base_info->tiling); assert(format != ISL_FORMAT_UNSUPPORTED); + anv_info.stride = gralloc_info->stride * (isl_format_get_layout(format)->bpb / 8); + result = anv_image_init(device, image, &anv_info); if (result != VK_SUCCESS) goto fail_init; @@ -548,8 +409,8 @@ anv_image_init_from_gralloc(struct anv_device *device, &mem_reqs); VkDeviceSize aligned_image_size = - align_u64(mem_reqs.memoryRequirements.size, - mem_reqs.memoryRequirements.alignment); + align64(mem_reqs.memoryRequirements.size, + mem_reqs.memoryRequirements.alignment); if (bo->size < aligned_image_size) { result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, diff --git a/lib/mesa/src/intel/vulkan/anv_android.h b/lib/mesa/src/intel/vulkan/anv_android.h index 4490d3b24..e1f099e1f 100644 --- a/lib/mesa/src/intel/vulkan/anv_android.h +++ b/lib/mesa/src/intel/vulkan/anv_android.h @@ -44,14 +44,12 @@ VkResult anv_image_bind_from_gralloc(struct anv_device *device, struct anv_image *image, const VkNativeBufferANDROID *gralloc_info); -uint64_t anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, - const VkImageUsageFlags vk_usage); +unsigned anv_ahb_format_for_vk_format(VkFormat vk_format); VkResult anv_import_ahw_memory(VkDevice device_h, - struct anv_device_memory *mem, - const VkImportAndroidHardwareBufferInfoANDROID *info); + struct anv_device_memory *mem); VkResult anv_create_ahw_memory(VkDevice device_h, struct anv_device_memory *mem, - const VkMemoryAllocateInfo *pAllocateInfo); + const VkMemoryDedicatedAllocateInfo *dedicated_info); #endif /* ANV_ANDROID_H */ diff --git a/lib/mesa/src/intel/vulkan/anv_android_stubs.c b/lib/mesa/src/intel/vulkan/anv_android_stubs.c index d5bc11949..4e8c05f57 100644 --- a/lib/mesa/src/intel/vulkan/anv_android_stubs.c +++ b/lib/mesa/src/intel/vulkan/anv_android_stubs.c @@ -39,17 +39,9 @@ VkResult anv_image_bind_from_gralloc(struct anv_device *device, return VK_ERROR_EXTENSION_NOT_PRESENT; } -uint64_t 
-anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, - const VkImageUsageFlags vk_usage) -{ - return 0; -} - VkResult anv_import_ahw_memory(VkDevice device_h, - struct anv_device_memory *mem, - const VkImportAndroidHardwareBufferInfoANDROID *info) + struct anv_device_memory *mem) { return VK_ERROR_EXTENSION_NOT_PRESENT; } @@ -57,7 +49,7 @@ anv_import_ahw_memory(VkDevice device_h, VkResult anv_create_ahw_memory(VkDevice device_h, struct anv_device_memory *mem, - const VkMemoryAllocateInfo *pAllocateInfo) + const VkMemoryDedicatedAllocateInfo *dedicated_info) { return VK_ERROR_EXTENSION_NOT_PRESENT; } diff --git a/lib/mesa/src/intel/vulkan/anv_bo_sync.c b/lib/mesa/src/intel/vulkan/anv_bo_sync.c index 149ae2c2b..c48d52d28 100644 --- a/lib/mesa/src/intel/vulkan/anv_bo_sync.c +++ b/lib/mesa/src/intel/vulkan/anv_bo_sync.c @@ -24,6 +24,7 @@ #include "anv_private.h" #include "util/os_time.h" +#include "util/perf/cpu_trace.h" static struct anv_bo_sync * to_anv_bo_sync(struct vk_sync *sync) @@ -105,6 +106,7 @@ anv_bo_sync_wait(struct vk_device *vk_device, { struct anv_device *device = container_of(vk_device, struct anv_device, vk); VkResult result; + MESA_TRACE_FUNC(); uint32_t pending = wait_count; while (pending) { diff --git a/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.c b/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.c new file mode 100644 index 000000000..003dbc88c --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.c @@ -0,0 +1,352 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "compiler/brw_compiler.h" +#include "compiler/brw_nir.h" +#include "compiler/spirv/nir_spirv.h" +#include "dev/intel_debug.h" +#include "util/macros.h" + +#include "anv_generated_indirect_draws.h" + +#include "shaders/gfx9_generated_draws_spv.h" +#include "shaders/gfx11_generated_draws_spv.h" + +/* This pass takes vulkan descriptor bindings 0 & 1 and turns them into global + * 64bit addresses. Binding 2 is left UBO that would normally be accessed + * through the binding table but it fully promoted to push constants. + * + * As a result we're not using the binding table at all which is nice because + * of the side command buffer we use for the generating shader does not + * interact with the binding table allocation. 
+ */ +static bool +lower_vulkan_descriptors_instr(nir_builder *b, nir_instr *instr, void *cb_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + nir_instr *res_index_instr = intrin->src[0].ssa->parent_instr; + assert(res_index_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *res_index_intrin = + nir_instr_as_intrinsic(res_index_instr); + assert(res_index_intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + b->cursor = nir_after_instr(instr); + + nir_ssa_def *desc_value = NULL; + switch (nir_intrinsic_binding(res_index_intrin)) { + case 0: { + desc_value = + nir_load_ubo(b, 1, 64, + nir_imm_int(b, 2), + nir_imm_int(b, + offsetof(struct anv_generated_indirect_params, + indirect_data_addr)), + .align_mul = 8, + .align_offset = 0, + .range_base = 0, + .range = ~0); + desc_value = + nir_vec4(b, + nir_unpack_64_2x32_split_x(b, desc_value), + nir_unpack_64_2x32_split_y(b, desc_value), + nir_imm_int(b, 0), + nir_imm_int(b, 0)); + break; + } + + case 1: { + desc_value = + nir_load_ubo(b, 1, 64, + nir_imm_int(b, 2), + nir_imm_int(b, + offsetof(struct anv_generated_indirect_params, + generated_cmds_addr)), + .align_mul = 8, + .align_offset = 0, + .range_base = 0, + .range = ~0); + desc_value = + nir_vec4(b, + nir_unpack_64_2x32_split_x(b, desc_value), + nir_unpack_64_2x32_split_y(b, desc_value), + nir_imm_int(b, 0), + nir_imm_int(b, 0)); + break; + } + + case 2: { + desc_value = + nir_load_ubo(b, 1, 64, + nir_imm_int(b, 2), + nir_imm_int(b, + offsetof(struct anv_generated_indirect_params, + draw_ids_addr)), + .align_mul = 8, + .align_offset = 0, + .range_base = 0, + .range = ~0); + desc_value = + nir_vec4(b, + nir_unpack_64_2x32_split_x(b, desc_value), + nir_unpack_64_2x32_split_y(b, desc_value), + nir_imm_int(b, 0), + nir_imm_int(b, 0)); + break; + } + + case 3: + desc_value = + nir_vec2(b, + nir_imm_int(b, 2), + nir_imm_int(b, 0)); + break; + } + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc_value); + + return true; +} + +static bool +lower_vulkan_descriptors(nir_shader *shader) +{ + return nir_shader_instructions_pass(shader, + lower_vulkan_descriptors_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +static struct anv_shader_bin * +compile_upload_spirv(struct anv_device *device, + const void *key, + uint32_t key_size, + const uint32_t *spirv_source, + uint32_t spirv_source_size, + uint32_t sends_count_expectation) +{ + struct spirv_to_nir_options spirv_options = { + .caps = { + .int64 = true, + }, + .ubo_addr_format = nir_address_format_32bit_index_offset, + .ssbo_addr_format = nir_address_format_64bit_global_32bit_offset, + .environment = NIR_SPIRV_VULKAN, + .create_library = false, + }; + const nir_shader_compiler_options *nir_options = + device->physical->compiler->nir_options[MESA_SHADER_FRAGMENT]; + + nir_shader* nir = + spirv_to_nir(spirv_source, spirv_source_size, + NULL, 0, MESA_SHADER_FRAGMENT, "main", + &spirv_options, nir_options); + + assert(nir != NULL); + + nir->info.internal = true; + + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); + + /* Pick off the single entrypoint that we want */ + 
nir_remove_non_entrypoints(nir); + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_dce); + NIR_PASS_V(nir, nir_opt_cse); + NIR_PASS_V(nir, nir_opt_gcm, true); + NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false); + NIR_PASS_V(nir, nir_opt_dce); + + NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); + + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + struct brw_compiler *compiler = device->physical->compiler; + struct brw_nir_compiler_opts opts = {}; + brw_preprocess_nir(compiler, nir, &opts); + + NIR_PASS_V(nir, nir_propagate_invariant, false); + + NIR_PASS_V(nir, nir_lower_input_attachments, + &(nir_input_attachment_options) { + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + }); + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + /* Do vectorizing here. For some reason when trying to do it in the back + * this just isn't working. + */ + nir_load_store_vectorize_options options = { + .modes = nir_var_mem_ubo | nir_var_mem_ssbo, + .callback = brw_nir_should_vectorize_mem, + .robust_modes = (nir_variable_mode)0, + }; + NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options); + + NIR_PASS_V(nir, lower_vulkan_descriptors); + NIR_PASS_V(nir, nir_opt_dce); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, + nir_address_format_32bit_index_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, + nir_address_format_64bit_global_32bit_offset); + + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, nir_opt_dce); + + struct brw_wm_prog_key wm_key; + memset(&wm_key, 0, sizeof(wm_key)); + + struct brw_wm_prog_data wm_prog_data = { + .base.nr_params = nir->num_uniforms / 4, + }; + + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, wm_prog_data.base.ubo_ranges); + + struct brw_compile_stats stats[3]; + struct brw_compile_fs_params params = { + .nir = nir, + .key = &wm_key, + .prog_data = &wm_prog_data, + .stats = stats, + .log_data = device, + .debug_flag = DEBUG_WM, + }; + const unsigned *program = brw_compile_fs(compiler, nir, ¶ms); + + unsigned stat_idx = 0; + if (wm_prog_data.dispatch_8) { + assert(stats[stat_idx].spills == 0); + assert(stats[stat_idx].fills == 0); + assert(stats[stat_idx].sends == sends_count_expectation); + stat_idx++; + } + if (wm_prog_data.dispatch_16) { + assert(stats[stat_idx].spills == 0); + assert(stats[stat_idx].fills == 0); + assert(stats[stat_idx].sends == sends_count_expectation); + stat_idx++; + } + if (wm_prog_data.dispatch_32) { + assert(stats[stat_idx].spills == 0); + assert(stats[stat_idx].fills == 0); + assert(stats[stat_idx].sends == sends_count_expectation * 2); + stat_idx++; + } + + struct anv_pipeline_bind_map bind_map; + memset(&bind_map, 0, sizeof(bind_map)); + + struct anv_push_descriptor_info push_desc_info = {}; + + struct anv_shader_bin *kernel = + anv_device_upload_kernel(device, + device->internal_cache, + nir->info.stage, + key, key_size, program, + wm_prog_data.base.program_size, + &wm_prog_data.base, sizeof(wm_prog_data), + NULL, 0, NULL, &bind_map, + &push_desc_info); + + ralloc_free(nir); + + return kernel; +} + +VkResult +anv_device_init_generated_indirect_draws(struct anv_device *device) +{ + const struct intel_l3_weights w = + intel_get_default_l3_weights(device->info, + true /* wants_dc_cache */, + false /* needs_slm */); + device->generated_draw_l3_config = intel_get_l3_config(device->info, w); + + struct { + char name[40]; + } 
indirect_draws_key = { + .name = "anv-generated-indirect-draws", + }; + + device->generated_draw_kernel = + anv_device_search_for_kernel(device, + device->internal_cache, + &indirect_draws_key, + sizeof(indirect_draws_key), + NULL); + if (device->generated_draw_kernel == NULL) { + const uint32_t *spirv_source = + device->info->ver >= 11 ? + gfx11_generated_draws_spv_source : + gfx9_generated_draws_spv_source; + const uint32_t spirv_source_size = + device->info->ver >= 11 ? + ARRAY_SIZE(gfx11_generated_draws_spv_source) : + ARRAY_SIZE(gfx9_generated_draws_spv_source); + const uint32_t send_count = + device->info->ver >= 11 ? + 11 /* 2 * (2 loads + 3 stores) + 1 store */ : + 17 /* 2 * (2 loads + 6 stores) + 1 store */; + + device->generated_draw_kernel = + compile_upload_spirv(device, + &indirect_draws_key, + sizeof(indirect_draws_key), + spirv_source, spirv_source_size, send_count); + } + if (device->generated_draw_kernel == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->generated_draw_kernel); + + return VK_SUCCESS; +} + +void +anv_device_finish_generated_indirect_draws(struct anv_device *device) +{ +} diff --git a/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.h b/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.h new file mode 100644 index 000000000..e8ab8553a --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_generated_indirect_draws.h @@ -0,0 +1,87 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef ANV_GENERATED_INDIRECT_DRAWS_H +#define ANV_GENERATED_INDIRECT_DRAWS_H + +#include <stdint.h> + +#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0) +#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1) +#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2) +#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3) + +/* This needs to match common_generated_draws.glsl : + * + * layout(set = 0, binding = 2) uniform block + */ +struct anv_generated_indirect_draw_params { + /* Draw ID buffer address (only used on Gfx9) */ + uint64_t draw_id_addr; + /* Indirect data buffer address (only used on Gfx9) */ + uint64_t indirect_data_addr; + /* Stride between each elements of the indirect data buffer */ + uint32_t indirect_data_stride; + uint32_t flags; /* 0-7: bits, 8-15: mocs, 16-23: cmd_dws */ + /* Base number of the draw ID, it is added to the index computed from the + * gl_FragCoord + */ + uint32_t draw_base; + + /* Number of draws to generate */ + uint32_t draw_count; + + /* Maximum number of draws (equals to draw_count for indirect draws without + * an indirect count) + */ + uint32_t max_draw_count; + + /* Instance multiplier for multi view */ + uint32_t instance_multiplier; + + /* Address where to jump at after the generated draw (only used with + * indirect draw count variants) + */ + uint64_t end_addr; +}; + +struct anv_generated_indirect_params { + struct anv_generated_indirect_draw_params draw; + + /* Global address of binding 0 */ + uint64_t indirect_data_addr; + + /* Global address of binding 1 */ + uint64_t generated_cmds_addr; + + /* Global address of binding 2 */ + uint64_t draw_ids_addr; + + /* CPU side pointer to the previous item when number of draws has to be + * split into smaller chunks, see while loop in + * genX(cmd_buffer_emit_indirect_generated_draws) + */ + struct anv_generated_indirect_params *prev; +}; + +#endif /* ANV_GENERATED_INDIRECT_DRAWS_H */ diff --git a/lib/mesa/src/intel/vulkan/anv_kmd_backend.c b/lib/mesa/src/intel/vulkan/anv_kmd_backend.c new file mode 100644 index 000000000..8ce882bba --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_kmd_backend.c @@ -0,0 +1,42 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <stdlib.h> + +#include "anv_kmd_backend.h" +#include "anv_private.h" + +const struct anv_kmd_backend * +anv_kmd_backend_get(enum intel_kmd_type type) +{ + switch (type) { + case INTEL_KMD_TYPE_I915: + return anv_i915_kmd_backend_get(); + case INTEL_KMD_TYPE_XE: + return anv_xe_kmd_backend_get(); + case INTEL_KMD_TYPE_STUB: + return anv_stub_kmd_backend_get(); + default: + return NULL; + } +} diff --git a/lib/mesa/src/intel/vulkan/anv_kmd_backend.h b/lib/mesa/src/intel/vulkan/anv_kmd_backend.h new file mode 100644 index 000000000..76c5f2f27 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_kmd_backend.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include <stdint.h> + +#include "vulkan/vulkan_core.h" +#include "vk_sync.h" + +#include "dev/intel_device_info.h" +#include "dev/intel_kmd.h" + +struct anv_bo; +enum anv_bo_alloc_flags; +struct anv_cmd_buffer; +struct anv_device; +struct anv_queue; +struct anv_query_pool; +struct anv_utrace_submit; + +struct anv_kmd_backend { + /* + * Create a gem buffer. + * Return the gem handle in case of success otherwise returns 0. 
+ */ + uint32_t (*gem_create)(struct anv_device *device, + const struct intel_memory_class_instance **regions, + uint16_t num_regions, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t *actual_size); + void (*gem_close)(struct anv_device *device, uint32_t handle); + /* Returns MAP_FAILED on error */ + void *(*gem_mmap)(struct anv_device *device, struct anv_bo *bo, + uint64_t offset, uint64_t size, + VkMemoryPropertyFlags property_flags); + int (*gem_vm_bind)(struct anv_device *device, struct anv_bo *bo); + int (*gem_vm_unbind)(struct anv_device *device, struct anv_bo *bo); + VkResult (*execute_simple_batch)(struct anv_queue *queue, + struct anv_bo *batch_bo, + uint32_t batch_bo_size); + VkResult (*queue_exec_locked)(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass); + VkResult (*queue_exec_trace)(struct anv_queue *queue, + struct anv_utrace_submit *submit); +}; + +const struct anv_kmd_backend *anv_kmd_backend_get(enum intel_kmd_type type); + +/* Internal functions, should only be called by anv_kmd_backend_get() */ +const struct anv_kmd_backend *anv_i915_kmd_backend_get(void); +const struct anv_kmd_backend *anv_xe_kmd_backend_get(void); +const struct anv_kmd_backend *anv_stub_kmd_backend_get(void); diff --git a/lib/mesa/src/intel/vulkan/anv_mesh_perprim_wa.c b/lib/mesa/src/intel/vulkan/anv_mesh_perprim_wa.c new file mode 100644 index 000000000..f7346b6dc --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_mesh_perprim_wa.c @@ -0,0 +1,557 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "nir_builder.h" + +/* + * Wa_14015590813 for gfx 12.5. + * + * This file implements workaround for HW bug, which leads to fragment shader + * reading incorrect per-primitive data if mesh shader, in addition to writing + * per-primitive data, also writes to gl_ClipDistance. 
+ * + * The suggested solution to that bug is to not use per-primitive data by: + * - creating new vertices for provoking vertices shared by multiple primitives + * - converting per-primitive attributes read by fragment shader to flat + * per-vertex attributes for the provoking vertex + * - modifying fragment shader to read those per-vertex attributes + * + * There are at least 2 type of failures not handled very well: + * - if the number of varying slots overflows, than only some attributes will + * be converted, leading to corruption of those unconverted attributes + * - if the overall MUE size is so large it doesn't fit in URB, then URB + * allocation will fail in some way; unfortunately there's no good way to + * say how big MUE will be at this moment and back out + * + * This workaround needs to be applied before linking, so that unused outputs + * created by this code are removed at link time. + * + * This workaround can be controlled by a driconf option to either disable it, + * lower its scope or force enable it. + * + * Option "anv_mesh_conv_prim_attrs_to_vert_attrs" is evaluated like this: + * value == 0 - disable workaround + * value < 0 - enable ONLY if workaround is required + * value > 0 - enable ALWAYS, even if it's not required + * abs(value) >= 1 - attribute conversion + * abs(value) >= 2 - attribute conversion and vertex duplication + * + * Default: -2 (both parts of the work around, ONLY if it's required) + * + */ + +static bool +anv_mesh_convert_attrs_prim_to_vert(struct nir_shader *nir, + gl_varying_slot *wa_mapping, + uint64_t fs_inputs, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + void *mem_ctx, + const bool dup_vertices, + const bool force_conversion) +{ + uint64_t per_primitive_outputs = nir->info.per_primitive_outputs; + per_primitive_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES); + + if (per_primitive_outputs == 0) + return false; + + uint64_t outputs_written = nir->info.outputs_written; + uint64_t other_outputs = outputs_written & ~per_primitive_outputs; + + if ((other_outputs & (VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1)) == 0) + if (!force_conversion) + return false; + + uint64_t all_outputs = outputs_written; + unsigned attrs = 0; + + uint64_t remapped_outputs = outputs_written & per_primitive_outputs; + remapped_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE); + + /* Skip locations not read by the fragment shader, because they will + * be eliminated at linking time. Note that some fs inputs may be + * removed only after optimizations, so it's possible that we will + * create too many variables. + */ + remapped_outputs &= fs_inputs; + + /* Figure out the mapping between per-primitive and new per-vertex outputs. */ + nir_foreach_shader_out_variable(var, nir) { + int location = var->data.location; + + if (!(BITFIELD64_BIT(location) & remapped_outputs)) + continue; + + /* Although primitive shading rate, layer and viewport have predefined + * place in MUE Primitive Header (so we can't really move them anywhere), + * we have to copy them to per-vertex space if fragment shader reads them. 
+ */ + assert(location == VARYING_SLOT_PRIMITIVE_SHADING_RATE || + location == VARYING_SLOT_LAYER || + location == VARYING_SLOT_VIEWPORT || + location == VARYING_SLOT_PRIMITIVE_ID || + location >= VARYING_SLOT_VAR0); + + const struct glsl_type *type = var->type; + if (nir_is_arrayed_io(var, MESA_SHADER_MESH) || var->data.per_view) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned num_slots = glsl_count_attribute_slots(type, false); + + for (gl_varying_slot slot = VARYING_SLOT_VAR0; slot <= VARYING_SLOT_VAR31; slot++) { + uint64_t mask = BITFIELD64_MASK(num_slots) << slot; + if ((all_outputs & mask) == 0) { + wa_mapping[location] = slot; + all_outputs |= mask; + attrs++; + break; + } + } + + if (wa_mapping[location] == 0) { + fprintf(stderr, "Not enough space for hardware per-primitive data corruption work around.\n"); + break; + } + } + + if (attrs == 0) + if (!force_conversion) + return false; + + unsigned provoking_vertex = 0; + + const VkPipelineRasterizationStateCreateInfo *rs_info = pCreateInfo->pRasterizationState; + const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info = + vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); + if (rs_pv_info && rs_pv_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) + provoking_vertex = 2; + + unsigned vertices_per_primitive = + num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_after_cf_list(&impl->body); + + /* wait for all subgroups to finish */ + nir_scoped_barrier(&b, NIR_SCOPE_WORKGROUP); + + nir_ssa_def *zero = nir_imm_int(&b, 0); + + nir_ssa_def *local_invocation_index = nir_build_load_local_invocation_index(&b); + + nir_ssa_def *cmp = nir_ieq(&b, local_invocation_index, zero); + nir_if *if_stmt = nir_push_if(&b, cmp); + { + nir_variable *primitive_count_var = NULL; + nir_variable *primitive_indices_var = NULL; + + unsigned num_other_variables = 0; + nir_foreach_shader_out_variable(var, b.shader) { + if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0) + continue; + num_other_variables++; + } + + nir_deref_instr **per_vertex_derefs = + ralloc_array(mem_ctx, nir_deref_instr *, num_other_variables); + + unsigned num_per_vertex_variables = 0; + + unsigned processed = 0; + nir_foreach_shader_out_variable(var, b.shader) { + if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0) + continue; + + switch (var->data.location) { + case VARYING_SLOT_PRIMITIVE_COUNT: + primitive_count_var = var; + break; + case VARYING_SLOT_PRIMITIVE_INDICES: + primitive_indices_var = var; + break; + default: { + const struct glsl_type *type = var->type; + assert(glsl_type_is_array(type)); + const struct glsl_type *array_element_type = + glsl_get_array_element(type); + + if (dup_vertices) { + /* + * Resize type of array output to make space for one extra + * vertex attribute for each primitive, so we ensure that + * the provoking vertex is not shared between primitives. 
+ */ + const struct glsl_type *new_type = + glsl_array_type(array_element_type, + glsl_get_length(type) + + nir->info.mesh.max_primitives_out, + 0); + + var->type = new_type; + } + + per_vertex_derefs[num_per_vertex_variables++] = + nir_build_deref_var(&b, var); + break; + } + } + + ++processed; + } + assert(processed == num_other_variables); + + assert(primitive_count_var != NULL); + assert(primitive_indices_var != NULL); + + /* Update types of derefs to match type of variables they (de)reference. */ + if (dup_vertices) { + nir_foreach_function(function, b.shader) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (deref->deref_type != nir_deref_type_var) + continue; + + if (deref->var->type != deref->type) + deref->type = deref->var->type; + } + } + } + } + + /* indexed by slot of per-prim attribute */ + struct { + nir_deref_instr *per_prim_deref; + nir_deref_instr *per_vert_deref; + } mapping[VARYING_SLOT_MAX] = {{NULL, NULL}, }; + + /* Create new per-vertex output variables mirroring per-primitive variables + * and create derefs for both old and new variables. + */ + nir_foreach_shader_out_variable(var, b.shader) { + gl_varying_slot location = var->data.location; + + if ((BITFIELD64_BIT(location) & (outputs_written & per_primitive_outputs)) == 0) + continue; + if (wa_mapping[location] == 0) + continue; + + const struct glsl_type *type = var->type; + assert(glsl_type_is_array(type)); + const struct glsl_type *array_element_type = glsl_get_array_element(type); + + const struct glsl_type *new_type = + glsl_array_type(array_element_type, + nir->info.mesh.max_vertices_out + + (dup_vertices ? 
nir->info.mesh.max_primitives_out : 0), + 0); + + nir_variable *new_var = + nir_variable_create(b.shader, nir_var_shader_out, new_type, var->name); + assert(wa_mapping[location] >= VARYING_SLOT_VAR0); + assert(wa_mapping[location] <= VARYING_SLOT_VAR31); + new_var->data.location = wa_mapping[location]; + new_var->data.interpolation = INTERP_MODE_FLAT; + + mapping[location].per_vert_deref = nir_build_deref_var(&b, new_var); + mapping[location].per_prim_deref = nir_build_deref_var(&b, var); + } + + nir_ssa_def *trueconst = nir_imm_true(&b); + + /* + * for each Primitive (0 : primitiveCount) + * if VertexUsed[PrimitiveIndices[Primitive][provoking vertex]] + * create 1 new vertex at offset "Vertex" + * copy per vert attributes of provoking vertex to the new one + * update PrimitiveIndices[Primitive][provoking vertex] + * Vertex++ + * else + * VertexUsed[PrimitiveIndices[Primitive][provoking vertex]] := true + * + * for each attribute : mapping + * copy per_prim_attr(Primitive) to per_vert_attr[Primitive][provoking vertex] + */ + + /* primitive count */ + nir_ssa_def *primitive_count = nir_load_var(&b, primitive_count_var); + + /* primitive index */ + nir_variable *primitive_var = + nir_local_variable_create(impl, glsl_uint_type(), "Primitive"); + nir_deref_instr *primitive_deref = nir_build_deref_var(&b, primitive_var); + nir_store_deref(&b, primitive_deref, zero, 1); + + /* vertex index */ + nir_variable *vertex_var = + nir_local_variable_create(impl, glsl_uint_type(), "Vertex"); + nir_deref_instr *vertex_deref = nir_build_deref_var(&b, vertex_var); + nir_store_deref(&b, vertex_deref, nir_imm_int(&b, nir->info.mesh.max_vertices_out), 1); + + /* used vertices bitvector */ + const struct glsl_type *used_vertex_type = + glsl_array_type(glsl_bool_type(), + nir->info.mesh.max_vertices_out, + 0); + nir_variable *used_vertex_var = + nir_local_variable_create(impl, used_vertex_type, "VertexUsed"); + nir_deref_instr *used_vertex_deref = + nir_build_deref_var(&b, used_vertex_var); + /* Initialize it as "not used" */ + for (unsigned i = 0; i < nir->info.mesh.max_vertices_out; ++i) { + nir_deref_instr *indexed_used_vertex_deref = + nir_build_deref_array(&b, used_vertex_deref, nir_imm_int(&b, i)); + nir_store_deref(&b, indexed_used_vertex_deref, nir_imm_false(&b), 1); + } + + nir_loop *loop = nir_push_loop(&b); + { + nir_ssa_def *primitive = nir_load_deref(&b, primitive_deref); + nir_ssa_def *cmp = nir_ige(&b, primitive, primitive_count); + + nir_if *loop_check = nir_push_if(&b, cmp); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, loop_check); + + nir_deref_instr *primitive_indices_deref = + nir_build_deref_var(&b, primitive_indices_var); + nir_deref_instr *indexed_primitive_indices_deref; + nir_ssa_def *src_vertex; + nir_ssa_def *prim_indices; + + if (nir->info.mesh.nv) { + /* flat array, but we can deref each index directly */ + nir_ssa_def *index_index = + nir_imul(&b, primitive, nir_imm_int(&b, vertices_per_primitive)); + index_index = nir_iadd(&b, index_index, nir_imm_int(&b, provoking_vertex)); + indexed_primitive_indices_deref = nir_build_deref_array(&b, primitive_indices_deref, index_index); + src_vertex = nir_load_deref(&b, indexed_primitive_indices_deref); + prim_indices = NULL; + } else { + /* array of vectors, we have to extract index out of array deref */ + indexed_primitive_indices_deref = nir_build_deref_array(&b, primitive_indices_deref, primitive); + prim_indices = nir_load_deref(&b, indexed_primitive_indices_deref); + src_vertex = nir_channel(&b, prim_indices, provoking_vertex); + } + 
+ nir_ssa_def *dst_vertex = nir_load_deref(&b, vertex_deref); + + nir_deref_instr *indexed_used_vertex_deref = + nir_build_deref_array(&b, used_vertex_deref, src_vertex); + nir_ssa_def *used_vertex = nir_load_deref(&b, indexed_used_vertex_deref); + if (!dup_vertices) + used_vertex = nir_imm_false(&b); + + nir_if *vertex_used_check = nir_push_if(&b, used_vertex); + { + for (unsigned a = 0; a < num_per_vertex_variables; ++a) { + nir_deref_instr *attr_arr = per_vertex_derefs[a]; + nir_deref_instr *src = nir_build_deref_array(&b, attr_arr, src_vertex); + nir_deref_instr *dst = nir_build_deref_array(&b, attr_arr, dst_vertex); + + nir_copy_deref(&b, dst, src); + } + + if (nir->info.mesh.nv) { + nir_store_deref(&b, indexed_primitive_indices_deref, dst_vertex, 1); + } else { + /* replace one component of primitive indices vector */ + nir_ssa_def *new_val = + nir_vector_insert_imm(&b, prim_indices, dst_vertex, provoking_vertex); + + /* and store complete vector */ + nir_store_deref(&b, indexed_primitive_indices_deref, new_val, + BITFIELD_MASK(vertices_per_primitive)); + } + + nir_store_deref(&b, vertex_deref, nir_iadd_imm(&b, dst_vertex, 1), 1); + + for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) { + if (!mapping[i].per_vert_deref) + continue; + + nir_deref_instr *src = + nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive); + nir_deref_instr *dst = + nir_build_deref_array(&b, mapping[i].per_vert_deref, dst_vertex); + + nir_copy_deref(&b, dst, src); + } + } + nir_push_else(&b, vertex_used_check); + { + nir_store_deref(&b, indexed_used_vertex_deref, trueconst, 1); + + for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) { + if (!mapping[i].per_vert_deref) + continue; + + nir_deref_instr *src = + nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive); + nir_deref_instr *dst = + nir_build_deref_array(&b, mapping[i].per_vert_deref, src_vertex); + + nir_copy_deref(&b, dst, src); + } + + } + nir_pop_if(&b, vertex_used_check); + + nir_store_deref(&b, primitive_deref, nir_iadd_imm(&b, primitive, 1), 1); + } + nir_pop_loop(&b, loop); + } + nir_pop_if(&b, if_stmt); /* local_invocation_index == 0 */ + + if (dup_vertices) + nir->info.mesh.max_vertices_out += nir->info.mesh.max_primitives_out; + + if (should_print_nir(nir)) { + printf("%s\n", __func__); + nir_print_shader(nir, stdout); + } + + /* deal with copy_derefs */ + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_lower_var_copies); + + nir_shader_gather_info(nir, impl); + + return true; +} + +static bool +anv_frag_update_derefs_instr(struct nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_deref) + return false; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (deref->deref_type != nir_deref_type_var) + return false; + + nir_variable *var = deref->var; + if (!(var->data.mode & nir_var_shader_in)) + return false; + + int location = var->data.location; + nir_deref_instr **new_derefs = (nir_deref_instr **)data; + if (new_derefs[location] == NULL) + return false; + + assert(deref->dest.is_ssa); + assert(new_derefs[location]->dest.is_ssa); + + nir_instr_remove(&deref->instr); + nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_derefs[location]->dest.ssa); + + return true; +} + +static bool +anv_frag_update_derefs(nir_shader *shader, nir_deref_instr **mapping) +{ + return nir_shader_instructions_pass(shader, anv_frag_update_derefs_instr, + nir_metadata_none, (void *)mapping); +} + +/* Update fragment shader inputs with new ones. 
*/ +static void +anv_frag_convert_attrs_prim_to_vert(struct nir_shader *nir, + gl_varying_slot *wa_mapping) +{ + /* indexed by slot of per-prim attribute */ + nir_deref_instr *new_derefs[VARYING_SLOT_MAX] = {NULL, }; + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_before_cf_list(&impl->body); + + nir_foreach_shader_in_variable_safe(var, nir) { + gl_varying_slot location = var->data.location; + gl_varying_slot new_location = wa_mapping[location]; + if (new_location == 0) + continue; + + assert(wa_mapping[new_location] == 0); + + nir_variable *new_var = + nir_variable_create(b.shader, nir_var_shader_in, var->type, var->name); + new_var->data.location = new_location; + new_var->data.location_frac = var->data.location_frac; + new_var->data.interpolation = INTERP_MODE_FLAT; + + new_derefs[location] = nir_build_deref_var(&b, new_var); + } + + NIR_PASS(_, nir, anv_frag_update_derefs, new_derefs); + + nir_shader_gather_info(nir, impl); +} + +void +anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir, + struct nir_shader *fs_nir, + struct anv_device *device, + const VkGraphicsPipelineCreateInfo *info) +{ + const struct intel_device_info *devinfo = device->info; + + int mesh_conv_prim_attrs_to_vert_attrs = + device->physical->instance->mesh_conv_prim_attrs_to_vert_attrs; + if (mesh_conv_prim_attrs_to_vert_attrs < 0 && + !intel_needs_workaround(devinfo, 14015590813)) + mesh_conv_prim_attrs_to_vert_attrs = 0; + + if (mesh_conv_prim_attrs_to_vert_attrs != 0) { + uint64_t fs_inputs = 0; + nir_foreach_shader_in_variable(var, fs_nir) + fs_inputs |= BITFIELD64_BIT(var->data.location); + + void *stage_ctx = ralloc_context(NULL); + + gl_varying_slot wa_mapping[VARYING_SLOT_MAX] = { 0, }; + + const bool dup_vertices = abs(mesh_conv_prim_attrs_to_vert_attrs) >= 2; + const bool force_conversion = mesh_conv_prim_attrs_to_vert_attrs > 0; + + if (anv_mesh_convert_attrs_prim_to_vert(ms_nir, wa_mapping, + fs_inputs, info, stage_ctx, + dup_vertices, force_conversion)) + anv_frag_convert_attrs_prim_to_vert(fs_nir, wa_mapping); + + ralloc_free(stage_ctx); + } +} diff --git a/lib/mesa/src/intel/vulkan/anv_nir_compute_push_layout.c b/lib/mesa/src/intel/vulkan/anv_nir_compute_push_layout.c index 22478e7e3..1d4b8009e 100644 --- a/lib/mesa/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/lib/mesa/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -67,12 +67,13 @@ anv_nir_compute_push_layout(nir_shader *nir, break; } - case nir_intrinsic_load_desc_set_address_intel: - push_start = MIN2(push_start, - offsetof(struct anv_push_constants, desc_sets)); - push_end = MAX2(push_end, push_start + + case nir_intrinsic_load_desc_set_address_intel: { + unsigned base = offsetof(struct anv_push_constants, desc_sets); + push_start = MIN2(push_start, base); + push_end = MAX2(push_end, base + sizeof_field(struct anv_push_constants, desc_sets)); break; + } default: break; @@ -117,7 +118,7 @@ anv_nir_compute_push_layout(nir_shader *nir, * push_end (no push constants is indicated by push_start = UINT_MAX). */ push_start = MIN2(push_start, push_end); - push_start = align_down_u32(push_start, 32); + push_start = ROUND_DOWN_TO(push_start, 32); /* For vec4 our push data size needs to be aligned to a vec4 and for * scalar, it needs to be aligned to a DWORD. 
diff --git a/lib/mesa/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/lib/mesa/src/intel/vulkan/anv_nir_lower_ubo_loads.c index 5a170352c..f1609a22c 100644 --- a/lib/mesa/src/intel/vulkan/anv_nir_lower_ubo_loads.c +++ b/lib/mesa/src/intel/vulkan/anv_nir_lower_ubo_loads.c @@ -47,7 +47,7 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data) unsigned byte_size = bit_size / 8; nir_ssa_def *val; - if (nir_src_is_const(load->src[1])) { + if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) { uint32_t offset = nir_src_as_uint(load->src[1]); /* Things should be component-aligned. */ diff --git a/lib/mesa/src/intel/vulkan/anv_perf.c b/lib/mesa/src/intel/vulkan/anv_perf.c index 49cbef52a..3b23067ab 100644 --- a/lib/mesa/src/intel/vulkan/anv_perf.c +++ b/lib/mesa/src/intel/vulkan/anv_perf.c @@ -109,7 +109,10 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id) properties[p++] = metric_id; properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT; - properties[p++] = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + properties[p++] = + device->info->verx10 >= 125 ? + I915_OA_FORMAT_A24u40_A14u32_B8_C8 : + I915_OA_FORMAT_A32u40_A4u32_B8_C8; properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT; properties[p++] = 31; /* slowest sampling period */ @@ -363,7 +366,10 @@ VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) { desc->flags = 0; /* None so far. */ - snprintf(desc->name, sizeof(desc->name), "%s", intel_counter->name); + snprintf(desc->name, sizeof(desc->name), "%s", + INTEL_DEBUG(DEBUG_PERF_SYMBOL_NAMES) ? + intel_counter->symbol_name : + intel_counter->name); snprintf(desc->category, sizeof(desc->category), "%s", intel_counter->category); snprintf(desc->description, sizeof(desc->description), "%s", intel_counter->desc); } @@ -430,10 +436,12 @@ anv_perf_write_pass_results(struct intel_perf_config *perf, const struct intel_perf_query_result *accumulated_results, union VkPerformanceCounterResultKHR *results) { + const struct intel_perf_query_info *query = pool->pass_query[pass]; + for (uint32_t c = 0; c < pool->n_counters; c++) { const struct intel_perf_counter_pass *counter_pass = &pool->counter_pass[c]; - if (counter_pass->pass != pass) + if (counter_pass->query != query) continue; switch (pool->pass_query[pass]->kind) { diff --git a/lib/mesa/src/intel/vulkan/anv_utrace.c b/lib/mesa/src/intel/vulkan/anv_utrace.c index 3a35aefe4..99dfc50d4 100644 --- a/lib/mesa/src/intel/vulkan/anv_utrace.c +++ b/lib/mesa/src/intel/vulkan/anv_utrace.c @@ -23,15 +23,19 @@ #include "anv_private.h" +#include "ds/intel_tracepoints.h" +#include "genxml/gen8_pack.h" #include "perf/intel_perf.h" +#include "vulkan/runtime/vk_common_entrypoints.h" + static uint32_t command_buffers_count_utraces(struct anv_device *device, uint32_t cmd_buffer_count, struct anv_cmd_buffer **cmd_buffers, uint32_t *utrace_copies) { - if (!u_trace_context_actively_tracing(&device->ds.trace_context)) + if (!u_trace_should_process(&device->ds.trace_context)) return 0; uint32_t utraces = 0; @@ -47,25 +51,25 @@ command_buffers_count_utraces(struct anv_device *device, } static void -anv_utrace_delete_flush_data(struct u_trace_context *utctx, - void *flush_data) +anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data) { struct anv_device *device = container_of(utctx, struct anv_device, ds.trace_context); - struct anv_utrace_flush_copy *flush = flush_data; + struct anv_utrace_submit *submit = 
submit_data; + + intel_ds_flush_data_fini(&submit->ds); - intel_ds_flush_data_fini(&flush->ds); + if (submit->trace_bo) + anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo); - if (flush->trace_bo) { - assert(flush->batch_bo); - anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); - anv_device_release_bo(device, flush->batch_bo); - anv_device_release_bo(device, flush->trace_bo); + if (submit->batch_bo) { + anv_reloc_list_finish(&submit->relocs, &device->vk.alloc); + anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); } - vk_sync_destroy(&device->vk, flush->sync); + vk_sync_destroy(&device->vk, submit->sync); - vk_free(&device->vk.alloc, flush); + vk_free(&device->vk.alloc, submit); } static void @@ -77,13 +81,13 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx, { struct anv_device *device = container_of(utctx, struct anv_device, ds.trace_context); - struct anv_utrace_flush_copy *flush = cmdstream; + struct anv_utrace_submit *submit = cmdstream; struct anv_address from_addr = (struct anv_address) { .bo = ts_from, .offset = from_offset * sizeof(uint64_t) }; struct anv_address to_addr = (struct anv_address) { .bo = ts_to, .offset = to_offset * sizeof(uint64_t) }; - anv_genX(device->info, emit_so_memcpy)(&flush->memcpy_state, + anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state, to_addr, from_addr, count * sizeof(uint64_t)); } @@ -91,7 +95,7 @@ VkResult anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, uint32_t cmd_buffer_count, struct anv_cmd_buffer **cmd_buffers, - struct anv_utrace_flush_copy **out_flush_data) + struct anv_utrace_submit **out_submit) { struct anv_device *device = queue->device; uint32_t utrace_copies = 0; @@ -100,94 +104,105 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, cmd_buffers, &utrace_copies); if (!utraces) { - *out_flush_data = NULL; + *out_submit = NULL; return VK_SUCCESS; } VkResult result; - struct anv_utrace_flush_copy *flush = - vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_flush_copy), + struct anv_utrace_submit *submit = + vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!flush) + if (!submit) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - intel_ds_flush_data_init(&flush->ds, &queue->ds, queue->ds.submission_id); + intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id); result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type, - 0, 0, &flush->sync); + 0, 0, &submit->sync); if (result != VK_SUCCESS) goto error_sync; if (utrace_copies > 0) { result = anv_bo_pool_alloc(&device->utrace_bo_pool, utrace_copies * 4096, - &flush->trace_bo); + &submit->trace_bo); if (result != VK_SUCCESS) goto error_trace_buf; + uint32_t batch_size = 512; /* 128 dwords of setup */ + if (device->info->verx10 == 120 || intel_device_info_is_dg2(device->info)) { + /* Enable/Disable preemption at the begin/end */ + batch_size += 2 * (250 /* 250 MI_NOOPs*/ + + 6 /* PIPE_CONTROL */ + + 3 /* MI_LRI */) * 4 /* dwords */; + } + batch_size += 256 * utrace_copies; /* 64 dwords per copy */ + batch_size = align(batch_size + 4, 8); /* MI_BATCH_BUFFER_END */ + result = anv_bo_pool_alloc(&device->utrace_bo_pool, - /* 128 dwords of setup + 64 dwords per copy */ - align_u32(512 + 64 * utrace_copies, 4096), - &flush->batch_bo); + align(batch_size, 4096), + &submit->batch_bo); if (result != VK_SUCCESS) goto error_batch_buf; - result = anv_reloc_list_init(&flush->relocs, &device->vk.alloc); + result = 
anv_reloc_list_init(&submit->relocs, &device->vk.alloc); if (result != VK_SUCCESS) goto error_reloc_list; - flush->batch.alloc = &device->vk.alloc; - flush->batch.relocs = &flush->relocs; - anv_batch_set_storage(&flush->batch, - (struct anv_address) { .bo = flush->batch_bo, }, - flush->batch_bo->map, flush->batch_bo->size); + submit->batch.alloc = &device->vk.alloc; + submit->batch.relocs = &submit->relocs; + anv_batch_set_storage(&submit->batch, + (struct anv_address) { .bo = submit->batch_bo, }, + submit->batch_bo->map, submit->batch_bo->size); /* Emit the copies */ - anv_genX(device->info, emit_so_memcpy_init)(&flush->memcpy_state, - device, - &flush->batch); + anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state, + device, + &submit->batch); for (uint32_t i = 0; i < cmd_buffer_count; i++) { if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { - u_trace_flush(&cmd_buffers[i]->trace, flush, false); + u_trace_flush(&cmd_buffers[i]->trace, submit, false); } else { u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace), u_trace_end_iterator(&cmd_buffers[i]->trace), - &flush->ds.trace, - flush, + &submit->ds.trace, + submit, anv_device_utrace_emit_copy_ts_buffer); } } - anv_genX(device->info, emit_so_memcpy_fini)(&flush->memcpy_state); + anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state); + anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state); - u_trace_flush(&flush->ds.trace, flush, true); + u_trace_flush(&submit->ds.trace, submit, true); - if (flush->batch.status != VK_SUCCESS) { - result = flush->batch.status; + if (submit->batch.status != VK_SUCCESS) { + result = submit->batch.status; goto error_batch; } } else { for (uint32_t i = 0; i < cmd_buffer_count; i++) { assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); - u_trace_flush(&cmd_buffers[i]->trace, flush, i == (cmd_buffer_count - 1)); + u_trace_flush(&cmd_buffers[i]->trace, submit, i == (cmd_buffer_count - 1)); } } - flush->queue = queue; + submit->queue = queue; - *out_flush_data = flush; + *out_submit = submit; return VK_SUCCESS; error_batch: - anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); + anv_reloc_list_finish(&submit->relocs, &device->vk.alloc); error_reloc_list: - anv_bo_pool_free(&device->utrace_bo_pool, flush->batch_bo); + anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); error_batch_buf: - anv_bo_pool_free(&device->utrace_bo_pool, flush->trace_bo); + anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo); error_trace_buf: - vk_sync_destroy(&device->vk, flush->sync); + vk_sync_destroy(&device->vk, submit->sync); error_sync: - vk_free(&device->vk.alloc, flush); + intel_ds_flush_data_fini(&submit->ds); + vk_free(&device->vk.alloc, submit); return result; } @@ -200,7 +215,7 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b) struct anv_bo *bo = NULL; UNUSED VkResult result = anv_bo_pool_alloc(&device->utrace_bo_pool, - align_u32(size_b, 4096), + align(size_b, 4096), &bo); assert(result == VK_SUCCESS); @@ -222,15 +237,17 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs, void *timestamps, unsigned idx, bool end_of_pipe) { - struct anv_cmd_buffer *cmd_buffer = - container_of(ut, struct anv_cmd_buffer, trace); - struct anv_device *device = cmd_buffer->device; + struct anv_device *device = + container_of(ut->utctx, struct anv_device, ds.trace_context); + struct anv_batch *batch = + cs != NULL ? 
cs : + &container_of(ut, struct anv_cmd_buffer, trace)->batch; struct anv_bo *bo = timestamps; enum anv_timestamp_capture_type capture_type = (end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE; - device->physical->cmd_emit_timestamp(&cmd_buffer->batch, device, + device->physical->cmd_emit_timestamp(batch, device, (struct anv_address) { .bo = bo, .offset = idx * sizeof(uint64_t) }, @@ -244,13 +261,13 @@ anv_utrace_read_ts(struct u_trace_context *utctx, struct anv_device *device = container_of(utctx, struct anv_device, ds.trace_context); struct anv_bo *bo = timestamps; - struct anv_utrace_flush_copy *flush = flush_data; + struct anv_utrace_submit *submit = flush_data; /* Only need to stall on results for the first entry: */ if (idx == 0) { UNUSED VkResult result = vk_sync_wait(&device->vk, - flush->sync, + submit->sync, 0, VK_SYNC_WAIT_COMPLETE, os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)); @@ -271,7 +288,7 @@ anv_device_utrace_init(struct anv_device *device) { anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace"); intel_ds_device_init(&device->ds, device->info, device->fd, - device->physical->local_minor - 128, + device->physical->local_minor, INTEL_DS_API_VULKAN); u_trace_context_init(&device->ds.trace_context, &device->ds, @@ -279,14 +296,14 @@ anv_device_utrace_init(struct anv_device *device) anv_utrace_destroy_ts_buffer, anv_utrace_record_ts, anv_utrace_read_ts, - anv_utrace_delete_flush_data); + anv_utrace_delete_submit); for (uint32_t q = 0; q < device->queue_count; q++) { struct anv_queue *queue = &device->queues[q]; intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u", - intel_engines_class_to_string(queue->family->engine_class), - queue->index_in_family); + intel_engines_class_to_string(queue->family->engine_class), + queue->vk.index_in_family); } } @@ -319,6 +336,8 @@ anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits) { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, }, { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, }, { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT, .ds = INTEL_DS_PSS_STALL_SYNC_BIT, }, + { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT, .ds = INTEL_DS_END_OF_PIPE_BIT, }, }; enum intel_ds_stall_flag ret = 0; @@ -329,3 +348,140 @@ anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits) return ret; } + +void anv_CmdBeginDebugUtilsLabelEXT( + VkCommandBuffer _commandBuffer, + const VkDebugUtilsLabelEXT *pLabelInfo) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer); + + vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo); + + trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace); +} + +void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer) +{ + VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer); + + if (cmd_buffer->vk.labels.size > 0) { + const VkDebugUtilsLabelEXT *label = + util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT); + + trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace, + strlen(label->pLabelName), + label->pLabelName); + } + + vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer); +} + +void +anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin) +{ + struct anv_device *device = queue->device; + + VkResult result; + struct anv_utrace_submit *submit = + vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit), + 8, 
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!submit) + return; + + submit->queue = queue; + + intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id); + + result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type, + 0, 0, &submit->sync); + if (result != VK_SUCCESS) + goto error_trace; + + result = anv_bo_pool_alloc(&device->utrace_bo_pool, 4096, + &submit->batch_bo); + if (result != VK_SUCCESS) + goto error_sync; + + result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc); + if (result != VK_SUCCESS) + goto error_batch_bo; + + submit->batch.alloc = &device->vk.alloc; + submit->batch.relocs = &submit->relocs; + anv_batch_set_storage(&submit->batch, + (struct anv_address) { .bo = submit->batch_bo, }, + submit->batch_bo->map, submit->batch_bo->size); + + if (frame) { + if (begin) + trace_intel_begin_frame(&submit->ds.trace, &submit->batch); + else + trace_intel_end_frame(&submit->ds.trace, &submit->batch, + device->debug_frame_desc->frame_id); + } else { + if (begin) { + trace_intel_begin_queue_annotation(&submit->ds.trace, &submit->batch); + } else { + trace_intel_end_queue_annotation(&submit->ds.trace, + &submit->batch, + strlen(label), + label); + } + } + + anv_batch_emit(&submit->batch, GFX8_MI_BATCH_BUFFER_END, bbs); + anv_batch_emit(&submit->batch, GFX8_MI_NOOP, noop); + + if (submit->batch.status != VK_SUCCESS) { + result = submit->batch.status; + goto error_reloc_list; + } + + u_trace_flush(&submit->ds.trace, submit, true); + + pthread_mutex_lock(&device->mutex); + device->kmd_backend->queue_exec_trace(queue, submit); + pthread_mutex_unlock(&device->mutex); + + return; + + error_reloc_list: + anv_reloc_list_finish(&submit->relocs, &device->vk.alloc); + error_batch_bo: + anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); + error_sync: + vk_sync_destroy(&device->vk, submit->sync); + error_trace: + intel_ds_flush_data_fini(&submit->ds); + vk_free(&device->vk.alloc, submit); +} + +void +anv_QueueBeginDebugUtilsLabelEXT( + VkQueue _queue, + const VkDebugUtilsLabelEXT *pLabelInfo) +{ + VK_FROM_HANDLE(anv_queue, queue, _queue); + + vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo); + + anv_queue_trace(queue, pLabelInfo->pLabelName, + false /* frame */, true /* begin */); +} + +void +anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue) +{ + VK_FROM_HANDLE(anv_queue, queue, _queue); + + if (queue->vk.labels.size > 0) { + const VkDebugUtilsLabelEXT *label = + util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT); + anv_queue_trace(queue, label->pLabelName, + false /* frame */, false /* begin */); + + u_trace_context_process(&queue->device->ds.trace_context, true); + } + + vk_common_QueueEndDebugUtilsLabelEXT(_queue); +} diff --git a/lib/mesa/src/intel/vulkan/anv_video.c b/lib/mesa/src/intel/vulkan/anv_video.c new file mode 100644 index 000000000..38a3b09b2 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/anv_video.c @@ -0,0 +1,267 @@ +/* + * Copyright © 2021 Red Hat + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in 
all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "vk_video/vulkan_video_codecs_common.h" + +VkResult +anv_CreateVideoSessionKHR(VkDevice _device, + const VkVideoSessionCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkVideoSessionKHR *pVideoSession) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + struct anv_video_session *vid = + vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*vid), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!vid) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(vid, 0, sizeof(struct anv_video_session)); + + VkResult result = vk_video_session_init(&device->vk, + &vid->vk, + pCreateInfo); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, vid); + return result; + } + + *pVideoSession = anv_video_session_to_handle(vid); + return VK_SUCCESS; +} + +void +anv_DestroyVideoSessionKHR(VkDevice _device, + VkVideoSessionKHR _session, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_video_session, vid, _session); + if (!_session) + return; + + vk_object_base_finish(&vid->vk.base); + vk_free2(&device->vk.alloc, pAllocator, vid); +} + +VkResult +anv_CreateVideoSessionParametersKHR(VkDevice _device, + const VkVideoSessionParametersCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkVideoSessionParametersKHR *pVideoSessionParameters) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_video_session, vid, pCreateInfo->videoSession); + ANV_FROM_HANDLE(anv_video_session_params, templ, pCreateInfo->videoSessionParametersTemplate); + struct anv_video_session_params *params = + vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*params), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!params) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = vk_video_session_parameters_init(&device->vk, + ¶ms->vk, + &vid->vk, + templ ? 
&templ->vk : NULL, + pCreateInfo); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, params); + return result; + } + + *pVideoSessionParameters = anv_video_session_params_to_handle(params); + return VK_SUCCESS; +} + +void +anv_DestroyVideoSessionParametersKHR(VkDevice _device, + VkVideoSessionParametersKHR _params, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_video_session_params, params, _params); + if (!_params) + return; + vk_video_session_parameters_finish(&device->vk, ¶ms->vk); + vk_free2(&device->vk.alloc, pAllocator, params); +} + +VkResult +anv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, + const VkVideoProfileInfoKHR *pVideoProfile, + VkVideoCapabilitiesKHR *pCapabilities) +{ + pCapabilities->minBitstreamBufferOffsetAlignment = 32; + pCapabilities->minBitstreamBufferSizeAlignment = 32; + pCapabilities->pictureAccessGranularity.width = ANV_MB_WIDTH; + pCapabilities->pictureAccessGranularity.height = ANV_MB_HEIGHT; + pCapabilities->minCodedExtent.width = ANV_MB_WIDTH; + pCapabilities->minCodedExtent.height = ANV_MB_HEIGHT; + pCapabilities->maxCodedExtent.width = 4096; + pCapabilities->maxCodedExtent.height = 4096; + pCapabilities->flags = VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR; + + struct VkVideoDecodeCapabilitiesKHR *dec_caps = (struct VkVideoDecodeCapabilitiesKHR *) + vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_CAPABILITIES_KHR); + if (dec_caps) + dec_caps->flags = VK_VIDEO_DECODE_CAPABILITY_DPB_AND_OUTPUT_COINCIDE_BIT_KHR; + + switch (pVideoProfile->videoCodecOperation) { + case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: { + struct VkVideoDecodeH264CapabilitiesKHR *ext = (struct VkVideoDecodeH264CapabilitiesKHR *) + vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H264_CAPABILITIES_KHR); + pCapabilities->maxDpbSlots = 17; + pCapabilities->maxActiveReferencePictures = 16; + + ext->fieldOffsetGranularity.x = 0; + ext->fieldOffsetGranularity.y = 0; + ext->maxLevelIdc = 51; + strcpy(pCapabilities->stdHeaderVersion.extensionName, VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_EXTENSION_NAME); + pCapabilities->stdHeaderVersion.specVersion = VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_SPEC_VERSION; + break; + } + default: + break; + } + return VK_SUCCESS; +} + +VkResult +anv_GetPhysicalDeviceVideoFormatPropertiesKHR(VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceVideoFormatInfoKHR *pVideoFormatInfo, + uint32_t *pVideoFormatPropertyCount, + VkVideoFormatPropertiesKHR *pVideoFormatProperties) +{ + *pVideoFormatPropertyCount = 1; + + if (!pVideoFormatProperties) + return VK_SUCCESS; + + pVideoFormatProperties[0].format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + pVideoFormatProperties[0].imageType = VK_IMAGE_TYPE_2D; + pVideoFormatProperties[0].imageTiling = VK_IMAGE_TILING_OPTIMAL; + pVideoFormatProperties[0].imageUsageFlags = pVideoFormatInfo->imageUsage; + return VK_SUCCESS; +} + +static void +get_h264_video_session_mem_reqs(struct anv_video_session *vid, + VkVideoSessionMemoryRequirementsKHR *mem_reqs, + uint32_t memory_types) +{ + uint32_t width_in_mb = align(vid->vk.max_coded.width, ANV_MB_WIDTH) / ANV_MB_WIDTH; + /* intra row store is width in macroblocks * 64 */ + mem_reqs[0].memoryBindIndex = ANV_VID_MEM_H264_INTRA_ROW_STORE; + mem_reqs[0].memoryRequirements.size = width_in_mb * 64; + mem_reqs[0].memoryRequirements.alignment = 4096; + mem_reqs[0].memoryRequirements.memoryTypeBits = memory_types; + + /* deblocking filter row store is width in 
macroblocks * 64 * 4*/ + mem_reqs[1].memoryBindIndex = ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE; + mem_reqs[1].memoryRequirements.size = width_in_mb * 64 * 4; + mem_reqs[1].memoryRequirements.alignment = 4096; + mem_reqs[1].memoryRequirements.memoryTypeBits = memory_types; + + /* bsd mpc row scratch is width in macroblocks * 64 * 2 */ + mem_reqs[2].memoryBindIndex = ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH; + mem_reqs[2].memoryRequirements.size = width_in_mb * 64 * 2; + mem_reqs[2].memoryRequirements.alignment = 4096; + mem_reqs[2].memoryRequirements.memoryTypeBits = memory_types; + + /* mpr row scratch is width in macroblocks * 64 * 2 */ + mem_reqs[3].memoryBindIndex = ANV_VID_MEM_H264_MPR_ROW_SCRATCH; + mem_reqs[3].memoryRequirements.size = width_in_mb * 64 * 2; + mem_reqs[3].memoryRequirements.alignment = 4096; + mem_reqs[3].memoryRequirements.memoryTypeBits = memory_types; +} + +VkResult +anv_GetVideoSessionMemoryRequirementsKHR(VkDevice _device, + VkVideoSessionKHR videoSession, + uint32_t *pVideoSessionMemoryRequirementsCount, + VkVideoSessionMemoryRequirementsKHR *mem_reqs) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_video_session, vid, videoSession); + + switch (vid->vk.op) { + case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: + *pVideoSessionMemoryRequirementsCount = ANV_VIDEO_MEM_REQS_H264; + break; + default: + unreachable("unknown codec"); + } + if (!mem_reqs) + return VK_SUCCESS; + + uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1; + switch (vid->vk.op) { + case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: + get_h264_video_session_mem_reqs(vid, mem_reqs, memory_types); + break; + default: + unreachable("unknown codec"); + } + + return VK_SUCCESS; +} + +VkResult +anv_UpdateVideoSessionParametersKHR(VkDevice _device, + VkVideoSessionParametersKHR _params, + const VkVideoSessionParametersUpdateInfoKHR *pUpdateInfo) +{ + ANV_FROM_HANDLE(anv_video_session_params, params, _params); + return vk_video_session_parameters_update(¶ms->vk, pUpdateInfo); +} + +static void +copy_bind(struct anv_vid_mem *dst, + const VkBindVideoSessionMemoryInfoKHR *src) +{ + dst->mem = anv_device_memory_from_handle(src->memory); + dst->offset = src->memoryOffset; + dst->size = src->memorySize; +} + +VkResult +anv_BindVideoSessionMemoryKHR(VkDevice _device, + VkVideoSessionKHR videoSession, + uint32_t bind_mem_count, + const VkBindVideoSessionMemoryInfoKHR *bind_mem) +{ + ANV_FROM_HANDLE(anv_video_session, vid, videoSession); + + assert(bind_mem_count == 4); + switch (vid->vk.op) { + case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: + for (unsigned i = 0; i < bind_mem_count; i++) { + copy_bind(&vid->vid_mem[bind_mem[i].memoryBindIndex], &bind_mem[i]); + } + break; + default: + unreachable("unknown codec"); + } + return VK_SUCCESS; +} diff --git a/lib/mesa/src/intel/vulkan/genX_acceleration_structure.c b/lib/mesa/src/intel/vulkan/genX_acceleration_structure.c index 3958452f0..4c675e985 100644 --- a/lib/mesa/src/intel/vulkan/genX_acceleration_structure.c +++ b/lib/mesa/src/intel/vulkan/genX_acceleration_structure.c @@ -31,6 +31,7 @@ #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" +#include "genxml/genX_rt_pack.h" #if GFX_VERx10 >= 125 @@ -167,7 +168,7 @@ get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, struct MKSizeEstimate est = {}; uint64_t size = sizeof(BVHBase); - size = align_u64(size, 64); + size = align64(size, 64); /* Must immediately follow BVHBase because we use fixed offset to nodes. 
*/ est.node_data_start = size; @@ -258,25 +259,25 @@ get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, unreachable("Unsupported acceleration structure type"); } - size = align_u64(size, 64); + size = align64(size, 64); est.instance_descs_start = size; size += sizeof(struct InstanceDesc) * num_instances; est.geo_meta_data_start = size; size += sizeof(struct GeoMetaData) * pInfo->geometryCount; - size = align_u64(size, 64); + size = align64(size, 64); - assert(size == align_u64(size, 64)); + assert(size == align64(size, 64)); est.back_pointer_start = size; const bool alloc_backpointers = false; /* RT TODO */ if (alloc_backpointers) { size += est.max_inner_nodes * sizeof(uint32_t); - size = align_u64(size, 64); + size = align64(size, 64); } assert(size < UINT32_MAX); - est.sizeTotal = align_u64(size, 64); + est.sizeTotal = align64(size, 64); return est; } @@ -392,62 +393,6 @@ genX(GetAccelerationStructureBuildSizesKHR)( pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize; } -VkResult -genX(CreateAccelerationStructureKHR)( - VkDevice _device, - const VkAccelerationStructureCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkAccelerationStructureKHR* pAccelerationStructure) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); - struct anv_acceleration_structure *accel; - - accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (accel == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - vk_object_base_init(&device->vk, &accel->base, - VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR); - - accel->size = pCreateInfo->size; - accel->address = anv_address_add(buffer->address, pCreateInfo->offset); - - *pAccelerationStructure = anv_acceleration_structure_to_handle(accel); - - return VK_SUCCESS; -} - -void -genX(DestroyAccelerationStructureKHR)( - VkDevice _device, - VkAccelerationStructureKHR accelerationStructure, - const VkAllocationCallbacks* pAllocator) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure); - - if (!accel) - return; - - vk_object_base_finish(&accel->base); - vk_free2(&device->vk.alloc, pAllocator, accel); -} - -VkDeviceAddress -genX(GetAccelerationStructureDeviceAddressKHR)( - VkDevice device, - const VkAccelerationStructureDeviceAddressInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_acceleration_structure, accel, - pInfo->accelerationStructure); - - assert(!anv_address_is_null(accel->address)); - - return anv_address_physical(accel->address); -} - void genX(GetDeviceAccelerationStructureCompatibilityKHR)( VkDevice _device, @@ -703,12 +648,12 @@ cmd_build_acceleration_structures( const uint32_t *pMaxPrimitiveCounts = ppMaxPrimitiveCounts ? 
ppMaxPrimitiveCounts[i] : NULL; - ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, + ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dstAccelerationStructure); bs->build_method = device->bvh_build_method; - bs->bvh_addr = dst_accel->address; + bs->bvh_addr = anv_address_from_u64(vk_acceleration_structure_get_va(dst_accel)); bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos, pMaxPrimitiveCounts); @@ -872,6 +817,17 @@ cmd_build_acceleration_structures( &data, sizeof(data)); } + if (anv_cmd_buffer_is_render_queue(cmd_buffer)) + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + /* Due to the nature of GRL and its heavy use of jumps/predication, we + * cannot tell exactly in what order the CFE_STATE we insert are going to + * be executed. So always use the largest possible size. + */ + genX(cmd_buffer_ensure_cfe_state)( + cmd_buffer, + cmd_buffer->device->physical->max_grl_scratch_size); + /* Round 1 : init_globals kernel */ genX(grl_misc_batched_init_globals)( cmd_buffer, @@ -1162,24 +1118,26 @@ genX(CmdCopyAccelerationStructureKHR)( const VkCopyAccelerationStructureInfoKHR* pInfo) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_acceleration_structure, src_accel, pInfo->src); - ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, pInfo->dst); + ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src); + ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst); assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR || pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR); if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) { - struct anv_address src_size_addr = anv_address_add( - src_accel->address, - offsetof(struct BVHBase, Meta.allocationSize)); - genX(grl_copy_clone_indirect)(cmd_buffer, - anv_address_physical(dst_accel->address), - anv_address_physical(src_accel->address), - anv_address_physical(src_size_addr)); + uint64_t src_size_addr = + vk_acceleration_structure_get_va(src_accel) + + offsetof(struct BVHBase, Meta.allocationSize); + genX(grl_copy_clone_indirect)( + cmd_buffer, + vk_acceleration_structure_get_va(dst_accel), + vk_acceleration_structure_get_va(src_accel), + src_size_addr); } else { - genX(grl_copy_compact)(cmd_buffer, - anv_address_physical(dst_accel->address), - anv_address_physical(src_accel->address)); + genX(grl_copy_compact)( + cmd_buffer, + vk_acceleration_structure_get_va(dst_accel), + vk_acceleration_structure_get_va(src_accel)); } cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; @@ -1191,19 +1149,20 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)( const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_acceleration_structure, src_accel, pInfo->src); + ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src); struct anv_device *device = cmd_buffer->device; - struct anv_address src_size_addr = anv_address_add( - src_accel->address, - offsetof(struct BVHBase, Meta.allocationSize)); + uint64_t src_size_addr = + vk_acceleration_structure_get_va(src_accel) + + offsetof(struct BVHBase, Meta.allocationSize); assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR); - genX(grl_copy_serialize_indirect)(cmd_buffer, - pInfo->dst.deviceAddress, - anv_address_physical(src_accel->address), - anv_address_physical(device->rt_uuid_addr), - anv_address_physical(src_size_addr)); + genX(grl_copy_serialize_indirect)( + cmd_buffer, + 
pInfo->dst.deviceAddress, + vk_acceleration_structure_get_va(src_accel), + anv_address_physical(device->rt_uuid_addr), + src_size_addr); cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; } @@ -1214,16 +1173,17 @@ genX(CmdCopyMemoryToAccelerationStructureKHR)( const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_acceleration_structure, dst_accel, pInfo->dst); + ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst); assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR); uint64_t src_size_addr = pInfo->src.deviceAddress + offsetof(struct SerializationHeader, DeserializedSizeInBytes); - genX(grl_copy_deserialize_indirect)(cmd_buffer, - anv_address_physical(dst_accel->address), - pInfo->src.deviceAddress, - src_size_addr); + genX(grl_copy_deserialize_indirect)( + cmd_buffer, + vk_acceleration_structure_get_va(dst_accel), + pInfo->src.deviceAddress, + src_size_addr); cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; } diff --git a/lib/mesa/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/lib/mesa/src/intel/vulkan/genX_cmd_draw_generated_indirect.h new file mode 100644 index 000000000..ccb1bd7a2 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -0,0 +1,750 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GENX_CMD_GENERATED_INDIRECT_DRAW_H +#define GENX_CMD_GENERATED_INDIRECT_DRAW_H + +#include <assert.h> +#include <stdbool.h> + +#include "util/macros.h" + +#include "common/intel_genX_state.h" + +#include "anv_private.h" +#include "anv_generated_indirect_draws.h" + +/* This is a maximum number of items a fragment shader can generate due to the + * viewport size. 
+ */ +#define MAX_GENERATED_DRAW_COUNT (8192 * 8192) + +static void +genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_device *device = cmd_buffer->device; + const struct anv_shader_bin *draw_kernel = device->generated_draw_kernel; + const struct brw_wm_prog_data *prog_data = + brw_wm_prog_data_const(draw_kernel->prog_data); + + uint32_t *dw = anv_batch_emitn(batch, + 1 + 2 * GENX(VERTEX_ELEMENT_STATE_length), + GENX(3DSTATE_VERTEX_ELEMENTS)); + /* You might think there is some shady stuff going here and you would be + * right. We're setting up 2 VERTEX_ELEMENT_STATE yet we're only providing + * 1 (positions) VERTEX_BUFFER_STATE later. + * + * Find more about how to set up a 3D pipeline with a fragment shader but + * without a vertex shader in blorp_emit_vertex_elements() in + * blorp_genX_exec.h. + */ + GENX(VERTEX_ELEMENT_STATE_pack)( + batch, dw + 1, &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 1, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT, + .SourceElementOffset = 0, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }); + GENX(VERTEX_ELEMENT_STATE_pack)( + batch, dw + 3, &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 0, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT, + .SourceElementOffset = 0, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_SRC, + .Component2Control = VFCOMP_STORE_SRC, + .Component3Control = VFCOMP_STORE_1_FP, + }); + + anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf); + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) { + sgvs.InstanceIDEnable = true; + sgvs.InstanceIDComponentNumber = COMP_1; + sgvs.InstanceIDElementOffset = 0; + } +#if GFX_VER >= 11 + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs); +#endif + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 1; + } + + anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_RECTLIST; + } + + /* Emit URB setup. We tell it that the VS is active because we want it to + * allocate space for the VS. Even though one isn't run, we need VUEs to + * store the data that VF is going to pass to SOL. 
+ */ + const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + + genX(emit_l3_config)(batch, device, device->generated_draw_l3_config); + + cmd_buffer->state.current_l3_config = device->generated_draw_l3_config; + + enum intel_urb_deref_block_size deref_block_size; + genX(emit_urb_setup)(device, batch, device->generated_draw_l3_config, + VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, + entry_size, &deref_block_size); + + anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { + ps_blend.HasWriteableRT = true; + } + + anv_batch_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm); + +#if GFX_VER >= 12 + anv_batch_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + db.DepthBoundsTestEnable = false; + db.DepthBoundsTestMinValue = 0.0; + db.DepthBoundsTestMaxValue = 1.0; + } +#endif + + anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms); + anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + sm.SampleMask = 0x1; + } + + anv_batch_emit(batch, GENX(3DSTATE_VS), vs); + anv_batch_emit(batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(batch, GENX(3DSTATE_TE), te); + anv_batch_emit(batch, GENX(3DSTATE_DS), DS); + +#if GFX_VERx10 >= 125 + if (device->vk.enabled_extensions.NV_mesh_shader || + device->vk.enabled_extensions.EXT_mesh_shader) { + anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh); + anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task); + } +#endif + + anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so); + + anv_batch_emit(batch, GENX(3DSTATE_GS), gs); + + anv_batch_emit(batch, GENX(3DSTATE_CLIP), clip) { + clip.PerspectiveDivideDisable = true; + } + + anv_batch_emit(batch, GENX(3DSTATE_SF), sf) { +#if GFX_VER >= 12 + sf.DerefBlockSize = deref_block_size; +#endif + } + + anv_batch_emit(batch, GENX(3DSTATE_RASTER), raster) { + raster.CullMode = CULLMODE_NONE; + } + + anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + sbe.VertexURBEntryReadLength = MAX2((prog_data->num_varying_inputs + 1) / 2, 1); + sbe.ConstantInterpolationEnable = prog_data->flat_inputs; + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; + } + + anv_batch_emit(batch, GENX(3DSTATE_WM), wm); + + anv_batch_emit(batch, GENX(3DSTATE_PS), ps) { + intel_set_ps_dispatch_state(&ps, device->info, prog_data, + 1 /* rasterization_samples */, + 0 /* msaa_flags */); + + ps.VectorMaskEnable = prog_data->uses_vmask; + + ps.BindingTableEntryCount = GFX_VER == 9 ? 
1 : 0; + ps.PushConstantEnable = prog_data->base.nr_params > 0 || + prog_data->base.ubo_ranges[0].length; + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); + + ps.KernelStartPointer0 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); + ps.KernelStartPointer2 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 2); + + ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1; + } + + anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) { + psx.PixelShaderValid = true; + psx.AttributeEnable = prog_data->num_varying_inputs > 0; + psx.PixelShaderIsPerSample = prog_data->persample_dispatch; + psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode; + psx.PixelShaderComputesStencil = prog_data->computed_stencil; + } + + anv_batch_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * GENX(CC_VIEWPORT_length), 32); + struct GENX(CC_VIEWPORT) cc_viewport = { + .MinimumDepth = 0.0f, + .MaximumDepth = 1.0f, + }; + GENX(CC_VIEWPORT_pack)(NULL, cc_state.map, &cc_viewport); + cc.CCViewportPointer = cc_state.offset; + } + +#if GFX_VER >= 12 + /* Disable Primitive Replication. */ + anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_HS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_DS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { + alloc.ConstantBufferOffset = 0; + alloc.ConstantBufferSize = cmd_buffer->device->info->max_constant_urb_size_kb; + } + +#if GFX_VERx10 == 125 + /* DG2: Wa_22011440098 + * MTL: Wa_18022330953 + * + * In 3D mode, after programming push constant alloc command immediately + * program push constant command(ZERO length) without any commit between + * them. + * + * Note that Wa_16011448509 isn't needed here as all address bits are zero. + */ + anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_ALL), c) { + /* Update empty push constants for all stages (bitmask = 11111b) */ + c.ShaderUpdateEnable = 0x1f; + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } +#endif + +#if GFX_VER == 9 + /* Allocate a binding table for Gfx9 for 2 reason : + * + * 1. we need a to emit a 3DSTATE_BINDING_TABLE_POINTERS_PS to make the + * HW apply the preceeding 3DSTATE_CONSTANT_PS + * + * 2. Emitting an empty 3DSTATE_BINDING_TABLE_POINTERS_PS would cause RT + * writes (even though they're empty) to disturb later writes + * (probably due to RT cache) + * + * Our binding table only has one entry to the null surface. 
+ */ + uint32_t bt_offset; + cmd_buffer->generation_bt_state = + anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset); + if (cmd_buffer->generation_bt_state.map == NULL) { + VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + return; + + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. + */ + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + + cmd_buffer->generation_bt_state = + anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset); + assert(cmd_buffer->generation_bt_state.map != NULL); + } + + uint32_t *bt_map = cmd_buffer->generation_bt_state.map; + bt_map[0] = anv_bindless_state_for_binding_table( + cmd_buffer->device->null_surface_state).offset + bt_offset; + + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; +#endif + + cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0); + cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER | + ANV_CMD_DIRTY_XFB_ENABLE); + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; + cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT; + vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state); +} + +static void +genX(cmd_buffer_emit_generate_draws_vertex)(struct anv_cmd_buffer *cmd_buffer, + uint32_t draw_count) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_state vs_data_state = + anv_cmd_buffer_alloc_dynamic_state( + cmd_buffer, 9 * sizeof(uint32_t), 32); + + float x0 = 0.0f, x1 = MIN2(draw_count, 8192); + float y0 = 0.0f, y1 = DIV_ROUND_UP(draw_count, 8192); + float z = 0.0f; + + float *vertices = vs_data_state.map; + vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */ + vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */ + vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */ + + uint32_t *dw = anv_batch_emitn(batch, + 1 + GENX(VERTEX_BUFFER_STATE_length), + GENX(3DSTATE_VERTEX_BUFFERS)); + GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = 0, + .AddressModifyEnable = true, + .BufferStartingAddress = (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = vs_data_state.offset, + }, + .BufferPitch = 3 * sizeof(float), + .BufferSize = 9 * sizeof(float), + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif + }); +} + +static void +genX(cmd_buffer_emit_generated_push_data)(struct anv_cmd_buffer *cmd_buffer, + struct anv_state push_data_state) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_address push_data_addr = anv_state_pool_state_address( + &cmd_buffer->device->dynamic_state_pool, push_data_state); + + /* Don't use 3DSTATE_CONSTANT_ALL on Gfx12.0 due to Wa_16011448509 */ +#if GFX_VERx10 > 120 + const uint32_t num_dwords = GENX(3DSTATE_CONSTANT_ALL_length) + + GENX(3DSTATE_CONSTANT_ALL_DATA_length); + uint32_t *dw = + anv_batch_emitn(batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = BITFIELD_BIT(MESA_SHADER_FRAGMENT), + .PointerBufferMask = 0x1, + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0)); + + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + batch, dw + GENX(3DSTATE_CONSTANT_ALL_length), + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = push_data_addr, + .ConstantBufferReadLength = DIV_ROUND_UP(push_data_state.alloc_size, 32), + }); +#else + /* The 
Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the highest slot. + */ + anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) { + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(push_data_state.alloc_size, 32); + c.ConstantBody.Buffer[3] = push_data_addr; + } +#endif +} + +static struct anv_generated_indirect_params * +genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address generated_cmds_addr, + uint32_t generated_cmd_stride, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + struct anv_address draw_id_addr, + uint32_t item_base, + uint32_t item_count, + struct anv_address count_addr, + uint32_t max_count, + bool indexed) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + + genX(cmd_buffer_emit_generate_draws_vertex)(cmd_buffer, item_count); + + struct anv_state push_data_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + sizeof(struct anv_generated_indirect_params), + ANV_UBO_ALIGNMENT); + + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + struct anv_generated_indirect_params *push_data = push_data_state.map; + *push_data = (struct anv_generated_indirect_params) { + .draw = { + .draw_id_addr = anv_address_physical(draw_id_addr), + .indirect_data_addr = anv_address_physical(indirect_data_addr), + .indirect_data_stride = indirect_data_stride, + .flags = (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) | + (cmd_buffer->state.conditional_render_enabled ? + ANV_GENERATED_FLAG_PREDICATED : 0) | + ((vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) ? + ANV_GENERATED_FLAG_BASE : 0) | + (vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) | + (anv_mocs(cmd_buffer->device, indirect_data_addr.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) | + ((generated_cmd_stride / 4) << 16), + .draw_base = item_base, + /* If count_addr is not NULL, we'll edit it through a the command + * streamer. + */ + .draw_count = anv_address_is_null(count_addr) ? max_count : 0, + .max_draw_count = max_count, + .instance_multiplier = pipeline->instance_multiplier, + }, + .indirect_data_addr = anv_address_physical(indirect_data_addr), + .generated_cmds_addr = anv_address_physical(generated_cmds_addr), + .draw_ids_addr = anv_address_physical(draw_id_addr), + }; + + if (!anv_address_is_null(count_addr)) { + /* Copy the draw count into the push constants so that the generation + * gets the value straight away and doesn't even need to access memory. + */ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, batch); + mi_memcpy(&b, + anv_address_add((struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = push_data_state.offset, + }, + offsetof(struct anv_generated_indirect_params, draw.draw_count)), + count_addr, 4); + + /* Make sure the memcpy landed for the generating draw call to pick up + * the value. + */ + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + } + } + + /* Only emit the data after the memcpy above. 
*/ + genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state); + +#if GFX_VER == 9 + /* Why are the push constants not flushed without a binding table + * update?? + */ + anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), btp) { + btp.PointertoPSBindingTable = cmd_buffer->generation_bt_state.offset; + } +#endif + + anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_RECTLIST; + prim.VertexCountPerInstance = 3; + prim.InstanceCount = 1; + } + + return push_data; +} + +static void +genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VER >= 12 + anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) { + arb.PreParserDisableMask = true; + arb.PreParserDisable = true; + } +#endif + + anv_batch_emit_ensure_space(&cmd_buffer->generation_batch, 4); + + trace_intel_begin_generate_draws(&cmd_buffer->trace); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = + anv_batch_current_address(&cmd_buffer->generation_batch); + } + + cmd_buffer->generation_return_addr = anv_batch_current_address(&cmd_buffer->batch); + + trace_intel_end_generate_draws(&cmd_buffer->trace); + + genX(cmd_buffer_emit_generate_draws_pipeline)(cmd_buffer); + +} + +static struct anv_address +genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer, + uint32_t draw_id_count) +{ +#if GFX_VER >= 11 + return ANV_NULL_ADDRESS; +#else + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + if (!vs_prog_data->uses_drawid) + return ANV_NULL_ADDRESS; + + struct anv_state draw_id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * draw_id_count, 4); + return anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool, + draw_id_state); +#endif +} + +static uint32_t +genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer) +{ + /* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit + * everything. Prior to this, we need to emit a couple of + * VERTEX_BUFFER_STATE. + */ +#if GFX_VER >= 11 + return 4 * GENX(3DPRIMITIVE_EXTENDED_length); +#else + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint32_t len = 0; + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance || + vs_prog_data->uses_drawid) { + len += 4; /* 3DSTATE_VERTEX_BUFFERS */ + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + len += 4 * GENX(VERTEX_BUFFER_STATE_length); + + if (vs_prog_data->uses_drawid) + len += 4 * GENX(VERTEX_BUFFER_STATE_length); + } + + return len + 4 * GENX(3DPRIMITIVE_length); +#endif +} + +static void +genX(cmd_buffer_rewrite_forward_end_addr)(struct anv_cmd_buffer *cmd_buffer, + struct anv_generated_indirect_params *params) +{ + /* We don't know the end_addr until we have emitted all the generation + * draws. Go and edit the address of all the push parameters. 
+ */ + uint64_t end_addr = + anv_address_physical(anv_batch_current_address(&cmd_buffer->batch)); + while (params != NULL) { + params->draw.end_addr = end_addr; + params = params->prev; + } +} + +static void +genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + struct anv_address count_addr, + uint32_t max_draw_count, + bool indexed) +{ + const bool start_generation_batch = + anv_address_is_null(cmd_buffer->generation_return_addr); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + struct anv_address draw_id_addr = + genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count); + +#if GFX_VER == 9 + /* Mark the VB-0 as using the entire dynamic state pool area, but only for + * the draw call starting the generation batch. All the following ones will + * use the same area. + */ + if (start_generation_batch) { + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 0, + (struct anv_address) { + .offset = DYNAMIC_STATE_POOL_MIN_ADDRESS, + }, + DYNAMIC_STATE_POOL_SIZE); + } + + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (vs_prog_data->uses_baseinstance || + vs_prog_data->uses_firstvertex) { + /* We're using the indirect buffer directly to source base instance & + * first vertex values. Mark the entire area as used. + */ + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX, + indirect_data_addr, + indirect_data_stride * max_draw_count); + } + + if (vs_prog_data->uses_drawid) { + /* Mark the whole draw id buffer as used. */ + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX, + draw_id_addr, + sizeof(uint32_t) * max_draw_count); + } +#endif + + /* Apply the pipeline flush here so the indirect data is available for the + * generation shader. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + if (start_generation_batch) + genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer); + + /* In order to have the vertex fetch gather the data we need to have a non + * 0 stride. It's possible to have a 0 stride given by the application when + * draw_count is 1, but we need a correct value for the + * VERTEX_BUFFER_STATE::BufferPitch, so ensure the caller set this + * correctly : + * + * Vulkan spec, vkCmdDrawIndirect: + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + assert(indirect_data_stride > 0); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + /* Emit the 3D state in the main batch. */ + genX(cmd_buffer_flush_gfx_state)(cmd_buffer); + + const uint32_t draw_cmd_stride = + genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer); + + struct anv_generated_indirect_params *last_params = NULL; + uint32_t item_base = 0; + while (item_base < max_draw_count) { + const uint32_t item_count = MIN2(max_draw_count - item_base, + MAX_GENERATED_DRAW_COUNT); + const uint32_t draw_cmd_size = item_count * draw_cmd_stride; + + /* Ensure we have enough contiguous space for all the draws so that the + * compute shader can edit all the 3DPRIMITIVEs from a single base + * address. + * + * TODO: we might have to split that if the amount of space is to large (at + * 1Mb?). 
+ */ + VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch, + draw_cmd_size); + if (result != VK_SUCCESS) + return; + + struct anv_generated_indirect_params *params = + genX(cmd_buffer_emit_generate_draws)( + cmd_buffer, + anv_batch_current_address(&cmd_buffer->batch), + draw_cmd_stride, + anv_address_add(indirect_data_addr, + item_base * indirect_data_stride), + indirect_data_stride, + anv_address_add(draw_id_addr, 4 * item_base), + item_base, + item_count, + count_addr, + max_draw_count, + indexed); + + anv_batch_advance(&cmd_buffer->batch, draw_cmd_size); + + item_base += item_count; + + params->prev = last_params; + last_params = params; + } + + genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params); + +#if GFX_VER == 9 + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL); +#endif +} + +static void +genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer) +{ + /* No return address setup means we don't have to do anything */ + if (anv_address_is_null(cmd_buffer->generation_return_addr)) + return; + + struct anv_batch *batch = &cmd_buffer->generation_batch; + + /* Wait for all the generation vertex shader to generate the commands. */ + genX(emit_apply_pipe_flushes)(batch, + cmd_buffer->device, + _3D, +#if GFX_VER == 9 + ANV_PIPE_VF_CACHE_INVALIDATE_BIT | +#endif + ANV_PIPE_DATA_CACHE_FLUSH_BIT | + ANV_PIPE_CS_STALL_BIT, + NULL /* query_bits */); + +#if GFX_VER >= 12 + anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) { + arb.PreParserDisableMask = true; + arb.PreParserDisable = false; + } +#else + /* Prior to Gfx12 we cannot disable the CS prefetch but it doesn't matter + * as the prefetch shouldn't follow the MI_BATCH_BUFFER_START. + */ +#endif + + /* Return to the main batch. */ + anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = cmd_buffer->generation_return_addr; + } + + cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS; +} + +#endif /* GENX_CMD_GENERATED_INDIRECT_DRAW_H */ diff --git a/lib/mesa/src/intel/vulkan/genX_cmd_draw_helpers.h b/lib/mesa/src/intel/vulkan/genX_cmd_draw_helpers.h new file mode 100644 index 000000000..8db6b5e75 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/genX_cmd_draw_helpers.h @@ -0,0 +1,154 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef GENX_CMD_DRAW_HELPERS_H +#define GENX_CMD_DRAW_HELPERS_H + +#include <assert.h> +#include <stdbool.h> + +#include "anv_private.h" + +#if GFX_VER < 11 +static void +emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + uint32_t size, uint32_t index) +{ + uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, + GENX(3DSTATE_VERTEX_BUFFERS)); + + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = index, + .AddressModifyEnable = true, + .BufferPitch = 0, + .MOCS = anv_mocs(cmd_buffer->device, addr.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + .NullVertexBuffer = size == 0, + .BufferStartingAddress = addr, + .BufferSize = size + }); + +#if GFX_VER == 9 + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, + index, addr, size); +#endif +} + +static void +emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr) +{ + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX); +} + +static void +emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, + uint32_t base_vertex, uint32_t base_instance) +{ + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + return; + } + + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; + + struct anv_address addr = + anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool, + id_state); + + emit_base_vertex_instance_bo(cmd_buffer, addr); +} + +static void +emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) +{ + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); + + ((uint32_t *)state.map)[0] = draw_index; + + struct anv_address addr = + anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool, + state); + + emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); +} +#endif /* GFX_VER <= 11 */ + +static void +update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ +#if GFX_VER == 9 + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = dyn->vi->bindings_valid; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, + access_type, + vb_used); +#endif +} + +#if GFX_VER < 11 +ALWAYS_INLINE static void +cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer, + const struct brw_vs_prog_data *vs_prog_data, + uint32_t base_vertex, + uint32_t base_instance, + uint32_t draw_id, + bool force_flush) +{ + bool emitted = false; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance); + emitted = true; + } + if (vs_prog_data->uses_drawid) { + emit_draw_index(cmd_buffer, draw_id); + emitted = true; + } + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + if (emitted || force_flush) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +} +#endif + +#endif /* GENX_CMD_DRAW_HELPERS_H */ diff --git a/lib/mesa/src/intel/vulkan/genX_video.c b/lib/mesa/src/intel/vulkan/genX_video.c new file mode 100644 index 000000000..0192d8703 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/genX_video.c @@ -0,0 +1,447 @@ +/* + * Copyright © 2021 Red Hat + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +void +genX(CmdBeginVideoCodingKHR)(VkCommandBuffer commandBuffer, + const VkVideoBeginCodingInfoKHR *pBeginInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_video_session, vid, pBeginInfo->videoSession); + ANV_FROM_HANDLE(anv_video_session_params, params, pBeginInfo->videoSessionParameters); + + cmd_buffer->video.vid = vid; + cmd_buffer->video.params = params; +} + +void +genX(CmdControlVideoCodingKHR)(VkCommandBuffer commandBuffer, + const VkVideoCodingControlInfoKHR *pCodingControlInfo) +{ + +} + +void +genX(CmdEndVideoCodingKHR)(VkCommandBuffer commandBuffer, + const VkVideoEndCodingInfoKHR *pEndCodingInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->video.vid = NULL; + cmd_buffer->video.params = NULL; +} + +static void +anv_h264_decode_video(struct anv_cmd_buffer *cmd_buffer, + const VkVideoDecodeInfoKHR *frame_info) +{ + ANV_FROM_HANDLE(anv_buffer, src_buffer, frame_info->srcBuffer); + struct anv_video_session *vid = cmd_buffer->video.vid; + struct anv_video_session_params *params = cmd_buffer->video.params; + const struct VkVideoDecodeH264PictureInfoKHR *h264_pic_info = + vk_find_struct_const(frame_info->pNext, VIDEO_DECODE_H264_PICTURE_INFO_KHR); + const StdVideoH264SequenceParameterSet *sps = vk_video_find_h264_dec_std_sps(¶ms->vk, h264_pic_info->pStdPictureInfo->seq_parameter_set_id); + const StdVideoH264PictureParameterSet *pps = vk_video_find_h264_dec_std_pps(¶ms->vk, h264_pic_info->pStdPictureInfo->pic_parameter_set_id); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) { + flush.DWordLength = 2; + flush.VideoPipelineCacheInvalidate = 1; + }; + +#if GFX_VER >= 12 + anv_batch_emit(&cmd_buffer->batch, GENX(MI_FORCE_WAKEUP), wake) { + wake.MFXPowerWellControl = 1; + wake.MaskBits = 768; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) { + mfx.MFXSyncControlFlag = 1; + } +#endif + + 
anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_MODE_SELECT), sel) { + sel.StandardSelect = SS_AVC; + sel.CodecSelect = Decode; + sel.DecoderShortFormatMode = ShortFormatDriverInterface; + sel.DecoderModeSelect = VLDMode; // Hardcoded + + sel.PreDeblockingOutputEnable = 0; + sel.PostDeblockingOutputEnable = 1; + } + +#if GFX_VER >= 12 + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) { + mfx.MFXSyncControlFlag = 1; + } +#endif + + const struct anv_image_view *iv = anv_image_view_from_handle(frame_info->dstPictureResource.imageViewBinding); + const struct anv_image *img = iv->image; + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_SURFACE_STATE), ss) { + ss.Width = img->vk.extent.width - 1; + ss.Height = img->vk.extent.height - 1; + ss.SurfaceFormat = PLANAR_420_8; // assert on this? + ss.InterleaveChroma = 1; + ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1; + ss.TiledSurface = img->planes[0].primary_surface.isl.tiling != ISL_TILING_LINEAR; + ss.TileWalk = TW_YMAJOR; + + ss.YOffsetforUCb = ss.YOffsetforVCr = + img->planes[1].primary_surface.memory_range.offset / img->planes[0].primary_surface.isl.row_pitch_B; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_BUF_ADDR_STATE), buf) { + bool use_pre_deblock = false; + if (use_pre_deblock) { + buf.PreDeblockingDestinationAddress = anv_image_address(img, + &img->planes[0].primary_surface.memory_range); + } else { + buf.PostDeblockingDestinationAddress = anv_image_address(img, + &img->planes[0].primary_surface.memory_range); + } + buf.PreDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, buf.PreDeblockingDestinationAddress.bo, 0), + }; + buf.PostDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, buf.PostDeblockingDestinationAddress.bo, 0), + }; + + buf.IntraRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].offset }; + buf.IntraRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, buf.IntraRowStoreScratchBufferAddress.bo, 0), + }; + buf.DeblockingFilterRowStoreScratchAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].offset }; + buf.DeblockingFilterRowStoreScratchAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterRowStoreScratchAddress.bo, 0), + }; + buf.MBStatusBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + buf.MBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + buf.SecondMBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + buf.ScaledReferenceSurfaceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + buf.OriginalUncompressedPictureSourceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + buf.StreamOutDataDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + + struct anv_bo *ref_bo = NULL; + for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) { + const 
struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding); + int idx = frame_info->pReferenceSlots[i].slotIndex; + buf.ReferencePictureAddress[idx] = anv_image_address(ref_iv->image, + &ref_iv->image->planes[0].primary_surface.memory_range); + + if (i == 0) { + ref_bo = ref_iv->image->bindings[0].address.bo; + } + } + buf.ReferencePictureAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, ref_bo, 0), + }; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_IND_OBJ_BASE_ADDR_STATE), index_obj) { + index_obj.MFXIndirectBitstreamObjectAddress = anv_address_add(src_buffer->address, + frame_info->srcBufferOffset & ~4095); + index_obj.MFXIndirectBitstreamObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, src_buffer->address.bo, 0), + }; + index_obj.MFXIndirectMVObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + index_obj.MFDIndirectITCOEFFObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + index_obj.MFDIndirectITDBLKObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + index_obj.MFCIndirectPAKBSEObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_BSP_BUF_BASE_ADDR_STATE), bsp) { + bsp.BSDMPCRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].mem->bo, + vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].offset }; + + bsp.BSDMPCRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, bsp.BSDMPCRowStoreScratchBufferAddress.bo, 0), + }; + bsp.MPRRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_MPR_ROW_SCRATCH].mem->bo, + vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].offset }; + + bsp.MPRRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, bsp.MPRRowStoreScratchBufferAddress.bo, 0), + }; + bsp.BitplaneReadBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_DPB_STATE), avc_dpb) { + for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) { + const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot = + vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR); + const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo; + int idx = frame_info->pReferenceSlots[i].slotIndex; + avc_dpb.NonExistingFrame[idx] = ref_info->flags.is_non_existing; + avc_dpb.LongTermFrame[idx] = ref_info->flags.used_for_long_term_reference; + if (!ref_info->flags.top_field_flag && !ref_info->flags.bottom_field_flag) + avc_dpb.UsedforReference[idx] = 3; + else + avc_dpb.UsedforReference[idx] = ref_info->flags.top_field_flag | (ref_info->flags.bottom_field_flag << 1); + avc_dpb.LTSTFrameNumberList[idx] = ref_info->FrameNum; + } + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_PICID_STATE), picid) { + picid.PictureIDRemappingDisable = true; + } + + uint32_t pic_height = sps->pic_height_in_map_units_minus1 + 1; + if (!sps->flags.frame_mbs_only_flag) + pic_height *= 2; + 
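   /* Worked example with hypothetical SPS values: a 1920x1088 stream coded
    * with frame_mbs_only_flag = 1 carries pic_width_in_mbs_minus1 = 119 and
    * pic_height_in_map_units_minus1 = 67, giving a width of 120 macroblocks,
    * pic_height = 68 and FrameSize = 120 * 68 = 8160 macroblocks. When
    * frame_mbs_only_flag is 0, the frame height in macroblocks is twice the
    * map-unit count, hence the doubling above; MFX_AVC_IMG_STATE then takes
    * the width and height in minus-one form.
    */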
anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_IMG_STATE), avc_img) { + avc_img.FrameWidth = sps->pic_width_in_mbs_minus1; + avc_img.FrameHeight = pic_height - 1; + avc_img.FrameSize = (sps->pic_width_in_mbs_minus1 + 1) * pic_height; + + if (!h264_pic_info->pStdPictureInfo->flags.field_pic_flag) + avc_img.ImageStructure = FramePicture; + else if (h264_pic_info->pStdPictureInfo->flags.bottom_field_flag) + avc_img.ImageStructure = BottomFieldPicture; + else + avc_img.ImageStructure = TopFieldPicture; + + avc_img.WeightedBiPredictionIDC = pps->weighted_bipred_idc; + avc_img.WeightedPredictionEnable = pps->flags.weighted_pred_flag; + avc_img.FirstChromaQPOffset = pps->chroma_qp_index_offset; + avc_img.SecondChromaQPOffset = pps->second_chroma_qp_index_offset; + avc_img.FieldPicture = h264_pic_info->pStdPictureInfo->flags.field_pic_flag; + avc_img.MBAFFMode = (sps->flags.mb_adaptive_frame_field_flag && + !h264_pic_info->pStdPictureInfo->flags.field_pic_flag); + avc_img.FrameMBOnly = sps->flags.frame_mbs_only_flag; + avc_img._8x8IDCTTransformMode = pps->flags.transform_8x8_mode_flag; + avc_img.Direct8x8Inference = sps->flags.direct_8x8_inference_flag; + avc_img.ConstrainedIntraPrediction = pps->flags.constrained_intra_pred_flag; + avc_img.NonReferencePicture = !h264_pic_info->pStdPictureInfo->flags.is_reference; + avc_img.EntropyCodingSyncEnable = pps->flags.entropy_coding_mode_flag; + avc_img.ChromaFormatIDC = sps->chroma_format_idc; + avc_img.TrellisQuantizationChromaDisable = true; + avc_img.NumberofReferenceFrames = frame_info->referenceSlotCount; + avc_img.NumberofActiveReferencePicturesfromL0 = pps->num_ref_idx_l0_default_active_minus1 + 1; + avc_img.NumberofActiveReferencePicturesfromL1 = pps->num_ref_idx_l1_default_active_minus1 + 1; + avc_img.InitialQPValue = pps->pic_init_qp_minus26; + avc_img.PicOrderPresent = pps->flags.bottom_field_pic_order_in_frame_present_flag; + avc_img.DeltaPicOrderAlwaysZero = sps->flags.delta_pic_order_always_zero_flag; + avc_img.PicOrderCountType = sps->pic_order_cnt_type; + avc_img.DeblockingFilterControlPresent = pps->flags.deblocking_filter_control_present_flag; + avc_img.RedundantPicCountPresent = pps->flags.redundant_pic_cnt_present_flag; + avc_img.Log2MaxFrameNumber = sps->log2_max_frame_num_minus4; + avc_img.Log2MaxPicOrderCountLSB = sps->log2_max_pic_order_cnt_lsb_minus4; + avc_img.CurrentPictureFrameNumber = h264_pic_info->pStdPictureInfo->frame_num; + } + + if (pps->flags.pic_scaling_matrix_present_flag) { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Intra_MATRIX; + for (unsigned m = 0; m < 3; m++) + for (unsigned q = 0; q < 16; q++) + qm.ForwardQuantizerMatrix[m * 16 + q] = pps->pScalingLists->ScalingList4x4[m][q]; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Inter_MATRIX; + for (unsigned m = 0; m < 3; m++) + for (unsigned q = 0; q < 16; q++) + qm.ForwardQuantizerMatrix[m * 16 + q] = pps->pScalingLists->ScalingList4x4[m + 3][q]; + } + if (pps->flags.transform_8x8_mode_flag) { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Intra_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = pps->pScalingLists->ScalingList8x8[0][q]; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Inter_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = 
pps->pScalingLists->ScalingList8x8[3][q]; + } + } + } else if (sps->flags.seq_scaling_matrix_present_flag) { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Intra_MATRIX; + for (unsigned m = 0; m < 3; m++) + for (unsigned q = 0; q < 16; q++) + qm.ForwardQuantizerMatrix[m * 16 + q] = sps->pScalingLists->ScalingList4x4[m][q]; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Inter_MATRIX; + for (unsigned m = 0; m < 3; m++) + for (unsigned q = 0; q < 16; q++) + qm.ForwardQuantizerMatrix[m * 16 + q] = sps->pScalingLists->ScalingList4x4[m + 3][q]; + } + if (pps->flags.transform_8x8_mode_flag) { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Intra_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = sps->pScalingLists->ScalingList8x8[0][q]; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Inter_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = sps->pScalingLists->ScalingList8x8[3][q]; + } + } + } else { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Intra_MATRIX; + for (unsigned q = 0; q < 3 * 16; q++) + qm.ForwardQuantizerMatrix[q] = 0x10; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_4x4_Inter_MATRIX; + for (unsigned q = 0; q < 3 * 16; q++) + qm.ForwardQuantizerMatrix[q] = 0x10; + } + if (pps->flags.transform_8x8_mode_flag) { + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Intra_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = 0x10; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) { + qm.DWordLength = 16; + qm.AVC = AVC_8x8_Inter_MATRIX; + for (unsigned q = 0; q < 64; q++) + qm.ForwardQuantizerMatrix[q] = 0x10; + } + } + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_DIRECTMODE_STATE), avc_directmode) { + /* bind reference frame DMV */ + struct anv_bo *dmv_bo = NULL; + for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) { + int idx = frame_info->pReferenceSlots[i].slotIndex; + const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot = + vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR); + const struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding); + const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo; + avc_directmode.DirectMVBufferAddress[idx] = anv_image_address(ref_iv->image, + &ref_iv->image->vid_dmv_top_surface); + if (i == 0) { + dmv_bo = ref_iv->image->bindings[0].address.bo; + } + avc_directmode.POCList[2 * idx] = ref_info->PicOrderCnt[0]; + avc_directmode.POCList[2 * idx + 1] = ref_info->PicOrderCnt[1]; + } + avc_directmode.DirectMVBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, dmv_bo, 0), + }; + + avc_directmode.DirectMVBufferWriteAddress = anv_image_address(img, + &img->vid_dmv_top_surface); + avc_directmode.DirectMVBufferWriteAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) { + .MOCS = anv_mocs(cmd_buffer->device, img->bindings[0].address.bo, 0), + }; + avc_directmode.POCList[32] = h264_pic_info->pStdPictureInfo->PicOrderCnt[0]; + avc_directmode.POCList[33] = 
h264_pic_info->pStdPictureInfo->PicOrderCnt[1]; + } + + uint32_t buffer_offset = frame_info->srcBufferOffset & 4095; +#define HEADER_OFFSET 3 + for (unsigned s = 0; s < h264_pic_info->sliceCount; s++) { + bool last_slice = s == (h264_pic_info->sliceCount - 1); + uint32_t current_offset = h264_pic_info->pSliceOffsets[s]; + uint32_t this_end; + if (!last_slice) { + uint32_t next_offset = h264_pic_info->pSliceOffsets[s + 1]; + uint32_t next_end = h264_pic_info->pSliceOffsets[s + 2]; + if (s == h264_pic_info->sliceCount - 2) + next_end = frame_info->srcBufferRange; + anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_SLICEADDR), sliceaddr) { + sliceaddr.IndirectBSDDataLength = next_end - next_offset - HEADER_OFFSET; + /* start decoding after the 3-byte header. */ + sliceaddr.IndirectBSDDataStartAddress = buffer_offset + next_offset + HEADER_OFFSET; + }; + this_end = next_offset; + } else + this_end = frame_info->srcBufferRange; + anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_BSD_OBJECT), avc_bsd) { + avc_bsd.IndirectBSDDataLength = this_end - current_offset - HEADER_OFFSET; + /* start decoding after the 3-byte header. */ + avc_bsd.IndirectBSDDataStartAddress = buffer_offset + current_offset + HEADER_OFFSET; + avc_bsd.InlineData.LastSlice = last_slice; + avc_bsd.InlineData.FixPrevMBSkipped = 1; + avc_bsd.InlineData.IntraPredictionErrorControl = 1; + avc_bsd.InlineData.Intra8x84x4PredictionErrorConcealmentControl = 1; + avc_bsd.InlineData.ISliceConcealmentMode = 1; + }; + } +} + +void +genX(CmdDecodeVideoKHR)(VkCommandBuffer commandBuffer, + const VkVideoDecodeInfoKHR *frame_info) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + switch (cmd_buffer->video.vid->vk.op) { + case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: + anv_h264_decode_video(cmd_buffer, frame_info); + break; + default: + assert(0); + } +} + +#ifdef VK_ENABLE_BETA_EXTENSIONS +void +genX(CmdEncodeVideoKHR)(VkCommandBuffer commandBuffer, + const VkVideoEncodeInfoKHR *pEncodeInfo) +{ +} +#endif diff --git a/lib/mesa/src/intel/vulkan/gfx8_cmd_buffer.c b/lib/mesa/src/intel/vulkan/gfx8_cmd_buffer.c index 34337c21f..f9c13954d 100644 --- a/lib/mesa/src/intel/vulkan/gfx8_cmd_buffer.c +++ b/lib/mesa/src/intel/vulkan/gfx8_cmd_buffer.c @@ -55,7 +55,9 @@ genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) pc.RenderTargetCacheFlushEnable = true; #if GFX_VER >= 12 pc.TileCacheFlushEnable = true; +#endif +#if INTEL_NEEDS_WA_1409600907 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must * be set with any PIPE_CONTROL with Depth Flush Enable bit set. */ @@ -209,6 +211,24 @@ want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; } +static UNUSED bool +geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline) +{ + const struct brw_tcs_prog_data *tcs_prog_data = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ? + get_tcs_prog_data(pipeline) : NULL; + const struct brw_tes_prog_data *tes_prog_data = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? + get_tes_prog_data(pipeline) : NULL; + const struct brw_gs_prog_data *gs_prog_data = + anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ? 
+ get_gs_prog_data(pipeline) : NULL; + + return (tcs_prog_data && tcs_prog_data->include_primitive_id) || + (tes_prog_data && tes_prog_data->include_primitive_id) || + (gs_prog_data && gs_prog_data->include_primitive_id); +} + static void genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer) { @@ -230,7 +250,21 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer) te.MaximumTessellationFactorOdd = 63.0; te.MaximumTessellationFactorNotOdd = 64.0; #if GFX_VERx10 >= 125 - te.TessellationDistributionMode = TEDMODE_RR_FREE; + if (intel_needs_workaround(cmd_buffer->device->info, 22012785325)) + te.TessellationDistributionMode = TEDMODE_RR_STRICT; + else + te.TessellationDistributionMode = TEDMODE_RR_FREE; + + if (intel_needs_workaround(cmd_buffer->device->info, 14015297576)) { + /* Wa_14015297576: + * + * Disable Tessellation Distribution when primitive Id is enabled. + */ + if (pipeline->primitive_id_override || + geom_or_tess_prim_id_used(pipeline)) + te.TessellationDistributionMode = TEDMODE_OFF; + } + te.TessellationDistributionLevel = TEDLEVEL_PATCH; /* 64_TRIANGLES */ te.SmallPatchThreshold = 3; @@ -315,7 +349,8 @@ genX(emit_shading_rate)(struct anv_batch *batch, const struct vk_fragment_shading_rate_state *fsr) { const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); - const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch; + const bool cps_enable = wm_prog_data && + brw_wm_prog_data_is_coarse(wm_prog_data, 0); #if GFX_VER == 11 anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) { @@ -392,6 +427,36 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) &cmd_buffer->vk.dynamic_graphics_state; if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI)) { + const uint32_t ve_count = + pipeline->vs_input_elements + pipeline->svgs_count; + const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count); + uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_VERTEX_ELEMENTS)); + + if (p) { + if (ve_count == 0) { + memcpy(p + 1, cmd_buffer->device->empty_vs_input, + sizeof(cmd_buffer->device->empty_vs_input)); + } else if (ve_count == pipeline->vertex_input_elems) { + /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so + * everything is in pipeline->vertex_input_data and we can just + * memcpy + */ + memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count); + } else { + /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */ + genX(emit_vertex_input)(&cmd_buffer->batch, p + 1, + pipeline, dyn->vi); + /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */ + memcpy(p + 1 + 2 * pipeline->vs_input_elements, + pipeline->vertex_input_data, + 4 * 2 * pipeline->vertex_input_elems); + } + } + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) { genX(cmd_emit_te)(cmd_buffer); } @@ -650,8 +715,12 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) #endif if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations && - BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS)) - genX(emit_sample_pattern)(&cmd_buffer->batch, dyn->ms.sample_locations); + (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE))) { + genX(emit_sample_pattern)(&cmd_buffer->batch, + dyn->ms.sample_locations_enable ? 
+ dyn->ms.sample_locations : NULL); + } if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) || diff --git a/lib/mesa/src/intel/vulkan/grl/genX_grl.h b/lib/mesa/src/intel/vulkan/grl/genX_grl.h index 6617e210b..57aefa72d 100644 --- a/lib/mesa/src/intel/vulkan/grl/genX_grl.h +++ b/lib/mesa/src/intel/vulkan/grl/genX_grl.h @@ -24,13 +24,15 @@ #ifndef ANV_GRL_H #define ANV_GRL_H +#include "grl/grl_cl_kernel.h" +#include "genxml/gen_macros.h" + #ifdef __cplusplus extern "C" { #endif -#include "anv_private.h" -#include "grl/grl_cl_kernel.h" -#include "genxml/gen_macros.h" +struct anv_cmd_buffer; +struct anv_kernel_arg; void genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, @@ -42,6 +44,9 @@ genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, void genX(grl_load_rt_uuid)(uint8_t *out_uuid); +uint32_t +genX(grl_max_scratch_size)(void); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/lib/mesa/src/intel/vulkan/grl/genX_grl_dispatch.c b/lib/mesa/src/intel/vulkan/grl/genX_grl_dispatch.c index a320e6faa..eff7c4074 100644 --- a/lib/mesa/src/intel/vulkan/grl/genX_grl_dispatch.c +++ b/lib/mesa/src/intel/vulkan/grl/genX_grl_dispatch.c @@ -21,6 +21,7 @@ * IN THE SOFTWARE. */ +#include "anv_private.h" #include "genX_grl.h" static struct anv_shader_bin * @@ -89,3 +90,19 @@ genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size, arg_count, args); } + +uint32_t +genX(grl_max_scratch_size)(void) +{ + uint32_t scratch_size = 0; + + for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) { + struct brw_kernel kernel_data; + genX(grl_get_cl_kernel)(&kernel_data, i); + + scratch_size = MAX2(kernel_data.prog_data.base.total_scratch, + scratch_size); + } + + return scratch_size; +} diff --git a/lib/mesa/src/intel/vulkan/grl/genX_grl_uuid.cpp b/lib/mesa/src/intel/vulkan/grl/genX_grl_uuid.cpp index 9f4335892..cf6b425fe 100644 --- a/lib/mesa/src/intel/vulkan/grl/genX_grl_uuid.cpp +++ b/lib/mesa/src/intel/vulkan/grl/genX_grl_uuid.cpp @@ -24,15 +24,16 @@ #include <assert.h> #include <string.h> +#include "genX_grl.h" #include "include/GRLGen12.h" #include "vulkan/vulkan_core.h" extern "C" void -gfx125_grl_load_rt_uuid(uint8_t *out_uuid); +genX(grl_load_rt_uuid)(uint8_t *out_uuid); extern "C" void -gfx125_grl_load_rt_uuid(uint8_t *out_uuid) +genX(grl_load_rt_uuid)(uint8_t *out_uuid) { assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE); memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE); diff --git a/lib/mesa/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/lib/mesa/src/intel/vulkan/grl/grl_cl_kernel_gen.py index 4b0b8babd..c7efeff53 100644 --- a/lib/mesa/src/intel/vulkan/grl/grl_cl_kernel_gen.py +++ b/lib/mesa/src/intel/vulkan/grl/grl_cl_kernel_gen.py @@ -36,13 +36,13 @@ TEMPLATE_H = Template(COPYRIGHT + """ #ifndef GRL_CL_KERNEL_H #define GRL_CL_KERNEL_H +#include "genxml/gen_macros.h" +#include "compiler/brw_kernel.h" + #ifdef __cplusplus extern "C" { #endif -#include "genxml/gen_macros.h" -#include "compiler/brw_kernel.h" - enum grl_cl_kernel { % for k in kernels: GRL_CL_KERNEL_${k.upper()}, @@ -50,7 +50,7 @@ enum grl_cl_kernel { GRL_CL_KERNEL_MAX, }; -const char *grl_cl_kernel_name(enum grl_cl_kernel kernel); +const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel); const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id); @@ -73,7 +73,7 @@ TEMPLATE_C = Template(COPYRIGHT + """ % endfor const char * -grl_cl_kernel_name(enum grl_cl_kernel kernel) 
+genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel) { switch (kernel) { % for k in kernels: diff --git a/lib/mesa/src/intel/vulkan/grl/grl_metakernel_gen.py b/lib/mesa/src/intel/vulkan/grl/grl_metakernel_gen.py index 029ecf30f..0a14113a3 100644 --- a/lib/mesa/src/intel/vulkan/grl/grl_metakernel_gen.py +++ b/lib/mesa/src/intel/vulkan/grl/grl_metakernel_gen.py @@ -866,7 +866,7 @@ C_PROLOGUE = COPYRIGHT + ''' #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" -#include "genxml/gen_rt_pack.h" +#include "genxml/genX_rt_pack.h" /* We reserve : * - GPR 14 for secondary command buffer returns diff --git a/lib/mesa/src/intel/vulkan/grl/meson.build b/lib/mesa/src/intel/vulkan/grl/meson.build index 979414c07..c0056b349 100644 --- a/lib/mesa/src/intel/vulkan/grl/meson.build +++ b/lib/mesa/src/intel/vulkan/grl/meson.build @@ -142,6 +142,7 @@ foreach t : [['125', 'gfx125', 'dg2']] # without modifying grl source code, remove # if fixed there ], + env: ['MESA_SHADER_CACHE_DISABLE=true'], depends : [prog_intel_clc] ) endforeach @@ -165,11 +166,11 @@ foreach t : [['125', 'gfx125', 'dg2']] inc_intel, ], c_args : [ - no_override_init_args, c_sse2_args, + no_override_init_args, sse2_args, '-DGFX_VERx10=@0@'.format(verX10), ], cpp_args : [ - no_override_init_args, c_sse2_args, + sse2_args, '-DGFX_VERx10=@0@'.format(verX10), ], dependencies : [ @@ -196,7 +197,6 @@ libgrl = static_library( ], link_whole : [grl_genX_libs], dependencies : [libgrl_deps, idep_anv_headers], - install : true, ) idep_grl = declare_dependency( link_with : libgrl, diff --git a/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.c b/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.c new file mode 100644 index 000000000..ff6e7d1ae --- /dev/null +++ b/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.c @@ -0,0 +1,813 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "i915/anv_batch_chain.h" +#include "anv_private.h" +#include "anv_measure.h" + +#include "perf/intel_perf.h" +#include "util/u_debug.h" + +#include "drm-uapi/i915_drm.h" + +struct anv_execbuf { + struct drm_i915_gem_execbuffer2 execbuf; + + struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; + + struct drm_i915_gem_exec_object2 * objects; + uint32_t bo_count; + uint32_t bo_array_length; + struct anv_bo ** bos; + + uint32_t syncobj_count; + uint32_t syncobj_array_length; + struct drm_i915_gem_exec_fence * syncobjs; + uint64_t * syncobj_values; + + uint32_t cmd_buffer_count; + struct anv_query_pool *perf_query_pool; + + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; + + int perf_query_pass; +}; + +static void +anv_execbuf_finish(struct anv_execbuf *exec) +{ + vk_free(exec->alloc, exec->syncobjs); + vk_free(exec->alloc, exec->syncobj_values); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); +} + +static void +anv_execbuf_add_ext(struct anv_execbuf *exec, + uint32_t ext_name, + struct i915_user_extension *ext) +{ + __u64 *iter = &exec->execbuf.cliprects_ptr; + + exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; + + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } + + ext->name = ext_name; + + *iter = (uintptr_t) ext; +} + +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); + +static VkResult +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, + struct anv_bo *bo, + struct anv_reloc_list *relocs, + uint32_t extra_flags) +{ + struct drm_i915_gem_exec_object2 *obj = NULL; + + if (bo->exec_obj_index < exec->bo_count && + exec->bos[bo->exec_obj_index] == bo) + obj = &exec->objects[bo->exec_obj_index]; + + if (obj == NULL) { + /* We've never seen this one before. Add it to the list and assign + * an id that we can use later. + */ + if (exec->bo_count >= exec->bo_array_length) { + uint32_t new_len = exec->objects ? 
exec->bo_array_length * 2 : 64; + + struct drm_i915_gem_exec_object2 *new_objects = + vk_realloc(exec->alloc, exec->objects, + new_len * sizeof(*new_objects), 8, exec->alloc_scope); + if (new_objects == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + exec->objects = new_objects; + + struct anv_bo **new_bos = + vk_realloc(exec->alloc, exec->bos, new_len * sizeof(*new_bos), 8, + exec->alloc_scope); + if (new_bos == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + exec->bos = new_bos; + exec->bo_array_length = new_len; + } + + assert(exec->bo_count < exec->bo_array_length); + + bo->exec_obj_index = exec->bo_count++; + obj = &exec->objects[bo->exec_obj_index]; + exec->bos[bo->exec_obj_index] = bo; + + obj->handle = bo->gem_handle; + obj->relocation_count = 0; + obj->relocs_ptr = 0; + obj->alignment = 0; + obj->offset = bo->offset; + obj->flags = bo->flags | extra_flags; + obj->rsvd1 = 0; + obj->rsvd2 = 0; + } + + if (extra_flags & EXEC_OBJECT_WRITE) { + obj->flags |= EXEC_OBJECT_WRITE; + obj->flags &= ~EXEC_OBJECT_ASYNC; + } + + if (relocs != NULL) { + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); + } + + return VK_SUCCESS; +} + +/* Add BO dependencies to execbuf */ +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) +{ + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + +static VkResult +anv_execbuf_add_syncobj(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t syncobj, + uint32_t flags, + uint64_t timeline_value) +{ + if (exec->syncobj_count >= exec->syncobj_array_length) { + uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16); + + struct drm_i915_gem_exec_fence *new_syncobjs = + vk_realloc(exec->alloc, exec->syncobjs, + new_len * sizeof(*new_syncobjs), 8, exec->alloc_scope); + if (new_syncobjs == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + exec->syncobjs = new_syncobjs; + + if (exec->syncobj_values) { + uint64_t *new_syncobj_values = + vk_realloc(exec->alloc, exec->syncobj_values, + new_len * sizeof(*new_syncobj_values), 8, + exec->alloc_scope); + if (new_syncobj_values == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + exec->syncobj_values = new_syncobj_values; + } + + exec->syncobj_array_length = new_len; + } + + if (timeline_value && !exec->syncobj_values) { + exec->syncobj_values = + vk_zalloc(exec->alloc, exec->syncobj_array_length * + sizeof(*exec->syncobj_values), + 8, exec->alloc_scope); + if (!exec->syncobj_values) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) { + .handle = syncobj, + .flags = flags, + }; + if (exec->syncobj_values) + exec->syncobj_values[exec->syncobj_count] = timeline_value; + + exec->syncobj_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_execbuf_add_sync(struct anv_device *device, + struct anv_execbuf *execbuf, + struct vk_sync *sync, + bool is_signal, + uint64_t value) +{ + /* It's illegal to signal a timeline with value 0 because that's 
never + * higher than the current value. A timeline wait on value 0 is always + * trivial because 0 <= uint64_t always. + */ + if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0) + return VK_SUCCESS; + + if (vk_sync_is_anv_bo_sync(sync)) { + struct anv_bo_sync *bo_sync = + container_of(sync, struct anv_bo_sync, sync); + + assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET)); + + return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL, + is_signal ? EXEC_OBJECT_WRITE : 0); + } else if (vk_sync_type_is_drm_syncobj(sync->type)) { + struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync); + + if (!(sync->flags & VK_SYNC_IS_TIMELINE)) + value = 0; + + return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj, + is_signal ? I915_EXEC_FENCE_SIGNAL : + I915_EXEC_FENCE_WAIT, + value); + } + + unreachable("Invalid sync type"); +} + +static VkResult +setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, + struct anv_cmd_buffer *cmd_buffer) +{ + VkResult result; + /* Add surface dependencies (BOs) to the execbuf */ + result = anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); + if (result != VK_SUCCESS) + return result; + + /* First, we walk over all of the bos we've seen and add them and their + * relocations to the validate list. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_bo **bo_entry; + u_vector_foreach(bo_entry, &cmd_buffer->dynamic_bos) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + *bo_entry, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; +} + +static VkResult +pin_state_pool(struct anv_device *device, + struct anv_execbuf *execbuf, + struct anv_state_pool *pool) +{ + anv_block_pool_foreach_bo(bo, &pool->block_pool) { + VkResult result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; +} + +static VkResult +setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, + struct anv_queue *queue, + struct anv_cmd_buffer **cmd_buffers, + uint32_t num_cmd_buffers) +{ + struct anv_device *device = queue->device; + VkResult result; + + /* Edit the tail of the command buffers to chain them all together if they + * can be. + */ + anv_cmd_buffer_chain_command_buffers(cmd_buffers, num_cmd_buffers); + + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + anv_measure_submit(cmd_buffers[i]); + result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]); + if (result != VK_SUCCESS) + return result; + } + + /* Add all the global BOs to the object list for softpin case. 
*/ + result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->dynamic_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->general_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->instruction_state_pool); + if (result != VK_SUCCESS) + return result; + + result = pin_state_pool(device, execbuf, &device->binding_table_pool); + if (result != VK_SUCCESS) + return result; + + /* Add the BOs for all user allocated memory objects because we can't + * track after binding updates of VK_EXT_descriptor_indexing. + */ + list_for_each_entry(struct anv_device_memory, mem, + &device->memory_objects, link) { + result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + /* Add all the private BOs from images because we can't track after binding + * updates of VK_EXT_descriptor_indexing. + */ + list_for_each_entry(struct anv_image, image, + &device->image_private_objects, link) { + struct anv_bo *private_bo = + image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo; + result = anv_execbuf_add_bo(device, execbuf, private_bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_batch_bo *first_batch_bo = + list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link); + + /* The kernel requires that the last entry in the validation list be the + * batch buffer to execute. We can simply swap the element + * corresponding to the first batch_bo in the chain with the last + * element in the list. + */ + if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == first_batch_bo->bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->exec_obj_index = last_idx; + } + +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_clflush) + anv_cmd_buffer_clflush(cmd_buffers, num_cmd_buffers); +#endif + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + /* We'll fill in batch length later when chaining batches. 
*/ + .batch_len = 0, + .cliprects_ptr = 0, + .num_cliprects = 0, + .DR1 = 0, + .DR4 = 0, + .flags = I915_EXEC_NO_RELOC | + I915_EXEC_HANDLE_LUT | + queue->exec_flags, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); + if (result != VK_SUCCESS) + return result; + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */ + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue, + struct anv_utrace_submit *submit) +{ + struct anv_device *device = queue->device; + + /* Always add the workaround BO as it includes a driver identifier for the + * error_state. + */ + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->workaround_bo, + NULL, 0); + if (result != VK_SUCCESS) + return result; + + result = anv_execbuf_add_bo(device, execbuf, + submit->batch_bo, + &submit->relocs, 0); + if (result != VK_SUCCESS) + return result; + + result = anv_execbuf_add_sync(device, execbuf, submit->sync, + true /* is_signal */, 0 /* value */); + if (result != VK_SUCCESS) + return result; + + if (submit->batch_bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = submit->batch_bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == submit->batch_bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = submit->batch_bo; + submit->batch_bo->exec_obj_index = last_idx; + } + +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_clflush) + intel_flush_range(submit->batch_bo->map, submit->batch_bo->size); +#endif + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = submit->batch.next - submit->batch.start, + .flags = I915_EXEC_NO_RELOC | + I915_EXEC_HANDLE_LUT | + I915_EXEC_FENCE_ARRAY | + queue->exec_flags, + .rsvd1 = device->context_id, + .rsvd2 = 0, + .num_cliprects = execbuf->syncobj_count, + .cliprects_ptr = (uintptr_t)execbuf->syncobjs, + }; + + return VK_SUCCESS; +} + +static int +anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf) +{ + if (execbuf->flags & I915_EXEC_FENCE_OUT) + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf); + else + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf); +} + +static VkResult +anv_queue_exec_utrace_locked(struct anv_queue *queue, + struct anv_utrace_submit *submit) +{ + assert(submit->batch_bo); + + struct anv_device *device = queue->device; + struct anv_execbuf execbuf = { + .alloc = &device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + VkResult result = setup_utrace_execbuf(&execbuf, queue, submit); + if (result != 
VK_SUCCESS) + goto error; + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + + error: + anv_execbuf_finish(&execbuf); + + return result; +} + +static void +anv_i915_debug_submit(const struct anv_execbuf *execbuf) +{ + uint32_t total_size_kb = 0, total_vram_only_size_kb = 0; + for (uint32_t i = 0; i < execbuf->bo_count; i++) { + const struct anv_bo *bo = execbuf->bos[i]; + total_size_kb += bo->size / 1024; + if (bo->vram_only) + total_vram_only_size_kb += bo->size / 1024; + } + + fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0 (aperture: %.1fMb, %.1fMb VRAM only)\n", + execbuf->execbuf.batch_start_offset, execbuf->execbuf.batch_len, + (float)total_size_kb / 1024.0f, + (float)total_vram_only_size_kb / 1024.0f); + for (uint32_t i = 0; i < execbuf->bo_count; i++) { + const struct anv_bo *bo = execbuf->bos[i]; + uint64_t size = bo->size + bo->_ccs_size; + + fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=%7"PRIu64 + "KB handle=%05u capture=%u vram_only=%u name=%s\n", + bo->offset, bo->offset + size - 1, size / 1024, bo->gem_handle, + (bo->flags & EXEC_OBJECT_CAPTURE) != 0, + bo->vram_only, bo->name); + } +} + +VkResult +i915_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass) +{ + struct anv_device *device = queue->device; + struct anv_utrace_submit *utrace_submit = NULL; + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + .perf_query_pass = perf_query_pass, + }; + + /* Flush the trace points first, they need to be moved */ + VkResult result = + anv_device_utrace_flush_cmd_buffers(queue, + cmd_buffer_count, + cmd_buffers, + &utrace_submit); + if (result != VK_SUCCESS) + goto error; + + if (utrace_submit && !utrace_submit->batch_bo) { + result = anv_execbuf_add_sync(device, &execbuf, + utrace_submit->sync, + true /* is_signal */, + 0); + if (result != VK_SUCCESS) + goto error; + + /* When The utrace submission doesn't have its own batch buffer*/ + utrace_submit = NULL; + } + + /* Always add the workaround BO as it includes a driver identifier for the + * error_state. 
+ */ + result = + anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; + + for (uint32_t i = 0; i < wait_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + waits[i].sync, + false /* is_signal */, + waits[i].wait_value); + if (result != VK_SUCCESS) + goto error; + } + + for (uint32_t i = 0; i < signal_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + signals[i].sync, + true /* is_signal */, + signals[i].signal_value); + if (result != VK_SUCCESS) + goto error; + } + + if (queue->sync) { + result = anv_execbuf_add_sync(device, &execbuf, + queue->sync, + true /* is_signal */, + 0 /* signal_value */); + if (result != VK_SUCCESS) + goto error; + } + + if (cmd_buffer_count) { + result = setup_execbuf_for_cmd_buffers(&execbuf, queue, + cmd_buffers, + cmd_buffer_count); + } else { + result = setup_empty_execbuf(&execbuf, queue); + } + + if (result != VK_SUCCESS) + goto error; + + const bool has_perf_query = + perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count; + + if (INTEL_DEBUG(DEBUG_SUBMIT)) + anv_i915_debug_submit(&execbuf); + + anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers, + perf_query_pool, perf_query_pass); + + if (execbuf.syncobj_values) { + execbuf.timeline_fences.fence_count = execbuf.syncobj_count; + execbuf.timeline_fences.handles_ptr = (uintptr_t)execbuf.syncobjs; + execbuf.timeline_fences.values_ptr = (uintptr_t)execbuf.syncobj_values; + anv_execbuf_add_ext(&execbuf, + DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, + &execbuf.timeline_fences.base); + } else if (execbuf.syncobjs) { + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = execbuf.syncobj_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)execbuf.syncobjs; + } + + if (has_perf_query) { + assert(perf_query_pass < perf_query_pool->n_passes); + struct intel_perf_query_info *query_info = + perf_query_pool->pass_query[perf_query_pass]; + + /* Some performance queries just the pipeline statistic HW, no need for + * OA in that case, so no need to reconfigure. + */ + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) && + (query_info->kind == INTEL_PERF_QUERY_TYPE_OA || + query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) { + int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) query_info->oa_metrics_set_id); + if (ret < 0) { + result = vk_device_set_lost(&device->vk, + "i915-perf config failed: %s", + strerror(errno)); + } + } + + struct anv_bo *pass_batch_bo = perf_query_pool->bo; + + struct drm_i915_gem_exec_object2 query_pass_object = { + .handle = pass_batch_bo->gem_handle, + .offset = pass_batch_bo->offset, + .flags = pass_batch_bo->flags, + }; + struct drm_i915_gem_execbuffer2 query_pass_execbuf = { + .buffers_ptr = (uintptr_t) &query_pass_object, + .buffer_count = 1, + .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool, + perf_query_pass), + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags, + .rsvd1 = device->context_id, + }; + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &query_pass_execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + } + + int ret = queue->device->info->no_hw ? 
0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) { + anv_i915_debug_submit(&execbuf); + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + } + + if (result == VK_SUCCESS && queue->sync) { + result = vk_sync_wait(&device->vk, queue->sync, 0, + VK_SYNC_WAIT_COMPLETE, UINT64_MAX); + if (result != VK_SUCCESS) + result = vk_queue_set_lost(&queue->vk, "sync wait failed"); + } + + error: + anv_execbuf_finish(&execbuf); + + if (result == VK_SUCCESS && utrace_submit) + result = anv_queue_exec_utrace_locked(queue, utrace_submit); + + return result; +} + +VkResult +i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_bo_size) +{ + struct anv_device *device = queue->device; + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + VkResult result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0); + if (result != VK_SUCCESS) + goto fail; + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = batch_bo_size, + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + if (anv_gem_execbuffer(device, &execbuf.execbuf)) { + result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m"); + goto fail; + } + + result = anv_device_wait(device, batch_bo, INT64_MAX); + if (result != VK_SUCCESS) + result = vk_device_set_lost(&device->vk, + "anv_device_wait failed: %m"); + +fail: + anv_execbuf_finish(&execbuf); + return result; +} + +VkResult +i915_queue_exec_trace(struct anv_queue *queue, + struct anv_utrace_submit *submit) +{ + assert(submit->batch_bo); + + return anv_queue_exec_utrace_locked(queue, submit); +} diff --git a/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.h b/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.h new file mode 100644 index 000000000..5e3f14fd0 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/i915/anv_batch_chain.h @@ -0,0 +1,53 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#include <stdint.h> + +#include "vulkan/vulkan_core.h" + +#include "vk_sync.h" + +struct anv_queue; +struct anv_bo; +struct anv_cmd_buffer; +struct anv_query_pool; +struct anv_utrace_submit; + +VkResult +i915_queue_exec_trace(struct anv_queue *queue, + struct anv_utrace_submit *submit); +VkResult +i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_bo_size); +VkResult +i915_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass); diff --git a/lib/mesa/src/intel/vulkan/i915/anv_device.c b/lib/mesa/src/intel/vulkan/i915/anv_device.c new file mode 100644 index 000000000..ada5a85e8 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/i915/anv_device.c @@ -0,0 +1,244 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "i915/anv_device.h" +#include "anv_private.h" + +#include "common/intel_defines.h" + +#include "drm-uapi/i915_drm.h" + +static int +vk_priority_to_i915(VkQueueGlobalPriorityKHR priority) +{ + switch (priority) { + case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR: + return INTEL_CONTEXT_LOW_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR: + return INTEL_CONTEXT_MEDIUM_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR: + return INTEL_CONTEXT_HIGH_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR: + return INTEL_CONTEXT_REALTIME_PRIORITY; + default: + unreachable("Invalid priority"); + } +} + +static int +anv_gem_set_context_param(int fd, uint32_t context, uint32_t param, uint64_t value) +{ + if (param == I915_CONTEXT_PARAM_PRIORITY) + value = vk_priority_to_i915(value); + + int err = 0; + if (!intel_gem_set_context_param(fd, context, param, value)) + err = -errno; + return err; +} + +static bool +anv_gem_has_context_priority(int fd, VkQueueGlobalPriorityKHR priority) +{ + return !anv_gem_set_context_param(fd, 0, I915_CONTEXT_PARAM_PRIORITY, + priority); +} + +VkResult +anv_i915_physical_device_get_parameters(struct anv_physical_device *device) +{ + VkResult result = VK_SUCCESS; + int val, fd = device->local_fd; + + if (!intel_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT, &val) || !val) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing gem wait"); + return result; + } + + if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2, &val) || !val) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing execbuf2"); + return result; + } + + if (!device->info.has_llc && + (!intel_gem_get_param(fd, I915_PARAM_MMAP_VERSION, &val) || val < 1)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing wc mmap"); + return result; + } + + if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN, &val) || !val) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing softpin"); + return result; + } + + if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY, &val) || !val) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing syncobj support"); + return result; + } + + if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC, &val)) + device->has_exec_async = val; + if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE, &val)) + device->has_exec_capture = val; + + /* Start with medium; sorted low to high */ + const VkQueueGlobalPriorityKHR priorities[] = { + VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR, + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR, + VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR, + VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR, + }; + device->max_context_priority = VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR; + for (unsigned i = 0; i < ARRAY_SIZE(priorities); i++) { + if (!anv_gem_has_context_priority(fd, priorities[i])) + break; + device->max_context_priority = priorities[i]; + } + + if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES, &val)) + device->has_exec_timeline = val; + + return result; +} + +VkResult +anv_i915_device_setup_context(struct anv_device *device, + const VkDeviceCreateInfo *pCreateInfo, + const uint32_t num_queues) +{ + struct anv_physical_device *physical_device = device->physical; + VkResult result = VK_SUCCESS; + + if (device->physical->engine_info) { + /* The kernel API supports at most 64 engines */ + assert(num_queues <= 64); + enum intel_engine_class engine_classes[64]; + int engine_count = 0; + for (uint32_t i = 0; i < 
pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queueCreateInfo = + &pCreateInfo->pQueueCreateInfos[i]; + + assert(queueCreateInfo->queueFamilyIndex < + physical_device->queue.family_count); + struct anv_queue_family *queue_family = + &physical_device->queue.families[queueCreateInfo->queueFamilyIndex]; + + for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) + engine_classes[engine_count++] = queue_family->engine_class; + } + if (!intel_gem_create_context_engines(device->fd, + physical_device->engine_info, + engine_count, engine_classes, + (uint32_t *)&device->context_id)) + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel context creation failed"); + } else { + assert(num_queues == 1); + if (!intel_gem_create_context(device->fd, &device->context_id)) + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + } + + if (result != VK_SUCCESS) + return result; + + /* Here we tell the kernel not to attempt to recover our context but + * immediately (on the next batchbuffer submission) report that the + * context is lost, and we will do the recovery ourselves. In the case + * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting + * the client clean up the pieces. + */ + anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_RECOVERABLE, false); + + /* Check if client specified queue priority. */ + const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority = + vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + + VkQueueGlobalPriorityKHR priority = + queue_priority ? queue_priority->globalPriority : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + /* As per spec, the driver implementation may deny requests to acquire + * a priority above the default priority (MEDIUM) if the caller does not + * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR + * is returned. + */ + if (physical_device->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + int err = anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_PRIORITY, + priority); + if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + result = vk_error(device, VK_ERROR_NOT_PERMITTED_KHR); + goto fail_context; + } + } + + return result; + +fail_context: + intel_gem_destroy_context(device->fd, device->context_id); + return result; +} + +static int +anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending) +{ + struct drm_i915_reset_stats stats = { + .ctx_id = context, + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); + if (ret == 0) { + *active = stats.batch_active; + *pending = stats.batch_pending; + } + + return ret; +} + +VkResult +anv_i915_device_check_status(struct vk_device *vk_device) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + uint32_t active = 0, pending = 0; + int ret = anv_gem_context_get_reset_stats(device->fd, device->context_id, + &active, &pending); + if (ret == -1) { + /* We don't know the real error. 
*/ + return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m"); + } + + if (active) { + return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers"); + } else if (pending) { + return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight"); + } + + return VK_SUCCESS; +} diff --git a/lib/mesa/src/intel/vulkan/i915/anv_device.h b/lib/mesa/src/intel/vulkan/i915/anv_device.h new file mode 100644 index 000000000..af42c2241 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/i915/anv_device.h @@ -0,0 +1,39 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include "vulkan/vulkan_core.h" +#include "vk_device.h" + +struct anv_device; +struct anv_physical_device; + +VkResult +anv_i915_physical_device_get_parameters(struct anv_physical_device *device); + +VkResult +anv_i915_device_setup_context(struct anv_device *device, + const VkDeviceCreateInfo *pCreateInfo, + const uint32_t num_queues); + +VkResult anv_i915_device_check_status(struct vk_device *vk_device); diff --git a/lib/mesa/src/intel/vulkan/i915/anv_kmd_backend.c b/lib/mesa/src/intel/vulkan/i915/anv_kmd_backend.c new file mode 100644 index 000000000..a3c26dede --- /dev/null +++ b/lib/mesa/src/intel/vulkan/i915/anv_kmd_backend.c @@ -0,0 +1,184 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
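A hedged sketch of how the status callback above is intended to be hooked up: the common Vulkan runtime polls a per-device check_status hook and turns any failure into device loss. The wrapper and its placement during device creation are illustrative; the vk_device field name is assumed from the runtime used in this tree.

   /* Install the i915 reset-stats based status check so the runtime can
    * report VK_ERROR_DEVICE_LOST when the kernel flags a hang. */
   static void
   install_i915_status_hook(struct anv_device *device)
   {
      device->vk.check_status = anv_i915_device_check_status;
   }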
+ */ + +#include <sys/mman.h> + +#include "anv_private.h" + +#include "i915/anv_batch_chain.h" + +#include "drm-uapi/i915_drm.h" + +static uint32_t +i915_gem_create(struct anv_device *device, + const struct intel_memory_class_instance **regions, + uint16_t num_regions, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t *actual_size) +{ + if (unlikely(!device->info->mem.use_class_instance)) { + assert(num_regions == 1 && + device->physical->sys.region == regions[0]); + + struct drm_i915_gem_create gem_create = { + .size = size, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) + return 0; + + *actual_size = gem_create.size; + return gem_create.handle; + } + + struct drm_i915_gem_memory_class_instance i915_regions[2]; + assert(num_regions <= ARRAY_SIZE(i915_regions)); + + for (uint16_t i = 0; i < num_regions; i++) { + i915_regions[i].memory_class = regions[i]->klass; + i915_regions[i].memory_instance = regions[i]->instance; + } + + uint32_t flags = 0; + if (alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE) && + !(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)) + if (device->physical->vram_non_mappable.size > 0) + flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS; + + struct drm_i915_gem_create_ext_memory_regions ext_regions = { + .base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS }, + .num_regions = num_regions, + .regions = (uintptr_t)i915_regions, + }; + struct drm_i915_gem_create_ext gem_create = { + .size = size, + .extensions = (uintptr_t) &ext_regions, + .flags = flags, + }; + + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &gem_create)) + return 0; + + *actual_size = gem_create.size; + return gem_create.handle; +} + +static void +i915_gem_close(struct anv_device *device, uint32_t handle) +{ + struct drm_gem_close close = { + .handle = handle, + }; + + intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close); +} + +static void * +i915_gem_mmap_offset(struct anv_device *device, struct anv_bo *bo, + uint64_t size, uint32_t flags) +{ + struct drm_i915_gem_mmap_offset gem_mmap = { + .handle = bo->gem_handle, + .flags = flags, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap)) + return MAP_FAILED; + + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + device->fd, gem_mmap.offset); +} + +static void * +i915_gem_mmap_legacy(struct anv_device *device, struct anv_bo *bo, uint64_t offset, + uint64_t size, uint32_t flags) +{ + struct drm_i915_gem_mmap gem_mmap = { + .handle = bo->gem_handle, + .offset = offset, + .size = size, + .flags = flags, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap)) + return MAP_FAILED; + + return (void *)(uintptr_t) gem_mmap.addr_ptr; +} + +static uint32_t +mmap_calc_flags(struct anv_device *device, struct anv_bo *bo, + VkMemoryPropertyFlags property_flags) +{ + if (device->info->has_local_mem) + return I915_MMAP_OFFSET_FIXED; + + uint32_t flags = 0; + if (!device->info->has_llc && + (property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + flags |= I915_MMAP_WC; + if (bo->map_wc) + flags |= I915_MMAP_WC; + if (!(property_flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + flags |= I915_MMAP_WC; + + if (likely(device->physical->info.has_mmap_offset)) + flags = (flags & I915_MMAP_WC) ? 
I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB; + return flags; +} + +static void * +i915_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset, + uint64_t size, VkMemoryPropertyFlags property_flags) +{ + const uint32_t flags = mmap_calc_flags(device, bo, property_flags); + + if (likely(device->physical->info.has_mmap_offset)) + return i915_gem_mmap_offset(device, bo, size, flags); + return i915_gem_mmap_legacy(device, bo, offset, size, flags); +} + +static int +i915_gem_vm_bind(struct anv_device *device, struct anv_bo *bo) +{ + return 0; +} + +static int +i915_gem_vm_unbind(struct anv_device *device, struct anv_bo *bo) +{ + return 0; +} + +const struct anv_kmd_backend * +anv_i915_kmd_backend_get(void) +{ + static const struct anv_kmd_backend i915_backend = { + .gem_create = i915_gem_create, + .gem_close = i915_gem_close, + .gem_mmap = i915_gem_mmap, + .gem_vm_bind = i915_gem_vm_bind, + .gem_vm_unbind = i915_gem_vm_unbind, + .execute_simple_batch = i915_execute_simple_batch, + .queue_exec_locked = i915_queue_exec_locked, + .queue_exec_trace = i915_queue_exec_trace, + }; + return &i915_backend; +} diff --git a/lib/mesa/src/intel/vulkan/layers/anv_android_layer.c b/lib/mesa/src/intel/vulkan/layers/anv_android_layer.c new file mode 100644 index 000000000..e36eb820a --- /dev/null +++ b/lib/mesa/src/intel/vulkan/layers/anv_android_layer.c @@ -0,0 +1,46 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +VKAPI_ATTR VkResult VKAPI_CALL +android_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct util_format_description *fmt = + vk_format_description(pCreateInfo->format); + + /* Throw error in case application tries to create ASTC view on gfx125. + * This is done to avoid gpu hang that can result in using the unsupported + * format. 
+ */ + if (fmt && fmt->layout == UTIL_FORMAT_LAYOUT_ASTC && + device->info->verx10 >= 125) { + return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY, + "ASTC format not supported (%s).", __func__); + } + return anv_CreateImageView(_device, pCreateInfo, pAllocator, pView); +} diff --git a/lib/mesa/src/intel/vulkan/layers/anv_doom64.c b/lib/mesa/src/intel/vulkan/layers/anv_doom64.c new file mode 100644 index 000000000..80ca74f97 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/layers/anv_doom64.c @@ -0,0 +1,134 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/set.h" +#include "anv_private.h" +#include "vk_common_entrypoints.h" + +/** + * The DOOM 64 rendering corruption is happening because the game always uses + * ``` + * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_UNDEFINED -> + * VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) + * vkCmdCopyBufferToImage(...) + * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL -> + * VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) + * ``` + * when it wants to update its texture atlas image. + * + * According to spec, transitioning from VK_IMAGE_LAYOUT_UNDEFINED means + * that the current image content might be discarded, but the game relies + * on it being fully preserved. + * + * This work-around layer implements super-barebone layout tracking: allows + * the first transition from VK_IMAGE_LAYOUT_UNDEFINED, but replaces + * oldLayout with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL for each + * subsequent transition of that image. + * + * Gen12+ does not ambiguate CCS data on transition from VK_IMAGE_LAYOUT_UNDEFINED + * so it preserves all compressed information, and this WA is not needed. 
+ */ + +VKAPI_ATTR void VKAPI_CALL +doom64_CmdPipelineBarrier(VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, command_buffer, commandBuffer); + assert(command_buffer && command_buffer->device); + + VkImageMemoryBarrier fixed_barrier; + struct set * defined_images = + command_buffer->device->workarounds.doom64_images; + + if (defined_images && + imageMemoryBarrierCount == 1 && pImageMemoryBarriers && + pImageMemoryBarriers[0].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + pImageMemoryBarriers[0].newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[0].image); + + if (!_mesa_set_search(defined_images, image)) { + _mesa_set_add(defined_images, image); + } else { + memcpy(&fixed_barrier, pImageMemoryBarriers, sizeof(VkImageMemoryBarrier)); + + fixed_barrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + pImageMemoryBarriers = (const VkImageMemoryBarrier*) &fixed_barrier; + } + } + + vk_common_CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, + dependencyFlags, memoryBarrierCount, + pMemoryBarriers, bufferMemoryBarrierCount, + pBufferMemoryBarriers, + imageMemoryBarrierCount, + pImageMemoryBarriers); +} + +VKAPI_ATTR VkResult VKAPI_CALL +doom64_CreateImage(VkDevice _device, const VkImageCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, VkImage* pImage) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + assert(device); + + if (!device->workarounds.doom64_images) { + device->workarounds.doom64_images = _mesa_pointer_set_create(NULL); + + if (!device->workarounds.doom64_images) { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + + return anv_CreateImage(_device, pCreateInfo, pAllocator, pImage); +} + +VKAPI_ATTR void VKAPI_CALL +doom64_DestroyImage(VkDevice _device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, _image); + assert(device); + + struct set * defined_images = device->workarounds.doom64_images; + + if (image && defined_images) { + _mesa_set_remove_key(defined_images, image); + + if (!defined_images->entries) { + _mesa_set_destroy(defined_images, NULL); + device->workarounds.doom64_images = NULL; + } + } + + anv_DestroyImage(_device, _image, pAllocator); +} diff --git a/lib/mesa/src/intel/vulkan/meson.build b/lib/mesa/src/intel/vulkan/meson.build index 9e54716df..be8a37e84 100644 --- a/lib/mesa/src/intel/vulkan/meson.build +++ b/lib/mesa/src/intel/vulkan/meson.build @@ -18,11 +18,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
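Returning to the DOOM 64 layer above, a hedged restatement of its tracking rule as a standalone helper: the first UNDEFINED -> TRANSFER_DST_OPTIMAL transition of an image is allowed through, every later one is rewritten so the already-uploaded texture-atlas contents are preserved. The helper name is illustrative; the set functions are the ones the layer itself uses.

   static VkImageLayout
   doom64_effective_old_layout(struct set *defined_images,
                               const struct anv_image *image)
   {
      if (_mesa_set_search(defined_images, image))
         return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; /* seen before */
      _mesa_set_add(defined_images, image);                /* first transition */
      return VK_IMAGE_LAYOUT_UNDEFINED;
   }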
+subdir('shaders') + inc_anv = include_directories('.') anv_flags = [ no_override_init_args, - c_sse2_args, + sse2_args, ] anv_cpp_flags = [] @@ -38,24 +40,13 @@ anv_entrypoints = custom_target( '--device-prefix', 'gfx11', '--device-prefix', 'gfx12', '--device-prefix', 'gfx125', - '--device-prefix', 'hitman3' + '--device-prefix', 'doom64', + '--device-prefix', 'hitman3', + '--device-prefix', 'android' ], depend_files : vk_entrypoints_gen_depend_files, ) -float64_spv_h = custom_target( - 'float64_spv.h', - input : [glsl2spirv, float64_glsl_file], - output : 'float64_spv.h', - command : [ - prog_python, '@INPUT@', '@OUTPUT@', - '--create-entry', 'main', - '--vn', 'float64_spv_source', - '--glsl-version', '450', - '-Olib', - ] -) - idep_anv_headers = declare_dependency( sources : [anv_entrypoints[0]], include_directories : inc_anv, @@ -87,23 +78,21 @@ intel_icd = custom_target( install : true, ) -if meson.version().version_compare('>= 0.58') - _dev_icdname = 'intel_devenv_icd.@0@.json'.format(host_machine.cpu()) - custom_target( - 'intel_devenv_icd', - input : [vk_icd_gen, vk_api_xml], - output : _dev_icdname, - command : [ - prog_python, '@INPUT0@', - '--api-version', '1.3', '--xml', '@INPUT1@', - '--lib-path', meson.current_build_dir() / 'libvulkan_intel.so', - '--out', '@OUTPUT@', - ], - build_by_default : true, - ) +_dev_icdname = 'intel_devenv_icd.@0@.json'.format(host_machine.cpu()) +_dev_icd = custom_target( + 'intel_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / 'libvulkan_intel.so', + '--out', '@OUTPUT@', + ], + build_by_default : true, +) - devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname) -endif +devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path()) libanv_per_hw_ver_libs = [] anv_per_hw_ver_files = files( @@ -113,6 +102,7 @@ anv_per_hw_ver_files = files( 'genX_pipeline.c', 'genX_query.c', 'genX_state.c', + 'genX_video.c', ) if with_intel_vk_rt anv_per_hw_ver_files += files('genX_acceleration_structure.c',) @@ -125,7 +115,7 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']], _gfx_ver = g[0] libanv_per_hw_ver_libs += static_library( 'anv_per_hw_ver@0@'.format(_gfx_ver), - [anv_per_hw_ver_files, g[1], anv_entrypoints[0]], + [anv_per_hw_ver_files, g[1], anv_entrypoints[0], generated_draws_spvs, ], include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, ], @@ -141,7 +131,21 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']], endforeach libanv_files = files( + 'i915/anv_batch_chain.c', + 'i915/anv_batch_chain.h', + 'i915/anv_device.c', + 'i915/anv_device.h', + 'i915/anv_kmd_backend.c', + 'layers/anv_doom64.c', 'layers/anv_hitman3.c', + 'layers/anv_android_layer.c', + 'xe/anv_batch_chain.c', + 'xe/anv_batch_chain.h', + 'xe/anv_kmd_backend.c', + 'xe/anv_device.c', + 'xe/anv_device.h', + 'xe/anv_queue.c', + 'xe/anv_queue.h', 'anv_allocator.c', 'anv_android.h', 'anv_batch_chain.c', @@ -151,17 +155,19 @@ libanv_files = files( 'anv_descriptor_set.c', 'anv_device.c', 'anv_formats.c', + 'anv_generated_indirect_draws.c', 'anv_genX.h', 'anv_image.c', + 'anv_kmd_backend.c', + 'anv_kmd_backend.h', 'anv_measure.c', 'anv_measure.h', + 'anv_mesh_perprim_wa.c', 'anv_nir.h', - 'anv_nir_add_base_work_group_id.c', 'anv_nir_apply_pipeline_layout.c', 'anv_nir_compute_push_layout.c', 'anv_nir_lower_multiview.c', 'anv_nir_lower_ubo_loads.c', - 'anv_nir_lower_ycbcr_textures.c', 
'anv_nir_push_descriptor_analysis.c', 'anv_perf.c', 'anv_pipeline.c', @@ -170,6 +176,7 @@ libanv_files = files( 'anv_queue.c', 'anv_util.c', 'anv_utrace.c', + 'anv_video.c', 'anv_wsi.c', ) @@ -208,6 +215,7 @@ libanv_common = static_library( [ libanv_files, anv_entrypoints, sha1_h, gen_xml_pack, float64_spv_h, + generated_draws_spvs, ], include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, @@ -216,7 +224,7 @@ libanv_common = static_library( c_args : anv_flags, cpp_args : anv_cpp_flags, gnu_symbol_visibility : 'hidden', - dependencies : anv_deps, + dependencies : anv_deps ) libvulkan_intel = shared_library( @@ -227,17 +235,18 @@ libvulkan_intel = shared_library( ], link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl, link_with : [ - libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf, + libintel_compiler, libisl, libblorp, libintel_perf, ], dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, idep_nir, idep_genxml, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime, idep_mesautil, idep_xmlconfig, - idep_intel_driver_ds, + idep_intel_driver_ds, idep_intel_dev, ], c_args : anv_flags, gnu_symbol_visibility : 'hidden', - link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + link_args : [vulkan_icd_link_args, ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + link_depends : vulkan_icd_link_depends, install : true, ) @@ -263,13 +272,13 @@ if with_tests ], link_whole : libanv_common, link_with : [ - libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev, + libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libisl, libblorp, libintel_perf, ] + optional_libgrl, dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime, - idep_mesautil, + idep_mesautil, idep_intel_dev, ], c_args : anv_flags, gnu_symbol_visibility : 'hidden', @@ -283,12 +292,12 @@ if with_tests executable( t, ['tests/@0@.c'.format(t), anv_entrypoints[0]], - c_args : [ c_sse2_args ], + c_args : [ sse2_args ], link_with : libvulkan_intel_test, dependencies : [ dep_libdrm, dep_thread, dep_m, dep_valgrind, idep_vulkan_util, idep_vulkan_wsi_headers, - idep_vulkan_runtime, idep_intel_driver_ds, + idep_vulkan_runtime, idep_intel_driver_ds, idep_intel_dev, ], include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, diff --git a/lib/mesa/src/intel/vulkan/shaders/common_generated_draws.glsl b/lib/mesa/src/intel/vulkan/shaders/common_generated_draws.glsl new file mode 100644 index 000000000..06ea7781c --- /dev/null +++ b/lib/mesa/src/intel/vulkan/shaders/common_generated_draws.glsl @@ -0,0 +1,133 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define BITFIELD_BIT(i) (1u << i) + +#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0) +#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1) +#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2) +#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3) + +/* These 3 bindings will be accessed through A64 messages */ +layout(set = 0, binding = 0, std430) buffer Storage0 { + uint indirect_data[]; +}; + +layout(set = 0, binding = 1, std430) buffer Storage1 { + uint commands[]; +}; + +layout(set = 0, binding = 2, std430) buffer Storage2 { + uint draw_ids[]; +}; + +/* This data will be provided through push constants. */ +layout(set = 0, binding = 3) uniform block { + uint64_t draw_id_addr; + uint64_t indirect_data_addr; + uint indirect_data_stride; + uint flags; + uint draw_base; + uint draw_count; + uint max_draw_count; + uint instance_multiplier; + uint64_t end_addr; +}; + +void write_VERTEX_BUFFER_STATE(uint write_offset, + uint mocs, + uint buffer_idx, + uint64_t address, + uint size) +{ + commands[write_offset + 0] = (0 << 0 | /* Buffer Pitch */ + 0 << 13 | /* Null Vertex Buffer */ + 1 << 14 | /* Address Modify Enable */ + mocs << 16 | /* MOCS */ + buffer_idx << 26); /* Vertex Buffer Index */ + commands[write_offset + 1] = uint(address & 0xffffffff); + commands[write_offset + 2] = uint(address >> 32); + commands[write_offset + 3] = size; +} + +void write_3DPRIMITIVE(uint write_offset, + bool is_predicated, + bool is_indexed, + uint vertex_count_per_instance, + uint start_vertex_location, + uint instance_count, + uint start_instance_location, + uint base_vertex_location) +{ + commands[write_offset + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + uint(is_predicated) << 8 | + 5 << 0); /* DWord Length */ + commands[write_offset + 1] = uint(is_indexed) << 8; + commands[write_offset + 2] = vertex_count_per_instance; + commands[write_offset + 3] = start_vertex_location; + commands[write_offset + 4] = instance_count; + commands[write_offset + 5] = start_instance_location; + commands[write_offset + 6] = base_vertex_location; +} + +void write_3DPRIMITIVE_EXTENDED(uint write_offset, + bool is_predicated, + bool is_indexed, + uint vertex_count_per_instance, + uint start_vertex_location, + uint instance_count, + uint start_instance_location, + uint base_vertex_location, + uint param_base_vertex, + uint param_base_instance, + uint param_draw_id) +{ + commands[write_offset + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + 1 << 11 | /* Extended Parameter Enable */ + uint(is_predicated) << 8 | + 8 << 0); /* DWord Length */ + commands[write_offset + 1] = uint(is_indexed) << 8; + commands[write_offset + 2] = vertex_count_per_instance; + commands[write_offset + 3] = start_vertex_location; + commands[write_offset + 4] = instance_count; + commands[write_offset + 5] = start_instance_location; + commands[write_offset + 6] = base_vertex_location; + commands[write_offset + 7] = param_base_vertex; + 
commands[write_offset + 8] = param_base_instance; + commands[write_offset + 9] = param_draw_id; +} + +void write_MI_BATCH_BUFFER_START(uint write_offset, + uint64_t addr) +{ + commands[write_offset + 0] = (0 << 29 | /* Command Type */ + 49 << 23 | /* MI Command Opcode */ + 1 << 8 | /* Address Space Indicator (PPGTT) */ + 1 << 0); /* DWord Length */ + commands[write_offset + 1] = uint(addr & 0xffffffff); + commands[write_offset + 2] = uint(addr >> 32); +} diff --git a/lib/mesa/src/intel/vulkan/shaders/gfx11_generated_draws.glsl b/lib/mesa/src/intel/vulkan/shaders/gfx11_generated_draws.glsl new file mode 100644 index 000000000..8745f7bab --- /dev/null +++ b/lib/mesa/src/intel/vulkan/shaders/gfx11_generated_draws.glsl @@ -0,0 +1,85 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
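For reference, the same MI_BATCH_BUFFER_START encoding the GLSL helper above emits, written as a hedged C cross-check (command type 0, MI opcode 49, PPGTT address-space indicator, DWord Length 1 for a 3-dword packet); the function name is illustrative.

   static void
   encode_mi_batch_buffer_start(uint32_t *out, uint64_t addr)
   {
      out[0] = (0u << 29) | (49u << 23) | (1u << 8) | (1u << 0);
      out[1] = (uint32_t)(addr & 0xffffffffu);   /* low 32 bits of target */
      out[2] = (uint32_t)(addr >> 32);           /* high 32 bits of target */
   }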
+ */ + +#version 450 +#extension GL_ARB_gpu_shader_int64 : enable +#extension GL_GOOGLE_include_directive : enable + +#include "common_generated_draws.glsl" + +void main() +{ + bool is_indexed = (flags & ANV_GENERATED_FLAG_INDEXED) != 0; + bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0; + uint _3dprim_dw_size = (flags >> 16) & 0xff; + uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x); + uint indirect_data_offset = item_idx * indirect_data_stride / 4; + uint cmd_idx = item_idx * _3dprim_dw_size; + uint draw_id = draw_base + item_idx; + + if (draw_id < draw_count) { + if (is_indexed) { + /* Loading a VkDrawIndexedIndirectCommand */ + uint index_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_index = indirect_data[indirect_data_offset + 2]; + uint vertex_offset = indirect_data[indirect_data_offset + 3]; + uint first_instance = indirect_data[indirect_data_offset + 4]; + + write_3DPRIMITIVE_EXTENDED(cmd_idx, + is_predicated, + is_indexed, + index_count, + first_index, + instance_count, + first_instance, + vertex_offset, + vertex_offset, + first_instance, + draw_id); + } else { + /* Loading a VkDrawIndirectCommand structure */ + uint vertex_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_vertex = indirect_data[indirect_data_offset + 2]; + uint first_instance = indirect_data[indirect_data_offset + 3]; + + write_3DPRIMITIVE_EXTENDED(cmd_idx, + is_predicated, + is_indexed, + vertex_count, + first_vertex, + instance_count, + first_instance, + 0 /* base_vertex_location */, + first_vertex, + first_instance, + draw_id); + } + } else if (draw_id == draw_count && draw_id < max_draw_count) { + /* Only write a jump forward in the batch if we have fewer elements than + * the max draw count. + */ + write_MI_BATCH_BUFFER_START(cmd_idx, end_addr); + } +} diff --git a/lib/mesa/src/intel/vulkan/shaders/gfx9_generated_draws.glsl b/lib/mesa/src/intel/vulkan/shaders/gfx9_generated_draws.glsl new file mode 100644 index 000000000..9850b19c3 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/shaders/gfx9_generated_draws.glsl @@ -0,0 +1,144 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
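The gfx11 shader above maps one fragment to one indirect draw: with an 8192-pixel-wide render target, fragment (x, y) owns item y * 8192 + x and writes its packet at a fixed dword stride taken from the flags word. A hedged C restatement of that indexing, with an illustrative function name:

   static uint32_t
   generated_draw_cmd_offset_dw(uint32_t frag_x, uint32_t frag_y,
                                uint32_t dw_per_draw)
   {
      uint32_t item_idx = frag_y * 8192 + frag_x; /* gl_FragCoord mapping */
      return item_idx * dw_per_draw;              /* packet offset in dwords */
   }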
+ */ + +#version 450 +#extension GL_ARB_gpu_shader_int64 : enable +#extension GL_GOOGLE_include_directive : enable + +#include "common_generated_draws.glsl" + +void main() +{ + bool is_indexed = (flags & ANV_GENERATED_FLAG_INDEXED) != 0; + bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0; + bool uses_base = (flags & ANV_GENERATED_FLAG_BASE) != 0; + bool uses_drawid = (flags & ANV_GENERATED_FLAG_DRAWID) != 0; + uint mocs = (flags >> 8) & 0xff; + uint _3dprim_dw_size = (flags >> 16) & 0xff; + uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x); + uint indirect_data_offset = item_idx * indirect_data_stride / 4; + uint cmd_idx = item_idx * _3dprim_dw_size; + uint draw_id = draw_base + item_idx; + + if (draw_id < draw_count) { + if (is_indexed) { + /* Loading a VkDrawIndexedIndirectCommand */ + uint index_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_index = indirect_data[indirect_data_offset + 2]; + uint vertex_offset = indirect_data[indirect_data_offset + 3]; + uint first_instance = indirect_data[indirect_data_offset + 4]; + + if (uses_base || uses_drawid) { + uint state_vertex_len = + 1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0); + commands[cmd_idx] = + (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 0 << 24 | /* 3D Command Opcode */ + 8 << 16 | /* 3D Command Sub Opcode */ + (state_vertex_len - 2) << 0); /* DWord Length */ + cmd_idx += 1; + if (uses_base) { + uint64_t indirect_draw_data_addr = + indirect_data_addr + item_idx * indirect_data_stride + 12; + write_VERTEX_BUFFER_STATE(cmd_idx, + mocs, + 31, + indirect_draw_data_addr, + 8); + cmd_idx += 4; + } + if (uses_drawid) { + uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx; + draw_ids[draw_id] = draw_id; + write_VERTEX_BUFFER_STATE(cmd_idx, + mocs, + 32, + draw_idx_addr, + 4); + cmd_idx += 4; + } + } + write_3DPRIMITIVE(cmd_idx, + is_predicated, + is_indexed, + index_count, + first_index, + instance_count, + first_instance, + vertex_offset); + } else { + /* Loading a VkDrawIndirectCommand structure */ + uint vertex_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_vertex = indirect_data[indirect_data_offset + 2]; + uint first_instance = indirect_data[indirect_data_offset + 3]; + + if (uses_base || uses_drawid) { + uint state_vertex_len = + 1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0); + commands[cmd_idx] = + (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 0 << 24 | /* 3D Command Opcode */ + 8 << 16 | /* 3D Command Sub Opcode */ + (state_vertex_len - 2) << 0); /* DWord Length */ + cmd_idx += 1; + if (uses_base) { + uint64_t indirect_draw_data_addr = + indirect_data_addr + item_idx * indirect_data_stride + 8; + write_VERTEX_BUFFER_STATE(cmd_idx, + mocs, + 31, + indirect_draw_data_addr, + 8); + cmd_idx += 4; + } + if (uses_drawid) { + uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx; + draw_ids[draw_id] = draw_id; + write_VERTEX_BUFFER_STATE(cmd_idx, + mocs, + 32, + draw_idx_addr, + 4); + cmd_idx += 4; + } + } + write_3DPRIMITIVE(cmd_idx, + is_predicated, + is_indexed, + vertex_count, + first_vertex, + instance_count, + first_instance, + 0 /* base_vertex_location */); + } + } else if (draw_id == draw_count && draw_id < max_draw_count) { + /* Only write a jump forward in the batch if we have fewer elements than + * the max draw count. 
+ */ + write_MI_BATCH_BUFFER_START(cmd_idx, end_addr); + } +} diff --git a/lib/mesa/src/intel/vulkan/shaders/meson.build b/lib/mesa/src/intel/vulkan/shaders/meson.build new file mode 100644 index 000000000..2f1952ee5 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/shaders/meson.build @@ -0,0 +1,56 @@ +# Copyright © 2022 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +float64_spv_h = custom_target( + 'float64_spv.h', + input : [glsl2spirv, float64_glsl_file], + output : 'float64_spv.h', + command : [ + prog_python, '@INPUT@', '@OUTPUT@', + prog_glslang, + '--create-entry', 'main', + '--vn', 'float64_spv_source', + '--glsl-version', '450', + '-Olib', + ] +) + +generated_draws_shaders = [ + 'gfx9_generated_draws.glsl', + 'gfx11_generated_draws.glsl', +] + +generated_draws_spvs = [] +foreach f : generated_draws_shaders + spv_filename = f.replace('.glsl', '_spv.h') + src_name = f.replace('.glsl', '_spv_source') + generated_draws_spvs += custom_target( + spv_filename, + input : [glsl2spirv, f, files('common_generated_draws.glsl')], + output : spv_filename, + command : [ + prog_python, '@INPUT0@', '@INPUT1@', '@OUTPUT@', + prog_glslang, + '--vn', src_name, + '--glsl-version', '450', + '--stage', 'frag', + '-I' + meson.current_source_dir(), + ]) +endforeach diff --git a/lib/mesa/src/intel/vulkan/tests/block_pool_grow_first.c b/lib/mesa/src/intel/vulkan/tests/block_pool_grow_first.c index 7359b66cb..5ad230392 100644 --- a/lib/mesa/src/intel/vulkan/tests/block_pool_grow_first.c +++ b/lib/mesa/src/intel/vulkan/tests/block_pool_grow_first.c @@ -36,7 +36,9 @@ int main(void) const uint32_t block_size = 16 * 1024; const uint32_t initial_size = block_size / 2; + test_device_info_init(&physical_device.info); anv_device_set_physical(&device, &physical_device); + device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB); pthread_mutex_init(&device.mutex, NULL); anv_bo_cache_init(&device.bo_cache, &device); anv_block_pool_init(&pool, &device, "test", 4096, initial_size); diff --git a/lib/mesa/src/intel/vulkan/tests/state_pool_padding.c b/lib/mesa/src/intel/vulkan/tests/state_pool_padding.c index b76ba8ad6..845767a35 100644 --- a/lib/mesa/src/intel/vulkan/tests/state_pool_padding.c +++ b/lib/mesa/src/intel/vulkan/tests/state_pool_padding.c @@ -30,7 +30,9 @@ int main(void) struct anv_device device = {}; struct anv_state_pool state_pool; + test_device_info_init(&physical_device.info); anv_device_set_physical(&device, &physical_device); + device.kmd_backend = 
anv_kmd_backend_get(INTEL_KMD_TYPE_STUB); pthread_mutex_init(&device.mutex, NULL); anv_bo_cache_init(&device.bo_cache, &device); anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096); diff --git a/lib/mesa/src/intel/vulkan/tests/test_common.h b/lib/mesa/src/intel/vulkan/tests/test_common.h index 3f883e3bd..ae84935f3 100644 --- a/lib/mesa/src/intel/vulkan/tests/test_common.h +++ b/lib/mesa/src/intel/vulkan/tests/test_common.h @@ -32,3 +32,8 @@ abort(); \ } \ } while (false) + +static inline void test_device_info_init(struct intel_device_info *info) +{ + info->mem_alignment = 4096; +} diff --git a/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.c b/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.c new file mode 100644 index 000000000..dbcf989d7 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.c @@ -0,0 +1,281 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
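Each custom_target in the shaders/meson.build above produces a header defining a SPIR-V word array named by the '--vn' option (for example gfx9_generated_draws_spv_source), which driver code can then wrap into a shader module. A hedged consumption sketch: the header path and array name follow the build rules above, the size helper is illustrative, and the array element type is assumed to be uint32_t.

   #include "shaders/gfx9_generated_draws_spv.h"

   static size_t
   gfx9_generated_draws_spv_size(void)
   {
      return sizeof(gfx9_generated_draws_spv_source); /* size in bytes */
   }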
+ */ + +#include "xe/anv_batch_chain.h" + +#include "anv_private.h" + +#include <xf86drm.h> + +#include "drm-uapi/xe_drm.h" + +VkResult +xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_bo_size) +{ + struct anv_device *device = queue->device; + VkResult result = VK_SUCCESS; + uint32_t syncobj_handle; + + if (drmSyncobjCreate(device->fd, 0, &syncobj_handle)) + return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj"); + + struct drm_xe_sync sync = { + .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL, + .handle = syncobj_handle, + }; + struct drm_xe_exec exec = { + .engine_id = queue->engine_id, + .num_batch_buffer = 1, + .address = batch_bo->offset, + .num_syncs = 1, + .syncs = (uintptr_t)&sync, + }; + + if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) { + result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m"); + goto exec_error; + } + + struct drm_syncobj_wait wait = { + .handles = (uintptr_t)&syncobj_handle, + .timeout_nsec = INT64_MAX, + .count_handles = 1, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait)) + result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m"); + +exec_error: + drmSyncobjDestroy(device->fd, syncobj_handle); + + return result; +} + +#define TYPE_SIGNAL true +#define TYPE_WAIT false + +static void +xe_exec_fill_sync(struct drm_xe_sync *xe_sync, struct vk_sync *vk_sync, + uint64_t value, bool signal) +{ + if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) { + unreachable("Unsupported sync type"); + return; + } + + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync); + xe_sync->handle = syncobj->syncobj; + + if (value) { + xe_sync->flags |= DRM_XE_SYNC_TIMELINE_SYNCOBJ; + xe_sync->timeline_value = value; + } else { + xe_sync->flags |= DRM_XE_SYNC_SYNCOBJ; + } + + if (signal) + xe_sync->flags |= DRM_XE_SYNC_SIGNAL; +} + +static VkResult +xe_exec_process_syncs(struct anv_queue *queue, + uint32_t wait_count, const struct vk_sync_wait *waits, + uint32_t signal_count, const struct vk_sync_signal *signals, + struct anv_utrace_submit *utrace_submit, + struct drm_xe_sync **ret, uint32_t *ret_count) +{ + struct anv_device *device = queue->device; + uint32_t num_syncs = wait_count + signal_count + (utrace_submit ? 1 : 0) + + (queue->sync ? 1 : 0); + + if (!num_syncs) + return VK_SUCCESS; + + struct drm_xe_sync *xe_syncs = vk_zalloc(&device->vk.alloc, + sizeof(*xe_syncs) * num_syncs, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!xe_syncs) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + uint32_t count = 0; + + /* Signal the utrace sync only if it doesn't have a batch. Otherwise the + * it's the utrace batch that should signal its own sync. 
+ */ + if (utrace_submit && !utrace_submit->batch_bo) { + struct drm_xe_sync *xe_sync = &xe_syncs[count++]; + + xe_exec_fill_sync(xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL); + } + + for (uint32_t i = 0; i < wait_count; i++) { + struct drm_xe_sync *xe_sync = &xe_syncs[count++]; + const struct vk_sync_wait *vk_wait = &waits[i]; + + xe_exec_fill_sync(xe_sync, vk_wait->sync, vk_wait->wait_value, + TYPE_WAIT); + } + + for (uint32_t i = 0; i < signal_count; i++) { + struct drm_xe_sync *xe_sync = &xe_syncs[count++]; + const struct vk_sync_signal *vk_signal = &signals[i]; + + xe_exec_fill_sync(xe_sync, vk_signal->sync, vk_signal->signal_value, + TYPE_SIGNAL); + } + + if (queue->sync) { + struct drm_xe_sync *xe_sync = &xe_syncs[count++]; + + xe_exec_fill_sync(xe_sync, queue->sync, 0, + TYPE_SIGNAL); + } + + assert(count == num_syncs); + *ret = xe_syncs; + *ret_count = num_syncs; + return VK_SUCCESS; +} + +static void +xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass, struct drm_xe_exec *exec) +{ + if (INTEL_DEBUG(DEBUG_SUBMIT)) + fprintf(stderr, "Batch offset=0x%016"PRIx64" on queue %u\n", + (uint64_t)exec->address, queue->vk.index_in_family); + + anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers, + perf_query_pool, perf_query_pass); +} + +VkResult +xe_queue_exec_utrace_locked(struct anv_queue *queue, + struct anv_utrace_submit *utrace_submit) +{ + struct anv_device *device = queue->device; + struct drm_xe_sync xe_sync = {}; + + xe_exec_fill_sync(&xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL); + +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_clflush) + intel_flush_range(utrace_submit->batch_bo->map, + utrace_submit->batch_bo->size); +#endif + + struct drm_xe_exec exec = { + .engine_id = queue->engine_id, + .num_batch_buffer = 1, + .syncs = (uintptr_t)&xe_sync, + .num_syncs = 1, + .address = utrace_submit->batch_bo->offset, + }; + if (likely(!device->info->no_hw)) { + if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) + return vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m"); + } + + return VK_SUCCESS; +} + +VkResult +xe_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass) +{ + struct anv_device *device = queue->device; + struct anv_utrace_submit *utrace_submit = NULL; + VkResult result; + + result = anv_device_utrace_flush_cmd_buffers(queue, cmd_buffer_count, + cmd_buffers, &utrace_submit); + if (result != VK_SUCCESS) + return result; + + struct drm_xe_sync *xe_syncs = NULL; + uint32_t xe_syncs_count = 0; + result = xe_exec_process_syncs(queue, wait_count, waits, + signal_count, signals, + utrace_submit, + &xe_syncs, &xe_syncs_count); + if (result != VK_SUCCESS) + return result; + + /* If we have no batch for utrace, just forget about it now. 
*/ + if (utrace_submit && !utrace_submit->batch_bo) + utrace_submit = NULL; + + struct drm_xe_exec exec = { + .engine_id = queue->engine_id, + .num_batch_buffer = 1, + .syncs = (uintptr_t)xe_syncs, + .num_syncs = xe_syncs_count, + }; + + if (cmd_buffer_count) { + anv_cmd_buffer_chain_command_buffers(cmd_buffers, cmd_buffer_count); + +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_clflush) + anv_cmd_buffer_clflush(cmd_buffers, cmd_buffer_count); +#endif + + struct anv_cmd_buffer *first_cmd_buffer = cmd_buffers[0]; + struct anv_batch_bo *first_batch_bo = list_first_entry(&first_cmd_buffer->batch_bos, + struct anv_batch_bo, link); + exec.address = first_batch_bo->bo->offset; + } else { + exec.address = device->trivial_batch_bo->offset; + } + + xe_exec_print_debug(queue, cmd_buffer_count, cmd_buffers, perf_query_pool, + perf_query_pass, &exec); + + /* TODO: add perfetto stuff when Xe supports it */ + + if (!device->info->no_hw) { + if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) + result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m"); + } + vk_free(&device->vk.alloc, xe_syncs); + + if (result == VK_SUCCESS && queue->sync) { + result = vk_sync_wait(&device->vk, queue->sync, 0, + VK_SYNC_WAIT_COMPLETE, UINT64_MAX); + if (result != VK_SUCCESS) + result = vk_queue_set_lost(&queue->vk, "sync wait failed"); + } + + if (result == VK_SUCCESS && utrace_submit) + result = xe_queue_exec_utrace_locked(queue, utrace_submit); + + return result; +} diff --git a/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.h b/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.h new file mode 100644 index 000000000..9ee877e04 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_batch_chain.h @@ -0,0 +1,53 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
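A hedged recap of the drm_xe_sync encodings that xe_exec_fill_sync() above builds for this snapshot of the Xe uAPI: a binary syncobj uses DRM_XE_SYNC_SYNCOBJ, a timeline point adds DRM_XE_SYNC_TIMELINE_SYNCOBJ together with a value, and DRM_XE_SYNC_SIGNAL marks the entry as a signal rather than a wait. The two example builders are illustrative names.

   static struct drm_xe_sync
   example_binary_signal(uint32_t syncobj_handle)
   {
      return (struct drm_xe_sync) {
         .flags  = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
         .handle = syncobj_handle,
      };
   }

   static struct drm_xe_sync
   example_timeline_wait(uint32_t syncobj_handle, uint64_t point)
   {
      return (struct drm_xe_sync) {
         .flags          = DRM_XE_SYNC_TIMELINE_SYNCOBJ,
         .handle         = syncobj_handle,
         .timeline_value = point,
      };
   }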
+ */ + +#pragma once + +#include <stdint.h> + +#include "vulkan/vulkan_core.h" +#include "vk_sync.h" + +struct anv_queue; +struct anv_bo; +struct anv_cmd_buffer; +struct anv_query_pool; +struct anv_utrace_submit; + +VkResult +xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_bo_size); +VkResult +xe_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass); + +VkResult +xe_queue_exec_utrace_locked(struct anv_queue *queue, + struct anv_utrace_submit *utrace_submit); diff --git a/lib/mesa/src/intel/vulkan/xe/anv_device.c b/lib/mesa/src/intel/vulkan/xe/anv_device.c new file mode 100644 index 000000000..a5827d968 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_device.c @@ -0,0 +1,142 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "xe/anv_device.h" +#include "anv_private.h" + +#include "drm-uapi/xe_drm.h" + +bool anv_xe_device_destroy_vm(struct anv_device *device) +{ + struct drm_xe_vm_destroy destroy = { + .vm_id = device->vm_id, + }; + return intel_ioctl(device->fd, DRM_IOCTL_XE_VM_DESTROY, &destroy) == 0; +} + +VkResult anv_xe_device_setup_vm(struct anv_device *device) +{ + struct drm_xe_vm_create create = { + .flags = DRM_XE_VM_CREATE_SCRATCH_PAGE, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_XE_VM_CREATE, &create) != 0) + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "vm creation failed"); + + device->vm_id = create.vm_id; + return VK_SUCCESS; +} + +enum drm_sched_priority +anv_vk_priority_to_drm_sched_priority(VkQueueGlobalPriorityKHR vk_priority) +{ + switch (vk_priority) { + case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR: + return DRM_SCHED_PRIORITY_MIN; + case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR: + return DRM_SCHED_PRIORITY_NORMAL; + case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR: + return DRM_SCHED_PRIORITY_HIGH; + default: + unreachable("Invalid priority"); + return DRM_SCHED_PRIORITY_MIN; + } +} + +static VkQueueGlobalPriorityKHR +drm_sched_priority_to_vk_priority(enum drm_sched_priority drm_sched_priority) +{ + switch (drm_sched_priority) { + case DRM_SCHED_PRIORITY_MIN: + return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR; + case DRM_SCHED_PRIORITY_NORMAL: + return VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + case DRM_SCHED_PRIORITY_HIGH: + return VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR; + default: + unreachable("Invalid drm_sched_priority"); + return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR; + } +} + +static void * +xe_query_alloc_fetch(struct anv_physical_device *device, uint32_t query_id) +{ + struct drm_xe_device_query query = { + .query = query_id, + }; + if (intel_ioctl(device->local_fd, DRM_IOCTL_XE_DEVICE_QUERY, &query)) + return NULL; + + void *data = calloc(1, query.size); + if (!data) + return NULL; + + query.data = (uintptr_t)data; + if (intel_ioctl(device->local_fd, DRM_IOCTL_XE_DEVICE_QUERY, &query)) { + free(data); + return NULL; + } + + return data; +} + +VkResult +anv_xe_physical_device_get_parameters(struct anv_physical_device *device) +{ + struct drm_xe_query_config *config; + + config = xe_query_alloc_fetch(device, DRM_XE_DEVICE_QUERY_CONFIG); + if (!config) + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "unable to query device config"); + + device->has_exec_timeline = true; + device->max_context_priority = + drm_sched_priority_to_vk_priority(config->info[XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY]); + + free(config); + return VK_SUCCESS; +} + +VkResult +anv_xe_device_check_status(struct vk_device *vk_device) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + VkResult result = VK_SUCCESS; + + for (uint32_t i = 0; i < device->queue_count; i++) { + struct drm_xe_engine_get_property engine_get_property = { + .engine_id = device->queues[i].engine_id, + .property = XE_ENGINE_GET_PROPERTY_BAN, + }; + int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_ENGINE_GET_PROPERTY, + &engine_get_property); + + if (ret || engine_get_property.value) { + result = vk_device_set_lost(&device->vk, "One or more queues banned"); + break; + } + } + + return result; +} diff --git a/lib/mesa/src/intel/vulkan/xe/anv_device.h b/lib/mesa/src/intel/vulkan/xe/anv_device.h new file mode 100644 index 000000000..669d5639c --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_device.h @@ -0,0 +1,42 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any 
person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include <stdbool.h> + +#include "vulkan/vulkan_core.h" +#include "vk_device.h" + +#include "drm-uapi/gpu_scheduler.h" + +struct anv_device; +struct anv_physical_device; + +bool anv_xe_device_destroy_vm(struct anv_device *device); +VkResult anv_xe_device_setup_vm(struct anv_device *device); +VkResult anv_xe_device_check_status(struct vk_device *vk_device); + +VkResult +anv_xe_physical_device_get_parameters(struct anv_physical_device *device); +enum drm_sched_priority +anv_vk_priority_to_drm_sched_priority(VkQueueGlobalPriorityKHR vk_priority); diff --git a/lib/mesa/src/intel/vulkan/xe/anv_kmd_backend.c b/lib/mesa/src/intel/vulkan/xe/anv_kmd_backend.c new file mode 100644 index 000000000..46c4939e4 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_kmd_backend.c @@ -0,0 +1,149 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <sys/mman.h> +#include <xf86drm.h> + +#include "anv_private.h" + +#include "xe/anv_batch_chain.h" + +#include "drm-uapi/xe_drm.h" + +static uint32_t +xe_gem_create(struct anv_device *device, + const struct intel_memory_class_instance **regions, + uint16_t regions_count, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t *actual_size) +{ + struct drm_xe_gem_create gem_create = { + /* From xe_drm.h: If a VM is specified, this BO must: + * 1. Only ever be bound to that VM. + * 2. 
Cannot be exported as a PRIME fd. + */ + .vm_id = alloc_flags & ANV_BO_ALLOC_EXTERNAL ? 0 : device->vm_id, + .size = align64(size, device->info->mem_alignment), + .flags = alloc_flags & ANV_BO_ALLOC_SCANOUT ? XE_GEM_CREATE_FLAG_SCANOUT : 0, + }; + for (uint16_t i = 0; i < regions_count; i++) + gem_create.flags |= BITFIELD_BIT(regions[i]->instance); + + if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create)) + return 0; + + *actual_size = gem_create.size; + return gem_create.handle; +} + +static void +xe_gem_close(struct anv_device *device, uint32_t handle) +{ + struct drm_gem_close close = { + .handle = handle, + }; + intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close); +} + +static void * +xe_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset, + uint64_t size, VkMemoryPropertyFlags property_flags) +{ + struct drm_xe_gem_mmap_offset args = { + .handle = bo->gem_handle, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &args)) + return MAP_FAILED; + + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + device->fd, args.offset); +} + +static inline int +xe_gem_vm_bind_op(struct anv_device *device, struct anv_bo *bo, uint32_t op) +{ + uint32_t syncobj_handle; + int ret = drmSyncobjCreate(device->fd, 0, &syncobj_handle); + + if (ret) + return ret; + + struct drm_xe_sync sync = { + .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL, + .handle = syncobj_handle, + }; + struct drm_xe_vm_bind args = { + .vm_id = device->vm_id, + .num_binds = 1, + .bind.obj = op == XE_VM_BIND_OP_UNMAP ? 0 : bo->gem_handle, + .bind.obj_offset = 0, + .bind.range = bo->actual_size, + .bind.addr = intel_48b_address(bo->offset), + .bind.op = op, + .num_syncs = 1, + .syncs = (uintptr_t)&sync, + }; + ret = intel_ioctl(device->fd, DRM_IOCTL_XE_VM_BIND, &args); + if (ret) + goto bind_error; + + struct drm_syncobj_wait wait = { + .handles = (uintptr_t)&syncobj_handle, + .timeout_nsec = INT64_MAX, + .count_handles = 1, + .flags = 0, + .first_signaled = 0, + .pad = 0, + }; + intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait); + +bind_error: + drmSyncobjDestroy(device->fd, syncobj_handle); + return ret; +} + +static int xe_gem_vm_bind(struct anv_device *device, struct anv_bo *bo) +{ + return xe_gem_vm_bind_op(device, bo, XE_VM_BIND_OP_MAP); +} + +static int xe_gem_vm_unbind(struct anv_device *device, struct anv_bo *bo) +{ + return xe_gem_vm_bind_op(device, bo, XE_VM_BIND_OP_UNMAP); +} + +const struct anv_kmd_backend * +anv_xe_kmd_backend_get(void) +{ + static const struct anv_kmd_backend xe_backend = { + .gem_create = xe_gem_create, + .gem_close = xe_gem_close, + .gem_mmap = xe_gem_mmap, + .gem_vm_bind = xe_gem_vm_bind, + .gem_vm_unbind = xe_gem_vm_unbind, + .execute_simple_batch = xe_execute_simple_batch, + .queue_exec_locked = xe_queue_exec_locked, + .queue_exec_trace = xe_queue_exec_utrace_locked, + }; + return &xe_backend; +} diff --git a/lib/mesa/src/intel/vulkan/xe/anv_queue.c b/lib/mesa/src/intel/vulkan/xe/anv_queue.c new file mode 100644 index 000000000..5c42435c7 --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_queue.c @@ -0,0 +1,123 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom 
the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "xe/anv_queue.h" + +#include "anv_private.h" + +#include "common/xe/intel_engine.h" +#include "common/intel_gem.h" + +#include "xe/anv_device.h" + +#include "drm-uapi/xe_drm.h" +#include "drm-uapi/gpu_scheduler.h" + +VkResult +anv_xe_create_engine(struct anv_device *device, + struct anv_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo) +{ + struct anv_physical_device *physical = device->physical; + struct anv_queue_family *queue_family = + &physical->queue.families[pCreateInfo->queueFamilyIndex]; + const struct intel_query_engine_info *engines = physical->engine_info; + struct drm_xe_engine_class_instance *instances; + + instances = vk_alloc(&device->vk.alloc, + sizeof(*instances) * queue_family->queueCount, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!instances) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + /* Build a list of all compatible HW engines */ + uint32_t count = 0; + for (uint32_t i = 0; i < engines->num_engines; i++) { + const struct intel_engine_class_instance engine = engines->engines[i]; + if (engine.engine_class != queue_family->engine_class) + continue; + + instances[count].engine_class = intel_engine_class_to_xe(engine.engine_class); + instances[count].engine_instance = engine.engine_instance; + /* TODO: handle gt_id; MTL and newer platforms will have media engines + * in a separate gt + */ + instances[count++].gt_id = 0; + } + + assert(device->vm_id != 0); + struct drm_xe_engine_create create = { + /* Allows KMD to pick one of those engines for the submission queue */ + .instances = (uintptr_t)instances, + .vm_id = device->vm_id, + .width = 1, + .num_placements = count, + }; + int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_ENGINE_CREATE, &create); + vk_free(&device->vk.alloc, instances); + if (ret) + return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create engine"); + + queue->engine_id = create.engine_id; + + const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority = + vk_find_struct_const(pCreateInfo->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const VkQueueGlobalPriorityKHR priority = queue_priority ? + queue_priority->globalPriority : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + /* As per spec, the driver implementation may deny requests to acquire + * a priority above the default priority (MEDIUM) if the caller does not + * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR + * is returned.
+ */ + if (physical->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + if (priority > physical->max_context_priority) + goto priority_error; + + struct drm_xe_engine_set_property engine_property = { + .engine_id = create.engine_id, + .property = XE_ENGINE_SET_PROPERTY_PRIORITY, + .value = anv_vk_priority_to_drm_sched_priority(priority), + }; + ret = intel_ioctl(device->fd, DRM_IOCTL_XE_ENGINE_SET_PROPERTY, + &engine_property); + if (ret && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) + goto priority_error; + } + + return VK_SUCCESS; + +priority_error: + anv_xe_destroy_engine(device, queue); + return vk_error(device, VK_ERROR_NOT_PERMITTED_KHR); +} + +void +anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue) +{ + struct drm_xe_engine_destroy destroy = { + .engine_id = queue->engine_id, + }; + intel_ioctl(device->fd, DRM_IOCTL_XE_ENGINE_DESTROY, &destroy); +} diff --git a/lib/mesa/src/intel/vulkan/xe/anv_queue.h b/lib/mesa/src/intel/vulkan/xe/anv_queue.h new file mode 100644 index 000000000..646f0ef2f --- /dev/null +++ b/lib/mesa/src/intel/vulkan/xe/anv_queue.h @@ -0,0 +1,35 @@ +/* + * Copyright © 2023 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include "vulkan/vulkan_core.h" + +struct anv_device; +struct anv_queue; + +VkResult +anv_xe_create_engine(struct anv_device *device, + struct anv_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo); +void +anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue);
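A note on the pattern used by xe_gem_vm_bind_op() in anv_kmd_backend.c above: Xe's VM bind is asynchronous, so the driver attaches a freshly created syncobj as the signal sync of the bind ioctl and then blocks on it, making the bind synchronous from anv's point of view. The standalone sketch below (not Mesa code) shows just that syncobj create/wait/destroy skeleton using libdrm's generic helpers; the DRM_IOCTL_XE_VM_BIND step is stood in for by drmSyncobjSignal() so the example is self-contained, and the render-node path is an assumption about the local machine.

/*
 * Minimal sketch of the wait-on-syncobj pattern from xe_gem_vm_bind_op().
 * Build with: cc syncobj_demo.c $(pkg-config --cflags --libs libdrm)
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <xf86drm.h>

int main(void)
{
   /* Any DRM render node works for plain syncobj operations; this path is
    * an assumption about the system.
    */
   int fd = open("/dev/dri/renderD128", O_RDWR);
   if (fd < 0) {
      perror("open");
      return 1;
   }

   uint32_t syncobj;
   if (drmSyncobjCreate(fd, 0, &syncobj)) {
      close(fd);
      return 1;
   }

   /* In the driver, DRM_IOCTL_XE_VM_BIND is issued here with a drm_xe_sync
    * that signals `syncobj` once the bind completes.  We signal it directly
    * so the wait below returns.
    */
   drmSyncobjSignal(fd, &syncobj, 1);

   /* Block until the "bind" has signalled, mirroring the
    * DRM_IOCTL_SYNCOBJ_WAIT call in xe_gem_vm_bind_op().
    */
   int ret = drmSyncobjWait(fd, &syncobj, 1, INT64_MAX,
                            DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
   printf("syncobj wait returned %d\n", ret);

   drmSyncobjDestroy(fd, syncobj);
   close(fd);
   return ret ? 1 : 0;
}

The upside of waiting inline like this is that callers of gem_vm_bind/gem_vm_unbind never observe a partially bound BO; the cost is one syncobj round trip per bind or unbind.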