author    Jonathan Gray <jsg@cvs.openbsd.org>    2024-04-02 09:30:07 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>    2024-04-02 09:30:07 +0000
commit    f54e142455cb3c9d1662dae7e096a32a47e5409b (patch)
tree      440ecd46269f0eac25e349e1ed58f246490c5e26 /lib/mesa/src/broadcom/vulkan
parent    36d8503c27530f68d655d3ef77a6eaa4dfd8ad65 (diff)
Import Mesa 23.3.6
Diffstat (limited to 'lib/mesa/src/broadcom/vulkan')
29 files changed, 3111 insertions, 1866 deletions
diff --git a/lib/mesa/src/broadcom/vulkan/meson.build b/lib/mesa/src/broadcom/vulkan/meson.build
index c3595cf73..3605e0965 100644
--- a/lib/mesa/src/broadcom/vulkan/meson.build
+++ b/lib/mesa/src/broadcom/vulkan/meson.build
@@ -25,7 +25,9 @@ v3dv_entrypoints = custom_target(
  command : [
    prog_python, '@INPUT0@', '--xml', '@INPUT1@',
    '--proto', '--weak', '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
+   '--beta', with_vulkan_beta.to_string(),
    '--device-prefix', 'ver42',
+   '--device-prefix', 'ver71',
  ],
  depend_files : vk_entrypoints_gen_depend_files,
)
@@ -63,13 +65,11 @@ files_per_version = files(
  'v3dvx_pipeline.c',
  'v3dvx_meta_common.c',
  'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
  'v3dvx_queue.c',
)

-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']

 v3dv_flags = []
@@ -100,7 +100,7 @@ if with_platform_wayland
 endif

 if with_platform_android
-  v3dv_deps += dep_android
+  v3dv_deps += [dep_android, idep_u_gralloc]
   v3dv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR'
   libv3dv_files += files('v3dv_android.c')
 endif
@@ -112,7 +112,7 @@ foreach ver : v3d_versions
    [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]],
    include_directories : [
      inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
-     inc_compiler, inc_util,
+     inc_util,
    ],
    c_args : [v3dv_flags, '-DV3D_VERSION=' + ver],
    gnu_symbol_visibility : 'hidden',
@@ -124,7 +124,7 @@ libvulkan_broadcom = shared_library(
  'vulkan_broadcom',
  [libv3dv_files, v3dv_entrypoints, sha1_h],
  include_directories : [
-   inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util,
+   inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_util,
  ],
  link_with : [
    libbroadcom_cle,
@@ -165,6 +165,7 @@ broadcom_icd = custom_target(
  ],
  build_by_default : true,
  install_dir : with_vulkan_icd_dir,
+ install_tag : 'runtime',
  install : true,
)
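The switch to v3d_versions = ['42', '71'] means the per-version sources now get compiled once per hardware generation, each object with its own -DV3D_VERSION, and the driver selects implementations at runtime. A minimal sketch of that dispatch pattern, with hypothetical function names (the real driver uses its V3DX()/v3dv_X() macros rather than hand-written switches):

#include <assert.h>

/* Each per-version object is built with a different -DV3D_VERSION, so the
 * same source expands to ver42_* or ver71_* symbols; a runtime switch on
 * devinfo.ver then picks the right entry point. Names are illustrative. */
struct v3d_device_info { int ver; };

void ver42_emit_viewport(struct v3d_device_info *devinfo);
void ver71_emit_viewport(struct v3d_device_info *devinfo);

static void
emit_viewport(struct v3d_device_info *devinfo)
{
   switch (devinfo->ver) {
   case 42:
      ver42_emit_viewport(devinfo);
      break;
   case 71:
      ver71_emit_viewport(devinfo);
      break;
   default:
      assert(!"unsupported V3D version");
   }
}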
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_android.c b/lib/mesa/src/broadcom/vulkan/v3dv_android.c
index d217aaf11..6c49e5d71 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_android.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_android.c
@@ -35,6 +35,9 @@
 #include <vulkan/vk_android_native_buffer.h>
 #include <vulkan/vk_icd.h>

+#include "vk_android.h"
+#include "vulkan/util/vk_enum_defines.h"
+
 #include "util/libsync.h"
 #include "util/log.h"
 #include "util/os_file.h"
@@ -112,117 +115,55 @@ v3dv_hal_close(struct hw_device_t *dev)
    return -1;
 }

-static int
-get_format_bpp(int native)
-{
-   int bpp;
-
-   switch (native) {
-   case HAL_PIXEL_FORMAT_RGBA_FP16:
-      bpp = 8;
-      break;
-   case HAL_PIXEL_FORMAT_RGBA_8888:
-   case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
-   case HAL_PIXEL_FORMAT_RGBX_8888:
-   case HAL_PIXEL_FORMAT_BGRA_8888:
-   case HAL_PIXEL_FORMAT_RGBA_1010102:
-      bpp = 4;
-      break;
-   case HAL_PIXEL_FORMAT_RGB_565:
-      bpp = 2;
-      break;
-   default:
-      bpp = 0;
-      break;
-   }
-
-   return bpp;
-}
-
-/* get buffer info from VkNativeBufferANDROID */
-static VkResult
-v3dv_gralloc_info_other(struct v3dv_device *device,
-                        const VkNativeBufferANDROID *native_buffer,
-                        int *out_stride,
-                        uint64_t *out_modifier)
-{
-   *out_stride = native_buffer->stride /*in pixels*/ *
-                 get_format_bpp(native_buffer->format);
-   *out_modifier = DRM_FORMAT_MOD_LINEAR;
-   return VK_SUCCESS;
-}
-
-static const char cros_gralloc_module_name[] = "CrOS Gralloc";
-
-#define CROS_GRALLOC_DRM_GET_BUFFER_INFO 4
-
-struct cros_gralloc0_buffer_info
-{
-   uint32_t drm_fourcc;
-   int num_fds;
-   int fds[4];
-   uint64_t modifier;
-   int offset[4];
-   int stride[4];
-};
-
-static VkResult
-v3dv_gralloc_info_cros(struct v3dv_device *device,
-                       const VkNativeBufferANDROID *native_buffer,
-                       int *out_stride,
-                       uint64_t *out_modifier)
+VkResult
+v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc,
+                                    struct u_gralloc_buffer_handle *in_hnd,
+                                    VkImageDrmFormatModifierExplicitCreateInfoEXT *out,
+                                    VkSubresourceLayout *out_layouts,
+                                    int max_planes)
 {
-   const gralloc_module_t *gralloc = device->gralloc;
-   struct cros_gralloc0_buffer_info info;
-   int ret;
+   struct u_gralloc_buffer_basic_info info;

-   ret = gralloc->perform(gralloc, CROS_GRALLOC_DRM_GET_BUFFER_INFO,
-                          native_buffer->handle, &info);
-   if (ret)
+   if (u_gralloc_get_buffer_basic_info(gralloc, in_hnd, &info) != 0)
       return VK_ERROR_INVALID_EXTERNAL_HANDLE;

-   *out_stride = info.stride[0];
-   *out_modifier = info.modifier;
+   if (info.num_planes > max_planes)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;

-   return VK_SUCCESS;
-}
+   bool is_disjoint = false;
+   for (int i = 1; i < info.num_planes; i++) {
+      if (info.offsets[i] == 0) {
+         is_disjoint = true;
+         break;
+      }
+   }

-VkResult
-v3dv_gralloc_info(struct v3dv_device *device,
-                  const VkNativeBufferANDROID *native_buffer,
-                  int *out_dmabuf,
-                  int *out_stride,
-                  int *out_size,
-                  uint64_t *out_modifier)
-{
-   if (device->gralloc_type == V3DV_GRALLOC_UNKNOWN) {
-      /* get gralloc module for gralloc buffer info query */
-      int err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID,
-                              (const hw_module_t **) &device->gralloc);
+   if (is_disjoint) {
+      /* We don't support disjoint planes yet */
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }

-      device->gralloc_type = V3DV_GRALLOC_OTHER;
+   memset(out_layouts, 0, sizeof(*out_layouts) * info.num_planes);
+   memset(out, 0, sizeof(*out));

-      if (err == 0) {
-         const gralloc_module_t *gralloc = device->gralloc;
-         mesa_logi("opened gralloc module name: %s", gralloc->common.name);
+   out->sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT;
+   out->pPlaneLayouts = out_layouts;

-         if (strcmp(gralloc->common.name, cros_gralloc_module_name) == 0 &&
-             gralloc->perform) {
-            device->gralloc_type = V3DV_GRALLOC_CROS;
-         }
-      }
+   out->drmFormatModifier = info.modifier;
+   out->drmFormatModifierPlaneCount = info.num_planes;
+   for (int i = 0; i < info.num_planes; i++) {
+      out_layouts[i].offset = info.offsets[i];
+      out_layouts[i].rowPitch = info.strides[i];
    }

-   *out_dmabuf = native_buffer->handle->data[0];
-   *out_size = lseek(*out_dmabuf, 0, SEEK_END);
-
-   if (device->gralloc_type == V3DV_GRALLOC_CROS) {
-      return v3dv_gralloc_info_cros(device, native_buffer, out_stride,
-                                    out_modifier);
-   } else {
-      return v3dv_gralloc_info_other(device, native_buffer, out_stride,
-                                     out_modifier);
+   if (info.drm_fourcc == DRM_FORMAT_YVU420) {
+      /* Swap the U and V planes to match the VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM */
+      VkSubresourceLayout tmp = out_layouts[1];
+      out_layouts[1] = out_layouts[2];
+      out_layouts[2] = tmp;
    }
+
+   return VK_SUCCESS;
 }

 VkResult
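The new v3dv_gralloc_to_drm_explicit_layout() replaces the CrOS-specific perform() query with util's generic u_gralloc and hands back the explicit DRM-modifier layout Vulkan expects. A hedged usage sketch of how a caller might chain the result into image creation; the wrapper function is illustrative and the include path is assumed, only the translated helper and the u_gralloc types come from this import:

#include <vulkan/vulkan.h>
#include "util/u_gralloc/u_gralloc.h"   /* assumed path for u_gralloc types */

/* Sketch: build a VkImage for an existing gralloc buffer by translating its
 * layout into VkImageDrmFormatModifierExplicitCreateInfoEXT. Error handling
 * beyond the translation step is elided. */
static VkResult
create_image_for_gralloc_buffer(VkDevice dev, struct u_gralloc *gralloc,
                                struct u_gralloc_buffer_handle *gr_handle,
                                VkImageCreateInfo *base_info, VkImage *out_image)
{
   VkSubresourceLayout layouts[4];
   VkImageDrmFormatModifierExplicitCreateInfoEXT mod_info;

   VkResult result =
      v3dv_gralloc_to_drm_explicit_layout(gralloc, gr_handle, &mod_info,
                                          layouts, 4);
   if (result != VK_SUCCESS)
      return result;

   base_info->pNext = &mod_info;   /* assumes no other pNext chain entries */
   base_info->tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
   return vkCreateImage(dev, base_info, NULL, out_image);
}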
@@ -231,11 +172,8 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
                              const VkAllocationCallbacks *alloc,
                              VkImage image_h)
 {
-   struct v3dv_image *image = NULL;
    VkResult result;

-   image = v3dv_image_from_handle(image_h);
-
    VkDeviceMemory memory_h;

    const VkMemoryDedicatedAllocateInfo ded_alloc = {
@@ -252,13 +190,12 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
       .fd = os_dupfd_cloexec(native_buffer_fd),
    };

-   assert(image->plane_count == 1);
    result =
       v3dv_AllocateMemory(device_h,
                           &(VkMemoryAllocateInfo) {
                              .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
                              .pNext = &import_info,
-                             .allocationSize = image->planes[0].size,
+                             .allocationSize = lseek(native_buffer_fd, 0, SEEK_END),
                              .memoryTypeIndex = 0,
                           },
                           alloc, &memory_h);
@@ -274,8 +211,6 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
    };
    v3dv_BindImageMemory2(device_h, 1, &bind_info);

-   image->is_native_buffer_memory = true;
-
    return VK_SUCCESS;

 fail_create_image:
@@ -417,6 +352,193 @@ v3dv_GetSwapchainGrallocUsage2ANDROID(
       *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
    }

+   if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) {
+      uint64_t front_rendering_usage = 0;
+      u_gralloc_get_front_rendering_usage(device->gralloc, &front_rendering_usage);
+      *grallocProducerUsage |= front_rendering_usage;
+   }
+
    return VK_SUCCESS;
 }

 #endif
+
+/* ----------------------------- AHardwareBuffer --------------------------- */
+
+static VkResult
+get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer,
+                                  VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+
+   /* Get a description of buffer contents . */
+   AHardwareBuffer_Desc desc;
+   AHardwareBuffer_describe(buffer, &desc);
+
+   /* Verify description. */
+   const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE |
+                              AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT |
+                              AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER;
+
+   /* "Buffer must be a valid Android hardware buffer object with at least
+    * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags."
+    */
+   if (!(desc.usage & (gpu_usage)))
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   /* Fill properties fields based on description. */
+   VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties;
+
+   p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+
+   p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+   p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+
+   p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+   p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+
+   VkFormatProperties2 format_properties = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2};
+
+   p->format = vk_ahb_format_to_image_format(desc.format);
+
+   VkFormat external_format = p->format;
+
+   if (p->format != VK_FORMAT_UNDEFINED)
+      goto finish;
+
+   /* External format only case
+    *
+    * From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "If the Android hardware buffer has one of the formats listed in the Format
+    * Equivalence table (see spec.), then format must have the equivalent Vulkan
+    * format listed in the table. Otherwise, format may be VK_FORMAT_UNDEFINED,
+    * indicating the Android hardware buffer can only be used with an external format."
+    *
+    * From SKIA source code analysis: p->format MUST be VK_FORMAT_UNDEFINED, if the
+    * format is not in the Equivalence table.
+    */
+
+   struct u_gralloc_buffer_handle gr_handle = {
+      .handle = AHardwareBuffer_getNativeHandle(buffer),
+      .pixel_stride = desc.stride,
+      .hal_format = desc.format,
+   };
+
+   struct u_gralloc_buffer_basic_info info;
+
+   if (u_gralloc_get_buffer_basic_info(device->gralloc, &gr_handle, &info) != 0)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   switch (info.drm_fourcc) {
+   case DRM_FORMAT_YVU420:
+      /* Assuming that U and V planes are swapped earlier */
+      external_format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
+      break;
+   case DRM_FORMAT_NV12:
+      external_format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+      break;
+   default:;
+      mesa_loge("Unsupported external DRM format: %d", info.drm_fourcc);
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }
+
+   struct u_gralloc_buffer_color_info color_info;
+   if (u_gralloc_get_buffer_color_info(device->gralloc, &gr_handle, &color_info) == 0) {
+      switch (color_info.yuv_color_space) {
+      case __DRI_YUV_COLOR_SPACE_ITU_REC601:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC709:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC2020:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020;
+         break;
+      default:
+         break;
+      }
+
+      p->suggestedYcbcrRange = (color_info.sample_range == __DRI_YUV_NARROW_RANGE) ?
+         VK_SAMPLER_YCBCR_RANGE_ITU_NARROW : VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+      p->suggestedXChromaOffset = (color_info.horizontal_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+      p->suggestedYChromaOffset = (color_info.vertical_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+   }
+
+finish:
+
+   v3dv_GetPhysicalDeviceFormatProperties2(v3dv_physical_device_to_handle(device->pdevice),
+                                           external_format, &format_properties);
+
+   /* v3dv doesn't support direct sampling from linear images but has a logic to copy
+    * from linear to tiled images implicitly before sampling. Therefore expose optimal
+    * features for both linear and optimal tiling.
+    */
+   p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures;
+   p->externalFormat = external_format;
+
+   /* From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "The formatFeatures member *must* include
+    *  VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT and at least one of
+    *  VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT or
+    *  VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT"
+    */
+   p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+v3dv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h,
+                                               const struct AHardwareBuffer *buffer,
+                                               VkAndroidHardwareBufferPropertiesANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, dev, device_h);
+   struct v3dv_physical_device *pdevice = dev->pdevice;
+
+   VkResult result;
+
+   VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
+
+   /* Fill format properties of an Android hardware buffer. */
+   if (format_prop) {
+      VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+      };
+      result = get_ahb_buffer_format_properties2(device_h, buffer, &format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+
+      format_prop->format = format_prop2.format;
+      format_prop->externalFormat = format_prop2.externalFormat;
+      format_prop->formatFeatures =
+         vk_format_features2_to_features(format_prop2.formatFeatures);
+      format_prop->samplerYcbcrConversionComponents =
+         format_prop2.samplerYcbcrConversionComponents;
+      format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+      format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+      format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+      format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+   }
+
+   VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+   if (format_prop2) {
+      result = get_ahb_buffer_format_properties2(device_h, buffer, format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   const native_handle_t *handle = AHardwareBuffer_getNativeHandle(buffer);
+   assert(handle && handle->numFds > 0);
+   pProperties->allocationSize = lseek(handle->data[0], 0, SEEK_END);
+
+   /* All memory types. */
+   pProperties->memoryTypeBits = (1u << pdevice->memory.memoryTypeCount) - 1;
+
+   return VK_SUCCESS;
+}
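Both the native-buffer import path and the AHardwareBuffer query above size the allocation by seeking to the end of the dmabuf rather than trusting caller-provided metadata. A small sketch of that idiom, with a hypothetical helper name:

#include <sys/types.h>
#include <unistd.h>

/* Sketch: a dmabuf fd reports its size via lseek(SEEK_END); rewind
 * afterwards so later users see the offset they expect. Returns -1 on
 * error (e.g. the fd is not seekable). */
static off_t
dmabuf_size(int fd)
{
   off_t size = lseek(fd, 0, SEEK_END);
   lseek(fd, 0, SEEK_SET);
   return size;
}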
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
index 9f1bf423a..2728a9233 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
@@ -339,7 +339,7 @@ v3dv_bo_map(struct v3dv_device *device, struct v3dv_bo *bo, uint32_t size)
    if (!ok)
       return false;

-   ok = v3dv_bo_wait(device, bo, PIPE_TIMEOUT_INFINITE);
+   ok = v3dv_bo_wait(device, bo, OS_TIMEOUT_INFINITE);
    if (!ok) {
       fprintf(stderr, "memory wait for map failed\n");
       return false;
@@ -359,7 +359,7 @@ v3dv_bo_unmap(struct v3dv_device *device, struct v3dv_bo *bo)
    bo->map_size = 0;
 }

-static boolean
+static bool
 reallocate_size_list(struct v3dv_bo_cache *cache,
                      struct v3dv_device *device,
                      uint32_t size)
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
index db1dfb08c..7e17ac395 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
@@ -27,6 +27,7 @@
 #include "broadcom/cle/v3d_packet_helpers.h"

 #include "util/list.h"
+#include "util/macros.h"

 struct v3dv_bo;
 struct v3dv_job;
@@ -150,15 +151,9 @@ cl_aligned_reloc(struct v3dv_cl *cl,
 uint32_t v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment);
 void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);

-/* We redefine ALIGN as a macro as we want to use cl_aligned_packet_length for
- * struct fields
- */
-#define ALIGN(value, alignment) \
-   (((value) + (alignment) - 1) & ~((alignment) - 1))
-
 #define cl_packet_header(packet) V3DX(packet ## _header)
 #define cl_packet_length(packet) V3DX(packet ## _length)
-#define cl_aligned_packet_length(packet, alignment) ALIGN(cl_packet_length(packet), alignment)
+#define cl_aligned_packet_length(packet, alignment) ALIGN_POT(cl_packet_length(packet), alignment)
 #define cl_packet_pack(packet) V3DX(packet ## _pack)
 #define cl_packet_struct(packet) V3DX(packet)
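The local ALIGN macro is dropped in favour of util's ALIGN_POT, which uses the same mask trick and therefore only works for power-of-two alignments. A worked sketch of the arithmetic:

/* (value + alignment - 1) & ~(alignment - 1) rounds up to a multiple of a
 * power-of-two alignment. Aligning 13 to 8:
 *    13 + 7 = 20
 *    ~(8 - 1) = ~7 clears the low three bits
 *    20 & ~7 = 16
 * A non-power-of-two alignment (say 12) would give wrong results with this
 * formula, hence the POT in the name. */
static unsigned
align_pot(unsigned value, unsigned alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}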
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 449e532c6..dc01a0fa0 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -24,6 +24,7 @@
 #include "v3dv_private.h"
 #include "util/u_pack_color.h"
 #include "vk_util.h"
+#include "vulkan/runtime/vk_common_entrypoints.h"

 void
 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
@@ -348,6 +349,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
                          uint32_t layers,
                          uint32_t render_target_count,
                          uint8_t max_internal_bpp,
+                         uint8_t total_color_bpp,
                          bool msaa,
                          bool double_buffer)
 {
@@ -360,13 +362,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
    tiling->render_target_count = render_target_count;
    tiling->msaa = msaa;
    tiling->internal_bpp = max_internal_bpp;
+   tiling->total_color_bpp = total_color_bpp;
    tiling->double_buffer = double_buffer;

    /* Double-buffer is incompatible with MSAA */
    assert(!tiling->msaa || !tiling->double_buffer);

-   v3d_choose_tile_size(render_target_count, max_internal_bpp,
-                        tiling->msaa, tiling->double_buffer,
+   v3d_choose_tile_size(&job->device->devinfo,
+                        render_target_count,
+                        max_internal_bpp, total_color_bpp, msaa,
+                        tiling->double_buffer,
                         &tiling->tile_width, &tiling->tile_height);

    tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
@@ -457,6 +462,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
                      bool allocate_tile_state_now,
                      uint32_t render_target_count,
                      uint8_t max_internal_bpp,
+                     uint8_t total_color_bpp,
                      bool msaa)
 {
    assert(job);
@@ -467,7 +473,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
    const struct v3dv_frame_tiling *tiling =
       job_compute_frame_tiling(job, width, height, layers,
                                render_target_count, max_internal_bpp,
-                               msaa, false);
+                               total_color_bpp, msaa, false);

    v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
    v3dv_return_if_oom(NULL, job);
@@ -528,6 +534,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
                         job->frame_tiling.layers,
                         job->frame_tiling.render_target_count,
                         job->frame_tiling.internal_bpp,
+                        job->frame_tiling.total_color_bpp,
                         job->frame_tiling.msaa,
                         true);
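job_compute_frame_tiling() now also tracks the combined color bpp so v3d_choose_tile_size() can pick tile geometry per hardware generation. The tile-grid math at the end of the function is a plain round-up; a worked example with assumed numbers (not derived from any particular V3D configuration):

/* DIV_ROUND_UP as used by the driver, applied to a hypothetical
 * 1920x1080 framebuffer with 64x64 tiles:
 *    draw_tiles_x = DIV_ROUND_UP(1920, 64) = 30
 *    draw_tiles_y = DIV_ROUND_UP(1080, 64) = 17   (16.875 rounded up)
 */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

enum { WIDTH = 1920, HEIGHT = 1080, TILE_W = 64, TILE_H = 64 };

static const unsigned draw_tiles_x = DIV_ROUND_UP(WIDTH, TILE_W);   /* 30 */
static const unsigned draw_tiles_y = DIV_ROUND_UP(HEIGHT, TILE_H);  /* 17 */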
@@ -1374,7 +1381,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
    }

    uint32_t att_count = 0;
-   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */

    /* We only need to emit subpass clears as draw calls for color attachments
     * if the render area is not aligned to tile boundaries.
     */
@@ -1434,7 +1441,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
    } else if (subpass->do_depth_clear_with_draw ||
               subpass->do_stencil_clear_with_draw) {
-      perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), "
+      perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
                  "falling back to vkCmdClearAttachments for "
                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
    }
@@ -1672,10 +1679,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,

    const struct v3dv_framebuffer *framebuffer = state->framebuffer;

-   uint8_t internal_bpp;
+   uint8_t max_internal_bpp, total_color_bpp;
    bool msaa;
    v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
-      (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
+      (framebuffer, state->attachments, subpass,
+       &max_internal_bpp, &total_color_bpp, &msaa);

    /* From the Vulkan spec:
     *
@@ -1699,7 +1707,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
                            layers, true, false,
                            subpass->color_count,
-                           internal_bpp,
+                           max_internal_bpp,
+                           total_color_bpp,
                            msaa);
    }

@@ -2062,6 +2071,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }

+   if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+      if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+                 sizeof(src->depth_bounds))) {
+         memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
+         dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+      }
+   }
+
    if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
       if (dest->line_width != src->line_width) {
          dest->line_width = src->line_width;
@@ -2131,39 +2148,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
    }
 }

-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
-void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
-                            float scale[3],
-                            float translate[3])
-{
-   float x = viewport->x;
-   float y = viewport->y;
-   float half_width = 0.5f * viewport->width;
-   float half_height = 0.5f * viewport->height;
-   double n = viewport->minDepth;
-   double f = viewport->maxDepth;
-
-   scale[0] = half_width;
-   translate[0] = half_width + x;
-   scale[1] = half_height;
-   translate[1] = half_height + y;
-
-   scale[2] = (f - n);
-   translate[2] = n;
-
-   /* It seems that if the scale is small enough the hardware won't clip
-    * correctly so we work around this my choosing the smallest scale that
-    * seems to work.
-    *
-    * This case is exercised by CTS:
-    * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
-    */
-   const float min_abs_scale = 0.000009f;
-   if (fabs(scale[2]) < min_abs_scale)
-      scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
-}
-
 /* Considers the pipeline's negative_one_to_one state and applies it to the
  * current viewport transform if needed to produce the resulting Z translate
  * and scale parameters.
  */
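The common copy of v3dv_viewport_compute_xform() is deleted because it now lives behind the per-version v3dv_X() dispatch (see the v3dv_CmdSetViewport hunk below). For reference, a condensed sketch of the transform the removed helper computed, without its small-scale workaround:

#include <vulkan/vulkan.h>

/* Sketch of the removed helper's math: a viewport maps NDC [-1,1] to window
 * coordinates, so X/Y scale by the half extents and translate to the
 * viewport center; Z scales by (maxDepth - minDepth) and translates by
 * minDepth. The real code additionally clamps |scale[2]| to a tiny minimum
 * so near-zero depth ranges still clip correctly. */
static void
viewport_xform(const VkViewport *vp, float scale[3], float translate[3])
{
   scale[0] = 0.5f * vp->width;
   translate[0] = scale[0] + vp->x;
   scale[1] = 0.5f * vp->height;
   translate[1] = scale[1] + vp->y;
   scale[2] = vp->maxDepth - vp->minDepth;
   translate[2] = vp->minDepth;
}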
@@ -2216,9 +2200,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
           viewportCount * sizeof(*pViewports));

    for (uint32_t i = firstViewport; i < total_count; i++) {
-      v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
-                                  state->dynamic.viewport.scale[i],
-                                  state->dynamic.viewport.translate[i]);
+      v3dv_X(cmd_buffer->device, viewport_compute_xform)
+         (&state->dynamic.viewport.viewports[i],
+          state->dynamic.viewport.scale[i],
+          state->dynamic.viewport.translate[i]);
    }

    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
@@ -2264,11 +2249,14 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
     */
    float *vptranslate = dynamic->viewport.translate[0];
    float *vpscale = dynamic->viewport.scale[0];
+   assert(vpscale[0] >= 0);

-   float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
-   float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
-   float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
-   float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
+   float vp_minx = vptranslate[0] - vpscale[0];
+   float vp_maxx = vptranslate[0] + vpscale[0];
+
+   /* With KHR_maintenance1 viewport may have negative Y */
+   float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
+   float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);

    /* Quoting from v3dx_emit:
     * "Clip to the scissor if it's enabled, but still clip to the
@@ -2297,11 +2285,6 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
       maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
                   cmd_buffer->state.render_area.extent.height);

-   minx = vp_minx;
-   miny = vp_miny;
-   maxx = vp_maxx;
-   maxy = vp_maxy;
-
    /* Clip against user provided scissor if needed.
     *
     * FIXME: right now we only allow one scissor. Below would need to be
@@ -2701,6 +2684,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
                         true, false,
                         old_job->frame_tiling.render_target_count,
                         old_job->frame_tiling.internal_bpp,
+                        old_job->frame_tiling.total_color_bpp,
                         true /* msaa */);

    v3dv_job_destroy(old_job);
@@ -2965,6 +2949,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
       v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);

+   if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
+      v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
       v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);

@@ -3309,24 +3296,6 @@ v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
 }

-static uint32_t
-get_index_size(VkIndexType index_type)
-{
-   switch (index_type) {
-   case VK_INDEX_TYPE_UINT8_EXT:
-      return 1;
-      break;
-   case VK_INDEX_TYPE_UINT16:
-      return 2;
-      break;
-   case VK_INDEX_TYPE_UINT32:
-      return 4;
-      break;
-   default:
-      unreachable("Unsupported index type");
-   }
-}
-
 VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                         VkBuffer buffer,
@@ -3335,7 +3304,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

-   const uint32_t index_size = get_index_size(indexType);
+   const uint32_t index_size = vk_index_type_to_bytes(indexType);
    if (buffer == cmd_buffer->state.index_buffer.buffer &&
        offset == cmd_buffer->state.index_buffer.offset &&
        index_size == cmd_buffer->state.index_buffer.index_size) {
@@ -3412,9 +3381,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                        float minDepthBounds,
                        float maxDepthBounds)
 {
-   /* We do not support depth bounds testing so we just ignore this. We are
-    * already asserting that pipelines don't enable the feature anyway.
-    */
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
 }

 VKAPI_ATTR void VKAPI_CALL
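Depth bounds state is now captured instead of ignored: the values land in the dynamic state and the new V3DV_CMD_DIRTY_DEPTH_BOUNDS flag makes the pre-draw path emit them through the per-version dispatch (the feature bit is only exposed on ver >= 71, per the get_features hunk further down). From the API side this enables the standard pattern; a plain usage sketch, not driver code:

#include <vulkan/vulkan.h>

/* Usage sketch: assuming the bound pipeline enabled the depth bounds test
 * and declared VK_DYNAMIC_STATE_DEPTH_BOUNDS, fragments whose stored depth
 * falls outside [0.25, 0.75] are discarded by subsequent draws. */
static void
record_with_depth_bounds(VkCommandBuffer cmd)
{
   vkCmdSetDepthBounds(cmd, 0.25f, 0.75f);
   /* ... vkCmdDraw*() calls affected by the bounds go here ... */
}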
@@ -3435,6 +3406,304 @@ v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
 }

+/**
+ * This checks a descriptor set to see if are binding any descriptors that would
+ * involve sampling from a linear image (the hardware only supports this for
+ * 1D images), and if so, attempts to create a tiled copy of the linear image
+ * and rewrite the descriptor set to use that instead.
+ *
+ * This was added to support a scenario with Android where some part of the UI
+ * wanted to show previews of linear swapchain images. For more details:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
+ *
+ * Currently this only supports a linear sampling from a simple 2D image, but
+ * it could be extended to support more cases if necessary.
+ */
+static void
+handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
+                                struct v3dv_descriptor_set *set,
+                                bool is_compute)
+{
+   for (int32_t i = 0; i < set->layout->binding_count; i++) {
+      const struct v3dv_descriptor_set_binding_layout *blayout =
+         &set->layout->binding[i];
+      if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
+          blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         continue;
+
+      struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
+      if (!desc->image_view)
+         continue;
+
+      struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
+      struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
+      if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
+          view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
+         continue;
+      }
+
+      /* FIXME: we can probably handle most of these restrictions too with
+       * a bit of extra effort.
+       */
+      if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
+          view->vk.level_count != 1 || view->vk.layer_count != 1 ||
+          blayout->array_size != 1) {
+         fprintf(stderr, "Sampling from linear image is not supported. "
+                 "Expect corruption.\n");
+         continue;
+      }
+
+      /* We are sampling from a linear image. V3D doesn't support this
+       * so we create a tiled copy of the image and rewrite the descriptor
+       * to read from it instead.
+       */
+      perf_debug("Sampling from linear image is not supported natively and "
+                 "requires a copy.\n");
+
+      struct v3dv_device *device = cmd_buffer->device;
+      VkDevice vk_device = v3dv_device_to_handle(device);
+
+      /* Allocate shadow tiled image if needed, we only do this once for
+       * each image, on the first sampling attempt. We need to take a lock
+       * since we may be trying to do the same in another command buffer in
+       * a separate thread.
+       */
+      mtx_lock(&device->meta.mtx);
+      VkResult result;
+      VkImage tiled_image;
+      if (image->shadow) {
+         tiled_image = v3dv_image_to_handle(image->shadow);
+      } else {
+         VkImageCreateInfo image_info = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .flags = image->vk.create_flags,
+            .imageType = image->vk.image_type,
+            .format = image->vk.format,
+            .extent = {
+               image->vk.extent.width,
+               image->vk.extent.height,
+               image->vk.extent.depth,
+            },
+            .mipLevels = image->vk.mip_levels,
+            .arrayLayers = image->vk.array_layers,
+            .samples = image->vk.samples,
+            .tiling = VK_IMAGE_TILING_OPTIMAL,
+            .usage = image->vk.usage,
+            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = 0,
+            .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+         };
+         result = v3dv_CreateImage(vk_device, &image_info,
+                                   &device->vk.alloc, &tiled_image);
+         if (result != VK_SUCCESS) {
+            fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                    "Expect corruption.\n");
+            mtx_unlock(&device->meta.mtx);
+            continue;
+         }
+
+         bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+         VkImageMemoryRequirementsInfo2 reqs_info = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+            .image = tiled_image,
+         };
+
+         assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
+         for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
+            VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
+            VkImagePlaneMemoryRequirementsInfo plane_info = {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
+               .planeAspect = plane_aspect,
+            };
+            if (disjoint)
+               reqs_info.pNext = &plane_info;
+
+            VkMemoryRequirements2 reqs = {
+               .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+            };
+            v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
+
+            VkDeviceMemory mem;
+            VkMemoryAllocateInfo alloc_info = {
+               .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+               .allocationSize = reqs.memoryRequirements.size,
+               .memoryTypeIndex = 0,
+            };
+            result = v3dv_AllocateMemory(vk_device, &alloc_info,
+                                         &device->vk.alloc, &mem);
+            if (result != VK_SUCCESS) {
+               fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                       "Expect corruption.\n");
+               v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
+               mtx_unlock(&device->meta.mtx);
+               continue;
+            }
+
+            VkBindImageMemoryInfo bind_info = {
+               .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
+               .image = tiled_image,
+               .memory = mem,
+               .memoryOffset = 0,
+            };
+            VkBindImagePlaneMemoryInfo plane_bind_info = {
+               .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
+               .planeAspect = plane_aspect,
+            };
+            if (disjoint)
+               bind_info.pNext = &plane_bind_info;
+            result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
+            if (result != VK_SUCCESS) {
+               fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + v3dv_FreeMemory(vk_device, mem, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + image->shadow = v3dv_image_from_handle(tiled_image); + } + + /* Create a shadow view that refers to the tiled image if needed */ + VkImageView tiled_view; + if (view->shadow) { + tiled_view = v3dv_image_view_to_handle(view->shadow); + } else { + VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = view->vk.create_flags, + .image = tiled_image, + .viewType = view->vk.view_type, + .format = view->vk.format, + .components = view->vk.swizzle, + .subresourceRange = { + .aspectMask = view->vk.aspects, + .baseMipLevel = view->vk.base_mip_level, + .levelCount = view->vk.level_count, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + }; + result = v3dv_create_image_view(device, &view_info, &tiled_view); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + view->shadow = v3dv_image_view_from_handle(tiled_view); + + mtx_unlock(&device->meta.mtx); + + /* Rewrite the descriptor to use the shadow view */ + VkDescriptorImageInfo desc_image_info = { + .sampler = v3dv_sampler_to_handle(desc->sampler), + .imageView = tiled_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = v3dv_descriptor_set_to_handle(set), + .dstBinding = i, + .dstArrayElement = 0, /* Assumes array_size is 1 */ + .descriptorCount = 1, + .descriptorType = desc->type, + .pImageInfo = &desc_image_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Now we need to actually copy the pixel data from the linear image + * into the tiled image storage to ensure it is up-to-date. + * + * FIXME: ideally we would track if the linear image is dirty and skip + * this step otherwise, but that would be a bit of a pain. + * + * Note that we need to place the copy job *before* the current job in + * the command buffer state so we have the tiled image ready to process + * an upcoming draw call in the current job that samples from it. + * + * Also, we need to use the TFU path for this copy, as any other path + * will use the tile buffer and would require a new framebuffer setup, + * thus requiring extra work to stop and resume any in-flight render + * pass. Since we are converting a full 2D texture here the TFU should + * be able to handle this. + */ + for (int p = 0; p < image->plane_count; p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + struct VkImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = image->plane_count == 1 ? + view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .srcOffset = {0, 0, 0 }, + .dstSubresource = { + .aspectMask = image->plane_count == 1 ? 
+                  view->vk.aspects : (view->vk.aspects & plane_aspect),
+               .mipLevel = view->vk.base_mip_level,
+               .baseArrayLayer = view->vk.base_array_layer,
+               .layerCount = view->vk.layer_count,
+            },
+            .dstOffset = { 0, 0, 0},
+            .extent = {
+               image->planes[p].width,
+               image->planes[p].height,
+               1,
+            },
+         };
+         struct v3dv_image *copy_src = image;
+         struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
+         bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
+                                                  &copy_region);
+         if (ok) {
+            /* This will emit the TFU job right before the current in-flight
+             * job (if any), since in-fight jobs are only added to the list
+             * when finished.
+             */
+            struct v3dv_job *tfu_job =
+               list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
+            assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
+            /* Serialize the copy since we don't know who is producing the linear
+             * image and we need the image to be ready by the time the copy
+             * executes.
+             */
+            tfu_job->serialize = V3DV_BARRIER_ALL;
+
+            /* Also, we need to ensure the TFU copy job completes before anyhing
+             * else coming after that may be using the tiled shadow copy.
+             */
+            if (cmd_buffer->state.job) {
+               /* If we already had an in-flight job (i.e. we are in a render
+                * pass) make sure the job waits for the TFU copy.
+                */
+               cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
+            } else {
+               /* Otherwise, make the the follow-up job syncs with the TFU
+                * job we just added when it is created by adding the
+                * corresponding barrier state.
+                */
+               if (!is_compute) {
+                  cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
+                  cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
+               } else {
+                  cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
+                  cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
+               }
+            }
+         } else {
+            fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                    "TFU doesn't support copy. Expect corruption.\n");
+         }
+      }
+   }
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                            VkPipelineBindPoint pipelineBindPoint,
@@ -3468,6 +3737,15 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
          descriptor_state->descriptor_sets[index] = set;
          dirty_stages |= set->layout->shader_stages;
          descriptor_state_changed = true;
+
+         /* Check if we are sampling from a linear 2D image. This is not
+          * supported in hardware, but may be required for some applications
+          * so we will transparently convert to tiled at the expense of
+          * performance.
+          */
+         handle_sample_from_linear_image(cmd_buffer, set,
+                                         pipelineBindPoint ==
+                                         VK_PIPELINE_BIND_POINT_COMPUTE);
       }

       for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
@@ -3846,6 +4124,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)

 void
 v3dv_cmd_buffer_rewrite_indirect_csd_job(
+   struct v3dv_device *device,
    struct v3dv_csd_indirect_cpu_job_info *info,
    const uint32_t *wg_counts)
 {
@@ -3865,15 +4144,22 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
    submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;

-   submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
-                    (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+   uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+                          (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+   if (device->devinfo.ver < 71 ||
+       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+      submit->cfg[4] = num_batches - 1;
+   } else {
+      submit->cfg[4] = num_batches;
+   }
    assert(submit->cfg[4] != ~0);

    if (info->needs_wg_uniform_rewrite) {
       /* Make sure the GPU is not currently accessing the indirect CL for this
        * job, since we are about to overwrite some of the uniform data.
        */
-      v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);
+      v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);

       for (uint32_t i = 0; i < 3; i++) {
          if (info->wg_uniform_offsets[i]) {
@@ -3899,6 +4185,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t **wg_uniform_offsets_out,
                           uint32_t *wg_size_out)
 {
+   struct v3dv_device *device = cmd_buffer->device;
    struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
    assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
    struct v3dv_shader_variant *cs_variant =
@@ -3957,18 +4244,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
    if (wg_size_out)
       *wg_size_out = wg_size;

-   submit->cfg[4] = num_batches - 1;
+   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+   if (device->devinfo.ver < 71 ||
+       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+      submit->cfg[4] = num_batches - 1;
+   } else {
+      submit->cfg[4] = num_batches;
+   }
    assert(submit->cfg[4] != ~0);

    assert(pipeline->shared_data->assembly_bo);
    struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;

    submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
-   submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
    if (cs_variant->prog_data.base->single_seg)
       submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
    if (cs_variant->prog_data.base->threads == 4)
       submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+   /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+   if (device->devinfo.ver < 71)
+      submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;

    if (cs_variant->prog_data.cs->shared_size > 0) {
       job->csd.shared_memory =
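The compute dispatch config packs the batch count into cfg[4], and the encoding changed in V3D 7.1.6: older hardware expects the count minus one. A worked example of the arithmetic under an assumed dispatch (the values are illustrative, the version check mirrors the hunk above):

/* Assume a compute shader with wg_size = 64 invocations per workgroup,
 * dispatched as an 8x1x1 grid. Batches are groups of 16 invocations:
 *    batches per workgroup = DIV_ROUND_UP(64, 16) = 4
 *    num_batches           = 4 * (8 * 1 * 1)      = 32
 * cfg[4] is then 31 on V3D before 7.1.6 (count minus one) and 32 from
 * 7.1.6 on. */
static unsigned
csd_cfg4(unsigned ver, unsigned rev, unsigned wg_size,
         unsigned wgx, unsigned wgy, unsigned wgz)
{
   unsigned num_batches = ((wg_size + 15) / 16) * (wgx * wgy * wgz);
   if (ver < 71 || (ver == 71 && rev < 6))
      return num_batches - 1;
   return num_batches;
}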
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_device.c b/lib/mesa/src/broadcom/vulkan/v3dv_device.c
index 0590afeb7..027c35ffe 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_device.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_device.c
@@ -49,9 +49,14 @@
 #include "git_sha1.h"

 #include "util/build_id.h"
+#include "util/os_file.h"
 #include "util/u_debug.h"
 #include "util/format/u_format.h"

+#ifdef ANDROID
+#include "vk_android.h"
+#endif
+
 #ifdef VK_USE_PLATFORM_XCB_KHR
 #include <xcb/xcb.h>
 #include <xcb/dri3.h>
@@ -63,11 +68,14 @@
 #include "wayland-drm-client-protocol.h"
 #endif

-#ifndef ANDROID
-# define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
-#else
-/* Android CDD require additional extensions for API v1.1+ */
-# define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION)
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
+
+#ifdef ANDROID
+#if ANDROID_API_LEVEL <= 32
+/* Android 12.1 and lower support only Vulkan API v1.1 */
+#undef V3DV_API_VERSION
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
+#endif
 #endif

 VKAPI_ATTR VkResult VKAPI_CALL
@@ -156,9 +164,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .KHR_shader_float_controls = true,
       .KHR_shader_non_semantic_info = true,
       .KHR_sampler_mirror_clamp_to_edge = true,
-#ifndef ANDROID
       .KHR_sampler_ycbcr_conversion = true,
-#endif
       .KHR_spirv_1_4 = true,
       .KHR_storage_buffer_storage_class = true,
       .KHR_timeline_semaphore = true,
@@ -202,11 +208,237 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .EXT_tooling_info = true,
       .EXT_vertex_attribute_divisor = true,
 #ifdef ANDROID
+      .ANDROID_external_memory_android_hardware_buffer = true,
       .ANDROID_native_buffer = true,
+      .EXT_queue_family_foreign = true,
 #endif
    };
 }

+static void
+get_features(const struct v3dv_physical_device *physical_device,
+             struct vk_features *features)
+{
+   *features = (struct vk_features) {
+      /* Vulkan 1.0 */
+      .robustBufferAccess = true, /* This feature is mandatory */
+      .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+      .imageCubeArray = true,
+      .independentBlend = true,
+      .geometryShader = true,
+      .tessellationShader = false,
+      .sampleRateShading = true,
+      .dualSrcBlend = false,
+      .logicOp = true,
+      .multiDrawIndirect = false,
+      .drawIndirectFirstInstance = true,
+      .depthClamp = physical_device->devinfo.ver >= 71,
+      .depthBiasClamp = true,
+      .fillModeNonSolid = true,
+      .depthBounds = physical_device->devinfo.ver >= 71,
+      .wideLines = true,
+      .largePoints = true,
+      .alphaToOne = true,
+      .multiViewport = false,
+      .samplerAnisotropy = true,
+      .textureCompressionETC2 = true,
+      .textureCompressionASTC_LDR = true,
+      /* Note that textureCompressionBC requires that the driver support all
+       * the BC formats. V3D 4.2 only support the BC1-3, so we can't claim
+       * that we support it.
+       */
+      .textureCompressionBC = false,
+      .occlusionQueryPrecise = true,
+      .pipelineStatisticsQuery = false,
+      .vertexPipelineStoresAndAtomics = true,
+      .fragmentStoresAndAtomics = true,
+      .shaderTessellationAndGeometryPointSize = true,
+      .shaderImageGatherExtended = true,
+      .shaderStorageImageExtendedFormats = true,
+      .shaderStorageImageMultisample = false,
+      .shaderStorageImageReadWithoutFormat = true,
+      .shaderStorageImageWriteWithoutFormat = false,
+      .shaderUniformBufferArrayDynamicIndexing = false,
+      .shaderSampledImageArrayDynamicIndexing = false,
+      .shaderStorageBufferArrayDynamicIndexing = false,
+      .shaderStorageImageArrayDynamicIndexing = false,
+      .shaderClipDistance = true,
+      .shaderCullDistance = false,
+      .shaderFloat64 = false,
+      .shaderInt64 = false,
+      .shaderInt16 = false,
+      .shaderResourceResidency = false,
+      .shaderResourceMinLod = false,
+      .sparseBinding = false,
+      .sparseResidencyBuffer = false,
+      .sparseResidencyImage2D = false,
+      .sparseResidencyImage3D = false,
+      .sparseResidency2Samples = false,
+      .sparseResidency4Samples = false,
+      .sparseResidency8Samples = false,
+      .sparseResidency16Samples = false,
+      .sparseResidencyAliased = false,
+      .variableMultisampleRate = false,
+      .inheritedQueries = true,
+
+      /* Vulkan 1.1 */
+      .storageBuffer16BitAccess = true,
+      .uniformAndStorageBuffer16BitAccess = true,
+      .storagePushConstant16 = true,
+      .storageInputOutput16 = false,
+      .multiview = true,
+      .multiviewGeometryShader = false,
+      .multiviewTessellationShader = false,
+      .variablePointersStorageBuffer = true,
+      /* FIXME: this needs support for non-constant index on UBO/SSBO */
+      .variablePointers = false,
+      .protectedMemory = false,
+      .samplerYcbcrConversion = true,
+      .shaderDrawParameters = false,
+
+      /* Vulkan 1.2 */
+      .hostQueryReset = true,
+      .uniformAndStorageBuffer8BitAccess = true,
+      .uniformBufferStandardLayout = true,
+      /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and
+       * stores of vectors that cross these boundaries would not work correctly
+       * with scalarBlockLayout and would need to be split into smaller vectors
+       * (and/or scalars) that don't cross these boundaries. For load/stores
+       * with dynamic offsets where we can't identify if the offset is
+       * problematic, we would always have to scalarize. Overall, this would
+       * not lead to best performance so let's just not support it.
+       */
+      .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+      /* This tells applications 2 things:
+       *
+       * 1. If they can select just one aspect for barriers. For us barriers
+       *    decide if we need to split a job and we don't care if it is only
+       *    for one of the aspects of the image or both, so we don't really
+       *    benefit from seeing barriers that select just one aspect.
+       *
+       * 2. If they can program different layouts for each aspect. We
+       *    generally don't care about layouts, so again, we don't get any
+       *    benefits from this to limit the scope of image layout transitions.
+       *
+       * Still, Vulkan 1.2 requires this feature to be supported so we
+       * advertise it even though we don't really take advantage of it.
+       */
+      .separateDepthStencilLayouts = true,
+      .storageBuffer8BitAccess = true,
+      .storagePushConstant8 = true,
+      .imagelessFramebuffer = true,
+      .timelineSemaphore = true,
+
+      .samplerMirrorClampToEdge = true,
+
+      /* These are mandatory by Vulkan 1.2, however, we don't support any of
+       * the optional features affected by them (non 32-bit types for
+       * shaderSubgroupExtendedTypes and additional subgroup ballot for
+       * subgroupBroadcastDynamicId), so in practice setting them to true
+       * doesn't have any implications for us until we implement any of these
+       * optional features.
+       */
+      .shaderSubgroupExtendedTypes = true,
+      .subgroupBroadcastDynamicId = true,
+
+      .vulkanMemoryModel = true,
+      .vulkanMemoryModelDeviceScope = true,
+      .vulkanMemoryModelAvailabilityVisibilityChains = true,
+
+      .bufferDeviceAddress = true,
+      .bufferDeviceAddressCaptureReplay = false,
+      .bufferDeviceAddressMultiDevice = false,
+
+      /* Vulkan 1.3 */
+      .inlineUniformBlock = true,
+      /* Inline buffers work like push constants, so after their are bound
+       * some of their contents may be copied into the uniform stream as soon
+       * as the next draw/dispatch is recorded in the command buffer. This means
+       * that if the client updates the buffer contents after binding it to
+       * a command buffer, the next queue submit of that command buffer may
+       * not use the latest update to the buffer contents, but the data that
+       * was present in the buffer at the time it was bound to the command
+       * buffer.
+       */
+      .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+      .pipelineCreationCacheControl = true,
+      .privateData = true,
+      .maintenance4 = true,
+      .shaderZeroInitializeWorkgroupMemory = true,
+      .synchronization2 = true,
+      .robustImageAccess = true,
+      .shaderIntegerDotProduct = true,
+
+      /* VK_EXT_4444_formats */
+      .formatA4R4G4B4 = true,
+      .formatA4B4G4R4 = true,
+
+      /* VK_EXT_custom_border_color */
+      .customBorderColors = true,
+      .customBorderColorWithoutFormat = false,
+
+      /* VK_EXT_index_type_uint8 */
+      .indexTypeUint8 = true,
+
+      /* VK_EXT_line_rasterization */
+      .rectangularLines = true,
+      .bresenhamLines = true,
+      .smoothLines = false,
+      .stippledRectangularLines = false,
+      .stippledBresenhamLines = false,
+      .stippledSmoothLines = false,
+
+      /* VK_EXT_color_write_enable */
+      .colorWriteEnable = true,
+
+      /* VK_KHR_pipeline_executable_properties */
+      .pipelineExecutableInfo = true,
+
+      /* VK_EXT_provoking_vertex */
+      .provokingVertexLast = true,
+      /* FIXME: update when supporting EXT_transform_feedback */
+      .transformFeedbackPreservesProvokingVertex = false,
+
+      /* VK_EXT_vertex_attribute_divisor */
+      .vertexAttributeInstanceRateDivisor = true,
+      .vertexAttributeInstanceRateZeroDivisor = false,
+
+      /* VK_KHR_performance_query */
+      .performanceCounterQueryPools = physical_device->caps.perfmon,
+      .performanceCounterMultipleQueryPools = false,
+
+      /* VK_EXT_texel_buffer_alignment */
+      .texelBufferAlignment = true,
+
+      /* VK_KHR_workgroup_memory_explicit_layout */
+      .workgroupMemoryExplicitLayout = true,
+      .workgroupMemoryExplicitLayoutScalarBlockLayout = false,
+      .workgroupMemoryExplicitLayout8BitAccess = true,
+      .workgroupMemoryExplicitLayout16BitAccess = true,
+
+      /* VK_EXT_border_color_swizzle */
+      .borderColorSwizzle = true,
+      .borderColorSwizzleFromImage = true,
+
+      /* VK_EXT_shader_module_identifier */
+      .shaderModuleIdentifier = true,
+
+      /* VK_EXT_depth_clip_control */
+      .depthClipControl = true,
+
+      /* VK_EXT_attachment_feedback_loop_layout */
+      .attachmentFeedbackLoopLayout = true,
+
+      /* VK_EXT_primitive_topology_list_restart */
+      .primitiveTopologyListRestart = true,
+      /* FIXME: we don't support tessellation shaders yet */
+      .primitiveTopologyPatchListRestart = false,
+
+      /* VK_EXT_pipeline_robustness */
+      .pipelineRobustness = true,
+   };
+}
+
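The scalarBlockLayout comment in get_features() is easiest to see with a concrete layout. An illustrative sketch, not taken from the driver:

/* Illustrative layout only. With scalarBlockLayout a 16-byte vector may
 * start at a scalar-aligned offset such as 4 and so straddle the 16-byte
 * TMU access window that V3D 4.2 wraps accesses to (bytes [0,16) and
 * [16,32)); std430 rules would have aligned the vec4 to offset 16. */
struct scalar_layout_example {
   float pad;      /* bytes 0..3 */
   float vec4[4];  /* bytes 4..19: crosses the byte-16 boundary */
};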
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName,
                                           uint32_t *pPropertyCount,
                                           VkExtensionProperties *pProperties)
 {
@@ -321,8 +553,6 @@ physical_device_finish(struct v3dv_physical_device *device)
       close(device->render_fd);
    if (device->display_fd >= 0)
       close(device->display_fd);
-   if (device->master_fd >= 0)
-      close(device->master_fd);

    free(device->name);

@@ -404,273 +634,6 @@ compute_memory_budget(struct v3dv_physical_device *device)
    return MIN2(heap_size, heap_used + heap_available);
 }

-#if !using_v3d_simulator
-#ifdef VK_USE_PLATFORM_XCB_KHR
-static int
-create_display_fd_xcb(VkIcdSurfaceBase *surface)
-{
-   int fd = -1;
-
-   xcb_connection_t *conn;
-   xcb_dri3_open_reply_t *reply = NULL;
-   if (surface) {
-      if (surface->platform == VK_ICD_WSI_PLATFORM_XLIB)
-         conn = XGetXCBConnection(((VkIcdSurfaceXlib *)surface)->dpy);
-      else
-         conn = ((VkIcdSurfaceXcb *)surface)->connection;
-   } else {
-      conn = xcb_connect(NULL, NULL);
-   }
-
-   if (xcb_connection_has_error(conn))
-      goto finish;
-
-   const xcb_setup_t *setup = xcb_get_setup(conn);
-   xcb_screen_iterator_t iter = xcb_setup_roots_iterator(setup);
-   xcb_screen_t *screen = iter.data;
-
-   xcb_dri3_open_cookie_t cookie;
-   cookie = xcb_dri3_open(conn, screen->root, None);
-   reply = xcb_dri3_open_reply(conn, cookie, NULL);
-   if (!reply)
-      goto finish;
-
-   if (reply->nfd != 1)
-      goto finish;
-
-   fd = xcb_dri3_open_reply_fds(conn, reply)[0];
-   fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
-
-finish:
-   if (!surface)
-      xcb_disconnect(conn);
-   if (reply)
-      free(reply);
-
-   return fd;
-}
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-struct v3dv_wayland_info {
-   struct wl_drm *wl_drm;
-   int fd;
-   bool is_set;
-   bool authenticated;
-};
-
-static void
-v3dv_drm_handle_device(void *data, struct wl_drm *drm, const char *device)
-{
-   struct v3dv_wayland_info *info = data;
-   info->fd = open(device, O_RDWR | O_CLOEXEC);
-   info->is_set = info->fd != -1;
-   if (!info->is_set) {
-      fprintf(stderr, "v3dv_drm_handle_device: could not open %s (%s)\n",
-              device, strerror(errno));
-      return;
-   }
-
-   drm_magic_t magic;
-   if (drmGetMagic(info->fd, &magic)) {
-      fprintf(stderr, "v3dv_drm_handle_device: drmGetMagic failed\n");
-      close(info->fd);
-      info->fd = -1;
-      info->is_set = false;
-      return;
-   }
-   wl_drm_authenticate(info->wl_drm, magic);
-}
-
-static void
-v3dv_drm_handle_format(void *data, struct wl_drm *drm, uint32_t format)
-{
-}
-
-static void
-v3dv_drm_handle_authenticated(void *data, struct wl_drm *drm)
-{
-   struct v3dv_wayland_info *info = data;
-   info->authenticated = true;
-}
-
-static void
-v3dv_drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value)
-{
-}
-
-struct wl_drm_listener v3dv_drm_listener = {
-   .device = v3dv_drm_handle_device,
-   .format = v3dv_drm_handle_format,
-   .authenticated = v3dv_drm_handle_authenticated,
-   .capabilities = v3dv_drm_handle_capabilities
-};
-
-static void
-v3dv_registry_global(void *data,
-                     struct wl_registry *registry,
-                     uint32_t name,
-                     const char *interface,
-                     uint32_t version)
-{
-   struct v3dv_wayland_info *info = data;
-   if (strcmp(interface, wl_drm_interface.name) == 0) {
-      info->wl_drm = wl_registry_bind(registry, name, &wl_drm_interface,
-                                      MIN2(version, 2));
-      wl_drm_add_listener(info->wl_drm, &v3dv_drm_listener, data);
-   };
-}
-
-static void
-v3dv_registry_global_remove_cb(void *data,
-                               struct wl_registry *registry,
-                               uint32_t name)
-{
-}
-
-static int
-create_display_fd_wayland(VkIcdSurfaceBase *surface)
-{
-   struct wl_display *display;
-   struct wl_registry *registry = NULL;
-
-   struct v3dv_wayland_info info = {
-      .wl_drm = NULL,
-      .fd = -1,
-      .is_set = false,
-      .authenticated = false
-   };
-
-   if (surface)
-      display = ((VkIcdSurfaceWayland *) surface)->display;
-   else
-      display = wl_display_connect(NULL);
-
-   if (!display)
-      return -1;
-
-   registry = wl_display_get_registry(display);
-   if (!registry) {
-      if (!surface)
-         wl_display_disconnect(display);
-      return -1;
-   }
-
-   static const struct wl_registry_listener registry_listener = {
-      v3dv_registry_global,
-      v3dv_registry_global_remove_cb
-   };
-   wl_registry_add_listener(registry, &registry_listener, &info);
-
-   wl_display_roundtrip(display); /* For the registry advertisement */
-   wl_display_roundtrip(display); /* For the DRM device event */
-   wl_display_roundtrip(display); /* For the authentication event */
-
-   wl_drm_destroy(info.wl_drm);
-   wl_registry_destroy(registry);
-
-   if (!surface)
-      wl_display_disconnect(display);
-
-   if (!info.is_set)
-      return -1;
-
-   if (!info.authenticated)
-      return -1;
-
-   return info.fd;
-}
-#endif
-
-/* Acquire an authenticated display fd without a surface reference. This is the
- * case where the application is making WSI allocations outside the Vulkan
- * swapchain context (only Zink, for now). Since we lack information about the
- * underlying surface we just try our best to figure out the correct display
- * and platform to use. It should work in most cases.
- */
-static void
-acquire_display_device_no_surface(struct v3dv_physical_device *pdevice)
-{
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-   pdevice->display_fd = create_display_fd_wayland(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_XCB_KHR
-   if (pdevice->display_fd == -1)
-      pdevice->display_fd = create_display_fd_xcb(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
-   if (pdevice->display_fd == -1 && pdevice->master_fd >= 0)
-      pdevice->display_fd = dup(pdevice->master_fd);
-#endif
-}
-
-/* Acquire an authenticated display fd from the surface. This is the regular
- * case where the application is using swapchains to create WSI allocations.
- * In this case we use the surface information to figure out the correct
- * display and platform combination.
- */
-static void
-acquire_display_device_surface(struct v3dv_physical_device *pdevice,
-                               VkIcdSurfaceBase *surface)
-{
-   /* Mesa will set both of VK_USE_PLATFORM_{XCB,XLIB} when building with
-    * platform X11, so only check for XCB and rely on XCB to get an
-    * authenticated device also for Xlib.
-    */
-#ifdef VK_USE_PLATFORM_XCB_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_XCB ||
-       surface->platform == VK_ICD_WSI_PLATFORM_XLIB) {
-      pdevice->display_fd = create_display_fd_xcb(surface);
-   }
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_WAYLAND)
-      pdevice->display_fd = create_display_fd_wayland(surface);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY &&
-       pdevice->master_fd >= 0) {
-      pdevice->display_fd = dup(pdevice->master_fd);
-   }
-#endif
-}
-#endif /* !using_v3d_simulator */
-
-/* Attempts to get an authenticated display fd from the display server that
- * we can use to allocate BOs for presentable images.
- */ -VkResult -v3dv_physical_device_acquire_display(struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) -{ - VkResult result = VK_SUCCESS; - mtx_lock(&pdevice->mutex); - - if (pdevice->display_fd != -1) - goto done; - - /* When running on the simulator we do everything on a single render node so - * we don't need to get an authenticated display fd from the display server. - */ -#if !using_v3d_simulator - if (surface) - acquire_display_device_surface(pdevice, surface); - else - acquire_display_device_no_surface(pdevice); - - if (pdevice->display_fd == -1) - result = VK_ERROR_INITIALIZATION_FAILED; -#endif - -done: - mtx_unlock(&pdevice->mutex); - return result; -} - static bool v3d_has_feature(struct v3dv_physical_device *device, enum drm_v3d_param feature) { @@ -763,11 +726,11 @@ v3dv_physical_device_init_disk_cache(struct v3dv_physical_device *device) static VkResult create_physical_device(struct v3dv_instance *instance, - drmDevicePtr drm_render_device, - drmDevicePtr drm_primary_device) + drmDevicePtr gpu_device, + drmDevicePtr display_device) { VkResult result = VK_SUCCESS; - int32_t master_fd = -1; + int32_t display_fd = -1; int32_t render_fd = -1; struct v3dv_physical_device *device = @@ -783,14 +746,14 @@ create_physical_device(struct v3dv_instance *instance, vk_physical_device_dispatch_table_from_entrypoints( &dispatch_table, &wsi_physical_device_entrypoints, false); - result = vk_physical_device_init(&device->vk, &instance->vk, NULL, - &dispatch_table); + result = vk_physical_device_init(&device->vk, &instance->vk, NULL, NULL, + NULL, &dispatch_table); if (result != VK_SUCCESS) goto fail; - assert(drm_render_device); - const char *path = drm_render_device->nodes[DRM_NODE_RENDER]; + assert(gpu_device); + const char *path = gpu_device->nodes[DRM_NODE_RENDER]; render_fd = open(path, O_RDWR | O_CLOEXEC); if (render_fd < 0) { fprintf(stderr, "Opening %s failed: %s\n", path, strerror(errno)); @@ -805,12 +768,12 @@ create_physical_device(struct v3dv_instance *instance, const char *primary_path; #if !using_v3d_simulator - if (drm_primary_device) - primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; + if (display_device) + primary_path = display_device->nodes[DRM_NODE_PRIMARY]; else primary_path = NULL; #else - primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; + primary_path = gpu_device->nodes[DRM_NODE_PRIMARY]; #endif struct stat primary_stat = {0}, render_stat = {0}; @@ -837,20 +800,23 @@ create_physical_device(struct v3dv_instance *instance, device->render_devid = render_stat.st_rdev; #if using_v3d_simulator - device->device_id = drm_render_device->deviceinfo.pci->device_id; + device->device_id = gpu_device->deviceinfo.pci->device_id; #endif if (instance->vk.enabled_extensions.KHR_display || + instance->vk.enabled_extensions.KHR_xcb_surface || + instance->vk.enabled_extensions.KHR_xlib_surface || + instance->vk.enabled_extensions.KHR_wayland_surface || instance->vk.enabled_extensions.EXT_acquire_drm_display) { #if !using_v3d_simulator /* Open the primary node on the vc4 display device */ - assert(drm_primary_device); - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + assert(display_device); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #else /* There is only one device with primary and render nodes. * Open its primary node. 
*/ - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #endif } @@ -859,8 +825,7 @@ create_physical_device(struct v3dv_instance *instance, #endif device->render_fd = render_fd; /* The v3d render node */ - device->display_fd = -1; /* Authenticated vc4 primary node */ - device->master_fd = master_fd; /* Master vc4 primary node */ + device->display_fd = display_fd; /* Master vc4 primary node */ if (!v3d_get_device_info(device->render_fd, &device->devinfo, &v3dv_ioctl)) { result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, @@ -895,8 +860,10 @@ create_physical_device(struct v3dv_instance *instance, device->next_program_id = 0; ASSERTED int len = - asprintf(&device->name, "V3D %d.%d", - device->devinfo.ver / 10, device->devinfo.ver % 10); + asprintf(&device->name, "V3D %d.%d.%d", + device->devinfo.ver / 10, + device->devinfo.ver % 10, + device->devinfo.rev); assert(len != -1); v3dv_physical_device_init_disk_cache(device); @@ -928,36 +895,6 @@ create_physical_device(struct v3dv_instance *instance, */ device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; -#if using_v3d_simulator - /* There are CTS tests which do the following: - * - * 1. Create a command buffer with a vkCmdWaitEvents() - * 2. Submit the command buffer - * 3. vkGetSemaphoreFdKHR() to try to get a sync_file - * 4. vkSetEvent() - * - * This deadlocks in the simulator because we have to wait for the syncobj - * to get a real fence in vkGetSemaphoreFdKHR(). This will never happen - * though because the simulator, unlike real hardware, executes ioctls - * synchronously in the same thread, which means that it will try to - * execute the wait for event immediately and never get to emit the - * signaling job that comes after the compute job that implements the wait - * in the command buffer, which would be responsible for creating the fence - * for the signaling semaphore. - * - * This behavior was seemingly allowed in previous Vulkan versions, however, - * this was fixed in the Vulkan 1.3.228 spec. From commit 355367640f2e: - * - * "Clarify that vkCmdWaitEvents must not execute before a vkSetEvent it - * waits on (internal issue 2971)" - * - * Either way, we disable sync file support in the simulator for now, until - * the CTS is fixed. - */ - device->drm_syncobj_type.import_sync_file = NULL; - device->drm_syncobj_type.export_sync_file = NULL; -#endif - /* Multiwait is required for emulated timeline semaphores and is supported * by the v3d kernel interface. */ @@ -978,6 +915,7 @@ create_physical_device(struct v3dv_instance *instance, } get_device_extensions(device, &device->vk.supported_extensions); + get_features(device, &device->vk.supported_features); mtx_init(&device->mutex, mtx_plain); @@ -991,8 +929,8 @@ fail: if (render_fd >= 0) close(render_fd); - if (master_fd >= 0) - close(master_fd); + if (display_fd >= 0) + close(display_fd); return result; } @@ -1035,14 +973,13 @@ enumerate_devices(struct vk_instance *vk_instance) break; } #else - /* On actual hardware, we should have a render node (v3d) - * and a primary node (vc4). We will need to use the primary - * to allocate WSI buffers and share them with the render node - * via prime, but that is a privileged operation so we need the - * primary node to be authenticated, and for that we need the - * display server to provide the device fd (with DRI3), so - * here we only check that the device is present but we don't - * try to open it.
+ /* On actual hardware, we should have a gpu device (v3d) and a display + * device (vc4). We will need to use the display device to allocate WSI + * buffers and share them with the render node via prime, but that is a + * privileged operation so we need to have an authenticated display fd + * and for that we need the display server to provide it (with DRI3), + * so here we only check that the device is present but we don't try to + * open it. */ if (devices[i]->bustype != DRM_BUS_PLATFORM) continue; @@ -1050,7 +987,8 @@ if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { + if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || + strncmp(*compat, "brcm,2712-v3d", 13) == 0) { v3d_idx = i; break; } @@ -1059,8 +997,9 @@ } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || - strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { + if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || + strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || + strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { vc4_idx = i; break; } @@ -1071,9 +1010,10 @@ } #if !using_v3d_simulator - if (v3d_idx != -1 && vc4_idx != -1) { - result = - create_physical_device(instance, devices[v3d_idx], devices[vc4_idx]); + if (v3d_idx != -1) { + drmDevicePtr v3d_device = devices[v3d_idx]; + drmDevicePtr vc4_device = vc4_idx != -1 ? devices[vc4_idx] : NULL; + result = create_physical_device(instance, v3d_device, vc4_device); } #endif @@ -1082,238 +1022,6 @@ return result; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures2 *pFeatures) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice); - - struct vk_features features = { - /* Vulkan 1.0 */ - .robustBufferAccess = true, /* This feature is mandatory */ - .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = false, - .sampleRateShading = true, - .dualSrcBlend = false, - .logicOp = true, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = true, - .depthClamp = false, /* Only available since V3D 4.5.1.1 */ - .depthBiasClamp = true, - .fillModeNonSolid = true, - .depthBounds = false, /* Only available since V3D 4.3.16.2 */ - .wideLines = true, - .largePoints = true, - .alphaToOne = true, - .multiViewport = false, - .samplerAnisotropy = true, - .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = true, - /* Note that textureCompressionBC requires that the driver support all - * the BC formats. V3D 4.2 only supports the BC1-3, so we can't claim - * that we support it.
- */ - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = false, - .shaderStorageImageExtendedFormats = true, - .shaderStorageImageMultisample = false, - .shaderStorageImageReadWithoutFormat = true, - .shaderStorageImageWriteWithoutFormat = false, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = true, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = true, - - /* Vulkan 1.1 */ - .storageBuffer16BitAccess = true, - .uniformAndStorageBuffer16BitAccess = true, - .storagePushConstant16 = true, - .storageInputOutput16 = false, - .multiview = true, - .multiviewGeometryShader = false, - .multiviewTessellationShader = false, - .variablePointersStorageBuffer = true, - /* FIXME: this needs support for non-constant index on UBO/SSBO */ - .variablePointers = false, - .protectedMemory = false, -#ifdef ANDROID - .samplerYcbcrConversion = false, -#else - .samplerYcbcrConversion = true, -#endif - .shaderDrawParameters = false, - - /* Vulkan 1.2 */ - .hostQueryReset = true, - .uniformAndStorageBuffer8BitAccess = true, - .uniformBufferStandardLayout = true, - /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and - * stores of vectors that cross these boundaries would not work correctly - * with scalarBlockLayout and would need to be split into smaller vectors - * (and/or scalars) that don't cross these boundaries. For load/stores - * with dynamic offsets where we can't identify if the offset is - * problematic, we would always have to scalarize. Overall, this would - * not lead to best performance so let's just not support it. - */ - .scalarBlockLayout = false, - /* This tells applications 2 things: - * - * 1. If they can select just one aspect for barriers. For us barriers - * decide if we need to split a job and we don't care if it is only - * for one of the aspects of the image or both, so we don't really - * benefit from seeing barriers that select just one aspect. - * - * 2. If they can program different layouts for each aspect. We - * generally don't care about layouts, so again, we don't get any - * benefits from this to limit the scope of image layout transitions. - * - * Still, Vulkan 1.2 requires this feature to be supported so we - * advertise it even though we don't really take advantage of it.
- */ - .separateDepthStencilLayouts = true, - .storageBuffer8BitAccess = true, - .storagePushConstant8 = true, - .imagelessFramebuffer = true, - .timelineSemaphore = true, - - .samplerMirrorClampToEdge = true, - - /* These are mandatory by Vulkan 1.2, however, we don't support any of - * the optional features affected by them (non 32-bit types for - * shaderSubgroupExtendedTypes and additional subgroup ballot for - * subgroupBroadcastDynamicId), so in practice setting them to true - * doesn't have any implications for us until we implement any of these - * optional features. - */ - .shaderSubgroupExtendedTypes = true, - .subgroupBroadcastDynamicId = true, - - .vulkanMemoryModel = true, - .vulkanMemoryModelDeviceScope = true, - .vulkanMemoryModelAvailabilityVisibilityChains = true, - - .bufferDeviceAddress = true, - .bufferDeviceAddressCaptureReplay = false, - .bufferDeviceAddressMultiDevice = false, - - /* Vulkan 1.3 */ - .inlineUniformBlock = true, - /* Inline buffers work like push constants, so after they are bound - * some of their contents may be copied into the uniform stream as soon - * as the next draw/dispatch is recorded in the command buffer. This means - * that if the client updates the buffer contents after binding it to - * a command buffer, the next queue submit of that command buffer may - * not use the latest update to the buffer contents, but the data that - * was present in the buffer at the time it was bound to the command - * buffer. - */ - .descriptorBindingInlineUniformBlockUpdateAfterBind = false, - .pipelineCreationCacheControl = true, - .privateData = true, - .maintenance4 = true, - .shaderZeroInitializeWorkgroupMemory = true, - .synchronization2 = true, - .robustImageAccess = true, - .shaderIntegerDotProduct = true, - - /* VK_EXT_4444_formats */ - .formatA4R4G4B4 = true, - .formatA4B4G4R4 = true, - - /* VK_EXT_custom_border_color */ - .customBorderColors = true, - .customBorderColorWithoutFormat = false, - - /* VK_EXT_index_type_uint8 */ - .indexTypeUint8 = true, - - /* VK_EXT_line_rasterization */ - .rectangularLines = true, - .bresenhamLines = true, - .smoothLines = false, - .stippledRectangularLines = false, - .stippledBresenhamLines = false, - .stippledSmoothLines = false, - - /* VK_EXT_color_write_enable */ - .colorWriteEnable = true, - - /* VK_KHR_pipeline_executable_properties */ - .pipelineExecutableInfo = true, - - /* VK_EXT_provoking_vertex */ - .provokingVertexLast = true, - /* FIXME: update when supporting EXT_transform_feedback */ - .transformFeedbackPreservesProvokingVertex = false, - - /* VK_EXT_vertex_attribute_divisor */ - .vertexAttributeInstanceRateDivisor = true, - .vertexAttributeInstanceRateZeroDivisor = false, - - /* VK_KHR_performance_query */ - .performanceCounterQueryPools = physical_device->caps.perfmon, - .performanceCounterMultipleQueryPools = false, - - /* VK_EXT_texel_buffer_alignment */ - .texelBufferAlignment = true, - - /* VK_KHR_workgroup_memory_explicit_layout */ - .workgroupMemoryExplicitLayout = true, - .workgroupMemoryExplicitLayoutScalarBlockLayout = false, - .workgroupMemoryExplicitLayout8BitAccess = true, - .workgroupMemoryExplicitLayout16BitAccess = true, - - /* VK_EXT_border_color_swizzle */ - .borderColorSwizzle = true, - .borderColorSwizzleFromImage = true, - - /* VK_EXT_shader_module_identifier */ - .shaderModuleIdentifier = true, - - /* VK_EXT_depth_clip_control */ - .depthClipControl = true, - - /* VK_EXT_attachment_feedback_loop_layout */ - .attachmentFeedbackLoopLayout = true, - - /*
VK_EXT_primitive_topology_list_restart */ - .primitiveTopologyListRestart = true, - /* FIXME: we don't support tessellation shaders yet */ - .primitiveTopologyPatchListRestart = false, - - /* VK_EXT_pipeline_robustness */ - .pipelineRobustness = true, - }; - - vk_get_physical_device_features(pFeatures, &features); -} - uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev) { @@ -1329,6 +1037,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) switch (dev->devinfo.ver) { case 42: return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + case 71: + return 0x55701C33; /* Broadcom deviceID for 2712 */ default: unreachable("Unsupported V3D version"); } @@ -1357,6 +1067,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, const VkSampleCountFlags supported_sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); + struct timespec clock_res; clock_getres(CLOCK_MONOTONIC, &clock_res); const float timestamp_period = @@ -1427,7 +1139,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxFragmentInputComponents = max_varying_components, .maxFragmentOutputAttachments = 4, .maxFragmentDualSrcAttachments = 0, - .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + + .maxFragmentCombinedOutputResources = max_rts + MAX_STORAGE_BUFFERS + MAX_STORAGE_IMAGES, @@ -1440,7 +1152,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .subPixelPrecisionBits = V3D_COORD_SHIFT, .subTexelPrecisionBits = 8, .mipmapPrecisionBits = 8, - .maxDrawIndexedIndexValue = 0x00ffffff, + .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? + 0xffffffff : 0x00ffffff, .maxDrawIndirectCount = 0x7fffffff, .maxSamplerLodBias = 14.0f, .maxSamplerAnisotropy = 16.0f, @@ -1467,7 +1180,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .framebufferDepthSampleCounts = supported_sample_counts, .framebufferStencilSampleCounts = supported_sample_counts, .framebufferNoAttachmentsSampleCounts = supported_sample_counts, - .maxColorAttachments = MAX_RENDER_TARGETS, + .maxColorAttachments = max_rts, .sampledImageColorSampleCounts = supported_sample_counts, .sampledImageIntegerSampleCounts = supported_sample_counts, .sampledImageDepthSampleCounts = supported_sample_counts, @@ -1579,8 +1292,8 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .driverID = VK_DRIVER_ID_MESA_V3DV, .conformanceVersion = { .major = 1, - .minor = 2, - .subminor = 7, + .minor = 3, + .subminor = 6, .patch = 1, }, .supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, @@ -1689,6 +1402,24 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->allowCommandBufferQueryCopies = true; break; } +#ifdef ANDROID +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + uint64_t front_rendering_usage = 0; + struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + if (gralloc != NULL) { + u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage); + u_gralloc_destroy(&gralloc); + } + props->sharedImage = front_rendering_usage ? 
VK_TRUE + : VK_FALSE; + break; + } +#pragma GCC diagnostic pop +#endif case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { VkPhysicalDeviceDrmPropertiesEXT *props = (VkPhysicalDeviceDrmPropertiesEXT *)ext; @@ -2001,6 +1732,11 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, return vk_error(NULL, result); } +#ifdef ANDROID + device->gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + assert(device->gralloc); +#endif + device->instance = instance; device->pdevice = physical_device; @@ -2034,7 +1770,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = - v3dv_pipeline_create_default_attribute_values(device, NULL); + v3dv_X(device, create_default_attribute_values)(device, NULL); device->device_address_mem_ctx = ralloc_context(NULL); util_dynarray_init(&device->device_address_bo_list, @@ -2067,6 +1803,9 @@ fail: v3dv_event_free_resources(device); v3dv_query_free_resources(device); vk_device_finish(&device->vk); +#ifdef ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free(&device->vk.alloc, device); return result; @@ -2105,6 +1844,9 @@ v3dv_DestroyDevice(VkDevice _device, mtx_destroy(&device->query_mutex); vk_device_finish(&device->vk); +#ifdef ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free2(&device->vk.alloc, pAllocator, device); } @@ -2244,18 +1986,8 @@ device_alloc_for_wsi(struct v3dv_device *device, #if using_v3d_simulator return device_alloc(device, mem, size); #else - /* If we are allocating for WSI we should have a swapchain and thus, - * we should've initialized the display device. However, Zink doesn't - * use swapchains, so in that case we can get here without acquiring the - * display device and we need to do it now. - */ VkResult result; struct v3dv_physical_device *pdevice = device->pdevice; - if (unlikely(pdevice->display_fd < 0)) { - result = v3dv_physical_device_acquire_display(pdevice, NULL); - if (result != VK_SUCCESS) - return result; - } assert(pdevice->display_fd != -1); mem->is_for_wsi = true; @@ -2329,7 +2061,7 @@ free_memory(struct v3dv_device *device, device_free(device, mem); - vk_object_free(&device->vk, pAllocator, mem); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); } VKAPI_ATTR void VKAPI_CALL @@ -2354,13 +2086,10 @@ v3dv_AllocateMemory(VkDevice _device, assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */ - assert(pAllocateInfo->allocationSize > 0); - /* We always allocate device memory in multiples of a page, so round up * requested size to that. 
*/ - const VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096); + const VkDeviceSize alloc_size = align64(pAllocateInfo->allocationSize, 4096); if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); @@ -2369,8 +2098,8 @@ v3dv_AllocateMemory(VkDevice _device, if (unlikely(heap_used + alloc_size > pdevice->memory.memoryHeaps[0].size)) return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - mem = vk_object_zalloc(&device->vk, pAllocator, sizeof(*mem), - VK_OBJECT_TYPE_DEVICE_MEMORY); + mem = vk_device_memory_create(&device->vk, pAllocateInfo, + pAllocator, sizeof(*mem)); if (mem == NULL) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -2410,6 +2139,7 @@ v3dv_AllocateMemory(VkDevice _device, } VkResult result; + if (wsi_info) { result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); } else if (fd_info && fd_info->handleType) { @@ -2419,12 +2149,22 @@ v3dv_AllocateMemory(VkDevice _device, fd_info->fd, alloc_size, &mem->bo); if (result == VK_SUCCESS) close(fd_info->fd); + } else if (mem->vk.ahardware_buffer) { +#ifdef ANDROID + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + assert(handle->numFds > 0); + size_t size = lseek(handle->data[0], 0, SEEK_END); + result = device_import_bo(device, pAllocator, + handle->data[0], size, &mem->bo); +#else + result = VK_ERROR_FEATURE_NOT_PRESENT; +#endif } else { result = device_alloc(device, mem, alloc_size); } if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, mem); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); return vk_error(device, result); } @@ -2436,7 +2176,7 @@ v3dv_AllocateMemory(VkDevice _device, /* If this memory can be used via VK_KHR_buffer_device_address then we * will need to manually add the BO to any job submit that makes use of - * VK_KHR_buffer_device_address, since such jobs may produde buffer + * VK_KHR_buffer_device_address, since such jobs may produce buffer * load/store operations that may access any buffer memory allocated with * this flag and we don't have any means to tell which buffers will be * accessed through this mechanism since they don't even have to be bound @@ -2668,11 +2408,44 @@ v3dv_BindImageMemory2(VkDevice _device, const VkBindImageMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) { -#ifndef ANDROID +#ifdef ANDROID + V3DV_FROM_HANDLE(v3dv_device_memory, mem, pBindInfos[i].memory); + V3DV_FROM_HANDLE(v3dv_device, device, _device); + if (mem != NULL && mem->vk.ahardware_buffer) { + AHardwareBuffer_Desc description; + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + + V3DV_FROM_HANDLE(v3dv_image, image, pBindInfos[i].image); + AHardwareBuffer_describe(mem->vk.ahardware_buffer, &description); + + struct u_gralloc_buffer_handle gr_handle = { + .handle = handle, + .pixel_stride = description.stride, + .hal_format = description.format, + }; + + VkResult result = v3dv_gralloc_to_drm_explicit_layout( + device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + return result; + + result = v3dv_update_image_layout( + device, image, image->android_explicit_layout->drmFormatModifier, + /* disjoint = */ false, image->android_explicit_layout); + if (result != VK_SUCCESS) + return result; + } +#endif + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = vk_find_struct_const(pBindInfos->pNext, 
BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); if (swapchain_info && swapchain_info->swapchain) { +#ifndef ANDROID struct v3dv_image *swapchain_image = v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, swapchain_info->imageIndex); @@ -2685,8 +2458,8 @@ v3dv_BindImageMemory2(VkDevice _device, .memoryOffset = swapchain_image->planes[0].mem_offset, }; bind_image_memory(&swapchain_bind); - } else #endif + } else { bind_image_memory(&pBindInfos[i]); } @@ -2716,6 +2489,18 @@ get_buffer_memory_requirements(struct v3dv_buffer *buffer, .size = align64(buffer->size, buffer->alignment), }; + /* UBO and SSBO may be read using ldunifa, which prefetches the next + * 4 bytes after a read. If the buffer's size is exactly a multiple + * of a page size and the shader reads the last 4 bytes with ldunifa + * the prefetching would read out of bounds and cause an MMU error, + * so we allocate extra space to avoid kernel error spamming. + */ + bool can_ldunifa = buffer->usage & + (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + if (can_ldunifa && (buffer->size % 4096 == 0)) + pMemoryRequirements->memoryRequirements.size += buffer->alignment; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2978,7 +2763,7 @@ v3dv_CreateSampler(VkDevice _device, } } - v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); + v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); @@ -3079,9 +2864,9 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) * * - Loader interface v4 differs from v3 in: * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). - * + * * - Loader interface v5 differs from v4 in: - * - The ICD must support Vulkan API version 1.1 and must not return + * - The ICD must support Vulkan API version 1.1 and must not return * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a * Vulkan Loader with interface v4 or smaller is being used and the * application provides an API version that is greater than 1.0. 
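The ldunifa padding in get_buffer_memory_requirements above is subtle, so here is a minimal standalone sketch of the rule, not driver code: padded_buffer_size and PAGE_SIZE are hypothetical names, and the 4 KiB page size is assumed to match the align64(..., 4096) used elsewhere in this diff.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096u /* assumed page size */

/* Round the size up to the buffer's alignment; if the shader may read the
 * buffer with ldunifa (uniform/storage usage) and the size is an exact
 * multiple of the page size, reserve one extra alignment's worth of bytes
 * so the 4-byte prefetch past the last word cannot touch an unmapped page.
 */
static uint64_t
padded_buffer_size(uint64_t size, uint64_t alignment, bool can_ldunifa)
{
   uint64_t padded = (size + alignment - 1) / alignment * alignment;
   if (can_ldunifa && size % PAGE_SIZE == 0)
      padded += alignment;
   return padded;
}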
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_event.c b/lib/mesa/src/broadcom/vulkan/v3dv_event.c index 966392400..a3aad37d9 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_event.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_event.c @@ -33,20 +33,16 @@ get_set_event_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "set event cs"); - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *value = + nir_def *value = nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 4, .range = 4); nir_store_ssbo(&b, value, buf, offset, @@ -62,23 +58,19 @@ get_wait_event_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "wait event cs"); - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_loop *loop = nir_push_loop(&b); - nir_ssa_def *load = + nir_def *load = nir_load_ssbo(&b, 1, 8, buf, offset, .access = 0, .align_mul = 4); - nir_ssa_def *value = nir_i2i32(&b, load); + nir_def *value = nir_i2i32(&b, load); nir_if *if_stmt = nir_push_if(&b, nir_ieq_imm(&b, value, 1)); nir_jump(&b, nir_jump_break); diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c index ecb369963..01be6dcf4 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c @@ -22,13 +22,18 @@ */ #include "v3dv_private.h" -#include "vk_util.h" +#ifdef ANDROID +#include "vk_android.h" +#endif #include "vk_enum_defines.h" +#include "vk_util.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "vulkan/wsi/wsi_common.h" +#include <vulkan/vulkan_android.h> + const uint8_t * v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, uint8_t plane) { @@ -169,6 +174,7 @@ image_format_plane_features(struct v3dv_physical_device *pdevice, if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32 || vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { /* To comply with shaderStorageImageExtendedFormats */ @@ -291,7 +297,8 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT_KHR; } - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) { + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT | VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; @@ -658,6 +665,7 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, 
const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_format_mod_info = NULL; VkExternalImageFormatProperties *external_props = NULL; + UNUSED VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkImageTiling tiling = base_info->tiling; @@ -698,6 +706,9 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: external_props = (void *) s; break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *)s; + break; case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: ycbcr_props = (void *) s; break; @@ -721,12 +732,28 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, if (external_props) external_props->externalMemoryProperties = prime_fd_props; break; +#ifdef ANDROID + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (external_props) { + external_props->externalMemoryProperties.exportFromImportedHandleTypes = 0; + external_props->externalMemoryProperties.compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + external_props->externalMemoryProperties.externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT | VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; + } + break; +#endif default: result = VK_ERROR_FORMAT_NOT_SUPPORTED; break; } } + if (android_usage) { +#ifdef ANDROID + android_usage->androidHardwareBufferUsage = + vk_image_usage_to_ahb_usage(base_info->flags, base_info->usage); +#endif + } + done: return result; } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_image.c b/lib/mesa/src/broadcom/vulkan/v3dv_image.c index 325cc7ce3..c02516960 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_image.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_image.c @@ -28,6 +28,9 @@ #include "util/u_math.h" #include "vk_util.h" #include "vulkan/wsi/wsi_common.h" +#ifdef ANDROID +#include "vk_android.h" +#endif /** * Computes the HW's UIFblock padding for a given height/cpp. @@ -70,27 +73,38 @@ v3d_get_ub_pad(uint32_t cpp, uint32_t height) return 0; } -static void +/** + * Computes the dimension with required padding for mip levels. + * + * This padding is required for width and height dimensions when the mip + * level is greater than 1, and for the depth dimension when the mip level + * is greater than 0. This function expects to be passed a mip level >= 1. + * + * Note: Hardware documentation seems to suggest that the third argument + * should be the utile dimensions, but through testing it was found that + * the block dimension should be used instead. 
+ */ +static uint32_t +v3d_get_dimension_mpad(uint32_t dimension, uint32_t level, uint32_t block_dimension) +{ + assert(level >= 1); + uint32_t pot_dim = u_minify(dimension, 1); + pot_dim = util_next_power_of_two(DIV_ROUND_UP(pot_dim, block_dimension)); + uint32_t padded_dim = block_dimension * pot_dim; + return u_minify(padded_dim, level - 1); +} + +static bool v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, - uint32_t plane_offset) + uint32_t plane_offset, + const VkSubresourceLayout *plane_layouts) { assert(image->planes[plane].cpp > 0); - /* Texture Base Adress needs to be 64-byte aligned */ - assert(plane_offset % 64 == 0); uint32_t width = image->planes[plane].width; uint32_t height = image->planes[plane].height; uint32_t depth = image->vk.extent.depth; - /* Note that power-of-two padding is based on level 1. These are not - * equivalent to just util_next_power_of_two(dimension), because at a - * level 0 dimension of 9, the level 1 power-of-two padded value is 4, - * not 8. - */ - uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1)); - uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1)); - uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1)); - uint32_t utile_w = v3d_utile_width(image->planes[plane].cpp); uint32_t utile_h = v3d_utile_height(image->planes[plane].cpp); uint32_t uif_block_w = utile_w * 2; @@ -99,6 +113,21 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, uint32_t block_width = vk_format_get_blockwidth(image->vk.format); uint32_t block_height = vk_format_get_blockheight(image->vk.format); + /* Note that power-of-two padding is based on level 1. These are not + * equivalent to just util_next_power_of_two(dimension), because at a + * level 0 dimension of 9, the level 1 power-of-two padded value is 4, + * not 8. Additionally the pot padding is based on the block size. + */ + uint32_t pot_width = 2 * v3d_get_dimension_mpad(width, + 1, + block_width); + uint32_t pot_height = 2 * v3d_get_dimension_mpad(height, + 1, + block_height); + uint32_t pot_depth = 2 * v3d_get_dimension_mpad(depth, + 1, + 1); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || image->vk.samples == VK_SAMPLE_COUNT_4_BIT); bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; @@ -109,14 +138,30 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, assert(depth > 0); assert(image->vk.mip_levels >= 1); - uint32_t offset = plane_offset; + /* Texture Base Address needs to be 64-byte aligned. If we have an explicit + * plane layout we will return false to fail image creation with appropriate + * error code. 
+ */ + uint32_t offset; + if (plane_layouts) { + offset = plane_layouts[plane].offset; + if (offset % 64 != 0) + return false; + } else { + offset = plane_offset; + } + assert(plane_offset % 64 == 0); + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { struct v3d_resource_slice *slice = &image->planes[plane].slices[i]; + slice->width = u_minify(width, i); + slice->height = u_minify(height, i); + uint32_t level_width, level_height, level_depth; if (i < 2) { - level_width = u_minify(width, i); - level_height = u_minify(height, i); + level_width = slice->width; + level_height = slice->height; } else { level_width = u_minify(pot_width, i); level_height = u_minify(pot_height, i); @@ -179,6 +224,18 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, slice->offset = offset; slice->stride = level_width * image->planes[plane].cpp; + + /* We assume that rowPitch in the plane layout refers to level 0 */ + if (plane_layouts && i == 0) { + if (plane_layouts[plane].rowPitch < slice->stride) + return false; + if (plane_layouts[plane].rowPitch % image->planes[plane].cpp) + return false; + if (image->tiled && (plane_layouts[plane].rowPitch % (4 * uif_block_w))) + return false; + slice->stride = plane_layouts[plane].rowPitch; + } + slice->padded_height = level_height; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { @@ -222,7 +279,8 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, image->planes[plane].alignment = 4096; } else { image->planes[plane].alignment = - (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? 64 : image->planes[plane].cpp; + (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? + 64 : image->planes[plane].cpp; } uint32_t align_offset = @@ -243,15 +301,36 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, image->planes[plane].cube_map_stride = align(image->planes[plane].slices[0].offset + image->planes[plane].slices[0].size, 64); + + if (plane_layouts && image->vk.array_layers > 1) { + if (plane_layouts[plane].arrayPitch % 64 != 0) + return false; + if (plane_layouts[plane].arrayPitch < + image->planes[plane].cube_map_stride) { + return false; + } + image->planes[plane].cube_map_stride = plane_layouts[plane].arrayPitch; + } + image->planes[plane].size += image->planes[plane].cube_map_stride * (image->vk.array_layers - 1); } else { image->planes[plane].cube_map_stride = image->planes[plane].slices[0].size; + if (plane_layouts) { + /* We assume that depthPitch in the plane layout refers to level 0 */ + if (plane_layouts[plane].depthPitch != + image->planes[plane].slices[0].size) { + return false; + } + } } + + return true; } -static void -v3d_setup_slices(struct v3dv_image *image, bool disjoint) +static bool +v3d_setup_slices(struct v3dv_image *image, bool disjoint, + const VkSubresourceLayout *plane_layouts) { if (disjoint && image->plane_count == 1) disjoint = false; @@ -259,11 +338,15 @@ v3d_setup_slices(struct v3dv_image *image, bool disjoint) uint32_t offset = 0; for (uint8_t plane = 0; plane < image->plane_count; plane++) { offset = disjoint ? 0 : offset; - v3d_setup_plane_slices(image, plane, offset); + if (!v3d_setup_plane_slices(image, plane, offset, plane_layouts)) { + assert(plane_layouts); + return false; + } offset += align(image->planes[plane].size, 64); } image->non_disjoint_size = disjoint ? 
0 : offset; + return true; } uint32_t @@ -280,6 +363,34 @@ v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer } VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info) +{ + assert(!explicit_mod_info || + image->plane_count == explicit_mod_info->drmFormatModifierPlaneCount); + + assert(!explicit_mod_info || + modifier == explicit_mod_info->drmFormatModifier); + + image->tiled = modifier != DRM_FORMAT_MOD_LINEAR; + + image->vk.drm_format_mod = modifier; + + bool ok = + v3d_setup_slices(image, disjoint, + explicit_mod_info ? explicit_mod_info->pPlaneLayouts : NULL); + if (!ok) { + assert(explicit_mod_info); + return VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT; + } + + return VK_SUCCESS; +} + +VkResult v3dv_image_init(struct v3dv_device *device, const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -296,11 +407,20 @@ v3dv_image_init(struct v3dv_device *device, */ VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = NULL; + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = NULL; +#ifdef ANDROID + if (image->is_native_buffer_memory) { + assert(image->android_explicit_layout); + explicit_mod_info = image->android_explicit_layout; + modifier = explicit_mod_info->drmFormatModifier; + } +#endif if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { - const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = + explicit_mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); assert(mod_info || explicit_mod_info); @@ -327,35 +447,20 @@ v3dv_image_init(struct v3dv_device *device, tiling = VK_IMAGE_TILING_LINEAR; } -#ifdef ANDROID - const VkNativeBufferANDROID *native_buffer = - vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); - - int native_buf_fd = -1; - int native_buf_stride = 0; - int native_buf_size = 0; - - if (native_buffer != NULL) { - VkResult result = v3dv_gralloc_info(device, native_buffer, &native_buf_fd, - &native_buf_stride, &native_buf_size, - &modifier); - if (result != VK_SUCCESS) - return result; - - if (modifier != DRM_FORMAT_MOD_BROADCOM_UIF) - tiling = VK_IMAGE_TILING_LINEAR; - } -#endif + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = (tiling == VK_IMAGE_TILING_OPTIMAL) ? 
DRM_FORMAT_MOD_BROADCOM_UIF + : DRM_FORMAT_MOD_LINEAR; const struct v3dv_format *format = - v3dv_X(device, get_format)(pCreateInfo->format); + v3dv_X(device, get_format)(image->vk.format); v3dv_assert(format != NULL && format->plane_count); assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); image->format = format; - image->plane_count = vk_format_get_plane_count(pCreateInfo->format); + + image->plane_count = vk_format_get_plane_count(image->vk.format); const struct vk_format_ycbcr_info *ycbcr_info = vk_format_get_ycbcr_info(image->vk.format); @@ -378,12 +483,6 @@ v3dv_image_init(struct v3dv_device *device, ycbcr_info->planes[plane].denominator_scales[1]; } } - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || - (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && - modifier != DRM_FORMAT_MOD_LINEAR); - - image->vk.tiling = tiling; - image->vk.drm_format_mod = modifier; /* Our meta paths can create image views with compatible formats for any * image, so always set this flag to keep the common Vulkan image code @@ -391,26 +490,18 @@ v3dv_image_init(struct v3dv_device *device, */ image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; - bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; - v3d_setup_slices(image, disjoint); - #ifdef ANDROID - if (native_buffer != NULL) { - assert(image->plane_count == 1); - image->planes[0].slices[0].stride = native_buf_stride; - image->non_disjoint_size = - image->planes[0].slices[0].size = - image->planes[0].size = native_buf_size; - - VkResult result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), - native_buf_fd, pAllocator, - v3dv_image_to_handle(image)); - if (result != VK_SUCCESS) - return result; - } + /* At this time, an AHB handle is not yet provided. 
+ * Image layout will be filled up during vkBindImageMemory2 + */ + if (image->is_ahb) + return VK_SUCCESS; #endif - return VK_SUCCESS; + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + + return v3dv_update_image_layout(device, image, modifier, disjoint, + explicit_mod_info); } static VkResult @@ -419,21 +510,92 @@ create_image(struct v3dv_device *device, const VkAllocationCallbacks *pAllocator, VkImage *pImage) { + VkResult result; struct v3dv_image *image = NULL; image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); if (image == NULL) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - VkResult result = v3dv_image_init(device, pCreateInfo, pAllocator, image); - if (result != VK_SUCCESS) { - vk_image_destroy(&device->vk, pAllocator, &image->vk); - return result; +#ifdef ANDROID + const VkExternalMemoryImageCreateInfo *external_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); + + const VkNativeBufferANDROID *native_buffer = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + + if (native_buffer != NULL) + image->is_native_buffer_memory = true; + + image->is_ahb = external_info && (external_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID); + + assert(!(image->is_ahb && image->is_native_buffer_memory)); + + if (image->is_ahb || image->is_native_buffer_memory) { + image->android_explicit_layout = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkImageDrmFormatModifierExplicitCreateInfoEXT), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_explicit_layout) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + image->android_plane_layouts = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkSubresourceLayout) * V3DV_MAX_PLANE_COUNT, + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_plane_layouts) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + } + + if (image->is_native_buffer_memory) { + struct u_gralloc_buffer_handle gr_handle = { + .handle = native_buffer->handle, + .hal_format = native_buffer->format, + .pixel_stride = native_buffer->stride, + }; + + result = v3dv_gralloc_to_drm_explicit_layout(device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + goto fail; + } +#endif + + result = v3dv_image_init(device, pCreateInfo, pAllocator, image); + if (result != VK_SUCCESS) + goto fail; + +#ifdef ANDROID + if (image->is_native_buffer_memory) { + result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), + native_buffer->handle->data[0], pAllocator, + v3dv_image_to_handle(image)); + if (result != VK_SUCCESS) + goto fail; } +#endif *pImage = v3dv_image_to_handle(image); return VK_SUCCESS; + +fail: +#ifdef ANDROID + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + + vk_image_destroy(&device->vk, pAllocator, &image->vk); + return result; } static VkResult @@ -534,8 +696,10 @@ v3dv_GetImageSubresourceLayout(VkDevice device, v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer, plane) - image->planes[plane].mem_offset; layout->rowPitch = slice->stride; - layout->depthPitch = image->planes[plane].cube_map_stride; - layout->arrayPitch = 
image->planes[plane].cube_map_stride; + layout->depthPitch = image->vk.image_type == VK_IMAGE_TYPE_3D ? + image->planes[plane].cube_map_stride : 0; + layout->arrayPitch = image->vk.array_layers > 1 ? + image->planes[plane].cube_map_stride : 0; if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; @@ -567,12 +731,33 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; + /* If we have created a shadow tiled image for this image we must also free + * it (along with its memory allocation). + */ + if (image->shadow) { + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + for (int i = 0; i < (disjoint ? image->plane_count : 1); i++) { + if (image->shadow->planes[i].mem) { + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->shadow->planes[i].mem), + pAllocator); + } + } + v3dv_DestroyImage(_device, v3dv_image_to_handle(image->shadow), + pAllocator); + image->shadow = NULL; + } + #ifdef ANDROID - assert(image->plane_count == 1); if (image->is_native_buffer_memory) v3dv_FreeMemory(_device, v3dv_device_memory_to_handle(image->planes[0].mem), pAllocator); + + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); #endif vk_image_destroy(&device->vk, pAllocator, &image->vk); @@ -641,8 +826,7 @@ create_image_view(struct v3dv_device *device, * makes sense to implement swizzle composition using VkSwizzle directly. */ VkFormat format; - uint8_t image_view_swizzle[4]; - if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { format = VK_FORMAT_R8G8B8A8_UINT; uint8_t stencil_aspect_swizzle[4] = { @@ -652,11 +836,11 @@ create_image_view(struct v3dv_device *device, vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, - image_view_swizzle); + iview->view_swizzle); } else { - format = pCreateInfo->format; + format = iview->vk.format; vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, - image_view_swizzle); + iview->view_swizzle); } iview->vk.view_format = format; @@ -681,7 +865,7 @@ create_image_view(struct v3dv_device *device, const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format, plane); - util_format_compose_swizzles(format_swizzle, image_view_swizzle, + util_format_compose_swizzles(format_swizzle, iview->view_swizzle, iview->planes[plane].swizzle); iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); @@ -725,6 +909,13 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; + if (image_view->shadow) { + v3dv_DestroyImageView(_device, + v3dv_image_view_to_handle(image_view->shadow), + pAllocator); + image_view->shadow = NULL; + } + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h index 9cda9f0d6..4df172e6b 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h @@ -41,7 +41,7 @@ #define MAX_STORAGE_IMAGES 4 #define MAX_INPUT_ATTACHMENTS 4 -#define MAX_UNIFORM_BUFFERS 12 +#define MAX_UNIFORM_BUFFERS 16 #define MAX_INLINE_UNIFORM_BUFFERS 4 #define MAX_STORAGE_BUFFERS 8 @@ -50,8 +50,6 @@ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ 
MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_RENDER_TARGETS 4 - #define MAX_MULTIVIEW_VIEW_COUNT 16 /* These are tunable parameters in the HW design, but all the V3D diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c index 9d7e36928..8eeb03e57 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c @@ -73,7 +73,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, * conversion" */ assert(image->plane_count == 1); - if (!v3dv_meta_can_use_tlb(image, 0, &origin, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, 0, 0, &origin, NULL, &fb_format)) return false; uint32_t internal_type, internal_bpp; @@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, v3dv_job_start_frame(job, width, height, max_layer, false, true, 1, internal_bpp, + 4 * v3d_internal_bpp_words(internal_bpp), image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -329,7 +330,7 @@ get_clear_rect_vs() nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -352,8 +353,8 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES; - nir->info.gs.output_primitive = SHADER_PRIM_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -386,7 +387,7 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = push_constant_layer_base, .range = 4); nir_store_var(&b, gs_out_layer, layer, 0x1); @@ -414,7 +415,7 @@ get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format) nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx; - nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); nir_store_var(&b, fs_out_color, color_load, 0xf); return b.shader; @@ -432,7 +433,7 @@ get_depth_clear_rect_fs() "out_depth"); fs_out_depth->data.location = FRAG_RESULT_DEPTH; - nir_ssa_def *depth_load = + nir_def *depth_load = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_store_var(&b, fs_out_depth, depth_load, 0x1); @@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, uint32_t bit_offset = 0; key |= rt_idx; - bit_offset += 2; + bit_offset += 3; key |= ((uint64_t) format) << bit_offset; bit_offset += 32; @@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We can only clear attachments in the current subpass */ - assert(attachmentCount <= 5); /* 4 color + D/S */ + /* We can have at most max_color_RTs + 1 D/S attachments */ + 
assert(attachmentCount <= + V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + /* We can only clear attachments in the current subpass */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c index 4d83e5379..f9779bf26 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c @@ -351,18 +351,37 @@ get_compatible_tlb_format(VkFormat format) * Checks if we can implement an image copy or clear operation using the TLB * hardware. * + * The extent and miplevel are only used to validate tile stores (to match the + * region to store against the miplevel dimensions to avoid cases where + * the region to store is not aligned to tile boundaries). If extent is + * NULL no checks are done (which is fine if the image will only be used for a + * TLB load or when we know in advance that the store will be for the entire + * size of the image miplevel). + * * For tlb copies we are doing a per-plane copy, so for multi-plane formats, * the compatible format will be single-plane. */ bool v3dv_meta_can_use_tlb(struct v3dv_image *image, uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format) { if (offset->x != 0 || offset->y != 0) return false; + /* FIXME: this is suboptimal, what we really want to check is that the + * extent of the region to copy is the full slice or a multiple of the + * tile size. + */ + if (extent) { + struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel]; + if (slice->width != extent->width || slice->height != extent->height) + return false; + } + if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (compat_format) *compat_format = image->planes[plane].vk_format; @@ -403,8 +422,11 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); assert(plane < image->plane_count); - if (!v3dv_meta_can_use_tlb(image, plane, &region->imageOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -431,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -459,26 +482,89 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkFilter filter, bool dst_is_padded_image); + /** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). + * A structure that contains all the information we may need in various + * processes involving image to buffer copies implemented with blit paths.
*/ -static bool -copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - const VkBufferImageCopy2 *region) +struct image_to_buffer_info { + /* Source image info */ + VkFormat src_format; + uint8_t plane; + VkColorComponentFlags cmask; + VkComponentMapping cswizzle; + VkImageAspectFlags src_copy_aspect; + uint32_t block_width; + uint32_t block_height; + + /* Destination buffer info */ + VkFormat dst_format; + uint32_t buf_width; + uint32_t buf_height; + uint32_t buf_bpp; + VkImageAspectFlags dst_copy_aspect; +}; + +static VkImageBlit2 +blit_region_for_image_to_buffer(const VkOffset3D *offset, + const VkExtent3D *extent, + uint32_t mip_level, + uint32_t base_layer, + uint32_t layer_offset, + struct image_to_buffer_info *info) { - bool handled = false; + VkImageBlit2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = mip_level, + .baseArrayLayer = base_layer + layer_offset, + .layerCount = 1, + }, + .srcOffsets = { + { + DIV_ROUND_UP(offset->x, info->block_width), + DIV_ROUND_UP(offset->y, info->block_height), + offset->z + layer_offset, + }, + { + DIV_ROUND_UP(offset->x + extent->width, info->block_width), + DIV_ROUND_UP(offset->y + extent->height, info->block_height), + offset->z + layer_offset + 1, + }, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffsets = { + { 0, 0, 0 }, + { + DIV_ROUND_UP(extent->width, info->block_width), + DIV_ROUND_UP(extent->height, info->block_height), + 1 + }, + }, + }; - /* This path uses a shader blit which doesn't support linear images. Return - * early to avoid all the heavy lifting in preparation for the - * blit_shader() call that is bound to fail in that scenario. - */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR && - image->vk.image_type != VK_IMAGE_TYPE_1D) { - return handled; - } + return output; +} + +/** + * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we can + * use to implement image to buffer copies with blit paths. + * + * Returns false if the copy operation can't be implemented with a blit.
+ */ +static bool +gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *out_info) +{ + bool supported = false; VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask; /* For multi-planar images we copy one plane at a time using an image alias @@ -572,7 +658,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported aspect"); - return handled; + return supported; }; break; case 2: @@ -588,7 +674,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported bit-size"); - return handled; + return supported; }; /* The hardware doesn't support linear depth/stencil stores, so we @@ -600,7 +686,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* We should be able to handle the blit if we got this far */ - handled = true; + supported = true; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -619,98 +705,246 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, vk_format_get_blockwidth(image->planes[plane].vk_format); uint32_t block_height = vk_format_get_blockheight(image->planes[plane].vk_format); - buf_width = buf_width / block_width; - buf_height = buf_height / block_height; + buf_width = DIV_ROUND_UP(buf_width, block_width); + buf_height = DIV_ROUND_UP(buf_height, block_height); + + out_info->src_format = src_format; + out_info->dst_format = dst_format; + out_info->src_copy_aspect = src_copy_aspect; + out_info->dst_copy_aspect = dst_copy_aspect; + out_info->buf_width = buf_width; + out_info->buf_height = buf_height; + out_info->buf_bpp = buffer_bpp; + out_info->block_width = block_width; + out_info->block_height = block_height; + out_info->cmask = cmask; + out_info->cswizzle = cswizzle; + out_info->plane = plane; + + return supported; +} - /* Compute layers to copy */ - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); +/* Creates a linear image to alias buffer memory. It also includes that image + * as a private object in the cmd_buffer. + * + * This is used for cases where we want to implement an image to buffer copy, + * but we need to rely on a mechanism that uses an image as destination, like + * blitting. + */ +static VkResult +create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer, + VkImage *out_image) +{ + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = info->dst_format, + .extent = { info->buf_width, info->buf_height, 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; - /* Our blit interface can see the real format of the images to detect - * copies between compressed and uncompressed images and adapt the - * blit region accordingly. 
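The switch from plain division to DIV_ROUND_UP in gather_image_to_buffer_info matters when a block-compressed region is not an exact block multiple. A small runnable example, assuming a 4x4 block format (BC/ETC2 style):

   #include <stdint.h>
   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      uint32_t w = 10, h = 6;   /* region in texels */
      uint32_t bw = 4, bh = 4;  /* compressed block size */
      printf("truncated: %ux%u blocks\n", w / bw, h / bh);  /* 2x1: too small */
      printf("rounded:   %ux%u blocks\n",
             DIV_ROUND_UP(w, bw), DIV_ROUND_UP(h, bh));     /* 3x2: correct */
      return 0;
   }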
Here we are just doing a raw copy of - * compressed data, but we are passing an uncompressed view of the - * buffer for the blit destination image (since compressed formats are - * not renderable), so we also want to provide an uncompressed view of - * the source image. - */ VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk.format)) { - assert(image->plane_count == 1); - VkImage uiview; - VkImageCreateInfo uiview_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_3D, - .format = dst_format, - .extent = { buf_width, buf_height, image->vk.extent.depth }, - .mipLevels = image->vk.mip_levels, - .arrayLayers = image->vk.array_layers, - .samples = image->vk.samples, - .tiling = image->vk.tiling, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview); - if (result != VK_SUCCESS) - return handled; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)uiview, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + VkImage buffer_image; + result = + v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); + if (result != VK_SUCCESS) + return result; - result = - vk_common_BindImageMemory(_device, uiview, - v3dv_device_memory_to_handle(image->planes[plane].mem), - image->planes[plane].mem_offset); - if (result != VK_SUCCESS) - return handled; + *out_image = buffer_image; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)buffer_image, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + /* Bind the buffer memory to the image + */ + VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + + layer * info->buf_width * info->buf_height * info->buf_bpp; + + result = + vk_common_BindImageMemory(_device, buffer_image, + v3dv_device_memory_to_handle(buffer->mem), + buffer_offset); + return result; +} + +/** + * Creates an image with a single mip level that aliases the memory of a + * mip level in another image, re-interpreting the memory with an uncompressed + * format. The image is added to the command buffer as a private object for + * disposal. + */ +static bool +create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + VkFormat format, + uint32_t plane, + uint32_t mip_level, + uint32_t layer, + VkImage *alias) +{ + VkResult result; + assert(!vk_format_is_compressed(format)); + + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + uint32_t mip_width = image->planes[plane].slices[mip_level].width; + uint32_t mip_height = image->planes[plane].slices[mip_level].height; + + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + + VkImageCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = image->vk.image_type, + .format = format, + .extent = { DIV_ROUND_UP(mip_width, block_width), + DIV_ROUND_UP(mip_height, block_height), + 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = image->vk.samples, + .tiling = image->tiled ? 
VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias); + if (result != VK_SUCCESS) + return false; + + /* The alias we have just created has just one mip, but we may be aliasing + * any mip in the original image. Because the slice setup changes based on + * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally) + * and this can influence the tiling layout selected for the slice, we want + * to make sure we copy the slice description from the actual mip level in + * the original image, and then rewrite any fields that we need for the + * alias. Particularly, we want to make the offset 0 because we are going to + * bind the underlying image memory exactly at the start of the selected mip. + * We also want to relax the image alignment requirements to the minimum + * (the one imposed by the Texture Base Address field) since we may not be + * aliasing a level 0 (for which we typically want a page alignment for + * optimal performance). + */ + V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias); + v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level]; + v3dv_alias->planes[plane].slices[0].width = info.extent.width; + v3dv_alias->planes[plane].slices[0].height = info.extent.height; + v3dv_alias->planes[plane].slices[0].offset = 0; + v3dv_alias->planes[plane].alignment = 64; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)*alias, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + result = + vk_common_BindImageMemory(vk_device, *alias, + v3dv_device_memory_to_handle(image->planes[plane].mem), + v3dv_layer_offset(image, mip_level, layer, plane)); + return result == VK_SUCCESS; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an out-of-memory error). + */ +static bool +copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + struct image_to_buffer_info info; - image = v3dv_image_from_handle(uiview); + /* This path uses a shader blit which doesn't support linear images. Return + * early to avoid all the heavy lifting in preparation for the + * blit_shader() call that is bound to fail in that scenario. 
+ */ + if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) { + return handled; } + handled = gather_image_to_buffer_info(cmd_buffer, image, region, + &info); + + if (!handled) + return handled; + + /* We should be able to handle the blit if we got this far */ + handled = true; + + /* Compute layers to copy */ + uint32_t num_layers; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + /* Copy requested layers */ + VkResult result; + VkImageBlit2 blit_region; + uint32_t mip_level = region->imageSubresource.mipLevel; + uint32_t base_layer = region->imageSubresource.baseArrayLayer; for (uint32_t i = 0; i < num_layers; i++) { - /* Create the destination blit image from the destination buffer */ - VkImageCreateInfo image_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_2D, - .format = dst_format, - .extent = { buf_width, buf_height, 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - - VkImage buffer_image; - result = - v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); - if (result != VK_SUCCESS) - return handled; + uint32_t layer_offset = i; + + if (vk_format_is_compressed(image->vk.format)) { + /* Our blit interface can see the real format of the images to detect + * copies between compressed and uncompressed images and adapt the + * blit region accordingly. Here we are just doing a raw copy of + * compressed data, but we are passing an uncompressed view of the + * buffer for the blit destination image (since compressed formats are + * not renderable), so we also want to provide an uncompressed view of + * the source image. + * + * It is important that we create the alias over the selected mip + * level (instead of aliasing the entire image) because an uncompressed + * view of the image won't have the same number of mip levels as the + * original image and the implicit mip size calculations the hw will + * do to sample from a non-zero mip level may not match exactly between + * compressed and uncompressed views. + */ + VkImage alias; + if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format, + info.plane, mip_level, + base_layer + layer_offset, + &alias)) { + return handled; + } - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)buffer_image, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + /* We are aliasing the selected mip level and layer with a + * single-mip and single-layer image. 
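A worked example of the mip-size mismatch described in the comment above, which is why create_image_mip_level_alias wraps a single mip level instead of the whole image: for a 100x100 image with 4x4 blocks, level 2 computed from the compressed image covers 7 blocks per row, while level 2 of a 25x25 block-sized view would cover only 6.

   #include <stdint.h>
   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   static uint32_t mip_dim(uint32_t base, uint32_t level)
   {
      uint32_t d = base >> level;
      return d ? d : 1;
   }

   int main(void)
   {
      uint32_t base = 100, block = 4, level = 2;
      /* blocks per row at level 2 of the compressed image: ceil(25/4) = 7 */
      printf("%u\n", DIV_ROUND_UP(mip_dim(base, level), block));
      /* level 2 of a view sized in blocks: (100/4 = 25) >> 2 = 6 */
      printf("%u\n", mip_dim(DIV_ROUND_UP(base, block), level));
      return 0;
   }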
+ */ + image = v3dv_image_from_handle(alias); + mip_level = 0; + base_layer = 0; + layer_offset = 0; + } - /* Bind the buffer memory to the image */ - VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + - i * buf_width * buf_height * buffer_bpp; + /* Create the destination blit image from the destination buffer */ + VkImage buffer_image; result = - vk_common_BindImageMemory(_device, buffer_image, - v3dv_device_memory_to_handle(buffer->mem), - buffer_offset); + create_image_from_buffer(cmd_buffer, buffer, region, &info, + i, &buffer_image); if (result != VK_SUCCESS) return handled; @@ -722,48 +956,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2 blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, - .srcSubresource = { - .aspectMask = src_copy_aspect, - .mipLevel = region->imageSubresource.mipLevel, - .baseArrayLayer = region->imageSubresource.baseArrayLayer + i, - .layerCount = 1, - }, - .srcOffsets = { - { - DIV_ROUND_UP(region->imageOffset.x, block_width), - DIV_ROUND_UP(region->imageOffset.y, block_height), - region->imageOffset.z + i, - }, - { - DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width, - block_width), - DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height, - block_height), - region->imageOffset.z + i + 1, - }, - }, - .dstSubresource = { - .aspectMask = dst_copy_aspect, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .dstOffsets = { - { 0, 0, 0 }, - { - DIV_ROUND_UP(region->imageExtent.width, block_width), - DIV_ROUND_UP(region->imageExtent.height, block_height), - 1 - }, - }, - }; + blit_region = + blit_region_for_image_to_buffer(&region->imageOffset, + &region->imageExtent, + mip_level, base_layer, layer_offset, + &info); handled = blit_shader(cmd_buffer, - v3dv_image_from_handle(buffer_image), dst_format, - image, src_format, - cmask, &cswizzle, + v3dv_image_from_handle(buffer_image), + info.dst_format, + image, info.src_format, + info.cmask, &info.cswizzle, &blit_region, VK_FILTER_NEAREST, false); if (!handled) { /* This is unexpected, we should have a supported blit spec */ @@ -776,6 +979,107 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, return true; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + +static VkImageCopy2 +image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer) +{ + VkImageCopy2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer, + .layerCount = 1, + }, + .srcOffset = { + DIV_ROUND_UP(region->imageOffset.x, info->block_width), + DIV_ROUND_UP(region->imageOffset.y, info->block_height), + region->imageOffset.z, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffset = { 0, 0, 0 }, + .extent = { + DIV_ROUND_UP(region->imageExtent.width, info->block_width), + DIV_ROUND_UP(region->imageExtent.height, info->block_height), + 1 + }, + }; + + return output; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an
out-of-memory error). + */ +static bool +copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *dst_buffer, + struct v3dv_image *src_image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + VkImage dst_buffer_image; + struct image_to_buffer_info info; + + /* This is a requirement for copy_image_linear_texel_buffer below. We check + * it in advance in order to do an early return + */ + if (src_image->tiled) + return false; + + handled = + gather_image_to_buffer_info(cmd_buffer, src_image, region, + &info); + if (!handled) + return handled; + + /* At this point the implementation should support the copy, any possible + * errors below are for different reasons, like an out-of-memory error + */ + handled = true; + + uint32_t num_layers; + if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + + VkResult result; + VkImageCopy2 image_region; + for (uint32_t layer = 0; layer < num_layers; layer++) { + /* Create the destination image from the destination buffer */ + result = + create_image_from_buffer(cmd_buffer, dst_buffer, region, &info, + layer, &dst_buffer_image); + if (result != VK_SUCCESS) + return handled; + + image_region = + image_copy_region_for_image_to_buffer(region, &info, layer); + + handled = + copy_image_linear_texel_buffer(cmd_buffer, + v3dv_image_from_handle(dst_buffer_image), + src_image, &image_region); + } + + return handled; +} + VKAPI_ATTR void VKAPI_CALL v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2 *info) @@ -798,6 +1102,9 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region)) continue; + if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region)) + continue; + unreachable("Unsupported image to buffer copy."); } cmd_buffer->state.is_transfer = false; @@ -819,7 +1126,7 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, } /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* We can only do full copies, so if the format is D24S8 both aspects need @@ -947,6 +1254,15 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, return true; } +inline bool +v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + return copy_image_tfu(cmd_buffer, dst, src, region); +} + /** * Returns true if the implementation supports the requested operation (even if * it failed to process it, for example, due to an out-of-memory error).
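v3dv_CmdCopyImageToBuffer2KHR now tries three paths per region, falling through whenever one of them reports the spec as unsupported. A compilable skeleton of that dispatch, with stubs standing in for the driver functions:

   #include <stdbool.h>
   #include <stdlib.h>

   static bool try_tlb_path(void)          { return false; } /* stub */
   static bool try_blit_path(void)         { return false; } /* stub */
   static bool try_texel_buffer_path(void) { return true;  } /* stub */

   static void
   copy_one_region(void)
   {
      if (try_tlb_path())
         return;
      if (try_blit_path())
         return;
      if (try_texel_buffer_path())
         return;
      abort(); /* unreachable("Unsupported image to buffer copy.") */
   }

   int main(void) { copy_one_region(); return 0; }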
@@ -965,9 +1281,12 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, assert(dst_plane < dst->plane_count); VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(src, src_plane, &region->srcOffset, &fb_format) || - !v3dv_meta_can_use_tlb(dst, dst_plane, &region->dstOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, &fb_format) || + !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, &fb_format)) { return false; + } /* From the Vulkan spec, VkImageCopy valid usage: * @@ -1013,8 +1332,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, - false, true, 1, internal_bpp, + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -1066,7 +1385,7 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, .mipLevels = src->vk.mip_levels, .arrayLayers = src->vk.array_layers, .samples = src->vk.samples, - .tiling = src->vk.tiling, + .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, .usage = src->vk.usage, }; @@ -1094,8 +1413,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageCopy2 *region) { - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && - src->vk.image_type != VK_IMAGE_TYPE_1D) + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) return false; uint8_t src_plane = @@ -1207,14 +1525,21 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * (since the region dimensions are already specified in terms of the source * image).
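The hunk continuing below rounds the blit region up to a power of two when either side is block-compressed; this appears to line up with the power-of-two slice sizing mentioned earlier for higher mip levels, so the compressed and uncompressed views agree on the copied footprint. A sketch of the helper, assumed to behave like Mesa's util_next_power_of_two for non-zero inputs:

   #include <stdint.h>

   static uint32_t
   next_power_of_two(uint32_t x)   /* assumes x > 0, as region dims are */
   {
      x--;
      x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
      x |= x >> 8;  x |= x >> 16;
      return x + 1;                /* e.g. 24 -> 32, 32 -> 32 */
   }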
*/ + uint32_t region_width = region->extent.width * src_scale_w; + uint32_t region_height = region->extent.height * src_scale_h; + if (src_block_w > 1) + region_width = util_next_power_of_two(region_width); + if (src_block_h > 1) + region_height = util_next_power_of_two(region_height); + const VkOffset3D src_start = { region->srcOffset.x * src_scale_w, region->srcOffset.y * src_scale_h, region->srcOffset.z, }; const VkOffset3D src_end = { - src_start.x + region->extent.width * src_scale_w, - src_start.y + region->extent.height * src_scale_h, + src_start.x + region_width, + src_start.y + region_height, src_start.z + region->extent.depth, }; @@ -1224,8 +1549,8 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, region->dstOffset.z, }; const VkOffset3D dst_end = { - dst_start.x + region->extent.width * src_scale_w, - dst_start.y + region->extent.height * src_scale_h, + dst_start.x + region_width, + dst_start.y + region_height, dst_start.z + region->extent.depth, }; @@ -1253,7 +1578,7 @@ copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageCopy2 *region) { - if (src->vk.tiling != VK_IMAGE_TILING_LINEAR) + if (src->tiled) return false; /* Implementations are allowed to restrict linear images like this */ @@ -1507,7 +1832,7 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Destination can't be raster format */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!image->tiled) return false; /* We can't copy D24S8 because buffer to image copies only copy one aspect @@ -1539,11 +1864,13 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else height = region->bufferImageHeight; - uint8_t plane = + const uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); - if (width != image->planes[plane].width || - height != image->planes[plane].height) + const uint32_t mip_level = region->imageSubresource.mipLevel; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; + + if (width != slice->width || height != slice->height) return false; /* Handle region semantics for compressed images */ @@ -1566,9 +1893,6 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, assert(format->plane_count == 1); const struct v3dv_format_plane *format_plane = &format->planes[0]; - const uint32_t mip_level = region->imageSubresource.mipLevel; - const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; - uint32_t num_layers; if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; @@ -1631,8 +1955,11 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); assert(plane < image->plane_count); - if (!v3dv_meta_can_use_tlb(image, plane, &region->imageOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -1659,8 +1986,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height,
num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -1832,7 +2160,7 @@ get_texel_buffer_copy_vs() glsl_vec4_type(), "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -1855,8 +2183,8 @@ get_texel_buffer_copy_gs() nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES; - nir->info.gs.output_primitive = SHADER_PRIM_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -1889,7 +2217,7 @@ get_texel_buffer_copy_gs() nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET, .range = 4); @@ -1903,7 +2231,7 @@ get_texel_buffer_copy_gs() return nir; } -static nir_ssa_def * +static nir_def * load_frag_coord(nir_builder *b) { nir_foreach_shader_in_variable(var, b->shader) { @@ -1967,24 +2295,24 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, /* Load the box describing the pixel region we want to copy from the * texel buffer. */ - nir_ssa_def *box = + nir_def *box = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET, .range = 16); /* Load the buffer stride (this comes in texel units) */ - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET, .range = 4); /* Load the buffer offset (this comes in texel units) */ - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET, .range = 4); - nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); + nir_def *coord = nir_f2i32(&b, load_frag_coord(&b)); /* Load pixel data from texel buffer based on the x,y offset of the pixel * within the box. Texel buffers are 1D arrays of texels. @@ -1994,28 +2322,26 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, * texel buffer should always be within its bounds and we we don't need * to add a check for that here. 
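The NIR emitted below computes a 1D index into the texel buffer from the fragment position. In plain C the address math is simply the pixel's offset inside the copy box plus a row stride, all in texel units:

   #include <stdint.h>

   static uint32_t
   texel_index(uint32_t frag_x, uint32_t frag_y,  /* gl_FragCoord, truncated */
               uint32_t box_x, uint32_t box_y,    /* copy box origin */
               uint32_t offset, uint32_t stride)  /* both in texels */
   {
      return offset + (frag_x - box_x) + (frag_y - box_y) * stride;
   }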
*/ - nir_ssa_def *x_offset = + nir_def *x_offset = nir_isub(&b, nir_channel(&b, coord, 0), nir_channel(&b, box, 0)); - nir_ssa_def *y_offset = + nir_def *y_offset = nir_isub(&b, nir_channel(&b, coord, 1), nir_channel(&b, box, 1)); - nir_ssa_def *texel_offset = + nir_def *texel_offset = nir_iadd(&b, nir_iadd(&b, offset, x_offset), nir_imul(&b, y_offset, stride)); - nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(texel_offset); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); tex->dest_type = nir_type_uint32; tex->is_array = false; tex->coord_components = 1; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(&b, &tex->instr); uint32_t swiz[4]; @@ -2027,7 +2353,7 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b); swiz[3] = component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a); - nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4); + nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4); nir_store_var(&b, fs_out_color, s, 0xf); return b.shader; @@ -2883,76 +3209,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, } } -/** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). 
- */ -static bool -copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - const VkBufferImageCopy2 *region) -{ - /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk.format)) - return false; - - if (vk_format_is_compressed(image->vk.format)) - return false; - - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) - return false; - - uint32_t buffer_width, buffer_height; - if (region->bufferRowLength == 0) - buffer_width = region->imageExtent.width; - else - buffer_width = region->bufferRowLength; - - if (region->bufferImageHeight == 0) - buffer_height = region->imageExtent.height; - else - buffer_height = region->bufferImageHeight; - - uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); - assert(plane < image->plane_count); - - uint32_t buffer_stride = buffer_width * image->planes[plane].cpp; - uint32_t buffer_layer_stride = buffer_stride * buffer_height; - - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, - cmd_buffer, -1); - if (!job) - return true; - - job->cpu.copy_buffer_to_image.image = image; - job->cpu.copy_buffer_to_image.buffer = buffer; - job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride; - job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride; - job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset; - job->cpu.copy_buffer_to_image.image_extent = region->imageExtent; - job->cpu.copy_buffer_to_image.image_offset = region->imageOffset; - job->cpu.copy_buffer_to_image.mip_level = - region->imageSubresource.mipLevel; - job->cpu.copy_buffer_to_image.base_layer = - region->imageSubresource.baseArrayLayer; - job->cpu.copy_buffer_to_image.layer_count = num_layers; - job->cpu.copy_buffer_to_image.plane = plane; - - list_addtail(&job->list_link, &cmd_buffer->jobs); - - return true; -} - VKAPI_ATTR void VKAPI_CALL v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferToImageInfo2 *info) @@ -3013,11 +3269,6 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, * slow it might not be worth it and we should instead put more effort * in handling more cases with the other paths. 
*/ - if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &info->pRegions[r])) { - batch_size = 1; - goto handled; - } - if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, batch_size, &info->pRegions[r], false)) { goto handled; @@ -3072,7 +3323,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* Source region must start at (0,0) */ @@ -3301,16 +3552,16 @@ create_blit_render_pass(struct v3dv_device *device, return result == VK_SUCCESS; } -static nir_ssa_def * +static nir_def * gen_tex_coords(nir_builder *b) { - nir_ssa_def *tex_box = + nir_def *tex_box = nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16); - nir_ssa_def *tex_z = + nir_def *tex_z = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4); - nir_ssa_def *vertex_id = nir_load_vertex_id(b); + nir_def *vertex_id = nir_load_vertex_id(b); /* vertex 0: src0_x, src0_y * vertex 1: src0_x, src1_y @@ -3323,11 +3574,11 @@ gen_tex_coords(nir_builder *b) * channel 1 is vertex id & 1 ? src1_y : src0_y */ - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); + nir_def *one = nir_imm_int(b, 1); + nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2); + nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - nir_ssa_def *comp[4]; + nir_def *comp[4]; comp[0] = nir_bcsel(b, c0cmp, nir_channel(b, tex_box, 0), nir_channel(b, tex_box, 2)); @@ -3340,9 +3591,9 @@ gen_tex_coords(nir_builder *b) return nir_vec(b, comp, 4); } -static nir_ssa_def * +static nir_def * build_nir_tex_op_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, enum glsl_sampler_dim dim) { @@ -3355,57 +3606,49 @@ build_nir_tex_op_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = dim; tex->op = nir_texop_tex; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = glsl_sampler_type_is_array(sampler_type); tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_fetch_sample(struct nir_builder *b, nir_variable *sampler, - nir_ssa_def *tex_deref, + nir_def *tex_deref, enum glsl_base_type tex_type, - nir_ssa_def *tex_pos, - nir_ssa_def *sample_idx) + nir_def *tex_pos, + nir_def *sample_idx) { - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4); + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim 
= GLSL_SAMPLER_DIM_MS; tex->op = nir_texop_txf_ms; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); - tex->src[3].src_type = nir_tex_src_ms_index; - tex->src[3].src = nir_src_for_ssa(sample_idx); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = false; tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } /* Fetches all samples at the given position and averages them */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_resolve(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits src_samples) { @@ -3419,10 +3662,10 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, const bool is_int = glsl_base_type_is_integer(tex_type); - nir_ssa_def *tmp = NULL; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tmp = NULL; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; for (uint32_t i = 0; i < src_samples; i++) { - nir_ssa_def *s = + nir_def *s = build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_imm_int(b, i)); @@ -3437,13 +3680,13 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, } assert(!is_int); - return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples)); + return nir_fmul_imm(b, tmp, 1.0f / src_samples); } /* Fetches the current sample (gl_SampleID) at the given position */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type) { const struct glsl_type *sampler_type = @@ -3453,17 +3696,17 @@ build_nir_tex_op_ms_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_load_sample_id(b)); } -static nir_ssa_def * +static nir_def * build_nir_tex_op(struct nir_builder *b, struct v3dv_device *device, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits dst_samples, VkSampleCountFlagBits src_samples, @@ -3507,10 +3750,10 @@ get_blit_vs() vs_out_tex_coord->data.location = VARYING_SLOT_VAR0; vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); - nir_ssa_def *tex_coord = gen_tex_coords(&b); + nir_def *tex_coord = gen_tex_coords(&b); nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf); return b.shader; @@ -3561,11 +3804,11 @@ get_color_blit_fs(struct v3dv_device *device, nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0; - nir_ssa_def *tex_coord = nir_load_var(&b, 
fs_in_tex_coord); + nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim); tex_coord = nir_channels(&b, tex_coord, channel_mask); - nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, + nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, dst_samples, src_samples, sampler_dim); /* For integer textures, if the bit-size of the destination is too small to @@ -3580,7 +3823,7 @@ get_color_blit_fs(struct v3dv_device *device, enum pipe_format src_pformat = vk_format_to_pipe_format(src_format); enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format); - nir_ssa_def *c[4]; + nir_def *c[4]; for (uint32_t i = 0; i < 4; i++) { c[i] = nir_channel(&b, color, i); @@ -3598,11 +3841,11 @@ get_color_blit_fs(struct v3dv_device *device, assert(dst_bit_size > 0); if (util_format_is_pure_uint(dst_pformat)) { - nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); + nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); c[i] = nir_umin(&b, c[i], max); } else { - nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); - nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); + nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); + nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min); } } @@ -4062,12 +4305,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. */ - assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || - !vk_format_is_depth_or_stencil(dst_format)); + assert(dst->tiled || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && - src->vk.image_type != VK_IMAGE_TYPE_1D) { + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) { return false; } @@ -4538,8 +4779,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, assert(dst->plane_count == 1); assert(src->plane_count == 1); - if (!v3dv_meta_can_use_tlb(src, 0, &region->srcOffset, NULL) || - !v3dv_meta_can_use_tlb(dst, 0, &region->dstOffset, NULL)) { + if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, NULL) || + !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, NULL)) { return false; } @@ -4572,8 +4815,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + true); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c index 683acde62..0583faf6f 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c @@ -234,13 +234,15 @@ v3dv_CreateRenderPass2(VkDevice _device, .layout = desc->pDepthStencilAttachment->layout, }; - /* GFXH-1461: if depth is cleared but stencil is loaded (or viceversa), + /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), * the clear might get lost.
If a subpass has this then we can't emit - * the clear using the TLB and we have to do it as a draw call. + * the clear using the TLB and we have to do it as a draw call. This + * issue is fixed since V3D 4.3.18. * * FIXME: separate stencil. */ - if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + if (device->devinfo.ver == 42 && + subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { struct v3dv_render_pass_attachment *att = &pass->attachments[subpass->ds_attachment.attachment]; if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { @@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device, /* Granularity is defined by the tile size */ assert(subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; - const uint32_t color_attachment_count = subpass->color_count; + const uint32_t color_count = subpass->color_count; bool msaa = false; - uint32_t max_bpp = 0; - for (uint32_t i = 0; i < color_attachment_count; i++) { + uint32_t max_internal_bpp = 0; + uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; @@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, v3dv_X(device, get_internal_type_bpp_for_output_format) (format->planes[0].rt_type, &internal_type, &internal_bpp); - max_bpp = MAX2(max_bpp, internal_bpp); + max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); + total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); if (desc->samples > VK_SAMPLE_COUNT_1_BIT) msaa = true; @@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, * heuristics so we choose a conservative granularity here, with it disabled. 
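subpass_get_granularity now feeds v3d_choose_tile_size (called just below) both the widest per-RT bpp and the total color footprint. A sketch of the accumulation, assuming v3d_internal_bpp_words maps the internal bpp enum (32/64/128 bpp) to 1/2/4 32-bit words:

   #include <stdint.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   static uint32_t
   internal_bpp_words(uint32_t internal_bpp)  /* assumed mapping */
   {
      return 1u << internal_bpp;              /* 0 -> 1, 1 -> 2, 2 -> 4 */
   }

   static void
   accumulate_color_bpp(const uint32_t *rt_bpp, uint32_t rt_count,
                        uint32_t *max_internal_bpp, uint32_t *total_color_bpp)
   {
      *max_internal_bpp = 0;
      *total_color_bpp = 0;
      for (uint32_t i = 0; i < rt_count; i++) {
         *max_internal_bpp = MAX2(*max_internal_bpp, rt_bpp[i]);
         /* 4 bytes per word, summed over all render targets */
         *total_color_bpp += 4 * internal_bpp_words(rt_bpp[i]);
      }
   }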
*/ uint32_t width, height; - v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, + v3d_choose_tile_size(&device->devinfo, color_count, + max_internal_bpp, total_color_bpp, msaa, false /* double-buffer */, &width, &height); *granularity = (VkExtent2D) { .width = width, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c index 116c0f70f..54a26cb14 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c @@ -30,13 +30,14 @@ #include "qpu/qpu_disasm.h" #include "compiler/nir/nir_builder.h" -#include "nir/nir_vulkan.h" #include "nir/nir_serialize.h" #include "util/u_atomic.h" #include "util/u_prim.h" #include "util/os_time.h" +#include "util/u_helpers.h" +#include "vk_nir_convert_ycbcr.h" #include "vk_pipeline.h" #include "vulkan/util/vk_format.h" @@ -192,8 +193,8 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_extract_word = true, .lower_insert_byte = true, .lower_insert_word = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, .lower_bitfield_reverse = true, .lower_bit_count = true, .lower_cs_local_id_to_index = true, @@ -226,10 +227,10 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_isign = true, .lower_ldexp = true, .lower_mul_high = true, - .lower_wpos_pntc = true, - .lower_rotate = true, + .lower_wpos_pntc = false, .lower_to_scalar = true, .lower_device_index_to_zero = true, + .lower_fquantize2f16 = true, .has_fsub = true, .has_isub = true, .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic @@ -238,7 +239,7 @@ const nir_shader_compiler_options v3dv_nir_options = { .max_unroll_iterations = 16, .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp), .divergence_analysis_options = - nir_divergence_multiple_workgroup_per_compute_subgroup + nir_divergence_multiple_workgroup_per_compute_subgroup, }; const nir_shader_compiler_options * @@ -546,7 +547,7 @@ lower_vulkan_resource_index(nir_builder *b, uint32_t start_index = 0; if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { - start_index = MAX_INLINE_UNIFORM_BUFFERS; + start_index += MAX_INLINE_UNIFORM_BUFFERS; } index = descriptor_map_add(descriptor_map, set, binding, @@ -555,14 +556,6 @@ lower_vulkan_resource_index(nir_builder *b, start_index, 32 /* return_size: doesn't really apply for this case */, 0); - - /* We always reserve index 0 for push constants */ - if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || - binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - index++; - } - break; } @@ -575,7 +568,7 @@ lower_vulkan_resource_index(nir_builder *b, * vulkan_load_descriptor return a vec2 providing an index and * offset. Our backend compiler only cares about the index part. 
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_def_rewrite_uses(&instr->def, nir_imm_ivec2(b, index, 0)); nir_instr_remove(&instr->instr); } @@ -601,7 +594,7 @@ lower_tex_src(nir_builder *b, unsigned src_idx, struct lower_pipeline_layout_state *state) { - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned base_index = 0; unsigned array_elements = 1; nir_tex_src *src = &instr->src[src_idx]; @@ -612,7 +605,6 @@ lower_tex_src(nir_builder *b, /* We compute first the offsets */ nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -629,8 +621,8 @@ lower_tex_src(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -645,8 +637,7 @@ lower_tex_src(nir_builder *b, * instr if needed */ if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); + nir_src_rewrite(&src->src, index); src->src_type = is_sampler ? nir_tex_src_sampler_offset : @@ -658,7 +649,7 @@ lower_tex_src(nir_builder *b, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; /* FIXME: this is a really simplified check for the precision to be used - * for the sampling. Right now we are ony checking for the variables used + * for the sampling. Right now we are only checking for the variables used * on the operation itself, but there are other cases that we could use to * infer the precision requirement. */ @@ -720,18 +711,20 @@ lower_sampler(nir_builder *b, int sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - if (sampler_idx >= 0) + if (sampler_idx >= 0) { + assert(nir_tex_instr_need_sampler(instr)); lower_tex_src(b, instr, sampler_idx, state); + } if (texture_idx < 0 && sampler_idx < 0) return false; - /* If we don't have a sampler, we assign it the idx we reserve for this - * case, and we ensure that it is using the correct return size. + /* If the instruction doesn't have a sampler (i.e. txf) we use backend_flags + * to bind a default sampler state to configure precision. */ if (sampler_idx < 0) { state->needs_default_sampler_state = true; - instr->sampler_index = return_size == 16 ? + instr->backend_flags = return_size == 16 ? V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX; } @@ -745,12 +738,11 @@ lower_image_deref(nir_builder *b, struct lower_pipeline_layout_state *state) { nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned array_elements = 1; unsigned base_index = 0; while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -767,8 +759,8 @@ lower_image_deref(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -833,23 +825,15 @@ lower_intrinsic(nir_builder *b, /* Loading the descriptor happens as part of load/store instructions, * so for us this is a no-op.
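The deref walk in lower_tex_src and lower_image_deref above linearizes nested descriptor-array indices, with nir_imul_imm folding the constant multiplier. The equivalent flattening in plain C:

   /* idx[0] is the innermost deref's index; len[l] is the array length at
    * that level. Mirrors how array_elements grows while walking parents. */
   static unsigned
   flatten_descriptor_index(const unsigned *idx, const unsigned *len,
                            unsigned levels)
   {
      unsigned flat = 0, array_elements = 1;
      for (unsigned l = 0; l < levels; l++) {
         flat += idx[l] * array_elements;
         array_elements *= len[l];
      }
      return flat;
   }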
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa); + nir_def_rewrite_uses(&instr->def, instr->src[0].ssa); nir_instr_remove(&instr->instr); return true; } case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: lower_image_deref(b, instr, state); @@ -907,6 +891,34 @@ lower_pipeline_layout_info(nir_shader *shader, return progress; } +/* This flips gl_PointCoord.y to match Vulkan requirements */ +static bool +lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *result = &intr->def; + result = + nir_vector_insert_imm(b, result, + nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1); + nir_def_rewrite_uses_after(&intr->def, + result, result->parent_instr); + return true; +} + +static bool +v3d_nir_lower_point_coord(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_intrinsics_pass(s, lower_point_coord_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} static void lower_fs_io(nir_shader *nir) @@ -1043,24 +1055,22 @@ pipeline_populate_v3d_key(struct v3d_key *key, p_stage->robustness.storage_buffers == robust_buffer_enabled; key->robust_image_access = p_stage->robustness.images == robust_image_enabled; - - key->environment = V3D_ENVIRONMENT_VULKAN; } /* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the * same. 
For not using prim_mode that is the one already used on v3d */ -static const enum pipe_prim_type vk_to_pipe_prim_type[] = { - [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY, +static const enum mesa_prim vk_to_mesa_prim[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY, }; static const enum pipe_logicop vk_to_pipe_logicop[] = { @@ -1100,11 +1110,22 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; + + key->is_points = (topology == MESA_PRIM_POINTS); + key->is_lines = (topology >= MESA_PRIM_LINES && + topology <= MESA_PRIM_LINE_STRIP); + + if (key->is_points) { + /* This mask represents state for GL_ARB_point_sprite which is not + * relevant to Vulkan. + */ + key->point_sprite_mask = 0; + + /* Vulkan mandates upper left. */ + key->point_coord_upper_left = true; + } - key->is_points = (topology == PIPE_PRIM_POINTS); - key->is_lines = (topology >= PIPE_PRIM_LINES && - topology <= PIPE_PRIM_LINE_STRIP); key->has_gs = has_geometry_shader; const VkPipelineColorBlendStateCreateInfo *cb_info = @@ -1181,16 +1202,6 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, else if (util_format_is_pure_sint(fb_pipe_format)) key->int_color_rb |= 1 << i; } - - if (key->is_points) { - /* This mask represents state for GL_ARB_point_sprite which is not - * relevant to Vulkan. - */ - key->point_sprite_mask = 0; - - /* Vulkan mandates upper left. 
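The v3d_nir_lower_point_coord pass added by this patch rewrites only the y channel of gl_PointCoord, flipping it so the origin matches the upper-left convention Vulkan mandates. In scalar form the lowering amounts to:

   static void
   flip_point_coord_y(float pc[2])
   {
      pc[1] = 1.0f - pc[1];   /* x is left untouched */
   }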
*/ - key->point_coord_upper_left = true; - } } } @@ -1271,11 +1282,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, */ const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; /* FIXME: PRIM_POINTS is not enough, in gallium the full check is - * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ - key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); + * MESA_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ + key->per_vertex_point_size = (topology == MESA_PRIM_POINTS); key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); @@ -1340,8 +1351,10 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } } @@ -1790,7 +1803,7 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); - /* A NIR cach hit doesn't avoid the large majority of pipeline stage + /* A NIR cache hit doesn't avoid the large majority of pipeline stage * creation so the cache hit is not recorded in the pipeline feedback * flags */ @@ -1910,6 +1923,11 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, p_stage_gs != NULL, get_ucp_enable_mask(p_stage_vs)); + if (key.is_points) { + assert(key.point_coord_upper_left); + NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord); + } + VkResult vk_result; pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] = pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key), @@ -1933,7 +1951,7 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - key->topology = vk_to_pipe_prim_type[ia_info->topology]; + key->topology = vk_to_mesa_prim[ia_info->topology]; const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? 
pCreateInfo->pColorBlendState : NULL; @@ -1998,8 +2016,10 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } assert(pipeline->subpass); @@ -2130,19 +2150,19 @@ write_creation_feedback(struct v3dv_pipeline *pipeline, } } -static enum shader_prim +static enum mesa_prim multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return SHADER_PRIM_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return SHADER_PRIM_LINES; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return SHADER_PRIM_TRIANGLES; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINES; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLES; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2151,19 +2171,19 @@ multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) } } -static enum shader_prim +static enum mesa_prim multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return SHADER_PRIM_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return SHADER_PRIM_LINE_STRIP; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return SHADER_PRIM_TRIANGLE_STRIP; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_STRIP; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2232,7 +2252,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, out_layer->data.location = VARYING_SLOT_LAYER; /* Get the view index value that we will write to gl_Layer */ - nir_ssa_def *layer = + nir_def *layer = nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32); /* Emit all output vertices */ @@ -2612,13 +2632,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) return V3DV_DYNAMIC_LINE_WIDTH; case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; - - /* Depth bounds testing is not available in in V3D 4.2 so here we are just - * ignoring this dynamic state. We are already asserting at pipeline creation - * time that depth bounds testing is not enabled. 
- */ case VK_DYNAMIC_STATE_DEPTH_BOUNDS: - return 0; + return V3DV_DYNAMIC_DEPTH_BOUNDS; default: unreachable("Unhandled dynamic state"); @@ -2636,6 +2651,7 @@ pipeline_init_dynamic_state( const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) { /* Initialize to default values */ + const struct v3d_device_info *devinfo = &pipeline->device->devinfo; struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; memset(dynamic, 0, sizeof(*dynamic)); dynamic->stencil_compare_mask.front = ~0; @@ -2643,7 +2659,9 @@ pipeline_init_dynamic_state( dynamic->stencil_write_mask.front = ~0; dynamic->stencil_write_mask.back = ~0; dynamic->line_width = 1.0f; - dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; + dynamic->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; + dynamic->depth_bounds.max = 1.0f; /* Create a mask of enabled dynamic states */ uint32_t dynamic_states = 0; @@ -2665,9 +2683,10 @@ pipeline_init_dynamic_state( pViewportState->viewportCount); for (uint32_t i = 0; i < dynamic->viewport.count; i++) { - v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], - dynamic->viewport.scale[i], - dynamic->viewport.translate[i]); + v3dv_X(pipeline->device, viewport_compute_xform) + (&dynamic->viewport.viewports[i], + dynamic->viewport.scale[i], + dynamic->viewport.translate[i]); } } @@ -2695,6 +2714,11 @@ pipeline_init_dynamic_state( dynamic->stencil_reference.front = pDepthStencilState->front.reference; dynamic->stencil_reference.back = pDepthStencilState->back.reference; } + + if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { + dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; + dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; + } } if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { @@ -2806,62 +2830,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, } } -static bool -pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) -{ - for (uint8_t i = 0; i < pipeline->va_count; i++) { - if (vk_format_is_int(pipeline->va[i].vk_format)) - return true; - } - return false; -} - -/* @pipeline can be NULL. We assume in that case that all the attributes have - * a float format (we only create an all-float BO once and we reuse it with - * all float pipelines), otherwise we look at the actual type of each - * attribute used with the specific pipeline passed in. - */ -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline) -{ - uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; - struct v3dv_bo *bo; - - bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); - - if (!bo) { - fprintf(stderr, "failed to allocate memory for the default " - "attribute values\n"); - return NULL; - } - - bool ok = v3dv_bo_map(device, bo, size); - if (!ok) { - fprintf(stderr, "failed to map default attribute values buffer\n"); - return false; - } - - uint32_t *attrs = bo->map; - uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; - for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - VkFormat attr_format = - pipeline != NULL ? 
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; - if (i < va_count && vk_format_is_int(attr_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); - } - } - - v3dv_bo_unmap(device, bo); - - return bo; -} - static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, const VkPipelineMultisampleStateCreateInfo *ms_info) @@ -2909,7 +2877,7 @@ pipeline_init(struct v3dv_pipeline *pipeline, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - pipeline->topology = vk_to_pipe_prim_type[ia_info->topology]; + pipeline->topology = vk_to_mesa_prim[ia_info->topology]; /* If rasterization is not enabled, various CreateInfo structs must be * ignored. @@ -2964,7 +2932,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that * feature and it shouldn't be used by any pipeline. */ - assert(!ds_info || !ds_info->depthBoundsTestEnable); + assert(device->devinfo.ver >= 71 || + !ds_info || !ds_info->depthBoundsTestEnable); + pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; enable_depth_bias(pipeline, rs_info); @@ -2996,9 +2966,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); - if (pipeline_has_integer_vertex_attrib(pipeline)) { + if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { pipeline->default_attribute_values = - v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); + v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); + if (!pipeline->default_attribute_values) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { @@ -3106,14 +3077,20 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) } static void -lower_cs_shared(struct nir_shader *nir) +lower_compute(struct nir_shader *nir) { if (!nir->info.shared_memory_explicit_layout) { NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_type_info); } + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + + struct nir_lower_compute_system_values_options sysval_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options); } static VkResult @@ -3201,7 +3178,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, v3d_optimize_nir(NULL, p_stage->nir); pipeline_lower_nir(pipeline, p_stage, pipeline->layout); - lower_cs_shared(p_stage->nir); + lower_compute(p_stage->nir); VkResult result = VK_SUCCESS; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c index bafa8d759..3f58940c7 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -542,7 +542,7 @@ shader_variant_create_from_blob(struct v3dv_device *device, if (blob->overrun) return NULL; - uint ulist_data_size = sizeof(uint32_t) * ulist_count; + size_t ulist_data_size = sizeof(uint32_t) * ulist_count; const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size); if (blob->overrun) return NULL; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_private.h b/lib/mesa/src/broadcom/vulkan/v3dv_private.h index 91c1ec2f6..21934d802 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_private.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_private.h @@ -38,6 +38,7 @@ #include "vk_descriptor_update_template.h" #include "vk_device.h" 
+#include "vk_device_memory.h" #include "vk_format.h" #include "vk_instance.h" #include "vk_image.h" @@ -64,6 +65,11 @@ #define VG(x) ((void)0) #endif +#ifdef ANDROID +#include <vndk/hardware_buffer.h> +#include "util/u_gralloc/u_gralloc.h" +#endif + #include "v3dv_limits.h" #include "common/v3d_device_info.h" @@ -123,13 +129,15 @@ struct v3d_simulator_file; /* Minimum required by the Vulkan 1.1 spec */ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) +/* Maximum performance counters number */ +#define V3D_MAX_PERFCNT 93 + struct v3dv_physical_device { struct vk_physical_device vk; char *name; int32_t render_fd; int32_t display_fd; - int32_t master_fd; /* We need these because it is not clear how to detect * valid devids in a portable way @@ -168,7 +176,7 @@ struct v3dv_physical_device { const struct v3d_compiler *compiler; uint32_t next_program_id; - uint64_t heap_used; + alignas(8) uint64_t heap_used; /* This array holds all our 'struct v3dv_bo' allocations. We use this * so we can add a refcount to our BOs and check if a particular BO @@ -197,9 +205,6 @@ struct v3dv_physical_device { } caps; }; -VkResult v3dv_physical_device_acquire_display(struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface); - static inline struct v3dv_bo * v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle) { @@ -222,7 +227,9 @@ void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); bool v3dv_meta_can_use_tlb(struct v3dv_image *image, uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format); struct v3dv_instance { @@ -579,6 +586,9 @@ struct v3dv_device { * being float being float, allowing us to reuse the same BO for all * pipelines matching this requirement. Pipelines that need integer * attributes will create their own BO. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. */ struct v3dv_bo *default_attribute_float; @@ -586,17 +596,12 @@ struct v3dv_device { struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */ #ifdef ANDROID - const void *gralloc; - enum { - V3DV_GRALLOC_UNKNOWN, - V3DV_GRALLOC_CROS, - V3DV_GRALLOC_OTHER, - } gralloc_type; + struct u_gralloc *gralloc; #endif }; struct v3dv_device_memory { - struct vk_object_base base; + struct vk_device_memory vk; struct v3dv_bo *bo; const VkMemoryType *type; @@ -670,6 +675,8 @@ struct v3d_resource_slice { uint32_t offset; uint32_t stride; uint32_t padded_height; + uint32_t width; + uint32_t height; /* Size of a single pane of the slice. For 3D textures, there will be * a number of panes equal to the minified, power-of-two-aligned * depth. @@ -724,9 +731,18 @@ struct v3dv_image { VkFormat vk_format; } planes[V3DV_MAX_PLANE_COUNT]; + /* Used only when sampling a linear texture (which V3D doesn't support). + * This holds a tiled copy of the image we can use for that purpose. 
+ */ + struct v3dv_image *shadow; + #ifdef ANDROID /* Image is backed by VK_ANDROID_native_buffer, */ bool is_native_buffer_memory; + /* Image is backed by VK_ANDROID_external_memory_android_hardware_buffer */ + bool is_ahb; + VkImageDrmFormatModifierExplicitCreateInfoEXT *android_explicit_layout; + VkSubresourceLayout *android_plane_layouts; #endif }; @@ -768,6 +784,8 @@ struct v3dv_image_view { const struct v3dv_format *format; + uint8_t view_swizzle[4]; + uint8_t plane_count; struct { uint8_t image_plane; @@ -778,8 +796,8 @@ struct v3dv_image_view { uint32_t internal_type; uint32_t offset; - /* Precomputed (composed from createinfo->components and formar swizzle) - * swizzles to pass in to the shader key. + /* Precomputed swizzle (composed from the view swizzle and the format + * swizzle). * * This could be also included on the descriptor bo, but the shader state * packet doesn't need it on a bo, so we can just avoid a memory copy @@ -796,6 +814,11 @@ struct v3dv_image_view { */ uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This would represent a view over the tiled shadow image. + */ + struct v3dv_image_view *shadow; }; VkResult v3dv_create_image_view(struct v3dv_device *device, @@ -916,7 +939,7 @@ struct v3dv_framebuffer { uint32_t layers; /* Typically, edge tiles in the framebuffer have padding depending on the - * underlying tiling layout. One consequnce of this is that when the + * underlying tiling layout. One consequence of this is that when the * framebuffer dimensions are not aligned to tile boundaries, tile stores * would still write full tiles on the edges and write to the padded area. * If the framebuffer is aliasing a smaller region of a larger image, then @@ -942,6 +965,7 @@ struct v3dv_frame_tiling { uint32_t layers; uint32_t render_target_count; uint32_t internal_bpp; + uint32_t total_color_bpp; bool msaa; bool double_buffer; uint32_t tile_width; @@ -1036,7 +1060,8 @@ enum v3dv_dynamic_state_bits { V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, - V3DV_DYNAMIC_ALL = (1 << 9) - 1, + V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, + V3DV_DYNAMIC_ALL = (1 << 10) - 1, }; /* Flags for dirty pipeline state. 
@@ -1061,6 +1086,7 @@ enum v3dv_cmd_dirty_bits { V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, + V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, }; struct v3dv_dynamic_state { @@ -1097,6 +1123,11 @@ struct v3dv_dynamic_state { float slope_factor; } depth_bias; + struct { + float min; + float max; + } depth_bounds; + float line_width; uint32_t color_write_enable; @@ -1121,7 +1152,6 @@ enum v3dv_job_type { V3DV_JOB_TYPE_CPU_RESET_QUERIES, V3DV_JOB_TYPE_CPU_END_QUERY, V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, V3DV_JOB_TYPE_CPU_CSD_INDIRECT, V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, }; @@ -1160,20 +1190,6 @@ struct v3dv_submit_sync_info { struct vk_sync_signal *signals; }; -struct v3dv_copy_buffer_to_image_cpu_job_info { - struct v3dv_image *image; - struct v3dv_buffer *buffer; - uint32_t buffer_offset; - uint32_t buffer_stride; - uint32_t buffer_layer_stride; - VkOffset3D image_offset; - VkExtent3D image_extent; - uint32_t mip_level; - uint32_t base_layer; - uint32_t layer_count; - uint8_t plane; -}; - struct v3dv_csd_indirect_cpu_job_info { struct v3dv_buffer *buffer; uint32_t offset; @@ -1192,7 +1208,7 @@ struct v3dv_timestamp_query_cpu_job_info { }; /* Number of perfmons required to handle all supported performance counters */ -#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ +#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ DRM_V3D_MAX_PERF_COUNTERS) struct v3dv_perf_query { @@ -1327,7 +1343,6 @@ struct v3dv_job { struct v3dv_reset_query_cpu_job_info query_reset; struct v3dv_end_query_info query_end; struct v3dv_copy_query_results_cpu_job_info query_copy_results; - struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image; struct v3dv_csd_indirect_cpu_job_info csd_indirect; struct v3dv_timestamp_query_cpu_job_info query_timestamp; } cpu; @@ -1365,6 +1380,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa); bool v3dv_job_type_is_gpu(struct v3dv_job *job); @@ -1482,7 +1498,7 @@ struct v3dv_cmd_buffer_state { /* FIXME: we have just one client-side BO for the push constants, * independently of the stageFlags in vkCmdPushConstants, and the * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage - * tunning in the future if it makes sense. + * tuning in the future if it makes sense. 
*/ uint32_t push_constants_size; uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; @@ -1663,7 +1679,7 @@ struct v3dv_query_pool { /* Only used with performance queries */ struct { uint32_t ncounters; - uint8_t counters[V3D_PERFCNT_NUM]; + uint8_t counters[V3D_MAX_PERFCNT]; /* V3D has a limit on the number of counters we can track in a * single performance monitor, so if too many counters are requested @@ -1799,7 +1815,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, struct drm_v3d_submit_tfu *tfu); -void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, +void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts); void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, @@ -1827,6 +1844,11 @@ bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state void v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, const VkDependencyInfoKHR *info); +bool v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + struct v3dv_event { struct vk_object_base base; @@ -2156,32 +2178,6 @@ struct v3dv_sampler { #define V3DV_NO_SAMPLER_16BIT_IDX 0 #define V3DV_NO_SAMPLER_32BIT_IDX 1 -/* - * Following two methods are using on the combined to/from texture/sampler - * indices maps at v3dv_pipeline. - */ -static inline uint32_t -v3dv_pipeline_combined_index_key_create(uint32_t texture_index, - uint32_t sampler_index) -{ - return texture_index << 24 | sampler_index; -} - -static inline void -v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key, - uint32_t *texture_index, - uint32_t *sampler_index) -{ - uint32_t texture = combined_index_key >> 24; - uint32_t sampler = combined_index_key & 0xffffff; - - if (texture_index) - *texture_index = texture; - - if (sampler_index) - *sampler_index = sampler; -} - struct v3dv_descriptor_maps { struct v3dv_descriptor_map ubo_map; struct v3dv_descriptor_map ssbo_map; @@ -2277,7 +2273,7 @@ struct v3dv_pipeline { } va[MAX_VERTEX_ATTRIBS]; uint32_t va_count; - enum pipe_prim_type topology; + enum mesa_prim topology; struct v3dv_pipeline_shared_data *shared_data; @@ -2285,7 +2281,8 @@ struct v3dv_pipeline { unsigned char sha1[20]; /* In general we can reuse v3dv_device->default_attribute_float, so note - * that the following can be NULL. + * that the following can be NULL. In 7.x this is not used, so it will + * always be NULL. * * FIXME: the content of this BO will be small, so it could be improved to * be uploaded to a common BO. 
But as in most cases it will be NULL, it is @@ -2319,6 +2316,9 @@ struct v3dv_pipeline { bool is_z16; } depth_bias; + /* Depth bounds */ + bool depth_bounds_test_enabled; + struct { void *mem_ctx; struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ @@ -2334,6 +2334,13 @@ struct v3dv_pipeline { uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; +static inline bool +v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) +{ + return device->devinfo.ver > 71 || + (device->devinfo.ver == 71 && device->devinfo.rev >= 5); +} + static inline VkPipelineBindPoint v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) { @@ -2496,10 +2503,6 @@ void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache); -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline); - VkResult v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, nir_shader *nir, @@ -2522,7 +2525,7 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer, VK_OBJECT_TYPE_BUFFER) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView, VK_OBJECT_TYPE_BUFFER_VIEW) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, base, VkDeviceMemory, +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory, VK_OBJECT_TYPE_DEVICE_MEMORY) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool, VK_OBJECT_TYPE_DESCRIPTOR_POOL) @@ -2604,12 +2607,32 @@ u64_compare(const void *key1, const void *key2) case 42: \ v3d_X_thing = &v3d42_##thing; \ break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ default: \ unreachable("Unsupported hardware generation"); \ } \ v3d_X_thing; \ }) +/* Helper to get hw-specific macro values */ +#define V3DV_X(device, thing) ({ \ + __typeof(V3D42_##thing) V3D_X_THING; \ + switch (device->devinfo.ver) { \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to * define v3dX for each version supported, because when we compile code that @@ -2622,16 +2645,26 @@ u64_compare(const void *key1, const void *key2) # define v3dX(x) v3d42_##x # include "v3dvx_private.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dvx_private.h" +# undef v3dX #endif +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info); + #ifdef ANDROID VkResult -v3dv_gralloc_info(struct v3dv_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *out_dmabuf, - int *out_stride, - int *out_size, - uint64_t *out_modifier); +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes); VkResult v3dv_import_native_buffer_fd(VkDevice device_h, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_query.c b/lib/mesa/src/broadcom/vulkan/v3dv_query.c index 216dd1567..d6f93466d 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_query.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_query.c @@ -23,7 +23,6 @@ #include "v3dv_private.h" -#include "common/v3d_performance_counters.h" #include "util/timespec.h" #include "compiler/nir/nir_builder.h" @@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, DRM_IOCTL_V3D_PERFMON_CREATE, &req); if (ret) - fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); + fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); pool->queries[query].perf.kperfmon_ids[i] = req.id; } @@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); assert(pq_info); - assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); pool->perfmon.ncounters = pq_info->counterIndexCount; for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) @@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); struct v3dv_query *q = &pool->queries[query]; - uint64_t counter_values[V3D_PERFCNT_NUM]; + uint64_t counter_values[V3D_MAX_PERFCNT]; for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { struct drm_v3d_perfmon_get_values req = { @@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions) { - uint32_t desc_count = *pCounterCount; + V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, - out, pCounters, pCounterCount); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, - out_desc, pCounterDescriptions, &desc_count); - - for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { - vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { - counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; - counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; - counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; - - unsigned char sha1_result[20]; - _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], - strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), - sha1_result); - - memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); - } - - vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, - &out_desc, desc) { - desc->flags = 0; - snprintf(desc->name, 
sizeof(desc->name), "%s", - v3d_performance_counters[i][V3D_PERFCNT_NAME]); - snprintf(desc->category, sizeof(desc->category), "%s", - v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); - snprintf(desc->description, sizeof(desc->description), "%s", - v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); - } - } - - return vk_outarray_status(&out); + return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, + pCounters, + pCounterDescriptions); } VKAPI_ATTR void VKAPI_CALL @@ -1345,23 +1314,23 @@ v3dv_ReleaseProfilingLockKHR(VkDevice device) static inline void nir_set_query_availability(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *offset, - nir_ssa_def *query_idx, - nir_ssa_def *avail) + nir_def *buf, + nir_def *offset, + nir_def *query_idx, + nir_def *avail) { offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1); } -static inline nir_ssa_def * +static inline nir_def * nir_get_query_availability(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *offset, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *offset, + nir_def *query_idx) { offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ - nir_ssa_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); + nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); return nir_i2i32(b, avail); } @@ -1372,12 +1341,7 @@ get_set_query_availability_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "set query availability cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, @@ -1387,15 +1351,15 @@ get_set_query_availability_cs() * ever change any of these parameters we need to update how we compute the * query index here. 
*/ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *query_idx = + nir_def *query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *avail = + nir_def *avail = nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1); query_idx = nir_iadd(&b, query_idx, wg_id); @@ -1404,33 +1368,33 @@ get_set_query_availability_cs() return b.shader; } -static inline nir_ssa_def * -nir_get_occlusion_counter_offset(nir_builder *b, nir_ssa_def *query_idx) +static inline nir_def * +nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx) { - nir_ssa_def *query_group = nir_udiv_imm(b, query_idx, 16); - nir_ssa_def *query_group_offset = nir_umod_imm(b, query_idx, 16); - nir_ssa_def *offset = - nir_iadd(b, nir_imul(b, query_group, nir_imm_int(b, 1024)), - nir_imul(b, query_group_offset, nir_imm_int(b, 4))); + nir_def *query_group = nir_udiv_imm(b, query_idx, 16); + nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16); + nir_def *offset = + nir_iadd(b, nir_imul_imm(b, query_group, 1024), + nir_imul_imm(b, query_group_offset, 4)); return offset; } static inline void nir_reset_occlusion_counter(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *query_idx) { - nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); - nir_ssa_def *zero = nir_imm_int(b, 0); + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *zero = nir_imm_int(b, 0); nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4); } -static inline nir_ssa_def * +static inline nir_def * nir_read_occlusion_counter(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *query_idx) { - nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4); } @@ -1441,12 +1405,7 @@ get_reset_occlusion_query_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "reset occlusion query cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, @@ -1456,15 +1415,15 @@ get_reset_occlusion_query_cs() * ever change any of these parameters we need to update how we compute the * query index here. 
*/ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); - nir_ssa_def *avail_offset = + nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *base_query_idx = + nir_def *base_query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); nir_set_query_availability(&b, buf, avail_offset, query_idx, nir_imm_intN_t(&b, 0, 8)); @@ -1475,21 +1434,21 @@ static void write_query_buffer(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def **offset, - nir_ssa_def *value, + nir_def *buf, + nir_def **offset, + nir_def *value, bool flag_64bit) { if (flag_64bit) { /* Create a 64-bit value using a vec2 with the .Y component set to 0 * so we can write a 64-bit value in a single store. */ - nir_ssa_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0)); + nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0)); nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8); - *offset = nir_iadd(b, *offset, nir_imm_int(b, 8)); + *offset = nir_iadd_imm(b, *offset, 8); } else { nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4); - *offset = nir_iadd(b, *offset, nir_imm_int(b, 4)); + *offset = nir_iadd_imm(b, *offset, 4); } } @@ -1504,60 +1463,55 @@ get_copy_query_results_cs(VkQueryResultFlags flags) nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy query results cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *buf_out = + nir_def *buf_out = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 1, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); /* Read push constants */ - nir_ssa_def *avail_offset = + nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *base_query_idx = + nir_def *base_query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *base_offset_out = + nir_def *base_offset_out = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4); - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4); /* This assumes a local size of 1 and a horizontal-only dispatch. If we * ever change any of these parameters we need to update how we compute the * query index here. */ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); - nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); /* Read query availability if needed */ - nir_ssa_def *avail = NULL; + nir_def *avail = NULL; if (flag_avail || !flag_partial) avail = nir_get_query_availability(&b, buf, avail_offset, query_idx); /* Write occlusion query result... 
*/ - nir_ssa_def *offset = + nir_def *offset = nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride)); /* ...if partial is requested, we always write */ if(flag_partial) { - nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); + nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); } else { /*...otherwise, we only write if the query is available */ nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0)); - nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); + nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); nir_pop_if(&b, if_stmt); } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c index 9e1bc702f..a0942cf1c 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c @@ -135,7 +135,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, * we handle those in the CPU. */ if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) - v3dv_bo_wait(job->device, info->pool->occlusion.bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, info->pool->occlusion.bo, OS_TIMEOUT_INFINITE); if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { struct vk_sync_wait waits[info->count]; @@ -296,60 +296,6 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue, - struct v3dv_job *job, - struct v3dv_submit_sync_info *sync_info) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); - struct v3dv_copy_buffer_to_image_cpu_job_info *info = - &job->cpu.copy_buffer_to_image; - - /* Wait for all GPU work to finish first, since we may be accessing - * the BOs involved in the operation. 
- */ - VkResult result = queue_wait_idle(queue, sync_info); - if (result != VK_SUCCESS) - return result; - - /* Map BOs */ - struct v3dv_bo *dst_bo = info->image->planes[info->plane].mem->bo; - assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); - if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - void *dst_ptr = dst_bo->map; - - struct v3dv_bo *src_bo = info->buffer->mem->bo; - assert(!src_bo->map || src_bo->map_size == src_bo->size); - if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - void *src_ptr = src_bo->map; - - const struct v3d_resource_slice *slice = - &info->image->planes[info->plane].slices[info->mip_level]; - - const struct pipe_box box = { - info->image_offset.x, info->image_offset.y, info->base_layer, - info->image_extent.width, info->image_extent.height, info->layer_count, - }; - - /* Copy each layer */ - for (uint32_t i = 0; i < info->layer_count; i++) { - const uint32_t dst_offset = - v3dv_layer_offset(info->image, info->mip_level, - info->base_layer + i, info->plane); - const uint32_t src_offset = - info->buffer->mem_offset + info->buffer_offset + - info->buffer_layer_stride * i; - v3d_store_tiled_image( - dst_ptr + dst_offset, slice->stride, - src_ptr + src_offset, info->buffer_stride, - slice->tiling, info->image->planes[info->plane].cpp, slice->padded_height, &box); - } - - return VK_SUCCESS; -} - -static VkResult handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, struct v3dv_submit_sync_info *sync_info) { @@ -392,7 +338,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, /* Make sure the GPU is no longer using the indirect buffer*/ assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); - v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(queue->device, info->buffer->mem->bo, OS_TIMEOUT_INFINITE); /* Map the indirect buffer and read the dispatch parameters */ assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); @@ -408,7 +354,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, if (memcmp(group_counts, info->csd_job->csd.wg_count, sizeof(info->csd_job->csd.wg_count)) != 0) { - v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); + v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); } return VK_SUCCESS; @@ -757,7 +703,7 @@ handle_cl_job(struct v3dv_queue *queue, if (job->tmu_dirty_rcl) submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE; - /* If the job uses VK_KHR_buffer_device_addess we need to ensure all + /* If the job uses VK_KHR_buffer_device_address we need to ensure all * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR * are included. */ @@ -923,7 +869,7 @@ handle_csd_job(struct v3dv_queue *queue, struct drm_v3d_submit_csd *submit = &job->csd.submit; - /* If the job uses VK_KHR_buffer_device_addess we need to ensure all + /* If the job uses VK_KHR_buffer_device_address we need to ensure all * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR * are included. 
*/ @@ -1014,8 +960,6 @@ queue_handle_job(struct v3dv_queue *queue, return handle_end_query_cpu_job(job, counter_pass_idx); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: return handle_copy_query_results_cpu_job(job); - case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: return handle_csd_indirect_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c index f3a98ab7e..098bfb648 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c @@ -87,7 +87,7 @@ push_constants_bo_free(VkDevice _device, * This method checks if the ubo used for push constants is needed to be * updated or not. * - * push contants ubo is only used for push constants accessed by a non-const + * push constants ubo is only used for push constants accessed by a non-const * index. */ static void @@ -288,9 +288,10 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, offset + dynamic_offset); } else { if (content == QUNIFORM_UBO_ADDR) { - /* We reserve index 0 for push constants and artificially increase our - * indices by one for that reason, fix that now before accessing the - * descriptor map. + /* We reserve UBO index 0 for push constants in Vulkan (and for the + * constant buffer in GL) so the compiler always adds one to all UBO + * indices, fix it up before we access the descriptor map, since + * indices start from 0 there. */ assert(index > 0); index--; @@ -497,7 +498,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); struct v3dv_cl_out *uniforms = cl_start(&job->indirect); - for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -519,13 +519,17 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, pipeline, variant->stage); break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); + case QUNIFORM_VIEWPORT_X_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); + case QUNIFORM_VIEWPORT_Y_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); break; + } case QUNIFORM_VIEWPORT_Z_OFFSET: { float translate_z; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c index 5efb1ea95..404a64d0e 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c @@ -24,8 +24,6 @@ */ #include "v3dv_private.h" -#include "drm-uapi/drm_fourcc.h" -#include "wsi_common_entrypoints.h" #include "vk_util.h" #include "wsi_common.h" #include "wsi_common_drm.h" @@ -41,19 +39,7 @@ static bool v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice); - - /* There are some instances with direct display extensions where this may be - * called before we have ever tried to create a swapchain, and therefore, - * before we have ever tried to acquire the display device, in which case we - * have to 
do it now. - */ - if (unlikely(pdevice->display_fd < 0 && pdevice->master_fd >= 0)) { - VkResult result = - v3dv_physical_device_acquire_display(pdevice, NULL); - if (result != VK_SUCCESS) - return false; - } - + assert(pdevice->display_fd != -1); return wsi_common_drm_devices_equal(fd, pdevice->display_fd); } @@ -66,7 +52,7 @@ v3dv_wsi_init(struct v3dv_physical_device *physical_device) v3dv_physical_device_to_handle(physical_device), v3dv_wsi_proc_addr, &physical_device->vk.instance->alloc, - physical_device->master_fd, NULL, + physical_device->display_fd, NULL, &(struct wsi_device_options){.sw_device = false}); if (result != VK_SUCCESS) @@ -89,67 +75,6 @@ v3dv_wsi_finish(struct v3dv_physical_device *physical_device) &physical_device->vk.instance->alloc); } -static void -constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps) -{ - /* Our display pipeline requires that images are linear, so we cannot - * ensure that our swapchain images can be sampled. If we are running under - * a compositor in windowed mode, the DRM modifier negotiation should - * probably end up selecting an UIF layout for the swapchain images but it - * may still choose linear and send images directly for scanout if the - * surface is in fullscreen mode for example. If we are not running under - * a compositor, then we would always need them to be linear anyway. - */ - caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) -{ - VkResult result; - result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, - surface, - pSurfaceCapabilities); - constraint_surface_capabilities(pSurfaceCapabilities); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - VkSurfaceCapabilities2KHR* pSurfaceCapabilities) -{ - VkResult result; - result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, - pSurfaceInfo, - pSurfaceCapabilities); - constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSwapchainKHR( - VkDevice _device, - const VkSwapchainCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSwapchainKHR* pSwapchain) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_physical_device *pdevice = device->pdevice; - - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); - VkResult result = - v3dv_physical_device_acquire_display(pdevice, surface); - if (result != VK_SUCCESS) - return result; - - return wsi_CreateSwapchainKHR(_device, pCreateInfo, pAllocator, pSwapchain); -} - struct v3dv_image * v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) { diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c index 0c23a33b5..011f5c8e1 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) }; config.width_in_pixels = tiling->width; config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; 
config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + unreachable("HW generation 71 not supported yet."); +#endif uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); @@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { config.width_in_pixels = tiling->width; config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif } /* There's definitely nothing in the VCD cache we want. */ @@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, iview->vk.base_array_layer + layer, image_plane); + /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it + * is broken in earlier V3D versions. + */ + assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); @@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags aspects = vk_format_aspects(ds_attachment->desc.format); +#if V3D_VERSION <= 42 + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + /* Only clear once on the first subpass that uses the attachment */ uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
ds_attachment->first_subpass : @@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->desc.stencilLoadOp, subpass->do_stencil_clear_with_draw); + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); +#endif +#if V3D_VERSION >= 71 + /* The store command's clear buffer bit cannot be used for Z/S stencil: + * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, + * so we don't want to emit redundant clears here. + */ + use_global_zs_clear = false; +#endif + /* Skip the last store if it is not required */ uint32_t ds_last_subpass = !pass->multiview_enabled ? ds_attachment->last_subpass : @@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, needs_stencil_store = subpass->resolve_stencil; } - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. - */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); if (needs_depth_store || needs_stencil_store) { const uint32_t zs_buffer = v3dv_zs_buffer(needs_depth_store, needs_stencil_store); @@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, * bit and instead we have to emit a single clear of all tile buffers. */ if (use_global_zs_clear || use_global_rt_clear) { +#if V3D_VERSION == 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = use_global_zs_clear; clear.clear_all_render_targets = use_global_rt_clear; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif } } @@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, } } +/* Note that for v71, render target cfg packets have just one field that + * combines the internal type and the clamp mode. For simplicity we keep + * just one helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)". + * + * FIXME: for v71 we are not returning all the possible combinations for + * render target internal type and clamp. For example for int types we are + * always using clamp int, and for 16f we are using clamp none or pos (that + * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. 
In + * summary right now we are just porting what we were doing on 4.2 + */ +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format) +{ +#if V3D_VERSION == 42 + if (vk_format_is_int(vk_format)) + return V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(vk_format)) + return V3D_RENDER_TARGET_CLAMP_NORM; + else + return V3D_RENDER_TARGET_CLAMP_NONE; +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return vk_format_is_srgb(vk_format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif +} + +static void +cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, +#if V3D_VERSION == 42 + uint32_t *rt_type, + uint32_t *rt_clamp) +#else + uint32_t *rt_type_clamp) +#endif +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + assert(attachment_idx < state->framebuffer->attachment_count && + attachment_idx < state->attachment_alloc_count); + struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; + assert(vk_format_is_color(iview->vk.format)); + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +#if V3D_VERSION == 42 + *rt_type = iview->planes[0].internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +#if V3D_VERSION >= 71 + *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +} + void v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) config.number_of_render_targets = MAX2(subpass->color_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is * general, so also applies to GL). We would need to expand * gen_pack_header for that. 
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *iview = @@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) * Early-Z/S clearing is independent of Early Z/S testing, so it is * possible to enable one but not the other so long as their * respective requirements are met. + * + * From V3D 4.5.6, Z/S buffers are always cleared automatically + * between tiles, but we still want to enable early ZS clears + * when Z/S are not loaded or stored. */ struct v3dv_render_pass_attachment *ds_attachment = &pass->attachments[ds_attachment_idx]; @@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const VkImageAspectFlags ds_aspects = vk_format_aspects(ds_attachment->desc.format); - bool needs_depth_clear = - check_needs_clear(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp, - subpass->do_depth_clear_with_draw); - bool needs_depth_store = v3dv_cmd_buffer_check_needs_store(state, ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ds_attachment->last_subpass, ds_attachment->desc.storeOp) || subpass->resolve_depth; +#if V3D_VERSION <= 42 + bool needs_depth_clear = + check_needs_clear(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + subpass->do_depth_clear_with_draw); do_early_zs_clear = needs_depth_clear && !needs_depth_store; +#endif +#if V3D_VERSION >= 71 + bool needs_depth_load = + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + do_early_zs_clear = !needs_depth_load && !needs_depth_store; +#endif + if (do_early_zs_clear && vk_format_has_stencil(ds_attachment->desc.format)) { bool needs_stencil_load = @@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) */ job->early_zs_clear = do_early_zs_clear; +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; +#endif for (uint32_t i = 0; i < subpass->color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + if (attachment_idx == VK_ATTACHMENT_UNUSED) { +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; @@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const struct v3d_resource_slice *slice = &image->planes[plane].slices[iview->vk.base_mip_level]; - const uint32_t *clear_color = + UNUSED const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; @@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = clear_color[0]; clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; @@ -960,22 
+1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
            clear.render_target_number = i;
         };
      }
+#endif
+
+#if V3D_VERSION >= 71
+      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+         rt.clear_color_low_bits = clear_color[0];
+         cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+                                                    &rt.internal_type_and_clamping);
+         rt.stride =
+            v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+                                                   v3d_internal_bpp_words(rt.internal_bpp));
+         rt.base_address = base_addr;
+         rt.render_target_number = i;
+
+         /* base_addr in multiples of 512 bits. We divide by 8 because stride
+          * is in 128-bit units, but it is packing 2 rows worth of data, so we
+          * need to divide it by 2 so it is only 1 row, and then again by 4 so
+          * it is in 512-bit units.
+          */
+         base_addr += (tiling->tile_height * rt.stride) / 8;
+      }
+
+      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+            rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+               ((uint64_t) clear_color[1]) |
+               (((uint64_t) (clear_color[2] & 0xff)) << 32);
+            rt.render_target_number = i;
+         }
+      }
+
+      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+            rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+               (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+               (((uint64_t) (clear_color[3])) << 24);
+            rt.render_target_number = i;
+         }
+      }
+#endif
+   }
+
+#if V3D_VERSION >= 71
+   /* If we don't have any color RTs, we still need to emit one and flag
+    * it as unused by setting stride = 1.
+    */
+   if (subpass->color_count == 0) {
+      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+         rt.stride = 1;
+      }
+   }
+#endif
+#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
          &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
          &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
          &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
          &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
   }
+#endif
   /* Ends rendering mode config.
 */
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
         }
         if (cmd_buffer->state.tile_aligned_render_area &&
             (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
            cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
               clear.clear_z_stencil_buffer = !job->early_zs_clear;
               clear.clear_all_render_targets = true;
            }
+#endif
+#if V3D_VERSION >= 71
+           cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
         }
         cl_emit(rcl, END_OF_TILE_MARKER, end);
      }
@@ -1055,6 +1286,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}

 void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+                             float scale[3],
+                             float translate[3])
+{
+   float x = viewport->x;
+   float y = viewport->y;
+   float half_width = 0.5f * viewport->width;
+   float half_height = 0.5f * viewport->height;
+   double n = viewport->minDepth;
+   double f = viewport->maxDepth;
+
+   scale[0] = half_width;
+   translate[0] = half_width + x;
+   scale[1] = half_height;
+   translate[1] = half_height + y;
+
+   scale[2] = (f - n);
+   translate[2] = n;
+
+   /* It seems that if the scale is small enough the hardware won't clip
+    * correctly, so we work around this by choosing the smallest scale that
+    * seems to work.
+    *
+    * This case is exercised by CTS:
+    * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+    *
+    * V3D 7.x fixes this by using the new
+    * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+    */
+#if V3D_VERSION <= 42
+   const float min_abs_scale = 0.0005f;
+   if (fabs(scale[2]) < min_abs_scale)
+      scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
+
+void
 v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
 {
   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
   v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
   v3dv_return_if_oom(cmd_buffer, NULL);

+#if V3D_VERSION == 42
   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
      clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
      clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
   }
+#endif
+#if V3D_VERSION >= 71
+   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+      clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+      clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+   }
+#endif

   float translate_z, scale_z;
   v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
                                              &translate_z, &scale_z);

+#if V3D_VERSION == 42
   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
      clip.viewport_z_offset_zc_to_zs = translate_z;
      clip.viewport_z_scale_zc_to_zs = scale_z;
   }
+#endif
+
+#if V3D_VERSION >= 71
+   /* If the Z scale is too small guardband clipping may not clip correctly */
+   if (fabsf(scale_z) < 0.01f) {
+      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+         clip.viewport_z_offset_zc_to_zs = translate_z;
+         clip.viewport_z_scale_zc_to_zs = scale_z;
+      }
+   } else {
+      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+         clip.viewport_z_offset_zc_to_zs = translate_z;
+         clip.viewport_z_scale_zc_to_zs = scale_z;
+      }
+   }
+#endif
+
   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
      /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
       * we are using OpenGL's [-1, 1] instead.
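As a sanity check of the transform above, a standalone sketch (plain C, not driver code) that applies the same math to a common full-screen viewport:

#include <assert.h>
#include <stdio.h>

/* Mirrors the viewport transform above for x=0, y=0, width=1920,
 * height=1080, minDepth=0, maxDepth=1. */
int main(void)
{
   float x = 0.0f, y = 0.0f, w = 1920.0f, h = 1080.0f;
   float n = 0.0f, f = 1.0f;
   float scale[3], translate[3];

   scale[0] = 0.5f * w;          /* 960: maps NDC x in [-1,1] to pixels */
   translate[0] = 0.5f * w + x;  /* 960 */
   scale[1] = 0.5f * h;          /* 540 */
   translate[1] = 0.5f * h + y;  /* 540 */
   scale[2] = f - n;             /* 1: maps NDC z to [minDepth,maxDepth] */
   translate[2] = n;             /* 0 */

   /* NDC (-1,-1) lands on the viewport origin, (+1,+1) on its far corner */
   assert(-1.0f * scale[0] + translate[0] == 0.0f);
   assert( 1.0f * scale[0] + translate[0] == w);
   assert( 1.0f * scale[1] + translate[1] == h);
   printf("scale=(%g,%g,%g) translate=(%g,%g,%g)\n",
          scale[0], scale[1], scale[2],
          translate[0], translate[1], translate[2]);
   return 0;
}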
@@ -1103,8 +1397,28 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
   }

   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
-      vp.viewport_centre_x_coordinate = vptranslate[0];
-      vp.viewport_centre_y_coordinate = vptranslate[1];
+      float vp_fine_x = vptranslate[0];
+      float vp_fine_y = vptranslate[1];
+      int32_t vp_coarse_x = 0;
+      int32_t vp_coarse_y = 0;
+
+      /* The fine coordinates must be unsigned, but the coarse ones can be signed */
+      if (unlikely(vp_fine_x < 0)) {
+         int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
+         vp_fine_x += 64.0f * blocks_64;
+         vp_coarse_x -= blocks_64;
+      }
+
+      if (unlikely(vp_fine_y < 0)) {
+         int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
+         vp_fine_y += 64.0f * blocks_64;
+         vp_coarse_y -= blocks_64;
+      }
+
+      vp.fine_x = vp_fine_x;
+      vp.fine_y = vp_fine_y;
+      vp.coarse_x = vp_coarse_x;
+      vp.coarse_y = vp_coarse_y;
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
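The fine/coarse split in the VIEWPORT_OFFSET packet above can be verified with a quick standalone calculation; ceilf stands in for Mesa's DIV_ROUND_UP macro here:

#include <assert.h>
#include <math.h>
#include <stdint.h>

/* The hardware's fine offset must be non-negative, so for a negative
 * viewport centre we borrow whole 64-pixel blocks from the signed coarse
 * offset, exactly as the emit code above does. */
int main(void)
{
   float vp_fine_x = -100.5f;
   int32_t vp_coarse_x = 0;

   if (vp_fine_x < 0) {
      int32_t blocks_64 = (int32_t)ceilf(fabsf(vp_fine_x) / 64.0f); /* 2 */
      vp_fine_x += 64.0f * blocks_64;  /* -100.5 + 128 = 27.5 */
      vp_coarse_x -= blocks_64;        /* -2 */
   }

   assert(vp_fine_x >= 0.0f);
   /* Recombining recovers the original centre: -2 * 64 + 27.5 = -100.5 */
   assert(vp_coarse_x * 64.0f + vp_fine_x == -100.5f);
   return 0;
}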
@@ -1185,8 +1499,10 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
   cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
      bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
      bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+#if V3D_VERSION <= 42
      if (pipeline->depth_bias.is_z16)
         bias.depth_offset_units *= 256.0f;
+#endif
      bias.limit = dynamic->depth_bias.depth_bias_clamp;
   }
@@ -1194,6 +1510,38 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
}

 void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* No depthBounds support for v42, so this method is empty in that case.
+    *
+    * Note that this method is being called as v3dv_job_init flags all state
+    * as dirty. See FIXME note in v3dv_job_init.
+    */
+
+#if V3D_VERSION >= 71
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+   assert(pipeline);
+
+   if (!pipeline->depth_bounds_test_enabled)
+      return;
+
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
+   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+   cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+      bounds.lower_test_limit = dynamic->depth_bounds.min;
+      bounds.upper_test_limit = dynamic->depth_bounds.max;
+   }
+
+   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+#endif
+}
+
+void
 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
 {
   struct v3dv_job *job = cmd_buffer->state.job;
@@ -1236,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   assert(pipeline);

+   const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+   const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
   const uint32_t blend_packets_size =
      cl_packet_length(BLEND_ENABLES) +
      cl_packet_length(BLEND_CONSTANT_COLOR) +
-      cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+      cl_packet_length(BLEND_CFG) * max_color_rts;

   v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
   v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1251,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
      }
   }

-   for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+   for (uint32_t i = 0; i < max_color_rts; i++) {
      if (pipeline->blend.enables & (1 << i))
         cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
   }
@@ -1278,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;

+   uint32_t color_write_mask = ~dynamic->color_write_enable |
+                               pipeline->blend.color_write_masks;
+#if V3D_VERSION <= 42
+   /* V3D 4.2 supports only 4 render targets, with 4 mask bits each */
+   color_write_mask &= 0xffff;
+#endif
+
   cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
-      mask.mask = (~dynamic->color_write_enable |
-                   pipeline->blend.color_write_masks) & 0xffff;
+      mask.mask = color_write_mask;
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
@@ -1571,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   assert(pipeline);

-   bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
   v3dv_return_if_oom(cmd_buffer, NULL);

   cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+      bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
      config.early_z_enable = enable_ez;
      config.early_z_updates_enable = config.early_z_enable &&
         pipeline->z_updates_enable;
+#endif
   }
}
@@ -1825,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
         gs_bin->prog_data.gs->base.threads == 4;
      shader.geometry_bin_mode_shader_start_in_final_thread_section =
         gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
      shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
      shader.geometry_bin_mode_shader_uniforms_address =
         gs_bin_uniforms;
@@ -1835,21 +2195,23 @@ emit_gs_shader_state_record(struct v3dv_job *job,
         gs->prog_data.gs->base.threads == 4;
      shader.geometry_render_mode_shader_start_in_final_thread_section =
         gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
      shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
      shader.geometry_render_mode_shader_uniforms_address =
         gs_render_uniforms;
   }
}

 static uint8_t
-v3d_gs_output_primitive(enum shader_prim prim_type)
+v3d_gs_output_primitive(enum mesa_prim prim_type)
 {
   switch (prim_type) {
-   case SHADER_PRIM_POINTS:
+   case MESA_PRIM_POINTS:
      return GEOMETRY_SHADER_POINTS;
-   case SHADER_PRIM_LINE_STRIP:
+   case MESA_PRIM_LINE_STRIP:
      return GEOMETRY_SHADER_LINE_STRIP;
-   case SHADER_PRIM_TRIANGLE_STRIP:
+   case MESA_PRIM_TRIANGLE_STRIP:
      return GEOMETRY_SHADER_TRI_STRIP;
   default:
      unreachable("Unsupported primitive type");
@@ -2011,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
                              pipeline->vpm_cfg.Gv);
   }

+#if V3D_VERSION == 42
   struct v3dv_bo *default_attribute_values =
      pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values : pipeline->device->default_attribute_float; +#endif cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -2040,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; +#if V3D_VERSION == 42 shader.address_of_default_attribute_values = v3dv_cl_address(default_attribute_values, 0); +#endif shader.any_shader_reads_hardware_written_primitive_id = (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; @@ -2350,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, buffer->mem_offset + offset); } } - -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - if (rt >= subpass->color_count) - return; - - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; - - assert(attachment_idx < state->framebuffer->attachment_count && - attachment_idx < state->attachment_alloc_count); - struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; - assert(vk_format_is_color(iview->vk.format)); - - assert(iview->plane_count == 1); - *rt_bpp = iview->planes[0].internal_bpp; - *rt_type = iview->planes[0].internal_type; - if (vk_format_is_int(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; -} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_device.c b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c index e23598386..1b50d51e1 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_device.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c @@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = { [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, }; - static union pipe_color_union encode_border_color( + const struct v3dv_device *device, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { const struct util_format_description *desc = @@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color( * colors so we need to fix up the swizzle manually for this case. */ uint8_t swizzle[4]; - if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + const bool v3d_has_reverse_swap_rb_bits = + v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); + if (!v3d_has_reverse_swap_rb_bits && + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { swizzle[0] = PIPE_SWIZZLE_W; swizzle[1] = PIPE_SWIZZLE_X; swizzle[2] = PIPE_SWIZZLE_Y; swizzle[3] = PIPE_SWIZZLE_Z; + } + /* In v3d 7.x we no longer have a reverse flag for the border color. Instead + * we have to use the new reverse and swap_r/b flags in the texture shader + * state which will apply the format swizzle automatically when sampling + * the border color too and we should not apply it manually here. 
+ */ + else if (v3d_has_reverse_swap_rb_bits && + (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { + swizzle[0] = PIPE_SWIZZLE_X; + swizzle[1] = PIPE_SWIZZLE_Y; + swizzle[2] = PIPE_SWIZZLE_Z; + swizzle[3] = PIPE_SWIZZLE_W; } else { memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); } @@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color( (1 << (desc->channel[i].size - 1)) - 1); } - /* convert from float to expected format */ +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. + */ if (vk_format_is_srgb(bc_info->format) || vk_format_is_compressed(bc_info->format)) { for (int i = 0; i < 4; i++) @@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color( } } } +#endif return border; } void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { @@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, s.border_color_mode = border_color_mode; if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { - union pipe_color_union border = encode_border_color(bc_info); + union pipe_color_union border = encode_border_color(device, bc_info); s.border_color_word_0 = border.ui[0]; s.border_color_word_1 = border.ui[1]; @@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_framebuffer *framebuffer, const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, bool *msaa) { STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); - *max_bpp = V3D_INTERNAL_BPP_32; + *max_internal_bpp = V3D_INTERNAL_BPP_32; + *total_color_bpp = 0; *msaa = false; if (subpass) { @@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } - return; } @@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c index 45a1cf65b..2392e8367 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c @@ -155,6 +155,7 @@ static const struct 
v3dv_format format_table[] = {
   FORMAT(A8B8G8R8_SRGB_PACK32,    SRGB8_ALPHA8, RGBA8,   SWIZ_XYZW, 16, true),  /* RGBA8 sRGB */
   FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2,     RGB10_A2, SWIZ_XYZW, 16, true),
   FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI,   RGB10_A2UI, SWIZ_XYZW, 16, false),
+   FORMAT(A2R10G10B10_UNORM_PACK32,RGB10_A2,    RGB10_A2, SWIZ_ZYXW, 16, true),
   FORMAT(E5B9G9R9_UFLOAT_PACK32,  NO,           RGB9_E5,  SWIZ_XYZ1, 16, true),
   FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true),
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_image.c b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfd..de984e812 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
      tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
      tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);

-      tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
      tex.texture_type = image_view->format->planes[plane].tex_type;

      if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
         tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;

-         tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
-
         /* At this point we don't have the job. That's the reason the first
          * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
          * add the bo to the job. This also means that we need to add manually
@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
            v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, iplane);
         tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+         bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+         /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+          * the reverse and/or swap_r/b swizzle from the format table with the
+          * image view swizzle. This, however, doesn't work for border colors;
+          * for those there is the reverse_standard_border_color.
+          *
+          * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+          * since the reverse and swap_r/b bits also affect border colors. It is
+          * because of this that we absolutely need to use these bits with
+          * reversed and swapped formats, since that's the only way to ensure
+          * correct border colors. In that case we don't want to program the
+          * swizzle to the composition of the format swizzle and the view
+          * swizzle like we do in v3d 4.x, since the format swizzle is applied
+          * via the reverse and swap_r/b bits.
+          */
+#if V3D_VERSION == 42
+         tex.srgb = is_srgb;
+         tex.reverse_standard_border_color =
+            image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+         tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+         tex.reverse = image_view->planes[plane].channel_reverse;
+         tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+         if (tex.reverse || tex.r_b_swap) {
+            tex.swizzle_r =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+            tex.swizzle_g =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+            tex.swizzle_b =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+            tex.swizzle_a =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+         }
+
+         tex.chroma_offset_x = 1;
+         tex.chroma_offset_y = 1;
+         /* See comment in XML field definition for rationale of the shifts */
+         tex.texture_base_pointer_cb = base_offset >> 6;
+         tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
      }
   }
}
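As an aside, a minimal sketch of the swizzle composition the v4.x path described above relies on; compose_swizzle is an illustrative stand-in, not the driver's actual helper:

enum swz { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

/* out[i] = fmt[view[i]]: route each channel the view selects through the
 * format swizzle first. A BGRA-style format swizzle (Z,Y,X,W) composed
 * with an identity view swizzle still yields (Z,Y,X,W), and constant
 * selects (0/1) pass through unchanged. */
static void
compose_swizzle(const enum swz fmt[4], const enum swz view[4], enum swz out[4])
{
   for (int i = 0; i < 4; i++)
      out[i] = view[i] <= SWZ_W ? fmt[view[i]] : view[i];
}

On v7.x the format half of this composition is expressed through tex.reverse and tex.r_b_swap instead, which is why the code above falls back to the raw view swizzle whenever either bit is set.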
@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
      assert(buffer_view->format->plane_count == 1);
      tex.texture_type = buffer_view->format->planes[0].tex_type;
-      tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+
+      bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+      tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+      tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif

      /* At this point we don't have the job. That's the reason the first
       * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
         buffer_view->offset;

      tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+      tex.chroma_offset_x = 1;
+      tex.chroma_offset_y = 1;
+      /* See comment in XML field definition for rationale of the shifts */
+      tex.texture_base_pointer_cb = base_offset >> 6;
+      tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
   }
}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82c..858096f9e 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@

 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
 #include "broadcom/cle/v3dx_pack.h"
 #include "broadcom/compiler/v3d_compiler.h"
@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job,
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = log2_tile_size(tiling->tile_width);
+      config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally we would like the next assert on the packet header (as
+       * it is general, so it also applies to GL). We would need to expand
+       * gen_pack_header for that.
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif config.internal_depth_type = fb->internal_depth_type; } + const uint32_t *color = NULL; if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (clear_info->image) { const struct v3dv_image *image = clear_info->image; @@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, } } - const uint32_t *color = &clear_info->clear_value->color[0]; + color = &clear_info->clear_value->color[0]; + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = color[0]; clear.clear_color_next_24_bits = color[1] & 0x00ffffff; @@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, clear.render_target_number = 0; }; } +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = tiling->internal_bpp; rt.render_target_0_internal_type = fb->internal_type; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + if (color) + rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = 0; + rt.render_target_number = 0; + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) color[1]) | + (((uint64_t) (color[2] & 0xff)) << 32); + rt.render_target_number = 0; + } + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (color[3])) << 24); + rt.render_target_number = 0; + } + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = clear_info ? 
clear_info->clear_value->z : 1.0f; @@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job, */ if (clear_value && (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = true; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, tfu.iia |= src_offset; +#if V3D_VERSION <= 42 if (src_tiling == V3D_TILING_RASTER) { tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; } else { @@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, V3D33_TFU_ICFG_FORMAT_SHIFT; } tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; +#endif +#if V3D_VERSION >= 71 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; + } else { + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; +#endif tfu.ioa = dst_offset; +#if V3D_VERSION <= 42 tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + (dst_tiling - V3D_TILING_LINEARTILE)) << V3D33_TFU_IOA_FORMAT_SHIFT; +#endif + +#if V3D_VERSION >= 71 + tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT; + + switch (dst_tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.v71.ioc |= + (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + case V3D_TILING_RASTER: + tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + default: + break; + } +#endif switch (src_tiling) { case V3D_TILING_UIF_NO_XOR: @@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, /* The TFU can handle raster sources but always produces UIF results */ assert(dst_tiling != V3D_TILING_RASTER); +#if V3D_VERSION <= 42 /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). 
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
            uif_block_h;
      tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
   }
+#endif

   v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t width, height;
   framebuffer_size_for_pixel_count(num_items, &width, &height);

-   v3dv_job_start_frame(job, width, height, 1, true, true,
-                        1, internal_bpp, false);
+   v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t width, height;
   framebuffer_size_for_pixel_count(num_items, &width, &height);

-   v3dv_job_start_frame(job, width, height, 1, true, true,
-                        1, internal_bpp, false);
+   v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
index 45aec2623..ad22add15 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -223,14 +223,49 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
         config.depth_test_function = VK_COMPARE_OP_ALWAYS;
      }

-      /* EZ state will be updated at draw time based on bound pipeline state */
-      config.early_z_updates_enable = false;
-      config.early_z_enable = false;
-
      config.stencil_enable =
         ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;

      pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+      /* From the Vulkan spec:
+       *
+       *   "depthClampEnable controls whether to clamp the fragment’s depth
+       *    values as described in Depth Test. If the pipeline is not created
+       *    with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+       *    then enabling depth clamp will also disable clipping primitives to
+       *    the z planes of the frustum as described in Primitive Clipping.
+       *    Otherwise depth clipping is controlled by the state set in
+       *    VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+       *
+       * Note: neither depth clamping nor VK_EXT_depth_clip_enable is actually
+       * supported in the driver yet, so in practice we are always enabling Z
+       * clipping for now.
+       */
+      bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+      bool z_clip_enable = false;
+      const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+         ds_info ? vk_find_struct_const(ds_info->pNext,
+                                        PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+                   NULL;
+      if (clip_info)
+         z_clip_enable = clip_info->depthClipEnable;
+      else if (!z_clamp_enable)
+         z_clip_enable = true;
+
+      if (z_clip_enable) {
+         config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; + } else { + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; + } + + config.z_clamp_mode = z_clamp_enable; + + config.depth_bounds_test_enable = + ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; +#endif }; } @@ -364,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, static void pack_shader_state_record(struct v3dv_pipeline *pipeline) { - assert(sizeof(pipeline->shader_state_record) == + assert(sizeof(pipeline->shader_state_record) >= cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = @@ -388,7 +423,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) if (!pipeline->has_gs) { shader.point_size_in_shaded_vertex_data = - pipeline->topology == PIPE_PRIM_POINTS; + pipeline->topology == MESA_PRIM_POINTS; } else { struct v3d_gs_prog_data *prog_data_gs = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs; @@ -439,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.number_of_varyings_in_fragment_shader = prog_data_fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - /* Note: see previous note about addresses */ /* shader.coordinate_shader_code_address */ /* shader.vertex_shader_code_address */ /* shader.fragment_shader_code_address */ +#if V3D_VERSION == 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* FIXME: Use combined input/output size flag in the common case (also * on v3d, see v3dx_draw). */ @@ -455,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) prog_data_vs_bin->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = prog_data_vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = prog_data_vs_bin->separate_segments ? prog_data_vs_bin->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = prog_data_vs->separate_segments ? prog_data_vs->vpm_input_size : 1; +#endif + + /* On V3D 7.1 there isn't a specific flag to set if we are using + * shared/separate segments or not. We just set the value of + * vpm_input_size to 0, and set output to the max needed. That should be + * already properly set on prog_data_vs_bin + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = prog_data_vs_bin->vpm_output_size; @@ -663,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, } } } + +#if V3D_VERSION == 42 +static bool +pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +{ + for (uint8_t i = 0; i < pipeline->va_count; i++) { + if (vk_format_is_int(pipeline->va[i].vk_format)) + return true; + } + return false; +} +#endif + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); +#endif + + return false; +} + +/* @pipeline can be NULL. In that case we assume the most common case. 
For
+ * example, for v42 we assume that all the attributes have a
+ * float format (we only create an all-float BO once and we reuse it with all
+ * float pipelines); otherwise we look at the actual type of each attribute
+ * used with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+                                      struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+   return NULL;
+#endif
+
+   uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+   struct v3dv_bo *bo;
+
+   bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+   if (!bo) {
+      fprintf(stderr, "failed to allocate memory for the default "
+              "attribute values\n");
+      return NULL;
+   }
+
+   bool ok = v3dv_bo_map(device, bo, size);
+   if (!ok) {
+      fprintf(stderr, "failed to map default attribute values buffer\n");
+      return NULL;
+   }
+
+   uint32_t *attrs = bo->map;
+   uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+   for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+      attrs[i * 4 + 0] = 0;
+      attrs[i * 4 + 1] = 0;
+      attrs[i * 4 + 2] = 0;
+      VkFormat attr_format =
+         pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+      if (i < va_count && vk_format_is_int(attr_format)) {
+         attrs[i * 4 + 3] = 1;
+      } else {
+         attrs[i * 4 + 3] = fui(1.0);
+      }
+   }
+
+   v3dv_bo_unmap(device, bo);
+
+   return bo;
+}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_private.h b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
index c693952d0..0f5887eab 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
@@ -55,6 +55,9 @@ void
 v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);

 void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
+void
 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);

 void
@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
                         uint32_t internal_size,
                         uint32_t *hw_color);

-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
-                                                 int rt,
-                                                 uint32_t *rt_bpp,
-                                                 uint32_t *rt_type,
-                                                 uint32_t *rt_clamp);
-
 /* Used at v3dv_device */

 void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+                         struct v3dv_sampler *sampler,
                          const VkSamplerCreateInfo *pCreateInfo,
                          const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
@@ -143,7 +140,9 @@ void
 v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
                                             const struct v3dv_cmd_buffer_attachment_state *attachments,
                                             const struct v3dv_subpass *subpass,
-                                            uint8_t *max_bpp, bool *msaa);
+                                            uint8_t *max_internal_bpp,
+                                            uint8_t *total_color_bpp,
+                                            bool *msaa);

 #ifdef DEBUG
 void
@@ -165,6 +164,10 @@ v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format);
 bool
 v3dX(format_supports_blending)(const struct v3dv_format *format);

+/* FIXME: tex_format should be `enum V3DX(Texture_Data_Formats)`, but using
+ * that enum type in the header requires including v3dx_pack.h, which triggers
+ * circular include dependency issues, so we're using a `uint32_t` for now.
+ */
 bool
 v3dX(tfu_supports_tex_format)(uint32_t tex_format);
@@ -309,10 +312,24 @@ void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
                                  const VkPipelineVertexInputStateCreateInfo *vi_info,
                                  const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+                                      struct v3dv_pipeline *pipeline);
+
 /* Used at v3dv_queue */
 void
 v3dX(job_emit_noop)(struct v3dv_job *job);

+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+                                           VkPerformanceCounterKHR *pCounters,
+                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
 /* Used at v3dv_descriptor_set, and other descriptor set utils */
 uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
@@ -321,3 +338,17 @@ uint32_t v3dX(max_descriptor_bo_size)(void);

 uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
 uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+                                VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+                             float scale[3],
+                             float translate[3]);
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_query.c b/lib/mesa/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 000000000..e59a1e84f
--- /dev/null
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+                                           VkPerformanceCounterKHR *pCounters,
+                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+                          out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+                          out_desc, pCounterDescriptions, &desc_count);
+
+   for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+         counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+                            strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+                            sha1_result);
+
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+                               &out_desc, desc) {
+         desc->flags = 0;
+         snprintf(desc->name, sizeof(desc->name), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+         snprintf(desc->category, sizeof(desc->category), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+         snprintf(desc->description, sizeof(desc->description), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
index efe63de42..6eed2de9d 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
@@ -29,7 +29,8 @@
 void
 v3dX(job_emit_noop)(struct v3dv_job *job)
 {
-   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
+   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+                        V3D_INTERNAL_BPP_32, 4, false);
   v3dX(job_emit_binning_flush)(job);

   struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
      config.image_height_pixels = 1;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = 3; /* Tile size 64 */
+      config.log2_tile_height = 3; /* Tile size 64 */
+#endif
   }

+#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
      rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }
+#endif
+#if V3D_VERSION >= 71
+   cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+      rt.internal_bpp = V3D_INTERNAL_BPP_32;
+      rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+      rt.stride = 1; /* Unused RT */
+   }
+#endif

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = 1.0f;
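For context, the enumeration entry point added in v3dvx_query.c above backs VK_KHR_performance_query's standard two-call idiom. A sketch of the application side (error handling elided; the extension function pointer is assumed to have been resolved with vkGetInstanceProcAddr):

#include <stdlib.h>
#include <vulkan/vulkan.h>

/* List the performance counters of a queue family using the usual
 * count-then-fill pattern that the VK_OUTARRAY helpers above implement. */
static void
list_perf_counters(VkPhysicalDevice pdev, uint32_t queue_family,
                   PFN_vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR enumerate)
{
   uint32_t count = 0;
   enumerate(pdev, queue_family, &count, NULL, NULL);

   VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
   VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
   for (uint32_t i = 0; i < count; i++) {
      counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
      descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
   }
   enumerate(pdev, queue_family, &count, counters, descs);

   /* descs[i].name/category/description now hold the strings the driver
    * copied out of v3d_performance_counters. */
   free(counters);
   free(descs);
}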