author     Jonathan Gray <jsg@cvs.openbsd.org>	2022-02-24 01:57:18 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>	2022-02-24 01:57:18 +0000
commit     b24b5b9049e889ee4eb39b565bcc8d48bd45ab48 (patch)
tree       658ca4e6b41655f49463c85edbaeda48979c394c /lib/mesa/src/broadcom
parent     57768bbb154c2879d34ec20e401b19472e77aaf7 (diff)
Import Mesa 21.3.7
Diffstat (limited to 'lib/mesa/src/broadcom')
79 files changed, 15600 insertions, 8987 deletions
diff --git a/lib/mesa/src/broadcom/ci/deqp-v3d-rpi4-gles.toml b/lib/mesa/src/broadcom/ci/deqp-v3d-rpi4-gles.toml
new file mode 100644
index 000000000..659a4ca9c
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/deqp-v3d-rpi4-gles.toml
@@ -0,0 +1,49 @@
+[[deqp]]
+deqp = "/deqp/modules/gles31/deqp-gles31"
+caselists = [ "/deqp/mustpass/gles31-master.txt" ]
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
+version_check = "GL ES 3.1.*git"
+renderer_check = "V3D"
+
+[[deqp]]
+deqp = "/deqp/modules/gles3/deqp-gles3"
+caselists = [ "/deqp/mustpass/gles3-master.txt" ]
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
+
+[[deqp]]
+deqp = "/deqp/modules/gles2/deqp-gles2"
+caselists = [ "/deqp/mustpass/gles2-master.txt" ]
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
+
+[[deqp]]
+deqp = "/deqp/external/openglcts/modules/glcts"
+caselists = [
+    "/deqp/mustpass/gles31-khr-master.txt",
+    "/deqp/mustpass/gles3-khr-master.txt",
+    "/deqp/mustpass/gles2-khr-master.txt",
+]
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
diff --git a/lib/mesa/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt b/lib/mesa/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt
index 7a673b01f..6379afbe3 100644
--- a/lib/mesa/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt
+++ b/lib/mesa/src/broadcom/ci/deqp-v3dv-rpi4-fails.txt
@@ -1,148 +1,5 @@
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.optimal_optimal_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.general_optimal_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_sint_pack32.a8b8g8r8_sint_pack32.optimal_optimal_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.general_optimal_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_general_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_general_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_general_linear_stripes_z,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_optimal_linear_stripes_x,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_optimal_linear_stripes_y,Fail
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a8b8g8r8_uint_pack32.a8b8g8r8_uint_pack32.optimal_optimal_linear_stripes_z,Fail
-dEQP-VK.pipeline.logic_op.r16_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r16_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r16g16_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.copy_inverted,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.equivalent,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.invert,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.nand,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.nor,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.or_inverted,Fail
-dEQP-VK.pipeline.logic_op.r16g16b16a16_uint.or_reverse,Fail
-dEQP-VK.pipeline.logic_op.r32_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r32_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r32g32_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r8_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.and,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.and_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.and_reverse,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.clear,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.copy_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.equivalent,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.invert,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.nand,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.no_op,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.nor,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.or,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.or_inverted,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.or_reverse,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.set,Crash
-dEQP-VK.pipeline.logic_op.r8g8_uint.xor,Crash
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.copy_inverted,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.equivalent,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.invert,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.nand,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.nor,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.or_inverted,Fail
-dEQP-VK.pipeline.logic_op.r8g8b8a8_uint.or_reverse,Fail
-dEQP-VK.spirv_assembly.instruction.compute.vector_shuffle.vector_shuffle,Fail
-dEQP-VK.synchronization.basic.binary_semaphore.chain,Fail
-dEQP-VK.ycbcr.query.levels.geometry.r8g8b8a8_unorm,Crash
-dEQP-VK.ycbcr.query.levels.tess_control.r8g8b8a8_unorm,Crash
-dEQP-VK.ycbcr.query.levels.tess_eval.r8g8b8a8_unorm,Crash
-dEQP-VK.ycbcr.query.size_lod.geometry.r8g8b8a8_unorm,Crash
-dEQP-VK.ycbcr.query.size_lod.tess_control.r8g8b8a8_unorm,Crash
-dEQP-VK.ycbcr.query.size_lod.tess_eval.r8g8b8a8_unorm,Crash
+# This seems to fail due to the test error threshold being insufficient
+dEQP-VK.geometry.input.basic_primitive.line_strip_adjacency,Fail
+
+# CTS bug; fix submitted
+dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_single_buffer_geom,Fail
diff --git a/lib/mesa/src/broadcom/ci/deqp-vc4-rpi3-gles.toml b/lib/mesa/src/broadcom/ci/deqp-vc4-rpi3-gles.toml
new file mode 100644
index 000000000..218cb1835
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/deqp-vc4-rpi3-gles.toml
@@ -0,0 +1,25 @@
+[[deqp]]
+deqp = "/deqp/modules/gles2/deqp-gles2"
+caselists = [ "/deqp/mustpass/gles2-master.txt" ]
+tests_per_group = 250
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
+version_check = "GL ES 2.0.*git"
+renderer_check = "VC4"
+
+[[deqp]]
+deqp = "/deqp/external/openglcts/modules/glcts"
+caselists = [ "/deqp/mustpass/gles2-khr-master.txt" ]
+tests_per_group = 250
+deqp_args = [
+    "--deqp-gl-config-name=rgba8888d24s8ms0",
+    "--deqp-surface-height=256",
+    "--deqp-surface-type=pbuffer",
+    "--deqp-surface-width=256",
+    "--deqp-visibility=hidden",
+]
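Note: the suite description files added above are inputs to deqp-runner, which Mesa CI uses to drive dEQP. As a rough sketch of how such a file is consumed (the command line follows deqp-runner's CLI as I understand it and is not part of this commit; the output path is illustrative):

    # Run every [[deqp]] block in the suite file: each block names a dEQP
    # binary, the mustpass caselist(s) to run, and extra arguments passed
    # through to dEQP. version_check/renderer_check abort the run early if
    # the reported GL version or renderer string doesn't match the driver
    # under test.
    deqp-runner suite \
        --suite /install/deqp-v3d-rpi4-gles.toml \
        --output ./results \
        --jobs 4

tests_per_group (set in the vc4 file) shrinks how many tests each dEQP process is handed at a time; per the comment removed from gitlab-ci.yml below, the slow VC4 boards otherwise take about a minute per default 500-test group and trip the serial watchdog.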
"--deqp-gl-config-name=rgba8888d24s8ms0", + "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", + "--deqp-surface-width=256", + "--deqp-visibility=hidden", +] diff --git a/lib/mesa/src/broadcom/ci/gitlab-ci.yml b/lib/mesa/src/broadcom/ci/gitlab-ci.yml index c3d28777b..4f70ef1e1 100644 --- a/lib/mesa/src/broadcom/ci/gitlab-ci.yml +++ b/lib/mesa/src/broadcom/ci/gitlab-ci.yml @@ -2,32 +2,38 @@ extends: - .baremetal-test-armhf - .vc4-rules - - .use-arm_test + - .use-debian/arm_test variables: BM_BOOTFS: /boot/raspberrypi_armhf - BM_KERNEL_MODULES: vc4 BM_ROOTFS: /rootfs-armhf GPU_VERSION: vc4-rpi3 - DEQP_EXPECTED_RENDERER: VC4 + HWCI_KERNEL_MODULES: vc4 + FLAKES_CHANNEL: "#videocore-ci" script: - ./install/bare-metal/poe-powered.sh needs: - - job: arm_test + - job: debian/arm_test artifacts: false - - meson-armhf + - debian-armhf tags: - igalia-rpi3 -vc4-rpi3-gles2:armhf: +vc4-rpi3-gles:armhf: extends: - .vc4-rpi3-test:armhf - parallel: 4 + parallel: 2 variables: - BARE_METAL_TEST_SCRIPT: "/install/deqp-runner.sh" - DEQP_VER: gles2 - # The vc4s are so slow that it takes about a minute to get through the - # default 500 tests in a group, triggering the serial watchdog. + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + DEQP_SUITE: vc4-rpi3-gles + +vc4-rpi3-egl:armhf: + extends: + - .vc4-rpi3-test:armhf + variables: + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + HWCI_START_XORG: 1 DEQP_RUNNER_OPTIONS: "--tests-per-group 250" + DEQP_VER: egl .vc4-rpi3-piglit:armhf: extends: @@ -35,9 +41,9 @@ vc4-rpi3-gles2:armhf: - .vc4-rpi3-test:armhf - .test-manual variables: - BARE_METAL_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" + HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" BM_POE_TIMEOUT: 180 - BM_START_XORG: 1 + HWCI_START_XORG: 1 PIGLIT_PLATFORM: mixed_glx_egl vc4-rpi3-piglit-quick_gl:armhf: @@ -60,89 +66,72 @@ vc4-rpi3-piglit-quick_shader:armhf: extends: - .baremetal-test-armhf - .v3d-rules - - .use-arm_test + - .use-debian/arm_test variables: - BARE_METAL_TEST_SCRIPT: "/install/deqp-runner.sh" + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" BM_BOOTFS: /boot/raspberrypi_armhf - BM_KERNEL_MODULES: v3d,vc4 BM_POE_TIMEOUT: 300 BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: V3D + FLAKES_CHANNEL: "#videocore-ci" GPU_VERSION: v3d-rpi4 + HWCI_KERNEL_MODULES: v3d,vc4 script: - ./install/bare-metal/poe-powered.sh needs: - - arm_test - - meson-armhf + - debian/arm_test + - debian-armhf tags: - igalia-rpi4 -v3d-rpi4-gles31:armhf: - extends: - - .v3d-rpi4-test:armhf - parallel: 2 - variables: - DEQP_VER: gles31 - -v3d-rpi4-gles3:armhf: +v3d-rpi4-gles:armhf: extends: - .v3d-rpi4-test:armhf - parallel: 4 + parallel: 8 variables: - DEQP_VER: gles3 + DEQP_SUITE: v3d-rpi4-gles -v3d-rpi4-gles2:armhf: +v3d-rpi4-egl:armhf: extends: - .v3d-rpi4-test:armhf variables: - DEQP_VER: gles2 + HWCI_START_XORG: 1 + DEQP_VER: egl -.v3d-rpi4-piglit:armhf: +v3d-rpi4-piglit:armhf: extends: - .piglit-test - .v3d-rpi4-test:armhf - - .test-manual + parallel: 4 variables: - BARE_METAL_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" - BM_START_XORG: 1 + HWCI_TEST_SCRIPT: "/install/piglit/piglit-runner.sh" + HWCI_START_XORG: 1 PIGLIT_PLATFORM: mixed_glx_egl + PIGLIT_PROFILES: all -v3d-rpi4-piglit-quick_gl:armhf: +v3dv-rpi4-vk:arm64: extends: - - .v3d-rpi4-piglit:armhf - parallel: 2 - variables: - PIGLIT_PROFILES: quick_gl - -v3d-rpi4-piglit-quick_shader:armhf: - extends: - - .v3d-rpi4-piglit:armhf - variables: - PIGLIT_PROFILES: quick_shader - -v3dv-rpi4-vk:armhf: - extends: - - .baremetal-test-armhf - - .use-arm_test + 
- .baremetal-test + - .use-debian/arm_test - .v3dv-rules - parallel: 6 + parallel: 8 variables: - BARE_METAL_TEST_SCRIPT: "/install/deqp-runner.sh" - BM_BOOTFS: /boot/raspberrypi_armhf - BM_KERNEL_MODULES: v3d,vc4 + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + BM_BOOTFS: /boot/raspberrypi_arm64 BM_POE_TIMEOUT: 300 - BM_ROOTFS: /rootfs-armhf - CPU: arm7hlf - DEQP_EXPECTED_RENDERER: "V3D 4.2" - DEQP_FRACTION: 7 + BM_ROOTFS: /rootfs-arm64 + DEQP_EXPECTED_RENDERER: "V3D.4.2" + DEQP_FRACTION: 5 DEQP_VER: vk + FLAKES_CHANNEL: "#videocore-ci" GPU_VERSION: v3dv-rpi4 - VK_CPU: arm7hlf + HWCI_KERNEL_MODULES: v3d,vc4 + MINIO_ARTIFACT_NAME: mesa-arm64 VK_DRIVER: broadcom script: - ./install/bare-metal/poe-powered.sh needs: - - arm_test - - meson-armhf + - debian/arm_test + - job: debian-arm64 + artifacts: false tags: - igalia-rpi4 diff --git a/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt b/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt new file mode 100644 index 000000000..c0d90c2d2 --- /dev/null +++ b/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt @@ -0,0 +1,330 @@ +glx@glx-make-current,Crash +glx@glx-multi-window-single-context,Fail +glx@glx-multithread-buffer,Fail +glx@glx-query-drawable-glx_fbconfig_id-window,Fail +glx@glx-swap-pixmap-bad,Fail +glx@glx-visuals-depth -pixmap,Crash +glx@glx-visuals-stencil -pixmap,Crash +glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail +glx@glx_arb_create_context_no_error@no error,Fail +glx@glx_ext_import_context@free context,Fail +glx@glx_ext_import_context@get context id,Fail +glx@glx_ext_import_context@get current display,Fail +glx@glx_ext_import_context@import context- multi process,Fail +glx@glx_ext_import_context@import context- single process,Fail +glx@glx_ext_import_context@imported context has same context id,Fail +glx@glx_ext_import_context@make current- multi process,Fail +glx@glx_ext_import_context@make current- single process,Fail +glx@glx_ext_import_context@query context info,Fail +shaders@glsl-bug-110796,Fail +spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail +spec@!opengl 1.0@gl-1.0-dlist-bitmap,Fail +spec@!opengl 1.0@gl-1.0-edgeflag,Fail +spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail +spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail +spec@!opengl 1.0@gl-1.0-no-op-paths,Fail +spec@!opengl 1.0@gl-1.0-spot-light,Fail +spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2,Fail +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4,Fail +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Fail +spec@!opengl 1.1@getteximage-depth,Fail +spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT16,Fail +spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT24,Fail +spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT32,Fail +spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT,Fail +spec@!opengl 1.1@getteximage-formats,Fail +spec@!opengl 1.1@linestipple,Fail +spec@!opengl 1.1@linestipple@Factor 2x,Fail +spec@!opengl 1.1@linestipple@Factor 3x,Fail +spec@!opengl 1.1@linestipple@Line loop,Fail +spec@!opengl 1.1@linestipple@Line strip,Fail +spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail +spec@!opengl 1.1@point-line-no-cull,Fail +spec@!opengl 1.1@polygon-mode,Fail +spec@!opengl 1.1@polygon-mode-offset,Fail +spec@!opengl 1.1@polygon-mode-offset@config 0: 
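Note: the BARE_METAL_*/BM_* to HWCI_* renames above track Mesa's shared hardware-CI harness; the job-level variables are consumed on the device side after boot. A hypothetical shell fragment, only to illustrate roughly what the harness does with these variables (the real logic lives in poe-powered.sh and the rootfs init scripts, which are outside this diff):

    # Load the DRM modules named in HWCI_KERNEL_MODULES ("v3d,vc4"),
    # optionally start an X server for the EGL and piglit jobs, then hand
    # control to the per-job test script.
    for mod in $(echo "$HWCI_KERNEL_MODULES" | tr ',' ' '); do
        modprobe "$mod"
    done
    if [ "$HWCI_START_XORG" = "1" ]; then
        Xorg :0 &
    fi
    exec "$HWCI_TEST_SCRIPT"   # e.g. /install/deqp-runner.sh

GPU_VERSION (vc4-rpi3, v3d-rpi4, v3dv-rpi4) is what keys the runner to the matching expectation files added below.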
diff --git a/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt b/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt
new file mode 100644
index 000000000..c0d90c2d2
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/v3d-rpi4-fails.txt
@@ -0,0 +1,330 @@
+glx@glx-make-current,Crash
+glx@glx-multi-window-single-context,Fail
+glx@glx-multithread-buffer,Fail
+glx@glx-query-drawable-glx_fbconfig_id-window,Fail
+glx@glx-swap-pixmap-bad,Fail
+glx@glx-visuals-depth -pixmap,Crash
+glx@glx-visuals-stencil -pixmap,Crash
+glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
+glx@glx_arb_create_context_no_error@no error,Fail
+glx@glx_ext_import_context@free context,Fail
+glx@glx_ext_import_context@get context id,Fail
+glx@glx_ext_import_context@get current display,Fail
+glx@glx_ext_import_context@import context- multi process,Fail
+glx@glx_ext_import_context@import context- single process,Fail
+glx@glx_ext_import_context@imported context has same context id,Fail
+glx@glx_ext_import_context@make current- multi process,Fail
+glx@glx_ext_import_context@make current- single process,Fail
+glx@glx_ext_import_context@query context info,Fail
+shaders@glsl-bug-110796,Fail
+spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
+spec@!opengl 1.0@gl-1.0-dlist-bitmap,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
+spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
+spec@!opengl 1.0@gl-1.0-spot-light,Fail
+spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=4,Fail
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Fail
+spec@!opengl 1.1@getteximage-depth,Fail
+spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT16,Fail
+spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT24,Fail
+spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT32,Fail
+spec@!opengl 1.1@getteximage-depth@GL_TEXTURE_1D_ARRAY-GL_DEPTH_COMPONENT,Fail
+spec@!opengl 1.1@getteximage-formats,Fail
+spec@!opengl 1.1@linestipple,Fail
+spec@!opengl 1.1@linestipple@Factor 2x,Fail
+spec@!opengl 1.1@linestipple@Factor 3x,Fail
+spec@!opengl 1.1@linestipple@Line loop,Fail
+spec@!opengl 1.1@linestipple@Line strip,Fail
+spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail
+spec@!opengl 1.1@point-line-no-cull,Fail
+spec@!opengl 1.1@polygon-mode,Fail
+spec@!opengl 1.1@polygon-mode-offset,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
+spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
+spec@!opengl 1.1@texwrap formats bordercolor,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_INTENSITY16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE12_ALPHA4- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGB16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA12- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_INTENSITY16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE12_ALPHA4- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGB16- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA12- swizzled- border color only,Fail
+spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA16- swizzled- border color only,Fail
+spec@!opengl 1.1@windowoverlap,Fail
+spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag,Fail
+spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail
+spec@!opengl 2.0@max-samplers,Fail
+spec@!opengl 2.0@max-samplers border,Fail
+spec@!opengl 2.1@pbo,Fail
+spec@!opengl 2.1@pbo@test_polygon_stip,Fail
+spec@!opengl 2.1@polygon-stipple-fs,Fail
+spec@!opengl es 3.0@gles-3.0-transform-feedback-uniform-buffer-object,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-fog,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Fail
+spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Fail
+spec@arb_compute_shader@minmax,Fail
+spec@arb_copy_buffer@targets,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F,Fail
+spec@arb_depth_buffer_float@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32F NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH32F_STENCIL8- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor@GL_DEPTH_COMPONENT32F- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH32F_STENCIL8- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32F- swizzled- border color only,Fail
+spec@arb_depth_buffer_float@texwrap formats,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH32F_STENCIL8- NPOT,Fail
+spec@arb_depth_buffer_float@texwrap formats@GL_DEPTH_COMPONENT32F- NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT16 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT,Fail
+spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT NPOT,Fail
+spec@arb_depth_texture@texwrap formats bordercolor,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail
+spec@arb_depth_texture@texwrap formats,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail
+spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail
+spec@arb_framebuffer_object@fbo-drawbuffers-none use_frag_out,Fail
+spec@arb_pixel_buffer_object@pbo-getteximage,Fail
+spec@arb_pixel_buffer_object@texsubimage array pbo,Fail
+spec@arb_point_sprite@arb_point_sprite-checkerboard,Fail
+spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
+spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgrad,Fail
+spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
+spec@arb_texture_float@fbo-blending-formats,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_ALPHA32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY16F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE16F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE_ALPHA32F_ARB,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_RGB16F,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail
+spec@arb_texture_float@fbo-blending-formats@GL_RGBA32F,Fail
+spec@arb_texture_float@texwrap formats bordercolor,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_ALPHA32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_INTENSITY32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_LUMINANCE_ALPHA32F_ARB- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_RGB32F- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor@GL_RGBA32F- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_ALPHA32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_INTENSITY32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_LUMINANCE_ALPHA32F_ARB- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGB32F- swizzled- border color only,Fail
+spec@arb_texture_float@texwrap formats bordercolor-swizzled@GL_RGBA32F- swizzled- border color only,Fail
+spec@arb_texture_rectangle@1-1-linear-texture,Fail
+spec@arb_texture_rg@fbo-blending-formats-float,Fail
+spec@arb_texture_rg@fbo-blending-formats-float@GL_R32F,Fail
+spec@arb_texture_rg@fbo-blending-formats-float@GL_RG32F,Fail
+spec@arb_texture_rg@texwrap formats bordercolor,Fail
+spec@arb_texture_rg@texwrap formats bordercolor@GL_R16- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor@GL_RG16- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_R16- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats bordercolor-swizzled@GL_RG16- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor@GL_R32F- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor@GL_RG32F- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_R32F- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float bordercolor-swizzled@GL_RG32F- swizzled- border color only,Fail
+spec@arb_texture_rg@texwrap formats-float,Fail
+spec@arb_texture_rg@texwrap formats-float@GL_R32F- NPOT,Fail
+spec@arb_texture_rg@texwrap formats-float@GL_RG32F- NPOT,Fail
+spec@arb_transform_feedback2@change objects while paused (gles3),Fail
+spec@egl 1.4@egl-copy-buffers,Crash
+spec@egl 1.4@eglterminate then unbind context,Fail
+spec@egl_ext_protected_content@conformance,Fail
+spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
+spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
+spec@egl_khr_surfaceless_context@viewport,Fail
+spec@egl_mesa_configless_context@basic,Fail
+spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
+spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail
+spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail
+spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail
+spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
+spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail
+spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail
+spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-isampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-sampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-usampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-isampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-sampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@fs-texelfetch-usampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-isampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-sampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetchoffset@vs-texelfetch-usampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-isampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-sampler1darray,Fail
+spec@ext_gpu_shader4@execution@texelfetch@vs-texelfetch-usampler1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture(bias) 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texture() cubeshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegrad 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturegradoffset 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelod 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4texturelodoffset 1darrayshadow,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darray,Fail
+spec@ext_gpu_shader4@tex-miplevel-selection gpu4textureoffset 1darrayshadow,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled,Fail
+spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_STENCIL8- swizzled- border color only,Fail
+spec@ext_packed_depth_stencil@texwrap formats,Fail
+spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail
+spec@ext_packed_float@query-rgba-signed-components,Fail
+spec@ext_texture_array@array-texture,Fail
+spec@ext_texture_array@fbo-generatemipmap-array rgb9_e5,Fail
+spec@ext_texture_array@fbo-generatemipmap-array,Fail
+spec@ext_texture_array@texsubimage array,Fail
+spec@ext_texture_integer@getteximage-clamping gl_arb_texture_rg,Fail
+spec@ext_texture_integer@getteximage-clamping,Fail
+spec@ext_texture_lod_bias@lodbias,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_ALPHA16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_INTENSITY16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_ALPHA16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_LUMINANCE16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_R16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RG16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGB16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor@GL_RGBA16_SNORM- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_ALPHA16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_INTENSITY16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_ALPHA16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_LUMINANCE16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_R16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RG16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGB16_SNORM- swizzled- border color only,Fail
+spec@ext_texture_snorm@texwrap formats bordercolor-swizzled@GL_RGBA16_SNORM- swizzled- border color only,Fail
+spec@arb_texture_storage@texture-storage@cube array texture,Fail
+spec@glsl-1.10@execution@glsl-fs-inline-explosion,Crash
+spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
+spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+spec@glsl-1.20@execution@clipping@vs-clip-vertex-primitives,Fail
+spec@glsl-1.20@execution@fs-underflow-mul-compare-zero,Fail
+spec@intel_performance_query@intel_performance_query-issue_2235,Fail
+spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
+spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail
+spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail
+spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-r16-unorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rg16-unorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-snorm-cube.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d-array.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-2d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-3d.vert,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.frag,Fail
+spec@nv_image_formats@compiler@declaration-disallow-rgba16-unorm-cube.vert,Fail
+spec@nv_read_depth@read_depth_gles3,Fail
+spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Crash
+spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail
diff --git a/lib/mesa/src/broadcom/ci/v3d-rpi4-flakes.txt b/lib/mesa/src/broadcom/ci/v3d-rpi4-flakes.txt
new file mode 100644
index 000000000..a17f2c79c
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/v3d-rpi4-flakes.txt
@@ -0,0 +1,11 @@
+dEQP-GLES31.functional.compute.shared_var.basic_type.ivec3_highp
+dEQP-GLES31.functional.ssbo.layout.single_basic_type.packed.highp_mat2
+KHR-GLES31.core.shader_image_load_store.basic-glsl-earlyFragTests
+
+glx@glx_arb_sync_control@swapbuffersmsc-divisor-zero
+glx@glx_arb_sync_control@waitformsc
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-32f_24_8_rev samples=2
+spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4
+spec@arb_occlusion_query@occlusion_query_order
+spec@egl_chromium_sync_control@conformance
diff --git a/lib/mesa/src/broadcom/ci/v3d-rpi4-skips.txt b/lib/mesa/src/broadcom/ci/v3d-rpi4-skips.txt
new file mode 100644
index 000000000..e6b1076a5
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/v3d-rpi4-skips.txt
@@ -0,0 +1,40 @@
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+spec@!opengl 1.2@tex3d-maxsize
+spec@ext_texture_env_combine@texture-env-combine
+spec@glsl-1.10@execution@loops@glsl-fs-unroll-explosion
+spec@glsl-1.10@execution@loops@glsl-vs-unroll-explosion
+spec@!opengl 1.0@gl-1.0-blend-func
+
+# Extensions not supported
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_gpu5.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@arb_texture_cube_map.*
+spec@glsl-1.30.*
+spec@glsl-1.40.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.20.*
+# Slow tests (> 1 minute to run)
+spec@!opengl 1.1@streaming-texture-leak
+spec@!opengl 1.2@tex3d-maxsize
+spec@ext_texture_env_combine@texture-env-combine
+spec@glsl-1.10@execution@loops@glsl-fs-unroll-explosion
+spec@glsl-1.10@execution@loops@glsl-vs-unroll-explosion
+spec@!opengl 1.0@gl-1.0-blend-func
+
+# Extensions not supported
+spec@arb_gpu_shader_fp64.*
+spec@arb_gpu_shader_gpu5.*
+spec@arb_gpu_shader_int64.*
+spec@arb_tessellation_shader.*
+spec@arb_texture_cube_map.*
+spec@glsl-1.30.*
+spec@glsl-1.40.*
+spec@glsl-1.50.*
+spec@glsl-3.*
+spec@glsl-4.*
+spec@glsl-es-3.20.*
diff --git a/lib/mesa/src/broadcom/ci/v3dv-rpi4-flakes.txt b/lib/mesa/src/broadcom/ci/v3dv-rpi4-flakes.txt
new file mode 100644
index 000000000..0d22f002d
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/v3dv-rpi4-flakes.txt
@@ -0,0 +1,5 @@
+dEQP-VK.api.external.fence.opaque_fd.reset_permanent
+dEQP-VK.api.external.fence.opaque_fd.reset_temporary
+dEQP-VK.api.external.fence.opaque_fd.signal_export_import_wait_permanent
+dEQP-VK.ssbo.layout.instance_array_basic_type.std430.uvec4
+dEQP-VK.wsi.display.get_display_plane_capabilities
diff --git a/lib/mesa/src/broadcom/ci/v3dv-rpi4-skips.txt b/lib/mesa/src/broadcom/ci/v3dv-rpi4-skips.txt
new file mode 100644
index 000000000..bf6a82c19
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/v3dv-rpi4-skips.txt
@@ -0,0 +1,21 @@
+# Broadcom waivers
+dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
+dEQP-VK.rasterization.depth_bias.d32_sfloat
+
+# Timeout tests (> 1 minute to run)
+dEQP-VK.api.object_management.max_concurrent.query_pool
+dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
+dEQP-VK.memory.mapping.dedicated_alloc.buffer.full.variable.implicit_unmap
+dEQP-VK.memory.mapping.dedicated_alloc.image.full.variable.implicit_unmap
+dEQP-VK.memory.mapping.suballocation.full.variable.implicit_unmap
+dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom
+dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert
+dEQP-VK.ssbo.layout.random.all_shared_buffer.5
+dEQP-VK.ssbo.layout.random.arrays_of_arrays.13
+dEQP-VK.ssbo.layout.random.nested_structs_arrays.0
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp
+dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat
+dEQP-VK.ubo.random.all_out_of_order_offsets.45
+dEQP-VK.ubo.random.all_shared_buffer.48
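Note: the three kinds of expectation files imported here divide the work as follows: *-fails.txt pins known results as a baseline (one "test,Status" pair per line), *-flakes.txt lists tests whose results are unstable and should not gate CI (presumably also what gets reported to the IRC channel named in FLAKES_CHANNEL), and *-skips.txt holds regular expressions for tests never run at all, e.g. whole unsupported extensions like spec@arb_tessellation_shader.*. A hedged sketch of how they plug into a run; the flag spellings follow deqp-runner's CLI as I recall it, and the paths are illustrative:

    # Compare results against the recorded baseline instead of treating
    # every Fail as a CI failure; skip patterns are never executed and
    # flake patterns never gate the job.
    deqp-runner run \
        --deqp /deqp/modules/gles2/deqp-gles2 \
        --caselist /deqp/mustpass/gles2-master.txt \
        --baseline v3d-rpi4-fails.txt \
        --flakes v3d-rpi4-flakes.txt \
        --skips v3d-rpi4-skips.txt \
        --output ./results \
        -- --deqp-surface-width=256 --deqp-surface-height=256

With a baseline in place the job fails only on results that differ from the expectations, so the long lists in this import are regression-tracking state rather than a set of open bugs per se.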
diff --git a/lib/mesa/src/broadcom/ci/vc4-rpi3-fails.txt b/lib/mesa/src/broadcom/ci/vc4-rpi3-fails.txt
new file mode 100644
index 000000000..d0833cd4f
--- /dev/null
+++ b/lib/mesa/src/broadcom/ci/vc4-rpi3-fails.txt
@@ -0,0 +1,1611 @@
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
+KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3133
+KHR-GLES2.texture_3d.copy_sub_image.negative,Fail
+KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail
+
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_linear_repeat_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_linear_repeat_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_linear_nearest_repeat_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_linear_repeat_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_mipmap_nearest_nearest_repeat_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_clamp_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.linear_nearest_repeat_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_clamp_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_mirror_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_linear_repeat_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_clamp_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_mirror,Fail 
+KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_mirror_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_linear_repeat_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_clamp_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_mirror_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_clamp,Fail 
+KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_linear_nearest_repeat_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_clamp_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_mirror_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_linear_repeat_repeat_repeat,Fail 
+KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_clamp_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_mirror_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_mipmap_nearest_nearest_repeat_repeat_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_clamp_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_mirror_repeat,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_clamp,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_mirror,Fail +KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_clamp_repeat_repeat,Fail 
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_mirror_repeat_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_clamp_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_repeat,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_clamp,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_mirror,Fail
+KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_repeat,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3134
+KHR-GLES2.texture_3d.filtering.combinations.negative,Fail
+
+KHR-GLES2.texture_3d.filtering.formats.rgba8_linear,Fail
+KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest,Fail
+KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.formats.rgba8_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.128x32x64_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.32x64x16_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.3x7x5_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.4x8x8_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_linear_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_linear,Fail
+KHR-GLES2.texture_3d.filtering.sizes.63x63x63_nearest_mipmap_nearest,Fail
+KHR-GLES2.texture_3d.framebuffer_texture.rgba,Fail
+KHR-GLES2.texture_3d.sub_image.rgba8,Fail
+dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.multi_context.gles2.rgb888_window,Crash
+dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.multi_context.gles2.rgba8888_window,Crash
+dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.multi_thread.gles2.rgb888_window,Crash
+dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.multi_thread.gles2.rgba8888_window,Crash
+dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.single_context.gles2.rgb888_window,Crash
+dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.color_clears.single_context.gles2.rgba8888_window,Crash
+dEQP-EGL.functional.create_context.no_config,Fail
+dEQP-EGL.functional.render.multi_context.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.render.multi_context.gles2.rgb888_window,Crash
+dEQP-EGL.functional.render.multi_context.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.render.multi_context.gles2.rgba8888_window,Crash
+dEQP-EGL.functional.render.multi_thread.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.render.multi_thread.gles2.rgb888_window,Crash
+dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.render.multi_thread.gles2.rgba8888_window,Crash
+dEQP-EGL.functional.render.single_context.gles2.rgb888_pbuffer,Crash
+dEQP-EGL.functional.render.single_context.gles2.rgb888_window,Crash
+dEQP-EGL.functional.render.single_context.gles2.rgba8888_pbuffer,Crash
+dEQP-EGL.functional.render.single_context.gles2.rgba8888_window,Crash
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail
+dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
+dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
+dEQP-GLES2.functional.draw.draw_arrays.line_loop.multiple_attributes,Fail
+dEQP-GLES2.functional.draw.draw_arrays.line_loop.single_attribute,Fail
+dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail
+dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail
+dEQP-GLES2.functional.negative_api.vertex_array.vertex_attrib,Fail
+dEQP-GLES2.functional.negative_api.vertex_array.vertex_attribv,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_mirror_rgba8888,Fail
+dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
+dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
+dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail
+dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail
+dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+
+glx@glx-copy-sub-buffer samples=2,Crash
+glx@glx-copy-sub-buffer samples=4,Crash
+glx@glx-make-current,Crash
+glx@glx-multithread-buffer,Fail
+glx@glx-query-drawable-glx_fbconfig_id-window,Fail
+glx@glx-swap-pixmap-bad,Fail
+glx@glx-visuals-depth -pixmap,Crash
+glx@glx-visuals-depth,Crash
+glx@glx-visuals-stencil -pixmap,Crash
+glx@glx-visuals-stencil,Crash
+glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail
+glx@glx_arb_create_context_no_error@no error,Fail
+glx@glx_ext_import_context@free context,Fail
+glx@glx_ext_import_context@get context id,Fail
+glx@glx_ext_import_context@get current display,Fail
+glx@glx_ext_import_context@import context- multi process,Fail
+glx@glx_ext_import_context@import context- single process,Fail
+glx@glx_ext_import_context@imported context has same context id,Fail
+glx@glx_ext_import_context@make current- multi process,Fail
+glx@glx_ext_import_context@make current- single process,Fail
+glx@glx_ext_import_context@query context info,Fail
+shaders@glsl-arb-fragment-coord-conventions,Fail
+shaders@glsl-bug-110796,Fail
+shaders@glsl-max-vertex-attrib,Fail
+shaders@glsl-predication-on-large-array,Fail
+spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
+spec@!opengl 1.0@gl-1.0-dlist-bitmap,Crash
+spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-const,Fail
+spec@!opengl 1.0@gl-1.0-edgeflag-quads,Fail
+spec@!opengl 1.0@gl-1.0-logicop,Crash
+spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
+spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail
+spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
+spec@!opengl 1.1@clipflat,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant:
center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 
1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail 
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: 
FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- 
glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: 
FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail 
+spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- 
quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_POLYGON)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: 
FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 
1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- 
glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail 
+spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLES)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- 
glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- 
glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 
1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: center top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: left top PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right bottom PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right middle PV: FIRST,Fail +spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_LINE)- quadrant: right top PV: FIRST,Fail +spec@!opengl 1.1@depthstencil-default_fb-blit samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-blit samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-clear samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-clear samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-clear,Fail +spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-copypixels samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-24_8 samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-readpixels-24_8 samples=4,Crash +spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort 
samples=2,Crash +spec@!opengl 1.1@depthstencil-default_fb-readpixels-float-and-ushort samples=4,Crash +spec@!opengl 1.1@draw-pixels,Fail +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_line_loop,Fail +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_polygon,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quad_strip,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quads,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_triangle_fan,Fail +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_line_loop,Fail +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_polygon,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quad_strip,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quads,Crash +spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_triangle_fan,Fail +spec@!opengl 1.1@line-flat-clip-color,Fail +spec@!opengl 1.1@linestipple,Fail +spec@!opengl 1.1@linestipple@Baseline,Fail +spec@!opengl 1.1@linestipple@Factor 2x,Fail +spec@!opengl 1.1@linestipple@Factor 3x,Fail +spec@!opengl 1.1@linestipple@Line loop,Fail +spec@!opengl 1.1@linestipple@Line strip,Fail +spec@!opengl 1.1@linestipple@Restarting lines within a single Begin-End block,Fail +spec@!opengl 1.1@polygon-mode,Fail +spec@!opengl 1.1@polygon-mode-offset,Fail +spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 1: Expected blue pixel in center,Fail +spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 1: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 2: Expected blue pixel in center,Fail +spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 2: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 5: Expected blue pixel in center,Fail +spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 5: Expected white pixel on top edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fail +spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail +spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail +spec@!opengl 1.1@read-front clear-front-first samples=2,Crash 
+spec@!opengl 1.1@read-front clear-front-first samples=4,Crash +spec@!opengl 1.1@read-front samples=2,Crash +spec@!opengl 1.1@read-front samples=4,Crash +spec@!opengl 1.1@tex-upside-down-miptree,Fail +spec@!opengl 1.1@texsubimage-unpack,Fail +spec@!opengl 1.1@texwrap 2d proj,Fail +spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail +spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- projected,Fail +spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- swizzled- projected,Fail +spec@!opengl 1.1@texwrap 2d,Fail +spec@!opengl 1.1@texwrap 2d@GL_RGBA8,Fail +spec@!opengl 1.1@texwrap 2d@GL_RGBA8- NPOT,Fail +spec@!opengl 1.1@texwrap 2d@GL_RGBA8- swizzled,Fail +spec@!opengl 1.1@texwrap formats,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10_A2,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10_A2- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB10_A2- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB12,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB12- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB12- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB16,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB16- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB16- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5_A1,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5_A1- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB5_A1- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB8,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB8- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGB8- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA12,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA12- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA12- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA16,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA16- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA16- swizzled,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA8,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA8- NPOT,Fail +spec@!opengl 1.1@texwrap formats@GL_RGBA8- swizzled,Fail +spec@!opengl 1.1@windowoverlap,Fail +spec@!opengl 1.2@copyteximage 3d,Fail +spec@!opengl 1.2@getteximage-targets 3d,Fail +spec@!opengl 1.2@lodclamp,Fail +spec@!opengl 1.2@lodclamp-between,Fail +spec@!opengl 1.2@lodclamp-between-max,Fail +spec@!opengl 1.2@mipmap-setup,Fail +spec@!opengl 1.2@tex3d,Fail +spec@!opengl 1.2@tex3d-maxsize,Fail +spec@!opengl 1.2@texwrap 3d proj,Fail +spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail +spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail +spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- swizzled- projected,Fail +spec@!opengl 1.2@texwrap 3d,Fail +spec@!opengl 1.2@texwrap 3d@GL_RGBA8,Fail +spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail +spec@!opengl 1.2@texwrap 3d@GL_RGBA8- swizzled,Fail +spec@!opengl 1.3@tex3d-depth1,Fail +spec@!opengl 1.4@gl-1.4-polygon-offset,Fail +spec@!opengl 1.4@tex-miplevel-selection,Fail +spec@!opengl 1.4@tex-miplevel-selection-lod,Fail +spec@!opengl 1.4@tex-miplevel-selection-lod-bias,Fail +spec@!opengl 1.5@depth-tex-compare,Fail +spec@!opengl 2.0@attrib-assignments,Fail +spec@!opengl 2.0@gl-2.0-edgeflag,Fail +spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Fail +spec@!opengl 2.0@occlusion-query-discard,Fail +spec@!opengl 2.0@tex3d-npot,Fail 
+spec@!opengl 2.1@minmax,Fail +spec@!opengl 2.1@pbo,Fail +spec@!opengl 2.1@pbo@test_polygon_stip,Fail +spec@!opengl 2.1@polygon-stipple-fs,Fail +spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail +spec@arb_depth_texture@depth-level-clamp,Fail +spec@arb_depth_texture@texwrap formats,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- NPOT,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT16- swizzled,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- NPOT,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT24- swizzled,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail +spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- swizzled,Fail +spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index,Crash +spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index-user_varrays,Crash +spec@arb_es2_compatibility@texwrap formats,Fail +spec@arb_es2_compatibility@texwrap formats@GL_RGB565,Fail +spec@arb_es2_compatibility@texwrap formats@GL_RGB565- NPOT,Fail +spec@arb_es2_compatibility@texwrap formats@GL_RGB565- swizzled,Fail +spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-integer,Fail +spec@arb_fragment_coord_conventions@fp-arb-fragment-coord-conventions-none,Fail +spec@arb_fragment_program@fp-indirections2,Fail +spec@arb_fragment_program@minmax,Fail +spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_depth24_stencil8,Fail +spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index1,Fail +spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index16,Fail +spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index4,Fail +spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index8,Fail +spec@arb_framebuffer_object@fbo-attachments-blit-scaled-linear,Fail +spec@arb_framebuffer_object@fbo-blit-stretch,Fail +spec@arb_framebuffer_object@fbo-generatemipmap-3d,Fail +spec@arb_framebuffer_object@fbo-mipmap-copypix,Fail +spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail +spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail +spec@arb_framebuffer_object@mixed-buffer-sizes,Fail +spec@arb_framebuffer_object@same-attachment-glframebuffertexture2d-gl_depth_stencil_attachment,Fail +spec@arb_framebuffer_srgb@arb_framebuffer_srgb-srgb_conformance,Fail +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample disabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb downsample enabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa disabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer linear_to_srgb msaa enabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled clear,Crash 
+spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample disabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear downsample enabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa disabled render,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled clear,Crash +spec@arb_framebuffer_srgb@blit renderbuffer srgb_to_linear msaa enabled render,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled clear,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample disabled render,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled clear,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb downsample enabled render,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled clear,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa disabled render,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled clear,Crash +spec@arb_framebuffer_srgb@blit texture linear_to_srgb msaa enabled render,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled clear,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample disabled render,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled clear,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear downsample enabled render,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled clear,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa disabled render,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled clear,Crash +spec@arb_framebuffer_srgb@blit texture srgb_to_linear msaa enabled render,Crash +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks,Fail +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_ALPHA_SIZE,Fail +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_BLUE_SIZE,Fail +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_DEPTH_SIZE,Fail +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_GREEN_SIZE,Fail +spec@arb_internalformat_query2@all internalformat_<x>_size pname checks@GL_INTERNALFORMAT_RED_SIZE,Fail +spec@arb_internalformat_query2@api error checks,Fail +spec@arb_internalformat_query2@max dimensions related pname checks,Fail +spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_COMBINED_DIMENSIONS,Fail +spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_DEPTH,Fail +spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_HEIGHT,Fail +spec@arb_internalformat_query2@max dimensions related pname checks@GL_MAX_WIDTH,Fail +spec@arb_occlusion_query2@render,Fail +spec@arb_occlusion_query@occlusion_query,Fail +spec@arb_occlusion_query@occlusion_query_conform,Fail +spec@arb_occlusion_query@occlusion_query_meta_fragments,Fail +spec@arb_occlusion_query@occlusion_query_meta_save,Fail +spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail +spec@arb_pixel_buffer_object@pbo-getteximage,Fail +spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail +spec@arb_point_sprite@arb_point_sprite-mipmap,Fail 
+spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail +spec@arb_sampler_objects@sampler-objects,Fail +spec@arb_shader_texture_lod@execution@glsl-fs-texturelod-01,Fail +spec@arb_texture_multisample@arb_texture_multisample-teximage-3d-multisample,Fail +spec@arb_texture_rectangle@1-1-linear-texture,Fail +spec@arb_texture_rectangle@copyteximage rect samples=2,Crash +spec@arb_texture_rectangle@copyteximage rect samples=4,Crash +spec@arb_texture_rectangle@texrect-many,Crash +spec@arb_texture_storage@texture-storage,Fail +spec@arb_texture_storage@texture-storage@3D mipmapped ,Fail +spec@arb_texture_storage@texture-storage@3D non-mipmapped ,Fail +spec@arb_vertex_program@minmax,Fail +spec@egl 1.4@egl-copy-buffers,Crash +spec@egl 1.4@eglterminate then unbind context,Fail +spec@egl 1.4@largest possible eglcreatepbuffersurface and then glclear,Fail +spec@egl_ext_protected_content@conformance,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail +spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail +spec@egl_khr_surfaceless_context@viewport,Fail +spec@egl_mesa_configless_context@basic,Fail +spec@ext_direct_state_access@multi-texture,Crash +spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail +spec@ext_direct_state_access@multi-texture@MultiTexSubImage1DEXT,Fail +spec@ext_direct_state_access@textures,Fail +spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@CopyTextureSubImage3DEXT,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex*,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail +spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail +spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@TextureSubImage2DEXT,Fail +spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE,Fail +spec@ext_direct_state_access@textures@TextureSubImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail +spec@ext_direct_state_access@textures@TextureSubImage3DEXT,Fail +spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail +spec@ext_framebuffer_multisample@blit-flipped 2 x,Crash +spec@ext_framebuffer_multisample@blit-flipped 2 y,Crash +spec@ext_framebuffer_multisample@blit-flipped 4 x,Crash +spec@ext_framebuffer_multisample@blit-flipped 4 y,Crash +spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail +spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 downsample,Crash +spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 msaa,Crash 
+spec@ext_framebuffer_multisample@clip-and-scissor-blit 2 upsample,Crash +spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 downsample,Crash +spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 msaa,Crash +spec@ext_framebuffer_multisample@clip-and-scissor-blit 4 upsample,Crash +spec@ext_framebuffer_multisample@enable-flag,Crash +spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail +spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail +spec@ext_framebuffer_multisample@line-smooth 2,Crash +spec@ext_framebuffer_multisample@line-smooth 4,Crash +spec@ext_framebuffer_multisample@multisample-blit 2 color linear,Crash +spec@ext_framebuffer_multisample@multisample-blit 2 color,Crash +spec@ext_framebuffer_multisample@multisample-blit 2 depth,Crash +spec@ext_framebuffer_multisample@multisample-blit 2 stencil,Crash +spec@ext_framebuffer_multisample@multisample-blit 4 color linear,Crash +spec@ext_framebuffer_multisample@multisample-blit 4 color,Crash +spec@ext_framebuffer_multisample@multisample-blit 4 depth,Crash +spec@ext_framebuffer_multisample@multisample-blit 4 stencil,Crash +spec@ext_framebuffer_multisample@no-color 2 depth combined,Crash +spec@ext_framebuffer_multisample@no-color 2 depth single,Crash +spec@ext_framebuffer_multisample@no-color 2 depth-computed combined,Crash +spec@ext_framebuffer_multisample@no-color 2 depth-computed single,Crash +spec@ext_framebuffer_multisample@no-color 2 stencil combined,Crash +spec@ext_framebuffer_multisample@no-color 2 stencil single,Crash +spec@ext_framebuffer_multisample@no-color 4 depth combined,Crash +spec@ext_framebuffer_multisample@no-color 4 depth single,Crash +spec@ext_framebuffer_multisample@no-color 4 depth-computed combined,Crash +spec@ext_framebuffer_multisample@no-color 4 depth-computed single,Crash +spec@ext_framebuffer_multisample@no-color 4 stencil combined,Crash +spec@ext_framebuffer_multisample@no-color 4 stencil single,Crash +spec@ext_framebuffer_multisample@point-smooth 2,Crash +spec@ext_framebuffer_multisample@point-smooth 4,Crash +spec@ext_framebuffer_multisample@polygon-smooth 2,Crash +spec@ext_framebuffer_multisample@polygon-smooth 4,Crash +spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 color,Fail +spec@ext_framebuffer_multisample@sample-alpha-to-coverage 2 depth,Crash +spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 color,Fail +spec@ext_framebuffer_multisample@sample-alpha-to-coverage 4 depth,Crash +spec@ext_framebuffer_multisample@sample-coverage 2 inverted,Crash +spec@ext_framebuffer_multisample@sample-coverage 2 non-inverted,Crash +spec@ext_framebuffer_multisample@sample-coverage 4 inverted,Crash +spec@ext_framebuffer_multisample@sample-coverage 4 non-inverted,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 color downsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 color msaa,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 color upsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 depth downsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 depth msaa,Crash +spec@ext_framebuffer_multisample@unaligned-blit 2 depth upsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 4 color downsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 4 color msaa,Crash +spec@ext_framebuffer_multisample@unaligned-blit 4 color upsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 4 depth downsample,Crash +spec@ext_framebuffer_multisample@unaligned-blit 4 depth msaa,Crash 
+spec@ext_framebuffer_multisample@unaligned-blit 4 depth upsample,Crash +spec@ext_framebuffer_multisample@upsample 2 color linear,Crash +spec@ext_framebuffer_multisample@upsample 2 color,Crash +spec@ext_framebuffer_multisample@upsample 2 depth,Crash +spec@ext_framebuffer_multisample@upsample 2 stencil,Crash +spec@ext_framebuffer_multisample@upsample 4 color linear,Crash +spec@ext_framebuffer_multisample@upsample 4 color,Crash +spec@ext_framebuffer_multisample@upsample 4 depth,Crash +spec@ext_framebuffer_multisample@upsample 4 stencil,Crash +spec@ext_framebuffer_multisample_blit_scaled@negative-blit-scaled,Crash +spec@ext_framebuffer_object@fbo-3d,Fail +spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail +spec@ext_framebuffer_object@fbo-depth-sample-compare,Fail +spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index1-blit,Fail +spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index16-blit,Fail +spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index4-blit,Fail +spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index8-blit,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p016,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y210,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y212,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y410,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail +spec@ext_occlusion_query_boolean@any-samples,Fail +spec@ext_packed_depth_stencil@depth_stencil texture,Fail +spec@ext_packed_depth_stencil@fbo-depthstencil-gl_depth24_stencil8-clear,Fail +spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-blit,Fail +spec@ext_packed_depth_stencil@texwrap formats,Fail +spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8,Fail +spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- NPOT,Fail +spec@ext_packed_depth_stencil@texwrap formats@GL_DEPTH24_STENCIL8- swizzled,Fail +spec@ext_provoking_vertex@provoking-vertex,Fail +spec@ext_texture_format_bgra8888@api-errors,Fail +spec@ext_texture_srgb@texwrap formats bordercolor,Fail +spec@ext_texture_srgb@texwrap formats bordercolor-swizzled,Fail +spec@ext_texture_srgb@texwrap formats bordercolor-swizzled@GL_SLUMINANCE8- swizzled- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor-swizzled@GL_SLUMINANCE8_ALPHA8- swizzled- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor-swizzled@GL_SRGB8- swizzled- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor-swizzled@GL_SRGB8_ALPHA8- swizzled- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor@GL_SLUMINANCE8- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor@GL_SLUMINANCE8_ALPHA8- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor@GL_SRGB8- border color only,Fail +spec@ext_texture_srgb@texwrap formats bordercolor@GL_SRGB8_ALPHA8- border color only,Fail +spec@ext_texture_srgb@texwrap formats,Fail +spec@ext_texture_srgb@texwrap formats@GL_SLUMINANCE8,Fail +spec@ext_texture_srgb@texwrap formats@GL_SLUMINANCE8- NPOT,Fail +spec@ext_texture_srgb@texwrap 
formats@GL_SLUMINANCE8- swizzled,Fail +spec@ext_texture_srgb@texwrap formats@GL_SLUMINANCE8_ALPHA8,Fail +spec@ext_texture_srgb@texwrap formats@GL_SLUMINANCE8_ALPHA8- NPOT,Fail +spec@ext_texture_srgb@texwrap formats@GL_SLUMINANCE8_ALPHA8- swizzled,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8- NPOT,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8- swizzled,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- NPOT,Fail +spec@ext_texture_srgb@texwrap formats@GL_SRGB8_ALPHA8- swizzled,Fail +spec@glsl-1.10@built-in constants,Fail +spec@glsl-1.10@built-in constants@gl_MaxVertexAttribs,Fail +spec@glsl-1.10@execution@built-in-functions@fs-cos-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-cos-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-cos-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-cos-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp2-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp2-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-exp2-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log2-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log2-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-log2-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-pow-float-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-pow-vec2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-pow-vec3-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-pow-vec4-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-sin-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-sin-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-sin-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-sin-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@fs-tan-float,Fail +spec@glsl-1.10@execution@built-in-functions@fs-tan-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@fs-tan-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@fs-tan-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-cos-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-cos-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-cos-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-cos-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp2-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp2-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-exp2-vec4,Fail 
+spec@glsl-1.10@execution@built-in-functions@vs-log-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log2-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log2-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-log2-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-pow-float-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-pow-vec2-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-pow-vec3-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-pow-vec4-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-sin-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-sin-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-sin-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-sin-vec4,Fail +spec@glsl-1.10@execution@built-in-functions@vs-tan-float,Fail +spec@glsl-1.10@execution@built-in-functions@vs-tan-vec2,Fail +spec@glsl-1.10@execution@built-in-functions@vs-tan-vec3,Fail +spec@glsl-1.10@execution@built-in-functions@vs-tan-vec4,Fail +spec@glsl-1.10@execution@fs-texture-select,Fail +spec@glsl-1.10@execution@glsl-fs-convolution-2,Fail +spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-2,Fail +spec@glsl-1.10@execution@samplers@glsl-fs-sampler-numbering-3,Fail +spec@glsl-1.10@execution@samplers@in-parameter-array,Fail +spec@glsl-1.10@execution@texture3d,Fail +spec@glsl-1.20@built-in constants,Fail +spec@glsl-1.20@built-in constants@gl_MaxVertexAttribs,Fail +spec@glsl-1.20@execution@fs-nan-builtin-max,Fail +spec@glsl-1.20@execution@fs-nan-builtin-min,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 2dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 3d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() cube,Crash +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 1dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 2dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) 3d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture(bias) cube,Crash +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1d_projvec4,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 1dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2d_projvec4,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 2dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj 3d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1d_projvec4,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 1dshadow,Fail 
+spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2d_projvec4,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 2dshadow,Fail +spec@glsl-1.20@execution@tex-miplevel-selection gl2:textureproj(bias) 3d,Fail +spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail +spec@glsl-1.20@execution@variable-indexing@vs-temp-array-mat4-index-col-row-wr,Fail +spec@glsl-1.20@execution@vs-nan-builtin-max,Fail +spec@glsl-1.20@execution@vs-nan-builtin-min,Fail +spec@intel_performance_query@intel_performance_query-issue_2235,Fail +spec@khr_texture_compression_astc@basic-gles,Fail +spec@khr_texture_compression_astc@miptree-gl ldr,Fail +spec@khr_texture_compression_astc@miptree-gl ldr@LDR Profile,Fail +spec@khr_texture_compression_astc@miptree-gl srgb,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-fp,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-sd,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-sd@sRGB skip decode,Fail +spec@khr_texture_compression_astc@miptree-gl srgb@sRGB decode,Fail +spec@khr_texture_compression_astc@miptree-gles ldr,Fail +spec@khr_texture_compression_astc@miptree-gles ldr@LDR Profile,Fail +spec@khr_texture_compression_astc@miptree-gles srgb,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail +spec@oes_compressed_etc1_rgb8_texture@miptree,Fail diff --git a/lib/mesa/src/broadcom/ci/vc4-rpi3-flakes.txt b/lib/mesa/src/broadcom/ci/vc4-rpi3-flakes.txt new file mode 100644 index 000000000..895a2f767 --- /dev/null +++ b/lib/mesa/src/broadcom/ci/vc4-rpi3-flakes.txt @@ -0,0 +1,39 @@ +dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_neg_x_neg_y_neg_z +dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_and_pos_y_pos_z +dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_and_pos_x_neg_y_pos_z_and_neg_x_pos_y_neg_z +dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_x_neg_y_pos_z_and_neg_x_pos_y_neg_z +dEQP-GLES2.functional.draw.random.51 +dEQP-GLES2.functional.fragment_ops.blend.rgb_func_alpha_func.src.one_minus_src_alpha_constant_color +dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_dynamic_loop_subscript_read_vertex +dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_int_vertex +dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_continue_vertex +dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_inout_vertex +dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_return_vertex +dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_sequence_vertex +dEQP-GLES2.functional.shaders.loops.while_constant_iterations.select_iteration_count_vertex +dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_return_vertex +dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.infinite_with_conditional_break_vertex +dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.post_increment_vertex +dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_iteration_vertex +dEQP-GLES2.functional.shaders.operator.unary_operator.pre_decrement_result.mediump_vec3_fragment +dEQP-GLES2.functional.shaders.random.exponential.fragment.51 
+dEQP-GLES2.functional.shaders.random.texture.fragment.129 +dEQP-GLES2.functional.shaders.return.output_write_in_func_never_vertex +dEQP-GLES2.functional.texture.filtering.2d.linear_linear_clamp_rgb888_pot +dEQP-GLES2.functional.texture.filtering.cube.linear_mipmap_linear_nearest_mirror_rgba8888 +dEQP-GLES2.functional.texture.filtering.cube.nearest_linear_mirror_rgba8888_pot +dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_linear_clamp_rgba8888 +dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_linear_nearest_repeat_l8 +dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_clamp_rgba8888 +dEQP-GLES2.functional.texture.filtering.cube.nearest_mipmap_nearest_linear_mirror_rgba8888 +dEQP-GLES2.functional.texture.mipmap.cube.generate.rgb565_fastest +dEQP-GLES2.functional.texture.size.cube.256x256_rgb888 + +glx@glx-multi-window-single-context +shaders@glsl-vs-loop +shaders@glsl-vs-loop-nested +spec@arb_framebuffer_srgb@blit renderbuffer srgb single_sampled enabled clear +spec@egl_chromium_sync_control@conformance +spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-readpixels +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=2 +spec@!opengl 1.1@depthstencil-default_fb-drawpixels-float-and-ushort samples=4 diff --git a/lib/mesa/src/broadcom/ci/vc4-rpi3-skips.txt b/lib/mesa/src/broadcom/ci/vc4-rpi3-skips.txt new file mode 100644 index 000000000..692eaff24 --- /dev/null +++ b/lib/mesa/src/broadcom/ci/vc4-rpi3-skips.txt @@ -0,0 +1,46 @@ +# Note: skips lists for CI are just a list of lines that, when +# non-zero-length and not starting with '#', will regex match to +# delete lines from the test list. Be careful. + +# This is causing a binning memory overflow problem +dEQP-GLES2.functional.fragment_ops.scissor.outside_render_line + +# These are very slow +dEQP-GLES2.functional.uniform_api.random.3 +dEQP-GLES2.functional.uniform_api.random.79 + +# Conformance issue: VC4 needs dynamic loops in the VS to cause a +# shader link failure. +# +# The issue is that the HW doesn't have an exec mask at dispatch +# for the VS, so the shouldn't-be-exec channels have undefined +# contents and may cause infinite loops, leading to GPU hangs. The +# process of GPU hang reset causes flakes in whatever other jobs are +# running simultaneously, so we can't even leave these in the flakes +# list for tracking. 
+dEQP-GLES2.functional.shaders.loops.*dynamic.*vertex + +# Timeout tests (> 1 minute to run) +KHR-GLES2.texture_3d.filtering.sizes.3x7x5_linear_mipmap_linear +KHR-GLES2.texture_3d.filtering.sizes.4x8x8_linear_mipmap_linear + +# Slow tests (> 1 minute to run) +spec@ext_framebuffer_multisample@accuracy +glx@glx-multithread-texture +spec@arb_internalformat_query2@all internalformat_<x>_type pname checks +spec@!opengl 1.1@streaming-texture-leak +spec@!opengl 1.0@gl-1.0-blend-func +shaders@glsl-predication-on-large-array + +# Extensions not supported +spec@arb_gpu_shader_fp64.* +spec@arb_gpu_shader_gpu5.* +spec@arb_gpu_shader_int64.* +spec@arb_tessellation_shader.* +spec@arb_texture_cube_map.* +spec@glsl-1.30.* +spec@glsl-1.40.* +spec@glsl-1.50.* +spec@glsl-3.* +spec@glsl-4.* +spec@glsl-es-3.* diff --git a/lib/mesa/src/broadcom/cle/v3d_decoder.c b/lib/mesa/src/broadcom/cle/v3d_decoder.c index 364419074..97dd8ce84 100644 --- a/lib/mesa/src/broadcom/cle/v3d_decoder.c +++ b/lib/mesa/src/broadcom/cle/v3d_decoder.c @@ -674,11 +674,11 @@ v3d_spec_load(const struct v3d_device_info *devinfo) for (int i = 0; i < ARRAY_SIZE(genxml_files_table); i++) { if (i != 0) { - assert(genxml_files_table[i - 1].gen_10 < - genxml_files_table[i].gen_10); + assert(genxml_files_table[i - 1].ver_10 < + genxml_files_table[i].ver_10); } - if (genxml_files_table[i].gen_10 <= devinfo->ver) { + if (genxml_files_table[i].ver_10 <= devinfo->ver) { text_offset = genxml_files_table[i].offset; text_length = genxml_files_table[i].length; } diff --git a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml index 2fdc685ae..de80a6b64 100644 --- a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml +++ b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml @@ -950,11 +950,7 @@ <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="12" type="uint"> - <value name="Render target maximum 32bpp" value="0"/> - <value name="Render target maximum 64bpp" value="1"/> - <value name="Render target maximum 128bpp" value="2"/> - </field> + <field name="Maximum BPP of all render targets" size="2" start="12" type="Internal BPP"/> <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/> @@ -992,11 +988,7 @@ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/> <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/> - <field name="Maximum BPP of all render targets" size="2" start="40" type="uint"> - <value name="Render target maximum 32bpp" value="0"/> - <value name="Render target maximum 64bpp" value="1"/> - <value name="Render target maximum 128bpp" value="2"/> - </field> + <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/> <field name="Image Height (pixels)" size="16" start="24" type="uint"/> <field name="Image Width (pixels)" size="16" start="8" type="uint"/> diff --git a/lib/mesa/src/broadcom/clif/clif_dump.c b/lib/mesa/src/broadcom/clif/clif_dump.c index bf84c0b96..0aaa6b6ad 100644 --- a/lib/mesa/src/broadcom/clif/clif_dump.c +++ b/lib/mesa/src/broadcom/clif/clif_dump.c @@ -52,7 +52,7 @@ clif_dump_add_address_to_worklist(struct clif_dump *clif, struct clif_dump * clif_dump_init(const struct v3d_device_info *devinfo, - FILE *out, bool pretty) + FILE *out, bool pretty, bool nobin) { struct clif_dump *clif = rzalloc(NULL, struct clif_dump); @@ -60,6 +60,7 @@ 
clif_dump_init(const struct v3d_device_info *devinfo, clif->out = out; clif->spec = v3d_spec_load(devinfo); clif->pretty = pretty; + clif->nobin = nobin; list_inithead(&clif->worklist); @@ -238,6 +239,9 @@ static void clif_dump_binary(struct clif_dump *clif, struct clif_bo *bo, uint32_t start, uint32_t end) { + if (clif->pretty && clif->nobin) + return; + if (start == end) return; diff --git a/lib/mesa/src/broadcom/clif/clif_dump.h b/lib/mesa/src/broadcom/clif/clif_dump.h index 8de3a2cbe..63f3ae77d 100644 --- a/lib/mesa/src/broadcom/clif/clif_dump.h +++ b/lib/mesa/src/broadcom/clif/clif_dump.h @@ -32,7 +32,7 @@ struct clif_dump; struct drm_v3d_submit_cl; struct clif_dump *clif_dump_init(const struct v3d_device_info *devinfo, - FILE *output, bool pretty); + FILE *output, bool pretty, bool nobin); void clif_dump(struct clif_dump *clif, const struct drm_v3d_submit_cl *submit); void clif_dump_destroy(struct clif_dump *clif); diff --git a/lib/mesa/src/broadcom/clif/clif_private.h b/lib/mesa/src/broadcom/clif/clif_private.h index 597d0b506..d96bfd12d 100644 --- a/lib/mesa/src/broadcom/clif/clif_private.h +++ b/lib/mesa/src/broadcom/clif/clif_private.h @@ -54,6 +54,11 @@ struct clif_dump { * output. */ bool pretty; + + /** + * Flag to not dump the binary resources. + */ + bool nobin; }; enum reloc_worklist_type { diff --git a/lib/mesa/src/broadcom/common/v3d_debug.c b/lib/mesa/src/broadcom/common/v3d_debug.c index 64a2426b9..508a2b7c7 100644 --- a/lib/mesa/src/broadcom/common/v3d_debug.c +++ b/lib/mesa/src/broadcom/common/v3d_debug.c @@ -34,33 +34,65 @@ #include "common/v3d_debug.h" #include "util/macros.h" -#include "util/debug.h" +#include "util/u_debug.h" #include "c11/threads.h" uint32_t V3D_DEBUG = 0; -static const struct debug_control debug_control[] = { - { "cl", V3D_DEBUG_CL}, - { "clif", V3D_DEBUG_CLIF}, - { "qpu", V3D_DEBUG_QPU}, - { "vir", V3D_DEBUG_VIR}, - { "nir", V3D_DEBUG_NIR}, - { "tgsi", V3D_DEBUG_TGSI}, - { "shaderdb", V3D_DEBUG_SHADERDB}, - { "surface", V3D_DEBUG_SURFACE}, - { "perf", V3D_DEBUG_PERF}, - { "norast", V3D_DEBUG_NORAST}, - { "fs", V3D_DEBUG_FS}, - { "gs", V3D_DEBUG_GS}, - { "vs", V3D_DEBUG_VS}, - { "cs", V3D_DEBUG_CS}, - { "always_flush", V3D_DEBUG_ALWAYS_FLUSH}, - { "precompile", V3D_DEBUG_PRECOMPILE}, - { "ra", V3D_DEBUG_RA}, - { "dump_spirv", V3D_DEBUG_DUMP_SPIRV}, - { NULL, 0 } +static const struct debug_named_value debug_control[] = { + { "cl", V3D_DEBUG_CL, + "Dump command list during creation" }, + { "cl_nobin", V3D_DEBUG_CL_NO_BIN, + "Dump command list during creation, excluding binary resources" }, + { "clif", V3D_DEBUG_CLIF, + "Dump command list (CLIF format) during creation", }, + { "qpu", V3D_DEBUG_QPU, + "Dump generated QPU instructions" }, + { "vir", V3D_DEBUG_VIR, + "Dump VIR during program compile" }, + { "nir", V3D_DEBUG_NIR, + "Dump NIR during program compile" }, + { "tgsi", V3D_DEBUG_TGSI, + "Dump TGSI during program compile" }, + { "shaderdb", V3D_DEBUG_SHADERDB, + "Dump program compile information for shader-db analysis" }, + { "surface", V3D_DEBUG_SURFACE, + "Print resource layout information" }, + { "perf", V3D_DEBUG_PERF, + "Print performance-related events during runtime" }, + { "norast", V3D_DEBUG_NORAST, + "Skip actual hardware execution of commands" }, + { "fs", V3D_DEBUG_FS, + "Dump fragment shaders" }, + { "gs", V3D_DEBUG_GS, + "Dump geometry shaders" }, + { "vs", V3D_DEBUG_VS, + "Dump vertex shaders" }, + { "cs", V3D_DEBUG_CS, + "Dump compute shaders" }, + { "always_flush", V3D_DEBUG_ALWAYS_FLUSH, + "Flush after each draw call" }, + { 
"precompile", V3D_DEBUG_PRECOMPILE, + "Precompiles shader variant at shader state creation time" }, + { "ra", V3D_DEBUG_RA, + "Dump register allocation failures" }, + { "dump_spirv", V3D_DEBUG_DUMP_SPIRV, + "Dump SPIR-V code" }, + { "tmu32", V3D_DEBUG_TMU_32BIT, + "Force 32-bit precision on all TMU operations" }, + /* This can lead to incorrect behavior for applications that do + * require full 32-bit precision, but can improve performance + * for those that don't. + */ + { "tmu16", V3D_DEBUG_TMU_16BIT, + "Force 16-bit precision on all TMU operations" }, + { "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL, + "Disable loop unrolling" }, + { NULL } }; +DEBUG_GET_ONCE_FLAGS_OPTION(v3d_debug, "V3D_DEBUG", debug_control, 0) + uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage) { @@ -76,20 +108,11 @@ v3d_debug_flag_for_shader_stage(gl_shader_stage stage) return flags[stage]; } -static void -v3d_process_debug_variable_once(void) -{ - V3D_DEBUG = parse_debug_string(getenv("V3D_DEBUG"), debug_control); - - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) - V3D_DEBUG |= V3D_DEBUG_NORAST; -} - void v3d_process_debug_variable(void) { - static once_flag v3d_process_debug_variable_flag = ONCE_FLAG_INIT; + V3D_DEBUG = debug_get_option_v3d_debug(); - call_once(&v3d_process_debug_variable_flag, - v3d_process_debug_variable_once); + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) + V3D_DEBUG |= V3D_DEBUG_NORAST; } diff --git a/lib/mesa/src/broadcom/common/v3d_debug.h b/lib/mesa/src/broadcom/common/v3d_debug.h index efa269758..72d632568 100644 --- a/lib/mesa/src/broadcom/common/v3d_debug.h +++ b/lib/mesa/src/broadcom/common/v3d_debug.h @@ -59,6 +59,10 @@ extern uint32_t V3D_DEBUG; #define V3D_DEBUG_PRECOMPILE (1 << 15) #define V3D_DEBUG_RA (1 << 16) #define V3D_DEBUG_DUMP_SPIRV (1 << 17) +#define V3D_DEBUG_TMU_32BIT (1 << 18) +#define V3D_DEBUG_TMU_16BIT (1 << 19) +#define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20) +#define V3D_DEBUG_CL_NO_BIN (1 << 21) #define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \ V3D_DEBUG_VIR | V3D_DEBUG_QPU | \ @@ -81,11 +85,6 @@ extern uint32_t V3D_DEBUG; #define dbg_printf(...) fprintf(stderr, __VA_ARGS__) #endif /* HAVE_ANDROID_PLATFORM */ -#define DBG(flag, ...) 
do { \ - if (unlikely(V3D_DEBUG & (flag))) \ - dbg_printf(__VA_ARGS__); \ -} while(0) - extern uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage); extern void v3d_process_debug_variable(void); diff --git a/lib/mesa/src/broadcom/common/v3d_limits.h b/lib/mesa/src/broadcom/common/v3d_limits.h index a974ebc58..129e53e29 100644 --- a/lib/mesa/src/broadcom/common/v3d_limits.h +++ b/lib/mesa/src/broadcom/common/v3d_limits.h @@ -62,4 +62,6 @@ #define V3D_MAX_POINT_SIZE 512.0f #define V3D_MAX_LINE_WIDTH 32 +#define V3D_MAX_BUFFER_RANGE (1 << 27) + #endif /* V3D_LIMITS_H */ diff --git a/lib/mesa/src/broadcom/common/v3d_tiling.c b/lib/mesa/src/broadcom/common/v3d_tiling.c new file mode 100644 index 000000000..22f84811e --- /dev/null +++ b/lib/mesa/src/broadcom/common/v3d_tiling.c @@ -0,0 +1,492 @@ +/* + * Copyright © 2014-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file v3d_tiling.c + * + * Handles information about the V3D tiling formats, and loading and storing + * from them. + */ + +#include <stdint.h> +#include "v3d_tiling.h" +#include "broadcom/common/v3d_cpu_tiling.h" + +/** Return the width in pixels of a 64-byte microtile. */ +uint32_t +v3d_utile_width(int cpp) +{ + switch (cpp) { + case 1: + case 2: + return 8; + case 4: + case 8: + return 4; + case 16: + return 2; + default: + unreachable("unknown cpp"); + } +} + +/** Return the height in pixels of a 64-byte microtile. */ +uint32_t +v3d_utile_height(int cpp) +{ + switch (cpp) { + case 1: + return 8; + case 2: + case 4: + return 4; + case 8: + case 16: + return 2; + default: + unreachable("unknown cpp"); + } +} + +/** + * Returns the byte address for a given pixel within a utile. + * + * Utiles are 64b blocks of pixels in raster order, with 32bpp being a 4x4 + * arrangement. + */ +static inline uint32_t +v3d_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y) +{ + uint32_t utile_w = v3d_utile_width(cpp); + + assert(x < utile_w && y < v3d_utile_height(cpp)); + + return x * cpp + y * utile_w * cpp; +} + +/** + * Returns the byte offset for a given pixel in a LINEARTILE layout. + * + * LINEARTILE is a single line of utiles in either the X or Y direction. 
+ */ +static inline uint32_t +v3d_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y) +{ + uint32_t utile_w = v3d_utile_width(cpp); + uint32_t utile_h = v3d_utile_height(cpp); + uint32_t utile_index_x = x / utile_w; + uint32_t utile_index_y = y / utile_h; + + assert(utile_index_x == 0 || utile_index_y == 0); + + return (64 * (utile_index_x + utile_index_y) + + v3d_get_utile_pixel_offset(cpp, + x & (utile_w - 1), + y & (utile_h - 1))); +} + +/** + * Returns the byte offset for a given pixel in a UBLINEAR layout. + * + * UBLINEAR is the layout where pixels are arranged in UIF blocks (2x2 + * utiles), and the UIF blocks are in 1 or 2 columns in raster order. + */ +static inline uint32_t +v3d_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y, + int ublinear_number) +{ + uint32_t utile_w = v3d_utile_width(cpp); + uint32_t utile_h = v3d_utile_height(cpp); + uint32_t ub_w = utile_w * 2; + uint32_t ub_h = utile_h * 2; + uint32_t ub_x = x / ub_w; + uint32_t ub_y = y / ub_h; + + return (256 * (ub_y * ublinear_number + + ub_x) + + ((x & utile_w) ? 64 : 0) + + ((y & utile_h) ? 128 : 0) + + + v3d_get_utile_pixel_offset(cpp, + x & (utile_w - 1), + y & (utile_h - 1))); +} + +static inline uint32_t +v3d_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h, + uint32_t x, uint32_t y) +{ + return v3d_get_ublinear_pixel_offset(cpp, x, y, 2); +} + +static inline uint32_t +v3d_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h, + uint32_t x, uint32_t y) +{ + return v3d_get_ublinear_pixel_offset(cpp, x, y, 1); +} + +/** + * Returns the byte offset for a given pixel in a UIF layout. + * + * UIF is the general V3D tiling layout shared across 3D, media, and scanout. + * It stores pixels in UIF blocks (2x2 utiles), and UIF blocks are stored in + * 4x4 groups, and those 4x4 groups are then stored in raster order. + */ +static inline uint32_t +v3d_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y, + bool do_xor) +{ + uint32_t utile_w = v3d_utile_width(cpp); + uint32_t utile_h = v3d_utile_height(cpp); + uint32_t mb_width = utile_w * 2; + uint32_t mb_height = utile_h * 2; + uint32_t log2_mb_width = ffs(mb_width) - 1; + uint32_t log2_mb_height = ffs(mb_height) - 1; + + /* Macroblock X, y */ + uint32_t mb_x = x >> log2_mb_width; + uint32_t mb_y = y >> log2_mb_height; + /* X, y within the macroblock */ + uint32_t mb_pixel_x = x - (mb_x << log2_mb_width); + uint32_t mb_pixel_y = y - (mb_y << log2_mb_height); + + if (do_xor && (mb_x / 4) & 1) + mb_y ^= 0x10; + + uint32_t mb_h = align(image_h, 1 << log2_mb_height) >> log2_mb_height; + uint32_t mb_id = ((mb_x / 4) * ((mb_h - 1) * 4)) + mb_x + mb_y * 4; + + uint32_t mb_base_addr = mb_id * 256; + + bool top = mb_pixel_y < utile_h; + bool left = mb_pixel_x < utile_w; + + /* Docs have this in pixels, we do bytes here. 
*/ + uint32_t mb_tile_offset = (!top * 128 + !left * 64); + + uint32_t utile_x = mb_pixel_x & (utile_w - 1); + uint32_t utile_y = mb_pixel_y & (utile_h - 1); + + uint32_t mb_pixel_address = (mb_base_addr + + mb_tile_offset + + v3d_get_utile_pixel_offset(cpp, + utile_x, + utile_y)); + + return mb_pixel_address; +} + +static inline uint32_t +v3d_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h, + uint32_t x, uint32_t y) +{ + return v3d_get_uif_pixel_offset(cpp, image_h, x, y, true); +} + +static inline uint32_t +v3d_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h, + uint32_t x, uint32_t y) +{ + return v3d_get_uif_pixel_offset(cpp, image_h, x, y, false); +} + +/* Loads/stores non-utile-aligned boxes by walking over the destination + * rectangle, computing the address on the GPU, and storing/loading a pixel at + * a time. + */ +static inline void +v3d_move_pixels_unaligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, uint32_t image_h, + const struct pipe_box *box, + uint32_t (*get_pixel_offset)(uint32_t cpp, + uint32_t image_h, + uint32_t x, uint32_t y), + bool is_load) +{ + for (uint32_t y = 0; y < box->height; y++) { + void *cpu_row = cpu + y * cpu_stride; + + for (int x = 0; x < box->width; x++) { + uint32_t pixel_offset = get_pixel_offset(cpp, image_h, + box->x + x, + box->y + y); + + if (false) { + fprintf(stderr, "%3d,%3d -> %d\n", + box->x + x, box->y + y, + pixel_offset); + } + + if (is_load) { + memcpy(cpu_row + x * cpp, + gpu + pixel_offset, + cpp); + } else { + memcpy(gpu + pixel_offset, + cpu_row + x * cpp, + cpp); + } + } + } +} + +/* Breaks the image down into utiles and calls either the fast whole-utile + * load/store functions, or the unaligned fallback case. + */ +static inline void +v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, uint32_t image_h, + const struct pipe_box *box, + uint32_t (*get_pixel_offset)(uint32_t cpp, + uint32_t image_h, + uint32_t x, uint32_t y), + bool is_load) +{ + uint32_t utile_w = v3d_utile_width(cpp); + uint32_t utile_h = v3d_utile_height(cpp); + uint32_t utile_gpu_stride = utile_w * cpp; + uint32_t x1 = box->x; + uint32_t y1 = box->y; + uint32_t x2 = box->x + box->width; + uint32_t y2 = box->y + box->height; + uint32_t align_x1 = align(x1, utile_w); + uint32_t align_y1 = align(y1, utile_h); + uint32_t align_x2 = x2 & ~(utile_w - 1); + uint32_t align_y2 = y2 & ~(utile_h - 1); + + /* Load/store all the whole utiles first. */ + for (uint32_t y = align_y1; y < align_y2; y += utile_h) { + void *cpu_row = cpu + (y - box->y) * cpu_stride; + + for (uint32_t x = align_x1; x < align_x2; x += utile_w) { + void *utile_gpu = (gpu + + get_pixel_offset(cpp, image_h, x, y)); + void *utile_cpu = cpu_row + (x - box->x) * cpp; + + if (is_load) { + v3d_load_utile(utile_cpu, cpu_stride, + utile_gpu, utile_gpu_stride); + } else { + v3d_store_utile(utile_gpu, utile_gpu_stride, + utile_cpu, cpu_stride); + } + } + } + + /* If there were no aligned utiles in the middle, load/store the whole + * thing unaligned. + */ + if (align_y2 <= align_y1 || + align_x2 <= align_x1) { + v3d_move_pixels_unaligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, + box, + get_pixel_offset, is_load); + return; + } + + /* Load/store the partial utiles. 
*/ + struct pipe_box partial_boxes[4] = { + /* Top */ + { + .x = x1, + .width = x2 - x1, + .y = y1, + .height = align_y1 - y1, + }, + /* Bottom */ + { + .x = x1, + .width = x2 - x1, + .y = align_y2, + .height = y2 - align_y2, + }, + /* Left */ + { + .x = x1, + .width = align_x1 - x1, + .y = align_y1, + .height = align_y2 - align_y1, + }, + /* Right */ + { + .x = align_x2, + .width = x2 - align_x2, + .y = align_y1, + .height = align_y2 - align_y1, + }, + }; + for (int i = 0; i < ARRAY_SIZE(partial_boxes); i++) { + void *partial_cpu = (cpu + + (partial_boxes[i].y - y1) * cpu_stride + + (partial_boxes[i].x - x1) * cpp); + + v3d_move_pixels_unaligned(gpu, gpu_stride, + partial_cpu, cpu_stride, + cpp, image_h, + &partial_boxes[i], + get_pixel_offset, is_load); + } +} + +static inline void +v3d_move_pixels_general(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, uint32_t image_h, + const struct pipe_box *box, + uint32_t (*get_pixel_offset)(uint32_t cpp, + uint32_t image_h, + uint32_t x, uint32_t y), + bool is_load) +{ + switch (cpp) { + case 1: + v3d_move_pixels_general_percpp(gpu, gpu_stride, + cpu, cpu_stride, + 1, image_h, box, + get_pixel_offset, + is_load); + break; + case 2: + v3d_move_pixels_general_percpp(gpu, gpu_stride, + cpu, cpu_stride, + 2, image_h, box, + get_pixel_offset, + is_load); + break; + case 4: + v3d_move_pixels_general_percpp(gpu, gpu_stride, + cpu, cpu_stride, + 4, image_h, box, + get_pixel_offset, + is_load); + break; + case 8: + v3d_move_pixels_general_percpp(gpu, gpu_stride, + cpu, cpu_stride, + 8, image_h, box, + get_pixel_offset, + is_load); + break; + case 16: + v3d_move_pixels_general_percpp(gpu, gpu_stride, + cpu, cpu_stride, + 16, image_h, box, + get_pixel_offset, + is_load); + break; + } +} + +static inline void +v3d_move_tiled_image(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + enum v3d_tiling_mode tiling_format, + int cpp, + uint32_t image_h, + const struct pipe_box *box, + bool is_load) +{ + switch (tiling_format) { + case V3D_TILING_UIF_XOR: + v3d_move_pixels_general(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, box, + v3d_get_uif_xor_pixel_offset, + is_load); + break; + case V3D_TILING_UIF_NO_XOR: + v3d_move_pixels_general(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, box, + v3d_get_uif_no_xor_pixel_offset, + is_load); + break; + case V3D_TILING_UBLINEAR_2_COLUMN: + v3d_move_pixels_general(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, box, + v3d_get_ublinear_2_column_pixel_offset, + is_load); + break; + case V3D_TILING_UBLINEAR_1_COLUMN: + v3d_move_pixels_general(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, box, + v3d_get_ublinear_1_column_pixel_offset, + is_load); + break; + case V3D_TILING_LINEARTILE: + v3d_move_pixels_general(gpu, gpu_stride, + cpu, cpu_stride, + cpp, image_h, box, + v3d_get_lt_pixel_offset, + is_load); + break; + default: + unreachable("Unsupported tiling format"); + break; + } +} + +/** + * Loads pixel data from the start (microtile-aligned) box in \p src to the + * start of \p dst according to the given tiling format. + */ +void +v3d_load_tiled_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + enum v3d_tiling_mode tiling_format, int cpp, + uint32_t image_h, + const struct pipe_box *box) +{ + v3d_move_tiled_image(src, src_stride, + dst, dst_stride, + tiling_format, + cpp, + image_h, + box, + true); +} + +/** + * Stores pixel data from the start of \p src into a (microtile-aligned) box in + * \p dst according to the given tiling format. 
+
+/**
+ * Stores pixel data from the start of \p src into a (microtile-aligned) box
+ * in \p dst according to the given tiling format.
+ */
+void
+v3d_store_tiled_image(void *dst, uint32_t dst_stride,
+                      void *src, uint32_t src_stride,
+                      enum v3d_tiling_mode tiling_format, int cpp,
+                      uint32_t image_h,
+                      const struct pipe_box *box)
+{
+        v3d_move_tiled_image(dst, dst_stride,
+                             src, src_stride,
+                             tiling_format,
+                             cpp,
+                             image_h,
+                             box,
+                             false);
+}
diff --git a/lib/mesa/src/broadcom/common/v3d_tiling.h b/lib/mesa/src/broadcom/common/v3d_tiling.h
new file mode 100644
index 000000000..08ae7cce8
--- /dev/null
+++ b/lib/mesa/src/broadcom/common/v3d_tiling.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_TILING_H
+#define V3D_TILING_H
+
+#include "util/u_box.h"
+
+/* A UIFblock is a 256-byte region of memory that's 256-byte aligned. These
+ * will be grouped in 4x4 blocks (left-to-right, then top-to-bottom) in a 4KB
+ * page. Those pages are then arranged left-to-right, top-to-bottom, to cover
+ * an image.
+ *
+ * The inside of a UIFblock, for packed pixels, will be split into 4 64-byte
+ * utiles. Utiles may be 8x8 (8bpp), 8x4 (16bpp) or 4x4 (32bpp).
+ */
+
+/**
+ * Tiling mode enum used for v3d_resource.c, which maps directly to the Memory
+ * Format field of render target and Z/Stencil config.
+ */
+enum v3d_tiling_mode {
+        /* Untiled resources. Not valid as texture inputs. */
+        V3D_TILING_RASTER,
+
+        /* Single line of u-tiles. */
+        V3D_TILING_LINEARTILE,
+
+        /* Departure from standard 4-UIF block column format. */
+        V3D_TILING_UBLINEAR_1_COLUMN,
+
+        /* Departure from standard 4-UIF block column format. */
+        V3D_TILING_UBLINEAR_2_COLUMN,
+
+        /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is
+         * split 2x2 into utiles.
+         */
+        V3D_TILING_UIF_NO_XOR,
+
+        /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is
+         * split 2x2 into utiles.
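+         *
+         * The XOR variant below differs from NO_XOR only in an extra address
+         * swizzle applied in v3d_get_uif_pixel_offset() (the do_xor parameter
+         * used by the two wrappers in v3d_tiling.c), presumably to spread
+         * accesses across memory banks.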
+ */ + V3D_TILING_UIF_XOR, +}; + +uint32_t v3d_utile_width(int cpp) ATTRIBUTE_CONST; +uint32_t v3d_utile_height(int cpp) ATTRIBUTE_CONST; +bool v3d_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; +void v3d_load_tiled_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + enum v3d_tiling_mode tiling_format, int cpp, + uint32_t image_h, + const struct pipe_box *box); +void v3d_store_tiled_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + enum v3d_tiling_mode tiling_format, int cpp, + uint32_t image_h, + const struct pipe_box *box); + +#endif /* V3D_TILING_H */ diff --git a/lib/mesa/src/broadcom/common/v3d_util.c b/lib/mesa/src/broadcom/common/v3d_util.c new file mode 100644 index 000000000..424656fd8 --- /dev/null +++ b/lib/mesa/src/broadcom/common/v3d_util.c @@ -0,0 +1,88 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_util.h" +#include "util/macros.h" + +/* Choose a number of workgroups per supergroup that maximizes + * lane occupancy. We can pack up to 16 workgroups into a supergroup. + */ +uint32_t +v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + bool has_subgroups, + bool has_tsy_barrier, + uint32_t threads, + uint32_t num_wgs, + uint32_t wg_size) +{ + /* FIXME: subgroups may restrict supergroup packing. For now, we disable it + * completely if the shader uses subgroups. + */ + if (has_subgroups) + return 1; + + /* Compute maximum number of batches in a supergroup for this workgroup size. + * Each batch is 16 elements, and we can have up to 16 work groups in a + * supergroup: + * + * max_batches_per_sg = (wg_size * max_wgs_per_sg) / elements_per_batch + * since max_wgs_per_sg = 16 and elements_per_batch = 16, we get: + * max_batches_per_sg = wg_size + */ + uint32_t max_batches_per_sg = wg_size; + + /* QPU threads will stall at TSY barriers until the entire supergroup + * reaches the barrier. Limit the supergroup size to half the QPU threads + * available, so we can have at least 2 supergroups executing in parallel + * and we don't stall all our QPU threads when a supergroup hits a barrier. 
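+    *
+    * For example (illustrative numbers, the real ones come from devinfo):
+    * 8 QPUs running 4 threads each give 32 QPU threads, which would cap a
+    * supergroup at 16 batches here. And for the search below, a
+    * 12-invocation workgroup wastes 4 of the 16 lanes in a batch on its
+    * own, while 4 such workgroups fill exactly 3 batches (48 lanes), so
+    * wgs_per_sg = 4 would be picked.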
+    */
+   if (has_tsy_barrier) {
+      uint32_t max_qpu_threads = devinfo->qpu_count * threads;
+      max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2);
+   }
+   uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size;
+
+   uint32_t best_wgs_per_sg = 1;
+   uint32_t best_unused_lanes = 16;
+   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= max_wgs_per_sg; wgs_per_sg++) {
+      /* Don't try to pack more workgroups per supergroup than the total
+       * number of workgroups dispatched.
+       */
+      if (wgs_per_sg > num_wgs)
+         return best_wgs_per_sg;
+
+      /* Compute wasted lanes for this configuration and keep track of the
+       * config with the least waste.
+       */
+      uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f;
+      if (unused_lanes == 0)
+         return wgs_per_sg;
+
+      if (unused_lanes < best_unused_lanes) {
+         best_wgs_per_sg = wgs_per_sg;
+         best_unused_lanes = unused_lanes;
+      }
+   }
+
+   return best_wgs_per_sg;
+}
diff --git a/lib/mesa/src/broadcom/common/v3d_util.h b/lib/mesa/src/broadcom/common/v3d_util.h
new file mode 100644
index 000000000..b9804f235
--- /dev/null
+++ b/lib/mesa/src/broadcom/common/v3d_util.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2021 Raspberry Pi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_UTIL_H
+#define V3D_UTIL_H
+
+#include "common/v3d_device_info.h"
+
+uint32_t
+v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+                                         bool has_subgroups,
+                                         bool has_tsy_barrier,
+                                         uint32_t threads,
+                                         uint32_t num_wgs,
+                                         uint32_t wg_size);
+
+#endif
diff --git a/lib/mesa/src/broadcom/compiler/nir_to_vir.c b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
index c70d12881..d0a89f1a7 100644
--- a/lib/mesa/src/broadcom/compiler/nir_to_vir.c
+++ b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
@@ -68,6 +68,39 @@
 #define V3D_TSY_DEC_SEMAPHORE          14
 #define V3D_TSY_SET_QUORUM_FREE_ALL    15
 
+enum v3d_tmu_op_type
+{
+        V3D_TMU_OP_TYPE_REGULAR,
+        V3D_TMU_OP_TYPE_ATOMIC,
+        V3D_TMU_OP_TYPE_CACHE
+};
+
+static enum v3d_tmu_op_type
+v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
+{
+        switch(tmu_op) {
+        case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
+        case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
+        case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
+        case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
+        case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
+                return is_write ?
+                        V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
+        case V3D_TMU_OP_WRITE_UMAX:
+        case V3D_TMU_OP_WRITE_SMIN:
+        case V3D_TMU_OP_WRITE_SMAX:
+                assert(is_write);
+                FALLTHROUGH;
+        case V3D_TMU_OP_WRITE_AND_READ_INC:
+        case V3D_TMU_OP_WRITE_OR_READ_DEC:
+        case V3D_TMU_OP_WRITE_XOR_READ_NOT:
+                return V3D_TMU_OP_TYPE_ATOMIC;
+        case V3D_TMU_OP_REGULAR:
+                return V3D_TMU_OP_TYPE_REGULAR;
+
+        default:
+                unreachable("Unknown tmu_op\n");
+        }
+}
 
 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
 
@@ -282,6 +315,8 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
 
         if (c->disable_tmu_pipelining)
                 ntq_flush_tmu(c);
+        else if (c->tmu.flush_count > 1)
+                c->pipelined_any_tmu = true;
 }
 
 enum emit_mode {
@@ -565,11 +600,10 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                            &tmu_writes);
                 }
 
-                /* The spec says that for atomics, the TYPE field is
-                 * ignored, but that doesn't seem to be the case for
-                 * CMPXCHG. Just use the number of tmud writes we did
-                 * to decide the type (or choose "32bit" for atomic
-                 * reads, which has been fine).
+                /* For atomics we use 32bit, except for CMPXCHG, which needs
+                 * to use VEC2. For the rest of the cases we use the number of
+                 * tmud writes we did to decide the type. For cache operations
+                 * the type is ignored.
                  */
                 uint32_t config = 0;
                 if (mode == MODE_EMIT) {
@@ -580,6 +614,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         assert(tmu_writes > 0);
                         num_components = tmu_writes - 1;
                 }
+                bool is_atomic =
+                        v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
+                        V3D_TMU_OP_TYPE_ATOMIC;
 
                 uint32_t perquad = is_load && !vir_in_nonuniform_control_flow(c)
@@ -587,7 +624,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         : GENERAL_TMU_LOOKUP_PER_PIXEL;
                 config = 0xffffff00 | tmu_op << 3 | perquad;
 
-                if (num_components == 1) {
+                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
+                } else if (is_atomic || num_components == 1) {
                         config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
                 } else {
                         config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
@@ -1191,6 +1230,18 @@ out:
         return V3D_QPU_COND_IFNA;
 }
 
+static struct qreg
+ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
+{
+        struct qreg result =
+                vir_MOV(c, vir_SEL(c, cond,
+                                   vir_uniform_ui(c, ~0),
+                                   vir_uniform_ui(c, 0)));
+        c->flags_temp = result.index;
+        c->flags_cond = cond;
+        return result;
+}
+
 static void
 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
 {
@@ -1354,11 +1405,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 enum v3d_qpu_cond cond;
                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
                 assert(ok);
-                result = vir_MOV(c, vir_SEL(c, cond,
-                                            vir_uniform_ui(c, ~0),
-                                            vir_uniform_ui(c, 0)));
-                c->flags_temp = result.index;
-                c->flags_cond = cond;
+                result = ntq_emit_cond_to_bool(c, cond);
                 break;
         }
 
@@ -1438,11 +1485,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
         case nir_op_uadd_carry:
                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
                            V3D_QPU_PF_PUSHC);
-                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
-                                            vir_uniform_ui(c, ~0),
-                                            vir_uniform_ui(c, 0)));
-                c->flags_temp = result.index;
-                c->flags_cond = V3D_QPU_COND_IFA;
+                result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
                 break;
 
         case nir_op_pack_half_2x16_split:
@@ -1627,6 +1670,15 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
 static void
 emit_frag_end(struct v3d_compile *c)
 {
+        /* If the shader has no non-TLB side effects and doesn't write Z
+         * we can promote it to enabling
early_fragment_tests even + * if the user didn't. + */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos)) { + c->s->info.fs.early_fragment_tests = true; + } + if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1651,7 +1703,8 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1) { + if (c->output_position_index != -1 && + !c->s->info.fs.early_fragment_tests) { struct qinst *inst = vir_MOV_dest(c, tlbu_reg, c->outputs[c->output_position_index]); uint8_t tlb_specifier = TLB_TYPE_DEPTH; @@ -1711,17 +1764,22 @@ emit_frag_end(struct v3d_compile *c) static inline void vir_VPM_WRITE_indirect(struct v3d_compile *c, struct qreg val, - struct qreg vpm_index) + struct qreg vpm_index, + bool uniform_vpm_index) { assert(c->devinfo->ver >= 40); - vir_STVPMV(c, vpm_index, val); + if (uniform_vpm_index) + vir_STVPMV(c, vpm_index, val); + else + vir_STVPMD(c, vpm_index, val); } static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index)); + vir_VPM_WRITE_indirect(c, val, + vir_uniform_ui(c, vpm_index), true); } else { /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); @@ -1774,7 +1832,7 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, } void -v3d_optimize_nir(struct nir_shader *s) +v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) { bool progress; unsigned lower_flrp = @@ -1787,7 +1845,7 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS_V(s, nir_lower_vars_to_ssa); NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS(progress, s, nir_lower_phis_to_scalar); + NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); NIR_PASS(progress, s, nir_opt_dce); @@ -1825,6 +1883,14 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_undef); NIR_PASS(progress, s, nir_lower_undef_to_zero); + + if (c && !c->disable_loop_unrolling && + s->options->max_unroll_iterations > 0) { + bool local_progress = false; + NIR_PASS(local_progress, s, nir_opt_loop_unroll); + c->unrolled_any_loops |= local_progress; + progress |= local_progress; + } } while (progress); nir_move_options sink_opts = @@ -1836,15 +1902,11 @@ v3d_optimize_nir(struct nir_shader *s) } static int -driver_location_compare(const void *in_a, const void *in_b) +driver_location_compare(const nir_variable *a, const nir_variable *b) { - const nir_variable *const *a = in_a; - const nir_variable *const *b = in_b; - - if ((*a)->data.driver_location == (*b)->data.driver_location) - return (*a)->data.location_frac - (*b)->data.location_frac; - - return (*a)->data.driver_location - (*b)->data.driver_location; + return a->data.driver_location == b->data.driver_location ? 
+ a->data.location_frac - b->data.location_frac : + a->data.driver_location - b->data.driver_location; } static struct qreg @@ -1984,49 +2046,36 @@ program_reads_point_coord(struct v3d_compile *c) } static void -get_sorted_input_variables(struct v3d_compile *c, - unsigned *num_entries, - nir_variable ***vars) -{ - *num_entries = 0; - nir_foreach_shader_in_variable(var, c->s) - (*num_entries)++; - - *vars = ralloc_array(c, nir_variable *, *num_entries); - - unsigned i = 0; - nir_foreach_shader_in_variable(var, c->s) - (*vars)[i++] = var; - - /* Sort the variables so that we emit the input setup in - * driver_location order. This is required for VPM reads, whose data - * is fetched into the VPM in driver_location (TGSI register index) - * order. - */ - qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare); -} - -static void ntq_setup_gs_inputs(struct v3d_compile *c) { - nir_variable **vars; - unsigned num_entries; - get_sorted_input_variables(c, &num_entries, &vars); - - for (unsigned i = 0; i < num_entries; i++) { - nir_variable *var = vars[i]; + nir_sort_variables_with_modes(c->s, driver_location_compare, + nir_var_shader_in); + nir_foreach_shader_in_variable(var, c->s) { /* All GS inputs are arrays with as many entries as vertices * in the input primitive, but here we only care about the * per-vertex input type. */ - const struct glsl_type *type = glsl_without_array(var->type); + assert(glsl_type_is_array(var->type)); + const struct glsl_type *type = glsl_get_array_element(var->type); unsigned array_len = MAX2(glsl_get_length(type), 1); unsigned loc = var->data.driver_location; resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + array_len) * 4); + if (var->data.compact) { + for (unsigned j = 0; j < array_len; j++) { + unsigned input_idx = c->num_inputs++; + unsigned loc_frac = var->data.location_frac + j; + unsigned loc = var->data.location + loc_frac / 4; + unsigned comp = loc_frac % 4; + c->input_slots[input_idx] = + v3d_slot_from_slot_and_component(loc, comp); + } + continue; + } + for (unsigned j = 0; j < array_len; j++) { unsigned num_elements = glsl_get_vector_elements(type); for (unsigned k = 0; k < num_elements; k++) { @@ -2044,12 +2093,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c) static void ntq_setup_fs_inputs(struct v3d_compile *c) { - nir_variable **vars; - unsigned num_entries; - get_sorted_input_variables(c, &num_entries, &vars); + nir_sort_variables_with_modes(c->s, driver_location_compare, + nir_var_shader_in); - for (unsigned i = 0; i < num_entries; i++) { - nir_variable *var = vars[i]; + nir_foreach_shader_in_variable(var, c->s) { unsigned var_len = glsl_count_vec4_slots(var->type, false, false); unsigned loc = var->data.driver_location; @@ -2062,6 +2109,14 @@ ntq_setup_fs_inputs(struct v3d_compile *c) if (var->data.location == VARYING_SLOT_POS) { emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && + !c->fs_key->has_gs) { + /* If the fragment shader reads gl_PrimitiveID and we + * don't have a geometry shader in the pipeline to write + * it then we program the hardware to inject it as + * an implicit varying. Take it from there. 
+ */ + c->inputs[loc * 4] = c->primitive_id; } else if (util_varying_is_point_coord(var->data.location, c->fs_key->point_sprite_mask)) { c->inputs[loc * 4 + 0] = c->point_x; @@ -2342,8 +2397,16 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) static void ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) { - /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset) - * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR. + /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset). + * + * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even + * if we don't support non-uniform offsets because we also set the + * lower_all_io_to_temps option in the NIR compiler. This ensures that + * any indirect indexing on in/out variables is turned into indirect + * indexing on temporary variables instead, that we handle by lowering + * to scratch. If we implement non-uniform offset here we might be able + * to avoid the temp and scratch lowering, which involves copying from + * the input to the temp variable, possibly making code more optimal. */ unsigned offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); @@ -2448,10 +2511,10 @@ emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) * different offsets in the VPM and we need to use the scatter write * instruction to have a different offset for each lane. */ - if (nir_src_is_dynamically_uniform(instr->src[1])) - vir_VPM_WRITE_indirect(c, val, offset); - else - vir_STVPMD(c, offset, val); + bool is_uniform_offset = + !vir_in_nonuniform_control_flow(c) && + !nir_src_is_divergent(instr->src[1]); + vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); if (vir_in_nonuniform_control_flow(c)) { struct qinst *last_inst = @@ -2461,33 +2524,37 @@ emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) } static void +emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + assert(c->s->info.stage == MESA_SHADER_VERTEX); + assert(instr->num_components == 1); + + uint32_t base = nir_intrinsic_base(instr); + struct qreg val = ntq_get_src(c, instr->src[0], 0); + + if (nir_src_is_const(instr->src[1])) { + vir_VPM_WRITE(c, val, + base + nir_src_as_uint(instr->src[1])); + } else { + struct qreg offset = vir_ADD(c, + ntq_get_src(c, instr->src[1], 1), + vir_uniform_ui(c, base)); + bool is_uniform_offset = + !vir_in_nonuniform_control_flow(c) && + !nir_src_is_divergent(instr->src[1]); + vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); + } +} + +static void ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) { - /* XXX perf: Use stvpmv with uniform non-constant offsets and - * stvpmd with non-uniform offsets and enable - * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. 
- */ - if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + if (c->s->info.stage == MESA_SHADER_FRAGMENT) ntq_emit_color_write(c, instr); - } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) { + else if (c->s->info.stage == MESA_SHADER_GEOMETRY) emit_store_output_gs(c, instr); - } else { - assert(c->s->info.stage == MESA_SHADER_VERTEX); - assert(instr->num_components == 1); - - uint32_t base = nir_intrinsic_base(instr); - if (nir_src_is_const(instr->src[1])) { - vir_VPM_WRITE(c, - ntq_get_src(c, instr->src[0], 0), - base + nir_src_as_uint(instr->src[1])); - } else { - vir_VPM_WRITE_indirect(c, - ntq_get_src(c, instr->src[0], 0), - vir_ADD(c, - ntq_get_src(c, instr->src[1], 1), - vir_uniform_ui(c, base))); - } - } + else + emit_store_output_vs(c, instr); } /** @@ -2707,6 +2774,41 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) } } +static inline struct qreg +emit_load_local_invocation_index(struct v3d_compile *c) +{ + return vir_SHR(c, c->cs_payload[1], + vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); +} + +/* Various subgroup operations rely on the A flags, so this helper ensures that + * A flags represents currently active lanes in the subgroup. + */ +static void +set_a_flags_for_subgroup(struct v3d_compile *c) +{ + /* MSF returns 0 for disabled lanes in compute shaders so + * PUSHZ will set A=1 for disabled lanes. We want the inverse + * of this but we don't have any means to negate the A flags + * directly, but we can do it by repeating the same operation + * with NORZ (A = ~A & ~Z). + */ + assert(c->s->info.stage == MESA_SHADER_COMPUTE); + vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); + vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); + + /* If we are under non-uniform control flow we also need to + * AND the A flags with the current execute mask. 
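+         *
+         * (Worked logic for the PUSHZ/NORZ pair above: after PUSHZ,
+         * A == Z == (msf == 0), i.e. A is set for disabled lanes. The second
+         * MSF write with NORZ computes A = ~A & ~Z = ~Z, i.e. A ends up set
+         * exactly for the active lanes.)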
+         */
+        if (vir_in_nonuniform_control_flow(c)) {
+                const uint32_t bidx = c->cur_block->index;
+                vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
+                                           c->execute,
+                                           vir_uniform_ui(c, bidx)),
+                           V3D_QPU_UF_ANDZ);
+        }
+}
+
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@@ -2772,7 +2874,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_get_ssbo_size:
                 ntq_store_dest(c, &instr->dest, 0,
                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
-                                           nir_src_as_uint(instr->src[0])));
+                                           nir_src_comp_as_uint(instr->src[0], 0)));
                 break;
 
         case nir_intrinsic_get_ubo_size:
@@ -2830,11 +2932,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_helper_invocation:
                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
-                struct qreg qdest = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
-                                                       vir_uniform_ui(c, ~0),
-                                                       vir_uniform_ui(c, 0)));
-                c->flags_temp = qdest.index;
-                c->flags_cond = V3D_QPU_COND_IFA;
+                struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
                 ntq_store_dest(c, &instr->dest, 0, qdest);
                 break;
 
@@ -2960,7 +3058,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 vir_emit_thrsw(c);
                 break;
 
-        case nir_intrinsic_load_num_work_groups:
+        case nir_intrinsic_load_num_workgroups:
                 for (int i = 0; i < 3; i++) {
                         ntq_store_dest(c, &instr->dest, i,
                                        vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
@@ -2968,27 +3066,49 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 }
                 break;
 
-        case nir_intrinsic_load_local_invocation_index:
-                ntq_store_dest(c, &instr->dest, 0,
-                               vir_SHR(c, c->cs_payload[1],
-                                       vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
+        case nir_intrinsic_load_workgroup_id: {
+                struct qreg x = vir_AND(c, c->cs_payload[0],
+                                        vir_uniform_ui(c, 0xffff));
+
+                struct qreg y = vir_SHR(c, c->cs_payload[0],
+                                        vir_uniform_ui(c, 16));
+
+                struct qreg z = vir_AND(c, c->cs_payload[1],
+                                        vir_uniform_ui(c, 0xffff));
+
+                /* We only support dispatch base in Vulkan */
+                if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
+                        x = vir_ADD(c, x,
+                                    vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
+                        y = vir_ADD(c, y,
+                                    vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
+                        z = vir_ADD(c, z,
+                                    vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
+                }
+
+                ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
+                ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
+                ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
                 break;
+        }
 
-        case nir_intrinsic_load_work_group_id:
+        case nir_intrinsic_load_local_invocation_index:
                 ntq_store_dest(c, &instr->dest, 0,
-                               vir_AND(c, c->cs_payload[0],
-                                       vir_uniform_ui(c, 0xffff)));
-                ntq_store_dest(c, &instr->dest, 1,
-                               vir_SHR(c, c->cs_payload[0],
-                                       vir_uniform_ui(c, 16)));
-                ntq_store_dest(c, &instr->dest, 2,
-                               vir_AND(c, c->cs_payload[1],
-                                       vir_uniform_ui(c, 0xffff)));
+                               emit_load_local_invocation_index(c));
                 break;
 
-        case nir_intrinsic_load_subgroup_id:
-                ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+        case nir_intrinsic_load_subgroup_id: {
+                /* This is basically the batch index, which is the Local
+                 * Invocation Index divided by the SIMD width.
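+                 *
+                 * For example, with V3D_CHANNELS == 16 the shift below is 4,
+                 * so local invocation indices 0..15 map to subgroup 0,
+                 * 16..31 to subgroup 1, and so on.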
+ */ + STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); + const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; + struct qreg lii = emit_load_local_invocation_index(c); + ntq_store_dest(c, &instr->dest, 0, + vir_SHR(c, lii, + vir_uniform_ui(c, divide_shift))); break; + } case nir_intrinsic_load_per_vertex_input: { /* The vertex shader writes all its used outputs into @@ -3002,11 +3122,17 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * * col: vertex index, row = varying index */ + assert(nir_src_is_const(instr->src[1])); + uint32_t location = + nir_intrinsic_io_semantics(instr).location + + nir_src_as_uint(instr->src[1]); + uint32_t component = nir_intrinsic_component(instr); + int32_t row_idx = -1; for (int i = 0; i < c->num_inputs; i++) { struct v3d_varying_slot slot = c->input_slots[i]; - if (v3d_slot_get_slot(slot) == nir_intrinsic_io_semantics(instr).location && - v3d_slot_get_component(slot) == nir_intrinsic_component(instr)) { + if (v3d_slot_get_slot(slot) == location && + v3d_slot_get_component(slot) == component) { row_idx = i; break; } @@ -3033,6 +3159,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) * VPM output header. According to docs, we should read this * using ldvpm(v,d)_in (See Table 71). */ + assert(c->s->info.stage == MESA_SHADER_GEOMETRY); ntq_store_dest(c, &instr->dest, 0, vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); break; @@ -3146,6 +3273,37 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; } + case nir_intrinsic_load_subgroup_size: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform_ui(c, V3D_CHANNELS)); + break; + + case nir_intrinsic_load_subgroup_invocation: + ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + break; + + case nir_intrinsic_elect: { + set_a_flags_for_subgroup(c); + struct qreg first = vir_FLAFIRST(c); + + /* Produce a boolean result from Flafirst */ + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + first, vir_uniform_ui(c, 1)), + V3D_QPU_PF_PUSHZ); + struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); + ntq_store_dest(c, &instr->dest, 0, result); + break; + } + + case nir_intrinsic_load_num_subgroups: + unreachable("Should have been lowered"); + break; + + case nir_intrinsic_load_view_index: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -3632,9 +3790,15 @@ nir_to_vir(struct v3d_compile *c) c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); - /* V3D 4.x can disable implicit point coordinate varyings if - * they are not used. - */ + /* V3D 4.x can disable implicit varyings if they are not used */ + c->fs_uses_primitive_id = + nir_find_variable_with_location(c->s, nir_var_shader_in, + VARYING_SLOT_PRIMITIVE_ID); + if (c->fs_uses_primitive_id && !c->fs_key->has_gs) { + c->primitive_id = + emit_fragment_varying(c, NULL, -1, 0, 0); + } + if (c->fs_key->is_points && (c->devinfo->ver < 40 || program_reads_point_coord(c))) { c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0); @@ -3668,9 +3832,9 @@ nir_to_vir(struct v3d_compile *c) /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. 
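          *
          * For example, a 10x10x1 workgroup (wg_size = 100) rounds up to the
          * next power of two, 128, so gl_LocalInvocationIndex needs 7 bits
          * (the assert below allows at most 8).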
*/ - int wg_size = (c->s->info.cs.local_size[0] * - c->s->info.cs.local_size[1] * - c->s->info.cs.local_size[2]); + int wg_size = (c->s->info.workgroup_size[0] * + c->s->info.workgroup_size[1] * + c->s->info.workgroup_size[2]); c->local_invocation_index_bits = ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1; assert(c->local_invocation_index_bits <= 8); @@ -3678,9 +3842,9 @@ nir_to_vir(struct v3d_compile *c) if (c->s->info.shared_size) { struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1], vir_uniform_ui(c, 16)); - if (c->s->info.cs.local_size[0] != 1 || - c->s->info.cs.local_size[1] != 1 || - c->s->info.cs.local_size[2] != 1) { + if (c->s->info.workgroup_size[0] != 1 || + c->s->info.workgroup_size[1] != 1 || + c->s->info.workgroup_size[2] != 1) { int wg_bits = (16 - c->local_invocation_index_bits); int wg_mask = (1 << wg_bits) - 1; @@ -3731,46 +3895,6 @@ nir_to_vir(struct v3d_compile *c) } } -const nir_shader_compiler_options v3d_nir_options = { - .lower_add_sat = true, - .lower_all_io_to_temps = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, - .lower_bitfield_reverse = true, - .lower_bit_count = true, - .lower_cs_local_id_from_index = true, - .lower_ffract = true, - .lower_fmod = true, - .lower_pack_unorm_2x16 = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_pack_snorm_4x8 = true, - .lower_unpack_unorm_4x8 = true, - .lower_unpack_snorm_4x8 = true, - .lower_pack_half_2x16 = true, - .lower_unpack_half_2x16 = true, - .lower_fdiv = true, - .lower_find_lsb = true, - .lower_ffma16 = true, - .lower_ffma32 = true, - .lower_ffma64 = true, - .lower_flrp32 = true, - .lower_fpow = true, - .lower_fsat = true, - .lower_fsqrt = true, - .lower_ifind_msb = true, - .lower_isign = true, - .lower_ldexp = true, - .lower_mul_high = true, - .lower_wpos_pntc = true, - .lower_rotate = true, - .lower_to_scalar = true, - .has_fsub = true, - .has_isub = true, -}; - /** * When demoting a shader down to single-threaded, removes the THRSW * instructions (one will still be inserted at v3d_vir_to_qpu() for the @@ -3789,9 +3913,25 @@ vir_remove_thrsw(struct v3d_compile *c) c->last_thrsw = NULL; } -void -vir_emit_last_thrsw(struct v3d_compile *c) +/** + * This makes sure we have a top-level last thread switch which signals the + * start of the last thread section, which may include adding a new thrsw + * instruction if needed. We don't allow spilling in the last thread section, so + * if we need to do any spills that inject additional thread switches later on, + * we ensure this thread switch will still be the last thread switch in the + * program, which makes last thread switch signalling a lot easier when we have + * spilling. If in the end we don't need to spill to compile the program and we + * injected a new thread switch instruction here only for that, we will + * eventually restore the previous last thread switch and remove the one we + * added here. + */ +static void +vir_emit_last_thrsw(struct v3d_compile *c, + struct qinst **restore_last_thrsw, + bool *restore_scoreboard_lock) { + *restore_last_thrsw = c->last_thrsw; + /* On V3D before 4.1, we need a TMU op to be outstanding when thread * switching, so disable threads if we didn't do any TMU ops (each of * which would have emitted a THRSW). 
@@ -3800,7 +3940,7 @@ vir_emit_last_thrsw(struct v3d_compile *c) c->threads = 1; if (c->last_thrsw) vir_remove_thrsw(c); - return; + *restore_last_thrsw = NULL; } /* If we're threaded and the last THRSW was in conditional code, then @@ -3823,8 +3963,34 @@ vir_emit_last_thrsw(struct v3d_compile *c) vir_emit_thrsw(c); } + /* If we have not inserted a last thread switch yet, do it now to ensure + * any potential spilling we do happens before this. If we don't spill + * in the end, we will restore the previous one. + */ + if (*restore_last_thrsw == c->last_thrsw) { + if (*restore_last_thrsw) + (*restore_last_thrsw)->is_last_thrsw = false; + *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw; + vir_emit_thrsw(c); + } else { + *restore_last_thrsw = c->last_thrsw; + } + + assert(c->last_thrsw); + c->last_thrsw->is_last_thrsw = true; +} + +static void +vir_restore_last_thrsw(struct v3d_compile *c, + struct qinst *thrsw, + bool scoreboard_lock) +{ + assert(c->last_thrsw); + vir_remove_instruction(c, c->last_thrsw); + c->last_thrsw = thrsw; if (c->last_thrsw) c->last_thrsw->is_last_thrsw = true; + c->lock_scoreboard_on_first_thrsw = scoreboard_lock; } /* There's a flag in the shader for "center W is needed for reasons other than @@ -3862,8 +4028,14 @@ v3d_nir_to_vir(struct v3d_compile *c) nir_to_vir(c); + bool restore_scoreboard_lock = false; + struct qinst *restore_last_thrsw; + /* Emit the last THRSW before STVPM and TLB writes. */ - vir_emit_last_thrsw(c); + vir_emit_last_thrsw(c, + &restore_last_thrsw, + &restore_scoreboard_lock); + switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: @@ -3962,6 +4134,12 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_remove_thrsw(c); } + /* If we didn't spill, then remove the last thread switch we injected + * artificially (if any) and restore the previous one. 
+ */ + if (!c->spills && c->last_thrsw != restore_last_thrsw) + vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); + if (c->spills && (V3D_DEBUG & (V3D_DEBUG_VIR | v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { diff --git a/lib/mesa/src/broadcom/compiler/qpu_schedule.c b/lib/mesa/src/broadcom/compiler/qpu_schedule.c index 8af2e8ef2..7b9891e86 100644 --- a/lib/mesa/src/broadcom/compiler/qpu_schedule.c +++ b/lib/mesa/src/broadcom/compiler/qpu_schedule.c @@ -492,7 +492,8 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; }; @@ -576,10 +577,26 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool @@ -868,9 +885,9 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.mc = inst->flags.ac; inst->flags.mpf = inst->flags.apf; inst->flags.muf = inst->flags.auf; - inst->flags.ac = V3D_QPU_PF_NONE; + inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; - inst->flags.auf = V3D_QPU_PF_NONE; + inst->flags.auf = V3D_QPU_UF_NONE; } static bool @@ -1053,12 +1070,12 @@ retry: if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) continue; - /* "A scoreboard wait must not occur in the first two - * instructions of a fragment shader. This is either the - * explicit Wait for Scoreboard signal or an implicit wait - * with the first tile-buffer read or write instruction." + /* "Before doing a TLB access a scoreboard wait must have been + * done. This happens either on the first or last thread + * switch, depending on a setting (scb_wait_on_first_thrsw) in + * the shader state." */ - if (pixel_scoreboard_too_soon(scoreboard, inst)) + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) continue; /* ldunif and ldvary both write r5, but ldunif does so a tick @@ -1131,12 +1148,10 @@ retry: continue; } - /* Don't merge in something that will lock the TLB. - * Hopwefully what we have in inst will release some - * other instructions, allowing us to delay the - * TLB-locking instruction until later. + /* Don't merge TLB instructions before we have acquired + * the scoreboard lock. 
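+                         * (scoreboard_is_locked() above only reports the lock
+                         * as held once three ticks have passed since the
+                         * relevant thread switch, so a TLB instruction can be
+                         * scheduled at the earliest three ticks after that
+                         * thrsw.)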
                          */
-                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                 continue;
 
                         /* When we successfully pair up an ldvary we then try
@@ -1273,9 +1288,6 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 
         if (inst->sig.ldvary)
                 scoreboard->last_ldvary_tick = scoreboard->tick;
-
-        if (qpu_inst_is_tlb(inst))
-                scoreboard->tlb_locked = true;
 }
 
 static void
@@ -1490,6 +1502,11 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
                         return false;
         }
 
+        if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+            !inst->sig_magic) {
+                return false;
+        }
+
         if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                 return false;
 
@@ -1747,6 +1764,8 @@ emit_thrsw(struct v3d_compile *c,
                 merge_inst = inst;
         }
 
+        scoreboard->first_thrsw_emitted = true;
+
         /* If we're emitting the last THRSW (other than program end), then
          * signal that to the HW by emitting two THRSWs in a row.
          */
@@ -1758,6 +1777,7 @@ emit_thrsw(struct v3d_compile *c,
                 struct qinst *second_inst =
                         (struct qinst *)merge_inst->link.next;
                 second_inst->qpu.sig.thrsw = true;
+                scoreboard->last_thrsw_emitted = true;
         }
 
         /* Make sure the thread end executes within the program lifespan */
@@ -1981,6 +2001,17 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
         if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                 return false;
 
+        /* The implicit ldvary destination may not be written to by a signal
+         * in the instruction following ldvary. Since we are planning to move
+         * ldvary to the previous instruction, this means we need to check if
+         * the current instruction has any other signal that could create this
+         * conflict. The only other signal that can write to the implicit
+         * ldvary destination that is compatible with ldvary in the same
+         * instruction is ldunif.
+         */
+        if (inst->sig.ldunif)
+                return false;
+
         /* The previous instruction can't write to the same destination as the
          * ldvary.
          */
diff --git a/lib/mesa/src/broadcom/compiler/v3d_compiler.h b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
index f229f414e..f728327f6 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_compiler.h
+++ b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
@@ -299,6 +299,11 @@ enum quniform_contents {
          */
         QUNIFORM_NUM_WORK_GROUPS,
 
+        /* Base workgroup offset passed to vkCmdDispatchBase in the dimension
+         * selected by the data value.
+         */
+        QUNIFORM_WORK_GROUP_BASE,
+
         /**
          * Returns the offset of the scratch buffer for register spilling.
          */
@@ -320,6 +325,11 @@ enum quniform_contents {
          * out-of-bounds accesses into the tile state during binning.
          */
         QUNIFORM_FB_LAYERS,
+
+        /**
+         * Current value of gl_ViewIndex for Multiview rendering.
+         */
+        QUNIFORM_VIEW_INDEX,
 };
 
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -416,6 +426,19 @@ struct v3d_fs_key {
         uint32_t point_sprite_mask;
 
         struct pipe_rt_blend_state blend;
+
+        /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
+         *
+         * - If there is a geometry shader, then gl_PrimitiveID must be written
+         *   by it and the fragment shader loads it as a regular explicit input
+         *   varying. This is the only valid use case in GLES 3.1.
+         *
+         * - If there is no geometry shader (allowed since GLES 3.2 and
+         *   Vulkan 1.0), then gl_PrimitiveID must be implicitly written by
+         *   hardware and is considered an implicit input varying in the
+         *   fragment shader.
+ */ + bool has_gs; }; struct v3d_gs_key { @@ -544,10 +567,10 @@ enum v3d_compilation_result { struct v3d_compiler { const struct v3d_device_info *devinfo; struct ra_regs *regs; - unsigned int reg_class_any[3]; - unsigned int reg_class_r5[3]; - unsigned int reg_class_phys[3]; - unsigned int reg_class_phys_or_acc[3]; + struct ra_class *reg_class_any[3]; + struct ra_class *reg_class_r5[3]; + struct ra_class *reg_class_phys[3]; + struct ra_class *reg_class_phys_or_acc[3]; }; /** @@ -631,6 +654,9 @@ struct v3d_compile { bool writes_z; bool uses_implicit_point_line_varyings; + /* True if a fragment shader reads gl_PrimitiveID */ + bool fs_uses_primitive_id; + /* If the fragment shader does anything that requires to force * per-sample MSAA, such as reading gl_SampleID. */ @@ -646,12 +672,14 @@ struct v3d_compile { * TMU spills. */ bool disable_tmu_pipelining; + bool pipelined_any_tmu; /* Disable sorting of UBO loads with constant offset. This may * increase the chances of being able to compile shaders with high * register pressure. */ bool disable_constant_ubo_load_sorting; + bool sorted_any_ubo_loads; /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high @@ -660,6 +688,10 @@ struct v3d_compile { */ bool disable_ldunif_opt; + /* Disables loop unrolling to reduce register pressure. */ + bool disable_loop_unrolling; + bool unrolled_any_loops; + /* Minimum number of threads we are willing to use to register allocate * a shader with the current compilation strategy. This only prevents * us from lowering the thread count to register allocate successfully, @@ -668,6 +700,13 @@ struct v3d_compile { */ uint32_t min_threads_for_reg_alloc; + /* Whether TMU spills are allowed. If this is disabled it may cause + * register allocation to fail. We set this to favor other compilation + * strategies that can reduce register pressure and hopefully reduce or + * eliminate TMU spills in the shader. + */ + bool tmu_spilling_allowed; + /* The UBO index and block used with the last unifa load, as well as the * current unifa offset *after* emitting that load. This is used to skip * unifa writes (and their 3 delay slot) when the next UBO load reads @@ -683,7 +722,7 @@ struct v3d_compile { struct qreg execute; bool in_control_flow; - struct qreg line_x, point_x, point_y; + struct qreg line_x, point_x, point_y, primitive_id; /** * Instance ID, which comes in before the vertex attribute payload if @@ -710,6 +749,9 @@ struct v3d_compile { struct qreg cs_shared_offset; int local_invocation_index_bits; + /* If the shader uses subgroup functionality */ + bool has_subgroups; + uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; uint32_t vpm_output_size; @@ -833,6 +875,8 @@ struct v3d_prog_data { bool single_seg; bool tmu_dirty_rcl; + + bool has_control_barrier; }; struct v3d_vs_prog_data { @@ -895,11 +939,16 @@ struct v3d_gs_prog_data { /* Number of GS invocations */ uint8_t num_invocations; + + bool writes_psiz; }; struct v3d_fs_prog_data { struct v3d_prog_data base; + /* Whether the program reads gl_PrimitiveID */ + bool uses_pid; + struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; /* Array of flat shade flags. @@ -927,19 +976,38 @@ struct v3d_compute_prog_data { /* Size in bytes of the workgroup's shared space. 
*/ uint32_t shared_size; uint16_t local_size[3]; + /* If the shader uses subgroup functionality */ + bool has_subgroups; +}; + +struct vpm_config { + uint32_t As; + uint32_t Vc; + uint32_t Gs; + uint32_t Gd; + uint32_t Gv; + uint32_t Ve; + uint32_t gs_width; }; +bool +v3d_compute_vpm_config(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs_bin, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs_bin, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_bin, + struct vpm_config *vpm_cfg); + static inline bool vir_has_uniform(struct qinst *inst) { return inst->uniform != ~0; } -extern const nir_shader_compiler_options v3d_nir_options; - const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); void v3d_compiler_free(const struct v3d_compiler *compiler); -void v3d_optimize_nir(struct nir_shader *s); +void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, @@ -981,6 +1049,7 @@ struct v3d_qpu_instr v3d_qpu_nop(void); struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst); struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst); void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond); +enum v3d_qpu_cond vir_get_cond(struct qinst *inst); void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf); void vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf); void vir_set_unpack(struct qinst *inst, int src, @@ -988,7 +1057,6 @@ void vir_set_unpack(struct qinst *inst, int src, void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack); struct qreg vir_get_temp(struct v3d_compile *c); -void vir_emit_last_thrsw(struct v3d_compile *c); void vir_calculate_live_intervals(struct v3d_compile *c); int vir_get_nsrc(struct qinst *inst); bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); @@ -1216,6 +1284,8 @@ VIR_A_ALU1(NEG) VIR_A_ALU1(FLAPUSH) VIR_A_ALU1(FLBPUSH) VIR_A_ALU1(FLPOP) +VIR_A_ALU0(FLAFIRST) +VIR_A_ALU0(FLNAFIRST) VIR_A_ALU1(SETMSF) VIR_A_ALU1(SETREVF) VIR_A_ALU0(TIDX) @@ -1345,30 +1415,6 @@ vir_TLB_COLOR_READ(struct v3d_compile *c) return vir_emit_def(c, ldtlb); } -/* -static inline struct qreg -vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, - vir_reg(QFILE_LOAD_IMM, val), c->undef)); -} - -static inline struct qreg -vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -static inline struct qreg -vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -*/ - static inline struct qinst * vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c index 655f74fd4..895b1a391 100644 --- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c @@ -24,6 +24,8 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" +#include "util/u_helpers.h" + /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. 
@@ -325,6 +327,59 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
         nir_intrinsic_set_component(instr, (comp + 2) % 4);
 }
 
+/* Sometimes the origin of gl_PointCoord is in the upper left rather than the
+ * lower left, so we need to flip it.
+ *
+ * This is needed for Vulkan; Gallium uses lower_wpos_pntc instead.
+ */
+static void
+v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b,
+                             nir_intrinsic_instr *intr)
+{
+        assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+
+        /* Gallium uses lower_wpos_pntc */
+        if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
+                return;
+
+        b->cursor = nir_after_instr(&intr->instr);
+
+        int comp = nir_intrinsic_component(intr);
+
+        nir_variable *input_var =
+                nir_find_variable_with_driver_location(c->s,
+                                                       nir_var_shader_in,
+                                                       nir_intrinsic_base(intr));
+
+        if (input_var && util_varying_is_point_coord(input_var->data.location,
+                                                     c->fs_key->point_sprite_mask)) {
+                assert(intr->num_components == 1);
+
+                nir_ssa_def *result = &intr->dest.ssa;
+
+                switch (comp) {
+                case 0:
+                case 1:
+                        if (!c->fs_key->is_points)
+                                result = nir_imm_float(b, 0.0);
+                        break;
+                case 2:
+                        result = nir_imm_float(b, 0.0);
+                        break;
+                case 3:
+                        result = nir_imm_float(b, 1.0);
+                        break;
+                }
+                if (c->fs_key->point_coord_upper_left && comp == 1)
+                        result = nir_fsub(b, nir_imm_float(b, 1.0), result);
+                if (result != &intr->dest.ssa) {
+                        nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
+                                                       result,
+                                                       result->parent_instr);
+                }
+        }
+}
+
 static void
 v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                        struct nir_instr *instr,
@@ -338,6 +393,8 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
         case nir_intrinsic_load_input:
                 if (c->s->info.stage == MESA_SHADER_VERTEX)
                         v3d_nir_lower_vertex_input(c, b, intr);
+                else if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+                        v3d_nir_lower_fragment_input(c, b, intr);
                 break;
 
         case nir_intrinsic_load_uniform:
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
index 2cd613b26..11782c734 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
@@ -202,12 +202,23 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt)
 }
 
 static nir_ssa_def *
-v3d_nir_get_tlb_color(nir_builder *b, int rt, int sample)
+v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample)
 {
-        nir_ssa_def *color[4];
-        for (int i = 0; i < 4; i++)
-                color[i] = nir_load_tlb_color_v3d(b, 1, 32, nir_imm_int(b, rt),
-                                                  .base = sample, .component = i);
+        uint32_t num_components =
+                util_format_get_nr_components(c->fs_key->color_fmt[rt].format);
+        nir_ssa_def *color[4];
+        for (int i = 0; i < 4; i++) {
+                if (i < num_components) {
+                        color[i] =
+                                nir_load_tlb_color_v3d(b, 1, 32, nir_imm_int(b, rt),
+                                                       .base = sample,
+                                                       .component = i);
+                } else {
+                        /* These will be DCEd */
+                        color[i] = nir_imm_int(b, 0);
+                }
+        }
         return nir_vec4(b, color[0], color[1], color[2], color[3]);
 }
 
@@ -224,6 +235,22 @@ v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b,
                 nir_ssa_def *dst = v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]);
 
                 op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst);
+
+                /* In Vulkan we configure our integer RTs to clamp, so we need
+                 * to ignore result bits that don't fit in the destination RT
+                 * component size.
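+                 *
+                 * For example, with an 8-bit component the mask below is
+                 * (1u << 8) - 1 = 0xff, so a logic op result of 0x1ff is
+                 * stored as 0xff instead of wrapping around.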
+ */ + if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { + uint32_t bits = + util_format_get_component_bits( + c->fs_key->color_fmt[rt].format, + UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits > 0 && bits < 32) { + nir_ssa_def *mask = + nir_imm_int(b, (1u << bits) - 1); + op_res[i] = nir_iand(b, op_res[i], mask); + } + } } nir_ssa_def *r[4]; @@ -257,7 +284,7 @@ static nir_ssa_def * v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, nir_ssa_def *src, int rt, int sample) { - nir_ssa_def *dst = v3d_nir_get_tlb_color(b, rt, sample); + nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample); nir_ssa_def *src_chans[4], *dst_chans[4]; for (unsigned i = 0; i < 4; i++) { diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c index e6a226b03..40f1cc23b 100644 --- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c @@ -56,7 +56,7 @@ lower_load(struct v3d_compile *c, nir_builder *b, nir_intrinsic_instr *instr) { - uint32_t index = nir_src_as_uint(instr->src[0]); + uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); nir_intrinsic_op op; if (instr->intrinsic == nir_intrinsic_load_ubo) { @@ -75,7 +75,7 @@ lower_store(struct v3d_compile *c, nir_builder *b, nir_intrinsic_instr *instr) { - uint32_t index = nir_src_as_uint(instr->src[1]); + uint32_t index = nir_src_comp_as_uint(instr->src[1], 0); rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size); } @@ -84,7 +84,7 @@ lower_atomic(struct v3d_compile *c, nir_builder *b, nir_intrinsic_instr *instr) { - uint32_t index = nir_src_as_uint(instr->src[0]); + uint32_t index = nir_src_comp_as_uint(instr->src[0], 0); rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size); } diff --git a/lib/mesa/src/broadcom/compiler/vir.c b/lib/mesa/src/broadcom/compiler/vir.c index 335a5a8e3..bf75a4da1 100644 --- a/lib/mesa/src/broadcom/compiler/vir.c +++ b/lib/mesa/src/broadcom/compiler/vir.c @@ -25,6 +25,7 @@ #include "v3d_compiler.h" #include "util/u_prim.h" #include "compiler/nir/nir_schedule.h" +#include "compiler/nir/nir_builder.h" int vir_get_nsrc(struct qinst *inst) @@ -242,6 +243,19 @@ vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond) } } +enum v3d_qpu_cond +vir_get_cond(struct qinst *inst) +{ + assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + + if (vir_is_add(inst)) + return inst->qpu.flags.ac; + else if (vir_is_mul(inst)) + return inst->qpu.flags.mc; + else /* NOP */ + return V3D_QPU_COND_NONE; +} + void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf) { @@ -525,7 +539,10 @@ vir_compile_init(const struct v3d_compiler *compiler, void *debug_output_data), void *debug_output_data, int program_id, int variant_id, + uint32_t max_threads, uint32_t min_threads_for_reg_alloc, + bool tmu_spilling_allowed, + bool disable_loop_unrolling, bool disable_constant_ubo_load_sorting, bool disable_tmu_pipelining, bool fallback_scheduler) @@ -537,14 +554,17 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; - c->threads = 4; + c->threads = max_threads; c->debug_output = debug_output; c->debug_output_data = debug_output_data; c->compilation_result = V3D_COMPILATION_SUCCEEDED; c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; + c->tmu_spilling_allowed = tmu_spilling_allowed; c->fallback_scheduler = fallback_scheduler; c->disable_tmu_pipelining = 
disable_tmu_pipelining; c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; + c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL + ? true : disable_loop_unrolling; s = nir_shader_clone(c, s); c->s = s; @@ -754,6 +774,9 @@ v3d_gs_set_prog_data(struct v3d_compile *c, prog_data->out_prim_type = c->s->info.gs.output_primitive; prog_data->num_invocations = c->s->info.gs.invocations; + + prog_data->writes_psiz = + c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); } static void @@ -791,6 +814,7 @@ v3d_fs_set_prog_data(struct v3d_compile *c, prog_data->lock_scoreboard_on_first_thrsw = c->lock_scoreboard_on_first_thrsw; prog_data->force_per_sample_msaa = c->force_per_sample_msaa; + prog_data->uses_pid = c->fs_uses_primitive_id; } static void @@ -799,9 +823,11 @@ v3d_cs_set_prog_data(struct v3d_compile *c, { prog_data->shared_size = c->s->info.shared_size; - prog_data->local_size[0] = c->s->info.cs.local_size[0]; - prog_data->local_size[1] = c->s->info.cs.local_size[1]; - prog_data->local_size[2] = c->s->info.cs.local_size[2]; + prog_data->local_size[0] = c->s->info.workgroup_size[0]; + prog_data->local_size[1] = c->s->info.workgroup_size[1]; + prog_data->local_size[2] = c->s->info.workgroup_size[2]; + + prog_data->has_subgroups = c->has_subgroups; } static void @@ -812,6 +838,7 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; + prog_data->has_control_barrier = c->s->info.uses_control_barrier; v3d_set_prog_data_uniforms(c, prog_data); @@ -866,7 +893,7 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) NIR_PASS_V(c->s, nir_remove_unused_io_vars, nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ NIR_PASS_V(c->s, nir_lower_global_vars_to_local); - v3d_optimize_nir(c->s); + v3d_optimize_nir(c, c->s); NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ @@ -900,7 +927,7 @@ v3d_nir_lower_gs_early(struct v3d_compile *c) NIR_PASS_V(c->s, nir_remove_unused_io_vars, nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ NIR_PASS_V(c->s, nir_lower_global_vars_to_local); - v3d_optimize_nir(c->s); + v3d_optimize_nir(c, c->s); NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); /* This must go before nir_lower_io */ @@ -958,14 +985,6 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } - - /* If the shader has no non-TLB side effects, we can promote it to - * enabling early_fragment_tests even if the user didn't. 
- */ - if (!(c->s->info.num_images || - c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } } static void @@ -1330,11 +1349,10 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, static bool v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) { - bool progress = false; nir_foreach_function(function, s) { if (function->impl) { nir_foreach_block(block, function->impl) { - progress |= + c->sorted_any_ubo_loads |= v3d_nir_sort_constant_ubo_loads_block(c, block); } nir_metadata_preserve(function->impl, @@ -1342,6 +1360,77 @@ v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) nir_metadata_dominance); } } + return c->sorted_any_ubo_loads; +} + +static void +lower_load_num_subgroups(struct v3d_compile *c, + nir_builder *b, + nir_intrinsic_instr *intr) +{ + assert(c->s->info.stage == MESA_SHADER_COMPUTE); + assert(intr->intrinsic == nir_intrinsic_load_num_subgroups); + + b->cursor = nir_after_instr(&intr->instr); + uint32_t num_subgroups = + DIV_ROUND_UP(c->s->info.workgroup_size[0] * + c->s->info.workgroup_size[1] * + c->s->info.workgroup_size[2], V3D_CHANNELS); + nir_ssa_def *result = nir_imm_int(b, num_subgroups); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_instr_remove(&intr->instr); +} + +static bool +lower_subgroup_intrinsics(struct v3d_compile *c, + nir_block *block, nir_builder *b) +{ + bool progress = false; + nir_foreach_instr_safe(inst, block) { + if (inst->type != nir_instr_type_intrinsic) + continue;; + + nir_intrinsic_instr *intr = + nir_instr_as_intrinsic(inst); + if (!intr) + continue; + + switch (intr->intrinsic) { + case nir_intrinsic_load_num_subgroups: + lower_load_num_subgroups(c, b, intr); + progress = true; + FALLTHROUGH; + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_subgroup_size: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_elect: + c->has_subgroups = true; + break; + default: + break; + } + } + + return progress; +} + +static bool +v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) +{ + bool progress = false; + nir_foreach_function(function, s) { + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) + progress |= lower_subgroup_intrinsics(c, block, &b); + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } return progress; } @@ -1405,18 +1494,21 @@ v3d_attempt_compile(struct v3d_compile *c) if (c->key->robust_buffer_access) { /* v3d_nir_lower_robust_buffer_access assumes constant buffer - * indices on ubo/ssbo intrinsics so run a copy propagation pass - * before we run the lowering to warrant this. We also want to run - * the lowering before v3d_optimize to clean-up redundant - * get_buffer_size calls produced in the pass. + * indices on ubo/ssbo intrinsics so run copy propagation and + * constant folding passes before we run the lowering to warrant + * this. We also want to run the lowering before v3d_optimize to + * clean-up redundant get_buffer_size calls produced in the pass. 
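A worked example of the constant folding performed by lower_load_num_subgroups above; the workgroup dimensions here are hypothetical, and V3D_CHANNELS is the 16-lane QPU width used throughout these files:

/* For a hypothetical 8x8x1 compute workgroup:
 *   invocations   = 8 * 8 * 1 = 64
 *   num_subgroups = DIV_ROUND_UP(64, V3D_CHANNELS)
 *                 = DIV_ROUND_UP(64, 16) = 4
 * Every load_num_subgroups intrinsic is rewritten to the immediate 4
 * and removed, so no per-invocation computation survives to codegen.
 */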
*/ NIR_PASS_V(c->s, nir_copy_prop); + NIR_PASS_V(c->s, nir_opt_constant_folding); NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c); } NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s); - v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c); + + v3d_optimize_nir(c, c->s); /* Do late algebraic optimization to turn add(a, neg(b)) back into * subs, then the mandatory cleanup after algebraic. Note that it may @@ -1505,6 +1597,83 @@ int v3d_shaderdb_dump(struct v3d_compile *c, c->nop_count); } +/* This is a list of incremental changes to the compilation strategy + * that will be used to try to compile the shader successfully. The + * default strategy is to enable all optimizations which will have + * the highest register pressure but is expected to produce most + * optimal code. Following strategies incrementally disable specific + * optimizations that are known to contribute to register pressure + * in order to be able to compile the shader successfully while meeting + * thread count requirements. + * + * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also + * cover previous hardware as well (meaning that we are not limiting + * register allocation to any particular thread count). This is fine + * because v3d_nir_to_vir will cap this to the actual minimum. + */ +struct v3d_compiler_strategy { + const char *name; + uint32_t max_threads; + uint32_t min_threads; + bool disable_loop_unrolling; + bool disable_ubo_load_sorting; + bool disable_tmu_pipelining; + bool tmu_spilling_allowed; +} static const strategies[] = { + /*0*/ { "default", 4, 4, false, false, false, false }, + /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, + /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, + /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, + /*4*/ { "lower thread count", 2, 1, false, false, false, false }, + /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, + /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, + /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, + /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } +}; + +/** + * If a particular optimization didn't make any progress during a compile + * attempt disabling it alone won't allow us to compile the shader successfuly, + * since we'll end up with the same code. Detect these scenarios so we can + * avoid wasting time with useless compiles. We should also consider if the + * strategy changes other aspects of the compilation process though, like + * spilling, and not skip it in that case. + */ +static bool +skip_compile_strategy(struct v3d_compile *c, uint32_t idx) +{ + /* We decide if we can skip a strategy based on the optimizations that + * were active in the previous strategy, so we should only be calling this + * for strategies after the first. 
+ */ + assert(idx > 0); + + /* Don't skip a strategy that changes spilling behavior */ + if (strategies[idx].tmu_spilling_allowed != + strategies[idx - 1].tmu_spilling_allowed) { + return false; + } + + switch (idx) { + /* Loop unrolling: skip if we didn't unroll any loops */ + case 1: + case 5: + return !c->unrolled_any_loops; + /* UBO load sorting: skip if we didn't sort any loads */ + case 2: + case 6: + return !c->sorted_any_ubo_loads; + /* TMU pipelining: skip if we didn't pipeline any TMU ops */ + case 3: + case 7: + return !c->pipelined_any_tmu; + /* Lower thread count: skip if we already tried less that 4 threads */ + case 4: + return c->threads < 4; + default: + return false; + }; +} uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -1515,40 +1684,41 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, int program_id, int variant_id, uint32_t *final_assembly_size) { - struct v3d_compile *c; + struct v3d_compile *c = NULL; + for (int i = 0; i < ARRAY_SIZE(strategies); i++) { + /* Fallback strategy */ + if (i > 0) { + assert(c); + if (skip_compile_strategy(c, i)) + continue; - /* This is a list of incremental changes to the compilation strategy - * that will be used to try to compile the shader successfully. The - * default strategy is to enable all optimizations which will have - * the highest register pressure but is expected to produce most - * optimal code. Following strategies incrementally disable specific - * optimizations that are known to contribute to register pressure - * in order to be able to compile the shader successfully while meeting - * thread count requirements. - * - * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also - * cover previous hardware as well (meaning that we are not limiting - * register allocation to any particular thread count). This is fine - * because v3d_nir_to_vir will cap this to the actual minimum. 
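A sketch of how skip_compile_strategy above plays out, for a hypothetical shader that keeps failing register allocation and whose first compile neither unrolled loops, sorted UBO loads, nor pipelined TMU operations:

/*
 *  0 "default"                        -> compiles, fails RA at 4 threads
 *  1 "disable loop unrolling"         -> skipped: !c->unrolled_any_loops
 *  2 "disable UBO load sorting"       -> skipped: !c->sorted_any_ubo_loads
 *  3 "disable TMU pipelining"         -> skipped: !c->pipelined_any_tmu
 *  4 "lower thread count"             -> retried at 2 threads, fails RA
 *  5 "disable loop unrolling (ltc)"   -> skipped, as strategy 1
 *  6 "disable UBO load sorting (ltc)" -> skipped, as strategy 2
 *  7 "disable TMU pipelining (ltc)"   -> runs anyway: it also enables TMU
 *                                        spilling, and strategies that
 *                                        change spilling are never skipped
 *  8 "fallback scheduler"             -> last resort
 */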
- */ - struct v3d_compiler_strategy { - const char *name; - uint32_t min_threads_for_reg_alloc; - } static const strategies[] = { - { "default", 4 }, - { "disable UBO load sorting", 1 }, - { "disable TMU pipelining", 1 }, - { "fallback scheduler", 1 } - }; + char *debug_msg; + int ret = asprintf(&debug_msg, + "Falling back to strategy '%s' for %s", + strategies[i].name, + vir_get_stage_name(c)); + + if (ret >= 0) { + if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + fprintf(stderr, "%s\n", debug_msg); + + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + + vir_compile_destroy(c); + } - for (int i = 0; i < ARRAY_SIZE(strategies); i++) { c = vir_compile_init(compiler, key, s, debug_output, debug_output_data, program_id, variant_id, - strategies[i].min_threads_for_reg_alloc, - i > 0, /* Disable UBO load sorting */ - i > 1, /* Disable TMU pipelining */ - i > 2 /* Fallback_scheduler */); + strategies[i].max_threads, + strategies[i].min_threads, + strategies[i].tmu_spilling_allowed, + strategies[i].disable_loop_unrolling, + strategies[i].disable_ubo_load_sorting, + strategies[i].disable_tmu_pipelining, + i == ARRAY_SIZE(strategies) - 1); v3d_attempt_compile(c); @@ -1557,23 +1727,6 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { break; } - - /* Fallback strategy */ - char *debug_msg; - int ret = asprintf(&debug_msg, - "Falling back to strategy '%s' for %s", - strategies[i + 1].name, - vir_get_stage_name(c)); - - if (ret >= 0) { - if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) - fprintf(stderr, "%s\n", debug_msg); - - c->debug_output(debug_msg, c->debug_output_data); - free(debug_msg); - } - - vir_compile_destroy(c); } if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && @@ -1717,6 +1870,24 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) { uint32_t count = 20; struct qinst *prev_inst = NULL; + assert(c->cur_block); + +#ifdef DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. + */ + bool found = false; + vir_for_each_inst(inst, c->cur_block) { + if (&inst->link == c->cursor.link) { + found = true; + break; + } + } + + assert(found || &c->cur_block->instructions == c->cursor.link); +#endif + list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, &c->cur_block->instructions, link) { if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) && @@ -1817,3 +1988,174 @@ vir_get_stage_name(struct v3d_compile *c) else return gl_shader_stage_name(c->s->info.stage); } + +static inline uint32_t +compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo) +{ + assert(devinfo->vpm_size > 0); + const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; + return devinfo->vpm_size / sector_size; +} + +/* Computes various parameters affecting VPM memory configuration for programs + * involving geometry shaders to ensure the program fits in memory and honors + * requirements described in section "VPM usage" of the programming manual. + */ +static bool +compute_vpm_config_gs(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_out) +{ + const uint32_t A = vs->separate_segments ? 
1 : 0; + const uint32_t Ad = vs->vpm_input_size; + const uint32_t Vd = vs->vpm_output_size; + + const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo); + + /* Try to fit program into our VPM memory budget by adjusting + * configurable parameters iteratively. We do this in two phases: + * the first phase tries to fit the program into the total available + * VPM memory. If we succeed at that, then the second phase attempts + * to fit the program into half of that budget so we can run bin and + * render programs in parallel. + */ + struct vpm_config vpm_cfg[2]; + struct vpm_config *final_vpm_cfg = NULL; + uint32_t phase = 0; + + vpm_cfg[phase].As = 1; + vpm_cfg[phase].Gs = 1; + vpm_cfg[phase].Gd = gs->vpm_output_size; + vpm_cfg[phase].gs_width = gs->simd_width; + + /* While there is a requirement that Vc >= [Vn / 16], this is + * always the case when tessellation is not present because in that + * case Vn can only be 6 at most (when input primitive is triangles + * with adjacency). + * + * We always choose Vc=2. We can't go lower than this due to GFXH-1744, + * and Broadcom has not found it worth it to increase it beyond this + * in general. Increasing Vc also increases VPM memory pressure which + * can turn up being detrimental for performance in some scenarios. + */ + vpm_cfg[phase].Vc = 2; + + /* Gv is a constraint on the hardware to not exceed the + * specified number of vertex segments per GS batch. If adding a + * new primitive to a GS batch would result in a range of more + * than Gv vertex segments being referenced by the batch, then + * the hardware will flush the batch and start a new one. This + * means that we can choose any value we want, we just need to + * be aware that larger values improve GS batch utilization + * at the expense of more VPM memory pressure (which can affect + * other performance aspects, such as GS dispatch width). + * We start with the largest value, and will reduce it if we + * find that total memory pressure is too high. + */ + vpm_cfg[phase].Gv = 3; + do { + /* When GS is present in absence of TES, then we need to satisfy + * that Ve >= Gv. We go with the smallest value of Ve to avoid + * increasing memory pressure. + */ + vpm_cfg[phase].Ve = vpm_cfg[phase].Gv; + + uint32_t vpm_sectors = + A * vpm_cfg[phase].As * Ad + + (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd + + vpm_cfg[phase].Gs * vpm_cfg[phase].Gd; + + /* Ideally we want to use no more than half of the available + * memory so we can execute a bin and render program in parallel + * without stalls. If we achieved that then we are done. + */ + if (vpm_sectors <= vpm_size / 2) { + final_vpm_cfg = &vpm_cfg[phase]; + break; + } + + /* At the very least, we should not allocate more than the + * total available VPM memory. If we have a configuration that + * succeeds at this we save it and continue to see if we can + * meet the half-memory-use criteria too. + */ + if (phase == 0 && vpm_sectors <= vpm_size) { + vpm_cfg[1] = vpm_cfg[0]; + phase = 1; + } + + /* Try lowering Gv */ + if (vpm_cfg[phase].Gv > 0) { + vpm_cfg[phase].Gv--; + continue; + } + + /* Try lowering GS dispatch width */ + if (vpm_cfg[phase].gs_width > 1) { + do { + vpm_cfg[phase].gs_width >>= 1; + vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2; + } while (vpm_cfg[phase].gs_width == 2); + + /* Reset Gv to max after dropping dispatch width */ + vpm_cfg[phase].Gv = 3; + continue; + } + + /* We ran out of options to reduce memory pressure. If we + * are at phase 1 we have at least a valid configuration, so we + * we use that. 
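A worked instance of the vpm_sectors bookkeeping in the loop above, assuming a hypothetical 16 KiB VPM and made-up shader sizes for Ad, Vd and Gd:

/* sector size = V3D_CHANNELS * sizeof(uint32_t) * 8
 *             = 16 * 4 * 8 = 512 bytes
 * vpm_size    = 16384 / 512 = 32 sectors
 *
 * With A = 1, Ad = 2, Vd = 4, As = Gs = 1, Gd = 8, Vc = 2, Gv = Ve = 3:
 *   vpm_sectors = 1*1*2 + (2 + 3)*4 + 1*8 = 30
 *
 * 30 <= 32, so phase 0 saves this configuration and moves on to phase 1,
 * which tries to fit under vpm_size / 2 = 16 sectors by first lowering
 * Gv (and Ve with it) and then halving the GS dispatch width.
 */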
+ */ + if (phase == 1) + final_vpm_cfg = &vpm_cfg[0]; + break; + } while (true); + + if (!final_vpm_cfg) + return false; + + assert(final_vpm_cfg); + assert(final_vpm_cfg->Gd <= 16); + assert(final_vpm_cfg->Gv < 4); + assert(final_vpm_cfg->Ve < 4); + assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); + assert(final_vpm_cfg->gs_width == 1 || + final_vpm_cfg->gs_width == 4 || + final_vpm_cfg->gs_width == 8 || + final_vpm_cfg->gs_width == 16); + + *vpm_cfg_out = *final_vpm_cfg; + return true; +} + +bool +v3d_compute_vpm_config(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs_bin, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs_bin, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_bin, + struct vpm_config *vpm_cfg) +{ + assert(vs && vs_bin); + assert((gs != NULL) == (gs_bin != NULL)); + + if (!gs) { + vpm_cfg_bin->As = 1; + vpm_cfg_bin->Ve = 0; + vpm_cfg_bin->Vc = vs_bin->vcm_cache_size; + + vpm_cfg->As = 1; + vpm_cfg->Ve = 0; + vpm_cfg->Vc = vs->vcm_cache_size; + } else { + if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin)) + return false; + + if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg)) + return false; + } + + return true; +} diff --git a/lib/mesa/src/broadcom/compiler/vir_live_variables.c b/lib/mesa/src/broadcom/compiler/vir_live_variables.c index 48d0201dc..2fd6430a0 100644 --- a/lib/mesa/src/broadcom/compiler/vir_live_variables.c +++ b/lib/mesa/src/broadcom/compiler/vir_live_variables.c @@ -28,9 +28,12 @@ #include "util/register_allocate.h" #include "v3d_compiler.h" +/* Keeps track of conditional / partial writes in a block */ struct partial_update_state { - struct qinst *insts[4]; - uint8_t channels; + /* Instruction doing a conditional or partial write */ + struct qinst *inst; + /* Instruction that set the flags for the conditional write */ + struct qinst *flags_inst; }; static int @@ -44,7 +47,8 @@ vir_reg_to_var(struct qreg reg) static void vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip, - struct qreg src) + struct partial_update_state *partial_update_ht, struct qinst *inst, + struct qreg src, struct qinst *flags_inst) { int var = vir_reg_to_var(src); if (var == -1) @@ -57,39 +61,39 @@ vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip, * use of a variable without having completely * defined that variable within the block. */ - if (!BITSET_TEST(block->def, var)) - BITSET_SET(block->use, var); -} - -static struct partial_update_state * -get_partial_update_state(struct hash_table *partial_update_ht, - struct qinst *inst) -{ - struct hash_entry *entry = - _mesa_hash_table_search(partial_update_ht, - &inst->dst.index); - if (entry) - return entry->data; - - struct partial_update_state *state = - rzalloc(partial_update_ht, struct partial_update_state); - - _mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state); + if (!BITSET_TEST(block->def, var)) { + /* If this use of var is conditional and the condition + * and flags match those of a previous instruction + * in the same block partially defining var then we + * consider var completely defined within the block. 
+ */ + if (BITSET_TEST(block->defout, var)) { + struct partial_update_state *state = + &partial_update_ht[var]; + if (state->inst) { + if (vir_get_cond(inst) == vir_get_cond(state->inst) && + flags_inst == state->flags_inst) { + return; + } + } + } - return state; + BITSET_SET(block->use, var); + } } +/* The def[] bitset marks when an initialization in a + * block completely screens off previous updates of + * that variable. + */ static void vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip, - struct hash_table *partial_update_ht, struct qinst *inst) + struct partial_update_state *partial_update, struct qinst *inst, + struct qinst *flags_inst) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) return; - /* The def[] bitset marks when an initialization in a - * block completely screens off previous updates of - * that variable. - */ int var = vir_reg_to_var(inst->dst); if (var == -1) return; @@ -115,62 +119,22 @@ vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip, return; } - /* Finally, look at the condition code and packing and mark it as a - * def. We need to make sure that we understand sequences - * instructions like: - * - * mov.zs t0, t1 - * mov.zc t0, t2 + /* Keep track of conditional writes. * - * or: + * Notice that the dst's live range for a conditional or partial writes + * will get extended up the control flow to the top of the program until + * we find a full write, making register allocation more difficult, so + * we should try our best to keep track of these and figure out if a + * combination of them actually writes the entire register so we can + * stop that process early and reduce liveness. * - * mmov t0.8a, t1 - * mmov t0.8b, t2 - * mmov t0.8c, t3 - * mmov t0.8d, t4 - * - * as defining the temp within the block, because otherwise dst's live - * range will get extended up the control flow to the top of the - * program. + * FIXME: Track partial updates via pack/unpack. 
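An illustration, in VIR-style pseudocode rather than real compiler output, of the case the conditional-write tracking above is targeting:

/*   nop.pushz  t1           ; sets flags           -> flags_inst
 *   mov.ifa    t0, t2       ; partial def of t0    -> state->inst
 *   add.ifa    t3, t0, t4   ; same cond and same flags_inst, so t0
 *                           ; counts as defined within the block and
 *                           ; is not added to block->use
 *
 * Any instruction that rewrites the flags in between resets flags_inst,
 * so the match fails and the use falls back to the conservative path.
 */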
*/ - struct partial_update_state *state = - get_partial_update_state(partial_update_ht, inst); - uint8_t mask = 0xf; /* XXX vir_channels_written(inst); */ - - if (inst->qpu.flags.ac == V3D_QPU_COND_NONE && - inst->qpu.flags.mc == V3D_QPU_COND_NONE) { - state->channels |= mask; - } else { - for (int i = 0; i < 4; i++) { - if (!(mask & (1 << i))) - continue; - - /* XXXif (state->insts[i] && - state->insts[i]->cond == - qpu_cond_complement(inst->cond)) - state->channels |= 1 << i; - else - */ - state->insts[i] = inst; - } - } - - if (state->channels == 0xf) - BITSET_SET(block->def, var); -} - -static void -sf_state_clear(struct hash_table *partial_update_ht) -{ - hash_table_foreach(partial_update_ht, entry) { - struct partial_update_state *state = entry->data; - - for (int i = 0; i < 4; i++) { - if (state->insts[i] && - (state->insts[i]->qpu.flags.ac != V3D_QPU_COND_NONE || - state->insts[i]->qpu.flags.mc != V3D_QPU_COND_NONE)) - state->insts[i] = NULL; - } + struct partial_update_state *state = &partial_update[var]; + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || + inst->qpu.flags.mc != V3D_QPU_COND_NONE) { + state->inst = inst; + state->flags_inst = flags_inst; } } @@ -184,23 +148,36 @@ sf_state_clear(struct hash_table *partial_update_ht) static void vir_setup_def_use(struct v3d_compile *c) { - struct hash_table *partial_update_ht = - _mesa_hash_table_create(c, _mesa_hash_int, _mesa_key_int_equal); + struct partial_update_state *partial_update = + rzalloc_array(c, struct partial_update_state, c->num_temps); int ip = 0; vir_for_each_block(block, c) { block->start_ip = ip; - _mesa_hash_table_clear(partial_update_ht, NULL); + memset(partial_update, 0, + sizeof(struct partial_update_state) * c->num_temps); + + struct qinst *flags_inst = NULL; vir_for_each_inst(inst, block) { - for (int i = 0; i < vir_get_nsrc(inst); i++) - vir_setup_use(c, block, ip, inst->src[i]); + for (int i = 0; i < vir_get_nsrc(inst); i++) { + vir_setup_use(c, block, ip, partial_update, + inst, inst->src[i], flags_inst); + } - vir_setup_def(c, block, ip, partial_update_ht, inst); + vir_setup_def(c, block, ip, partial_update, + inst, flags_inst); - if (false /* XXX inst->uf */) - sf_state_clear(partial_update_ht); + if (inst->qpu.flags.apf != V3D_QPU_PF_NONE || + inst->qpu.flags.mpf != V3D_QPU_PF_NONE) { + flags_inst = inst; + } + + if (inst->qpu.flags.auf != V3D_QPU_UF_NONE || + inst->qpu.flags.muf != V3D_QPU_UF_NONE) { + flags_inst = NULL; + } /* Payload registers: r0/1/2 contain W, centroid W, * and Z at program start. 
Register allocation will @@ -221,7 +198,7 @@ vir_setup_def_use(struct v3d_compile *c) block->end_ip = ip; } - _mesa_hash_table_destroy(partial_update_ht, NULL); + ralloc_free(partial_update); } static bool diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c index 55469402e..64c762c88 100644 --- a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c +++ b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c @@ -149,25 +149,30 @@ check_first_ldunifa(struct v3d_compile *c, } static bool -increment_unifa_address(struct v3d_compile *c, struct qinst *unifa) +increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa) { + struct qblock *current_block = c->cur_block; if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) { c->cursor = vir_after_inst(unifa); + c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u)); vir_remove_instruction(c, unifa); + c->cur_block = current_block; return true; } if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU && unifa->qpu.alu.add.op == V3D_QPU_A_ADD) { c->cursor = vir_after_inst(unifa); + c->cur_block = block; struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); struct qreg tmp = vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u)); vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp); vir_remove_instruction(c, unifa); + c->cur_block = current_block; return true; } @@ -271,7 +276,7 @@ vir_opt_dead_code(struct v3d_compile *c) */ if (is_first_ldunifa) { assert(unifa); - if (!increment_unifa_address(c, unifa)) + if (!increment_unifa_address(c, block, unifa)) continue; } diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c index 8749f3cd6..4609ef9c3 100644 --- a/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -107,9 +107,14 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) continue; } - /* Flags aren't preserved across a thrsw. */ - if (inst->qpu.sig.thrsw) - last_flags = NULL; + /* Flags aren't preserved across a thrsw. + * + * In V3D 4.2+ flags are preserved across thread switches. + */ + if (c->devinfo->ver < 42) { + if (inst->qpu.sig.thrsw) + last_flags = NULL; + } if (inst->qpu.flags.apf != V3D_QPU_PF_NONE || inst->qpu.flags.mpf != V3D_QPU_PF_NONE) { diff --git a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c index 41fc25729..08698b4ec 100644 --- a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c +++ b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c @@ -164,10 +164,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, } for (unsigned i = 0; i < c->num_temps; i++) { - int node = temp_to_node[i]; - if (BITSET_TEST(c->spillable, i)) - ra_set_node_spill_cost(g, node, spill_costs[i]); + ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]); } return ra_get_best_spill_node(g); @@ -179,7 +177,12 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, void v3d_setup_spill_base(struct v3d_compile *c) { - c->cursor = vir_before_block(vir_entry_block(c)); + /* Setting up the spill base is done in the entry block; so change + * both the current block to emit and the cursor. 
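The block/cursor pairing fixed here recurs in increment_unifa_address and v3d_setup_spill_base: whenever code is emitted into a block other than the one currently being visited, c->cur_block must be switched alongside c->cursor and restored afterwards. A hypothetical helper capturing the pattern (not part of this diff):

static void
emit_in_block(struct v3d_compile *c, struct qblock *block,
              struct vir_cursor cursor,
              void (*emit)(struct v3d_compile *c, void *data), void *data)
{
        struct qblock *saved_block = c->cur_block;
        struct vir_cursor saved_cursor = c->cursor;

        /* Keep the block and the cursor in sync while emitting. */
        c->cur_block = block;
        c->cursor = cursor;
        emit(c, data);

        c->cur_block = saved_block;
        c->cursor = saved_cursor;
}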
+ */ + struct qblock *current_block = c->cur_block; + c->cur_block = vir_entry_block(c); + c->cursor = vir_before_block(c->cur_block); int start_num_temps = c->num_temps; @@ -206,16 +209,16 @@ v3d_setup_spill_base(struct v3d_compile *c) for (int i = start_num_temps; i < c->num_temps; i++) BITSET_CLEAR(c->spillable, i); + /* Restore the current block. */ + c->cur_block = current_block; c->cursor = vir_after_block(c->cur_block); } -static void +static struct qinst * v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) { - vir_ADD_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_TMUA), - c->spill_base, - vir_uniform_ui(c, spill_offset)); + return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + c->spill_base, vir_uniform_ui(c, spill_offset)); } @@ -223,12 +226,17 @@ static void v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst, struct qinst *position, uint32_t spill_offset) { + assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + c->cursor = vir_after_inst(position); - inst->dst.index = c->num_temps++; - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, - V3D_QPU_WADDR_TMUD), - inst->dst); - v3d_emit_spill_tmua(c, spill_offset); + inst->dst = vir_get_temp(c); + enum v3d_qpu_cond cond = vir_get_cond(inst); + struct qinst *tmp = + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), + inst->dst); + tmp->qpu.flags.mc = cond; + tmp = v3d_emit_spill_tmua(c, spill_offset); + tmp->qpu.flags.ac = cond; vir_emit_thrsw(c); vir_TMUWT(c); c->spills++; @@ -253,7 +261,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } struct qinst *last_thrsw = c->last_thrsw; - assert(!last_thrsw || last_thrsw->is_last_thrsw); + assert(last_thrsw && last_thrsw->is_last_thrsw); int start_num_temps = c->num_temps; @@ -339,29 +347,13 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) spill_offset); } } - - /* If we didn't have a last-thrsw inserted by nir_to_vir and - * we've been inserting thrsws, then insert a new last_thrsw - * right before we start the vpm/tlb sequence for the last - * thread segment. - */ - if (!is_uniform && !last_thrsw && c->last_thrsw && - (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu))) { - c->cursor = vir_before_inst(inst); - vir_emit_thrsw(c); - - last_thrsw = c->last_thrsw; - last_thrsw->is_last_thrsw = true; - } } } /* Make sure c->last_thrsw is the actual last thrsw, not just one we * inserted in our most recent unspill. */ - if (last_thrsw) - c->last_thrsw = last_thrsw; + c->last_thrsw = last_thrsw; /* Don't allow spilling of our spilling instructions. There's no way * they can help get things colored. @@ -372,27 +364,63 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) c->disable_ldunif_opt = had_disable_ldunif_opt; } +struct node_to_temp_map { + uint32_t temp; + uint32_t priority; +}; + struct v3d_ra_select_callback_data { uint32_t next_acc; uint32_t next_phys; + struct node_to_temp_map *map; }; -static unsigned int -v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) +/* Choosing accumulators improves chances of merging QPU instructions + * due to these merges requiring that at most 2 rf registers are used + * by the add and mul instructions. + */ +static bool +v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + int priority) { - struct v3d_ra_select_callback_data *v3d_ra = data; - int r5 = ACC_INDEX + 5; + /* Favor accumulators if we have less that this number of physical + * registers. 
Accumulators have more restrictions (like being + * invalidated through thrsw), so running out of physical registers + * even if we have accumulators available can lead to register + * allocation failures. + */ + static const int available_rf_threshold = 5; + int available_rf = 0 ; + for (int i = 0; i < PHYS_COUNT; i++) { + if (BITSET_TEST(regs, PHYS_INDEX + i)) + available_rf++; + if (available_rf >= available_rf_threshold) + break; + } + if (available_rf < available_rf_threshold) + return true; - /* Choose r5 for our ldunifs if possible (nobody else can load to that - * reg, and it keeps the QPU cond field free from being occupied by - * ldunifrf). + /* Favor accumulators for short-lived temps (our priority represents + * liveness), to prevent long-lived temps from grabbing accumulators + * and preventing follow-up instructions from using them, potentially + * leading to large portions of the shader being unable to use + * accumulators and therefore merge instructions successfully. */ - if (BITSET_TEST(regs, r5)) - return r5; + static const int priority_threshold = 20; + if (priority <= priority_threshold) + return true; + + return false; +} - /* Choose an accumulator if possible (I think it's lower power than - * phys regs), but round-robin through them to give post-RA - * instruction selection more options. +static bool +v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) +{ + /* Round-robin through our accumulators to give post-RA instruction + * selection more options. */ for (int i = 0; i < ACC_COUNT; i++) { int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT; @@ -400,20 +428,61 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) if (BITSET_TEST(regs, acc)) { v3d_ra->next_acc = acc_off + 1; - return acc; + *out = acc; + return true; } } + return false; +} + +static bool +v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + BITSET_WORD *regs, + unsigned int *out) +{ for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; int phys = PHYS_INDEX + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; - return phys; + *out = phys; + return true; } } + return false; +} + +static unsigned int +v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) +{ + struct v3d_ra_select_callback_data *v3d_ra = data; + int r5 = ACC_INDEX + 5; + + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). + */ + if (BITSET_TEST(regs, r5)) + return r5; + + unsigned int reg; + if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && + v3d_ra_select_accum(v3d_ra, regs, ®)) { + return reg; + } + + if (v3d_ra_select_rf(v3d_ra, regs, ®)) + return reg; + + /* If we ran out of physical registers try to assign an accumulator + * if we didn't favor that option earlier. + */ + if (v3d_ra_select_accum(v3d_ra, regs, ®)) + return reg; + unreachable("RA must pass us at least one possible reg."); } @@ -426,44 +495,37 @@ vir_init_reg_sets(struct v3d_compiler *compiler) int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, - true); + false); if (!compiler->regs) return false; for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = - ra_alloc_reg_class(compiler->regs); + ra_alloc_contig_reg_class(compiler->regs, 1); compiler->reg_class_r5[threads] = - ra_alloc_reg_class(compiler->regs); + ra_alloc_contig_reg_class(compiler->regs, 1); compiler->reg_class_phys_or_acc[threads] = - ra_alloc_reg_class(compiler->regs); + ra_alloc_contig_reg_class(compiler->regs, 1); compiler->reg_class_phys[threads] = - ra_alloc_reg_class(compiler->regs); + ra_alloc_contig_reg_class(compiler->regs, 1); for (int i = PHYS_INDEX; i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->regs, - compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->regs, - compiler->reg_class_phys[threads], i); - ra_class_add_reg(compiler->regs, - compiler->reg_class_any[threads], i); + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); } for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->regs, - compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->regs, - compiler->reg_class_any[threads], i); + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); } /* r5 can only store a single 32-bit value, so not much can * use it. */ - ra_class_add_reg(compiler->regs, - compiler->reg_class_r5[threads], + ra_class_add_reg(compiler->reg_class_r5[threads], ACC_INDEX + 5); - ra_class_add_reg(compiler->regs, - compiler->reg_class_any[threads], + ra_class_add_reg(compiler->reg_class_any[threads], ACC_INDEX + 5); } @@ -472,11 +534,6 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return true; } -struct node_to_temp_map { - uint32_t temp; - uint32_t priority; -}; - static int node_to_temp_priority(const void *in_a, const void *in_b) { @@ -506,15 +563,15 @@ get_spill_batch_size(struct v3d_compile *c) return 20; } -/* Don't emit spills using the TMU until we've dropped thread count first. Also, - * don't spill if we have enabled any other optimization that can lead to - * higher register pressure, such as TMU pipelining, we rather recompile without - * the optimization in that case. +/* Don't emit spills using the TMU until we've dropped thread count first. We, + * may also disable spilling when certain optimizations that are known to + * increase register pressure are active so we favor recompiling with + * optimizations disabled instead of spilling. */ static inline bool tmu_spilling_allowed(struct v3d_compile *c, int thread_index) { - return thread_index == 0 && c->disable_tmu_pipelining; + return thread_index == 0 && c->tmu_spilling_allowed; } #define CLASS_BIT_PHYS (1 << 0) @@ -532,6 +589,7 @@ tmu_spilling_allowed(struct v3d_compile *c, int thread_index) struct qpu_reg * v3d_register_allocate(struct v3d_compile *c, bool *spilled) { + uint32_t UNUSED start_num_temps = c->num_temps; struct node_to_temp_map map[c->num_temps]; uint32_t temp_to_node[c->num_temps]; uint8_t class_bits[c->num_temps]; @@ -542,6 +600,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * RF0-2. 
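Taken together, the helpers above give v3d_ra_select_callback a fixed preference order: r5 first (keeping ldunif results out of ldunifrf), then an accumulator when v3d_ra_favor_accum approves (short-lived temp, or physical registers nearly exhausted), then a physical register round-robin from next_phys, and an accumulator only as a last resort. The callback is installed on the interference graph roughly as below (names follow the hunk above; g is the ra_graph built later in v3d_register_allocate):

struct v3d_ra_select_callback_data callback_data = {
        .next_acc = 0,
        .next_phys = 3,   /* stay clear of payload registers RF0-2 */
        .map = map,
};
ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);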
*/ .next_phys = 3, + .map = map, }; *spilled = false; @@ -782,6 +841,12 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) return NULL; } + /* Ensure that we are not accessing temp_to_node out of bounds. We + * should never trigger this assertion because `c->num_temps` only + * grows when we spill, in which case we return early and don't get + * here. + */ + assert(start_num_temps == c->num_temps); struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); diff --git a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c index aa3354542..634b8961b 100644 --- a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c +++ b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -219,8 +213,13 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. Using reg 0 + */ + src[i] = qpu_reg(0); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -238,7 +237,7 @@ v3d_generate_code_block(struct v3d_compile *c, temp = new_qpu_nop_before(qinst); temp->qpu.sig.ldvpm = true; - src[i] = qpu_acc(3); + src[i] = qpu_magic(V3D_QPU_WADDR_R3); break; } } diff --git a/lib/mesa/src/broadcom/meson.build b/lib/mesa/src/broadcom/meson.build index f558aaca4..2e1145dd0 100644 --- a/lib/mesa/src/broadcom/meson.build +++ b/lib/mesa/src/broadcom/meson.build @@ -50,16 +50,35 @@ foreach ver : v3d_versions ) endforeach +v3d_args = ['-DV3D_BUILD_NEON'] + +v3d_neon_c_args = [] +if host_machine.cpu_family() == 'arm' + v3d_neon_c_args = '-mfpu=neon' +endif + +libv3d_neon = static_library( + 'v3d_neon', + 'common/v3d_tiling.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, + ], + c_args : [v3d_args, v3d_neon_c_args], + gnu_symbol_visibility : 'hidden', + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], +) + libbroadcom_v3d = static_library( 'libbroadcom_v3d', [ - files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c'), + files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c', 'common/v3d_util.c'), v3d_xml_pack, ], include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', link_whole : v3d_libs + per_version_libs, + link_with: [libv3d_neon], build_by_default : false, dependencies: [dep_valgrind, dep_thread], ) diff --git a/lib/mesa/src/broadcom/qpu/qpu_disasm.h b/lib/mesa/src/broadcom/qpu/qpu_disasm.h index efdf8ddb5..b02ec91d7 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_disasm.h +++ b/lib/mesa/src/broadcom/qpu/qpu_disasm.h @@ -21,8 +21,8 @@ * IN THE SOFTWARE. 
*/ -#ifndef VC5_QPU_DISASM_H -#define VC5_QPU_DISASM_H +#ifndef QPU_DISASM_H +#define QPU_DISASM_H #include "broadcom/common/v3d_device_info.h" @@ -36,4 +36,4 @@ const char *v3d_qpu_disasm(const struct v3d_device_info *devinfo, uint64_t inst) void v3d_qpu_dump(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr); -#endif /* VC5_QPU_DISASM_H */ +#endif /* QPU_DISASM_H */ diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.c b/lib/mesa/src/broadcom/qpu/qpu_instr.c index 0bda9a42c..569c5fc40 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_instr.c +++ b/lib/mesa/src/broadcom/qpu/qpu_instr.c @@ -137,6 +137,8 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) [V3D_QPU_A_TMUWT] = "tmuwt", [V3D_QPU_A_VPMSETUP] = "vpmsetup", [V3D_QPU_A_VPMWT] = "vpmwt", + [V3D_QPU_A_FLAFIRST] = "flafirst", + [V3D_QPU_A_FLNAFIRST] = "flnafirst", [V3D_QPU_A_LDVPMV_IN] = "ldvpmv_in", [V3D_QPU_A_LDVPMV_OUT] = "ldvpmv_out", [V3D_QPU_A_LDVPMD_IN] = "ldvpmd_in", @@ -406,6 +408,8 @@ static const uint8_t add_op_args[] = { [V3D_QPU_A_BARRIERID] = D, [V3D_QPU_A_TMUWT] = D, [V3D_QPU_A_VPMWT] = D, + [V3D_QPU_A_FLAFIRST] = D, + [V3D_QPU_A_FLNAFIRST] = D, [V3D_QPU_A_VPMSETUP] = D | A, @@ -930,6 +934,8 @@ v3d_qpu_reads_flags(const struct v3d_qpu_instr *inst) case V3D_QPU_A_VFLNB: case V3D_QPU_A_FLAPUSH: case V3D_QPU_A_FLBPUSH: + case V3D_QPU_A_FLAFIRST: + case V3D_QPU_A_FLNAFIRST: return true; default: break; diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.h b/lib/mesa/src/broadcom/qpu/qpu_instr.h index a87ed9ff3..4f165e939 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_instr.h +++ b/lib/mesa/src/broadcom/qpu/qpu_instr.h @@ -94,7 +94,6 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_R3 = 3, V3D_QPU_WADDR_R4 = 4, V3D_QPU_WADDR_R5 = 5, - /* 6 is reserved, but note 3.2.2.8: "Result Writes" */ V3D_QPU_WADDR_NOP = 6, V3D_QPU_WADDR_TLB = 7, V3D_QPU_WADDR_TLBU = 8, @@ -191,6 +190,8 @@ enum v3d_qpu_add_op { V3D_QPU_A_TMUWT, V3D_QPU_A_VPMSETUP, V3D_QPU_A_VPMWT, + V3D_QPU_A_FLAFIRST, + V3D_QPU_A_FLNAFIRST, V3D_QPU_A_LDVPMV_IN, V3D_QPU_A_LDVPMV_OUT, V3D_QPU_A_LDVPMD_IN, diff --git a/lib/mesa/src/broadcom/qpu/qpu_pack.c b/lib/mesa/src/broadcom/qpu/qpu_pack.c index 7502bbfb9..eee1e9f95 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_pack.c +++ b/lib/mesa/src/broadcom/qpu/qpu_pack.c @@ -44,65 +44,65 @@ (((inst) & ~(field ## _MASK)) | QPU_SET_FIELD(value, field)) #endif /* QPU_MASK */ -#define VC5_QPU_OP_MUL_SHIFT 58 -#define VC5_QPU_OP_MUL_MASK QPU_MASK(63, 58) +#define V3D_QPU_OP_MUL_SHIFT 58 +#define V3D_QPU_OP_MUL_MASK QPU_MASK(63, 58) -#define VC5_QPU_SIG_SHIFT 53 -#define VC5_QPU_SIG_MASK QPU_MASK(57, 53) +#define V3D_QPU_SIG_SHIFT 53 +#define V3D_QPU_SIG_MASK QPU_MASK(57, 53) -#define VC5_QPU_COND_SHIFT 46 -#define VC5_QPU_COND_MASK QPU_MASK(52, 46) -#define VC5_QPU_COND_SIG_MAGIC_ADDR (1 << 6) +#define V3D_QPU_COND_SHIFT 46 +#define V3D_QPU_COND_MASK QPU_MASK(52, 46) +#define V3D_QPU_COND_SIG_MAGIC_ADDR (1 << 6) -#define VC5_QPU_MM QPU_MASK(45, 45) -#define VC5_QPU_MA QPU_MASK(44, 44) +#define V3D_QPU_MM QPU_MASK(45, 45) +#define V3D_QPU_MA QPU_MASK(44, 44) #define V3D_QPU_WADDR_M_SHIFT 38 #define V3D_QPU_WADDR_M_MASK QPU_MASK(43, 38) -#define VC5_QPU_BRANCH_ADDR_LOW_SHIFT 35 -#define VC5_QPU_BRANCH_ADDR_LOW_MASK QPU_MASK(55, 35) +#define V3D_QPU_BRANCH_ADDR_LOW_SHIFT 35 +#define V3D_QPU_BRANCH_ADDR_LOW_MASK QPU_MASK(55, 35) #define V3D_QPU_WADDR_A_SHIFT 32 #define V3D_QPU_WADDR_A_MASK QPU_MASK(37, 32) -#define VC5_QPU_BRANCH_COND_SHIFT 32 -#define VC5_QPU_BRANCH_COND_MASK QPU_MASK(34, 32) +#define V3D_QPU_BRANCH_COND_SHIFT 32 +#define 
V3D_QPU_BRANCH_COND_MASK QPU_MASK(34, 32) -#define VC5_QPU_BRANCH_ADDR_HIGH_SHIFT 24 -#define VC5_QPU_BRANCH_ADDR_HIGH_MASK QPU_MASK(31, 24) +#define V3D_QPU_BRANCH_ADDR_HIGH_SHIFT 24 +#define V3D_QPU_BRANCH_ADDR_HIGH_MASK QPU_MASK(31, 24) -#define VC5_QPU_OP_ADD_SHIFT 24 -#define VC5_QPU_OP_ADD_MASK QPU_MASK(31, 24) +#define V3D_QPU_OP_ADD_SHIFT 24 +#define V3D_QPU_OP_ADD_MASK QPU_MASK(31, 24) -#define VC5_QPU_MUL_B_SHIFT 21 -#define VC5_QPU_MUL_B_MASK QPU_MASK(23, 21) +#define V3D_QPU_MUL_B_SHIFT 21 +#define V3D_QPU_MUL_B_MASK QPU_MASK(23, 21) -#define VC5_QPU_BRANCH_MSFIGN_SHIFT 21 -#define VC5_QPU_BRANCH_MSFIGN_MASK QPU_MASK(22, 21) +#define V3D_QPU_BRANCH_MSFIGN_SHIFT 21 +#define V3D_QPU_BRANCH_MSFIGN_MASK QPU_MASK(22, 21) -#define VC5_QPU_MUL_A_SHIFT 18 -#define VC5_QPU_MUL_A_MASK QPU_MASK(20, 18) +#define V3D_QPU_MUL_A_SHIFT 18 +#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) -#define VC5_QPU_ADD_B_SHIFT 15 -#define VC5_QPU_ADD_B_MASK QPU_MASK(17, 15) +#define V3D_QPU_ADD_B_SHIFT 15 +#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) -#define VC5_QPU_BRANCH_BDU_SHIFT 15 -#define VC5_QPU_BRANCH_BDU_MASK QPU_MASK(17, 15) +#define V3D_QPU_BRANCH_BDU_SHIFT 15 +#define V3D_QPU_BRANCH_BDU_MASK QPU_MASK(17, 15) -#define VC5_QPU_BRANCH_UB QPU_MASK(14, 14) +#define V3D_QPU_BRANCH_UB QPU_MASK(14, 14) -#define VC5_QPU_ADD_A_SHIFT 12 -#define VC5_QPU_ADD_A_MASK QPU_MASK(14, 12) +#define V3D_QPU_ADD_A_SHIFT 12 +#define V3D_QPU_ADD_A_MASK QPU_MASK(14, 12) -#define VC5_QPU_BRANCH_BDI_SHIFT 12 -#define VC5_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) +#define V3D_QPU_BRANCH_BDI_SHIFT 12 +#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) -#define VC5_QPU_RADDR_A_SHIFT 6 -#define VC5_QPU_RADDR_A_MASK QPU_MASK(11, 6) +#define V3D_QPU_RADDR_A_SHIFT 6 +#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) -#define VC5_QPU_RADDR_B_SHIFT 0 -#define VC5_QPU_RADDR_B_MASK QPU_MASK(5, 0) +#define V3D_QPU_RADDR_B_SHIFT 0 +#define V3D_QPU_RADDR_B_MASK QPU_MASK(5, 0) #define THRSW .thrsw = true #define LDUNIF .ldunif = true @@ -207,9 +207,9 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [21] = { THRSW, LDVARY, WRTMUC }, [22] = { UCB, }, [23] = { ROT, }, - /* 24-30 reserved */ [24] = { LDUNIFA}, [25] = { LDUNIFARF }, + /* 26-30 reserved */ [31] = { SMIMM, LDTMU, }, }; @@ -456,8 +456,15 @@ struct opcode_desc { uint8_t mux_b_mask; uint8_t mux_a_mask; uint8_t op; - /* 0 if it's the same across V3D versions, or a specific V3D version. */ - uint8_t ver; + + /* first_ver == 0 if it's the same across all V3D versions. 
+ * first_ver == X, last_ver == 0 if it's the same for all V3D versions + * starting from X + * first_ver == X, last_ver == Y if it's the same for all V3D versions + * on the range X through Y + */ + uint8_t first_ver; + uint8_t last_ver; }; static const struct opcode_desc add_ops[] = { @@ -519,8 +526,10 @@ static const struct opcode_desc add_ops[] = { { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, - + { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, + { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, + { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, @@ -576,9 +585,23 @@ static const struct opcode_desc mul_ops[] = { { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, }; +/* Returns true if op_desc should be filtered out based on devinfo->ver + * against op_desc->first_ver and op_desc->last_ver. Check notes about + * first_ver/last_ver on struct opcode_desc comments. + */ +static bool +opcode_invalid_in_version(const struct v3d_device_info *devinfo, + const struct opcode_desc *op_desc) +{ + return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || + (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); +} + static const struct opcode_desc * -lookup_opcode(const struct opcode_desc *opcodes, size_t num_opcodes, - uint32_t opcode, uint32_t mux_a, uint32_t mux_b) +lookup_opcode_from_packed(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, + size_t num_opcodes, uint32_t opcode, + uint32_t mux_a, uint32_t mux_b) { for (int i = 0; i < num_opcodes; i++) { const struct opcode_desc *op_desc = &opcodes[i]; @@ -587,6 +610,9 @@ lookup_opcode(const struct opcode_desc *opcodes, size_t num_opcodes, opcode > op_desc->opcode_last) continue; + if (opcode_invalid_in_version(devinfo, op_desc)) + continue; + if (!(op_desc->mux_b_mask & (1 << mux_b))) continue; @@ -716,9 +742,9 @@ static bool v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, struct v3d_qpu_instr *instr) { - uint32_t op = QPU_GET_FIELD(packed_inst, VC5_QPU_OP_ADD); - uint32_t mux_a = QPU_GET_FIELD(packed_inst, VC5_QPU_ADD_A); - uint32_t mux_b = QPU_GET_FIELD(packed_inst, VC5_QPU_ADD_B); + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); + uint32_t mux_b = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_B); uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); uint32_t map_op = op; @@ -731,8 +757,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, map_op = (map_op - 253 + 245); const struct opcode_desc *desc = - lookup_opcode(add_ops, ARRAY_SIZE(add_ops), - map_op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), + map_op, mux_a, mux_b); + if (!desc) return false; @@ -846,7 +873,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); instr->alu.add.magic_write = false; - if (packed_inst & VC5_QPU_MA) { + if (packed_inst & V3D_QPU_MA) { switch (instr->alu.add.op) { case V3D_QPU_A_LDVPMV_IN: instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; @@ -870,14 +897,15 @@ static bool v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, struct v3d_qpu_instr *instr) 
{ - uint32_t op = QPU_GET_FIELD(packed_inst, VC5_QPU_OP_MUL); - uint32_t mux_a = QPU_GET_FIELD(packed_inst, VC5_QPU_MUL_A); - uint32_t mux_b = QPU_GET_FIELD(packed_inst, VC5_QPU_MUL_B); + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); + uint32_t mux_b = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_B); { const struct opcode_desc *desc = - lookup_opcode(mul_ops, ARRAY_SIZE(mul_ops), - op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, mul_ops, + ARRAY_SIZE(mul_ops), + op, mux_a, mux_b); if (!desc) return false; @@ -933,11 +961,31 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.a = mux_a; instr->alu.mul.b = mux_b; instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); - instr->alu.mul.magic_write = packed_inst & VC5_QPU_MM; + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; return true; } +static const struct opcode_desc * +lookup_opcode_from_instr(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, size_t num_opcodes, + uint8_t op) +{ + for (int i = 0; i < num_opcodes; i++) { + const struct opcode_desc *op_desc = &opcodes[i]; + + if (op_desc->op != op) + continue; + + if (opcode_invalid_in_version(devinfo, op_desc)) + continue; + + return op_desc; + } + + return NULL; +} + static bool v3d_qpu_add_pack(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr, uint64_t *packed_instr) @@ -946,18 +994,14 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, uint32_t mux_a = instr->alu.add.a; uint32_t mux_b = instr->alu.add.b; int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); - const struct opcode_desc *desc; + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), + instr->alu.add.op); - int opcode; - for (desc = add_ops; desc != &add_ops[ARRAY_SIZE(add_ops)]; - desc++) { - if (desc->op == instr->alu.add.op) - break; - } - if (desc == &add_ops[ARRAY_SIZE(add_ops)]) + if (!desc) return false; - opcode = desc->opcode_first; + uint32_t opcode = desc->opcode_first; /* If an operation doesn't use an arg, its mux values may be used to * identify the operation type. 
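How the first_ver/last_ver filtering behaves for the opcode tables above, using the FLAFIRST row as the example (the devinfo values are hypothetical):

/* { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }
 *   first_ver = 41, last_ver = 0 (open-ended)
 *
 * opcode_invalid_in_version(devinfo, desc) with devinfo->ver == 33
 *   -> true  (33 < 41): the entry is skipped by both lookup paths
 * with devinfo->ver == 42
 *   -> false (42 >= 41, no upper bound): the entry matches
 *
 * An entry written as { ..., 33, 40 } would match V3D 3.3 through
 * 4.0 only.
 */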
@@ -995,7 +1039,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, case V3D_QPU_A_LDVPMD_OUT: case V3D_QPU_A_LDVPMG_OUT: assert(!instr->alu.add.magic_write); - *packed_instr |= VC5_QPU_MA; + *packed_instr |= V3D_QPU_MA; break; default: @@ -1145,12 +1189,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, break; } - *packed_instr |= QPU_SET_FIELD(mux_a, VC5_QPU_ADD_A); - *packed_instr |= QPU_SET_FIELD(mux_b, VC5_QPU_ADD_B); - *packed_instr |= QPU_SET_FIELD(opcode, VC5_QPU_OP_ADD); + *packed_instr |= QPU_SET_FIELD(mux_a, V3D_QPU_ADD_A); + *packed_instr |= QPU_SET_FIELD(mux_b, V3D_QPU_ADD_B); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); if (instr->alu.add.magic_write && !no_magic_write) - *packed_instr |= VC5_QPU_MA; + *packed_instr |= V3D_QPU_MA; return true; } @@ -1162,14 +1206,12 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, uint32_t mux_a = instr->alu.mul.a; uint32_t mux_b = instr->alu.mul.b; int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); - const struct opcode_desc *desc; - for (desc = mul_ops; desc != &mul_ops[ARRAY_SIZE(mul_ops)]; - desc++) { - if (desc->op == instr->alu.mul.op) - break; - } - if (desc == &mul_ops[ARRAY_SIZE(mul_ops)]) + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), + instr->alu.mul.op); + + if (!desc) return false; uint32_t opcode = desc->opcode_first; @@ -1253,13 +1295,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, break; } - *packed_instr |= QPU_SET_FIELD(mux_a, VC5_QPU_MUL_A); - *packed_instr |= QPU_SET_FIELD(mux_b, VC5_QPU_MUL_B); + *packed_instr |= QPU_SET_FIELD(mux_a, V3D_QPU_MUL_A); + *packed_instr |= QPU_SET_FIELD(mux_b, V3D_QPU_MUL_B); - *packed_instr |= QPU_SET_FIELD(opcode, VC5_QPU_OP_MUL); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); if (instr->alu.mul.magic_write) - *packed_instr |= VC5_QPU_MM; + *packed_instr |= V3D_QPU_MM; return true; } @@ -1272,14 +1314,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, instr->type = V3D_QPU_INSTR_TYPE_ALU; if (!v3d_qpu_sig_unpack(devinfo, - QPU_GET_FIELD(packed_instr, VC5_QPU_SIG), + QPU_GET_FIELD(packed_instr, V3D_QPU_SIG), &instr->sig)) return false; - uint32_t packed_cond = QPU_GET_FIELD(packed_instr, VC5_QPU_COND); + uint32_t packed_cond = QPU_GET_FIELD(packed_instr, V3D_QPU_COND); if (v3d_qpu_sig_writes_address(devinfo, &instr->sig)) { - instr->sig_addr = packed_cond & ~VC5_QPU_COND_SIG_MAGIC_ADDR; - instr->sig_magic = packed_cond & VC5_QPU_COND_SIG_MAGIC_ADDR; + instr->sig_addr = packed_cond & ~V3D_QPU_COND_SIG_MAGIC_ADDR; + instr->sig_magic = packed_cond & V3D_QPU_COND_SIG_MAGIC_ADDR; instr->flags.ac = V3D_QPU_COND_NONE; instr->flags.mc = V3D_QPU_COND_NONE; @@ -1292,8 +1334,8 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, return false; } - instr->raddr_a = QPU_GET_FIELD(packed_instr, VC5_QPU_RADDR_A); - instr->raddr_b = QPU_GET_FIELD(packed_instr, VC5_QPU_RADDR_B); + instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); + instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr)) return false; @@ -1311,7 +1353,7 @@ v3d_qpu_instr_unpack_branch(const struct v3d_device_info *devinfo, { instr->type = V3D_QPU_INSTR_TYPE_BRANCH; - uint32_t cond = QPU_GET_FIELD(packed_instr, VC5_QPU_BRANCH_COND); + uint32_t cond = QPU_GET_FIELD(packed_instr, 
V3D_QPU_BRANCH_COND); if (cond == 0) instr->branch.cond = V3D_QPU_BRANCH_COND_ALWAYS; else if (V3D_QPU_BRANCH_COND_A0 + (cond - 2) <= @@ -1320,31 +1362,31 @@ v3d_qpu_instr_unpack_branch(const struct v3d_device_info *devinfo, else return false; - uint32_t msfign = QPU_GET_FIELD(packed_instr, VC5_QPU_BRANCH_MSFIGN); + uint32_t msfign = QPU_GET_FIELD(packed_instr, V3D_QPU_BRANCH_MSFIGN); if (msfign == 3) return false; instr->branch.msfign = msfign; - instr->branch.bdi = QPU_GET_FIELD(packed_instr, VC5_QPU_BRANCH_BDI); + instr->branch.bdi = QPU_GET_FIELD(packed_instr, V3D_QPU_BRANCH_BDI); - instr->branch.ub = packed_instr & VC5_QPU_BRANCH_UB; + instr->branch.ub = packed_instr & V3D_QPU_BRANCH_UB; if (instr->branch.ub) { instr->branch.bdu = QPU_GET_FIELD(packed_instr, - VC5_QPU_BRANCH_BDU); + V3D_QPU_BRANCH_BDU); } instr->branch.raddr_a = QPU_GET_FIELD(packed_instr, - VC5_QPU_RADDR_A); + V3D_QPU_RADDR_A); instr->branch.offset = 0; instr->branch.offset += QPU_GET_FIELD(packed_instr, - VC5_QPU_BRANCH_ADDR_LOW) << 3; + V3D_QPU_BRANCH_ADDR_LOW) << 3; instr->branch.offset += QPU_GET_FIELD(packed_instr, - VC5_QPU_BRANCH_ADDR_HIGH) << 24; + V3D_QPU_BRANCH_ADDR_HIGH) << 24; return true; } @@ -1354,10 +1396,10 @@ v3d_qpu_instr_unpack(const struct v3d_device_info *devinfo, uint64_t packed_instr, struct v3d_qpu_instr *instr) { - if (QPU_GET_FIELD(packed_instr, VC5_QPU_OP_MUL) != 0) { + if (QPU_GET_FIELD(packed_instr, V3D_QPU_OP_MUL) != 0) { return v3d_qpu_instr_unpack_alu(devinfo, packed_instr, instr); } else { - uint32_t sig = QPU_GET_FIELD(packed_instr, VC5_QPU_SIG); + uint32_t sig = QPU_GET_FIELD(packed_instr, V3D_QPU_SIG); if ((sig & 24) == 16) { return v3d_qpu_instr_unpack_branch(devinfo, packed_instr, @@ -1376,11 +1418,11 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, uint32_t sig; if (!v3d_qpu_sig_pack(devinfo, &instr->sig, &sig)) return false; - *packed_instr |= QPU_SET_FIELD(sig, VC5_QPU_SIG); + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { - *packed_instr |= QPU_SET_FIELD(instr->raddr_a, VC5_QPU_RADDR_A); - *packed_instr |= QPU_SET_FIELD(instr->raddr_b, VC5_QPU_RADDR_B); + *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) return false; @@ -1400,13 +1442,13 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, flags = instr->sig_addr; if (instr->sig_magic) - flags |= VC5_QPU_COND_SIG_MAGIC_ADDR; + flags |= V3D_QPU_COND_SIG_MAGIC_ADDR; } else { if (!v3d_qpu_flags_pack(devinfo, &instr->flags, &flags)) return false; } - *packed_instr |= QPU_SET_FIELD(flags, VC5_QPU_COND); + *packed_instr |= QPU_SET_FIELD(flags, V3D_QPU_COND); } else { if (v3d_qpu_sig_writes_address(devinfo, &instr->sig)) return false; @@ -1420,38 +1462,39 @@ v3d_qpu_instr_pack_branch(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { - *packed_instr |= QPU_SET_FIELD(16, VC5_QPU_SIG); + *packed_instr |= QPU_SET_FIELD(16, V3D_QPU_SIG); if (instr->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) { *packed_instr |= QPU_SET_FIELD(2 + (instr->branch.cond - V3D_QPU_BRANCH_COND_A0), - VC5_QPU_BRANCH_COND); + V3D_QPU_BRANCH_COND); } *packed_instr |= QPU_SET_FIELD(instr->branch.msfign, - VC5_QPU_BRANCH_MSFIGN); + V3D_QPU_BRANCH_MSFIGN); *packed_instr |= QPU_SET_FIELD(instr->branch.bdi, - VC5_QPU_BRANCH_BDI); + V3D_QPU_BRANCH_BDI); if (instr->branch.ub) { - *packed_instr |= VC5_QPU_BRANCH_UB; 
+ *packed_instr |= V3D_QPU_BRANCH_UB; *packed_instr |= QPU_SET_FIELD(instr->branch.bdu, - VC5_QPU_BRANCH_BDU); + V3D_QPU_BRANCH_BDU); } switch (instr->branch.bdi) { case V3D_QPU_BRANCH_DEST_ABS: case V3D_QPU_BRANCH_DEST_REL: *packed_instr |= QPU_SET_FIELD(instr->branch.msfign, - VC5_QPU_BRANCH_MSFIGN); + V3D_QPU_BRANCH_MSFIGN); *packed_instr |= QPU_SET_FIELD((instr->branch.offset & ~0xff000000) >> 3, - VC5_QPU_BRANCH_ADDR_LOW); + V3D_QPU_BRANCH_ADDR_LOW); *packed_instr |= QPU_SET_FIELD(instr->branch.offset >> 24, - VC5_QPU_BRANCH_ADDR_HIGH); + V3D_QPU_BRANCH_ADDR_HIGH); + break; default: break; } @@ -1459,7 +1502,7 @@ v3d_qpu_instr_pack_branch(const struct v3d_device_info *devinfo, if (instr->branch.bdi == V3D_QPU_BRANCH_DEST_REGFILE || instr->branch.bdu == V3D_QPU_BRANCH_DEST_REGFILE) { *packed_instr |= QPU_SET_FIELD(instr->branch.raddr_a, - VC5_QPU_RADDR_A); + V3D_QPU_RADDR_A); } return true; diff --git a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c index 5922b409a..e6b1918b8 100644 --- a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c +++ b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c @@ -162,6 +162,7 @@ main(int argc, char **argv) &instr.alu.add.b); swap_pack(&instr.alu.add.a_unpack, &instr.alu.add.b_unpack); + break; default: break; } diff --git a/lib/mesa/src/broadcom/simulator/v3d_simulator.c b/lib/mesa/src/broadcom/simulator/v3d_simulator.c index 8d43bf6d5..494e5bb44 100644 --- a/lib/mesa/src/broadcom/simulator/v3d_simulator.c +++ b/lib/mesa/src/broadcom/simulator/v3d_simulator.c @@ -24,10 +24,10 @@ /** * @file v3d_simulator.c * - * Implements VC5 simulation on top of a non-VC5 GEM fd. + * Implements V3D simulation on top of a non-V3D GEM fd. * - * This file's goal is to emulate the VC5 ioctls' behavior in the kernel on - * top of the simpenrose software simulator. Generally, VC5 driver BOs have a + * This file's goal is to emulate the V3D ioctls' behavior in the kernel on + * top of the simpenrose software simulator. Generally, V3D driver BOs have a * GEM-side copy of their contents and a simulator-side memory area that the * GEM contents get copied into during simulation. Once simulation is done, * the simulator's data is copied back out to the GEM BOs, so that rendering @@ -40,8 +40,8 @@ * outside of this file still call ioctls directly on the fd). * * Another limitation is that BO import doesn't work unless the underlying - * window system's BO size matches what VC5 is going to use, which of course - * doesn't work out in practice. This means that for now, only DRI3 (VC5 + * window system's BO size matches what V3D is going to use, which of course + * doesn't work out in practice. This means that for now, only DRI3 (V3D * makes the winsys BOs) is supported, not DRI2 (window system makes the winys * BOs). */ @@ -79,7 +79,7 @@ static struct v3d_simulator_state { /* Base hardware address of the heap. */ uint32_t mem_base; /* Size of the heap. */ - size_t mem_size; + uint32_t mem_size; struct mem_block *heap; struct mem_block *overflow; @@ -87,6 +87,9 @@ static struct v3d_simulator_state { /** Mapping from GEM fd to struct v3d_simulator_file * */ struct hash_table *fd_map; + /** Last performance monitor ID. 
*/ + uint32_t last_perfid; + struct util_dynarray bin_oom; int refcount; } sim_state = { @@ -100,6 +103,11 @@ struct v3d_simulator_file { /** Mapping from GEM handle to struct v3d_simulator_bo * */ struct hash_table *bo_map; + /** Dynamic array with performance monitors */ + struct v3d_simulator_perfmon **perfmons; + uint32_t perfmons_size; + uint32_t active_perfid; + struct mem_block *gmp; void *gmp_vaddr; @@ -121,12 +129,34 @@ struct v3d_simulator_bo { int handle; }; +struct v3d_simulator_perfmon { + uint32_t ncounters; + uint8_t counters[DRM_V3D_MAX_PERF_COUNTERS]; + uint64_t values[DRM_V3D_MAX_PERF_COUNTERS]; +}; + static void * int_to_key(int key) { return (void *)(uintptr_t)key; } +#define PERFMONS_ALLOC_SIZE 100 + +static uint32_t +perfmons_next_id(struct v3d_simulator_file *sim_file) { + sim_state.last_perfid++; + if (sim_state.last_perfid > sim_file->perfmons_size) { + sim_file->perfmons_size += PERFMONS_ALLOC_SIZE; + sim_file->perfmons = reralloc(sim_file, + sim_file->perfmons, + struct v3d_simulator_perfmon *, + sim_file->perfmons_size); + } + + return sim_state.last_perfid; +} + static struct v3d_simulator_file * v3d_get_simulator_file_for_fd(int fd) { @@ -357,6 +387,46 @@ v3d_simulator_unpin_bos(struct v3d_simulator_file *file, return 0; } +static struct v3d_simulator_perfmon * +v3d_get_simulator_perfmon(int fd, uint32_t perfid) +{ + if (!perfid || perfid > sim_state.last_perfid) + return NULL; + + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + + mtx_lock(&sim_state.mutex); + assert(perfid <= file->perfmons_size); + struct v3d_simulator_perfmon *perfmon = file->perfmons[perfid - 1]; + mtx_unlock(&sim_state.mutex); + + return perfmon; +} + +static void +v3d_simulator_perfmon_switch(int fd, uint32_t perfid) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct v3d_simulator_perfmon *perfmon; + + if (perfid == file->active_perfid) + return; + + perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid); + if (perfmon) + v3d41_simulator_perfmon_stop(sim_state.v3d, + perfmon->ncounters, + perfmon->values); + + perfmon = v3d_get_simulator_perfmon(fd, perfid); + if (perfmon) + v3d41_simulator_perfmon_start(sim_state.v3d, + perfmon->ncounters, + perfmon->counters); + + file->active_perfid = perfid; +} + static int v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) { @@ -369,6 +439,9 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) mtx_lock(&sim_state.submit_lock); bin_fd = fd; + + v3d_simulator_perfmon_switch(fd, submit->perfmon_id); + if (sim_state.ver >= 41) v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); else @@ -402,9 +475,9 @@ void v3d_simulator_open_from_handle(int fd, int handle, uint32_t size) } /** - * Simulated ioctl(fd, DRM_VC5_CREATE_BO) implementation. + * Simulated ioctl(fd, DRM_V3D_CREATE_BO) implementation. * - * Making a VC5 BO is just a matter of making a corresponding BO on the host. + * Making a V3D BO is just a matter of making a corresponding BO on the host. */ static int v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) @@ -447,7 +520,7 @@ v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args) } /** - * Simulated ioctl(fd, DRM_VC5_MMAP_BO) implementation. + * Simulated ioctl(fd, DRM_V3D_MMAP_BO) implementation. * * We've already grabbed the mmap offset when we created the sim bo, so just * return it. 
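The perfmon bookkeeping added in this hunk grows the per-file perfmon array in fixed chunks (PERFMONS_ALLOC_SIZE) so repeated DRM_IOCTL_V3D_PERFMON_CREATE calls do not reallocate every time. A minimal standalone sketch of the same scheme, assuming 1-based, never-reused IDs and plain realloc in place of Mesa's reralloc; note the real code keeps last_perfid global in sim_state while the slot array is per file, whereas this sketch collapses both into one struct for brevity:

#include <stdint.h>
#include <stdlib.h>

#define CHUNK 100  /* mirrors PERFMONS_ALLOC_SIZE in the hunk above */

struct perfmon_table {
        void **slots;      /* slot i holds the perfmon with ID i + 1 */
        uint32_t size;     /* allocated slot count */
        uint32_t last_id;  /* last ID handed out; IDs are never reused */
};

/* Return a fresh 1-based ID, growing the table a chunk at a time. */
static uint32_t
perfmon_next_id(struct perfmon_table *t)
{
        t->last_id++;
        if (t->last_id > t->size) {
                t->size += CHUNK;
                t->slots = realloc(t->slots, t->size * sizeof(*t->slots));
        }
        return t->last_id;
}

Because IDs are never reused, a destroyed perfmon simply leaves a NULL slot behind, which is why the lookup helper above checks both the ID range and the slot contents.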
@@ -530,6 +603,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) for (int i = 0; i < args->bo_handle_count; i++) v3d_simulator_copy_in_handle(file, bo_handles[i]); + v3d_simulator_perfmon_switch(fd, args->perfmon_id); + if (sim_state.ver >= 41) ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, file->gmp->ofs); @@ -542,6 +617,79 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) return ret; } +static int +v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + + if (args->ncounters == 0 || + args->ncounters > DRM_V3D_MAX_PERF_COUNTERS) + return -EINVAL; + + struct v3d_simulator_perfmon *perfmon = rzalloc(file, + struct v3d_simulator_perfmon); + + perfmon->ncounters = args->ncounters; + for (int i = 0; i < args->ncounters; i++) { + if (args->counters[i] >= V3D_PERFCNT_NUM) { + ralloc_free(perfmon); + return -EINVAL; + } else { + perfmon->counters[i] = args->counters[i]; + } + } + + mtx_lock(&sim_state.mutex); + args->id = perfmons_next_id(file); + file->perfmons[args->id - 1] = perfmon; + mtx_unlock(&sim_state.mutex); + + return 0; +} + +static int +v3d_simulator_perfmon_destroy_ioctl(int fd, struct drm_v3d_perfmon_destroy *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + struct v3d_simulator_perfmon *perfmon = + v3d_get_simulator_perfmon(fd, args->id); + + if (!perfmon) + return -EINVAL; + + mtx_lock(&sim_state.mutex); + file->perfmons[args->id - 1] = NULL; + mtx_unlock(&sim_state.mutex); + + ralloc_free(perfmon); + + return 0; +} + +static int +v3d_simulator_perfmon_get_values_ioctl(int fd, struct drm_v3d_perfmon_get_values *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + + mtx_lock(&sim_state.submit_lock); + + /* Stop the perfmon if it is still active */ + if (args->id == file->active_perfid) + v3d_simulator_perfmon_switch(fd, 0); + + mtx_unlock(&sim_state.submit_lock); + + struct v3d_simulator_perfmon *perfmon = + v3d_get_simulator_perfmon(fd, args->id); + + if (!perfmon) + return -EINVAL; + + memcpy((void *)args->values_ptr, perfmon->values, perfmon->ncounters * sizeof(uint64_t)); + + return 0; +} + int v3d_simulator_ioctl(int fd, unsigned long request, void *args) { @@ -575,6 +723,15 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args) case DRM_IOCTL_V3D_SUBMIT_CSD: return v3d_simulator_submit_csd_ioctl(fd, args); + case DRM_IOCTL_V3D_PERFMON_CREATE: + return v3d_simulator_perfmon_create_ioctl(fd, args); + + case DRM_IOCTL_V3D_PERFMON_DESTROY: + return v3d_simulator_perfmon_destroy_ioctl(fd, args); + + case DRM_IOCTL_V3D_PERFMON_GET_VALUES: + return v3d_simulator_perfmon_get_values_ioctl(fd, args); + case DRM_IOCTL_GEM_OPEN: case DRM_IOCTL_GEM_FLINK: return drmIoctl(fd, request, args); diff --git a/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.cpp b/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.cpp index 15db767d5..88e439255 100644 --- a/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.cpp +++ b/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.cpp @@ -46,7 +46,7 @@ struct v3d_hw *v3d_hw_auto_new(void *in_params) } -uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, size_t *size, void **p) +uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p) { return hw->get_mem(size, p); } @@ -56,11 +56,6 @@ bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size) return hw->alloc_mem(min_size) == V3D_HW_ALLOC_SUCCESS; 
} -bool v3d_hw_has_gca(struct v3d_hw *hw) -{ - return hw->has_gca(); -} - uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg) { return hw->read_reg(reg); @@ -89,5 +84,10 @@ v3d_hw_set_isr(struct v3d_hw *hw, void (*isr)(uint32_t status)) hw->set_isr(isr); } +uint32_t v3d_hw_get_hub_core() +{ + return V3D_HW_HUB_CORE; +} + } #endif /* USE_V3D_SIMULATOR */ diff --git a/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.h b/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.h index b20ea2484..05b2a3361 100644 --- a/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.h +++ b/lib/mesa/src/broadcom/simulator/v3d_simulator_wrapper.h @@ -31,14 +31,14 @@ extern "C" { #endif struct v3d_hw *v3d_hw_auto_new(void *params); -uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, size_t *size, void **p); +uint32_t v3d_hw_get_mem(const struct v3d_hw *hw, uint32_t *size, void **p); bool v3d_hw_alloc_mem(struct v3d_hw *hw, size_t min_size); -bool v3d_hw_has_gca(struct v3d_hw *hw); uint32_t v3d_hw_read_reg(struct v3d_hw *hw, uint32_t reg); void v3d_hw_write_reg(struct v3d_hw *hw, uint32_t reg, uint32_t val); void v3d_hw_tick(struct v3d_hw *hw); int v3d_hw_get_version(struct v3d_hw *hw); void v3d_hw_set_isr(struct v3d_hw *hw, void (*isr)(uint32_t status)); +uint32_t v3d_hw_get_hub_core(); #ifdef __cplusplus } diff --git a/lib/mesa/src/broadcom/simulator/v3dx_simulator.c b/lib/mesa/src/broadcom/simulator/v3dx_simulator.c index cbf257859..07bbbe2f8 100644 --- a/lib/mesa/src/broadcom/simulator/v3dx_simulator.c +++ b/lib/mesa/src/broadcom/simulator/v3dx_simulator.c @@ -24,7 +24,7 @@ /** * @file v3dx_simulator.c * - * Implements the actual HW interaction betweeh the GL driver's VC5 simulator and the simulator. + * Implements the actual HW interaction betweeh the GL driver's V3D simulator and the simulator. * * The register headers between V3D versions will have conflicting defines, so * all register interactions appear in this file and are compiled per V3D version @@ -41,6 +41,7 @@ #include "v3d_simulator_wrapper.h" #include "util/macros.h" +#include "util/bitscan.h" #include "drm-uapi/v3d_drm.h" #define HW_REGISTER_RO(x) (x) @@ -57,9 +58,6 @@ static void v3d_invalidate_l3(struct v3d_hw *v3d) { - if (!v3d_hw_has_gca(v3d)) - return; - #if V3D_VERSION < 40 uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); @@ -80,6 +78,12 @@ v3d_invalidate_l2c(struct v3d_hw *v3d) V3D_CTL_0_L2CACTL_L2CENA_SET); } +enum v3d_l2t_cache_flush_mode { + V3D_CACHE_FLUSH_MODE_FLUSH, + V3D_CACHE_FLUSH_MODE_CLEAR, + V3D_CACHE_FLUSH_MODE_CLEAN, +}; + /* Invalidates texture L2 cachelines */ static void v3d_invalidate_l2t(struct v3d_hw *v3d) @@ -88,7 +92,23 @@ v3d_invalidate_l2t(struct v3d_hw *v3d) V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); V3D_WRITE(V3D_CTL_0_L2TCACTL, V3D_CTL_0_L2TCACTL_L2TFLS_SET | - (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); + (V3D_CACHE_FLUSH_MODE_FLUSH << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); +} + +/* + * Wait for l2tcactl, used for flushes. + * + * FIXME: for a multicore scenario we should pass here the core. All wrapper + * assumes just one core, so would be better to handle that on that case. 
+ */ +static UNUSED void v3d_core_wait_l2tcactl(struct v3d_hw *v3d, + uint32_t ctrl) +{ + assert(!(ctrl & ~(V3D_CTL_0_L2TCACTL_TMUWCF_SET | V3D_CTL_0_L2TCACTL_L2TFLS_SET))); + + while (V3D_READ(V3D_CTL_0_L2TCACTL) & ctrl) { + v3d_hw_tick(v3d); + } } /* Flushes dirty texture cachelines from the L1 write combiner */ @@ -98,7 +118,13 @@ v3d_flush_l1td(struct v3d_hw *v3d) V3D_WRITE(V3D_CTL_0_L2TCACTL, V3D_CTL_0_L2TCACTL_TMUWCF_SET); - assert(!(V3D_READ(V3D_CTL_0_L2TCACTL) & V3D_CTL_0_L2TCACTL_L2TFLS_SET)); + /* Note: here the kernel (and previous versions of the simulator + * wrapper) is using V3D_CTL_0_L2TCACTL_L2TFLS_SET, as with l2t. We + * understand that it makes more sense to do like this. We need to + * confirm which one is doing it correctly. So far things work fine on + * the simulator this way. + */ + v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_TMUWCF_SET); } /* Flushes dirty texture L2 cachelines */ @@ -109,9 +135,9 @@ v3d_flush_l2t(struct v3d_hw *v3d) V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); V3D_WRITE(V3D_CTL_0_L2TCACTL, V3D_CTL_0_L2TCACTL_L2TFLS_SET | - (2 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); + (V3D_CACHE_FLUSH_MODE_CLEAN << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); - assert(!(V3D_READ(V3D_CTL_0_L2TCACTL) & V3D_CTL_0_L2TCACTL_L2TFLS_SET)); + v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_L2TFLS_SET); } /* Invalidates the slice caches. These are read-only caches. */ @@ -184,6 +210,8 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_csd *args, uint32_t gmp_ofs) { + int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & + V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); g_gmp_ofs = gmp_ofs; v3d_reload_gmp(v3d); @@ -198,9 +226,13 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, /* CFG0 kicks off the job */ V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); - while (V3D_READ(V3D_CSD_0_STATUS) & - (V3D_CSD_0_STATUS_HAVE_CURRENT_DISPATCH_SET | - V3D_CSD_0_STATUS_HAVE_QUEUED_DISPATCH_SET)) { + /* Now we wait for the dispatch to finish. The safest way is to check + * if NUM_COMPLETED_JOBS has increased. Note that in spite of that + * name that register field is about the number of completed + * dispatches. + */ + while ((V3D_READ(V3D_CSD_0_STATUS) & + V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET) == last_completed_jobs) { v3d_hw_tick(v3d); } @@ -234,6 +266,9 @@ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: args->value = 1; return 0; + case DRM_V3D_PARAM_SUPPORTS_PERFMON: + args->value = V3D_VERSION >= 41; + return 0; } if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) { @@ -241,44 +276,139 @@ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, return 0; } - fprintf(stderr, "Unknown DRM_IOCTL_VC5_GET_PARAM(%lld)\n", + fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM(%lld)\n", (long long)args->value); abort(); } static struct v3d_hw *v3d_isr_hw; + +static void +v3d_isr_core(struct v3d_hw *v3d, + unsigned core) +{ + /* FIXME: so far we are assuming just one core, and using only the _0_ + * registers. If we add multiple-core on the simulator, we would need + * to pass core as a parameter, and chose the proper registers. 
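Both the CSD change in this hunk and the CL submit change further below replace condition polling with the same snapshot-and-compare pattern on a hardware completion counter (NUM_COMPLETED_JOBS here, the BFC/RFC frame counters for binning and rendering). A generic sketch of the pattern, with hypothetical reg_read/hw_tick stand-ins for the simulator's register accessors:

#include <stdint.h>

/* Hypothetical stand-ins for the simulator wrapper's accessors. */
uint32_t reg_read(uint32_t reg);
void hw_tick(void);

/* Wait for a masked counter field to advance past a value sampled
 * before submission. Comparing against a snapshot (rather than polling
 * a have-queued/have-current busy bit) tolerates the counter wrapping,
 * as long as at most one job completes per comparison. */
static void
wait_for_counter_change(uint32_t status_reg, uint32_t field_mask,
                        uint32_t before)
{
        while ((reg_read(status_reg) & field_mask) == before)
                hw_tick();
}

The caller samples the counter first, writes the kick register (V3D_CSD_0_QUEUED_CFG0 in the hunk above), and only then spins on the masked field changing.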
+ */ + assert(core == 0); + uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); + V3D_WRITE(V3D_CTL_0_INT_CLR, core_status); + + if (core_status & V3D_CTL_0_INT_STS_INT_OUTOMEM_SET) { + uint32_t size = 256 * 1024; + uint32_t offset = v3d_simulator_get_spill(size); + + v3d_reload_gmp(v3d); + + V3D_WRITE(V3D_PTB_0_BPOA, offset); + V3D_WRITE(V3D_PTB_0_BPOS, size); + return; + } + + if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); + abort(); + } else { + fprintf(stderr, + "Unexpected ISR with core status 0x%08x\n", + core_status); + } + abort(); +} + +static void +handle_mmu_interruptions(struct v3d_hw *v3d, + uint32_t hub_status) +{ + bool wrv = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET; + bool pti = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET; + bool cap = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET; + + if (!(pti || cap || wrv)) + return; + + const char *client = "?"; + uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID); + uint32_t va_width = 30; + +#if V3D_VERSION >= 41 + static const char *const v3d41_axi_ids[] = { + "L2T", + "PTB", + "PSE", + "TLB", + "CLE", + "TFU", + "MMU", + "GMP", + }; + + axi_id = axi_id >> 5; + if (axi_id < ARRAY_SIZE(v3d41_axi_ids)) + client = v3d41_axi_ids[axi_id]; + + uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO); + + va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET) + >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB); +#endif + /* Only the top bits (final number depends on the gen) of the virtual + * address are reported in the MMU VIO_ADDR register. + */ + uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) << + (va_width - 32)); + + /* Difference with the kernal: here were are going to abort after + * logging, so we don't bother with some stuff that the kernel does, + * like restoring the MMU ctrl bits + */ + + fprintf(stderr, "MMU error from client %s (%d) at 0x%llx%s%s%s\n", + client, axi_id, (long long) vio_addr, + wrv ? ", write violation" : "", + pti ? ", pte invalid" : "", + cap ? ", cap exceeded" : ""); + + abort(); +} + +static void +v3d_isr_hub(struct v3d_hw *v3d) +{ + uint32_t hub_status = V3D_READ(V3D_HUB_CTL_INT_STS); + + /* Acknowledge the interrupts we're handling here */ + V3D_WRITE(V3D_HUB_CTL_INT_CLR, hub_status); + + if (hub_status & V3D_HUB_CTL_INT_STS_INT_TFUC_SET) { + /* FIXME: we were not able to raise this exception. We let the + * unreachable here, so we could get one if it is raised on + * the future. In any case, note that for this case we would + * only be doing debugging log. 
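The MMU violation decode above reconstructs a usable address from V3D_MMU_VIO_ADDR, which holds only the top bits of the faulting virtual address; how many depends on the generation's VA width (30 bits baseline, plus the VA_WIDTH field read from V3D_MMU_DEBUG_INFO on 4.1+). A worked restatement of that shift, with the register value and width passed in directly:

#include <stdint.h>

static uint64_t
reconstruct_vio_addr(uint32_t vio_reg, uint32_t va_width)
{
        /* e.g. with va_width == 34 the 32-bit register holds
         * addr[33:2], so shifting left by va_width - 32 == 2 restores
         * the address magnitude; the dropped low bits are
         * unrecoverable and read back as zero. */
        return (uint64_t)vio_reg << (va_width - 32);
}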
+ */ + unreachable("TFU Conversion Complete interrupt not handled"); + } + + handle_mmu_interruptions(v3d, hub_status); +} + static void v3d_isr(uint32_t hub_status) { struct v3d_hw *v3d = v3d_isr_hw; + uint32_t mask = hub_status; - /* Check the per-core bits */ - if (hub_status & (1 << 0)) { - uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); - V3D_WRITE(V3D_CTL_0_INT_CLR, core_status); - - if (core_status & V3D_CTL_0_INT_STS_INT_OUTOMEM_SET) { - uint32_t size = 256 * 1024; - uint32_t offset = v3d_simulator_get_spill(size); - - v3d_reload_gmp(v3d); - - V3D_WRITE(V3D_PTB_0_BPOA, offset); - V3D_WRITE(V3D_PTB_0_BPOS, size); - return; - } - - if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { - fprintf(stderr, "GMP violation at 0x%08x\n", - V3D_READ(V3D_GMP_VIO_ADDR)); - abort(); - } else { - fprintf(stderr, - "Unexpected ISR with core status 0x%08x\n", - core_status); - } - abort(); + /* Check the hub_status bits */ + while (mask) { + unsigned core = u_bit_scan(&mask); + + if (core == v3d_hw_get_hub_core()) + v3d_isr_hub(v3d); + else + v3d_isr_core(v3d, core); } return; @@ -299,11 +429,24 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); #endif + /* FIXME: the kernel captures some additional core interrupts here, + * for tracing. Perhaps we should evaluate to do the same here and add + * some debug options. + */ uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); + uint32_t hub_interrupts = + (V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET | /* write violation */ + V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET | /* page table invalid */ + V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ + V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ + + V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); + V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); + v3d_isr_hw = v3d; v3d_hw_set_isr(v3d, v3d_isr); } @@ -313,6 +456,12 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_cl *submit, uint32_t gmp_ofs) { + int last_bfc = (V3D_READ(V3D_CLE_0_BFC) & + V3D_CLE_0_BFC_BMFCT_SET); + + int last_rfc = (V3D_READ(V3D_CLE_0_RFC) & + V3D_CLE_0_RFC_RMFCT_SET); + g_gmp_ofs = gmp_ofs; v3d_reload_gmp(v3d); @@ -336,8 +485,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, * scheduler implements this using the GPU scheduler blocking on the * bin fence completing. (We don't use HW semaphores). 
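The reworked v3d_isr() above treats the incoming status word as a bitmask and dispatches one handler per set bit, with one bit reserved for the hub (v3d_hw_get_hub_core()) and the rest treated as per-core interrupts. A self-contained sketch of that dispatch loop; pop_lowest_bit stands in for Mesa's u_bit_scan() and assumes a GCC/Clang builtin, and both handlers are hypothetical:

#include <stdint.h>

/* Pop the lowest set bit of *mask and return its index. */
static unsigned
pop_lowest_bit(uint32_t *mask)
{
        unsigned i = (unsigned)__builtin_ctz(*mask);
        *mask &= *mask - 1;
        return i;
}

static void
isr_dispatch(uint32_t status, unsigned hub_bit,
             void (*hub_handler)(void),
             void (*core_handler)(unsigned core))
{
        uint32_t mask = status;
        while (mask) {
                unsigned bit = pop_lowest_bit(&mask);
                if (bit == hub_bit)
                        hub_handler();
                else
                        core_handler(bit);
        }
}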
*/ - while (V3D_READ(V3D_CLE_0_CT0CA) != - V3D_READ(V3D_CLE_0_CT0EA)) { + while ((V3D_READ(V3D_CLE_0_BFC) & + V3D_CLE_0_BFC_BMFCT_SET) == last_bfc) { v3d_hw_tick(v3d); } @@ -346,12 +495,55 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start); V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end); - while (V3D_READ(V3D_CLE_0_CT1CA) != - V3D_READ(V3D_CLE_0_CT1EA) || - V3D_READ(V3D_CLE_1_CT1CA) != - V3D_READ(V3D_CLE_1_CT1EA)) { + while ((V3D_READ(V3D_CLE_0_RFC) & + V3D_CLE_0_RFC_RMFCT_SET) == last_rfc) { v3d_hw_tick(v3d); } } +#if V3D_VERSION >= 41 +#define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x)) +#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) +#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) +#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ + V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) +#endif + +void +v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, + uint32_t ncounters, + uint8_t *events) +{ +#if V3D_VERSION >= 41 + int i, j; + uint32_t source; + uint32_t mask = BITFIELD_RANGE(0, ncounters); + + for (i = 0; i < ncounters; i+=4) { + source = i / 4; + uint32_t channels = 0; + for (j = 0; j < 4 && (i + j) < ncounters; j++) + channels |= events[i + j] << V3D_PCTR_0_SRC_N_SHIFT(j); + V3D_WRITE(V3D_PCTR_0_SRC_N(source), channels); + } + V3D_WRITE(V3D_PCTR_0_CLR, mask); + V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask); + V3D_WRITE(V3D_PCTR_0_EN, mask); +#endif +} + +void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + uint32_t ncounters, + uint64_t *values) +{ +#if V3D_VERSION >= 41 + int i; + + for (i = 0; i < ncounters; i++) + values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i)); + + V3D_WRITE(V3D_PCTR_0_EN, 0); +#endif +} + #endif /* USE_V3D_SIMULATOR */ diff --git a/lib/mesa/src/broadcom/simulator/v3dx_simulator.h b/lib/mesa/src/broadcom/simulator/v3dx_simulator.h index 2c623d79a..145ae59c2 100644 --- a/lib/mesa/src/broadcom/simulator/v3dx_simulator.h +++ b/lib/mesa/src/broadcom/simulator/v3dx_simulator.h @@ -44,3 +44,9 @@ int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, int v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_csd *args, uint32_t gmp_offset); +void v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, + uint32_t ncounters, + uint8_t *events); +void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, + uint32_t ncounters, + uint64_t *values); diff --git a/lib/mesa/src/broadcom/vulkan/meson.build b/lib/mesa/src/broadcom/vulkan/meson.build index 88bee8c13..a1cc58637 100644 --- a/lib/mesa/src/broadcom/vulkan/meson.build +++ b/lib/mesa/src/broadcom/vulkan/meson.build @@ -25,32 +25,11 @@ v3dv_entrypoints = custom_target( command : [ prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', + '--device-prefix', 'ver42', ], depend_files : vk_entrypoints_gen_depend_files, ) -v3dv_extensions_c = custom_target( - 'v3dv_extensions.c', - input : ['v3dv_extensions_gen.py', vk_api_xml], - output : 'v3dv_extensions.c', - command : [ - prog_python, '@INPUT0@', '--xml', '@INPUT1@', - '--out-c', '@OUTPUT@', - ], - depend_files : [files('v3dv_extensions.py'), vk_extensions_gen], -) - -v3dv_extensions_h = custom_target( - 'v3dv_extensions.h', - input : ['v3dv_extensions_gen.py', vk_api_xml], - output : 'v3dv_extensions.h', - command : [ - prog_python, '@INPUT0@', '--xml', '@INPUT1@', - '--out-h', '@OUTPUT@', - ], - depend_files : [files('v3dv_extensions.py'), vk_extensions_gen], -) - libv3dv_files = files( 'v3dv_bo.c', 'v3dv_cl.c', @@ 
-71,15 +50,27 @@ libv3dv_files = files( 'v3dv_query.c', 'v3dv_queue.c', 'v3dv_uniforms.c', - 'v3dv_util.c', 'v3dv_wsi.c', - 'v3d_tiling.c', +) + +files_per_version = files( + 'v3dvx_cmd_buffer.c', + 'v3dvx_descriptor_set.c', + 'v3dvx_device.c', + 'v3dvx_formats.c', + 'v3dvx_image.c', + 'v3dvx_pipeline.c', + 'v3dvx_meta_common.c', + 'v3dvx_pipeline.c', + 'v3dvx_queue.c', ) # The vulkan driver only supports version >= 42, which is the version present in # Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d # driver. -v3dv_flags = ['-DV3D_VERSION=42'] +v3d_versions = ['42'] + +v3dv_flags = [] dep_v3dv3 = dependency('v3dv3', required : false) if dep_v3dv3.found() @@ -94,39 +85,43 @@ v3dv_deps = [ idep_nir, idep_nir_headers, idep_vulkan_util, + idep_vulkan_wsi, ] if with_platform_x11 v3dv_deps += dep_xcb_dri3 - v3dv_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libv3dv_files += files('v3dv_wsi_x11.c') endif if with_platform_wayland v3dv_deps += [dep_wayland_client, dep_wl_protocols] - v3dv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libv3dv_files += files('v3dv_wsi_wayland.c') libv3dv_files += [wayland_drm_client_protocol_h, wayland_drm_protocol_c] endif -if system_has_kms_drm and not with_platform_android - v3dv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' - libv3dv_files += files('v3dv_wsi_display.c') -endif +per_version_libs = [] +foreach ver : v3d_versions + per_version_libs += static_library( + 'v3dv-v' + ver, + [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]], + include_directories : [ + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, + inc_compiler, inc_util, + ], + c_args : [v3dv_flags, '-DV3D_VERSION=' + ver], + gnu_symbol_visibility : 'hidden', + dependencies : [v3dv_deps], +) +endforeach libvulkan_broadcom = shared_library( 'vulkan_broadcom', - [libv3dv_files, v3dv_entrypoints, v3dv_extensions_c, v3dv_extensions_h, sha1_h], + [libv3dv_files, v3dv_entrypoints, sha1_h], include_directories : [ - inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, inc_vulkan_wsi, + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, ], link_with : [ libbroadcom_cle, libbroadcom_v3d, - libvulkan_wsi, + per_version_libs, ], dependencies : v3dv_deps, c_args : v3dv_flags, @@ -150,14 +145,15 @@ endif broadcom_icd = custom_target( 'broadcom_icd', - input : 'v3dv_icd.py', + input : [vk_icd_gen, vk_api_xml], output : 'broadcom_icd.@0@.json'.format(host_machine.cpu()), command : [ - prog_python, '@INPUT@', - '--lib-path', join_paths(get_option('prefix'), get_option('libdir')), + prog_python, '@INPUT0@', + '--api-version', '1.0', '--xml', '@INPUT1@', + '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), + 'libvulkan_broadcom.so'), '--out', '@OUTPUT@', ], - depend_files : files('v3dv_extensions.py'), build_by_default : true, install_dir : with_vulkan_icd_dir, install : true, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c index 459032990..71679ceec 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c @@ -184,6 +184,7 @@ v3dv_bo_init(struct v3dv_bo *bo, bool private) { bo->handle = handle; + bo->handle_bit = 1ull << (handle % 64); bo->size = size; bo->offset = offset; bo->map = NULL; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_bo.h b/lib/mesa/src/broadcom/vulkan/v3dv_bo.h index fd6754c48..ab2b8c735 100644 --- 
a/lib/mesa/src/broadcom/vulkan/v3dv_bo.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_bo.h @@ -30,6 +30,7 @@ struct v3dv_bo { struct list_head list_link; uint32_t handle; + uint64_t handle_bit; uint32_t size; uint32_t offset; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cl.c b/lib/mesa/src/broadcom/vulkan/v3dv_cl.c index e9674b6c5..ed11f53c4 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_cl.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_cl.c @@ -22,6 +22,13 @@ */ #include "v3dv_private.h" + +/* We don't expect that the packets we use in this file change across hw + * versions, so we just explicitly set the V3D_VERSION and include v3dx_pack + * here + */ +#define V3D_VERSION 33 +#include "broadcom/common/v3d_macros.h" #include "broadcom/cle/v3dx_pack.h" void @@ -72,10 +79,10 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch) cl_emit(cl, BRANCH, branch) { branch.address = v3dv_cl_address(bo, 0); } + } else { + v3dv_job_add_bo_unchecked(cl->job, bo); } - v3dv_job_add_bo(cl->job, bo); - cl->bo = bo; cl->base = cl->bo->map; cl->size = cl->bo->size; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h index a6a38b4aa..68d5acd45 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h @@ -46,6 +46,16 @@ struct v3dv_cl_reloc { uint32_t offset; }; +static inline void +pack_emit_reloc(void *cl, const void *reloc) {} + +#define __gen_user_data struct v3dv_cl +#define __gen_address_type struct v3dv_cl_reloc +#define __gen_address_offset(reloc) (((reloc)->bo ? (reloc)->bo->offset : 0) + \ + (reloc)->offset) +#define __gen_emit_reloc cl_pack_emit_reloc +#define __gen_unpack_address(cl, s, e) __unpack_address(cl, s, e) + struct v3dv_cl { void *base; struct v3dv_job *job; @@ -194,7 +204,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space); * Helper function called by the XML-generated pack functions for filling in * an address field in shader records. * - * Since we have a private address space as of VC5, our BOs can have lifelong + * Since we have a private address space as of V3D, our BOs can have lifelong * offsets, and all the kernel needs to know is which BOs need to be paged in * for this exec. 
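The new handle_bit/bo_handle_mask pair introduced in these BO hunks, and used by the v3dv_job_add_bo change further below, is a one-word Bloom filter over GEM handles: a clear bit at position (handle % 64) proves the BO was never added, so the hash-set search can be skipped; a set bit may be a collision and must still be confirmed in the set. A sketch of the membership test with the job struct pared down to what the filter needs:

#include <stdbool.h>
#include <stdint.h>

struct job_bos {
        uint64_t handle_mask;  /* bit (handle % 64) set per added BO */
        /* ... hash set of BOs elided ... */
};

/* False means definitely absent; true means "check the hash set". */
static bool
job_may_contain(const struct job_bos *job, uint32_t handle)
{
        return (job->handle_mask & (1ull << (handle % 64))) != 0;
}

The unchecked add variant in the hunk below exploits the other direction: when the caller has just created the BO it cannot already be in the job, so both the filter test and the set search are skipped.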
*/ @@ -213,7 +223,7 @@ cl_pack_emit_reloc(struct v3dv_cl *cl, const struct v3dv_cl_reloc *reloc) #define cl_emit_prepacked(cl, packet) \ cl_emit_prepacked_sized(cl, packet, sizeof(*(packet))) -#define v3dv_pack(packed, packet, name) \ +#define v3dvx_pack(packed, packet, name) \ for (struct cl_packet_struct(packet) name = { \ cl_packet_header(packet) \ }, \ diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c index 6cb9de28a..ff914e048 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -22,8 +22,6 @@ */ #include "v3dv_private.h" -#include "broadcom/cle/v3dx_pack.h" -#include "util/half_float.h" #include "util/u_pack_color.h" #include "vk_format_info.h" #include "vk_util.h" @@ -57,6 +55,7 @@ const struct v3dv_dynamic_state default_dynamic_state = { .slope_factor = 0.0f, }, .line_width = 1.0f, + .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1, }; void @@ -65,17 +64,26 @@ v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo) if (!bo) return; - if (_mesa_set_search(job->bos, bo)) - return; + if (job->bo_handle_mask & bo->handle_bit) { + if (_mesa_set_search(job->bos, bo)) + return; + } _mesa_set_add(job->bos, bo); job->bo_count++; + job->bo_handle_mask |= bo->handle_bit; } -static void -cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer); +void +v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo) +{ + assert(bo); + _mesa_set_add(job->bos, bo); + job->bo_count++; + job->bo_handle_mask |= bo->handle_bit; +} -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -90,7 +98,7 @@ v3dv_CreateCommandPool(VkDevice _device, pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_COMMAND_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) pool->alloc = *pAllocator; @@ -114,7 +122,7 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, * buffer reset that would reset the loader's dispatch table for the * command buffer, and any other relevant info from vk_object_base */ - const uint32_t base_size = sizeof(struct vk_object_base); + const uint32_t base_size = sizeof(struct vk_command_buffer); uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size; memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size); @@ -142,12 +150,20 @@ cmd_buffer_create(struct v3dv_device *device, VkCommandBuffer *pCommandBuffer) { struct v3dv_cmd_buffer *cmd_buffer; - cmd_buffer = vk_object_zalloc(&device->vk, - &pool->alloc, - sizeof(*cmd_buffer), - VK_OBJECT_TYPE_COMMAND_BUFFER); + cmd_buffer = vk_zalloc2(&device->vk.alloc, + &pool->alloc, + sizeof(*cmd_buffer), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result; + result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer); + return result; + } cmd_buffer_init(cmd_buffer, device, pool, level); @@ -332,18 +348,9 @@ cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer) { list_del(&cmd_buffer->pool_link); cmd_buffer_free_resources(cmd_buffer); - vk_object_free(&cmd_buffer->device->vk, 
&cmd_buffer->pool->alloc, cmd_buffer); -} - -void -v3dv_job_emit_binning_flush(struct v3dv_job *job) -{ - assert(job); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH)); - v3dv_return_if_oom(NULL, job); - - cl_emit(&job->bcl, FLUSH, flush); + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc, + cmd_buffer); } static bool @@ -402,6 +409,13 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx]; struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx]; + /* Don't merge if the subpasses have different view masks, since in that + * case the framebuffer setup is different and we need to emit different + * RCLs. + */ + if (subpass->view_mask != prev_subpass->view_mask) + return false; + /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED, * we need to check that for each subpass all its used attachments are * used by the other subpass. @@ -517,6 +531,7 @@ v3dv_job_start_frame(struct v3dv_job *job, uint32_t width, uint32_t height, uint32_t layers, + bool allocate_tile_state_for_all_layers, uint32_t render_target_count, uint8_t max_internal_bpp, bool msaa) @@ -532,6 +547,16 @@ v3dv_job_start_frame(struct v3dv_job *job, v3dv_cl_ensure_space_with_branch(&job->bcl, 256); v3dv_return_if_oom(NULL, job); + /* We only need to allocate tile state for all layers if the binner + * writes primitives to layers other than the first. This can only be + * done using layered rendering (writing gl_Layer from a geometry shader), + * so for other cases of multilayered framebuffers (typically with + * meta copy/clear operations) that won't use layered rendering, we only + * need one layer worth of of tile state for the binner. + */ + if (!allocate_tile_state_for_all_layers) + layers = 1; + /* The PTB will request the tile alloc initial size per tile at start * of tile binning. */ @@ -561,7 +586,7 @@ v3dv_job_start_frame(struct v3dv_job *job, return; } - v3dv_job_add_bo(job, job->tile_alloc); + v3dv_job_add_bo_unchecked(job, job->tile_alloc); const uint32_t tsda_per_tile_size = 256; const uint32_t tile_state_size = tiling->layers * @@ -574,33 +599,12 @@ v3dv_job_start_frame(struct v3dv_job *job, return; } - v3dv_job_add_bo(job, job->tile_state); + v3dv_job_add_bo_unchecked(job, job->tile_state); - /* This must go before the binning mode configuration. It is - * required for layered framebuffers to work. - */ - cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) { - config.number_of_layers = layers; - } - - cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { - config.width_in_pixels = tiling->width; - config.height_in_pixels = tiling->height; - config.number_of_render_targets = MAX2(tiling->render_target_count, 1); - config.multisample_mode_4x = tiling->msaa; - config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; - } - - /* There's definitely nothing in the VCD cache we want. */ - cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); - - /* "Binning mode lists must have a Start Tile Binning item (6) after - * any prefix state data before the binning list proper starts." 
- */ - cl_emit(&job->bcl, START_TILE_BINNING, bin); + v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers); - job->ez_state = VC5_EZ_UNDECIDED; - job->first_ez_state = VC5_EZ_UNDECIDED; + job->ez_state = V3D_EZ_UNDECIDED; + job->first_ez_state = V3D_EZ_UNDECIDED; } static void @@ -617,19 +621,9 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) * any RCL commands of its own. */ if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0) - cmd_buffer_emit_render_pass_rcl(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer); - v3dv_job_emit_binning_flush(cmd_buffer->state.job); -} - -static void -cmd_buffer_end_render_pass_secondary(struct v3dv_cmd_buffer *cmd_buffer) -{ - assert(cmd_buffer->state.job); - v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl, - cl_packet_length(RETURN_FROM_SUB_LIST)); - v3dv_return_if_oom(cmd_buffer, NULL); - cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret); + v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job); } struct v3dv_job * @@ -716,7 +710,7 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer_end_render_pass_frame(cmd_buffer); } else { assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY); - cmd_buffer_end_render_pass_secondary(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer); } } @@ -803,7 +797,7 @@ v3dv_job_init(struct v3dv_job *job, v3dv_cl_init(job, &job->indirect); - if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH) + if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)) job->always_flush = true; } @@ -821,6 +815,7 @@ v3dv_job_init(struct v3dv_job *job, * bits. */ cmd_buffer->state.dirty = ~0; + cmd_buffer->state.dirty_descriptor_stages = ~0; /* Honor inheritance of occlussion queries in secondaries if requested */ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && @@ -879,6 +874,7 @@ static VkResult cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, VkCommandBufferResetFlags flags) { + vk_command_buffer_reset(&cmd_buffer->vk); if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) { struct v3dv_device *device = cmd_buffer->device; struct v3dv_cmd_pool *pool = cmd_buffer->pool; @@ -902,7 +898,7 @@ cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo, VkCommandBuffer *pCommandBuffers) @@ -930,7 +926,7 @@ v3dv_AllocateCommandBuffers(VkDevice _device, return result; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount, @@ -946,7 +942,7 @@ v3dv_FreeCommandBuffers(VkDevice device, } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, const VkAllocationCallbacks *pAllocator) @@ -965,7 +961,7 @@ v3dv_DestroyCommandPool(VkDevice _device, vk_object_free(&device->vk, pAllocator, pool); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags) @@ -1026,34 +1022,37 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx]; struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx]; - VkImageResolve region = { + VkImageResolve2KHR region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR, .srcSubresource = { 
VK_IMAGE_ASPECT_COLOR_BIT, - src_iview->base_level, - src_iview->first_layer, - src_iview->last_layer - src_iview->first_layer + 1, + src_iview->vk.base_mip_level, + src_iview->vk.base_array_layer, + src_iview->vk.layer_count, }, .srcOffset = { 0, 0, 0 }, .dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, - dst_iview->base_level, - dst_iview->first_layer, - dst_iview->last_layer - dst_iview->first_layer + 1, + dst_iview->vk.base_mip_level, + dst_iview->vk.base_array_layer, + dst_iview->vk.layer_count, }, .dstOffset = { 0, 0, 0 }, - .extent = src_iview->image->extent, + .extent = src_iview->vk.image->extent, }; - VkImage src_image_handle = - v3dv_image_to_handle((struct v3dv_image *) src_iview->image); - VkImage dst_image_handle = - v3dv_image_to_handle((struct v3dv_image *) dst_iview->image); - v3dv_CmdResolveImage(cmd_buffer_handle, - src_image_handle, - VK_IMAGE_LAYOUT_GENERAL, - dst_image_handle, - VK_IMAGE_LAYOUT_GENERAL, - 1, ®ion); + struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; + struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; + VkResolveImageInfo2KHR resolve_info = { + .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR, + .srcImage = v3dv_image_to_handle(src_image), + .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .dstImage = v3dv_image_to_handle(dst_image), + .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, + .regionCount = 1, + .pRegions = ®ion, + }; + v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info); } cmd_buffer->state.framebuffer = restore_fb; @@ -1116,7 +1115,7 @@ cmd_buffer_begin_render_pass_secondary( return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) { @@ -1149,7 +1148,7 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags) { @@ -1157,7 +1156,7 @@ v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer, return cmd_buffer_reset(cmd_buffer, flags); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags) @@ -1176,22 +1175,6 @@ v3dv_ResetCommandPool(VkDevice device, } static void -emit_clip_window(struct v3dv_job *job, const VkRect2D *rect) -{ - assert(job); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW)); - v3dv_return_if_oom(NULL, job); - - cl_emit(&job->bcl, CLIP_WINDOW, clip) { - clip.clip_window_left_pixel_coordinate = rect->offset.x; - clip.clip_window_bottom_pixel_coordinate = rect->offset.y; - clip.clip_window_width_in_pixels = rect->extent.width; - clip.clip_window_height_in_pixels = rect->extent.height; - } -} - -static void cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) { /* Render areas and scissor/viewport are only relevant inside render passes, @@ -1206,7 +1189,7 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) */ assert(cmd_buffer->state.framebuffer); cmd_buffer->state.tile_aligned_render_area = - v3dv_subpass_area_is_tile_aligned(rect, + v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect, cmd_buffer->state.framebuffer, cmd_buffer->state.pass, cmd_buffer->state.subpass_idx); @@ -1218,42 +1201,6 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) } } -void -v3dv_get_hw_clear_color(const VkClearColorValue *color, - uint32_t internal_type, - uint32_t internal_size, - 
uint32_t *hw_color) -{ - union util_color uc; - switch (internal_type) { - case V3D_INTERNAL_TYPE_8: - util_pack_color(color->float32, PIPE_FORMAT_R8G8B8A8_UNORM, &uc); - memcpy(hw_color, uc.ui, internal_size); - break; - case V3D_INTERNAL_TYPE_8I: - case V3D_INTERNAL_TYPE_8UI: - hw_color[0] = ((color->uint32[0] & 0xff) | - (color->uint32[1] & 0xff) << 8 | - (color->uint32[2] & 0xff) << 16 | - (color->uint32[3] & 0xff) << 24); - break; - case V3D_INTERNAL_TYPE_16F: - util_pack_color(color->float32, PIPE_FORMAT_R16G16B16A16_FLOAT, &uc); - memcpy(hw_color, uc.ui, internal_size); - break; - case V3D_INTERNAL_TYPE_16I: - case V3D_INTERNAL_TYPE_16UI: - hw_color[0] = ((color->uint32[0] & 0xffff) | color->uint32[1] << 16); - hw_color[1] = ((color->uint32[2] & 0xffff) | color->uint32[3] << 16); - break; - case V3D_INTERNAL_TYPE_32F: - case V3D_INTERNAL_TYPE_32I: - case V3D_INTERNAL_TYPE_32UI: - memcpy(hw_color, color->uint32, internal_size); - break; - } -} - static void cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer, uint32_t attachment_idx, @@ -1265,18 +1212,19 @@ cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer, &cmd_buffer->state.pass->attachments[attachment_idx]; uint32_t internal_type, internal_bpp; - const struct v3dv_format *format = v3dv_get_format(attachment->desc.format); - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, - &internal_type, - &internal_bpp); + const struct v3dv_format *format = + v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format); + + v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format) + (format->rt_type, &internal_type, &internal_bpp); uint32_t internal_size = 4 << internal_bpp; struct v3dv_cmd_buffer_attachment_state *attachment_state = &cmd_buffer->state.attachments[attachment_idx]; - v3dv_get_hw_clear_color(color, internal_type, internal_size, - &attachment_state->clear_value.color[0]); + v3dv_X(cmd_buffer->device, get_hw_clear_color) + (color, internal_type, internal_size, &attachment_state->clear_value.color[0]); attachment_state->vk_clear_value.color = *color; } @@ -1370,7 +1318,7 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffe assert(state->attachment_alloc_count >= pass->attachment_count); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo *pRenderPassBegin, VkSubpassContents contents) @@ -1394,7 +1342,7 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, * to emit a new clip window to constraint it to the render area. 
*/ uint32_t min_render_x = state->render_area.offset.x; - uint32_t min_render_y = state->render_area.offset.x; + uint32_t min_render_y = state->render_area.offset.y; uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1; uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1; uint32_t min_clip_x = state->clip_window.offset.x; @@ -1410,7 +1358,7 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_subpass_start(cmd_buffer, 0); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -1426,884 +1374,6 @@ v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1); } -void -v3dv_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - if (rt >= subpass->color_count) - return; - - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; - - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - assert(attachment_idx < framebuffer->attachment_count); - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - assert(iview->aspects & VK_IMAGE_ASPECT_COLOR_BIT); - - *rt_bpp = iview->internal_bpp; - *rt_type = iview->internal_type; - *rt_clamp =vk_format_is_int(iview->vk_format) ? - V3D_RENDER_TARGET_CLAMP_INT : V3D_RENDER_TARGET_CLAMP_NONE; -} - -static void -cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - struct v3dv_image_view *iview, - uint32_t layer, - uint32_t buffer) -{ - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - uint32_t layer_offset = v3dv_layer_offset(image, - iview->base_level, - iview->first_layer + layer); - - cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { - load.buffer_to_load = buffer; - load.address = v3dv_cl_address(image->mem->bo, layer_offset); - - load.input_image_format = iview->format->rt_type; - load.r_b_swap = iview->swap_rb; - load.memory_format = slice->tiling; - - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - load.height_in_ub_or_stride = - slice->padded_height_of_output_image_in_uif_blocks; - } else if (slice->tiling == VC5_TILING_RASTER) { - load.height_in_ub_or_stride = slice->stride; - } - - if (image->samples > VK_SAMPLE_COUNT_1_BIT) - load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; - else - load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; - } -} - -static bool -check_needs_load(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t att_first_subpass_idx, - VkAttachmentLoadOp load_op) -{ - /* We call this with image->aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment load operations apply on the first subpass that uses the - * attachment, otherwise we always need to load. 
- */ - if (state->job->first_subpass > att_first_subpass_idx) - return true; - - /* If the job is continuing a subpass started in another job, we always - * need to load. - */ - if (state->job->is_subpass_continue) - return true; - - /* If the area is not aligned to tile boundaries, we always need to load */ - if (!state->tile_aligned_render_area) - return true; - - /* The attachment load operations must be LOAD */ - return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; -} - -static bool -check_needs_clear(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t att_first_subpass_idx, - VkAttachmentLoadOp load_op, - bool do_clear_with_draw) -{ - /* We call this with image->aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* If the aspect needs to be cleared with a draw call then we won't emit - * the clear here. - */ - if (do_clear_with_draw) - return false; - - /* If this is resuming a subpass started with another job, then attachment - * load operations don't apply. - */ - if (state->job->is_subpass_continue) - return false; - - /* If the render area is not aligned to tile boudaries we can't use the - * TLB for a clear. - */ - if (!state->tile_aligned_render_area) - return false; - - /* If this job is running in a subpass other than the first subpass in - * which this attachment is used then attachment load operations don't apply. - */ - if (state->job->first_subpass != att_first_subpass_idx) - return false; - - /* The attachment load operation must be CLEAR */ - return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR; -} - -static bool -check_needs_store(const struct v3dv_cmd_buffer_state *state, - VkImageAspectFlags aspect, - uint32_t att_last_subpass_idx, - VkAttachmentStoreOp store_op) -{ - /* We call this with image->aspects & aspect, so 0 means the aspect we are - * testing does not exist in the image. - */ - if (!aspect) - return false; - - /* Attachment store operations only apply on the last subpass where the - * attachment is used, in other subpasses we always need to store. - */ - if (state->subpass_idx < att_last_subpass_idx) - return true; - - /* Attachment store operations only apply on the last job we emit on the the - * last subpass where the attachment is used, otherwise we always need to - * store. - */ - if (!state->job->is_subpass_finish) - return true; - - /* The attachment store operation must be STORE */ - return store_op == VK_ATTACHMENT_STORE_OP_STORE; -} - -static void -cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - uint32_t layer) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - const struct v3dv_render_pass *pass = state->pass; - const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; - - for (uint32_t i = 0; i < subpass->color_count; i++) { - uint32_t attachment_idx = subpass->color_attachments[i].attachment; - - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - const struct v3dv_render_pass_attachment *attachment = - &state->pass->attachments[attachment_idx]; - - /* According to the Vulkan spec: - * - * "The load operation for each sample in an attachment happens before - * any recorded command which accesses the sample in the first subpass - * where the attachment is used." 
- * - * If the load operation is CLEAR, we must only clear once on the first - * subpass that uses the attachment (and in that case we don't LOAD). - * After that, we always want to load so we don't lose any rendering done - * by a previous subpass to the same attachment. We also want to load - * if the current job is continuing subpass work started by a previous - * job, for the same reason. - * - * If the render area is not aligned to tile boundaries then we have - * tiles which are partially covered by it. In this case, we need to - * load the tiles so we can preserve the pixels that are outside the - * render area for any such tiles. - */ - bool needs_load = check_needs_load(state, - VK_IMAGE_ASPECT_COLOR_BIT, - attachment->first_subpass, - attachment->desc.loadOp); - if (needs_load) { - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview, - layer, RENDER_TARGET_0 + i); - } - } - - uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; - if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { - const struct v3dv_render_pass_attachment *ds_attachment = - &state->pass->attachments[ds_attachment_idx]; - - const VkImageAspectFlags ds_aspects = - vk_format_aspects(ds_attachment->desc.format); - - const bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp); - - const bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.stencilLoadOp); - - if (needs_depth_load || needs_stencil_load) { - struct v3dv_image_view *iview = - framebuffer->attachments[ds_attachment_idx]; - /* From the Vulkan spec: - * - * "When an image view of a depth/stencil image is used as a - * depth/stencil framebuffer attachment, the aspectMask is ignored - * and both depth and stencil image subresources are used." - * - * So we ignore the aspects from the subresource range of the image - * view for the depth/stencil attachment, but we still need to restrict - * the to aspects compatible with the render pass and the image. 
- */ - const uint32_t zs_buffer = - v3dv_zs_buffer(needs_depth_load, needs_stencil_load); - cmd_buffer_render_pass_emit_load(cmd_buffer, cl, - iview, layer, zs_buffer); - } - } - - cl_emit(cl, END_OF_LOADS, end); -} - -static void -cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - uint32_t attachment_idx, - uint32_t layer, - uint32_t buffer, - bool clear, - bool is_multisample_resolve) -{ - const struct v3dv_image_view *iview = - cmd_buffer->state.framebuffer->attachments[attachment_idx]; - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - uint32_t layer_offset = v3dv_layer_offset(image, - iview->base_level, - iview->first_layer + layer); - - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = buffer; - store.address = v3dv_cl_address(image->mem->bo, layer_offset); - store.clear_buffer_being_stored = clear; - - store.output_image_format = iview->format->rt_type; - store.r_b_swap = iview->swap_rb; - store.memory_format = slice->tiling; - - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - store.height_in_ub_or_stride = - slice->padded_height_of_output_image_in_uif_blocks; - } else if (slice->tiling == VC5_TILING_RASTER) { - store.height_in_ub_or_stride = slice->stride; - } - - if (image->samples > VK_SAMPLE_COUNT_1_BIT) - store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; - else if (is_multisample_resolve) - store.decimate_mode = V3D_DECIMATE_MODE_4X; - else - store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; - } -} - -static void -cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - uint32_t layer) -{ - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - bool has_stores = false; - bool use_global_zs_clear = false; - bool use_global_rt_clear = false; - - /* FIXME: separate stencil */ - uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; - if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { - const struct v3dv_render_pass_attachment *ds_attachment = - &state->pass->attachments[ds_attachment_idx]; - - assert(state->job->first_subpass >= ds_attachment->first_subpass); - assert(state->subpass_idx >= ds_attachment->first_subpass); - assert(state->subpass_idx <= ds_attachment->last_subpass); - - /* From the Vulkan spec, VkImageSubresourceRange: - * - * "When an image view of a depth/stencil image is used as a - * depth/stencil framebuffer attachment, the aspectMask is ignored - * and both depth and stencil image subresources are used." - * - * So we ignore the aspects from the subresource range of the image - * view for the depth/stencil attachment, but we still need to restrict - * the to aspects compatible with the render pass and the image. 
- */ - const VkImageAspectFlags aspects = - vk_format_aspects(ds_attachment->desc.format); - - /* Only clear once on the first subpass that uses the attachment */ - bool needs_depth_clear = - check_needs_clear(state, - aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp, - subpass->do_depth_clear_with_draw); - - bool needs_stencil_clear = - check_needs_clear(state, - aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.stencilLoadOp, - subpass->do_stencil_clear_with_draw); - - /* Skip the last store if it is not required */ - bool needs_depth_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.storeOp); - - bool needs_stencil_store = - check_needs_store(state, - aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.stencilStoreOp); - - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. - */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); - if (needs_depth_store || needs_stencil_store) { - const uint32_t zs_buffer = - v3dv_zs_buffer(needs_depth_store, needs_stencil_store); - cmd_buffer_render_pass_emit_store(cmd_buffer, cl, - ds_attachment_idx, layer, - zs_buffer, false, false); - has_stores = true; - } - } - - for (uint32_t i = 0; i < subpass->color_count; i++) { - uint32_t attachment_idx = subpass->color_attachments[i].attachment; - - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - const struct v3dv_render_pass_attachment *attachment = - &state->pass->attachments[attachment_idx]; - - assert(state->job->first_subpass >= attachment->first_subpass); - assert(state->subpass_idx >= attachment->first_subpass); - assert(state->subpass_idx <= attachment->last_subpass); - - /* Only clear once on the first subpass that uses the attachment */ - bool needs_clear = - check_needs_clear(state, - VK_IMAGE_ASPECT_COLOR_BIT, - attachment->first_subpass, - attachment->desc.loadOp, - false); - - /* Skip the last store if it is not required */ - bool needs_store = - check_needs_store(state, - VK_IMAGE_ASPECT_COLOR_BIT, - attachment->last_subpass, - attachment->desc.storeOp); - - /* If we need to resolve this attachment emit that store first. 
Notice - * that we must not request a tile buffer clear here in that case, since - * that would clear the tile buffer before we get to emit the actual - * color attachment store below, since the clear happens after the - * store is completed. - * - * If the attachment doesn't support TLB resolves then we will have to - * fallback to doing the resolve in a shader separately after this - * job, so we will need to store the multisampled sttachment even if that - * wansn't requested by the client. - */ - const bool needs_resolve = - subpass->resolve_attachments && - subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; - if (needs_resolve && attachment->use_tlb_resolve) { - const uint32_t resolve_attachment_idx = - subpass->resolve_attachments[i].attachment; - cmd_buffer_render_pass_emit_store(cmd_buffer, cl, - resolve_attachment_idx, layer, - RENDER_TARGET_0 + i, - false, true); - has_stores = true; - } else if (needs_resolve) { - needs_store = true; - } - - /* Emit the color attachment store if needed */ - if (needs_store) { - cmd_buffer_render_pass_emit_store(cmd_buffer, cl, - attachment_idx, layer, - RENDER_TARGET_0 + i, - needs_clear && !use_global_rt_clear, - false); - has_stores = true; - } else if (needs_clear) { - use_global_rt_clear = true; - } - } - - /* We always need to emit at least one dummy store */ - if (!has_stores) { - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - } - - /* If we have any depth/stencil clears we can't use the per-buffer clear - * bit and instead we have to emit a single clear of all tile buffers. - */ - if (use_global_zs_clear || use_global_rt_clear) { - cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { - clear.clear_z_stencil_buffer = use_global_zs_clear; - clear.clear_all_render_targets = use_global_rt_clear; - } - } -} - -static void -cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t layer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - /* Emit the generic list in our indirect state -- the rcl will just - * have pointers into it. - */ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(cmd_buffer, NULL); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer); - - /* The binner starts out writing tiles assuming that the initial mode - * is triangles, so make sure that's the case. - */ - cl_emit(cl, PRIM_LIST_FORMAT, fmt) { - fmt.primitive_type = LIST_TRIANGLES; - } - - /* PTB assumes that value to be 0, but hw will not set it. */ - cl_emit(cl, SET_INSTANCEID, set) { - set.instance_id = 0; - } - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t layer) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - struct v3dv_job *job = cmd_buffer->state.job; - struct v3dv_cl *rcl = &job->rcl; - - /* If doing multicore binning, we would need to initialize each - * core's tile list here. 
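
The per-layer offset into the tile allocation BO computed just below is purely linear: each tile owns a fixed 64-byte slot, so layer N's tile lists start after N full layers' worth of tiles. A one-function sketch (the 64-byte slot size is taken from the code that follows):

#include <stdint.h>

static uint32_t
tile_alloc_offset(uint32_t layer,
                  uint32_t draw_tiles_x, uint32_t draw_tiles_y)
{
   /* 64 bytes of tile-list storage per tile, with
    * draw_tiles_x * draw_tiles_y tiles per layer. */
   return 64 * layer * draw_tiles_x * draw_tiles_y;
}
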
- */
- const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
- const uint32_t tile_alloc_offset =
- 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
- cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
- list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
- }
-
- cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
- config.number_of_bin_tile_lists = 1;
- config.total_frame_width_in_tiles = tiling->draw_tiles_x;
- config.total_frame_height_in_tiles = tiling->draw_tiles_y;
-
- config.supertile_width_in_tiles = tiling->supertile_width;
- config.supertile_height_in_tiles = tiling->supertile_height;
-
- config.total_frame_width_in_supertiles =
- tiling->frame_width_in_supertiles;
- config.total_frame_height_in_supertiles =
- tiling->frame_height_in_supertiles;
- }
-
- /* Start by clearing the tile buffer. */
- cl_emit(rcl, TILE_COORDINATES, coords) {
- coords.tile_column_number = 0;
- coords.tile_row_number = 0;
- }
-
- /* Emit an initial clear of the tile buffers. This is necessary
- * for any buffers that should be cleared (since clearing
- * normally happens at the *end* of the generic tile list), but
- * it's also nice to clear everything so the first tile doesn't
- * inherit any contents from some previous frame.
- *
- * Also, implement the GFXH-1742 workaround. There's a race in
- * the HW between the RCL updating the TLB's internal type/size
- * and the spawning of the QPU instances using the TLB's current
- * internal type/size. To make sure the QPUs get the right
- * state, we need 1 dummy store in between internal type/size
- * changes on V3D 3.x, and 2 dummy stores on 4.x.
- */
- for (int i = 0; i < 2; i++) {
- if (i > 0)
- cl_emit(rcl, TILE_COORDINATES, coords);
- cl_emit(rcl, END_OF_LOADS, end);
- cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
- store.buffer_to_store = NONE;
- }
- if (i == 0 && cmd_buffer->state.tile_aligned_render_area) {
- cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
- clear.clear_z_stencil_buffer = !job->early_zs_clear;
- clear.clear_all_render_targets = true;
- }
- }
- cl_emit(rcl, END_OF_TILE_MARKER, end);
- }
-
- cl_emit(rcl, FLUSH_VCD_CACHE, flush);
-
- cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
-
- uint32_t supertile_w_in_pixels =
- tiling->tile_width * tiling->supertile_width;
- uint32_t supertile_h_in_pixels =
- tiling->tile_height * tiling->supertile_height;
- const uint32_t min_x_supertile =
- state->render_area.offset.x / supertile_w_in_pixels;
- const uint32_t min_y_supertile =
- state->render_area.offset.y / supertile_h_in_pixels;
-
- uint32_t max_render_x = state->render_area.offset.x;
- if (state->render_area.extent.width > 0)
- max_render_x += state->render_area.extent.width - 1;
- uint32_t max_render_y = state->render_area.offset.y;
- if (state->render_area.extent.height > 0)
- max_render_y += state->render_area.extent.height - 1;
- const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
- const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
-
- for (int y = min_y_supertile; y <= max_y_supertile; y++) {
- for (int x = min_x_supertile; x <= max_x_supertile; x++) {
- cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
- coords.column_number_in_supertiles = x;
- coords.row_number_in_supertiles = y;
- }
- }
- }
-}
-
-static void
-set_rcl_early_z_config(struct v3dv_job *job,
- bool *early_z_disable,
- uint32_t *early_z_test_and_update_direction)
-{
- /* If this is true then we have not emitted any draw calls in this job
- * and we don't get any benefits from
early Z.
- */
- if (!job->decided_global_ez_enable) {
- assert(job->draw_count == 0);
- *early_z_disable = true;
- return;
- }
-
- switch (job->first_ez_state) {
- case VC5_EZ_UNDECIDED:
- case VC5_EZ_LT_LE:
- *early_z_disable = false;
- *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
- break;
- case VC5_EZ_GT_GE:
- *early_z_disable = false;
- *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
- break;
- case VC5_EZ_DISABLED:
- *early_z_disable = true;
- break;
- }
-}
-
-static void
-cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
-{
- struct v3dv_job *job = cmd_buffer->state.job;
- assert(job);
-
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- const struct v3dv_framebuffer *framebuffer = state->framebuffer;
-
- /* We can't emit the RCL until we have a framebuffer, which we may not have
- * if we are recording a secondary command buffer. In that case, we will
- * have to wait until vkCmdExecuteCommands is called from a primary command
- * buffer.
- */
- if (!framebuffer) {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
- return;
- }
-
- const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
-
- const uint32_t fb_layers = framebuffer->layers;
- v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
- MAX2(fb_layers, 1) * 256 *
- cl_packet_length(SUPERTILE_COORDINATES));
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_render_pass *pass = state->pass;
- const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
- struct v3dv_cl *rcl = &job->rcl;
-
- /* Common config must be the first TILE_RENDERING_MODE_CFG and
- * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
- * updates to the previous HW state.
- */
- bool do_early_zs_clear = false;
- const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
- cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
- config.image_width_pixels = framebuffer->width;
- config.image_height_pixels = framebuffer->height;
- config.number_of_render_targets = MAX2(subpass->color_count, 1);
- config.multisample_mode_4x = tiling->msaa;
- config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
-
- if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
- const struct v3dv_image_view *iview =
- framebuffer->attachments[ds_attachment_idx];
- config.internal_depth_type = iview->internal_type;
-
- set_rcl_early_z_config(job,
- &config.early_z_disable,
- &config.early_z_test_and_update_direction);
-
- /* Early-Z/S clear can be enabled if the job is clearing and not
- * storing (or loading) depth. If a stencil aspect is also present
- * we have the same requirements for it, however, in this case we
- * can accept stencil loadOp DONT_CARE as well, so instead of
- * checking that stencil is cleared we check that it is not loaded.
- *
- * Early-Z/S clearing is independent of Early Z/S testing, so it is
- * possible to enable one but not the other so long as their
- * respective requirements are met.
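
Condensed, the requirement is: depth must be cleared and never stored, and a stencil aspect, if present, must be neither loaded nor stored. A standalone sketch of that condition (the booleans stand in for the check_needs_clear/load/store helpers used by the code below):

#include <stdbool.h>

static bool
can_do_early_zs_clear(bool depth_clear, bool depth_store,
                      bool has_stencil,
                      bool stencil_load, bool stencil_store)
{
   bool ok = depth_clear && !depth_store;
   if (ok && has_stencil)
      ok = !stencil_load && !stencil_store;  /* DONT_CARE loadOp is acceptable */
   return ok;
}
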
- */ - struct v3dv_render_pass_attachment *ds_attachment = - &pass->attachments[ds_attachment_idx]; - - const VkImageAspectFlags ds_aspects = - vk_format_aspects(ds_attachment->desc.format); - - bool needs_depth_clear = - check_needs_clear(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp, - subpass->do_depth_clear_with_draw); - - bool needs_depth_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.storeOp); - - do_early_zs_clear = needs_depth_clear && !needs_depth_store; - if (do_early_zs_clear && - vk_format_has_stencil(ds_attachment->desc.format)) { - bool needs_stencil_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.stencilLoadOp); - - bool needs_stencil_store = - check_needs_store(state, - ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, - ds_attachment->last_subpass, - ds_attachment->desc.stencilStoreOp); - - do_early_zs_clear = !needs_stencil_load && !needs_stencil_store; - } - - config.early_depth_stencil_clear = do_early_zs_clear; - } else { - config.early_z_disable = true; - } - } - - /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers" - * commands with the Z/S bit set, so keep track of whether we enabled this - * in the job so we can skip these later. - */ - job->early_zs_clear = do_early_zs_clear; - - for (uint32_t i = 0; i < subpass->color_count; i++) { - uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - struct v3dv_image_view *iview = - state->framebuffer->attachments[attachment_idx]; - - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - - const uint32_t *clear_color = - &state->attachments[attachment_idx].clear_value.color[0]; - - uint32_t clear_pad = 0; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - int uif_block_height = v3d_utile_height(image->cpp) * 2; - - uint32_t implicit_padded_height = - align(framebuffer->height, uif_block_height) / uif_block_height; - - if (slice->padded_height_of_output_image_in_uif_blocks - - implicit_padded_height >= 15) { - clear_pad = slice->padded_height_of_output_image_in_uif_blocks; - } - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { - clear.clear_color_low_32_bits = clear_color[0]; - clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; - clear.render_target_number = i; - }; - - if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { - clear.clear_color_mid_low_32_bits = - ((clear_color[1] >> 24) | (clear_color[2] << 8)); - clear.clear_color_mid_high_24_bits = - ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); - clear.render_target_number = i; - }; - } - - if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { - clear.uif_padded_height_in_uif_blocks = clear_pad; - clear.clear_color_high_16_bits = clear_color[3] >> 16; - clear.render_target_number = i; - }; - } - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - v3dv_render_pass_setup_render_target(cmd_buffer, 0, - &rt.render_target_0_internal_bpp, - &rt.render_target_0_internal_type, - &rt.render_target_0_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 1, - 
&rt.render_target_1_internal_bpp, - &rt.render_target_1_internal_type, - &rt.render_target_1_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 2, - &rt.render_target_2_internal_bpp, - &rt.render_target_2_internal_type, - &rt.render_target_2_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 3, - &rt.render_target_3_internal_bpp, - &rt.render_target_3_internal_type, - &rt.render_target_3_clamp); - } - - /* Ends rendering mode config. */ - if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { - clear.z_clear_value = - state->attachments[ds_attachment_idx].clear_value.z; - clear.stencil_clear_value = - state->attachments[ds_attachment_idx].clear_value.s; - }; - } else { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { - clear.z_clear_value = 1.0f; - clear.stencil_clear_value = 0; - }; - } - - /* Always set initial block size before the first branch, which needs - * to match the value from binning mode config. - */ - cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { - init.use_auto_chained_tile_lists = true; - init.size_of_first_block_in_chained_tile_lists = - TILE_ALLOCATION_BLOCK_SIZE_64B; - } - - for (int layer = 0; layer < MAX2(1, fb_layers); layer++) - cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer); - - cl_emit(rcl, END_OF_RENDERING, end); -} - static void cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) { @@ -2445,13 +1515,30 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, uint8_t internal_bpp; bool msaa; - v3dv_framebuffer_compute_internal_bpp_msaa(framebuffer, subpass, - &internal_bpp, &msaa); + v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) + (framebuffer, subpass, &internal_bpp, &msaa); + + /* From the Vulkan spec: + * + * "If the render pass uses multiview, then layers must be one and + * each attachment requires a number of layers that is greater than + * the maximum bit index set in the view mask in the subpasses in + * which it is used." + * + * So when multiview is enabled, we take the number of layers from the + * last bit set in the view mask. 
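
util_last_bit() returns one plus the index of the most significant set bit, so a view mask of 0b101 yields 3 layers. A hedged standalone equivalent using a GCC/Clang builtin (Mesa's real helper lives in util/bitscan.h):

#include <stdint.h>

static uint32_t
last_bit(uint32_t mask)
{
   /* 1 + index of the highest set bit; 0 for an empty mask. */
   return mask == 0 ? 0 : 32 - (uint32_t)__builtin_clz(mask);
}

/* last_bit(0x1) == 1, last_bit(0x5) == 3, last_bit(0x80000000) == 32 */
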
+ */ + uint32_t layers = framebuffer->layers; + if (subpass->view_mask != 0) { + assert(framebuffer->layers == 1); + layers = util_last_bit(subpass->view_mask); + } v3dv_job_start_frame(job, framebuffer->width, framebuffer->height, - framebuffer->layers, + layers, + true, subpass->color_count, internal_bpp, msaa); @@ -2534,7 +1621,7 @@ v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer) job->is_subpass_finish = true; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -2553,7 +1640,7 @@ v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer) state->subpass_idx = -1; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); @@ -2580,44 +1667,6 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer) } static void -emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer); - -static void -ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t slot_size, - uint32_t used_count, - uint32_t *alloc_count, - void **ptr); - -static void -cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary, - struct v3dv_cmd_buffer *secondary) -{ - struct v3dv_cmd_buffer_state *p_state = &primary->state; - struct v3dv_cmd_buffer_state *s_state = &secondary->state; - - const uint32_t total_state_count = - p_state->query.end.used_count + s_state->query.end.used_count; - ensure_array_state(primary, - sizeof(struct v3dv_end_query_cpu_job_info), - total_state_count, - &p_state->query.end.alloc_count, - (void **) &p_state->query.end.states); - v3dv_return_if_oom(primary, NULL); - - for (uint32_t i = 0; i < s_state->query.end.used_count; i++) { - const struct v3dv_end_query_cpu_job_info *s_qstate = - &secondary->state.query.end.states[i]; - - struct v3dv_end_query_cpu_job_info *p_qstate = - &p_state->query.end.states[p_state->query.end.used_count++]; - - p_qstate->pool = s_qstate->pool; - p_qstate->query = s_qstate->query; - } -} - -static void clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer, struct list_head *dst, struct list_head *src) @@ -2645,9 +1694,9 @@ clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer, * for jobs recorded in secondary command buffers when we want to execute * them in primaries. */ -static struct v3dv_job * -job_clone_in_cmd_buffer(struct v3dv_job *job, - struct v3dv_cmd_buffer *cmd_buffer) +struct v3dv_job * +v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, + struct v3dv_cmd_buffer *cmd_buffer) { struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc, sizeof(struct v3dv_job), 8, @@ -2676,163 +1725,6 @@ job_clone_in_cmd_buffer(struct v3dv_job *job, return clone_job; } -static struct v3dv_job * -cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer, - bool is_bcl_barrier) -{ - assert(cmd_buffer->state.subpass_idx != -1); - v3dv_cmd_buffer_finish_job(cmd_buffer); - struct v3dv_job *job = - v3dv_cmd_buffer_subpass_resume(cmd_buffer, - cmd_buffer->state.subpass_idx); - if (!job) - return NULL; - - job->serialize = true; - job->needs_bcl_sync = is_bcl_barrier; - return job; -} - -static void -cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary, - uint32_t cmd_buffer_count, - const VkCommandBuffer *cmd_buffers) -{ - assert(primary->state.job); - - /* Emit occlusion query state if needed so the draw calls inside our - * secondaries update the counters. 
- */
- bool has_occlusion_query =
- primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
- if (has_occlusion_query)
- emit_occlusion_query(primary);
-
- /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
- * pipelines used by the secondaries do, we need to re-start the primary
- * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
- */
- bool pending_barrier = false;
- bool pending_bcl_barrier = false;
- for (uint32_t i = 0; i < cmd_buffer_count; i++) {
- V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
-
- assert(secondary->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
-
- list_for_each_entry(struct v3dv_job, secondary_job,
- &secondary->jobs, list_link) {
- if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
- /* If the job is a CL, then we branch to it from the primary BCL.
- * In this case the secondary's BCL is finished with a
- * RETURN_FROM_SUB_LIST command to return to the primary BCL
- * once we are done executing it.
- */
- assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
- assert(secondary_job->bcl.bo);
-
- /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
- STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
- assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
- assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
- V3D42_RETURN_FROM_SUB_LIST_opcode);
-
- /* If this secondary has any barriers (or we had any pending barrier
- * to apply), then we can't just branch to it from the primary, we
- * need to split the primary to create a new job that can consume
- * the barriers first.
- *
- * FIXME: in this case, maybe just copy the secondary BCL without
- * the RETURN_FROM_SUB_LIST into the primary job to skip the
- * branch?
- */
- struct v3dv_job *primary_job = primary->state.job;
- if (!primary_job || secondary_job->serialize || pending_barrier) {
- const bool needs_bcl_barrier =
- secondary_job->needs_bcl_sync || pending_bcl_barrier;
- primary_job =
- cmd_buffer_subpass_split_for_barrier(primary,
- needs_bcl_barrier);
- v3dv_return_if_oom(primary, NULL);
-
- /* Since we have created a new primary we need to re-emit
- * occlusion query state.
- */
- if (has_occlusion_query)
- emit_occlusion_query(primary);
- }
-
- /* Make sure our primary job has all required BO references */
- set_foreach(secondary_job->bos, entry) {
- struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
- v3dv_job_add_bo(primary_job, bo);
- }
-
- /* Emit required branch instructions. We expect each of these
- * to end with a corresponding 'return from sub list' item.
- */ - list_for_each_entry(struct v3dv_bo, bcl_bo, - &secondary_job->bcl.bo_list, list_link) { - v3dv_cl_ensure_space_with_branch(&primary_job->bcl, - cl_packet_length(BRANCH_TO_SUB_LIST)); - v3dv_return_if_oom(primary, NULL); - cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) { - branch.address = v3dv_cl_address(bcl_bo, 0); - } - } - - primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl; - } else if (secondary_job->type == V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS) { - if (pending_barrier) { - cmd_buffer_subpass_split_for_barrier(primary, pending_bcl_barrier); - v3dv_return_if_oom(primary, NULL); - } - - const struct v3dv_clear_attachments_cpu_job_info *info = - &secondary_job->cpu.clear_attachments; - v3dv_CmdClearAttachments(v3dv_cmd_buffer_to_handle(primary), - info->attachment_count, - info->attachments, - info->rect_count, - info->rects); - } else { - /* This is a regular job (CPU or GPU), so just finish the current - * primary job (if any) and then add the secondary job to the - * primary's job list right after it. - */ - v3dv_cmd_buffer_finish_job(primary); - job_clone_in_cmd_buffer(secondary_job, primary); - if (pending_barrier) { - secondary_job->serialize = true; - if (pending_bcl_barrier) - secondary_job->needs_bcl_sync = true; - } - } - - pending_barrier = false; - pending_bcl_barrier = false; - } - - /* If the secondary has recorded any vkCmdEndQuery commands, we need to - * copy this state to the primary so it is processed properly when the - * current primary job is finished. - */ - cmd_buffer_copy_secondary_end_query_state(primary, secondary); - - /* If this secondary had any pending barrier state we will need that - * barrier state consumed with whatever comes next in the primary. - */ - assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); - pending_barrier = secondary->state.has_barrier; - pending_bcl_barrier = secondary->state.has_bcl_barrier; - } - - if (pending_barrier) { - primary->state.has_barrier = true; - primary->state.has_bcl_barrier |= pending_bcl_barrier; - } -} - static void cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, uint32_t cmd_buffer_count, @@ -2862,9 +1754,8 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, list_for_each_entry(struct v3dv_job, secondary_job, &secondary->jobs, list_link) { /* These can only happen inside a render pass */ - assert(secondary_job->type != V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS); assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY); - struct v3dv_job *job = job_clone_in_cmd_buffer(secondary_job, primary); + struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary); if (!job) return; @@ -2892,7 +1783,7 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary, } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCommandBuffers) @@ -2900,8 +1791,8 @@ v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer); if (primary->state.pass != NULL) { - cmd_buffer_execute_inside_pass(primary, - commandBufferCount, pCommandBuffers); + v3dv_X(primary->device, cmd_buffer_execute_inside_pass) + (primary, commandBufferCount, pCommandBuffers); } else { cmd_buffer_execute_outside_pass(primary, commandBufferCount, pCommandBuffers); @@ -2993,131 +1884,15 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, } } - cmd_buffer->state.dynamic.mask = dynamic_mask; - 
cmd_buffer->state.dirty |= dirty; -} - -static void -job_update_ez_state(struct v3dv_job *job, - struct v3dv_pipeline *pipeline, - struct v3dv_cmd_buffer *cmd_buffer) -{ - /* If first_ez_state is VC5_EZ_DISABLED it means that we have already - * determined that we should disable EZ completely for all draw calls in - * this job. This will cause us to disable EZ for the entire job in the - * Tile Rendering Mode RCL packet and when we do that we need to make sure - * we never emit a draw call in the job with EZ enabled in the CFG_BITS - * packet, so ez_state must also be VC5_EZ_DISABLED; - */ - if (job->first_ez_state == VC5_EZ_DISABLED) { - assert(job->ez_state == VC5_EZ_DISABLED); - return; - } - - /* This is part of the pre draw call handling, so we should be inside a - * render pass. - */ - assert(cmd_buffer->state.pass); - - /* If this is the first time we update EZ state for this job we first check - * if there is anything that requires disabling it completely for the entire - * job (based on state that is not related to the current draw call and - * pipeline state). - */ - if (!job->decided_global_ez_enable) { - job->decided_global_ez_enable = true; - - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - assert(state->subpass_idx < state->pass->subpass_count); - struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx]; - if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) { - job->first_ez_state = VC5_EZ_DISABLED; - job->ez_state = VC5_EZ_DISABLED; - return; - } - - /* GFXH-1918: the early-z buffer may load incorrect depth values - * if the frame has odd width or height. - * - * So we need to disable EZ in this case. - */ - const struct v3dv_render_pass_attachment *ds_attachment = - &state->pass->attachments[subpass->ds_attachment.attachment]; - - const VkImageAspectFlags ds_aspects = - vk_format_aspects(ds_attachment->desc.format); - - bool needs_depth_load = - check_needs_load(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp); - - if (needs_depth_load) { - struct v3dv_framebuffer *fb = state->framebuffer; - - if (!fb) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); - perf_debug("Loading depth aspect in a secondary command buffer " - "without framebuffer info disables early-z tests.\n"); - job->first_ez_state = VC5_EZ_DISABLED; - job->ez_state = VC5_EZ_DISABLED; - return; - } - - if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) { - perf_debug("Loading depth aspect for framebuffer with odd width " - "or height disables early-Z tests.\n"); - job->first_ez_state = VC5_EZ_DISABLED; - job->ez_state = VC5_EZ_DISABLED; - return; - } + if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { + if (dest->color_write_enable != src->color_write_enable) { + dest->color_write_enable = src->color_write_enable; + dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; } } - /* Otherwise, we can decide to selectively enable or disable EZ for draw - * calls using the CFG_BITS packet based on the bound pipeline state. - */ - - /* If the FS writes Z, then it may update against the chosen EZ direction */ - struct v3dv_shader_variant *fs_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; - if (fs_variant->prog_data.fs->writes_z) { - job->ez_state = VC5_EZ_DISABLED; - return; - } - - switch (pipeline->ez_state) { - case VC5_EZ_UNDECIDED: - /* If the pipeline didn't pick a direction but didn't disable, then go - * along with the current EZ state. 
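
Across draw calls the per-job EZ state behaves like a small lattice: an undecided job adopts the pipeline's direction, matching directions are kept, and any conflict or explicit disable is sticky for the rest of the job. A compact sketch of the merge that the switch below implements (enum names shortened from the VC5_EZ_* ones in this file):

enum ez_state { EZ_UNDECIDED, EZ_LT_LE, EZ_GT_GE, EZ_DISABLED };

static enum ez_state
merge_ez_state(enum ez_state job, enum ez_state pipeline)
{
   if (job == EZ_DISABLED || pipeline == EZ_DISABLED)
      return EZ_DISABLED;              /* disable is sticky for the job */
   if (pipeline == EZ_UNDECIDED)
      return job;                      /* e.g. Z func EQUAL or NEVER */
   if (job == EZ_UNDECIDED)
      return pipeline;                 /* adopt the first direction seen */
   return job == pipeline ? job : EZ_DISABLED;  /* directions must agree */
}
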
This allows EZ optimization for Z - * func == EQUAL or NEVER. - */ - break; - - case VC5_EZ_LT_LE: - case VC5_EZ_GT_GE: - /* If the pipeline picked a direction, then it needs to match the current - * direction if we've decided on one. - */ - if (job->ez_state == VC5_EZ_UNDECIDED) - job->ez_state = pipeline->ez_state; - else if (job->ez_state != pipeline->ez_state) - job->ez_state = VC5_EZ_DISABLED; - break; - - case VC5_EZ_DISABLED: - /* If the pipeline disables EZ because of a bad Z func or stencil - * operation, then we can't do any more EZ in this frame. - */ - job->ez_state = VC5_EZ_DISABLED; - break; - } - - if (job->first_ez_state == VC5_EZ_UNDECIDED && - job->ez_state != VC5_EZ_DISABLED) { - job->first_ez_state = job->ez_state; - } + cmd_buffer->state.dynamic.mask = dynamic_mask; + cmd_buffer->state.dirty |= dirty; } static void @@ -3128,25 +1903,6 @@ bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer, if (cmd_buffer->state.gfx.pipeline == pipeline) return; - /* Enable always flush if we are blending to sRGB render targets. This - * fixes test failures in: - * dEQP-VK.pipeline.blend.format.r8g8b8a8_srgb.* - * - * FIXME: not sure why we need this. The tile buffer is always linear, with - * conversion from/to sRGB happening on tile load/store operations. This - * means that when we enable flushing the only difference is that we convert - * to sRGB on the store after each draw call and we convert from sRGB on the - * load before each draw call, but the blend happens in linear format in the - * tile buffer anyway, which is the same scenario as if we didn't flush. - */ - assert(pipeline->subpass); - if (pipeline->subpass->has_srgb_rt && pipeline->blend.enables) { - assert(cmd_buffer->state.job); - cmd_buffer->state.job->always_flush = true; - perf_debug("flushing draw calls for subpass %d because bound pipeline " - "uses sRGB blending\n", cmd_buffer->state.subpass_idx); - } - cmd_buffer->state.gfx.pipeline = pipeline; cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state); @@ -3167,7 +1923,7 @@ bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline) @@ -3223,7 +1979,7 @@ v3dv_viewport_compute_xform(const VkViewport *viewport, scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, @@ -3256,7 +2012,7 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, @@ -3360,379 +2116,13 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer->state.clip_window.extent.width = maxx - minx; cmd_buffer->state.clip_window.extent.height = maxy - miny; - emit_clip_window(cmd_buffer->state.job, &cmd_buffer->state.clip_window); + v3dv_X(cmd_buffer->device, job_emit_clip_window) + (cmd_buffer->state.job, &cmd_buffer->state.clip_window); cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR; } static void -emit_viewport(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; - /* FIXME: right now we only support one viewport. 
viewports[0] would work
- * now, would need to change if we allow multiple viewports
- */
- float *vptranslate = dynamic->viewport.translate[0];
- float *vpscale = dynamic->viewport.scale[0];
-
- struct v3dv_job *job = cmd_buffer->state.job;
- assert(job);
-
- const uint32_t required_cl_size =
- cl_packet_length(CLIPPER_XY_SCALING) +
- cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
- cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
- cl_packet_length(VIEWPORT_OFFSET);
- v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
- clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
- clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
- }
-
- cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
- clip.viewport_z_offset_zc_to_zs = vptranslate[2];
- clip.viewport_z_scale_zc_to_zs = vpscale[2];
- }
- cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
- /* Vulkan's Z NDC is [0..1], unlike OpenGL which is [-1, 1] */
- float z1 = vptranslate[2];
- float z2 = vptranslate[2] + vpscale[2];
- clip.minimum_zw = MIN2(z1, z2);
- clip.maximum_zw = MAX2(z1, z2);
- }
-
- cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
- vp.viewport_centre_x_coordinate = vptranslate[0];
- vp.viewport_centre_y_coordinate = vptranslate[1];
- }
-
- cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
-}
-
-static void
-emit_stencil(struct v3dv_cmd_buffer *cmd_buffer)
-{
- struct v3dv_job *job = cmd_buffer->state.job;
- assert(job);
-
- struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;
-
- const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
- V3DV_DYNAMIC_STENCIL_WRITE_MASK |
- V3DV_DYNAMIC_STENCIL_REFERENCE;
-
- v3dv_cl_ensure_space_with_branch(&job->bcl,
- 2 * cl_packet_length(STENCIL_CFG));
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- bool emitted_stencil = false;
- for (uint32_t i = 0; i < 2; i++) {
- if (pipeline->emit_stencil_cfg[i]) {
- if (dynamic_state->mask & dynamic_stencil_states) {
- cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
- pipeline->stencil_cfg[i], config) {
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
- config.stencil_test_mask =
- i == 0 ? dynamic_state->stencil_compare_mask.front :
- dynamic_state->stencil_compare_mask.back;
- }
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
- config.stencil_write_mask =
- i == 0 ? dynamic_state->stencil_write_mask.front :
- dynamic_state->stencil_write_mask.back;
- }
- if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
- config.stencil_ref_value =
- i == 0 ?
dynamic_state->stencil_reference.front : - dynamic_state->stencil_reference.back; - } - } - } else { - cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]); - } - - emitted_stencil = true; - } - } - - if (emitted_stencil) { - const uint32_t dynamic_stencil_dirty_flags = - V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | - V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | - V3DV_CMD_DIRTY_STENCIL_REFERENCE; - cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags; - } -} - -static void -emit_depth_bias(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - assert(pipeline); - - if (!pipeline->depth_bias.enabled) - return; - - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET)); - v3dv_return_if_oom(cmd_buffer, NULL); - - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; - cl_emit(&job->bcl, DEPTH_OFFSET, bias) { - bias.depth_offset_factor = dynamic->depth_bias.slope_factor; - bias.depth_offset_units = dynamic->depth_bias.constant_factor; - if (pipeline->depth_bias.is_z16) - bias.depth_offset_units *= 256.0f; - bias.limit = dynamic->depth_bias.depth_bias_clamp; - } - - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; -} - -static void -emit_line_width(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, LINE_WIDTH, line) { - line.line_width = cmd_buffer->state.dynamic.line_width; - } - - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH; -} - -static void -emit_sample_state(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - assert(pipeline); - - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, SAMPLE_STATE, state) { - state.coverage = 1.0f; - state.mask = pipeline->sample_mask; - } -} - -static void -emit_blend(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - assert(pipeline); - - const uint32_t blend_packets_size = - cl_packet_length(BLEND_ENABLES) + - cl_packet_length(BLEND_CONSTANT_COLOR) + - cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS + - cl_packet_length(COLOR_WRITE_MASKS); - - v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); - v3dv_return_if_oom(cmd_buffer, NULL); - - if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) { - if (pipeline->blend.enables) { - cl_emit(&job->bcl, BLEND_ENABLES, enables) { - enables.mask = pipeline->blend.enables; - } - } - - for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { - if (pipeline->blend.enables & (1 << i)) - cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); - } - - cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { - mask.mask = pipeline->blend.color_write_masks; - } - } - - if (pipeline->blend.needs_color_constants && - cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) { - struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; - cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) { - color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]); - color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]); - color.blue_f16 = 
_mesa_float_to_half(dynamic->blend_constants[2]); - color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]); - } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS; - } -} - -static void -emit_flat_shade_flags(struct v3dv_job *job, - int varying_offset, - uint32_t varyings, - enum V3DX(Varying_Flags_Action) lower, - enum V3DX(Varying_Flags_Action) higher) -{ - v3dv_cl_ensure_space_with_branch(&job->bcl, - cl_packet_length(FLAT_SHADE_FLAGS)); - v3dv_return_if_oom(NULL, job); - - cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) { - flags.varying_offset_v0 = varying_offset; - flags.flat_shade_flags_for_varyings_v024 = varyings; - flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower; - flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher; - } -} - -static void -emit_noperspective_flags(struct v3dv_job *job, - int varying_offset, - uint32_t varyings, - enum V3DX(Varying_Flags_Action) lower, - enum V3DX(Varying_Flags_Action) higher) -{ - v3dv_cl_ensure_space_with_branch(&job->bcl, - cl_packet_length(NON_PERSPECTIVE_FLAGS)); - v3dv_return_if_oom(NULL, job); - - cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) { - flags.varying_offset_v0 = varying_offset; - flags.non_perspective_flags_for_varyings_v024 = varyings; - flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower; - flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher; - } -} - -static void -emit_centroid_flags(struct v3dv_job *job, - int varying_offset, - uint32_t varyings, - enum V3DX(Varying_Flags_Action) lower, - enum V3DX(Varying_Flags_Action) higher) -{ - v3dv_cl_ensure_space_with_branch(&job->bcl, - cl_packet_length(CENTROID_FLAGS)); - v3dv_return_if_oom(NULL, job); - - cl_emit(&job->bcl, CENTROID_FLAGS, flags) { - flags.varying_offset_v0 = varying_offset; - flags.centroid_flags_for_varyings_v024 = varyings; - flags.action_for_centroid_flags_of_lower_numbered_varyings = lower; - flags.action_for_centroid_flags_of_higher_numbered_varyings = higher; - } -} - -static bool -emit_varying_flags(struct v3dv_job *job, - uint32_t num_flags, - const uint32_t *flags, - void (*flag_emit_callback)(struct v3dv_job *job, - int varying_offset, - uint32_t flags, - enum V3DX(Varying_Flags_Action) lower, - enum V3DX(Varying_Flags_Action) higher)) -{ - bool emitted_any = false; - for (int i = 0; i < num_flags; i++) { - if (!flags[i]) - continue; - - if (emitted_any) { - flag_emit_callback(job, i, flags[i], - V3D_VARYING_FLAGS_ACTION_UNCHANGED, - V3D_VARYING_FLAGS_ACTION_UNCHANGED); - } else if (i == 0) { - flag_emit_callback(job, i, flags[i], - V3D_VARYING_FLAGS_ACTION_UNCHANGED, - V3D_VARYING_FLAGS_ACTION_ZEROED); - } else { - flag_emit_callback(job, i, flags[i], - V3D_VARYING_FLAGS_ACTION_ZEROED, - V3D_VARYING_FLAGS_ACTION_ZEROED); - } - - emitted_any = true; - } - - return emitted_any; -} - -static void -emit_varyings_state(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - - struct v3d_fs_prog_data *prog_data_fs = - pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs; - - const uint32_t num_flags = - ARRAY_SIZE(prog_data_fs->flat_shade_flags); - const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags; - const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags; - const uint32_t *centroid_flags = prog_data_fs->centroid_flags; - - if (!emit_varying_flags(job, num_flags, flat_shade_flags, - emit_flat_shade_flags)) { 
- v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags); - } - - if (!emit_varying_flags(job, num_flags, noperspective_flags, - emit_noperspective_flags)) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags); - } - - if (!emit_varying_flags(job, num_flags, centroid_flags, - emit_centroid_flags)) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags); - } -} - -static void -emit_configuration_bits(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - assert(pipeline); - - job_update_ez_state(job, pipeline, cmd_buffer); - - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { - config.early_z_enable = job->ez_state != VC5_EZ_DISABLED; - config.early_z_updates_enable = config.early_z_enable && - pipeline->z_updates_enable; - } -} - -static void update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dirty_uniform_state) { @@ -3746,13 +2136,26 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); - const bool dirty_descriptors_only = - (cmd_buffer->state.dirty & dirty_uniform_state) == - V3DV_CMD_DIRTY_DESCRIPTOR_SETS; + const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE; + const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT; + const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS; + const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; + const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX; + + /* VK_SHADER_STAGE_FRAGMENT_BIT */ + const bool has_new_descriptors_fs = + has_new_descriptors && + (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT); - const bool needs_fs_update = - !dirty_descriptors_only || - (pipeline->layout->shader_stages & VK_SHADER_STAGE_FRAGMENT_BIT); + const bool has_new_push_constants_fs = + has_new_push_constants && + (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT); + + const bool needs_fs_update = has_new_pipeline || + has_new_view_index || + has_new_push_constants_fs || + has_new_descriptors_fs || + has_new_view_index; if (needs_fs_update) { struct v3dv_shader_variant *fs_variant = @@ -3762,221 +2165,69 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant); } - const bool needs_vs_update = - !dirty_descriptors_only || - (pipeline->layout->shader_stages & VK_SHADER_STAGE_VERTEX_BIT); - - if (needs_vs_update) { - struct v3dv_shader_variant *vs_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; - - struct v3dv_shader_variant *vs_bin_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; - - cmd_buffer->state.uniforms.vs = - v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant); - - 
cmd_buffer->state.uniforms.vs_bin =
- v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
- }
-}
-
-static void
-emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
-{
- struct v3dv_job *job = cmd_buffer->state.job;
- assert(job);
-
- struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
- struct v3dv_pipeline *pipeline = state->gfx.pipeline;
- assert(pipeline);
-
- struct v3d_vs_prog_data *prog_data_vs =
- pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
- struct v3d_vs_prog_data *prog_data_vs_bin =
- pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
- struct v3d_fs_prog_data *prog_data_fs =
- pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
-
- /* Update the cache dirty flag based on the shader progs data */
- job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
- job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
- job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
-
- /* See GFXH-930 workaround below */
- uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
-
- uint32_t shader_rec_offset =
- v3dv_cl_ensure_space(&job->indirect,
- cl_packet_length(GL_SHADER_STATE_RECORD) +
- num_elements_to_emit *
- cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
- 32);
- v3dv_return_if_oom(cmd_buffer, NULL);
-
- struct v3dv_shader_variant *vs_variant =
- pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
- struct v3dv_shader_variant *vs_bin_variant =
- pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
- struct v3dv_shader_variant *fs_variant =
- pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
- struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
-
- struct v3dv_bo *default_attribute_values =
- pipeline->default_attribute_values != NULL ?
- pipeline->default_attribute_values :
- pipeline->device->default_attribute_float;
-
- cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
- pipeline->shader_state_record, shader) {
-
- /* FIXME: we are setting these values here and during the
- * prepacking. This is because both cl_emit_with_prepacked and v3dv_pack
- * assert for minimum values of these.
It would be good to get - * v3dv_pack to assert on the final value if possible - */ - shader.min_coord_shader_input_segments_required_in_play = - pipeline->vpm_cfg_bin.As; - shader.min_vertex_shader_input_segments_required_in_play = - pipeline->vpm_cfg.As; - - shader.coordinate_shader_code_address = - v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset); - shader.vertex_shader_code_address = - v3dv_cl_address(assembly_bo, vs_variant->assembly_offset); - shader.fragment_shader_code_address = - v3dv_cl_address(assembly_bo, fs_variant->assembly_offset); - - shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin; - shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; - shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; - - shader.address_of_default_attribute_values = - v3dv_cl_address(default_attribute_values, 0); - } - - /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */ - bool cs_loaded_any = false; - const bool cs_uses_builtins = prog_data_vs_bin->uses_iid || - prog_data_vs_bin->uses_biid || - prog_data_vs_bin->uses_vid; - const uint32_t packet_length = - cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD); - - uint32_t emitted_va_count = 0; - for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) { - assert(i < MAX_VERTEX_ATTRIBS); - - if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED) - continue; - - const uint32_t binding = pipeline->va[i].binding; + /* VK_SHADER_STAGE_GEOMETRY_BIT */ + if (pipeline->has_gs) { + const bool has_new_descriptors_gs = + has_new_descriptors && + (cmd_buffer->state.dirty_descriptor_stages & + VK_SHADER_STAGE_GEOMETRY_BIT); - /* We store each vertex attribute in the array using its driver location - * as index. - */ - const uint32_t location = i; - - struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding]; - - cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, - &pipeline->vertex_attrs[i * packet_length], attr) { - - assert(c_vb->buffer->mem->bo); - attr.address = v3dv_cl_address(c_vb->buffer->mem->bo, - c_vb->buffer->mem_offset + - pipeline->va[i].offset + - c_vb->offset); - - attr.number_of_values_read_by_coordinate_shader = - prog_data_vs_bin->vattr_sizes[location]; - attr.number_of_values_read_by_vertex_shader = - prog_data_vs->vattr_sizes[location]; - - /* GFXH-930: At least one attribute must be enabled and read by CS - * and VS. If we have attributes being consumed by the VS but not - * the CS, then set up a dummy load of the last attribute into the - * CS's VPM inputs. (Since CS is just dead-code-elimination compared - * to VS, we can't have CS loading but not VS). - * - * GFXH-1602: first attribute must be active if using builtins. 
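
Boiled down, the two workarounds mean: after walking the attributes, if the coordinate shader still reads nothing, force a one-value read, on the first attribute when builtins are in use (GFXH-1602), otherwise on the last one (GFXH-930). An illustrative sketch of just that selection (a simplification; the real code patches the records as it emits them):

#include <stdbool.h>
#include <stdint.h>

/* Returns the attribute index that should get a dummy one-value read
 * by the coordinate shader, or -1 if no fixup is needed. */
static int
dummy_cs_attr_fixup(const uint32_t *cs_reads, int va_count,
                    bool cs_uses_builtins)
{
   for (int i = 0; i < va_count; i++) {
      if (cs_reads[i])
         return -1;                     /* CS already loads something */
   }
   if (va_count == 0)
      return 0;                         /* emit a fully dummy record */
   return cs_uses_builtins ? 0 : va_count - 1;
}
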
- */ - if (prog_data_vs_bin->vattr_sizes[location]) - cs_loaded_any = true; - - if (i == 0 && cs_uses_builtins && !cs_loaded_any) { - attr.number_of_values_read_by_coordinate_shader = 1; - cs_loaded_any = true; - } else if (i == pipeline->va_count - 1 && !cs_loaded_any) { - attr.number_of_values_read_by_coordinate_shader = 1; - cs_loaded_any = true; - } + const bool has_new_push_constants_gs = + has_new_push_constants && + (cmd_buffer->state.dirty_push_constants_stages & + VK_SHADER_STAGE_GEOMETRY_BIT); - attr.maximum_index = 0xffffff; - } + const bool needs_gs_update = has_new_viewport || + has_new_view_index || + has_new_pipeline || + has_new_push_constants_gs || + has_new_descriptors_gs; - emitted_va_count++; - } + if (needs_gs_update) { + struct v3dv_shader_variant *gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; - if (pipeline->va_count == 0) { - /* GFXH-930: At least one attribute must be enabled and read - * by CS and VS. If we have no attributes being consumed by - * the shader, set up a dummy to be loaded into the VPM. - */ - cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) { - /* Valid address of data whose value will be unused. */ - attr.address = v3dv_cl_address(job->indirect.bo, 0); + struct v3dv_shader_variant *gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; - attr.type = ATTRIBUTE_FLOAT; - attr.stride = 0; - attr.vec_size = 1; + cmd_buffer->state.uniforms.gs = + v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant); - attr.number_of_values_read_by_coordinate_shader = 1; - attr.number_of_values_read_by_vertex_shader = 1; + cmd_buffer->state.uniforms.gs_bin = + v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant); } } - if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) { - v3dv_cl_ensure_space_with_branch(&job->bcl, - sizeof(pipeline->vcm_cache_size)); - v3dv_return_if_oom(cmd_buffer, NULL); + /* VK_SHADER_STAGE_VERTEX_BIT */ + const bool has_new_descriptors_vs = + has_new_descriptors && + (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT); - cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size); - } + const bool has_new_push_constants_vs = + has_new_push_constants && + (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT); - v3dv_cl_ensure_space_with_branch(&job->bcl, - cl_packet_length(GL_SHADER_STATE)); - v3dv_return_if_oom(cmd_buffer, NULL); + const bool needs_vs_update = has_new_viewport || + has_new_view_index || + has_new_pipeline || + has_new_push_constants_vs || + has_new_descriptors_vs; - cl_emit(&job->bcl, GL_SHADER_STATE, state) { - state.address = v3dv_cl_address(job->indirect.bo, - shader_rec_offset); - state.number_of_attribute_arrays = num_elements_to_emit; - } - - cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER | - V3DV_CMD_DIRTY_DESCRIPTOR_SETS | - V3DV_CMD_DIRTY_PUSH_CONSTANTS); -} + if (needs_vs_update) { + struct v3dv_shader_variant *vs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; -static void -emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); + struct v3dv_shader_variant *vs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; - v3dv_cl_ensure_space_with_branch(&job->bcl, - cl_packet_length(OCCLUSION_QUERY_COUNTER)); - v3dv_return_if_oom(cmd_buffer, NULL); + cmd_buffer->state.uniforms.vs = + v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant); - cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, 
counter) { - if (cmd_buffer->state.query.active_query) { - counter.address = - v3dv_cl_address(cmd_buffer->state.query.active_query, 0); - } + cmd_buffer->state.uniforms.vs_bin = + v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant); } - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY; + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX; } /* This stores command buffer state that we might be about to stomp for @@ -4115,86 +2366,6 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, state->meta.has_descriptor_state = false; } -/* FIXME: C&P from v3dx_draw. Refactor to common place? */ -static uint32_t -v3d_hw_prim_type(enum pipe_prim_type prim_type) -{ - switch (prim_type) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return prim_type; - - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY); - - default: - unreachable("Unsupported primitive type"); - } -} - -struct v3dv_draw_info { - uint32_t vertex_count; - uint32_t instance_count; - uint32_t first_vertex; - uint32_t first_instance; -}; - -static void -cmd_buffer_emit_draw(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_draw_info *info) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - struct v3dv_pipeline *pipeline = state->gfx.pipeline; - - assert(pipeline); - - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); - - if (info->first_instance > 0) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) { - base.base_instance = info->first_instance; - base.base_vertex = 0; - } - } - - if (info->instance_count > 1) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) { - prim.mode = hw_prim_type; - prim.index_of_first_vertex = info->first_vertex; - prim.number_of_instances = info->instance_count; - prim.instance_length = info->vertex_count; - } - } else { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS)); - v3dv_return_if_oom(cmd_buffer, NULL); - cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) { - prim.mode = hw_prim_type; - prim.length = info->vertex_count; - prim.index_of_first_vertex = info->first_vertex; - } - } -} - static struct v3dv_job * cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer) { @@ -4297,6 +2468,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) old_job->frame_tiling.width, old_job->frame_tiling.height, old_job->frame_tiling.layers, + true, old_job->frame_tiling.render_target_count, old_job->frame_tiling.internal_bpp, true /* msaa */); @@ -4304,35 +2476,8 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) v3dv_job_destroy(old_job); } -static void -emit_index_buffer(struct v3dv_cmd_buffer *cmd_buffer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - /* We flag all state as dirty when we create a new job so make sure we - * have a valid index buffer before attempting to emit state for 
it. - */ - struct v3dv_buffer *ibuffer = - v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer); - if (ibuffer) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP)); - v3dv_return_if_oom(cmd_buffer, NULL); - - const uint32_t offset = cmd_buffer->state.index_buffer.offset; - cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) { - ib.address = v3dv_cl_address(ibuffer->mem->bo, - ibuffer->mem_offset + offset); - ib.size = ibuffer->mem->bo->size; - } - } - - cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER; -} - -static void -cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) +void +v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) { assert(cmd_buffer->state.gfx.pipeline); assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); @@ -4368,17 +2513,20 @@ cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) *dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_PUSH_CONSTANTS | V3DV_CMD_DIRTY_DESCRIPTOR_SETS | - V3DV_CMD_DIRTY_VIEWPORT); + V3DV_CMD_DIRTY_VIEWPORT | + V3DV_CMD_DIRTY_VIEW_INDEX); if (dirty_uniform_state) update_gfx_uniform_state(cmd_buffer, dirty_uniform_state); + struct v3dv_device *device = cmd_buffer->device; + if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER)) - emit_gl_shader_state(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer); if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { - emit_configuration_bits(cmd_buffer); - emit_varyings_state(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer); } if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) { @@ -4386,46 +2534,69 @@ cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) } if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) { - emit_viewport(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer); } if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER) - emit_index_buffer(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer); const uint32_t dynamic_stencil_dirty_flags = V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | V3DV_CMD_DIRTY_STENCIL_REFERENCE; if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags)) - emit_stencil(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer); if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) - emit_depth_bias(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) - emit_blend(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY) - emit_occlusion_query(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer); if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH) - emit_line_width(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer); if (*dirty & V3DV_CMD_DIRTY_PIPELINE) - emit_sample_state(cmd_buffer); + v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer); + + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE)) + v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer); cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE; } +static inline void +cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t view_index) +{ + cmd_buffer->state.view_index = view_index; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; +} + static void cmd_buffer_draw(struct 
v3dv_cmd_buffer *cmd_buffer, struct v3dv_draw_info *info) { - cmd_buffer_emit_pre_draw(cmd_buffer); - cmd_buffer_emit_draw(cmd_buffer, info); + + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); + return; + } + + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); + } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, @@ -4445,7 +2616,7 @@ v3dv_CmdDraw(VkCommandBuffer commandBuffer, cmd_buffer_draw(cmd_buffer, &info); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, @@ -4458,56 +2629,26 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - cmd_buffer_emit_pre_draw(cmd_buffer); - - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); - uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1; - uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size; - - if (vertexOffset != 0 || firstInstance != 0) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) { - base.base_instance = firstInstance; - base.base_vertex = vertexOffset; - } + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, indexCount, instanceCount, + firstIndex, vertexOffset, firstInstance); + return; } - if (instanceCount == 1) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(INDEXED_PRIM_LIST)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) { - prim.index_type = index_type; - prim.length = indexCount; - prim.index_offset = index_offset; - prim.mode = hw_prim_type; - prim.enable_primitive_restarts = pipeline->primitive_restart; - } - } else if (instanceCount > 1) { - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST)); - v3dv_return_if_oom(cmd_buffer, NULL); - - cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) { - prim.index_type = index_type; - prim.index_offset = index_offset; - prim.mode = hw_prim_type; - prim.enable_primitive_restarts = pipeline->primitive_restart; - prim.number_of_instances = instanceCount; - prim.instance_length = indexCount; - } + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) + (cmd_buffer, indexCount, instanceCount, + firstIndex, vertexOffset, firstInstance); } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -4521,28 
+2662,24 @@ v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); - cmd_buffer_emit_pre_draw(cmd_buffer); - - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); - - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS)); - v3dv_return_if_oom(cmd_buffer, NULL); + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) + (cmd_buffer, buffer, offset, drawCount, stride); + return; + } - cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) { - prim.mode = hw_prim_type; - prim.number_of_draw_indirect_array_records = drawCount; - prim.stride_in_multiples_of_4_bytes = stride >> 2; - prim.address = v3dv_cl_address(buffer->mem->bo, - buffer->mem_offset + offset); + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) + (cmd_buffer, buffer, offset, drawCount, stride); } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -4556,31 +2693,24 @@ v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); - cmd_buffer_emit_pre_draw(cmd_buffer); - - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; - uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); - uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1; - - v3dv_cl_ensure_space_with_branch( - &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST)); - v3dv_return_if_oom(cmd_buffer, NULL); + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (likely(!pass->multiview_enabled)) { + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) + (cmd_buffer, buffer, offset, drawCount, stride); + return; + } - cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) { - prim.index_type = index_type; - prim.mode = hw_prim_type; - prim.enable_primitive_restarts = pipeline->primitive_restart; - prim.number_of_draw_indirect_indexed_records = drawCount; - prim.stride_in_multiples_of_4_bytes = stride >> 2; - prim.address = v3dv_cl_address(buffer->mem->bo, - buffer->mem_offset + offset); + uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; + while (view_mask) { + cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); + v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); + v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) + (cmd_buffer, buffer, offset, drawCount, stride); } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, @@ -4616,7 +2746,7 @@ v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindVertexBuffers(VkCommandBuffer 
commandBuffer, uint32_t firstBinding, uint32_t bindingCount, @@ -4651,6 +2781,9 @@ static uint32_t get_index_size(VkIndexType index_type) { switch (index_type) { + case VK_INDEX_TYPE_UINT8_EXT: + return 1; + break; case VK_INDEX_TYPE_UINT16: return 2; break; @@ -4662,7 +2795,7 @@ get_index_size(VkIndexType index_type) } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, @@ -4683,7 +2816,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask) @@ -4698,7 +2831,7 @@ v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask) @@ -4713,7 +2846,7 @@ v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference) @@ -4728,7 +2861,7 @@ v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, float depthBiasClamp, @@ -4742,7 +2875,7 @@ v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds) @@ -4752,7 +2885,7 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, */ } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth) { @@ -4762,7 +2895,7 @@ v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout, @@ -4784,18 +2917,16 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, &cmd_buffer->state.compute.descriptor_state : &cmd_buffer->state.gfx.descriptor_state; + VkShaderStageFlags dirty_stages = 0; bool descriptor_state_changed = false; for (uint32_t i = 0; i < descriptorSetCount; i++) { V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]); uint32_t index = firstSet + i; + descriptor_state->valid |= (1u << index); if (descriptor_state->descriptor_sets[index] != set) { descriptor_state->descriptor_sets[index] = set; - descriptor_state_changed = true; - } - - if (!(descriptor_state->valid & (1u << index))) { - descriptor_state->valid |= (1u << index); + dirty_stages |= set->layout->shader_stages; descriptor_state_changed = true; } @@ -4804,20 +2935,24 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) { descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index]; + dirty_stages |= set->layout->shader_stages; descriptor_state_changed = true; } } } if (descriptor_state_changed) { - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) + if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { 
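/* [Editor's note] The rework around this point narrows invalidation:
 * instead of re-emitting uniforms for every stage whenever any descriptor
 * set changes, the command buffer accumulates only the shader stages
 * declared by each bound set layout, and the per-stage update checks in
 * update_gfx_uniform_state test those bits. A minimal sketch of the
 * bookkeeping, with local constants mirroring VkShaderStageFlagBits: */
#include <stdbool.h>
#include <stdint.h>

#define STAGE_VERTEX_BIT   0x01u   /* VK_SHADER_STAGE_VERTEX_BIT */
#define STAGE_GEOMETRY_BIT 0x08u   /* VK_SHADER_STAGE_GEOMETRY_BIT */
#define STAGE_FRAGMENT_BIT 0x10u   /* VK_SHADER_STAGE_FRAGMENT_BIT */

static uint32_t dirty_descriptor_stages;

/* On vkCmdBindDescriptorSets: remember which stages the set can affect. */
static void
note_set_bound(uint32_t set_layout_stages)
{
   dirty_descriptor_stages |= set_layout_stages;
}

/* At draw time: a stage re-uploads its uniforms only if its bit is set. */
static bool
stage_has_new_descriptors(uint32_t stage_bit)
{
   return (dirty_descriptor_stages & stage_bit) != 0;
}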
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS; - else + cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS; + } else { cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS; + cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT; + } } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, @@ -4833,9 +2968,10 @@ v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size); cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS; + cmd_buffer->state.dirty_push_constants_stages |= stageFlags; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4]) { @@ -4853,6 +2989,26 @@ v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; } +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkBool32 *pColorWriteEnables) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + uint32_t color_write_enable = 0; + + for (uint32_t i = 0; i < attachmentCount; i++) + color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; + + if (state->dynamic.color_write_enable == color_write_enable) + return; + + state->dynamic.color_write_enable = color_write_enable; + + state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; +} + void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, @@ -4881,12 +3037,12 @@ v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, list_addtail(&job->list_link, &cmd_buffer->jobs); } -static void -ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t slot_size, - uint32_t used_count, - uint32_t *alloc_count, - void **ptr) +void +v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t slot_size, + uint32_t used_count, + uint32_t *alloc_count, + void **ptr) { if (used_count >= *alloc_count) { const uint32_t prev_slot_count = *alloc_count; @@ -4915,10 +3071,11 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, VkQueryControlFlags flags) { /* FIXME: we only support one active query for now */ - assert(cmd_buffer->state.query.active_query == NULL); + assert(cmd_buffer->state.query.active_query.bo == NULL); assert(query < pool->query_count); - cmd_buffer->state.query.active_query = pool->queries[query].bo; + cmd_buffer->state.query.active_query.bo = pool->queries[query].bo; + cmd_buffer->state.query.active_query.offset = pool->queries[query].offset; cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; } @@ -4928,7 +3085,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, uint32_t query) { assert(query < pool->query_count); - assert(cmd_buffer->state.query.active_query != NULL); + assert(cmd_buffer->state.query.active_query.bo != NULL); if (cmd_buffer->state.pass) { /* Queue the EndQuery in the command buffer state, we will create a CPU @@ -4936,11 +3093,11 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, * render pass job in which they have been recorded. 
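/* [Editor's note] v3dv_CmdSetColorWriteEnableEXT above folds the
 * per-attachment VkBool32 array into a single bitmask, one 4-bit RGBA
 * nibble per color attachment. Standalone sketch, with uint32_t standing
 * in for VkBool32: */
#include <stdint.h>

static uint32_t
pack_color_write_enable(uint32_t attachment_count, const uint32_t *enables)
{
   uint32_t mask = 0;
   for (uint32_t i = 0; i < attachment_count; i++)
      mask |= enables[i] ? (0xfu << (i * 4)) : 0;
   return mask;   /* e.g. {on, off, on} -> 0xf0f */
}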
*/ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - ensure_array_state(cmd_buffer, - sizeof(struct v3dv_end_query_cpu_job_info), - state->query.end.used_count, - &state->query.end.alloc_count, - (void **) &state->query.end.states); + v3dv_cmd_buffer_ensure_array_state(cmd_buffer, + sizeof(struct v3dv_end_query_cpu_job_info), + state->query.end.used_count, + &state->query.end.alloc_count, + (void **) &state->query.end.states); v3dv_return_if_oom(cmd_buffer, NULL); struct v3dv_end_query_cpu_job_info *info = @@ -4948,6 +3105,27 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, info->pool = pool; info->query = query; + + /* From the Vulkan spec: + * + * "If queries are used while executing a render pass instance that has + * multiview enabled, the query uses N consecutive query indices in + * the query pool (starting at query) where N is the number of bits set + * in the view mask in the subpass the query is used in. How the + * numerical results of the query are distributed among the queries is + * implementation-dependent." + * + * In our case, only the first query is used but this means we still need + * to flag the other queries as available so we don't emit errors when + * the applications attempt to retrive values from them. + */ + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (!pass->multiview_enabled) { + info->count = 1; + } else { + struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + info->count = util_bitcount(subpass->view_mask); + } } else { /* Otherwise, schedule the CPU job immediately */ struct v3dv_job *job = @@ -4958,10 +3136,14 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, job->cpu.query_end.pool = pool; job->cpu.query_end.query = query; + + /* Multiview queries cannot cross subpass boundaries */ + job->cpu.query_end.count = 1; + list_addtail(&job->list_link, &cmd_buffer->jobs); } - cmd_buffer->state.query.active_query = NULL; + cmd_buffer->state.query.active_query.bo = NULL; cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; } @@ -5019,7 +3201,7 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, list_addtail(&job->list_link, &cmd_buffer->jobs); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask) @@ -5045,7 +3227,7 @@ v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, list_addtail(&job->list_link, &cmd_buffer->jobs); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask) @@ -5071,7 +3253,7 @@ v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, list_addtail(&job->list_link, &cmd_buffer->jobs); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents, @@ -5124,7 +3306,7 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, list_addtail(&job->list_link, &cmd_buffer->jobs); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage, VkQueryPool queryPool, @@ -5136,7 +3318,8 @@ v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, /* If this is called inside a render pass we need to finish the current * job here... 
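/* [Editor's note] Two multiview helpers sketched standalone: the draw
 * paths in this file re-emit once per bit set in the subpass view mask,
 * and, per the spec text quoted above, one begin/end pair consumes one
 * query index per view bit. Local bit-twiddling stands in for Mesa's
 * u_bit_scan()/util_bitcount(); __builtin_ctz and __builtin_popcount
 * assume GCC or Clang. */
#include <stdint.h>

/* Pop the lowest set bit from *mask and return its index (u_bit_scan). */
static int
bit_scan(uint32_t *mask)
{
   int i = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return i;
}

static void
draw_per_view(uint32_t view_mask)
{
   while (view_mask) {
      int view_index = bit_scan(&view_mask);
      (void)view_index;   /* set view index, re-emit dirty state, draw */
   }
}

/* Query indices consumed by one occlusion query (util_bitcount). */
static uint32_t
queries_consumed(int multiview_enabled, uint32_t view_mask)
{
   return multiview_enabled ? (uint32_t)__builtin_popcount(view_mask) : 1;
}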
*/ - if (cmd_buffer->state.pass) + struct v3dv_render_pass *pass = cmd_buffer->state.pass; + if (pass) v3dv_cmd_buffer_finish_job(cmd_buffer); struct v3dv_job *job = @@ -5148,6 +3331,14 @@ v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, job->cpu.query_timestamp.pool = query_pool; job->cpu.query_timestamp.query = query; + if (!pass || !pass->multiview_enabled) { + job->cpu.query_timestamp.count = 1; + } else { + struct v3dv_subpass *subpass = + &pass->subpasses[cmd_buffer->state.subpass_idx]; + job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask); + } + list_addtail(&job->list_link, &cmd_buffer->jobs); cmd_buffer->state.job = NULL; @@ -5163,9 +3354,10 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) assert(cmd_buffer->state.compute.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); - uint32_t *dirty = &cmd_buffer->state.dirty; - *dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE | - V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS); + cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE | + V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS); + cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; + cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; } #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 @@ -5230,6 +3422,9 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( static struct v3dv_job * cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t base_offset_x, + uint32_t base_offset_y, + uint32_t base_offset_z, uint32_t group_count_x, uint32_t group_count_y, uint32_t group_count_z, @@ -5258,6 +3453,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, job->csd.wg_count[1] = group_count_y; job->csd.wg_count[2] = group_count_z; + job->csd.wg_base[0] = base_offset_x; + job->csd.wg_base[1] = base_offset_y; + job->csd.wg_base[2] = base_offset_z; + submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT; submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT; submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT; @@ -5265,20 +3464,32 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, const struct v3d_compute_prog_data *cpd = cs_variant->prog_data.cs; - const uint32_t wgs_per_sg = 1; /* FIXME */ + const uint32_t num_wgs = group_count_x * group_count_y * group_count_z; const uint32_t wg_size = cpd->local_size[0] * cpd->local_size[1] * cpd->local_size[2]; - submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; - submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) << - V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); + + uint32_t wgs_per_sg = + v3d_csd_choose_workgroups_per_supergroup( + &cmd_buffer->device->devinfo, + cs_variant->prog_data.cs->has_subgroups, + cs_variant->prog_data.cs->base.has_control_barrier, + cs_variant->prog_data.cs->base.threads, + num_wgs, wg_size); + + uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); + uint32_t whole_sgs = num_wgs / wgs_per_sg; + uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; + uint32_t num_batches = batches_per_sg * whole_sgs + + DIV_ROUND_UP(rem_wgs * wg_size, 16); + + submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; + submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; if (wg_size_out) *wg_size_out = wg_size; - uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16); - submit->cfg[4] = batches_per_wg * - (group_count_x * group_count_y * group_count_z) - 1; + submit->cfg[4] = 
num_batches - 1; assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); @@ -5302,7 +3513,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, } } - v3dv_job_add_bo(job, cs_assembly_bo); + v3dv_job_add_bo_unchecked(job, cs_assembly_bo); struct v3dv_cl_reloc uniforms = v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, cs_variant, @@ -5316,6 +3527,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, static void cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t base_offset_x, + uint32_t base_offset_y, + uint32_t base_offset_z, uint32_t group_count_x, uint32_t group_count_y, uint32_t group_count_z) @@ -5325,6 +3539,9 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job = cmd_buffer_create_csd_job(cmd_buffer, + base_offset_x, + base_offset_y, + base_offset_z, group_count_x, group_count_y, group_count_z, @@ -5334,7 +3551,7 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer->state.job = NULL; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, @@ -5343,9 +3560,28 @@ v3dv_CmdDispatch(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); cmd_buffer_emit_pre_dispatch(cmd_buffer); - cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ); + cmd_buffer_dispatch(cmd_buffer, 0, 0, 0, + groupCountX, groupCountY, groupCountZ); +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_emit_pre_dispatch(cmd_buffer); + cmd_buffer_dispatch(cmd_buffer, + baseGroupX, baseGroupY, baseGroupZ, + groupCountX, groupCountY, groupCountZ); } + static void cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_buffer *buffer, @@ -5370,6 +3606,7 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer, */ struct v3dv_job *csd_job = cmd_buffer_create_csd_job(cmd_buffer, + 0, 0, 0, 1, 1, 1, &job->cpu.csd_indirect.wg_uniform_offsets[0], &job->cpu.csd_indirect.wg_size); @@ -5392,7 +3629,7 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer->state.job = NULL; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset) @@ -5405,3 +3642,10 @@ v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, cmd_buffer_emit_pre_dispatch(cmd_buffer); cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset); } + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) +{ + /* Nothing to do here since we only support a single device */ + assert(deviceMask == 0x1); +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_descriptor_set.c b/lib/mesa/src/broadcom/vulkan/v3dv_descriptor_set.c index 3487d701a..14a93cea4 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_descriptor_set.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_descriptor_set.c @@ -27,42 +27,20 @@ #include "v3dv_private.h" /* - * Returns how much space a given descriptor type needs on a bo (GPU - * memory). 
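/* [Editor's note] Worked sketch of the compute-batch accounting introduced
 * in cmd_buffer_create_csd_job above: a batch is a group of 16 threads,
 * workgroups are packed wgs_per_sg at a time into supergroups, and a final
 * partial supergroup covers the remainder. DIV_ROUND_UP is redefined
 * locally so the snippet stands alone. */
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static uint32_t
csd_num_batches(uint32_t num_wgs, uint32_t wg_size, uint32_t wgs_per_sg)
{
   uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
   uint32_t whole_sgs = num_wgs / wgs_per_sg;
   uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
   return batches_per_sg * whole_sgs + DIV_ROUND_UP(rem_wgs * wg_size, 16);
}

/* Example: 10 workgroups of 48 threads, 3 workgroups per supergroup:
 * batches_per_sg = ceil(144/16) = 9, 3 whole supergroups, 1 leftover
 * workgroup -> 9*3 + ceil(48/16) = 30 batches; cfg[4] stores 30 - 1. */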
- */ -static uint32_t -descriptor_bo_size(VkDescriptorType type) -{ - switch(type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - return sizeof(struct v3dv_sampler_descriptor); - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - return sizeof(struct v3dv_combined_image_sampler_descriptor); - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - return sizeof(struct v3dv_sampled_image_descriptor); - default: - return 0; - } -} - -/* * For a given descriptor defined by the descriptor_set it belongs, its * binding layout, and array_index, it returns the map region assigned to it * from the descriptor pool bo. */ static void* -descriptor_bo_map(struct v3dv_descriptor_set *set, +descriptor_bo_map(struct v3dv_device *device, + struct v3dv_descriptor_set *set, const struct v3dv_descriptor_set_binding_layout *binding_layout, uint32_t array_index) { - assert(descriptor_bo_size(binding_layout->type) > 0); + assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); return set->pool->bo->map + set->base_offset + binding_layout->descriptor_offset + - array_index * descriptor_bo_size(binding_layout->type); + array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type); } static bool @@ -125,7 +103,8 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat * validation or adding extra offsets if the bo contains more that one field. */ static struct v3dv_cl_reloc -v3dv_descriptor_map_get_descriptor_bo(struct v3dv_descriptor_state *descriptor_state, +v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index, @@ -146,7 +125,7 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_descriptor_state *descriptor_s const struct v3dv_descriptor_set_binding_layout *binding_layout = &set->layout->binding[binding_number]; - assert(descriptor_bo_size(binding_layout->type) > 0); + assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); *out_type = binding_layout->type; uint32_t array_index = map->array_index[index]; @@ -155,7 +134,7 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_descriptor_state *descriptor_s struct v3dv_cl_reloc reloc = { .bo = set->pool->bo, .offset = set->base_offset + binding_layout->descriptor_offset + - array_index * descriptor_bo_size(binding_layout->type), + array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type), }; return reloc; @@ -218,24 +197,23 @@ v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state, struct v3dv_cl_reloc -v3dv_descriptor_map_get_sampler_state(struct v3dv_descriptor_state *descriptor_state, +v3dv_descriptor_map_get_sampler_state(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index) { VkDescriptorType type; struct v3dv_cl_reloc reloc = - v3dv_descriptor_map_get_descriptor_bo(descriptor_state, map, + v3dv_descriptor_map_get_descriptor_bo(device, descriptor_state, map, pipeline_layout, index, &type); assert(type == VK_DESCRIPTOR_TYPE_SAMPLER || type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); - if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { - reloc.offset += offsetof(struct v3dv_combined_image_sampler_descriptor, - sampler_state); - } 
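/* [Editor's note] descriptor_bo_map() above computes a plain strided
 * address into the pool's BO. Equivalent standalone arithmetic, with
 * hypothetical parameter names: */
#include <stddef.h>
#include <stdint.h>

static void *
descriptor_cpu_addr(void *pool_bo_map, uint32_t set_base_offset,
                    uint32_t binding_descriptor_offset,
                    uint32_t array_index, uint32_t desc_size)
{
   /* pool BO base + set base + binding base + element index * stride */
   return (uint8_t *)pool_bo_map + set_base_offset +
          binding_descriptor_offset + (size_t)array_index * desc_size;
}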
+ if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + reloc.offset += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); return reloc; } @@ -262,7 +240,7 @@ v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: assert(descriptor->image_view); - *out_vk_format = descriptor->image_view->vk_format; + *out_vk_format = descriptor->image_view->vk.format; return descriptor->image_view->format; default: unreachable("descriptor type doesn't has a texture format"); @@ -288,23 +266,28 @@ v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_stat case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { assert(descriptor->image_view); - return descriptor->image_view->image->mem->bo; + struct v3dv_image *image = + (struct v3dv_image *) descriptor->image_view->vk.image; + return image->mem->bo; + } default: unreachable("descriptor type doesn't has a texture bo"); } } struct v3dv_cl_reloc -v3dv_descriptor_map_get_texture_shader_state(struct v3dv_descriptor_state *descriptor_state, +v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index) { VkDescriptorType type; struct v3dv_cl_reloc reloc = - v3dv_descriptor_map_get_descriptor_bo(descriptor_state, map, + v3dv_descriptor_map_get_descriptor_bo(device, + descriptor_state, map, pipeline_layout, index, &type); @@ -315,10 +298,8 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_descriptor_state *descr type == VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER || type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); - if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { - reloc.offset += offsetof(struct v3dv_combined_image_sampler_descriptor, - texture_state); - } + if (type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + reloc.offset += v3dv_X(device, combined_image_sampler_texture_state_offset)(); return reloc; } @@ -330,7 +311,7 @@ v3dv_descriptor_map_get_texture_shader_state(struct v3dv_descriptor_state *descr * just multiple descriptor set layouts pasted together." 
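/* [Editor's note] The v3dv_X(device, name)(...) calls that replace direct
 * helpers throughout this import are Mesa's per-hardware-version dispatch:
 * the macro selects a version-prefixed symbol from devinfo.ver, so
 * descriptor sizes and packed-state offsets can differ per V3D generation.
 * A hedged sketch of the pattern, with invented names and a stub
 * implementation standing in for the real v4.2 sizing table: */
#include <stdint.h>

static uint32_t
v3d42_descriptor_bo_size(int descriptor_type)
{
   (void)descriptor_type;
   return 32;   /* stub; the real table varies by descriptor type */
}

static uint32_t
dispatch_descriptor_bo_size(int hw_ver, int descriptor_type)
{
   switch (hw_ver) {
   case 42:
      return v3d42_descriptor_bo_size(descriptor_type);
   default:
      return 0;   /* unreachable on supported hardware */
   }
}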
*/ -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -345,7 +326,7 @@ v3dv_CreatePipelineLayout(VkDevice _device, layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout), VK_OBJECT_TYPE_PIPELINE_LAYOUT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; @@ -380,7 +361,7 @@ v3dv_CreatePipelineLayout(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyPipelineLayout(VkDevice _device, VkPipelineLayout _pipelineLayout, const VkAllocationCallbacks *pAllocator) @@ -393,7 +374,7 @@ v3dv_DestroyPipelineLayout(VkDevice _device, vk_object_free(&device->vk, pAllocator, pipeline_layout); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -435,7 +416,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0); descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; - bo_size += descriptor_bo_size(pCreateInfo->pPoolSizes[i].type) * + bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * pCreateInfo->pPoolSizes[i].descriptorCount; } @@ -452,7 +433,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { pool->host_memory_base = (uint8_t*)pool + sizeof(struct v3dv_descriptor_pool); @@ -482,7 +463,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, out_of_device_memory: vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } static void @@ -506,7 +487,7 @@ descriptor_set_destroy(struct v3dv_device *device, vk_object_free(&device->vk, NULL, set); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyDescriptorPool(VkDevice _device, VkDescriptorPool _pool, const VkAllocationCallbacks *pAllocator) @@ -531,7 +512,7 @@ v3dv_DestroyDescriptorPool(VkDevice _device, vk_object_free(&device->vk, pAllocator, pool); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetDescriptorPool(VkDevice _device, VkDescriptorPool descriptorPool, VkDescriptorPoolResetFlags flags) @@ -558,7 +539,7 @@ v3dv_ResetDescriptorPool(VkDevice _device, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -602,7 +583,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT); if (!set_layout) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* We just allocate all the immutable samplers at the end of the struct */ struct v3dv_sampler *samplers = (void*) &set_layout->binding[num_bindings]; @@ -614,7 +595,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, pCreateInfo->bindingCount, &bindings); if (result != VK_SUCCESS) { vk_object_free(&device->vk, pAllocator, set_layout); - return vk_error(device->instance, result); + return 
vk_error(device, result); } memset(set_layout->binding, 0, @@ -680,7 +661,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size; set_layout->bo_size += - descriptor_bo_size(set_layout->binding[binding_number].type) * + v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) * binding->descriptorCount; } @@ -694,7 +675,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyDescriptorSetLayout(VkDevice _device, VkDescriptorSetLayout _set_layout, const VkAllocationCallbacks *pAllocator) @@ -716,7 +697,7 @@ out_of_pool_memory(const struct v3dv_device *device, * by allocating a new pool, so they don't point to real issues. */ if (!pool->is_driver_internal) - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY) + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); else return VK_ERROR_OUT_OF_POOL_MEMORY; } @@ -745,7 +726,7 @@ descriptor_set_create(struct v3dv_device *device, VK_OBJECT_TYPE_DESCRIPTOR_SET); if (!set) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } set->pool = pool; @@ -818,15 +799,14 @@ descriptor_set_create(struct v3dv_device *device, for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { uint32_t combined_offset = layout->binding[b].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ? - offsetof(struct v3dv_combined_image_sampler_descriptor, sampler_state) : - 0; + v3dv_X(device, combined_image_sampler_sampler_state_offset)() : 0; - void *desc_map = descriptor_bo_map(set, &layout->binding[b], i); + void *desc_map = descriptor_bo_map(device, set, &layout->binding[b], i); desc_map += combined_offset; memcpy(desc_map, samplers[i].sampler_state, - cl_packet_length(SAMPLER_STATE)); + sizeof(samplers[i].sampler_state)); } } @@ -835,7 +815,7 @@ descriptor_set_create(struct v3dv_device *device, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateDescriptorSets(VkDevice _device, const VkDescriptorSetAllocateInfo *pAllocateInfo, VkDescriptorSet *pDescriptorSets) @@ -869,7 +849,7 @@ v3dv_AllocateDescriptorSets(VkDevice _device, return result; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_FreeDescriptorSets(VkDevice _device, VkDescriptorPool descriptorPool, uint32_t count, @@ -888,7 +868,8 @@ v3dv_FreeDescriptorSets(VkDevice _device, } static void -descriptor_bo_copy(struct v3dv_descriptor_set *dst_set, +descriptor_bo_copy(struct v3dv_device *device, + struct v3dv_descriptor_set *dst_set, const struct v3dv_descriptor_set_binding_layout *dst_binding_layout, uint32_t dst_array_index, struct v3dv_descriptor_set *src_set, @@ -897,31 +878,55 @@ descriptor_bo_copy(struct v3dv_descriptor_set *dst_set, { assert(dst_binding_layout->type == src_binding_layout->type); - void *dst_map = descriptor_bo_map(dst_set, dst_binding_layout, dst_array_index); - void *src_map = descriptor_bo_map(src_set, src_binding_layout, src_array_index); + void *dst_map = descriptor_bo_map(device, dst_set, dst_binding_layout, dst_array_index); + void *src_map = descriptor_bo_map(device, src_set, src_binding_layout, src_array_index); + + memcpy(dst_map, src_map, v3dv_X(device, descriptor_bo_size)(src_binding_layout->type)); +} + +static void +write_buffer_descriptor(struct v3dv_descriptor *descriptor, + VkDescriptorType desc_type, + const VkDescriptorBufferInfo *buffer_info) +{ + V3DV_FROM_HANDLE(v3dv_buffer, buffer, buffer_info->buffer); - 
memcpy(dst_map, src_map, descriptor_bo_size(src_binding_layout->type)); + descriptor->type = desc_type; + descriptor->buffer = buffer; + descriptor->offset = buffer_info->offset; + if (buffer_info->range == VK_WHOLE_SIZE) { + descriptor->range = buffer->size - buffer_info->offset; + } else { + assert(descriptor->range <= UINT32_MAX); + descriptor->range = buffer_info->range; + } } static void -write_image_descriptor(VkDescriptorType desc_type, +write_image_descriptor(struct v3dv_device *device, + struct v3dv_descriptor *descriptor, + VkDescriptorType desc_type, struct v3dv_descriptor_set *set, const struct v3dv_descriptor_set_binding_layout *binding_layout, struct v3dv_image_view *iview, struct v3dv_sampler *sampler, uint32_t array_index) { - void *desc_map = descriptor_bo_map(set, binding_layout, array_index); + descriptor->type = desc_type; + descriptor->sampler = sampler; + descriptor->image_view = iview; + + void *desc_map = descriptor_bo_map(device, set, + binding_layout, array_index); if (iview) { const uint32_t tex_state_index = - iview->type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || + iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1; memcpy(desc_map, iview->texture_shader_state[tex_state_index], sizeof(iview->texture_shader_state[0])); - desc_map += offsetof(struct v3dv_combined_image_sampler_descriptor, - sampler_state); + desc_map += v3dv_X(device, combined_image_sampler_sampler_state_offset)(); } if (sampler && !binding_layout->immutable_samplers_offset) { @@ -936,28 +941,33 @@ write_image_descriptor(VkDescriptorType desc_type, static void -write_buffer_view_descriptor(VkDescriptorType desc_type, +write_buffer_view_descriptor(struct v3dv_device *device, + struct v3dv_descriptor *descriptor, + VkDescriptorType desc_type, struct v3dv_descriptor_set *set, const struct v3dv_descriptor_set_binding_layout *binding_layout, struct v3dv_buffer_view *bview, uint32_t array_index) { - void *desc_map = descriptor_bo_map(set, binding_layout, array_index); - assert(bview); + descriptor->type = desc_type; + descriptor->buffer_view = bview; + + void *desc_map = descriptor_bo_map(device, set, binding_layout, array_index); memcpy(desc_map, bview->texture_shader_state, sizeof(bview->texture_shader_state)); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_UpdateDescriptorSets(VkDevice _device, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, uint32_t descriptorCopyCount, const VkCopyDescriptorSet *pDescriptorCopies) { + V3DV_FROM_HANDLE(v3dv_device, device, _device); for (uint32_t i = 0; i < descriptorWriteCount; i++) { const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i]; V3DV_FROM_HANDLE(v3dv_descriptor_set, set, writeset->dstSet); @@ -971,8 +981,6 @@ v3dv_UpdateDescriptorSets(VkDevice _device, descriptor += writeset->dstArrayElement; for (uint32_t j = 0; j < writeset->descriptorCount; ++j) { - descriptor->type = writeset->descriptorType; - switch(writeset->descriptorType) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: @@ -980,16 +988,8 @@ v3dv_UpdateDescriptorSets(VkDevice _device, case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { const VkDescriptorBufferInfo *buffer_info = writeset->pBufferInfo + j; - V3DV_FROM_HANDLE(v3dv_buffer, buffer, buffer_info->buffer); - - descriptor->buffer = buffer; - descriptor->offset = buffer_info->offset; - if (buffer_info->range == VK_WHOLE_SIZE) { - descriptor->range = buffer->size - buffer_info->offset; - } else { - assert(descriptor->range <= 
UINT32_MAX); - descriptor->range = buffer_info->range; - } + write_buffer_descriptor(descriptor, writeset->descriptorType, + buffer_info); break; } case VK_DESCRIPTOR_TYPE_SAMPLER: { @@ -999,10 +999,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device, */ const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j; V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler); - - descriptor->sampler = sampler; - - write_image_descriptor(writeset->descriptorType, + write_image_descriptor(device, descriptor, writeset->descriptorType, set, binding_layout, NULL, sampler, writeset->dstArrayElement + j); @@ -1013,10 +1010,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device, case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j; V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView); - - descriptor->image_view = iview; - - write_image_descriptor(writeset->descriptorType, + write_image_descriptor(device, descriptor, writeset->descriptorType, set, binding_layout, iview, NULL, writeset->dstArrayElement + j); @@ -1026,11 +1020,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device, const VkDescriptorImageInfo *image_info = writeset->pImageInfo + j; V3DV_FROM_HANDLE(v3dv_image_view, iview, image_info->imageView); V3DV_FROM_HANDLE(v3dv_sampler, sampler, image_info->sampler); - - descriptor->image_view = iview; - descriptor->sampler = sampler; - - write_image_descriptor(writeset->descriptorType, + write_image_descriptor(device, descriptor, writeset->descriptorType, set, binding_layout, iview, sampler, writeset->dstArrayElement + j); @@ -1040,12 +1030,7 @@ v3dv_UpdateDescriptorSets(VkDevice _device, case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { V3DV_FROM_HANDLE(v3dv_buffer_view, buffer_view, writeset->pTexelBufferView[j]); - - assert(buffer_view); - - descriptor->buffer_view = buffer_view; - - write_buffer_view_descriptor(writeset->descriptorType, + write_buffer_view_descriptor(device, descriptor, writeset->descriptorType, set, binding_layout, buffer_view, writeset->dstArrayElement + j); break; @@ -1086,8 +1071,9 @@ v3dv_UpdateDescriptorSets(VkDevice _device, dst_descriptor++; src_descriptor++; - if (descriptor_bo_size(src_binding_layout->type) > 0) { - descriptor_bo_copy(dst_set, dst_binding_layout, + if (v3dv_X(device, descriptor_bo_size)(src_binding_layout->type) > 0) { + descriptor_bo_copy(device, + dst_set, dst_binding_layout, j + copyset->dstArrayElement, src_set, src_binding_layout, j + copyset->srcArrayElement); @@ -1096,3 +1082,197 @@ v3dv_UpdateDescriptorSets(VkDevice _device, } } } + +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDescriptorSetLayoutSupport( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + VkDescriptorSetLayoutSupport *pSupport) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + VkDescriptorSetLayoutBinding *bindings = NULL; + VkResult result = vk_create_sorted_bindings( + pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings); + if (result != VK_SUCCESS) { + pSupport->supported = false; + return; + } + + bool supported = true; + + uint32_t desc_host_size = sizeof(struct v3dv_descriptor); + uint32_t host_size = sizeof(struct v3dv_descriptor_set); + uint32_t bo_size = 0; + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = bindings + i; + + if ((UINT32_MAX - host_size) / desc_host_size < binding->descriptorCount) { + supported = false; + break; + } + + uint32_t desc_bo_size = v3dv_X(device, descriptor_bo_size)(binding->descriptorType); + if 
(desc_bo_size > 0 && + (UINT32_MAX - bo_size) / desc_bo_size < binding->descriptorCount) { + supported = false; + break; + } + + host_size += binding->descriptorCount * desc_host_size; + bo_size += binding->descriptorCount * desc_bo_size; + } + + free(bindings); + + pSupport->supported = supported; +} + +VkResult +v3dv_CreateDescriptorUpdateTemplate( + VkDevice _device, + const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + struct v3dv_descriptor_update_template *template; + + size_t size = sizeof(*template) + + pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]); + template = vk_object_alloc(&device->vk, pAllocator, size, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); + if (template == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + template->bind_point = pCreateInfo->pipelineBindPoint; + + assert(pCreateInfo->templateType == + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET); + template->set = pCreateInfo->set; + + template->entry_count = pCreateInfo->descriptorUpdateEntryCount; + for (uint32_t i = 0; i < template->entry_count; i++) { + const VkDescriptorUpdateTemplateEntry *pEntry = + &pCreateInfo->pDescriptorUpdateEntries[i]; + + template->entries[i] = (struct v3dv_descriptor_template_entry) { + .type = pEntry->descriptorType, + .binding = pEntry->dstBinding, + .array_element = pEntry->dstArrayElement, + .array_count = pEntry->descriptorCount, + .offset = pEntry->offset, + .stride = pEntry->stride, + }; + } + + *pDescriptorUpdateTemplate = + v3dv_descriptor_update_template_to_handle(template); + + return VK_SUCCESS; +} + +void +v3dv_DestroyDescriptorUpdateTemplate( + VkDevice _device, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const VkAllocationCallbacks *pAllocator) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, + descriptorUpdateTemplate); + + if (!template) + return; + + vk_object_free(&device->vk, pAllocator, template); +} + +void +v3dv_UpdateDescriptorSetWithTemplate( + VkDevice _device, + VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const void *pData) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_descriptor_set, set, descriptorSet); + V3DV_FROM_HANDLE(v3dv_descriptor_update_template, template, + descriptorUpdateTemplate); + + for (int i = 0; i < template->entry_count; i++) { + const struct v3dv_descriptor_template_entry *entry = + &template->entries[i]; + + const struct v3dv_descriptor_set_binding_layout *binding_layout = + set->layout->binding + entry->binding; + + struct v3dv_descriptor *descriptor = + set->descriptors + + binding_layout->descriptor_index + + entry->array_element; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + pData + entry->offset + j * entry->stride; + write_buffer_descriptor(descriptor + j, entry->type, info); + } + break; + + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < 
entry->array_count; j++) { + const VkDescriptorImageInfo *info = + pData + entry->offset + j * entry->stride; + V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView); + V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler); + write_image_descriptor(device, descriptor + j, entry->type, + set, binding_layout, iview, sampler, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *_bview = + pData + entry->offset + j * entry->stride; + V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview); + write_buffer_view_descriptor(device, descriptor + j, entry->type, + set, binding_layout, bview, + entry->array_element + j); + } + break; + + default: + unreachable("Unsupported descriptor type"); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateSamplerYcbcrConversion( + VkDevice _device, + const VkSamplerYcbcrConversionCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSamplerYcbcrConversion *pYcbcrConversion) +{ + unreachable("Ycbcr sampler conversion is not supported"); + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_DestroySamplerYcbcrConversion( + VkDevice _device, + VkSamplerYcbcrConversion YcbcrConversion, + const VkAllocationCallbacks *pAllocator) +{ + unreachable("Ycbcr sampler conversion is not supported"); +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_device.c b/lib/mesa/src/broadcom/vulkan/v3dv_device.c index 496f93e28..de085bf09 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_device.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_device.c @@ -30,12 +30,17 @@ #include <unistd.h> #include <xf86drm.h> +#ifdef MAJOR_IN_MKDEV +#include <sys/mkdev.h> +#endif +#ifdef MAJOR_IN_SYSMACROS +#include <sys/sysmacros.h> +#endif + #include "v3dv_private.h" #include "common/v3d_debug.h" -#include "broadcom/cle/v3dx_pack.h" - #include "compiler/v3d_compiler.h" #include "drm-uapi/v3d_drm.h" @@ -61,34 +66,96 @@ #include "drm-uapi/i915_drm.h" #endif -static void * -default_alloc_func(void *pUserData, size_t size, size_t align, - VkSystemAllocationScope allocationScope) -{ - return malloc(size); -} +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION) -static void * -default_realloc_func(void *pUserData, void *pOriginal, size_t size, - size_t align, VkSystemAllocationScope allocationScope) +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) { - return realloc(pOriginal, size); + *pApiVersion = V3DV_API_VERSION; + return VK_SUCCESS; } -static void -default_free_func(void *pUserData, void *pMemory) -{ - free(pMemory); -} +#if defined(VK_USE_PLATFORM_WIN32_KHR) || \ + defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || \ + defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define V3DV_USE_WSI_PLATFORM +#endif -static const VkAllocationCallbacks default_alloc = { - .pUserData = NULL, - .pfnAllocation = default_alloc_func, - .pfnReallocation = default_realloc_func, - .pfnFree = default_free_func, +static const struct vk_instance_extension_table instance_extensions = { + .KHR_device_group_creation = true, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, +#endif + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, +#ifdef V3DV_USE_WSI_PLATFORM + 
.KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif + .EXT_debug_report = true, }; -VkResult +static void +get_device_extensions(const struct v3dv_physical_device *device, + struct vk_device_extension_table *ext) +{ + *ext = (struct vk_device_extension_table) { + .KHR_bind_memory2 = true, + .KHR_copy_commands2 = true, + .KHR_dedicated_allocation = true, + .KHR_device_group = true, + .KHR_descriptor_update_template = true, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + .KHR_external_semaphore = true, + .KHR_external_semaphore_fd = true, + .KHR_get_memory_requirements2 = true, + .KHR_image_format_list = true, + .KHR_relaxed_block_layout = true, + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_multiview = true, + .KHR_shader_non_semantic_info = true, + .KHR_sampler_mirror_clamp_to_edge = true, + .KHR_storage_buffer_storage_class = true, + .KHR_uniform_buffer_standard_layout = true, +#ifdef V3DV_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_incremental_present = true, +#endif + .KHR_variable_pointers = true, + .EXT_color_write_enable = true, + .EXT_custom_border_color = true, + .EXT_external_memory_dma_buf = true, + .EXT_index_type_uint8 = true, + .EXT_physical_device_drm = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_private_data = true, + .EXT_provoking_vertex = true, + .EXT_vertex_attribute_divisor = true, + }; +} + +VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties) @@ -98,10 +165,10 @@ v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName, return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); return vk_enumerate_instance_extension_properties( - &v3dv_instance_extensions_supported, pPropertyCount, pProperties); + &instance_extensions, pPropertyCount, pProperties); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkInstance *pInstance) @@ -112,25 +179,27 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO); if (pAllocator == NULL) - pAllocator = &default_alloc; + pAllocator = vk_default_allocator(); - instance = vk_alloc2(&default_alloc, pAllocator, sizeof(*instance), 8, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (!instance) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); struct vk_instance_dispatch_table dispatch_table; vk_instance_dispatch_table_from_entrypoints( &dispatch_table, &v3dv_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); result = vk_instance_init(&instance->vk, - &v3dv_instance_extensions_supported, + &instance_extensions, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { vk_free(pAllocator, instance); - return vk_error(instance, result); + return vk_error(NULL, result); } v3d_process_debug_variable(); @@ -208,7 +277,7 @@ 
physical_device_finish(struct v3dv_physical_device *device) mtx_destroy(&device->mutex); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyInstance(VkInstance _instance, const VkAllocationCallbacks *pAllocator) { @@ -550,14 +619,14 @@ init_uuids(struct v3dv_physical_device *device) const struct build_id_note *note = build_id_find_nhdr_for_addr(init_uuids); if (!note) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "Failed to find build-id"); } unsigned build_id_len = build_id_length(note); if (build_id_len < 20) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "build-id too short. It needs to be a SHA"); } @@ -627,6 +696,8 @@ physical_device_init(struct v3dv_physical_device *device, struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints (&dispatch_table, &v3dv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); result = vk_physical_device_init(&device->vk, &instance->vk, NULL, &dispatch_table); @@ -648,17 +719,48 @@ physical_device_init(struct v3dv_physical_device *device, * we postpone that until a swapchain is created. */ + const char *primary_path; +#if !using_v3d_simulator + if (drm_primary_device) + primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; + else + primary_path = NULL; +#else + primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; +#endif + + struct stat primary_stat = {0}, render_stat = {0}; + + device->has_primary = primary_path; + if (device->has_primary) { + if (stat(primary_path, &primary_stat) != 0) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "failed to stat DRM primary node %s", + primary_path); + goto fail; + } + + device->primary_devid = primary_stat.st_rdev; + } + + if (fstat(render_fd, &render_stat) != 0) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "failed to stat DRM render node %s", + path); + goto fail; + } + device->has_render = true; + device->render_devid = render_stat.st_rdev; + if (instance->vk.enabled_extensions.KHR_display) { #if !using_v3d_simulator /* Open the primary node on the vc4 display device */ assert(drm_primary_device); - const char *primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; master_fd = open(primary_path, O_RDWR | O_CLOEXEC); #else /* There is only one device with primary and render nodes. * Open its primary node. 
*/ - const char *primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; master_fd = open(primary_path, O_RDWR | O_CLOEXEC); #endif } @@ -722,8 +824,7 @@ physical_device_init(struct v3dv_physical_device *device, goto fail; } - v3dv_physical_device_get_supported_extensions(device, - &device->vk.supported_extensions); + get_device_extensions(device, &device->vk.supported_extensions); pthread_mutex_init(&device->mutex, NULL); @@ -835,7 +936,7 @@ instance_ensure_physical_device(struct v3dv_instance *instance) return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumeratePhysicalDevices(VkInstance _instance, uint32_t *pPhysicalDeviceCount, VkPhysicalDevice *pPhysicalDevices) @@ -858,7 +959,37 @@ v3dv_EnumeratePhysicalDevices(VkInstance _instance, return vk_outarray_status(&out); } -void +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_EnumeratePhysicalDeviceGroups( + VkInstance _instance, + uint32_t *pPhysicalDeviceGroupCount, + VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties) +{ + V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); + VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties, + pPhysicalDeviceGroupCount); + + VkResult result = instance_ensure_physical_device(instance); + if (result != VK_SUCCESS) + return result; + + assert(instance->physicalDeviceCount == 1); + + vk_outarray_append(&out, p) { + p->physicalDeviceCount = 1; + memset(p->physicalDevices, 0, sizeof(p->physicalDevices)); + p->physicalDevices[0] = + v3dv_physical_device_to_handle(&instance->physicalDevice); + p->subsetAllocation = false; + + vk_foreach_struct(ext, p->pNext) + v3dv_debug_ignored_stype(ext->sType); + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures *pFeatures) { @@ -869,7 +1000,7 @@ v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ .imageCubeArray = true, .independentBlend = true, - .geometryShader = false, + .geometryShader = true, .tessellationShader = false, .sampleRateShading = true, .dualSrcBlend = false, @@ -886,7 +1017,7 @@ v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, .multiViewport = false, .samplerAnisotropy = true, .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = false, + .textureCompressionASTC_LDR = true, /* Note that textureCompressionBC requires that the driver support all * the BC formats. V3D 4.2 only supports BC1-3, so we can't claim * that we support it.
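The two hunks just above flip the geometryShader and textureCompressionASTC_LDR bits of VkPhysicalDeviceFeatures from false to true for V3D 4.2. As a hedged illustration (not part of this patch): an application would confirm these newly advertised features through the core Vulkan 1.0 query before relying on them; the helper name below is hypothetical.

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Check the two feature bits enabled by the hunks above on a
 * physical device obtained from vkEnumeratePhysicalDevices(). */
static bool
has_v3dv_21_3_features(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceFeatures features;
   vkGetPhysicalDeviceFeatures(physical_device, &features);
   return features.geometryShader &&
          features.textureCompressionASTC_LDR;
}

Features confirmed this way must still be enabled at device creation, either through pEnabledFeatures or a VkPhysicalDeviceFeatures2 chained into VkDeviceCreateInfo, which is the path handled by the v3dv_CreateDevice changes further below.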
@@ -896,7 +1027,7 @@ v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, .pipelineStatisticsQuery = false, .vertexPipelineStoresAndAtomics = true, .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = false, + .shaderTessellationAndGeometryPointSize = true, .shaderImageGatherExtended = false, .shaderStorageImageExtendedFormats = true, .shaderStorageImageMultisample = false, @@ -927,14 +1058,45 @@ v3dv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, }; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2 *pFeatures) { v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); + VkPhysicalDeviceVulkan11Features vk11 = { + .storageBuffer16BitAccess = false, + .uniformAndStorageBuffer16BitAccess = false, + .storagePushConstant16 = false, + .storageInputOutput16 = false, + .multiview = true, + .multiviewGeometryShader = false, + .multiviewTessellationShader = false, + .variablePointersStorageBuffer = true, + /* FIXME: this needs support for non-constant index on UBO/SSBO */ + .variablePointers = false, + .protectedMemory = false, + .samplerYcbcrConversion = false, + .shaderDrawParameters = false, + }; + vk_foreach_struct(ext, pFeatures->pNext) { switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { + VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = + (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext; + features->customBorderColors = true; + features->customBorderColorWithoutFormat = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: { + VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features = + (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext; + features->uniformBufferStandardLayout = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: { VkPhysicalDevicePrivateDataFeaturesEXT *features = (VkPhysicalDevicePrivateDataFeaturesEXT *)ext; @@ -942,6 +1104,87 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { + VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = + (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; + features->indexTypeUint8 = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { + VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext; + features->colorWriteEnable = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: { + VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features = (void *) ext; + features->pipelineCreationCacheControl = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { + VkPhysicalDeviceProvokingVertexFeaturesEXT *features = (void *) ext; + features->provokingVertexLast = true; + /* FIXME: update when supporting EXT_transform_feedback */ + features->transformFeedbackPreservesProvokingVertex = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = + (void *) ext; + features->vertexAttributeInstanceRateDivisor = true; + features->vertexAttributeInstanceRateZeroDivisor = false; + break; + } + + /* Vulkan 1.1 */ + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { + 
VkPhysicalDeviceVulkan11Features *features = + (VkPhysicalDeviceVulkan11Features *)ext; + memcpy(features, &vk11, sizeof(VkPhysicalDeviceVulkan11Features)); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { + VkPhysicalDevice16BitStorageFeatures *features = (void *) ext; + features->storageBuffer16BitAccess = vk11.storageBuffer16BitAccess; + features->uniformAndStorageBuffer16BitAccess = + vk11.uniformAndStorageBuffer16BitAccess; + features->storagePushConstant16 = vk11.storagePushConstant16; + features->storageInputOutput16 = vk11.storageInputOutput16; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { + VkPhysicalDeviceMultiviewFeatures *features = (void *) ext; + features->multiview = vk11.multiview; + features->multiviewGeometryShader = vk11.multiviewGeometryShader; + features->multiviewTessellationShader = vk11.multiviewTessellationShader; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { + VkPhysicalDeviceProtectedMemoryFeatures *features = (void *) ext; + features->protectedMemory = vk11.protectedMemory; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { + VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = (void *) ext; + features->samplerYcbcrConversion = vk11.samplerYcbcrConversion; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { + VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *) ext; + features->shaderDrawParameters = vk11.shaderDrawParameters; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { + VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext; + features->variablePointersStorageBuffer = + vk11.variablePointersStorageBuffer; + features->variablePointers = vk11.variablePointers; + break; + } + default: v3dv_debug_ignored_stype(ext->sType); break; @@ -949,6 +1192,20 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, } } +VKAPI_ATTR void VKAPI_CALL +v3dv_GetDeviceGroupPeerMemoryFeatures(VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlags *pPeerMemoryFeatures) +{ + assert(localDeviceIndex == 0 && remoteDeviceIndex == 0); + *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; +} + uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev) { @@ -987,11 +1244,16 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) return devid; #else - return dev->devinfo.ver; + switch (dev->devinfo.ver) { + case 42: + return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + default: + unreachable("Unsupported V3D version"); + } #endif } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties *pProperties) { @@ -1009,7 +1271,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, const uint32_t v3d_coord_shift = 6; - const uint32_t v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); + const float v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); const uint32_t max_fb_size = 4096; const VkSampleCountFlags supported_sample_counts = @@ -1028,8 +1290,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxImageDimensionCube = 4096, .maxImageArrayLayers = 2048, .maxTexelBufferElements = (1ul << 28), - 
.maxUniformBufferRange = (1ul << 27), - .maxStorageBufferRange = (1ul << 27), + .maxUniformBufferRange = V3D_MAX_BUFFER_RANGE, + .maxStorageBufferRange = V3D_MAX_BUFFER_RANGE, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = mem_size / page_size, .maxSamplerAllocationCount = 64 * 1024, @@ -1075,11 +1337,11 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxTessellationEvaluationOutputComponents = 0, /* Geometry limits */ - .maxGeometryShaderInvocations = 0, - .maxGeometryInputComponents = 0, - .maxGeometryOutputComponents = 0, - .maxGeometryOutputVertices = 0, - .maxGeometryTotalOutputComponents = 0, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = 64, + .maxGeometryOutputComponents = 64, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, /* Fragment limits */ .maxFragmentInputComponents = max_varying_components, @@ -1108,7 +1370,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, 2.0 * max_fb_size - 1 }, .viewportSubPixelBits = 0, .minMemoryMapAlignment = page_size, - .minTexelBufferOffsetAlignment = VC5_UIFBLOCK_SIZE, + .minTexelBufferOffsetAlignment = V3D_UIFBLOCK_SIZE, .minUniformBufferOffsetAlignment = 32, .minStorageBufferOffsetAlignment = 32, .minTexelOffset = -8, @@ -1151,7 +1413,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, }; *pProperties = (VkPhysicalDeviceProperties) { - .apiVersion = v3dv_physical_device_api_version(pdevice), + .apiVersion = V3DV_API_VERSION, .driverVersion = vk_get_driver_version(), .vendorID = v3dv_physical_device_vendor_id(pdevice), .deviceID = v3dv_physical_device_device_id(pdevice), @@ -1166,7 +1428,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, pdevice->pipeline_cache_uuid, VK_UUID_SIZE); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2 *pProperties) { @@ -1176,6 +1438,26 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, vk_foreach_struct(ext, pProperties->pNext) { switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { + VkPhysicalDeviceCustomBorderColorPropertiesEXT *props = + (VkPhysicalDeviceCustomBorderColorPropertiesEXT *)ext; + props->maxCustomBorderColorSamplers = V3D_MAX_TEXTURE_SAMPLERS; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { + VkPhysicalDeviceProvokingVertexPropertiesEXT *props = + (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext; + props->provokingVertexModePerPipeline = true; + /* FIXME: update when supporting EXT_transform_feedback */ + props->transformFeedbackPreservesTriangleFanProvokingVertex = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props = + (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; + props->maxVertexAttribDivisor = 0xffff; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { VkPhysicalDeviceIDProperties *id_props = (VkPhysicalDeviceIDProperties *)ext; @@ -1185,11 +1467,78 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, id_props->deviceLUIDValid = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { + VkPhysicalDeviceDrmPropertiesEXT *props = + (VkPhysicalDeviceDrmPropertiesEXT *)ext; + props->hasPrimary = pdevice->has_primary; + if (props->hasPrimary) { + props->primaryMajor = 
(int64_t) major(pdevice->primary_devid); + props->primaryMinor = (int64_t) minor(pdevice->primary_devid); + } + props->hasRender = pdevice->has_render; + if (props->hasRender) { + props->renderMajor = (int64_t) major(pdevice->render_devid); + props->renderMinor = (int64_t) minor(pdevice->render_devid); + } + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { + VkPhysicalDeviceMaintenance3Properties *props = + (VkPhysicalDeviceMaintenance3Properties *)ext; + /* We don't really have special restrictions for the maximum + * descriptors per set, other than maybe not exceeding the limits + * of addressable memory in a single allocation on either the host + * or the GPU. This will be a much larger limit than any of the + * per-stage limits already available in Vulkan though, so in practice, + * it is not expected to limit anything beyond what is already + * constrained through per-stage limits. + */ + uint32_t max_host_descriptors = + (UINT32_MAX - sizeof(struct v3dv_descriptor_set)) / + sizeof(struct v3dv_descriptor); + uint32_t max_gpu_descriptors = + (UINT32_MAX / v3dv_X(pdevice, max_descriptor_bo_size)()); + props->maxPerSetDescriptors = + MIN2(max_host_descriptors, max_gpu_descriptors); + + /* Minimum required by the spec */ + props->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { + VkPhysicalDeviceMultiviewProperties *props = + (VkPhysicalDeviceMultiviewProperties *)ext; + props->maxMultiviewViewCount = MAX_MULTIVIEW_VIEW_COUNT; + props->maxMultiviewInstanceIndex = UINT32_MAX - 1; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: /* Do nothing, not even logging. This is a non-PCI device, so we will * never provide this extension. 
*/ break; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { + VkPhysicalDevicePointClippingProperties *props = + (VkPhysicalDevicePointClippingProperties *)ext; + props->pointClippingBehavior = + VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { + VkPhysicalDeviceProtectedMemoryProperties *props = + (VkPhysicalDeviceProtectedMemoryProperties *)ext; + props->protectedNoFault = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { + VkPhysicalDeviceSubgroupProperties *props = + (VkPhysicalDeviceSubgroupProperties *)ext; + props->subgroupSize = V3D_CHANNELS; + props->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT; + props->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT; + props->quadOperationsInAllStages = false; + break; + } default: v3dv_debug_ignored_stype(ext->sType); break; @@ -1208,7 +1557,7 @@ v3dv_queue_family_properties = { .minImageTransferGranularity = { 1, 1, 1 }, }; -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice physicalDevice, uint32_t *pCount, VkQueueFamilyProperties *pQueueFamilyProperties) @@ -1220,7 +1569,7 @@ v3dv_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice physicalDevice, } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceQueueFamilyProperties2(VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties2 *pQueueFamilyProperties) @@ -1236,7 +1585,7 @@ v3dv_GetPhysicalDeviceQueueFamilyProperties2(VkPhysicalDevice physicalDevice, } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceMemoryProperties(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties *pMemoryProperties) { @@ -1244,7 +1593,7 @@ v3dv_GetPhysicalDeviceMemoryProperties(VkPhysicalDevice physicalDevice, *pMemoryProperties = device->memory; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) { @@ -1260,7 +1609,7 @@ v3dv_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice physicalDevice, } } -PFN_vkVoidFunction +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_GetInstanceProcAddr(VkInstance _instance, const char *pName) { @@ -1303,7 +1652,7 @@ vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, VkLayerProperties *pProperties) { @@ -1315,7 +1664,7 @@ v3dv_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, uint32_t *pPropertyCount, VkLayerProperties *pProperties) @@ -1327,16 +1676,19 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return VK_SUCCESS; } - return vk_error((struct v3dv_instance*) physical_device->vk.instance, - VK_ERROR_LAYER_NOT_PRESENT); + return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); } static VkResult -queue_init(struct v3dv_device *device, struct v3dv_queue *queue) +queue_init(struct v3dv_device *device, struct v3dv_queue *queue, + const VkDeviceQueueCreateInfo *create_info, + uint32_t index_in_family) { - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, + 
index_in_family); + if (result != VK_SUCCESS) + return result; queue->device = device; - queue->flags = 0; queue->noop_job = NULL; list_inithead(&queue->submit_wait_list); pthread_mutex_init(&queue->mutex, NULL); @@ -1346,7 +1698,7 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue) static void queue_finish(struct v3dv_queue *queue) { - vk_object_base_finish(&queue->base); + vk_queue_finish(&queue->vk); assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); @@ -1371,7 +1723,7 @@ destroy_device_meta(struct v3dv_device *device) v3dv_meta_texel_buffer_copy_finish(device); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -1384,19 +1736,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); - /* Check enabled features */ - if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(instance, VK_ERROR_FEATURE_NOT_PRESENT); - } - } - /* Check requested queues (we only expose one queue) */ assert(pCreateInfo->queueCreateInfoCount == 1); for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { @@ -1415,11 +1754,13 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, struct vk_device_dispatch_table dispatch_table; vk_device_dispatch_table_from_entrypoints(&dispatch_table, &v3dv_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { vk_free(&device->vk.alloc, device); - return vk_error(instance, result); + return vk_error(NULL, result); } device->instance = instance; @@ -1432,20 +1773,31 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, pthread_mutex_init(&device->mutex, NULL); - result = queue_init(device, &device->queue); + result = queue_init(device, &device->queue, + pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) goto fail; device->devinfo = physical_device->devinfo; - if (pCreateInfo->pEnabledFeatures) { + /* Vulkan 1.1 and VK_KHR_get_physical_device_properties2 added + * VkPhysicalDeviceFeatures2 which can be used in the pNext chain of + * VkDeviceCreateInfo, in which case it should be used instead of + * pEnabledFeatures.
+ */ + const VkPhysicalDeviceFeatures2 *features2 = + vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_FEATURES_2); + if (features2) { + memcpy(&device->features, &features2->features, + sizeof(device->features)); + } else if (pCreateInfo->pEnabledFeatures) { memcpy(&device->features, pCreateInfo->pEnabledFeatures, sizeof(device->features)); - - if (device->features.robustBufferAccess) - perf_debug("Device created with Robust Buffer Access enabled.\n"); } + if (device->features.robustBufferAccess) + perf_debug("Device created with Robust Buffer Access enabled.\n"); + int ret = drmSyncobjCreate(physical_device->render_fd, DRM_SYNCOBJ_CREATE_SIGNALED, &device->last_job_sync); @@ -1454,9 +1806,12 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, goto fail; } +#ifdef DEBUG + v3dv_X(device, device_check_prepacked_sizes)(); +#endif init_device_meta(device); v3dv_bo_cache_init(device); - v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, + v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = v3dv_pipeline_create_default_attribute_values(device, NULL); @@ -1472,7 +1827,7 @@ fail: return result; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) { @@ -1496,24 +1851,10 @@ v3dv_DestroyDevice(VkDevice _device, v3dv_bo_cache_destroy(device); vk_device_finish(&device->vk); - vk_free2(&default_alloc, pAllocator, device); + vk_free2(&device->vk.alloc, pAllocator, device); } -void -v3dv_GetDeviceQueue(VkDevice _device, - uint32_t queueFamilyIndex, - uint32_t queueIndex, - VkQueue *pQueue) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(queueIndex == 0); - assert(queueFamilyIndex == 0); - - *pQueue = v3dv_queue_to_handle(&device->queue); -} - -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_DeviceWaitIdle(VkDevice _device) { V3DV_FROM_HANDLE(v3dv_device, device, _device); @@ -1526,8 +1867,7 @@ device_alloc(struct v3dv_device *device, VkDeviceSize size) { /* Our kernel interface is 32-bit */ - if (size > UINT32_MAX) - return VK_ERROR_OUT_OF_DEVICE_MEMORY; + assert(size <= UINT32_MAX); mem->bo = v3dv_bo_alloc(device, size, "device_alloc", false); if (!mem->bo) @@ -1546,7 +1886,9 @@ device_free_wsi_dumb(int32_t display_fd, int32_t dumb_handle) struct drm_mode_destroy_dumb destroy_dumb = { .handle = dumb_handle, }; - v3dv_ioctl(display_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb); + if (v3dv_ioctl(display_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb)) { + fprintf(stderr, "destroy dumb object %d: %s\n", dumb_handle, strerror(errno)); + } } static void @@ -1724,7 +2066,7 @@ fail_create: #endif } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, @@ -1759,6 +2101,22 @@ v3dv_AllocateMemory(VkDevice _device, case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR: fd_info = (void *)ext; break; + case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: + /* We don't support VK_KHR_buffer_device_address or multiple + * devices per device group, so we can ignore this. + */ + break; + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR: + /* We don't have particular optimizations associated with memory + * allocations that won't be suballocated to multiple resources. 
+ */ + break; + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR: + /* The mask of handle types specified here must be supported + * according to VkExternalImageFormatProperties, so it must be + * fd or dmabuf, which don't have special requirements for us. + */ + break; default: v3dv_debug_ignored_stype(ext->sType); break; @@ -1766,32 +2124,40 @@ v3dv_AllocateMemory(VkDevice _device, } VkResult result = VK_SUCCESS; - if (wsi_info) { - result = device_alloc_for_wsi(device, pAllocator, mem, - pAllocateInfo->allocationSize); - } else if (fd_info && fd_info->handleType) { - assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || - fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - result = device_import_bo(device, pAllocator, - fd_info->fd, pAllocateInfo->allocationSize, - &mem->bo); - mem->has_bo_ownership = false; - if (result == VK_SUCCESS) - close(fd_info->fd); + + /* We always allocate device memory in multiples of a page, so round up + * requested size to that. + */ + VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096); + + if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) { + result = VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { - result = device_alloc(device, mem, pAllocateInfo->allocationSize); + if (wsi_info) { + result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); + } else if (fd_info && fd_info->handleType) { + assert(fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + result = device_import_bo(device, pAllocator, + fd_info->fd, alloc_size, &mem->bo); + mem->has_bo_ownership = false; + if (result == VK_SUCCESS) + close(fd_info->fd); + } else { + result = device_alloc(device, mem, alloc_size); + } } if (result != VK_SUCCESS) { vk_object_free(&device->vk, pAllocator, mem); - return vk_error(device->instance, result); + return vk_error(device, result); } *pMem = v3dv_device_memory_to_handle(mem); return result; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_FreeMemory(VkDevice _device, VkDeviceMemory _mem, const VkAllocationCallbacks *pAllocator) @@ -1810,7 +2176,7 @@ v3dv_FreeMemory(VkDevice _device, vk_object_free(&device->vk, pAllocator, mem); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_MapMemory(VkDevice _device, VkDeviceMemory _memory, VkDeviceSize offset, @@ -1835,13 +2201,13 @@ v3dv_MapMemory(VkDevice _device, */ VkResult result = device_map(device, mem); if (result != VK_SUCCESS) - return vk_error(device->instance, result); + return vk_error(device, result); *ppData = ((uint8_t *) mem->bo->map) + offset; return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_UnmapMemory(VkDevice _device, VkDeviceMemory _memory) { @@ -1854,7 +2220,7 @@ v3dv_UnmapMemory(VkDevice _device, device_unmap(device, mem); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_FlushMappedMemoryRanges(VkDevice _device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges) @@ -1862,7 +2228,7 @@ v3dv_FlushMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_InvalidateMappedMemoryRanges(VkDevice _device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges) @@ -1870,28 +2236,40 @@ v3dv_InvalidateMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -void -v3dv_GetImageMemoryRequirements(VkDevice _device, - VkImage _image, - VkMemoryRequirements *pMemoryRequirements) +VKAPI_ATTR void VKAPI_CALL +v3dv_GetImageMemoryRequirements2(VkDevice device, + const 
VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) { - V3DV_FROM_HANDLE(v3dv_image, image, _image); + V3DV_FROM_HANDLE(v3dv_image, image, pInfo->image); - assert(image->size > 0); + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { + .memoryTypeBits = 0x1, + .alignment = image->alignment, + .size = image->size + }; - pMemoryRequirements->size = image->size; - pMemoryRequirements->alignment = image->alignment; - pMemoryRequirements->memoryTypeBits = 0x1; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *req = + (VkMemoryDedicatedRequirements *) ext; + req->requiresDedicatedAllocation = image->vk.external_handle_types != 0; + req->prefersDedicatedAllocation = image->vk.external_handle_types != 0; + break; + } + default: + v3dv_debug_ignored_stype(ext->sType); + break; + } + } } -VkResult -v3dv_BindImageMemory(VkDevice _device, - VkImage _image, - VkDeviceMemory _memory, - VkDeviceSize memoryOffset) +static void +bind_image_memory(const VkBindImageMemoryInfo *info) { - V3DV_FROM_HANDLE(v3dv_device_memory, mem, _memory); - V3DV_FROM_HANDLE(v3dv_image, image, _image); + V3DV_FROM_HANDLE(v3dv_image, image, info->image); + V3DV_FROM_HANDLE(v3dv_device_memory, mem, info->memory); /* Valid usage: * @@ -1899,36 +2277,75 @@ v3dv_BindImageMemory(VkDevice _device, * the VkMemoryRequirements structure returned from a call to * vkGetImageMemoryRequirements with image" */ - assert(memoryOffset % image->alignment == 0); - assert(memoryOffset < mem->bo->size); + assert(info->memoryOffset % image->alignment == 0); + assert(info->memoryOffset < mem->bo->size); image->mem = mem; - image->mem_offset = memoryOffset; + image->mem_offset = info->memoryOffset; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_BindImageMemory2(VkDevice _device, + uint32_t bindInfoCount, + const VkBindImageMemoryInfo *pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; i++) { + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + vk_find_struct_const(pBindInfos->pNext, + BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain) { + struct v3dv_image *swapchain_image = + v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, + swapchain_info->imageIndex); + VkBindImageMemoryInfo swapchain_bind = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = pBindInfos[i].image, + .memory = v3dv_device_memory_to_handle(swapchain_image->mem), + .memoryOffset = swapchain_image->mem_offset, + }; + bind_image_memory(&swapchain_bind); + } else { + bind_image_memory(&pBindInfos[i]); + } + } return VK_SUCCESS; } -void -v3dv_GetBufferMemoryRequirements(VkDevice _device, - VkBuffer _buffer, - VkMemoryRequirements* pMemoryRequirements) +VKAPI_ATTR void VKAPI_CALL +v3dv_GetBufferMemoryRequirements2(VkDevice device, + const VkBufferMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) { - V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { + .memoryTypeBits = 0x1, + .alignment = buffer->alignment, + .size = align64(buffer->size, buffer->alignment), + }; - pMemoryRequirements->memoryTypeBits = 0x1; - pMemoryRequirements->alignment = buffer->alignment; - pMemoryRequirements->size = - align64(buffer->size, pMemoryRequirements->alignment); + vk_foreach_struct(ext, pMemoryRequirements->pNext) { 
+ switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *req = + (VkMemoryDedicatedRequirements *) ext; + req->requiresDedicatedAllocation = false; + req->prefersDedicatedAllocation = false; + break; + } + default: + v3dv_debug_ignored_stype(ext->sType); + break; + } + } } -VkResult -v3dv_BindBufferMemory(VkDevice _device, - VkBuffer _buffer, - VkDeviceMemory _memory, - VkDeviceSize memoryOffset) +static void +bind_buffer_memory(const VkBindBufferMemoryInfo *info) { - V3DV_FROM_HANDLE(v3dv_device_memory, mem, _memory); - V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); + V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->buffer); + V3DV_FROM_HANDLE(v3dv_device_memory, mem, info->memory); /* Valid usage: * @@ -1936,16 +2353,26 @@ v3dv_BindBufferMemory(VkDevice _device, * the VkMemoryRequirements structure returned from a call to * vkGetBufferMemoryRequirements with buffer" */ - assert(memoryOffset % buffer->alignment == 0); - assert(memoryOffset < mem->bo->size); + assert(info->memoryOffset % buffer->alignment == 0); + assert(info->memoryOffset < mem->bo->size); buffer->mem = mem; - buffer->mem_offset = memoryOffset; + buffer->mem_offset = info->memoryOffset; +} + + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_BindBufferMemory2(VkDevice device, + uint32_t bindInfoCount, + const VkBindBufferMemoryInfo *pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; i++) + bind_buffer_memory(&pBindInfos[i]); return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateBuffer(VkDevice _device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -1963,7 +2390,7 @@ v3dv_CreateBuffer(VkDevice _device, buffer = vk_object_zalloc(&device->vk, pAllocator, sizeof(*buffer), VK_OBJECT_TYPE_BUFFER); if (buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); buffer->size = pCreateInfo->size; buffer->usage = pCreateInfo->usage; @@ -1979,7 +2406,7 @@ v3dv_CreateBuffer(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyBuffer(VkDevice _device, VkBuffer _buffer, const VkAllocationCallbacks *pAllocator) @@ -1993,67 +2420,7 @@ v3dv_DestroyBuffer(VkDevice _device, vk_object_free(&device->vk, pAllocator, buffer); } -/** - * This computes the maximum bpp used by any of the render targets used by - * a particular subpass and checks if any of those render targets are - * multisampled. If we don't have a subpass (when we are not inside a - * render pass), then we assume that all framebuffer attachments are used. 
- */ -void -v3dv_framebuffer_compute_internal_bpp_msaa( - const struct v3dv_framebuffer *framebuffer, - const struct v3dv_subpass *subpass, - uint8_t *max_bpp, - bool *msaa) -{ - STATIC_ASSERT(RENDER_TARGET_MAXIMUM_32BPP == 0); - *max_bpp = RENDER_TARGET_MAXIMUM_32BPP; - *msaa = false; - - if (subpass) { - for (uint32_t i = 0; i < subpass->color_count; i++) { - uint32_t att_idx = subpass->color_attachments[i].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) - continue; - - const struct v3dv_image_view *att = framebuffer->attachments[att_idx]; - assert(att); - - if (att->aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); - - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) - *msaa = true; - } - - if (!*msaa && subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { - const struct v3dv_image_view *att = - framebuffer->attachments[subpass->ds_attachment.attachment]; - assert(att); - - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) - *msaa = true; - } - - return; - } - - assert(framebuffer->attachment_count <= 4); - for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { - const struct v3dv_image_view *att = framebuffer->attachments[i]; - assert(att); - - if (att->aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->internal_bpp); - - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) - *msaa = true; - } - - return; -} - -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateFramebuffer(VkDevice _device, const VkFramebufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2069,7 +2436,7 @@ v3dv_CreateFramebuffer(VkDevice _device, framebuffer = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_FRAMEBUFFER); if (framebuffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; @@ -2081,7 +2448,7 @@ v3dv_CreateFramebuffer(VkDevice _device, for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { framebuffer->attachments[i] = v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); - if (framebuffer->attachments[i]->aspects & VK_IMAGE_ASPECT_COLOR_BIT) + if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) framebuffer->color_attachment_count++; } @@ -2090,7 +2457,7 @@ v3dv_CreateFramebuffer(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyFramebuffer(VkDevice _device, VkFramebuffer _fb, const VkAllocationCallbacks *pAllocator) @@ -2104,7 +2471,7 @@ v3dv_DestroyFramebuffer(VkDevice _device, vk_object_free(&device->vk, pAllocator, fb); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, int fd, @@ -2119,11 +2486,11 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, (1 << pdevice->memory.memoryTypeCount) - 1; return VK_SUCCESS; default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd) @@ -2140,14 +2507,14 @@ v3dv_GetMemoryFdKHR(VkDevice _device, mem->bo->handle, DRM_CLOEXEC, &fd); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); *pFd = fd; return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL 
v3dv_CreateEvent(VkDevice _device, const VkEventCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2158,7 +2525,7 @@ v3dv_CreateEvent(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*event), VK_OBJECT_TYPE_EVENT); if (!event) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* Events are created in the unsignaled state */ event->state = false; @@ -2167,7 +2534,7 @@ v3dv_CreateEvent(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyEvent(VkDevice _device, VkEvent _event, const VkAllocationCallbacks *pAllocator) @@ -2181,14 +2548,14 @@ v3dv_DestroyEvent(VkDevice _device, vk_object_free(&device->vk, pAllocator, event); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetEventStatus(VkDevice _device, VkEvent _event) { V3DV_FROM_HANDLE(v3dv_event, event, _event); return p_atomic_read(&event->state) ? VK_EVENT_SET : VK_EVENT_RESET; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_SetEvent(VkDevice _device, VkEvent _event) { V3DV_FROM_HANDLE(v3dv_event, event, _event); @@ -2196,7 +2563,7 @@ v3dv_SetEvent(VkDevice _device, VkEvent _event) return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetEvent(VkDevice _device, VkEvent _event) { V3DV_FROM_HANDLE(v3dv_event, event, _event); @@ -2204,101 +2571,7 @@ v3dv_ResetEvent(VkDevice _device, VkEvent _event) return VK_SUCCESS; } -static const enum V3DX(Wrap_Mode) vk_to_v3d_wrap_mode[] = { - [VK_SAMPLER_ADDRESS_MODE_REPEAT] = V3D_WRAP_MODE_REPEAT, - [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = V3D_WRAP_MODE_MIRROR, - [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = V3D_WRAP_MODE_CLAMP, - [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = V3D_WRAP_MODE_MIRROR_ONCE, - [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = V3D_WRAP_MODE_BORDER, -}; - -static const enum V3DX(Compare_Function) -vk_to_v3d_compare_func[] = { - [VK_COMPARE_OP_NEVER] = V3D_COMPARE_FUNC_NEVER, - [VK_COMPARE_OP_LESS] = V3D_COMPARE_FUNC_LESS, - [VK_COMPARE_OP_EQUAL] = V3D_COMPARE_FUNC_EQUAL, - [VK_COMPARE_OP_LESS_OR_EQUAL] = V3D_COMPARE_FUNC_LEQUAL, - [VK_COMPARE_OP_GREATER] = V3D_COMPARE_FUNC_GREATER, - [VK_COMPARE_OP_NOT_EQUAL] = V3D_COMPARE_FUNC_NOTEQUAL, - [VK_COMPARE_OP_GREATER_OR_EQUAL] = V3D_COMPARE_FUNC_GEQUAL, - [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, -}; - -static void -pack_sampler_state(struct v3dv_sampler *sampler, - const VkSamplerCreateInfo *pCreateInfo) -{ - enum V3DX(Border_Color_Mode) border_color_mode; - - /* For now we only support the preset Vulkan border color modes. If we - * want to implement VK_EXT_custom_border_color in the future we would have - * to use V3D_BORDER_COLOR_FOLLOWS, and fill up border_color_word_[0/1/2/3] - * SAMPLER_STATE. 
- */ - switch (pCreateInfo->borderColor) { - case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK: - case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK: - border_color_mode = V3D_BORDER_COLOR_0000; - break; - case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK: - case VK_BORDER_COLOR_INT_OPAQUE_BLACK: - border_color_mode = V3D_BORDER_COLOR_0001; - break; - case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE: - case VK_BORDER_COLOR_INT_OPAQUE_WHITE: - border_color_mode = V3D_BORDER_COLOR_1111; - break; - default: - unreachable("Unknown border color"); - break; - } - - /* For some texture formats, when clamping to transparent black border the - * CTS expects alpha to be set to 1 instead of 0, but the border color mode - * will take priority over the texture state swizzle, so the only way to - * fix that is to apply a swizzle in the shader. Here we keep track of - * whether we are activating that mode and we will decide if we need to - * activate the texture swizzle lowering in the shader key at compile time - * depending on the actual texture format. - */ - if ((pCreateInfo->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || - pCreateInfo->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER) && - border_color_mode == V3D_BORDER_COLOR_0000) { - sampler->clamp_to_transparent_black_border = true; - } - - v3dv_pack(sampler->sampler_state, SAMPLER_STATE, s) { - if (pCreateInfo->anisotropyEnable) { - s.anisotropy_enable = true; - if (pCreateInfo->maxAnisotropy > 8) - s.maximum_anisotropy = 3; - else if (pCreateInfo->maxAnisotropy > 4) - s.maximum_anisotropy = 2; - else if (pCreateInfo->maxAnisotropy > 2) - s.maximum_anisotropy = 1; - } - - s.border_color_mode = border_color_mode; - - s.wrap_i_border = false; /* Also hardcoded on v3d */ - s.wrap_s = vk_to_v3d_wrap_mode[pCreateInfo->addressModeU]; - s.wrap_t = vk_to_v3d_wrap_mode[pCreateInfo->addressModeV]; - s.wrap_r = vk_to_v3d_wrap_mode[pCreateInfo->addressModeW]; - s.fixed_bias = pCreateInfo->mipLodBias; - s.max_level_of_detail = MIN2(MAX2(0, pCreateInfo->maxLod), 15); - s.min_level_of_detail = MIN2(MAX2(0, pCreateInfo->minLod), 15); - s.srgb_disable = 0; /* Not even set by v3d */ - s.depth_compare_function = - vk_to_v3d_compare_func[pCreateInfo->compareEnable ? 
- pCreateInfo->compareOp : VK_COMPARE_OP_NEVER]; - s.mip_filter_nearest = pCreateInfo->mipmapMode == VK_SAMPLER_MIPMAP_MODE_NEAREST; - s.min_filter_nearest = pCreateInfo->minFilter == VK_FILTER_NEAREST; - s.mag_filter_nearest = pCreateInfo->magFilter == VK_FILTER_NEAREST; - } -} - -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSampler(VkDevice _device, const VkSamplerCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2312,18 +2585,23 @@ v3dv_CreateSampler(VkDevice _device, sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler), VK_OBJECT_TYPE_SAMPLER); if (!sampler) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); sampler->compare_enable = pCreateInfo->compareEnable; sampler->unnormalized_coordinates = pCreateInfo->unnormalizedCoordinates; - pack_sampler_state(sampler, pCreateInfo); + + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info = + vk_find_struct_const(pCreateInfo->pNext, + SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT); + + v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroySampler(VkDevice _device, VkSampler _sampler, const VkAllocationCallbacks *pAllocator) @@ -2337,7 +2615,7 @@ v3dv_DestroySampler(VkDevice _device, vk_object_free(&device->vk, pAllocator, sampler); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory memory, VkDeviceSize *pCommittedMemoryInBytes) @@ -2345,17 +2623,17 @@ v3dv_GetDeviceMemoryCommitment(VkDevice device, *pCommittedMemoryInBytes = 0; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetImageSparseMemoryRequirements( - VkDevice device, - VkImage image, - uint32_t *pSparseMemoryRequirementCount, - VkSparseImageMemoryRequirements *pSparseMemoryRequirements) + VkDevice device, + VkImage image, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements *pSparseMemoryRequirements) { *pSparseMemoryRequirementCount = 0; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetImageSparseMemoryRequirements2( VkDevice device, const VkImageSparseMemoryRequirementsInfo2 *pInfo, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c index cefa1418b..6e32d341a 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c @@ -25,300 +25,14 @@ #include "vk_util.h" #include "vk_format_info.h" -#include "broadcom/cle/v3dx_pack.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "vulkan/wsi/wsi_common.h" -#define SWIZ(x,y,z,w) { \ - PIPE_SWIZZLE_##x, \ - PIPE_SWIZZLE_##y, \ - PIPE_SWIZZLE_##z, \ - PIPE_SWIZZLE_##w \ -} - -#define FORMAT(vk, rt, tex, swiz, return_size, supports_filtering) \ - [VK_FORMAT_##vk] = { \ - true, \ - V3D_OUTPUT_IMAGE_FORMAT_##rt, \ - TEXTURE_DATA_FORMAT_##tex, \ - swiz, \ - return_size, \ - supports_filtering, \ - } - -#define SWIZ_X001 SWIZ(X, 0, 0, 1) -#define SWIZ_XY01 SWIZ(X, Y, 0, 1) -#define SWIZ_XYZ1 SWIZ(X, Y, Z, 1) -#define SWIZ_XYZW SWIZ(X, Y, Z, W) -#define SWIZ_YZWX SWIZ(Y, Z, W, X) -#define SWIZ_YZW1 SWIZ(Y, Z, W, 1) -#define SWIZ_ZYXW SWIZ(Z, Y, X, W) -#define SWIZ_ZYX1 SWIZ(Z, Y, X, 1) -#define SWIZ_XXXY SWIZ(X, X, X, Y) -#define SWIZ_XXX1 SWIZ(X, X, X, 1) -#define SWIZ_XXXX SWIZ(X, X, X, X) -#define SWIZ_000X SWIZ(0, 0, 0, X) -#define SWIZ_WXYZ SWIZ(W, X, Y, Z) - -/* FIXME: expand format table to describe whether the format is 
supported - * for buffer surfaces (texel buffers, vertex buffers, etc). - */ -static const struct v3dv_format format_table[] = { - /* Color, 4 channels */ - FORMAT(B8G8R8A8_SRGB, SRGB8_ALPHA8, RGBA8, SWIZ_ZYXW, 16, true), - FORMAT(B8G8R8A8_UNORM, RGBA8, RGBA8, SWIZ_ZYXW, 16, true), - - FORMAT(R8G8B8A8_SRGB, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), - FORMAT(R8G8B8A8_UNORM, RGBA8, RGBA8, SWIZ_XYZW, 16, true), - FORMAT(R8G8B8A8_SNORM, NO, RGBA8_SNORM, SWIZ_XYZW, 16, true), - FORMAT(R8G8B8A8_SINT, RGBA8I, RGBA8I, SWIZ_XYZW, 16, false), - FORMAT(R8G8B8A8_UINT, RGBA8UI, RGBA8UI, SWIZ_XYZW, 16, false), - - FORMAT(R16G16B16A16_SFLOAT, RGBA16F, RGBA16F, SWIZ_XYZW, 16, true), - FORMAT(R16G16B16A16_UNORM, NO, RGBA16, SWIZ_XYZW, 32, true), - FORMAT(R16G16B16A16_SNORM, NO, RGBA16_SNORM, SWIZ_XYZW, 32, true), - FORMAT(R16G16B16A16_SINT, RGBA16I, RGBA16I, SWIZ_XYZW, 16, false), - FORMAT(R16G16B16A16_UINT, RGBA16UI, RGBA16UI, SWIZ_XYZW, 16, false), - - FORMAT(R32G32B32A32_SFLOAT, RGBA32F, RGBA32F, SWIZ_XYZW, 32, false), - FORMAT(R32G32B32A32_SINT, RGBA32I, RGBA32I, SWIZ_XYZW, 32, false), - FORMAT(R32G32B32A32_UINT, RGBA32UI, RGBA32UI, SWIZ_XYZW, 32, false), - - /* Color, 3 channels */ - FORMAT(R32G32B32_SFLOAT, NO, NO, SWIZ_XYZ1, 0, false), - FORMAT(R32G32B32_UINT, NO, NO, SWIZ_XYZ1, 0, false), - FORMAT(R32G32B32_SINT, NO, NO, SWIZ_XYZ1, 0, false), - - /* Color, 2 channels */ - FORMAT(R8G8_UNORM, RG8, RG8, SWIZ_XY01, 16, true), - FORMAT(R8G8_SNORM, NO, RG8_SNORM, SWIZ_XY01, 16, true), - FORMAT(R8G8_SINT, RG8I, RG8I, SWIZ_XY01, 16, false), - FORMAT(R8G8_UINT, RG8UI, RG8UI, SWIZ_XY01, 16, false), - - FORMAT(R16G16_UNORM, NO, RG16, SWIZ_XY01, 32, true), - FORMAT(R16G16_SNORM, NO, RG16_SNORM, SWIZ_XY01, 32, true), - FORMAT(R16G16_SFLOAT, RG16F, RG16F, SWIZ_XY01, 16, true), - FORMAT(R16G16_SINT, RG16I, RG16I, SWIZ_XY01, 16, false), - FORMAT(R16G16_UINT, RG16UI, RG16UI, SWIZ_XY01, 16, false), - - FORMAT(R32G32_SFLOAT, RG32F, RG32F, SWIZ_XY01, 32, false), - FORMAT(R32G32_SINT, RG32I, RG32I, SWIZ_XY01, 32, false), - FORMAT(R32G32_UINT, RG32UI, RG32UI, SWIZ_XY01, 32, false), - - /* Color, 1 channel */ - FORMAT(R8_UNORM, R8, R8, SWIZ_X001, 16, true), - FORMAT(R8_SNORM, NO, R8_SNORM, SWIZ_X001, 16, true), - FORMAT(R8_SINT, R8I, R8I, SWIZ_X001, 16, false), - FORMAT(R8_UINT, R8UI, R8UI, SWIZ_X001, 16, false), - - FORMAT(R16_UNORM, NO, R16, SWIZ_X001, 32, true), - FORMAT(R16_SNORM, NO, R16_SNORM, SWIZ_X001, 32, true), - FORMAT(R16_SFLOAT, R16F, R16F, SWIZ_X001, 16, true), - FORMAT(R16_SINT, R16I, R16I, SWIZ_X001, 16, false), - FORMAT(R16_UINT, R16UI, R16UI, SWIZ_X001, 16, false), - - FORMAT(R32_SFLOAT, R32F, R32F, SWIZ_X001, 32, false), - FORMAT(R32_SINT, R32I, R32I, SWIZ_X001, 32, false), - FORMAT(R32_UINT, R32UI, R32UI, SWIZ_X001, 32, false), - - /* Color, packed */ - FORMAT(B4G4R4A4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_ZYXW, 16, true), /* Swap RB */ - FORMAT(R5G6B5_UNORM_PACK16, BGR565, RGB565, SWIZ_XYZ1, 16, true), - FORMAT(R5G5B5A1_UNORM_PACK16, ABGR1555, RGB5_A1, SWIZ_XYZW, 16, true), - FORMAT(A1R5G5B5_UNORM_PACK16, RGBA5551, A1_RGB5, SWIZ_ZYXW, 16, true), /* Swap RB */ - FORMAT(A8B8G8R8_UNORM_PACK32, RGBA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 UNORM */ - FORMAT(A8B8G8R8_SNORM_PACK32, NO, RGBA8_SNORM, SWIZ_XYZW, 16, true), /* RGBA8 SNORM */ - FORMAT(A8B8G8R8_UINT_PACK32, RGBA8UI, RGBA8UI, SWIZ_XYZW, 16, true), /* RGBA8 UINT */ - FORMAT(A8B8G8R8_SINT_PACK32, RGBA8I, RGBA8I, SWIZ_XYZW, 16, true), /* RGBA8 SINT */ - FORMAT(A8B8G8R8_SRGB_PACK32, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 sRGB */ - 
FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, true), - FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, true), - FORMAT(E5B9G9R9_UFLOAT_PACK32, NO, RGB9_E5, SWIZ_XYZ1, 16, true), - FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true), - - /* Depth */ - FORMAT(D16_UNORM, D16, DEPTH_COMP16, SWIZ_X001, 32, false), - FORMAT(D32_SFLOAT, D32F, DEPTH_COMP32F, SWIZ_X001, 32, false), - FORMAT(X8_D24_UNORM_PACK32, D24S8, DEPTH24_X8, SWIZ_X001, 32, false), - - /* Depth + Stencil */ - FORMAT(D24_UNORM_S8_UINT, D24S8, DEPTH24_X8, SWIZ_X001, 32, false), - - /* Compressed: ETC2 / EAC */ - FORMAT(ETC2_R8G8B8_UNORM_BLOCK, NO, RGB8_ETC2, SWIZ_XYZ1, 16, true), - FORMAT(ETC2_R8G8B8_SRGB_BLOCK, NO, RGB8_ETC2, SWIZ_XYZ1, 16, true), - FORMAT(ETC2_R8G8B8A1_UNORM_BLOCK, NO, RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, true), - FORMAT(ETC2_R8G8B8A1_SRGB_BLOCK, NO, RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, true), - FORMAT(ETC2_R8G8B8A8_UNORM_BLOCK, NO, RGBA8_ETC2_EAC, SWIZ_XYZW, 16, true), - FORMAT(ETC2_R8G8B8A8_SRGB_BLOCK, NO, RGBA8_ETC2_EAC, SWIZ_XYZW, 16, true), - FORMAT(EAC_R11_UNORM_BLOCK, NO, R11_EAC, SWIZ_X001, 16, true), - FORMAT(EAC_R11_SNORM_BLOCK, NO, SIGNED_R11_EAC, SWIZ_X001, 16, true), - FORMAT(EAC_R11G11_UNORM_BLOCK, NO, RG11_EAC, SWIZ_XY01, 16, true), - FORMAT(EAC_R11G11_SNORM_BLOCK, NO, SIGNED_RG11_EAC, SWIZ_XY01, 16, true), - - /* Compressed: BC1-3 */ - FORMAT(BC1_RGB_UNORM_BLOCK, NO, BC1, SWIZ_XYZ1, 16, true), - FORMAT(BC1_RGB_SRGB_BLOCK, NO, BC1, SWIZ_XYZ1, 16, true), - FORMAT(BC1_RGBA_UNORM_BLOCK, NO, BC1, SWIZ_XYZW, 16, true), - FORMAT(BC1_RGBA_SRGB_BLOCK, NO, BC1, SWIZ_XYZW, 16, true), - FORMAT(BC2_UNORM_BLOCK, NO, BC2, SWIZ_XYZW, 16, true), - FORMAT(BC2_SRGB_BLOCK, NO, BC2, SWIZ_XYZW, 16, true), - FORMAT(BC3_UNORM_BLOCK, NO, BC3, SWIZ_XYZW, 16, true), - FORMAT(BC3_SRGB_BLOCK, NO, BC3, SWIZ_XYZW, 16, true), -}; - -const struct v3dv_format * -v3dv_get_format(VkFormat format) -{ - if (format < ARRAY_SIZE(format_table) && format_table[format].supported) - return &format_table[format]; - else - return NULL; -} - -void -v3dv_get_internal_type_bpp_for_output_format(uint32_t format, - uint32_t *type, - uint32_t *bpp) -{ - switch (format) { - case V3D_OUTPUT_IMAGE_FORMAT_RGBA8: - case V3D_OUTPUT_IMAGE_FORMAT_RGB8: - case V3D_OUTPUT_IMAGE_FORMAT_RG8: - case V3D_OUTPUT_IMAGE_FORMAT_R8: - case V3D_OUTPUT_IMAGE_FORMAT_ABGR4444: - case V3D_OUTPUT_IMAGE_FORMAT_BGR565: - case V3D_OUTPUT_IMAGE_FORMAT_ABGR1555: - *type = V3D_INTERNAL_TYPE_8; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA8I: - case V3D_OUTPUT_IMAGE_FORMAT_RG8I: - case V3D_OUTPUT_IMAGE_FORMAT_R8I: - *type = V3D_INTERNAL_TYPE_8I; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI: - case V3D_OUTPUT_IMAGE_FORMAT_RG8UI: - case V3D_OUTPUT_IMAGE_FORMAT_R8UI: - *type = V3D_INTERNAL_TYPE_8UI; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_SRGB8_ALPHA8: - case V3D_OUTPUT_IMAGE_FORMAT_SRGB: - case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2: - case V3D_OUTPUT_IMAGE_FORMAT_R11F_G11F_B10F: - case V3D_OUTPUT_IMAGE_FORMAT_RGBA16F: - /* Note that sRGB RTs are stored in the tile buffer at 16F, - * and the conversion to sRGB happens at tilebuffer load/store. 
- */ - *type = V3D_INTERNAL_TYPE_16F; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG16F: - case V3D_OUTPUT_IMAGE_FORMAT_R16F: - *type = V3D_INTERNAL_TYPE_16F; - /* Use 64bpp to make sure the TLB doesn't throw away the alpha - * channel before alpha test happens. - */ - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA16I: - *type = V3D_INTERNAL_TYPE_16I; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG16I: - case V3D_OUTPUT_IMAGE_FORMAT_R16I: - *type = V3D_INTERNAL_TYPE_16I; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2UI: - case V3D_OUTPUT_IMAGE_FORMAT_RGBA16UI: - *type = V3D_INTERNAL_TYPE_16UI; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG16UI: - case V3D_OUTPUT_IMAGE_FORMAT_R16UI: - *type = V3D_INTERNAL_TYPE_16UI; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA32I: - *type = V3D_INTERNAL_TYPE_32I; - *bpp = V3D_INTERNAL_BPP_128; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG32I: - *type = V3D_INTERNAL_TYPE_32I; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_R32I: - *type = V3D_INTERNAL_TYPE_32I; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA32UI: - *type = V3D_INTERNAL_TYPE_32UI; - *bpp = V3D_INTERNAL_BPP_128; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG32UI: - *type = V3D_INTERNAL_TYPE_32UI; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_R32UI: - *type = V3D_INTERNAL_TYPE_32UI; - *bpp = V3D_INTERNAL_BPP_32; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RGBA32F: - *type = V3D_INTERNAL_TYPE_32F; - *bpp = V3D_INTERNAL_BPP_128; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_RG32F: - *type = V3D_INTERNAL_TYPE_32F; - *bpp = V3D_INTERNAL_BPP_64; - break; - - case V3D_OUTPUT_IMAGE_FORMAT_R32F: - *type = V3D_INTERNAL_TYPE_32F; - *bpp = V3D_INTERNAL_BPP_32; - break; - - default: - /* Provide some default values, as we'll be called at RB - * creation time, even if an RB with this format isn't supported. 
- */ - *type = V3D_INTERNAL_TYPE_8; - *bpp = V3D_INTERNAL_BPP_32; - break; - } -} - -bool -v3dv_format_supports_tlb_resolve(const struct v3dv_format *format) -{ - uint32_t type, bpp; - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, &type, &bpp); - return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F; -} - const uint8_t * -v3dv_get_format_swizzle(VkFormat f) +v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f) { - const struct v3dv_format *vf = v3dv_get_format(f); + const struct v3dv_format *vf = v3dv_X(device, get_format)(f); static const uint8_t fallback[] = {0, 1, 2, 3}; if (!vf) @@ -331,57 +45,18 @@ uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable) { + if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) + return 16; + + if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) + return 32; + if (compare_enable) return 16; return vf->return_size; } -bool -v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo, - uint32_t tex_format) -{ - assert(devinfo->ver >= 42); - - switch (tex_format) { - case TEXTURE_DATA_FORMAT_R8: - case TEXTURE_DATA_FORMAT_R8_SNORM: - case TEXTURE_DATA_FORMAT_RG8: - case TEXTURE_DATA_FORMAT_RG8_SNORM: - case TEXTURE_DATA_FORMAT_RGBA8: - case TEXTURE_DATA_FORMAT_RGBA8_SNORM: - case TEXTURE_DATA_FORMAT_RGB565: - case TEXTURE_DATA_FORMAT_RGBA4: - case TEXTURE_DATA_FORMAT_RGB5_A1: - case TEXTURE_DATA_FORMAT_RGB10_A2: - case TEXTURE_DATA_FORMAT_R16: - case TEXTURE_DATA_FORMAT_R16_SNORM: - case TEXTURE_DATA_FORMAT_RG16: - case TEXTURE_DATA_FORMAT_RG16_SNORM: - case TEXTURE_DATA_FORMAT_RGBA16: - case TEXTURE_DATA_FORMAT_RGBA16_SNORM: - case TEXTURE_DATA_FORMAT_R16F: - case TEXTURE_DATA_FORMAT_RG16F: - case TEXTURE_DATA_FORMAT_RGBA16F: - case TEXTURE_DATA_FORMAT_R11F_G11F_B10F: - case TEXTURE_DATA_FORMAT_R4: - case TEXTURE_DATA_FORMAT_RGB9_E5: - case TEXTURE_DATA_FORMAT_R32F: - case TEXTURE_DATA_FORMAT_RG32F: - case TEXTURE_DATA_FORMAT_RGBA32F: - case TEXTURE_DATA_FORMAT_RGB8_ETC2: - case TEXTURE_DATA_FORMAT_RGB8_PUNCHTHROUGH_ALPHA1: - case TEXTURE_DATA_FORMAT_RGBA8_ETC2_EAC: - case TEXTURE_DATA_FORMAT_R11_EAC: - case TEXTURE_DATA_FORMAT_SIGNED_R11_EAC: - case TEXTURE_DATA_FORMAT_RG11_EAC: - case TEXTURE_DATA_FORMAT_SIGNED_RG11_EAC: - return true; - default: - return false; - } -} - /* Some cases of transfer operations are raw data copies that don't depend * on the semantics of the pixel format (no pixel format conversions are * involved). In these cases, it is safe to choose any format supported by @@ -389,7 +64,7 @@ v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo, * TFU paths with formats that are not TFU supported otherwise. */ const struct v3dv_format * -v3dv_get_compatible_tfu_format(const struct v3d_device_info *devinfo, +v3dv_get_compatible_tfu_format(struct v3dv_device *device, uint32_t bpp, VkFormat *out_vk_format) { @@ -406,32 +81,15 @@ v3dv_get_compatible_tfu_format(const struct v3d_device_info *devinfo, if (out_vk_format) *out_vk_format = vk_format; - const struct v3dv_format *format = v3dv_get_format(vk_format); - assert(v3dv_tfu_supports_tex_format(devinfo, format->tex_type)); + const struct v3dv_format *format = v3dv_X(device, get_format)(vk_format); + assert(v3dv_X(device, tfu_supports_tex_format)(format->tex_type)); return format; } -static bool -format_supports_blending(const struct v3dv_format *format) -{ - /* Hardware blending is only supported on render targets that are configured - * 4x8-bit unorm, 2x16-bit float or 4x16-bit float. 
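
/* Sketch of the return-size policy in v3dv_get_tex_return_size() above,
 * including the newly added debug overrides: a global debug mask can pin
 * every TMU lookup to 16- or 32-bit returns, shadow-compare lookups always
 * use 16 bits, and everything else uses the per-format value from the
 * table. debug_flags and the DBG_* bits are placeholders for V3D_DEBUG
 * and its flags.
 */
#include <stdbool.h>
#include <stdint.h>

enum { DBG_TMU_16BIT = 1u << 0, DBG_TMU_32BIT = 1u << 1 };

static uint32_t debug_flags; /* stands in for the global V3D_DEBUG mask */

static uint8_t
tex_return_size(uint8_t format_return_size, bool compare_enable)
{
   if (debug_flags & DBG_TMU_16BIT)
      return 16;
   if (debug_flags & DBG_TMU_32BIT)
      return 32;
   if (compare_enable)
      return 16; /* depth-compare results always come back as 16-bit */
   return format_return_size;
}
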
- */ - uint32_t type, bpp; - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, &type, &bpp); - switch (type) { - case V3D_INTERNAL_TYPE_8: - return bpp == V3D_INTERNAL_BPP_32; - case V3D_INTERNAL_TYPE_16F: - return bpp == V3D_INTERNAL_BPP_32 || V3D_INTERNAL_BPP_64; - default: - return false; - } -} - static VkFormatFeatureFlags -image_format_features(VkFormat vk_format, +image_format_features(struct v3dv_physical_device *pdevice, + VkFormat vk_format, const struct v3dv_format *v3dv_format, VkImageTiling tiling) { @@ -476,7 +134,7 @@ image_format_features(VkFormat vk_format, if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) { flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - if (format_supports_blending(v3dv_format)) + if (v3dv_X(pdevice, format_supports_blending)(v3dv_format)) flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; } else if (aspects & zs_aspects) { flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT | @@ -556,33 +214,35 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format } bool -v3dv_buffer_format_supports_features(VkFormat vk_format, +v3dv_buffer_format_supports_features(struct v3dv_device *device, + VkFormat vk_format, VkFormatFeatureFlags features) { - const struct v3dv_format *v3dv_format = v3dv_get_format(vk_format); + const struct v3dv_format *v3dv_format = v3dv_X(device, get_format)(vk_format); const VkFormatFeatureFlags supported = buffer_format_features(vk_format, v3dv_format); return (supported & features) == features; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties* pFormatProperties) { - const struct v3dv_format *v3dv_format = v3dv_get_format(format); + V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); + const struct v3dv_format *v3dv_format = v3dv_X(pdevice, get_format)(format); *pFormatProperties = (VkFormatProperties) { .linearTilingFeatures = - image_format_features(format, v3dv_format, VK_IMAGE_TILING_LINEAR), + image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_LINEAR), .optimalTilingFeatures = - image_format_features(format, v3dv_format, VK_IMAGE_TILING_OPTIMAL), + image_format_features(pdevice, format, v3dv_format, VK_IMAGE_TILING_OPTIMAL), .bufferFeatures = buffer_format_features(format, v3dv_format), }; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties2 *pFormatProperties) @@ -600,12 +260,16 @@ v3dv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, vk_outarray_append(&out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = + pFormatProperties->formatProperties.linearTilingFeatures; } } if (pFormatProperties->formatProperties.optimalTilingFeatures) { vk_outarray_append(&out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_BROADCOM_UIF; mod_props->drmFormatModifierPlaneCount = 1; + mod_props->drmFormatModifierTilingFeatures = + pFormatProperties->formatProperties.optimalTilingFeatures; } } break; @@ -625,12 +289,24 @@ get_image_format_properties( VkImageFormatProperties *pImageFormatProperties, VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties) { - const struct v3dv_format *v3dv_format = v3dv_get_format(info->format); + const struct v3dv_format *v3dv_format = v3dv_X(physical_device, 
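
/* The removed format_supports_blending() classifies render targets by
 * their internal TLB layout: hardware blending only exists for 4x8-bit
 * unorm and 2x/4x16-bit float configurations. Note the original 16F case
 * reads "bpp == V3D_INTERNAL_BPP_32 || V3D_INTERNAL_BPP_64", i.e. the
 * second operand lost its "bpp =="; the sketch below spells out what the
 * comment says is intended. Enum values are illustrative.
 */
#include <stdbool.h>

enum internal_type { TYPE_8, TYPE_16F, TYPE_32F };
enum internal_bpp  { BPP_32, BPP_64, BPP_128 };

static bool
supports_hw_blending(enum internal_type type, enum internal_bpp bpp)
{
   switch (type) {
   case TYPE_8:
      return bpp == BPP_32;                  /* 4x 8-bit unorm */
   case TYPE_16F:
      return bpp == BPP_32 || bpp == BPP_64; /* 2x/4x 16-bit float */
   default:
      return false;
   }
}
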
get_format)(info->format); VkFormatFeatureFlags format_feature_flags = - image_format_features(info->format, v3dv_format, tiling); + image_format_features(physical_device, info->format, v3dv_format, tiling); if (!format_feature_flags) goto unsupported; + /* This allows users to create uncompressed views of compressed images, + * however this is not something the hardware supports naturally and requires + * the driver to lie when programming the texture state to make the hardware + * sample with the uncompressed view correctly, and even then, there are + * issues when running on real hardware. + * + * See https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11336 + * for details. + */ + if (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) + goto unsupported; + if (info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT)) { goto unsupported; @@ -775,7 +451,7 @@ static const VkExternalMemoryProperties prime_fd_props = { VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, }; -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetPhysicalDeviceImageFormatProperties( VkPhysicalDevice physicalDevice, VkFormat format, @@ -801,7 +477,7 @@ v3dv_GetPhysicalDeviceImageFormatProperties( pImageFormatProperties, NULL); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceImageFormatInfo2 *base_info, VkImageFormatProperties2 *base_props) @@ -875,7 +551,7 @@ done: return result; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceSparseImageFormatProperties( VkPhysicalDevice physicalDevice, VkFormat format, @@ -889,7 +565,7 @@ v3dv_GetPhysicalDeviceSparseImageFormatProperties( *pPropertyCount = 0; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceSparseImageFormatProperties2( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo, @@ -899,7 +575,7 @@ v3dv_GetPhysicalDeviceSparseImageFormatProperties2( *pPropertyCount = 0; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_GetPhysicalDeviceExternalBufferProperties( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_image.c b/lib/mesa/src/broadcom/vulkan/v3dv_image.c index 2935d7e8b..d03814d98 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_image.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_image.c @@ -23,7 +23,6 @@ #include "v3dv_private.h" -#include "broadcom/cle/v3dx_pack.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "util/u_math.h" @@ -77,9 +76,9 @@ v3d_setup_slices(struct v3dv_image *image) { assert(image->cpp > 0); - uint32_t width = image->extent.width; - uint32_t height = image->extent.height; - uint32_t depth = image->extent.depth; + uint32_t width = image->vk.extent.width; + uint32_t height = image->vk.extent.height; + uint32_t depth = image->vk.extent.depth; /* Note that power-of-two padding is based on level 1. 
These are not * equivalent to just util_next_power_of_two(dimension), because at a @@ -95,21 +94,21 @@ v3d_setup_slices(struct v3dv_image *image) uint32_t uif_block_w = utile_w * 2; uint32_t uif_block_h = utile_h * 2; - uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - uint32_t block_height = vk_format_get_blockheight(image->vk_format); + uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + uint32_t block_height = vk_format_get_blockheight(image->vk.format); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT || - image->samples == VK_SAMPLE_COUNT_4_BIT); - bool msaa = image->samples != VK_SAMPLE_COUNT_1_BIT; + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || + image->vk.samples == VK_SAMPLE_COUNT_4_BIT); + bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; bool uif_top = msaa; - assert(image->array_size > 0); + assert(image->vk.array_layers > 0); assert(depth > 0); - assert(image->levels >= 1); + assert(image->vk.mip_levels >= 1); uint32_t offset = 0; - for (int32_t i = image->levels - 1; i >= 0; i--) { + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { struct v3d_resource_slice *slice = &image->slices[i]; uint32_t level_width, level_height, level_depth; @@ -135,21 +134,21 @@ v3d_setup_slices(struct v3dv_image *image) level_height = DIV_ROUND_UP(level_height, block_height); if (!image->tiled) { - slice->tiling = VC5_TILING_RASTER; - if (image->type == VK_IMAGE_TYPE_1D) + slice->tiling = V3D_TILING_RASTER; + if (image->vk.image_type == VK_IMAGE_TYPE_1D) level_width = align(level_width, 64 / image->cpp); } else { if ((i != 0 || !uif_top) && (level_width <= utile_w || level_height <= utile_h)) { - slice->tiling = VC5_TILING_LINEARTILE; + slice->tiling = V3D_TILING_LINEARTILE; level_width = align(level_width, utile_w); level_height = align(level_height, utile_h); } else if ((i != 0 || !uif_top) && level_width <= uif_block_w) { - slice->tiling = VC5_TILING_UBLINEAR_1_COLUMN; + slice->tiling = V3D_TILING_UBLINEAR_1_COLUMN; level_width = align(level_width, uif_block_w); level_height = align(level_height, uif_block_h); } else if ((i != 0 || !uif_top) && level_width <= 2 * uif_block_w) { - slice->tiling = VC5_TILING_UBLINEAR_2_COLUMN; + slice->tiling = V3D_TILING_UBLINEAR_2_COLUMN; level_width = align(level_width, 2 * uif_block_w); level_height = align(level_height, uif_block_h); } else { @@ -167,10 +166,10 @@ v3d_setup_slices(struct v3dv_image *image) * perfectly misaligned. 
*/ if ((level_height / uif_block_h) % - (VC5_PAGE_CACHE_SIZE / VC5_UIFBLOCK_ROW_SIZE) == 0) { - slice->tiling = VC5_TILING_UIF_XOR; + (V3D_PAGE_CACHE_SIZE / V3D_UIFBLOCK_ROW_SIZE) == 0) { + slice->tiling = V3D_TILING_UIF_XOR; } else { - slice->tiling = VC5_TILING_UIF_NO_XOR; + slice->tiling = V3D_TILING_UIF_NO_XOR; } } } @@ -178,8 +177,8 @@ v3d_setup_slices(struct v3dv_image *image) slice->offset = offset; slice->stride = level_width * image->cpp; slice->padded_height = level_height; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { slice->padded_height_of_output_image_in_uif_blocks = slice->padded_height / (2 * v3d_utile_height(image->cpp)); } @@ -195,7 +194,7 @@ v3d_setup_slices(struct v3dv_image *image) if (i == 1 && level_width > 4 * uif_block_w && level_height > PAGE_CACHE_MINUS_1_5_UB_ROWS * uif_block_h) { - slice_total_size = align(slice_total_size, VC5_UIFCFG_PAGE_SIZE); + slice_total_size = align(slice_total_size, V3D_UIFCFG_PAGE_SIZE); } offset += slice_total_size; @@ -211,13 +210,12 @@ v3d_setup_slices(struct v3dv_image *image) * * We additionally align to 4k, which improves UIF XOR performance. */ - image->alignment = - image->tiling == VK_IMAGE_TILING_LINEAR ? image->cpp : 4096; + image->alignment = image->tiled ? 4096 : image->cpp; uint32_t align_offset = align(image->slices[0].offset, image->alignment) - image->slices[0].offset; if (align_offset) { image->size += align_offset; - for (int i = 0; i < image->levels; i++) + for (int i = 0; i < image->vk.mip_levels; i++) image->slices[i].offset += align_offset; } @@ -225,10 +223,10 @@ v3d_setup_slices(struct v3dv_image *image) * one full mipmap tree to the next (64b aligned). For 3D textures, * we need to program the stride between slices of miplevel 0. 
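
/* Condensed sketch of the tiling-mode decision above: mip levels other
 * than a UIF-forced level 0 fall back to linear-tile or 1/2-column
 * UIF-linear layouts depending on how the level width compares to the
 * microtile and UIF-block widths, and full UIF levels enable the XOR
 * address swizzle exactly when rows of UIF blocks would alias the same
 * page-cache bank (height in UIF blocks divisible by page-cache rows).
 * Constants below are the values used in this file.
 */
#include <stdbool.h>
#include <stdint.h>

enum tiling { RASTER, LINEARTILE, UBLINEAR_1, UBLINEAR_2, UIF_NO_XOR, UIF_XOR };

#define PAGE_CACHE_SIZE   (4096 * 8)   /* V3D_UIFCFG_PAGE_SIZE * 8 banks */
#define UIFBLOCK_ROW_SIZE (4 * 4 * 64) /* 4 UIF blocks of 4 utiles of 64B */

static enum tiling
pick_tiling(bool uif_top, bool is_level0,
            uint32_t w, uint32_t h,         /* level size */
            uint32_t utile_w, uint32_t utile_h,
            uint32_t uif_w, uint32_t uif_h) /* UIF block = 2x2 utiles */
{
   bool small_ok = !is_level0 || !uif_top;

   if (small_ok && (w <= utile_w || h <= utile_h))
      return LINEARTILE;
   if (small_ok && w <= uif_w)
      return UBLINEAR_1;
   if (small_ok && w <= 2 * uif_w)
      return UBLINEAR_2;

   /* Full UIF: XOR banking only helps when block rows alias perfectly. */
   if ((h / uif_h) % (PAGE_CACHE_SIZE / UIFBLOCK_ROW_SIZE) == 0)
      return UIF_XOR;
   return UIF_NO_XOR;
}
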
*/ - if (image->type != VK_IMAGE_TYPE_3D) { + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { image->cube_map_stride = align(image->slices[0].offset + image->slices[0].size, 64); - image->size += image->cube_map_stride * (image->array_size - 1); + image->size += image->cube_map_stride * (image->vk.array_layers - 1); } else { image->cube_map_stride = image->slices[0].size; } @@ -239,29 +237,23 @@ v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer { const struct v3d_resource_slice *slice = &image->slices[level]; - if (image->type == VK_IMAGE_TYPE_3D) + if (image->vk.image_type == VK_IMAGE_TYPE_3D) return image->mem_offset + slice->offset + layer * slice->size; else return image->mem_offset + slice->offset + layer * image->cube_map_stride; } -VkResult -v3dv_CreateImage(VkDevice _device, - const VkImageCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkImage *pImage) +static VkResult +create_image(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_image *image = NULL; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); - - v3dv_assert(pCreateInfo->mipLevels > 0); - v3dv_assert(pCreateInfo->arrayLayers > 0); - v3dv_assert(pCreateInfo->samples > 0); - v3dv_assert(pCreateInfo->extent.width > 0); - v3dv_assert(pCreateInfo->extent.height > 0); - v3dv_assert(pCreateInfo->extent.depth > 0); + image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); + if (image == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* When using the simulator the WSI common code will see that our * driver wsi device doesn't match the display device and because of that @@ -272,68 +264,60 @@ v3dv_CreateImage(VkDevice _device, * As a result, on that path, swapchain images do not have any special * requirements and are not created with the pNext structs below. 
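
/* Sketch of v3dv_layer_offset() above: 3D images step between depth
 * slices of the selected mip level using the per-slice size, while
 * 1D/2D array (and cube) images step between whole mip chains using the
 * 64-byte-aligned cube_map_stride.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t
layer_offset(uint64_t mem_offset, uint64_t slice_offset, uint64_t slice_size,
             uint64_t cube_map_stride, bool is_3d, uint32_t layer)
{
   if (is_3d)
      return mem_offset + slice_offset + (uint64_t)layer * slice_size;
   return mem_offset + slice_offset + (uint64_t)layer * cube_map_stride;
}
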
*/ + VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; - if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - assert(mod_info); - for (uint32_t i = 0; i < mod_info->drmFormatModifierCount; i++) { - switch (mod_info->pDrmFormatModifiers[i]) { - case DRM_FORMAT_MOD_LINEAR: - if (modifier == DRM_FORMAT_MOD_INVALID) - modifier = DRM_FORMAT_MOD_LINEAR; - break; - case DRM_FORMAT_MOD_BROADCOM_UIF: - modifier = DRM_FORMAT_MOD_BROADCOM_UIF; - break; + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + assert(mod_info || explicit_mod_info); + + if (mod_info) { + for (uint32_t i = 0; i < mod_info->drmFormatModifierCount; i++) { + switch (mod_info->pDrmFormatModifiers[i]) { + case DRM_FORMAT_MOD_LINEAR: + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = DRM_FORMAT_MOD_LINEAR; + break; + case DRM_FORMAT_MOD_BROADCOM_UIF: + modifier = DRM_FORMAT_MOD_BROADCOM_UIF; + break; + } } + } else { + modifier = explicit_mod_info->drmFormatModifier; } - } else { - const struct wsi_image_create_info *wsi_info = - vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA); - if (wsi_info) - modifier = DRM_FORMAT_MOD_LINEAR; - } - - /* 1D and 1D_ARRAY textures are always raster-order */ - VkImageTiling tiling; - if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D) - tiling = VK_IMAGE_TILING_LINEAR; - else if (modifier == DRM_FORMAT_MOD_INVALID) - tiling = pCreateInfo->tiling; - else if (modifier == DRM_FORMAT_MOD_BROADCOM_UIF) - tiling = VK_IMAGE_TILING_OPTIMAL; - else + assert(modifier == DRM_FORMAT_MOD_LINEAR || + modifier == DRM_FORMAT_MOD_BROADCOM_UIF); + } else if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D || + image->vk.wsi_legacy_scanout) { tiling = VK_IMAGE_TILING_LINEAR; + } - const struct v3dv_format *format = v3dv_get_format(pCreateInfo->format); + const struct v3dv_format *format = + v3dv_X(device, get_format)(pCreateInfo->format); v3dv_assert(format != NULL && format->supported); - image = vk_object_zalloc(&device->vk, pAllocator, sizeof(*image), - VK_OBJECT_TYPE_IMAGE); - if (!image) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); - image->type = pCreateInfo->imageType; - image->extent = pCreateInfo->extent; - image->vk_format = pCreateInfo->format; image->format = format; - image->aspects = vk_format_aspects(image->vk_format); - image->levels = pCreateInfo->mipLevels; - image->array_size = pCreateInfo->arrayLayers; - image->samples = pCreateInfo->samples; - image->usage = pCreateInfo->usage; - image->flags = pCreateInfo->flags; + image->cpp = vk_format_get_blocksize(image->vk.format); + image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || + (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + modifier != DRM_FORMAT_MOD_LINEAR); - image->drm_format_mod = modifier; - image->tiling = tiling; - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL; + image->vk.tiling = tiling; + image->vk.drm_format_mod = modifier; - image->cpp = vk_format_get_blocksize(image->vk_format); + /* Our meta paths can create image views with compatible formats for any + * image, so always set this flag to keep the 
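
/* Sketch of the modifier negotiation added above: when a modifier *list*
 * is supplied the driver prefers DRM_FORMAT_MOD_BROADCOM_UIF whenever it
 * is offered, regardless of list order, and takes DRM_FORMAT_MOD_LINEAR
 * only as a fallback, while the *explicit* create info pins the modifier
 * directly. The modifier codes are passed in rather than redefined here,
 * since their values come from drm_fourcc.h.
 */
#include <stdint.h>

static uint64_t
choose_modifier(const uint64_t *mods, uint32_t count,
                uint64_t mod_linear, uint64_t mod_uif, uint64_t mod_invalid)
{
   uint64_t chosen = mod_invalid;
   for (uint32_t i = 0; i < count; i++) {
      if (mods[i] == mod_uif)
         chosen = mod_uif;            /* UIF always wins */
      else if (mods[i] == mod_linear && chosen == mod_invalid)
         chosen = mod_linear;         /* LINEAR only if nothing chosen yet */
   }
   return chosen;
}
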
common Vulkan image code + * happy. + */ + image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; v3d_setup_slices(image); @@ -342,7 +326,71 @@ v3dv_CreateImage(VkDevice _device, return VK_SUCCESS; } -void +static VkResult +create_image_from_swapchain(struct v3dv_device *device, + const VkImageCreateInfo *pCreateInfo, + const VkImageSwapchainCreateInfoKHR *swapchain_info, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) +{ + struct v3dv_image *swapchain_image = + v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, 0); + assert(swapchain_image); + + VkImageCreateInfo local_create_info = *pCreateInfo; + local_create_info.pNext = NULL; + + /* Added by wsi code. */ + local_create_info.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + /* The spec requires TILING_OPTIMAL as input, but the swapchain image may + * privately use a different tiling. See spec anchor + * #swapchain-wsi-image-create-info . + */ + assert(local_create_info.tiling == VK_IMAGE_TILING_OPTIMAL); + local_create_info.tiling = swapchain_image->vk.tiling; + + VkImageDrmFormatModifierListCreateInfoEXT local_modifier_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, + .drmFormatModifierCount = 1, + .pDrmFormatModifiers = &swapchain_image->vk.drm_format_mod, + }; + + if (swapchain_image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID) + __vk_append_struct(&local_create_info, &local_modifier_info); + + assert(swapchain_image->vk.image_type == local_create_info.imageType); + assert(swapchain_image->vk.format == local_create_info.format); + assert(swapchain_image->vk.extent.width == local_create_info.extent.width); + assert(swapchain_image->vk.extent.height == local_create_info.extent.height); + assert(swapchain_image->vk.extent.depth == local_create_info.extent.depth); + assert(swapchain_image->vk.array_layers == local_create_info.arrayLayers); + assert(swapchain_image->vk.samples == local_create_info.samples); + assert(swapchain_image->vk.tiling == local_create_info.tiling); + assert((swapchain_image->vk.usage & local_create_info.usage) == + local_create_info.usage); + + return create_image(device, &local_create_info, pAllocator, pImage); +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateImage(VkDevice _device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) + return create_image_from_swapchain(device, pCreateInfo, swapchain_info, + pAllocator, pImage); + + return create_image(device, pCreateInfo, pAllocator, pImage); +} + +VKAPI_ATTR void VKAPI_CALL v3dv_GetImageSubresourceLayout(VkDevice device, VkImage _image, const VkImageSubresource *subresource, @@ -358,7 +406,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, layout->depthPitch = image->cube_map_stride; layout->arrayPitch = image->cube_map_stride; - if (image->type != VK_IMAGE_TYPE_3D) { + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; } else { /* For 3D images, the size of the slice represents the size of a 2D slice @@ -368,7 +416,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, * arranged in memory from last to first). 
*/ if (subresource->mipLevel == 0) { - layout->size = slice->size * image->extent.depth; + layout->size = slice->size * image->vk.extent.depth; } else { const struct v3d_resource_slice *prev_slice = &image->slices[subresource->mipLevel - 1]; @@ -377,23 +425,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, } } -VkResult -v3dv_GetImageDrmFormatModifierPropertiesEXT( - VkDevice device, - VkImage _image, - VkImageDrmFormatModifierPropertiesEXT *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_image, image, _image); - - assert(pProperties->sType == - VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT); - - pProperties->drmFormatModifier = image->drm_format_mod; - - return VK_SUCCESS; -} - -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyImage(VkDevice _device, VkImage _image, const VkAllocationCallbacks* pAllocator) @@ -404,7 +436,7 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; - vk_object_free(&device->vk, pAllocator, image); + vk_image_destroy(&device->vk, pAllocator, &image->vk); } VkImageViewType @@ -419,138 +451,10 @@ v3dv_image_type_to_view_type(VkImageType type) } } -/* - * This method translates pipe_swizzle to the swizzle values used at the - * packet TEXTURE_SHADER_STATE - * - * FIXME: C&P from v3d, common place? - */ -static uint32_t -translate_swizzle(unsigned char pipe_swizzle) -{ - switch (pipe_swizzle) { - case PIPE_SWIZZLE_0: - return 0; - case PIPE_SWIZZLE_1: - return 1; - case PIPE_SWIZZLE_X: - case PIPE_SWIZZLE_Y: - case PIPE_SWIZZLE_Z: - case PIPE_SWIZZLE_W: - return 2 + pipe_swizzle; - default: - unreachable("unknown swizzle"); - } -} - -/* - * Packs and ensure bo for the shader state (the latter can be temporal). - */ -static void -pack_texture_shader_state_helper(struct v3dv_device *device, - struct v3dv_image_view *image_view, - bool for_cube_map_array_storage) -{ - assert(!for_cube_map_array_storage || - image_view->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY); - const uint32_t index = for_cube_map_array_storage ? 1 : 0; - - assert(image_view->image); - const struct v3dv_image *image = image_view->image; - - assert(image->samples == VK_SAMPLE_COUNT_1_BIT || - image->samples == VK_SAMPLE_COUNT_4_BIT); - const uint32_t msaa_scale = image->samples == VK_SAMPLE_COUNT_1_BIT ? 
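
/* Sketch of the 3D-image branch of GetImageSubresourceLayout above. A
 * slice records the size of a single 2D depth slice, and levels are laid
 * out in memory from last (smallest) to first, so the base level's full
 * size is slice_size * depth, while a non-base level can be bounded by
 * the offset of the previous (larger, later-in-memory) level. The exact
 * non-base expression falls outside this hunk, so the else branch here
 * is an assumption based on the surrounding comment.
 */
#include <stdint.h>

static uint64_t
level_3d_size(uint32_t level, uint64_t slice_size, uint32_t depth,
              uint64_t this_offset, uint64_t prev_level_offset)
{
   if (level == 0)
      return slice_size * depth;
   /* levels are stored last-to-first, so prev_level_offset > this_offset */
   return prev_level_offset - this_offset;
}
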
1 : 2; - - v3dv_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { - - tex.level_0_is_strictly_uif = - (image->slices[0].tiling == VC5_TILING_UIF_XOR || - image->slices[0].tiling == VC5_TILING_UIF_NO_XOR); - - tex.level_0_xor_enable = (image->slices[0].tiling == VC5_TILING_UIF_XOR); - - if (tex.level_0_is_strictly_uif) - tex.level_0_ub_pad = image->slices[0].ub_pad; - - /* FIXME: v3d never sets uif_xor_disable, but uses it on the following - * check so let's set the default value - */ - tex.uif_xor_disable = false; - if (tex.uif_xor_disable || - tex.level_0_is_strictly_uif) { - tex.extended = true; - } - - tex.base_level = image_view->base_level; - tex.max_level = image_view->max_level; - - tex.swizzle_r = translate_swizzle(image_view->swizzle[0]); - tex.swizzle_g = translate_swizzle(image_view->swizzle[1]); - tex.swizzle_b = translate_swizzle(image_view->swizzle[2]); - tex.swizzle_a = translate_swizzle(image_view->swizzle[3]); - - tex.texture_type = image_view->format->tex_type; - - if (image->type == VK_IMAGE_TYPE_3D) { - tex.image_depth = image->extent.depth; - } else { - tex.image_depth = (image_view->last_layer - image_view->first_layer) + 1; - } - - /* Empirical testing with CTS shows that when we are sampling from cube - * arrays we want to set image depth to layers / 6, but not when doing - * image load/store. - */ - if (image_view->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && - !for_cube_map_array_storage) { - assert(tex.image_depth % 6 == 0); - tex.image_depth /= 6; - } - - tex.image_height = image->extent.height * msaa_scale; - tex.image_width = image->extent.width * msaa_scale; - - /* On 4.x, the height of a 1D texture is redefined to be the - * upper 14 bits of the width (which is only usable with txf). - */ - if (image->type == VK_IMAGE_TYPE_1D) { - tex.image_height = tex.image_width >> 14; - } - tex.image_width &= (1 << 14) - 1; - tex.image_height &= (1 << 14) - 1; - - tex.array_stride_64_byte_aligned = image->cube_map_stride / 64; - - tex.srgb = vk_format_is_srgb(image_view->vk_format); - - /* At this point we don't have the job. That's the reason the first - * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to - * add the bo to the job. This also means that we need to add manually - * the image bo to the job using the texture. 
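
/* The 1D/texel-buffer sizing trick used in the packing code above: the
 * hardware's 14-bit width and height fields are combined into one 28-bit
 * element count by storing the low 14 bits in "width" and the next 14
 * bits in "height" (a size only usable through txf-style fetches).
 */
#include <stdint.h>

static void
split_28bit_size(uint32_t num_elements, uint32_t *width, uint32_t *height)
{
   *width  = num_elements & ((1u << 14) - 1);
   *height = (num_elements >> 14) & ((1u << 14) - 1);
}
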
- */ - const uint32_t base_offset = - image->mem->bo->offset + - v3dv_layer_offset(image, 0, image_view->first_layer); - tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); - } -} - -static void -pack_texture_shader_state(struct v3dv_device *device, - struct v3dv_image_view *iview) -{ - pack_texture_shader_state_helper(device, iview, false); - if (iview->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) - pack_texture_shader_state_helper(device, iview, true); -} - static enum pipe_swizzle -vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle comp, - VkComponentSwizzle swz) +vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle swz) { - if (swz == VK_COMPONENT_SWIZZLE_IDENTITY) - swz = comp; + assert(swz != VK_COMPONENT_SWIZZLE_IDENTITY); switch (swz) { case VK_COMPONENT_SWIZZLE_ZERO: @@ -570,7 +474,7 @@ vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle comp, }; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -580,56 +484,15 @@ v3dv_CreateImageView(VkDevice _device, V3DV_FROM_HANDLE(v3dv_image, image, pCreateInfo->image); struct v3dv_image_view *iview; - iview = vk_object_zalloc(&device->vk, pAllocator, sizeof(*iview), - VK_OBJECT_TYPE_IMAGE_VIEW); + iview = vk_image_view_create(&device->vk, pCreateInfo, pAllocator, + sizeof(*iview)); if (iview == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - assert(range->layerCount > 0); - assert(range->baseMipLevel < image->levels); - -#ifdef DEBUG - switch (image->type) { - case VK_IMAGE_TYPE_1D: - case VK_IMAGE_TYPE_2D: - assert(range->baseArrayLayer + v3dv_layer_count(image, range) - 1 <= - image->array_size); - break; - case VK_IMAGE_TYPE_3D: - assert(range->baseArrayLayer + v3dv_layer_count(image, range) - 1 - <= u_minify(image->extent.depth, range->baseMipLevel)); - /* VK_KHR_maintenance1 */ - assert(pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_2D || - ((image->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && - range->levelCount == 1 && range->layerCount == 1)); - assert(pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_2D_ARRAY || - ((image->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && - range->levelCount == 1)); - break; - default: - unreachable("bad VkImageType"); - } -#endif - - iview->image = image; - iview->aspects = range->aspectMask; - iview->type = pCreateInfo->viewType; - - iview->base_level = range->baseMipLevel; - iview->max_level = iview->base_level + v3dv_level_count(image, range) - 1; - iview->extent = (VkExtent3D) { - .width = u_minify(image->extent.width , iview->base_level), - .height = u_minify(image->extent.height, iview->base_level), - .depth = u_minify(image->extent.depth , iview->base_level), - }; - - iview->first_layer = range->baseArrayLayer; - iview->last_layer = range->baseArrayLayer + - v3dv_layer_count(image, range) - 1; - iview->offset = - v3dv_layer_offset(image, iview->base_level, iview->first_layer); + iview->offset = v3dv_layer_offset(image, iview->vk.base_mip_level, + iview->vk.base_array_layer); /* If we have D24S8 format but the view only selects the stencil aspect * we want to re-interpret the format as RGBA8_UINT, then map our stencil @@ -653,44 +516,40 @@ v3dv_CreateImageView(VkDevice _device, * better to reimplement the latter using vk component */ image_view_swizzle[0] = - 
vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_R, - pCreateInfo->components.r); + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.r); image_view_swizzle[1] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_G, - pCreateInfo->components.g); + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.g); image_view_swizzle[2] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_B, - pCreateInfo->components.b); + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.b); image_view_swizzle[3] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_A, - pCreateInfo->components.a); + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle.a); } - iview->vk_format = format; - iview->format = v3dv_get_format(format); + iview->vk.format = format; + iview->format = v3dv_X(device, get_format)(format); assert(iview->format && iview->format->supported); - if (vk_format_is_depth_or_stencil(iview->vk_format)) { - iview->internal_type = v3dv_get_internal_depth_type(iview->vk_format); + if (vk_format_is_depth_or_stencil(iview->vk.format)) { + iview->internal_type = + v3dv_X(device, get_internal_depth_type)(iview->vk.format); } else { - v3dv_get_internal_type_bpp_for_output_format(iview->format->rt_type, - &iview->internal_type, - &iview->internal_bpp); + v3dv_X(device, get_internal_type_bpp_for_output_format) + (iview->format->rt_type, &iview->internal_type, &iview->internal_bpp); } - const uint8_t *format_swizzle = v3dv_get_format_swizzle(format); + const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format); util_format_compose_swizzles(format_swizzle, image_view_swizzle, iview->swizzle); iview->swap_rb = iview->swizzle[0] == PIPE_SWIZZLE_Z; - pack_texture_shader_state(device, iview); + v3dv_X(device, pack_texture_shader_state)(device, iview); *pView = v3dv_image_view_to_handle(iview); return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyImageView(VkDevice _device, VkImageView imageView, const VkAllocationCallbacks* pAllocator) @@ -701,52 +560,10 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; - vk_object_free(&device->vk, pAllocator, image_view); -} - -static void -pack_texture_shader_state_from_buffer_view(struct v3dv_device *device, - struct v3dv_buffer_view *buffer_view) -{ - assert(buffer_view->buffer); - const struct v3dv_buffer *buffer = buffer_view->buffer; - - v3dv_pack(buffer_view->texture_shader_state, TEXTURE_SHADER_STATE, tex) { - tex.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X); - tex.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y); - tex.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z); - tex.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W); - - tex.image_depth = 1; - - /* On 4.x, the height of a 1D texture is redefined to be the upper 14 - * bits of the width (which is only usable with txf) (or in other words, - * we are providing a 28 bit field for size, but split on the usual - * 14bit height/width). - */ - tex.image_width = buffer_view->num_elements; - tex.image_height = tex.image_width >> 14; - tex.image_width &= (1 << 14) - 1; - tex.image_height &= (1 << 14) - 1; - - tex.texture_type = buffer_view->format->tex_type; - tex.srgb = vk_format_is_srgb(buffer_view->vk_format); - - /* At this point we don't have the job. That's the reason the first - * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to - * add the bo to the job. This also means that we need to add manually - * the image bo to the job using the texture. 
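
/* Sketch of the swizzle plumbing above: with the vk_image_view common
 * code now resolving IDENTITY up front, the view's component mapping is
 * translated to pipe swizzles and composed with the format's own channel
 * order; if the composed red channel ends up sourcing Z, the view needs
 * the red/blue swap. compose() mirrors util_format_compose_swizzles().
 */
#include <stdbool.h>
#include <stdint.h>

enum swz { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

static void
compose(const uint8_t fmt[4], const uint8_t view[4], uint8_t out[4])
{
   for (int i = 0; i < 4; i++)
      out[i] = view[i] <= SWZ_W ? fmt[view[i]] : view[i];
}

static bool
needs_rb_swap(const uint8_t composed[4])
{
   return composed[0] == SWZ_Z; /* red channel reads from blue */
}
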
- */ - const uint32_t base_offset = - buffer->mem->bo->offset + - buffer->mem_offset + - buffer_view->offset; - - tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); - } + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -754,14 +571,14 @@ v3dv_CreateBufferView(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); - const struct v3dv_buffer *buffer = + struct v3dv_buffer *buffer = v3dv_buffer_from_handle(pCreateInfo->buffer); struct v3dv_buffer_view *view = vk_object_zalloc(&device->vk, pAllocator, sizeof(*view), VK_OBJECT_TYPE_BUFFER_VIEW); if (!view) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); uint32_t range; if (pCreateInfo->range == VK_WHOLE_SIZE) @@ -777,22 +594,21 @@ v3dv_CreateBufferView(VkDevice _device, view->size = view->offset + range; view->num_elements = num_elements; view->vk_format = pCreateInfo->format; - view->format = v3dv_get_format(view->vk_format); + view->format = v3dv_X(device, get_format)(view->vk_format); - v3dv_get_internal_type_bpp_for_output_format(view->format->rt_type, - &view->internal_type, - &view->internal_bpp); + v3dv_X(device, get_internal_type_bpp_for_output_format) + (view->format->rt_type, &view->internal_type, &view->internal_bpp); if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT || buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) - pack_texture_shader_state_from_buffer_view(device, view); + v3dv_X(device, pack_texture_shader_state_from_buffer_view)(device, view); *pView = v3dv_buffer_view_to_handle(view); return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, const VkAllocationCallbacks *pAllocator) diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h index a5ddb66e4..aaab1ce03 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h @@ -44,7 +44,7 @@ #define MAX_INPUT_ATTACHMENTS 4 #define MAX_UNIFORM_BUFFERS 12 -#define MAX_STORAGE_BUFFERS 4 +#define MAX_STORAGE_BUFFERS 8 #define MAX_DYNAMIC_UNIFORM_BUFFERS 8 #define MAX_DYNAMIC_STORAGE_BUFFERS 4 @@ -53,21 +53,22 @@ #define MAX_RENDER_TARGETS 4 +#define MAX_MULTIVIEW_VIEW_COUNT 16 + /* These are tunable parameters in the HW design, but all the V3D * implementations agree. 
*/ -#define VC5_UIFCFG_BANKS 8 -#define VC5_UIFCFG_PAGE_SIZE 4096 -#define VC5_UIFCFG_XOR_VALUE (1 << 4) -#define VC5_PAGE_CACHE_SIZE (VC5_UIFCFG_PAGE_SIZE * VC5_UIFCFG_BANKS) -#define VC5_UBLOCK_SIZE 64 -#define VC5_UIFBLOCK_SIZE (4 * VC5_UBLOCK_SIZE) -#define VC5_UIFBLOCK_ROW_SIZE (4 * VC5_UIFBLOCK_SIZE) +#define V3D_UIFCFG_BANKS 8 +#define V3D_UIFCFG_PAGE_SIZE 4096 +#define V3D_UIFCFG_XOR_VALUE (1 << 4) +#define V3D_PAGE_CACHE_SIZE (V3D_UIFCFG_PAGE_SIZE * V3D_UIFCFG_BANKS) +#define V3D_UBLOCK_SIZE 64 +#define V3D_UIFBLOCK_SIZE (4 * V3D_UBLOCK_SIZE) +#define V3D_UIFBLOCK_ROW_SIZE (4 * V3D_UIFBLOCK_SIZE) -#define PAGE_UB_ROWS (VC5_UIFCFG_PAGE_SIZE / VC5_UIFBLOCK_ROW_SIZE) +#define PAGE_UB_ROWS (V3D_UIFCFG_PAGE_SIZE / V3D_UIFBLOCK_ROW_SIZE) #define PAGE_UB_ROWS_TIMES_1_5 ((PAGE_UB_ROWS * 3) >> 1) -#define PAGE_CACHE_UB_ROWS (VC5_PAGE_CACHE_SIZE / VC5_UIFBLOCK_ROW_SIZE) +#define PAGE_CACHE_UB_ROWS (V3D_PAGE_CACHE_SIZE / V3D_UIFBLOCK_ROW_SIZE) #define PAGE_CACHE_MINUS_1_5_UB_ROWS (PAGE_CACHE_UB_ROWS - PAGE_UB_ROWS_TIMES_1_5) - #endif /* V3DV_LIMITS_H */ diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c index 0a38edb21..5555c690b 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c @@ -22,13 +22,175 @@ */ #include "v3dv_private.h" +#include "v3dv_meta_common.h" -#include "broadcom/cle/v3dx_pack.h" #include "compiler/nir/nir_builder.h" #include "vk_format_info.h" #include "util/u_pack_color.h" static void +get_hw_clear_color(struct v3dv_device *device, + const VkClearColorValue *color, + VkFormat fb_format, + VkFormat image_format, + uint32_t internal_type, + uint32_t internal_bpp, + uint32_t *hw_color) +{ + const uint32_t internal_size = 4 << internal_bpp; + + /* If the image format doesn't match the framebuffer format, then we are + * trying to clear an unsupported tlb format using a compatible + * format for the framebuffer. In this case, we want to make sure that + * we pack the clear value according to the original format semantics, + * not the compatible format. + */ + if (fb_format == image_format) { + v3dv_X(device, get_hw_clear_color)(color, internal_type, internal_size, + hw_color); + } else { + union util_color uc; + enum pipe_format pipe_image_format = + vk_format_to_pipe_format(image_format); + util_pack_color(color->float32, pipe_image_format, &uc); + memcpy(hw_color, uc.ui, internal_size); + } +} + +/* Returns true if the implementation is able to handle the case, false + * otherwise. 
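
/* Worked values for the renamed UIF constants above, which all V3D
 * implementations share: a UIF block is four 64-byte microtile blocks, a
 * block row is four UIF blocks, and the page cache spans eight banks of
 * 4 KiB pages.
 */
#define UIFCFG_BANKS       8
#define UIFCFG_PAGE_SIZE   4096
#define UBLOCK_SIZE        64
#define UIFBLOCK_SIZE      (4 * UBLOCK_SIZE)                  /* 256 B  */
#define UIFBLOCK_ROW_SIZE  (4 * UIFBLOCK_SIZE)                /* 1024 B */
#define PAGE_CACHE_SIZE    (UIFCFG_PAGE_SIZE * UIFCFG_BANKS)  /* 32 KiB */

#define PAGE_UB_ROWS           (UIFCFG_PAGE_SIZE / UIFBLOCK_ROW_SIZE) /* 4 */
#define PAGE_UB_ROWS_TIMES_1_5 ((PAGE_UB_ROWS * 3) >> 1)              /* 6 */
#define PAGE_CACHE_UB_ROWS     (PAGE_CACHE_SIZE / UIFBLOCK_ROW_SIZE)  /* 32 */
#define PAGE_CACHE_MINUS_1_5_UB_ROWS \
   (PAGE_CACHE_UB_ROWS - PAGE_UB_ROWS_TIMES_1_5)                      /* 26 */
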
+*/ +static bool +clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + const VkClearValue *clear_value, + const VkImageSubresourceRange *range) +{ + const VkOffset3D origin = { 0, 0, 0 }; + VkFormat fb_format; + if (!v3dv_meta_can_use_tlb(image, &origin, &fb_format)) + return false; + + uint32_t internal_type, internal_bpp; + v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) + (fb_format, range->aspectMask, + &internal_type, &internal_bpp); + + union v3dv_clear_value hw_clear_value = { 0 }; + if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + get_hw_clear_color(cmd_buffer->device, &clear_value->color, fb_format, + image->vk.format, internal_type, internal_bpp, + &hw_clear_value.color[0]); + } else { + assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) || + (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)); + hw_clear_value.z = clear_value->depthStencil.depth; + hw_clear_value.s = clear_value->depthStencil.stencil; + } + + uint32_t level_count = vk_image_subresource_level_count(&image->vk, range); + uint32_t min_level = range->baseMipLevel; + uint32_t max_level = range->baseMipLevel + level_count; + + /* For 3D images baseArrayLayer and layerCount must be 0 and 1 respectively. + * Instead, we need to consider the full depth dimension of the image, which + * goes from 0 up to the level's depth extent. + */ + uint32_t min_layer; + uint32_t max_layer; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { + min_layer = range->baseArrayLayer; + max_layer = range->baseArrayLayer + + vk_image_subresource_layer_count(&image->vk, range); + } else { + min_layer = 0; + max_layer = 0; + } + + for (uint32_t level = min_level; level < max_level; level++) { + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + max_layer = u_minify(image->vk.extent.depth, level); + + uint32_t width = u_minify(image->vk.extent.width, level); + uint32_t height = u_minify(image->vk.extent.height, level); + + struct v3dv_job *job = + v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL); + + if (!job) + return true; + + v3dv_job_start_frame(job, width, height, max_layer, false, + 1, internal_bpp, + image->vk.samples > VK_SAMPLE_COUNT_1_BIT); + + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, + internal_type, + &job->frame_tiling); + + v3dv_X(job->device, job_emit_binning_flush)(job); + + /* If this triggers it is an application bug: the spec requires + * that any aspects to clear are present in the image. 
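
/* Sketch of the layer-range handling in clear_image_tlb() above: array
 * images clear the requested [baseArrayLayer, baseArrayLayer+layerCount)
 * range, while 3D images ignore those fields (the spec fixes them to 0
 * and 1) and instead clear every depth slice of each level, a count that
 * shrinks as the mip chain is minified.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
minify(uint32_t v, uint32_t level)
{
   uint32_t r = v >> level;
   return r ? r : 1;
}

static void
clear_layer_range(bool is_3d, uint32_t depth, uint32_t level,
                  uint32_t base_layer, uint32_t layer_count,
                  uint32_t *min_layer, uint32_t *max_layer)
{
   if (is_3d) {
      *min_layer = 0;
      *max_layer = minify(depth, level); /* depth slices at this level */
   } else {
      *min_layer = base_layer;
      *max_layer = base_layer + layer_count;
   }
}
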
+ */ + assert(range->aspectMask & image->vk.aspects); + + v3dv_X(job->device, meta_emit_clear_image_rcl) + (job, image, &framebuffer, &hw_clear_value, + range->aspectMask, min_layer, max_layer, level); + + v3dv_cmd_buffer_finish_job(cmd_buffer); + } + + return true; +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer, + VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + V3DV_FROM_HANDLE(v3dv_image, image, _image); + + const VkClearValue clear_value = { + .color = *pColor, + }; + + for (uint32_t i = 0; i < rangeCount; i++) { + if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) + continue; + unreachable("Unsupported color clear."); + } +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, + VkImage _image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + V3DV_FROM_HANDLE(v3dv_image, image, _image); + + const VkClearValue clear_value = { + .depthStencil = *pDepthStencil, + }; + + for (uint32_t i = 0; i < rangeCount; i++) { + if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) + continue; + unreachable("Unsupported depth/stencil clear."); + } +} + +static void destroy_color_clear_pipeline(VkDevice _device, uint64_t pipeline, VkAllocationCallbacks *alloc) @@ -54,12 +216,20 @@ static VkResult create_color_clear_pipeline_layout(struct v3dv_device *device, VkPipelineLayout *pipeline_layout) { + /* FIXME: this is abusing a bit the API, since not all of our clear + * pipelines have a geometry shader. We could create 2 different pipeline + * layouts, but this works for us for now. + */ + VkPushConstantRange ranges[2] = { + { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16 }, + { VK_SHADER_STAGE_GEOMETRY_BIT, 16, 4 }, + }; + VkPipelineLayoutCreateInfo info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 0, - .pushConstantRangeCount = 1, - .pPushConstantRanges = - &(VkPushConstantRange) { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16 }, + .pushConstantRangeCount = 2, + .pPushConstantRanges = ranges, }; return v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), @@ -70,12 +240,20 @@ static VkResult create_depth_clear_pipeline_layout(struct v3dv_device *device, VkPipelineLayout *pipeline_layout) { + /* FIXME: this is abusing a bit the API, since not all of our clear + * pipelines have a geometry shader. We could create 2 different pipeline + * layouts, but this works for us for now. 
+ */ + VkPushConstantRange ranges[2] = { + { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4 }, + { VK_SHADER_STAGE_GEOMETRY_BIT, 4, 4 }, + }; + VkPipelineLayoutCreateInfo info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 0, - .pushConstantRangeCount = 1, - .pPushConstantRanges = - &(VkPushConstantRange) { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4 }, + .pushConstantRangeCount = 2, + .pPushConstantRanges = ranges }; return v3dv_CreatePipelineLayout(v3dv_device_to_handle(device), @@ -178,6 +356,70 @@ get_clear_rect_vs() } static nir_shader * +get_clear_rect_gs(uint32_t push_constant_layer_base) +{ + /* FIXME: this creates a geometry shader that takes the index of a single + * layer to clear from push constants, so we need to emit a draw call for + * each layer that we want to clear. We could actually do better and have it + * take a range of layers and then emit one triangle per layer to clear, + * however, if we were to do this we would need to be careful not to exceed + * the maximum number of output vertices allowed in a geometry shader. + */ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options, + "meta clear gs"); + nir_shader *nir = b.shader; + nir->info.inputs_read = 1ull << VARYING_SLOT_POS; + nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | + (1ull << VARYING_SLOT_LAYER); + nir->info.gs.input_primitive = GL_TRIANGLES; + nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.vertices_in = 3; + nir->info.gs.vertices_out = 3; + nir->info.gs.invocations = 1; + nir->info.gs.active_stream_mask = 0x1; + + /* in vec4 gl_Position[3] */ + nir_variable *gs_in_pos = + nir_variable_create(b.shader, nir_var_shader_in, + glsl_array_type(glsl_vec4_type(), 3, 0), + "in_gl_Position"); + gs_in_pos->data.location = VARYING_SLOT_POS; + + /* out vec4 gl_Position */ + nir_variable *gs_out_pos = + nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(), + "out_gl_Position"); + gs_out_pos->data.location = VARYING_SLOT_POS; + + /* out float gl_Layer */ + nir_variable *gs_out_layer = + nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(), + "out_gl_Layer"); + gs_out_layer->data.location = VARYING_SLOT_LAYER; + + /* Emit output triangle */ + for (uint32_t i = 0; i < 3; i++) { + /* gl_Position from shader input */ + nir_deref_instr *in_pos_i = + nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i); + nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); + + /* gl_Layer from push constants */ + nir_ssa_def *layer = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), + .base = push_constant_layer_base, .range = 4); + nir_store_var(&b, gs_out_layer, layer, 0x1); + + nir_emit_vertex(&b, 0); + } + + nir_end_primitive(&b, 0); + + return nir; +} + +static nir_shader * get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format) { const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); @@ -224,6 +466,7 @@ create_pipeline(struct v3dv_device *device, uint32_t subpass_idx, uint32_t samples, struct nir_shader *vs_nir, + struct nir_shader *gs_nir, struct nir_shader *fs_nir, const VkPipelineVertexInputStateCreateInfo *vi_state, const VkPipelineDepthStencilStateCreateInfo *ds_state, @@ -231,32 +474,41 @@ create_pipeline(struct v3dv_device *device, const VkPipelineLayout layout, VkPipeline *pipeline) { + VkPipelineShaderStageCreateInfo stages[3] = { 0 }; struct vk_shader_module vs_m; + struct vk_shader_module gs_m; 
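
/* For reference, a hand-written GLSL equivalent of get_clear_rect_gs()
 * above (the driver builds the shader directly in NIR, so this string is
 * purely illustrative): a passthrough triangle geometry shader that
 * stamps every vertex with a layer index read from push constants, which
 * is what routes each per-layer clear draw to the right framebuffer
 * layer. The offset of 16 matches the color-clear layout; depth clears
 * pass 4 instead.
 */
static const char *clear_rect_gs_glsl =
   "#version 450\n"
   "layout(triangles) in;\n"
   "layout(triangle_strip, max_vertices = 3) out;\n"
   "layout(push_constant) uniform pc { layout(offset = 16) int layer; };\n"
   "void main() {\n"
   "   for (int i = 0; i < 3; i++) {\n"
   "      gl_Position = gl_in[i].gl_Position;\n"
   "      gl_Layer = layer;\n"
   "      EmitVertex();\n"
   "   }\n"
   "   EndPrimitive();\n"
   "}\n";
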
struct vk_shader_module fs_m; + uint32_t stage_count = 0; v3dv_shader_module_internal_init(device, &vs_m, vs_nir); - if (fs_nir) - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); + stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stages[stage_count].stage = VK_SHADER_STAGE_VERTEX_BIT; + stages[stage_count].module = vk_shader_module_to_handle(&vs_m); + stages[stage_count].pName = "main"; + stage_count++; + + if (gs_nir) { + v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stages[stage_count].stage = VK_SHADER_STAGE_GEOMETRY_BIT; + stages[stage_count].module = vk_shader_module_to_handle(&gs_m); + stages[stage_count].pName = "main"; + stage_count++; + } - VkPipelineShaderStageCreateInfo stages[2] = { - { - .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .stage = VK_SHADER_STAGE_VERTEX_BIT, - .module = vk_shader_module_to_handle(&vs_m), - .pName = "main", - }, - { - .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .stage = VK_SHADER_STAGE_FRAGMENT_BIT, - .module = fs_nir ? vk_shader_module_to_handle(&fs_m) : VK_NULL_HANDLE, - .pName = "main", - }, - }; + if (fs_nir) { + v3dv_shader_module_internal_init(device, &fs_m, fs_nir); + stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stages[stage_count].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + stages[stage_count].module = vk_shader_module_to_handle(&fs_m); + stages[stage_count].pName = "main"; + stage_count++; + } VkGraphicsPipelineCreateInfo info = { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, - .stageCount = fs_nir ? 2 : 1, + .stageCount = stage_count, .pStages = stages, .pVertexInputState = vi_state, @@ -342,11 +594,13 @@ create_color_clear_pipeline(struct v3dv_device *device, VkFormat format, uint32_t samples, uint32_t components, + bool is_layered, VkPipelineLayout pipeline_layout, VkPipeline *pipeline) { nir_shader *vs_nir = get_clear_rect_vs(); nir_shader *fs_nir = get_color_clear_rect_fs(rt_idx, format); + nir_shader *gs_nir = is_layered ? get_clear_rect_gs(16) : NULL; const VkPipelineVertexInputStateCreateInfo vi_state = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, @@ -384,7 +638,7 @@ create_color_clear_pipeline(struct v3dv_device *device, return create_pipeline(device, pass, subpass_idx, samples, - vs_nir, fs_nir, + vs_nir, gs_nir, fs_nir, &vi_state, &ds_state, &cb_state, @@ -398,6 +652,7 @@ create_depth_clear_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, uint32_t subpass_idx, uint32_t samples, + bool is_layered, VkPipelineLayout pipeline_layout, VkPipeline *pipeline) { @@ -407,6 +662,7 @@ create_depth_clear_pipeline(struct v3dv_device *device, nir_shader *vs_nir = get_clear_rect_vs(); nir_shader *fs_nir = has_depth ? get_depth_clear_rect_fs() : NULL; + nir_shader *gs_nir = is_layered ? 
get_clear_rect_gs(4) : NULL; const VkPipelineVertexInputStateCreateInfo vi_state = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, @@ -441,7 +697,7 @@ create_depth_clear_pipeline(struct v3dv_device *device, return create_pipeline(device, pass, subpass_idx, samples, - vs_nir, fs_nir, + vs_nir, gs_nir, fs_nir, &vi_state, &ds_state, &cb_state, @@ -499,7 +755,8 @@ static inline uint64_t get_color_clear_pipeline_cache_key(uint32_t rt_idx, VkFormat format, uint32_t samples, - uint32_t components) + uint32_t components, + bool is_layered) { assert(rt_idx < V3D_MAX_DRAW_BUFFERS); @@ -518,6 +775,9 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, key |= ((uint64_t) components) << bit_offset; bit_offset += 4; + key |= (is_layered ? 1ull : 0ull) << bit_offset; + bit_offset += 1; + assert(bit_offset <= 64); return key; } @@ -525,7 +785,8 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, static inline uint64_t get_depth_clear_pipeline_cache_key(VkImageAspectFlags aspects, VkFormat format, - uint32_t samples) + uint32_t samples, + bool is_layered) { uint64_t key = 0; uint32_t bit_offset = 0; @@ -544,6 +805,9 @@ get_depth_clear_pipeline_cache_key(VkImageAspectFlags aspects, key |= ((uint64_t) has_stencil) << bit_offset; bit_offset++;; + key |= (is_layered ? 1ull : 0ull) << bit_offset; + bit_offset += 1; + assert(bit_offset <= 64); return key; } @@ -557,6 +821,7 @@ get_color_clear_pipeline(struct v3dv_device *device, VkFormat format, uint32_t samples, uint32_t components, + bool is_layered, struct v3dv_meta_color_clear_pipeline **pipeline) { assert(vk_format_is_color(format)); @@ -580,8 +845,8 @@ get_color_clear_pipeline(struct v3dv_device *device, uint64_t key; if (can_cache_pipeline) { - key = - get_color_clear_pipeline_cache_key(rt_idx, format, samples, components); + key = get_color_clear_pipeline_cache_key(rt_idx, format, samples, + components, is_layered); mtx_lock(&device->meta.mtx); struct hash_entry *entry = _mesa_hash_table_search(device->meta.color_clear.cache, &key); @@ -621,6 +886,7 @@ get_color_clear_pipeline(struct v3dv_device *device, format, samples, components, + is_layered, device->meta.color_clear.p_layout, &(*pipeline)->pipeline); if (result != VK_SUCCESS) @@ -660,6 +926,7 @@ get_depth_clear_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, uint32_t subpass_idx, uint32_t attachment_idx, + bool is_layered, struct v3dv_meta_depth_clear_pipeline **pipeline) { assert(subpass_idx < pass->subpass_count); @@ -673,7 +940,7 @@ get_depth_clear_pipeline(struct v3dv_device *device, assert(vk_format_is_depth_or_stencil(format)); const uint64_t key = - get_depth_clear_pipeline_cache_key(aspects, format, samples); + get_depth_clear_pipeline_cache_key(aspects, format, samples, is_layered); mtx_lock(&device->meta.mtx); struct hash_entry *entry = _mesa_hash_table_search(device->meta.depth_clear.cache, &key); @@ -696,6 +963,7 @@ get_depth_clear_pipeline(struct v3dv_device *device, pass, subpass_idx, samples, + is_layered, device->meta.depth_clear.p_layout, &(*pipeline)->pipeline); if (result != VK_SUCCESS) @@ -722,272 +990,15 @@ fail: return result; } -static VkFormat -get_color_format_for_depth_stencil_format(VkFormat format) -{ - /* For single depth/stencil aspect formats, we just choose a compatible - * 1 channel format, but for combined depth/stencil we want an RGBA format - * so we can specify the channels we want to write. 
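
/* Sketch of the pipeline-cache keying above: each parameter that selects
 * a distinct clear pipeline is packed into a fixed bit range of a 64-bit
 * key (the new is_layered flag takes one extra bit), and the key indexes
 * a hash table guarded by the meta mutex. Field widths here are
 * illustrative; the driver asserts the running offset stays within 64.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t
color_clear_key(uint32_t rt_idx, uint32_t vk_format, uint32_t samples,
                uint32_t components, bool is_layered)
{
   uint64_t key = 0;
   uint32_t off = 0;

   key |= (uint64_t)rt_idx << off;           off += 2;  /* up to 4 RTs   */
   key |= (uint64_t)vk_format << off;        off += 32;
   key |= (uint64_t)samples << off;          off += 4;
   key |= (uint64_t)components << off;       off += 4;  /* RGBA mask     */
   key |= (is_layered ? 1ull : 0ull) << off; off += 1;

   /* off <= 64 must hold, mirroring the driver's assert */
   return key;
}
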
- */ - switch (format) { - case VK_FORMAT_D16_UNORM: - return VK_FORMAT_R16_UINT; - case VK_FORMAT_D32_SFLOAT: - return VK_FORMAT_R32_SFLOAT; - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: - return VK_FORMAT_R8G8B8A8_UINT; - default: - unreachable("Unsupported depth/stencil format"); - }; -} - -/** - * Emits a scissored quad in the clear color, however, unlike the subpass - * versions, this creates its own framebuffer setup with a single color - * attachment, and therefore spanws new jobs, making it much slower than the - * subpass version. - * - * This path is only used when we have clears on layers other than the - * base layer in a framebuffer attachment, since we don't currently - * support any form of layered rendering that would allow us to implement - * this in the subpass version. - * - * Notice this can also handle depth/stencil formats by rendering to the - * depth/stencil target using a compatible color format. - */ -static void -emit_color_clear_rect(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachment_idx, - VkFormat rt_format, - uint32_t rt_samples, - uint32_t rt_components, - VkClearColorValue clear_color, - const VkClearRect *rect) -{ - assert(cmd_buffer->state.pass); - struct v3dv_device *device = cmd_buffer->device; - struct v3dv_render_pass *pass = cmd_buffer->state.pass; - - assert(attachment_idx != VK_ATTACHMENT_UNUSED && - attachment_idx < pass->attachment_count); - - struct v3dv_meta_color_clear_pipeline *pipeline = NULL; - VkResult result = - get_color_clear_pipeline(device, - NULL, 0, /* Not using current subpass */ - 0, attachment_idx, - rt_format, rt_samples, rt_components, - &pipeline); - if (result != VK_SUCCESS) { - if (result == VK_ERROR_OUT_OF_HOST_MEMORY) - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - assert(pipeline && pipeline->pipeline && pipeline->pass); - - /* Since we are not emitting the draw call in the current subpass we should - * be caching the clear pipeline and we don't have to take care of destorying - * it below. - */ - assert(pipeline->cached); - - /* Store command buffer state for the current subpass before we interrupt - * it to emit the color clear pass and then finish the job for the - * interrupted subpass. - */ - v3dv_cmd_buffer_meta_state_push(cmd_buffer, false); - v3dv_cmd_buffer_finish_job(cmd_buffer); - - struct v3dv_framebuffer *subpass_fb = - v3dv_framebuffer_from_handle(cmd_buffer->state.meta.framebuffer); - VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer); - VkDevice device_handle = v3dv_device_to_handle(cmd_buffer->device); - - /* If we are clearing a depth/stencil attachment as a color attachment - * then we need to configure the framebuffer to the compatible color - * format. - */ - const struct v3dv_image_view *att_iview = - subpass_fb->attachments[attachment_idx]; - const bool is_depth_or_stencil = - vk_format_is_depth_or_stencil(att_iview->vk_format); - - /* Emit the pass for each attachment layer, which creates a framebuffer - * for each selected layer of the attachment and then renders a scissored - * quad in the clear color. - */ - uint32_t dirty_dynamic_state = 0; - for (uint32_t i = 0; i < rect->layerCount; i++) { - VkImageViewCreateInfo fb_layer_view_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .image = v3dv_image_to_handle((struct v3dv_image *)att_iview->image), - .viewType = - v3dv_image_type_to_view_type(att_iview->image->type), - .format = is_depth_or_stencil ? 
rt_format : att_iview->vk_format, - .subresourceRange = { - .aspectMask = is_depth_or_stencil ? VK_IMAGE_ASPECT_COLOR_BIT : - att_iview->aspects, - .baseMipLevel = att_iview->base_level, - .levelCount = att_iview->max_level - att_iview->base_level + 1, - .baseArrayLayer = att_iview->first_layer + rect->baseArrayLayer + i, - .layerCount = 1, - }, - }; - VkImageView fb_attachment; - result = v3dv_CreateImageView(v3dv_device_to_handle(device), - &fb_layer_view_info, - &device->vk.alloc, &fb_attachment); - if (result != VK_SUCCESS) - goto fail; - - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)fb_attachment, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView); - - VkFramebufferCreateInfo fb_info = { - .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, - .renderPass = v3dv_render_pass_to_handle(pass), - .attachmentCount = 1, - .pAttachments = &fb_attachment, - .width = subpass_fb->width, - .height = subpass_fb->height, - .layers = 1, - }; - - VkFramebuffer fb; - result = v3dv_CreateFramebuffer(device_handle, &fb_info, - &cmd_buffer->device->vk.alloc, &fb); - if (result != VK_SUCCESS) - goto fail; - - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)fb, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer); - - VkRenderPassBeginInfo rp_info = { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .renderPass = pipeline->pass, - .framebuffer = fb, - .renderArea = { - .offset = { rect->rect.offset.x, rect->rect.offset.y }, - .extent = { rect->rect.extent.width, rect->rect.extent.height } }, - .clearValueCount = 0, - }; - - v3dv_CmdBeginRenderPass(cmd_buffer_handle, &rp_info, - VK_SUBPASS_CONTENTS_INLINE); - - struct v3dv_job *job = cmd_buffer->state.job; - if (!job) - goto fail; - job->is_subpass_continue = true; - - v3dv_CmdPushConstants(cmd_buffer_handle, - device->meta.color_clear.p_layout, - VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16, - &clear_color); - - v3dv_CmdBindPipeline(cmd_buffer_handle, - VK_PIPELINE_BIND_POINT_GRAPHICS, - pipeline->pipeline); - - const VkViewport viewport = { - .x = rect->rect.offset.x, - .y = rect->rect.offset.y, - .width = rect->rect.extent.width, - .height = rect->rect.extent.height, - .minDepth = 0.0f, - .maxDepth = 1.0f - }; - v3dv_CmdSetViewport(cmd_buffer_handle, 0, 1, &viewport); - v3dv_CmdSetScissor(cmd_buffer_handle, 0, 1, &rect->rect); - - v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); - - v3dv_CmdEndRenderPass(cmd_buffer_handle); - } - - /* The clear pipeline sets viewport and scissor state, so we need - * to restore it - */ - dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; - -fail: - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true); -} - -static void -emit_ds_clear_rect(struct v3dv_cmd_buffer *cmd_buffer, - VkImageAspectFlags aspects, - uint32_t attachment_idx, - VkClearDepthStencilValue clear_ds, - const VkClearRect *rect) -{ - assert(cmd_buffer->state.pass); - assert(attachment_idx != VK_ATTACHMENT_UNUSED); - assert(attachment_idx < cmd_buffer->state.pass->attachment_count); - - VkFormat format = - cmd_buffer->state.pass->attachments[attachment_idx].desc.format; - assert ((aspects & ~vk_format_aspects(format)) == 0); - - uint32_t samples = - cmd_buffer->state.pass->attachments[attachment_idx].desc.samples; - - enum pipe_format pformat = vk_format_to_pipe_format(format); - VkClearColorValue clear_color; - uint32_t clear_zs = - util_pack_z_stencil(pformat, clear_ds.depth, clear_ds.stencil); - - /* We implement depth/stencil clears by turning them into color clears - * 
with a compatible color format. - */ - VkFormat color_format = get_color_format_for_depth_stencil_format(format); - - uint32_t comps; - if (color_format == VK_FORMAT_R8G8B8A8_UINT) { - /* We are clearing a D24 format so we need to select the channels that we - * are being asked to clear to avoid clearing aspects that should be - * preserved. Also, the hardware uses the MSB channels to store the D24 - * component, so we need to shift the components in the clear value to - * match that. - */ - comps = 0; - if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { - comps |= VK_COLOR_COMPONENT_R_BIT; - clear_color.uint32[0] = clear_zs >> 24; - } - if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { - comps |= VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | - VK_COLOR_COMPONENT_A_BIT; - clear_color.uint32[1] = (clear_zs >> 0) & 0xff; - clear_color.uint32[2] = (clear_zs >> 8) & 0xff; - clear_color.uint32[3] = (clear_zs >> 16) & 0xff; - } - } else { - /* For anything else we use a single component format */ - comps = VK_COLOR_COMPONENT_R_BIT; - clear_color.uint32[0] = clear_zs; - } - - emit_color_clear_rect(cmd_buffer, attachment_idx, - color_format, samples, comps, - clear_color, rect); -} - -/* Emits a scissored quad in the clear color. - * - * This path only works for clears to the base layer in the framebuffer, since - * we don't currently support any form of layered rendering. - */ +/* Emits a scissored quad in the clear color */ static void emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_render_pass *pass, struct v3dv_subpass *subpass, uint32_t rt_idx, const VkClearColorValue *clear_color, + bool is_layered, + bool all_rects_same_layers, uint32_t rect_count, const VkClearRect *rects) { @@ -1016,6 +1027,7 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, format, samples, components, + is_layered, &pipeline); if (result != VK_SUCCESS) { if (result == VK_ERROR_OUT_OF_HOST_MEMORY) @@ -1040,7 +1052,6 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; for (uint32_t i = 0; i < rect_count; i++) { - assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1); const VkViewport viewport = { .x = rects[i].rect.offset.x, .y = rects[i].rect.offset.y, @@ -1051,7 +1062,20 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, }; v3dv_CmdSetViewport(cmd_buffer_handle, 0, 1, &viewport); v3dv_CmdSetScissor(cmd_buffer_handle, 0, 1, &rects[i].rect); - v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); + + if (is_layered) { + for (uint32_t layer_offset = 0; layer_offset < rects[i].layerCount; + layer_offset++) { + uint32_t layer = rects[i].baseArrayLayer + layer_offset; + v3dv_CmdPushConstants(cmd_buffer_handle, + cmd_buffer->device->meta.depth_clear.p_layout, + VK_SHADER_STAGE_GEOMETRY_BIT, 16, 4, &layer); + v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); + } + } else { + assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1); + v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); + } } /* Subpass pipelines can't be cached because they include a reference to the @@ -1068,9 +1092,6 @@ emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, /* Emits a scissored quad, clearing the depth aspect by writing to gl_FragDepth * and the stencil aspect by using stencil testing. - * - * This path only works for clears to the base layer in the framebuffer, since - * we don't currently support any form of layered rendering. 
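emit_ds_clear_rect above turns a depth/stencil clear into a color clear: it packs the clear values into a raw Z/S word with util_pack_z_stencil and then scatters the bytes across RGBA8 channels, masking the writes so only the requested aspect is touched. A standalone sketch of just that remapping, mirroring the shifts in the code (stencil byte in bits 24-31 of the packed word, depth in bits 0-23):

#include <stdbool.h>
#include <stdint.h>

/* Scatter a packed D24S8 clear word across RGBA8 channels. The hardware
 * keeps the 24 depth bits in the MSB channels, so the stencil byte lands
 * in R and the three depth bytes in G/B/A.
 */
static void
scatter_d24s8_clear(uint32_t clear_zs, bool depth, bool stencil,
                    uint32_t rgba[4], uint32_t *component_mask)
{
   *component_mask = 0;
   if (stencil) {
      *component_mask |= 0x1;             /* R */
      rgba[0] = clear_zs >> 24;
   }
   if (depth) {
      *component_mask |= 0x2 | 0x4 | 0x8; /* G, B, A */
      rgba[1] = (clear_zs >> 0) & 0xff;
      rgba[2] = (clear_zs >> 8) & 0xff;
      rgba[3] = (clear_zs >> 16) & 0xff;
   }
}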
*/ static void emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, @@ -1078,6 +1099,8 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_subpass *subpass, VkImageAspectFlags aspects, const VkClearDepthStencilValue *clear_ds, + bool is_layered, + bool all_rects_same_layers, uint32_t rect_count, const VkClearRect *rects) { @@ -1094,6 +1117,7 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, pass, cmd_buffer->state.subpass_idx, attachment_idx, + is_layered, &pipeline); if (result != VK_SUCCESS) { if (result == VK_ERROR_OUT_OF_HOST_MEMORY) @@ -1130,7 +1154,6 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, } for (uint32_t i = 0; i < rect_count; i++) { - assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1); const VkViewport viewport = { .x = rects[i].rect.offset.x, .y = rects[i].rect.offset.y, @@ -1141,485 +1164,46 @@ emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer, }; v3dv_CmdSetViewport(cmd_buffer_handle, 0, 1, &viewport); v3dv_CmdSetScissor(cmd_buffer_handle, 0, 1, &rects[i].rect); - v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); - } - - v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); -} - -static void -emit_tlb_clear_store(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - uint32_t attachment_idx, - uint32_t layer, - uint32_t buffer) -{ - const struct v3dv_image_view *iview = - cmd_buffer->state.framebuffer->attachments[attachment_idx]; - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - uint32_t layer_offset = v3dv_layer_offset(image, - iview->base_level, - iview->first_layer + layer); - - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = buffer; - store.address = v3dv_cl_address(image->mem->bo, layer_offset); - store.clear_buffer_being_stored = false; - - store.output_image_format = iview->format->rt_type; - store.r_b_swap = iview->swap_rb; - store.memory_format = slice->tiling; - - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - store.height_in_ub_or_stride = - slice->padded_height_of_output_image_in_uif_blocks; - } else if (slice->tiling == VC5_TILING_RASTER) { - store.height_in_ub_or_stride = slice->stride; - } - - if (image->samples > VK_SAMPLE_COUNT_1_BIT) - store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; - else - store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; - } -} - -static void -emit_tlb_clear_stores(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_cl *cl, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t layer) -{ - struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - bool has_stores = false; - for (uint32_t i = 0; i < attachment_count; i++) { - uint32_t attachment_idx; - uint32_t buffer; - if (attachments[i].aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT)) { - attachment_idx = subpass->ds_attachment.attachment; - buffer = v3dv_zs_buffer_from_aspect_bits(attachments[i].aspectMask); + if (is_layered) { + for (uint32_t layer_offset = 0; layer_offset < rects[i].layerCount; + layer_offset++) { + uint32_t layer = rects[i].baseArrayLayer + layer_offset; + v3dv_CmdPushConstants(cmd_buffer_handle, + cmd_buffer->device->meta.depth_clear.p_layout, + VK_SHADER_STAGE_GEOMETRY_BIT, 4, 4, &layer); + v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); + } } else { - uint32_t 
rt_idx = attachments[i].colorAttachment; - attachment_idx = subpass->color_attachments[rt_idx].attachment; - buffer = RENDER_TARGET_0 + rt_idx; + assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1); + v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0); } - - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - has_stores = true; - emit_tlb_clear_store(cmd_buffer, cl, attachment_idx, layer, buffer); - } - - if (!has_stores) { - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - } -} - -static void -emit_tlb_clear_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t layer) -{ - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(cmd_buffer, NULL); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - cl_emit(cl, END_OF_LOADS, end); /* Nothing to load */ - - cl_emit(cl, PRIM_LIST_FORMAT, fmt) { - fmt.primitive_type = LIST_TRIANGLES; } - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - emit_tlb_clear_stores(cmd_buffer, cl, attachment_count, attachments, layer); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } + v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false); } static void -emit_tlb_clear_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t layer) +gather_layering_info(uint32_t rect_count, const VkClearRect *rects, + bool *is_layered, bool *all_rects_same_layers) { - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - - struct v3dv_job *job = cmd_buffer->state.job; - struct v3dv_cl *rcl = &job->rcl; - - const struct v3dv_frame_tiling *tiling = &job->frame_tiling; - - const uint32_t tile_alloc_offset = - 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y; - cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { - list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset); - } - - cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { - config.number_of_bin_tile_lists = 1; - config.total_frame_width_in_tiles = tiling->draw_tiles_x; - config.total_frame_height_in_tiles = tiling->draw_tiles_y; - - config.supertile_width_in_tiles = tiling->supertile_width; - config.supertile_height_in_tiles = tiling->supertile_height; - - config.total_frame_width_in_supertiles = - tiling->frame_width_in_supertiles; - config.total_frame_height_in_supertiles = - tiling->frame_height_in_supertiles; - } - - /* Emit the clear and also the workaround for GFXH-1742 */ - for (int i = 0; i < 2; i++) { - cl_emit(rcl, TILE_COORDINATES, coords); - cl_emit(rcl, END_OF_LOADS, end); - cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - if (i == 0) { - cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { - clear.clear_z_stencil_buffer = true; - clear.clear_all_render_targets = true; - } + *all_rects_same_layers = true; + + uint32_t min_layer = rects[0].baseArrayLayer; + uint32_t max_layer = rects[0].baseArrayLayer + rects[0].layerCount - 1; + for (uint32_t i = 1; i < rect_count; i++) { + if (rects[i].baseArrayLayer != rects[i - 
1].baseArrayLayer || + rects[i].layerCount != rects[i - 1].layerCount) { + *all_rects_same_layers = false; + min_layer = MIN2(min_layer, rects[i].baseArrayLayer); + max_layer = MAX2(max_layer, rects[i].baseArrayLayer + + rects[i].layerCount - 1); } - cl_emit(rcl, END_OF_TILE_MARKER, end); } - cl_emit(rcl, FLUSH_VCD_CACHE, flush); - - emit_tlb_clear_per_tile_rcl(cmd_buffer, attachment_count, attachments, layer); - - uint32_t supertile_w_in_pixels = - tiling->tile_width * tiling->supertile_width; - uint32_t supertile_h_in_pixels = - tiling->tile_height * tiling->supertile_height; - - const uint32_t max_render_x = framebuffer->width - 1; - const uint32_t max_render_y = framebuffer->height - 1; - const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels; - const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels; - - for (int y = 0; y <= max_y_supertile; y++) { - for (int x = 0; x <= max_x_supertile; x++) { - cl_emit(rcl, SUPERTILE_COORDINATES, coords) { - coords.column_number_in_supertiles = x; - coords.row_number_in_supertiles = y; - } - } - } + *is_layered = !(min_layer == 0 && max_layer == 0); } -static void -emit_tlb_clear_job(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t base_layer, - uint32_t layer_count) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - const struct v3dv_framebuffer *framebuffer = state->framebuffer; - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - struct v3dv_job *job = cmd_buffer->state.job; - assert(job); - - /* Check how many color attachments we have and also if we have a - * depth/stencil attachment. - */ - uint32_t color_attachment_count = 0; - VkClearAttachment color_attachments[4]; - const VkClearDepthStencilValue *ds_clear_value = NULL; - uint8_t internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F; - for (uint32_t i = 0; i < attachment_count; i++) { - if (attachments[i].aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT)) { - assert(subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED); - ds_clear_value = &attachments[i].clearValue.depthStencil; - struct v3dv_render_pass_attachment *att = - &state->pass->attachments[subpass->ds_attachment.attachment]; - internal_depth_type = v3dv_get_internal_depth_type(att->desc.format); - } else if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - color_attachments[color_attachment_count++] = attachments[i]; - } - } - - uint8_t internal_bpp; - bool msaa; - v3dv_framebuffer_compute_internal_bpp_msaa(framebuffer, subpass, - &internal_bpp, &msaa); - - v3dv_job_start_frame(job, - framebuffer->width, - framebuffer->height, - framebuffer->layers, - color_attachment_count, - internal_bpp, msaa); - - struct v3dv_cl *rcl = &job->rcl; - v3dv_cl_ensure_space_with_branch(rcl, 200 + - layer_count * 256 * - cl_packet_length(SUPERTILE_COORDINATES)); - v3dv_return_if_oom(cmd_buffer, NULL); - - const struct v3dv_frame_tiling *tiling = &job->frame_tiling; - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { - config.early_z_disable = true; - config.image_width_pixels = framebuffer->width; - config.image_height_pixels = framebuffer->height; - config.number_of_render_targets = MAX2(color_attachment_count, 1); - config.multisample_mode_4x = false; /* FIXME */ - config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; - config.internal_depth_type = internal_depth_type; - } - - for (uint32_t i = 0; i < color_attachment_count; i++) { - uint32_t 
rt_idx = color_attachments[i].colorAttachment; - uint32_t attachment_idx = subpass->color_attachments[rt_idx].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - const struct v3dv_render_pass_attachment *attachment = - &state->pass->attachments[attachment_idx]; - - uint32_t internal_type, internal_bpp, internal_size; - const struct v3dv_format *format = - v3dv_get_format(attachment->desc.format); - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, - &internal_type, - &internal_bpp); - internal_size = 4 << internal_bpp; - - uint32_t clear_color[4] = { 0 }; - v3dv_get_hw_clear_color(&color_attachments[i].clearValue.color, - internal_type, - internal_size, - clear_color); - - struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - - uint32_t clear_pad = 0; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - int uif_block_height = v3d_utile_height(image->cpp) * 2; - - uint32_t implicit_padded_height = - align(framebuffer->height, uif_block_height) / uif_block_height; - - if (slice->padded_height_of_output_image_in_uif_blocks - - implicit_padded_height >= 15) { - clear_pad = slice->padded_height_of_output_image_in_uif_blocks; - } - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { - clear.clear_color_low_32_bits = clear_color[0]; - clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; - clear.render_target_number = i; - }; - - if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { - clear.clear_color_mid_low_32_bits = - ((clear_color[1] >> 24) | (clear_color[2] << 8)); - clear.clear_color_mid_high_24_bits = - ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); - clear.render_target_number = i; - }; - } - - if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { - clear.uif_padded_height_in_uif_blocks = clear_pad; - clear.clear_color_high_16_bits = clear_color[3] >> 16; - clear.render_target_number = i; - }; - } - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - v3dv_render_pass_setup_render_target(cmd_buffer, 0, - &rt.render_target_0_internal_bpp, - &rt.render_target_0_internal_type, - &rt.render_target_0_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 1, - &rt.render_target_1_internal_bpp, - &rt.render_target_1_internal_type, - &rt.render_target_1_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 2, - &rt.render_target_2_internal_bpp, - &rt.render_target_2_internal_type, - &rt.render_target_2_clamp); - v3dv_render_pass_setup_render_target(cmd_buffer, 3, - &rt.render_target_3_internal_bpp, - &rt.render_target_3_internal_type, - &rt.render_target_3_clamp); - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { - clear.z_clear_value = ds_clear_value ? ds_clear_value->depth : 1.0f; - clear.stencil_clear_value = ds_clear_value ? 
ds_clear_value->stencil : 0; - }; - - cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { - init.use_auto_chained_tile_lists = true; - init.size_of_first_block_in_chained_tile_lists = - TILE_ALLOCATION_BLOCK_SIZE_64B; - } - - for (int layer = base_layer; layer < base_layer + layer_count; layer++) { - emit_tlb_clear_layer_rcl(cmd_buffer, - attachment_count, - attachments, - layer); - } - - cl_emit(rcl, END_OF_RENDERING, end); -} - -static void -emit_tlb_clear(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t base_layer, - uint32_t layer_count) -{ - struct v3dv_job *job = - v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx, - V3DV_JOB_TYPE_GPU_CL); - - /* vkCmdClearAttachments runs inside a render pass */ - job->is_subpass_continue = true; - - emit_tlb_clear_job(cmd_buffer, - attachment_count, - attachments, - base_layer, layer_count); - - v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx); -} - -static bool -is_subrect(const VkRect2D *r0, const VkRect2D *r1) -{ - return r0->offset.x <= r1->offset.x && - r0->offset.y <= r1->offset.y && - r0->offset.x + r0->extent.width >= r1->offset.x + r1->extent.width && - r0->offset.y + r0->extent.height >= r1->offset.y + r1->extent.height; -} - -static bool -can_use_tlb_clear(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t rect_count, - const VkClearRect* rects) -{ - const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; - - const VkRect2D *render_area = &cmd_buffer->state.render_area; - - /* Check if we are clearing a single region covering the entire framebuffer - * and that we are not constrained by the current render area. - * - * From the Vulkan 1.0 spec: - * - * "The vkCmdClearAttachments command is not affected by the bound - * pipeline state." - * - * So we can ignore scissor and viewport state for this check. - */ - const VkRect2D fb_rect = { - { 0, 0 }, - { framebuffer->width, framebuffer->height } - }; - - return rect_count == 1 && - is_subrect(&rects[0].rect, &fb_rect) && - is_subrect(render_area, &fb_rect); -} - -static void -handle_deferred_clear_attachments(struct v3dv_cmd_buffer *cmd_buffer, - uint32_t attachmentCount, - const VkClearAttachment *pAttachments, - uint32_t rectCount, - const VkClearRect *pRects) -{ - /* Finish the current job */ - v3dv_cmd_buffer_finish_job(cmd_buffer); - - /* Add a deferred clear attachments job right after that we will process - * when we execute this secondary command buffer into a primary. 
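handle_deferred_clear_attachments records the vkCmdClearAttachments arguments into a CPU job for later replay. The deep copies matter: pAttachments and pRects are only guaranteed to be valid for the duration of the call, while the job executes when the secondary command buffer runs inside a primary. A reduced sketch of the record step, using simplified stand-in types rather than the driver's own job structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <vulkan/vulkan.h>

struct deferred_clear_job {
   uint32_t attachment_count;
   VkClearAttachment attachments[5]; /* 4 color + depth/stencil */
   uint32_t rect_count;
   VkClearRect *rects;               /* heap copy, freed with the job */
};

static bool
record_deferred_clear(struct deferred_clear_job *job,
                      uint32_t attachment_count,
                      const VkClearAttachment *attachments,
                      uint32_t rect_count, const VkClearRect *rects)
{
   job->rects = malloc(sizeof(VkClearRect) * rect_count);
   if (!job->rects)
      return false; /* caller flags out-of-memory and aborts recording */

   job->attachment_count = attachment_count;
   memcpy(job->attachments, attachments,
          sizeof(VkClearAttachment) * attachment_count);
   job->rect_count = rect_count;
   memcpy(job->rects, rects, sizeof(VkClearRect) * rect_count);
   return true;
}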
- */ - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS, - cmd_buffer, - cmd_buffer->state.subpass_idx); - v3dv_return_if_oom(cmd_buffer, NULL); - - job->cpu.clear_attachments.rects = - vk_alloc(&cmd_buffer->device->vk.alloc, - sizeof(VkClearRect) * rectCount, 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!job->cpu.clear_attachments.rects) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - - job->cpu.clear_attachments.attachment_count = attachmentCount; - memcpy(job->cpu.clear_attachments.attachments, pAttachments, - sizeof(VkClearAttachment) * attachmentCount); - - job->cpu.clear_attachments.rect_count = rectCount; - memcpy(job->cpu.clear_attachments.rects, pRects, - sizeof(VkClearRect) * rectCount); - - list_addtail(&job->list_link, &cmd_buffer->jobs); - - /* Resume the subpass so we can continue recording commands */ - v3dv_cmd_buffer_subpass_resume(cmd_buffer, - cmd_buffer->state.subpass_idx); -} - -static bool -all_clear_rects_in_base_layer(uint32_t rect_count, const VkClearRect *rects) -{ - for (uint32_t i = 0; i < rect_count; i++) { - if (rects[i].baseArrayLayer != 0 || rects[i].layerCount != 1) - return false; - } - return true; -} - -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, const VkClearAttachment *pAttachments, @@ -1631,117 +1215,31 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, /* We can only clear attachments in the current subpass */ assert(attachmentCount <= 5); /* 4 color + D/S */ - /* Clear attachments may clear multiple layers of the framebuffer, which - * currently requires that we emit multiple jobs (one per layer) and - * therefore requires that we have the framebuffer information available - * to select the destination layers. - * - * For secondary command buffers the framebuffer state may not be available - * until they are executed inside a primary command buffer, so in that case - * we need to defer recording of the command until that moment. - * - * FIXME: once we add support for geometry shaders in the driver we could - * avoid emitting a job per layer to implement this by always using the clear - * rect path below with a passthrough geometry shader to select the layer to - * clear. If we did that we would not need to special case secondary command - * buffers here and we could ensure that any secondary command buffer in a - * render pass only has on job with a partial CL, which would simplify things - * quite a bit. - */ - if (!cmd_buffer->state.framebuffer) { - assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); - handle_deferred_clear_attachments(cmd_buffer, - attachmentCount, pAttachments, - rectCount, pRects); - return; - } - - assert(cmd_buffer->state.framebuffer); - struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &cmd_buffer->state.pass->subpasses[cmd_buffer->state.subpass_idx]; - /* First we try to handle this by emitting a clear rect inside the - * current job for this subpass. This should be optimal but this method - * cannot handle clearing layers other than the base layer, since we don't - * support any form of layered rendering yet. 
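The removed FIXME above anticipated exactly the approach this patch implements: with a passthrough geometry shader selecting the destination layer, the per-rect loops no longer need to spawn one job per layer. A sketch of that draw loop using core Vulkan entry points; the push-constant offset is a parameter here because the color and depth/stencil layouts place the layer word at different offsets (16 and 4 respectively in this diff):

#include <vulkan/vulkan.h>

static void
draw_layered_clear_rect(VkCommandBuffer cmd, VkPipelineLayout layout,
                        uint32_t gs_pc_offset, const VkClearRect *rect)
{
   for (uint32_t i = 0; i < rect->layerCount; i++) {
      uint32_t layer = rect->baseArrayLayer + i;
      /* The passthrough geometry shader reads this word and uses it to
       * redirect the quad to the selected framebuffer layer.
       */
      vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_GEOMETRY_BIT,
                         gs_pc_offset, sizeof(layer), &layer);
      vkCmdDraw(cmd, 4, 1, 0, 0); /* scissored full-quad strip */
   }
}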
- */ - if (all_clear_rects_in_base_layer(rectCount, pRects)) { - for (uint32_t i = 0; i < attachmentCount; i++) { - if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - emit_subpass_color_clear_rects(cmd_buffer, pass, subpass, - pAttachments[i].colorAttachment, - &pAttachments[i].clearValue.color, - rectCount, pRects); - } else { - emit_subpass_ds_clear_rects(cmd_buffer, pass, subpass, - pAttachments[i].aspectMask, - &pAttachments[i].clearValue.depthStencil, - rectCount, pRects); - } - } - return; - } - - perf_debug("Falling back to slow path for vkCmdClearAttachments due to " - "clearing layers other than the base array layer.\n"); - - /* If we can't handle this as a draw call inside the current job then we - * will have to spawn jobs for the clears, which will be slow. In that case, - * try to use the TLB to clear if possible. - */ - if (can_use_tlb_clear(cmd_buffer, rectCount, pRects)) { - emit_tlb_clear(cmd_buffer, attachmentCount, pAttachments, - pRects[0].baseArrayLayer, pRects[0].layerCount); - return; - } - - /* Otherwise, fall back to drawing rects with the clear value using a - * separate job. This is the slowest path. + /* Emit a clear rect inside the current job for this subpass. For layered + * framebuffers, we use a geometry shader to redirect clears to the + * appropriate layers. */ + bool is_layered, all_rects_same_layers; + gather_layering_info(rectCount, pRects, &is_layered, &all_rects_same_layers); for (uint32_t i = 0; i < attachmentCount; i++) { - uint32_t attachment_idx = VK_ATTACHMENT_UNUSED; - - if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - uint32_t rt_idx = pAttachments[i].colorAttachment; - attachment_idx = subpass->color_attachments[rt_idx].attachment; - } else if (pAttachments[i].aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT)) { - attachment_idx = subpass->ds_attachment.attachment; - } - - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - const uint32_t components = VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | - VK_COLOR_COMPONENT_A_BIT; - const uint32_t samples = - cmd_buffer->state.pass->attachments[attachment_idx].desc.samples; - const VkFormat format = - cmd_buffer->state.pass->attachments[attachment_idx].desc.format; - for (uint32_t j = 0; j < rectCount; j++) { - emit_color_clear_rect(cmd_buffer, - attachment_idx, - format, - samples, - components, - pAttachments[i].clearValue.color, - &pRects[j]); - } + emit_subpass_color_clear_rects(cmd_buffer, pass, subpass, + pAttachments[i].colorAttachment, + &pAttachments[i].clearValue.color, + is_layered, all_rects_same_layers, + rectCount, pRects); } else { - for (uint32_t j = 0; j < rectCount; j++) { - emit_ds_clear_rect(cmd_buffer, - pAttachments[i].aspectMask, - attachment_idx, - pAttachments[i].clearValue.depthStencil, - &pRects[j]); - } + emit_subpass_ds_clear_rects(cmd_buffer, pass, subpass, + pAttachments[i].aspectMask, + &pAttachments[i].clearValue.depthStencil, + is_layered, all_rects_same_layers, + rectCount, pRects); } } } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_common.h b/lib/mesa/src/broadcom/vulkan/v3dv_meta_common.h new file mode 100644 index 000000000..555b55f90 --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_common.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the 
"Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef V3DV_META_COMMON_H +#define V3DV_META_COMMON_H + +/* Disable level 0 write, just write following mipmaps */ +#define V3D_TFU_IOA_DIMTW (1 << 0) +#define V3D_TFU_IOA_FORMAT_SHIFT 3 +#define V3D_TFU_IOA_FORMAT_LINEARTILE 3 +#define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D_TFU_ICFG_NUMMM_SHIFT 5 +#define V3D_TFU_ICFG_TTYPE_SHIFT 9 + +#define V3D_TFU_ICFG_OPAD_SHIFT 22 + +#define V3D_TFU_ICFG_FORMAT_SHIFT 18 +#define V3D_TFU_ICFG_FORMAT_RASTER 0 +#define V3D_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D_TFU_ICFG_FORMAT_UIF_XOR 15 + +/** + * Copy/Clear operations implemented in v3dv_meta_*.c that use the TLB hardware + * need to figure out TLB programming from the target image data instead of an + * actual Vulkan framebuffer object. For the most part, the job's frame tiling + * information is enough for this, however we still need additional information + * such us the internal type of our single render target, so we use this + * auxiliary struct to pass that information around. + */ +struct v3dv_meta_framebuffer { + /* The internal type of the single render target */ + uint32_t internal_type; + + /* Supertile coverage */ + uint32_t min_x_supertile; + uint32_t min_y_supertile; + uint32_t max_x_supertile; + uint32_t max_y_supertile; + + /* Format info */ + VkFormat vk_format; + const struct v3dv_format *format; + uint8_t internal_depth_type; +}; + +#endif diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c index d998d7d8a..85cd8e066 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c @@ -22,11 +22,12 @@ */ #include "v3dv_private.h" +#include "v3dv_meta_common.h" #include "compiler/nir/nir_builder.h" -#include "broadcom/cle/v3dx_pack.h" #include "vk_format_info.h" #include "util/u_pack_color.h" +#include "vulkan/util/vk_common_entrypoints.h" static uint32_t meta_blit_key_hash(const void *key) @@ -169,13 +170,25 @@ create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device, } assert(*p_layout == 0); + /* FIXME: this is abusing a bit the API, since not all of our copy + * pipelines have a geometry shader. 
We could create 2 different pipeline + * layouts, but this works for us for now. + */ +#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET 0 +#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET 16 +#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET 20 +#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET 24 + VkPushConstantRange ranges[2] = { + { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 }, + { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 }, + }; + VkPipelineLayoutCreateInfo p_layout_info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 1, .pSetLayouts = ds_layout, - .pushConstantRangeCount = 1, - .pPushConstantRanges = - &(VkPushConstantRange) { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 20 }, + .pushConstantRangeCount = 2, + .pPushConstantRanges = ranges, }; result = @@ -229,640 +242,127 @@ v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device) } } -static inline bool -can_use_tlb(struct v3dv_image *image, - const VkOffset3D *offset, - VkFormat *compat_format); - -/** - * Copy operations implemented in this file don't operate on a framebuffer - * object provided by the user, however, since most use the TLB for this, - * we still need to have some representation of the framebuffer. For the most - * part, the job's frame tiling information is enough for this, however we - * still need additional information such us the internal type of our single - * render target, so we use this auxiliary struct to pass that information - * around. - */ -struct framebuffer_data { - /* The internal type of the single render target */ - uint32_t internal_type; - - /* Supertile coverage */ - uint32_t min_x_supertile; - uint32_t min_y_supertile; - uint32_t max_x_supertile; - uint32_t max_y_supertile; - - /* Format info */ - VkFormat vk_format; - const struct v3dv_format *format; - uint8_t internal_depth_type; -}; - -static void -setup_framebuffer_data(struct framebuffer_data *fb, - VkFormat vk_format, - uint32_t internal_type, - const struct v3dv_frame_tiling *tiling) -{ - fb->internal_type = internal_type; - - /* Supertile coverage always starts at 0,0 */ - uint32_t supertile_w_in_pixels = - tiling->tile_width * tiling->supertile_width; - uint32_t supertile_h_in_pixels = - tiling->tile_height * tiling->supertile_height; - - fb->min_x_supertile = 0; - fb->min_y_supertile = 0; - fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels; - fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels; - - fb->vk_format = vk_format; - fb->format = v3dv_get_format(vk_format); - - fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F; - if (vk_format_is_depth_or_stencil(vk_format)) - fb->internal_depth_type = v3dv_get_internal_depth_type(vk_format); -} - -/* This chooses a tile buffer format that is appropriate for the copy operation. - * Typically, this is the image render target type, however, if we are copying - * depth/stencil to/from a buffer the hardware can't do raster loads/stores, so - * we need to load and store to/from a tile color buffer using a compatible - * color format. 
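The two push-constant ranges declared above give the fragment shader bytes 0-23 and the geometry shader bytes 24-27, matching the TEXEL_BUFFER_COPY_* offsets. A sketch of how a draw might fill that space; it assumes the copy box is four 32-bit values, since the exact box layout is not shown in this hunk:

#include <stdint.h>
#include <vulkan/vulkan.h>

static void
push_texel_buffer_copy_constants(VkCommandBuffer cmd, VkPipelineLayout layout,
                                 const uint32_t box[4], uint32_t stride,
                                 uint32_t offset, uint32_t layer)
{
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_FRAGMENT_BIT,
                      0, 16, box);     /* FS_BOX_PC_OFFSET */
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_FRAGMENT_BIT,
                      16, 4, &stride); /* FS_STRIDE_PC_OFFSET */
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_FRAGMENT_BIT,
                      20, 4, &offset); /* FS_OFFSET_PC_OFFSET */
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_GEOMETRY_BIT,
                      24, 4, &layer);  /* GS_LAYER_PC_OFFSET */
}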
- */ -static uint32_t -choose_tlb_format(struct framebuffer_data *framebuffer, - VkImageAspectFlags aspect, - bool for_store, - bool is_copy_to_buffer, - bool is_copy_from_buffer) -{ - if (is_copy_to_buffer || is_copy_from_buffer) { - switch (framebuffer->vk_format) { - case VK_FORMAT_D16_UNORM: - return V3D_OUTPUT_IMAGE_FORMAT_R16UI; - case VK_FORMAT_D32_SFLOAT: - return V3D_OUTPUT_IMAGE_FORMAT_R32F; - case VK_FORMAT_X8_D24_UNORM_PACK32: - return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; - case VK_FORMAT_D24_UNORM_S8_UINT: - /* When storing the stencil aspect of a combined depth/stencil image - * to a buffer, the Vulkan spec states that the output buffer must - * have packed stencil values, so we choose an R8UI format for our - * store outputs. For the load input we still want RGBA8UI since the - * source image contains 4 channels (including the 3 channels - * containing the 24-bit depth value). - * - * When loading the stencil aspect of a combined depth/stencil image - * from a buffer, we read packed 8-bit stencil values from the buffer - * that we need to put into the LSB of the 32-bit format (the R - * channel), so we use R8UI. For the store, if we used R8UI then we - * would write 8-bit stencil values consecutively over depth channels, - * so we need to use RGBA8UI. This will write each stencil value in - * its correct position, but will overwrite depth values (channels G - * B,A) with undefined values. To fix this, we will have to restore - * the depth aspect from the Z tile buffer, which we should pre-load - * from the image before the store). - */ - if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) { - return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; - } else { - assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT); - if (is_copy_to_buffer) { - return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI : - V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; - } else { - assert(is_copy_from_buffer); - return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI : - V3D_OUTPUT_IMAGE_FORMAT_R8UI; - } - } - default: /* Color formats */ - return framebuffer->format->rt_type; - break; - } - } else { - return framebuffer->format->rt_type; - } -} - -static inline bool -format_needs_rb_swap(VkFormat format) -{ - const uint8_t *swizzle = v3dv_get_format_swizzle(format); - return swizzle[0] == PIPE_SWIZZLE_Z; -} - -static void -get_internal_type_bpp_for_image_aspects(VkFormat vk_format, - VkImageAspectFlags aspect_mask, - uint32_t *internal_type, - uint32_t *internal_bpp) -{ - const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | - VK_IMAGE_ASPECT_STENCIL_BIT; - - /* We can't store depth/stencil pixel formats to a raster format, so - * so instead we load our depth/stencil aspects to a compatible color - * format. - */ - /* FIXME: pre-compute this at image creation time? */ - if (aspect_mask & ds_aspects) { - switch (vk_format) { - case VK_FORMAT_D16_UNORM: - *internal_type = V3D_INTERNAL_TYPE_16UI; - *internal_bpp = V3D_INTERNAL_BPP_64; - break; - case VK_FORMAT_D32_SFLOAT: - *internal_type = V3D_INTERNAL_TYPE_32F; - *internal_bpp = V3D_INTERNAL_BPP_128; - break; - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: - /* Use RGBA8 format so we can relocate the X/S bits in the appropriate - * place to match Vulkan expectations. See the comment on the tile - * load command for more details. 
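The removed choose_tlb_format below encodes the rule described in its comment: the hardware cannot do raster loads/stores of depth/stencil, so buffer copies of those aspects go through a color tile buffer of matching width. A VkFormat-level sketch of the same mapping, condensed from the removed helpers (the V3D output-image formats in the real code correspond one-to-one to these):

#include <vulkan/vulkan.h>

static VkFormat
ds_copy_color_format(VkFormat format)
{
   switch (format) {
   case VK_FORMAT_D16_UNORM:
      return VK_FORMAT_R16_UINT;
   case VK_FORMAT_D32_SFLOAT:
      return VK_FORMAT_R32_SFLOAT;
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return VK_FORMAT_R8G8B8A8_UINT; /* 4 channels: X/S byte + 24-bit Z */
   default:
      return format; /* color formats use their own render-target type */
   }
}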
- */ - *internal_type = V3D_INTERNAL_TYPE_8UI; - *internal_bpp = V3D_INTERNAL_BPP_32; - break; - default: - assert(!"unsupported format"); - break; - } - } else { - const struct v3dv_format *format = v3dv_get_format(vk_format); - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, - internal_type, - internal_bpp); - } -} - -struct rcl_clear_info { - const union v3dv_clear_value *clear_value; - struct v3dv_image *image; - VkImageAspectFlags aspects; - uint32_t layer; - uint32_t level; -}; - -static struct v3dv_cl * -emit_rcl_prologue(struct v3dv_job *job, - struct framebuffer_data *fb, - const struct rcl_clear_info *clear_info) -{ - const struct v3dv_frame_tiling *tiling = &job->frame_tiling; - - struct v3dv_cl *rcl = &job->rcl; - v3dv_cl_ensure_space_with_branch(rcl, 200 + - tiling->layers * 256 * - cl_packet_length(SUPERTILE_COORDINATES)); - if (job->cmd_buffer->state.oom) - return NULL; - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { - config.early_z_disable = true; - config.image_width_pixels = tiling->width; - config.image_height_pixels = tiling->height; - config.number_of_render_targets = 1; - config.multisample_mode_4x = tiling->msaa; - config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; - config.internal_depth_type = fb->internal_depth_type; - } - - if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; - if (clear_info->image) { - const struct v3dv_image *image = clear_info->image; - const struct v3d_resource_slice *slice = - &image->slices[clear_info->level]; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - int uif_block_height = v3d_utile_height(image->cpp) * 2; - - uint32_t implicit_padded_height = - align(tiling->height, uif_block_height) / uif_block_height; - - if (slice->padded_height_of_output_image_in_uif_blocks - - implicit_padded_height >= 15) { - clear_pad = slice->padded_height_of_output_image_in_uif_blocks; - } - } - } - - const uint32_t *color = &clear_info->clear_value->color[0]; - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { - clear.clear_color_low_32_bits = color[0]; - clear.clear_color_next_24_bits = color[1] & 0x00ffffff; - clear.render_target_number = 0; - }; - - if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { - clear.clear_color_mid_low_32_bits = - ((color[1] >> 24) | (color[2] << 8)); - clear.clear_color_mid_high_24_bits = - ((color[2] >> 24) | ((color[3] & 0xffff) << 8)); - clear.render_target_number = 0; - }; - } - - if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { - cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { - clear.uif_padded_height_in_uif_blocks = clear_pad; - clear.clear_color_high_16_bits = color[3] >> 16; - clear.render_target_number = 0; - }; - } - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - rt.render_target_0_internal_bpp = tiling->internal_bpp; - rt.render_target_0_internal_type = fb->internal_type; - rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { - clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; - clear.stencil_clear_value = clear_info ? 
clear_info->clear_value->s : 0; - }; - - cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { - init.use_auto_chained_tile_lists = true; - init.size_of_first_block_in_chained_tile_lists = - TILE_ALLOCATION_BLOCK_SIZE_64B; - } - - return rcl; -} - -static void -emit_frame_setup(struct v3dv_job *job, - uint32_t layer, - const union v3dv_clear_value *clear_value) +static VkFormat +get_compatible_tlb_format(VkFormat format) { - v3dv_return_if_oom(NULL, job); - - const struct v3dv_frame_tiling *tiling = &job->frame_tiling; - - struct v3dv_cl *rcl = &job->rcl; - - const uint32_t tile_alloc_offset = - 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y; - cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { - list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset); - } - - cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { - config.number_of_bin_tile_lists = 1; - config.total_frame_width_in_tiles = tiling->draw_tiles_x; - config.total_frame_height_in_tiles = tiling->draw_tiles_y; - - config.supertile_width_in_tiles = tiling->supertile_width; - config.supertile_height_in_tiles = tiling->supertile_height; - - config.total_frame_width_in_supertiles = - tiling->frame_width_in_supertiles; - config.total_frame_height_in_supertiles = - tiling->frame_height_in_supertiles; - } - - /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do - * it here. - */ - for (int i = 0; i < 2; i++) { - cl_emit(rcl, TILE_COORDINATES, coords); - cl_emit(rcl, END_OF_LOADS, end); - cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - if (clear_value && i == 0) { - cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { - clear.clear_z_stencil_buffer = true; - clear.clear_all_render_targets = true; - } - } - cl_emit(rcl, END_OF_TILE_MARKER, end); - } - - cl_emit(rcl, FLUSH_VCD_CACHE, flush); -} + switch (format) { + case VK_FORMAT_R8G8B8A8_SNORM: + return VK_FORMAT_R8G8B8A8_UINT; -static void -emit_supertile_coordinates(struct v3dv_job *job, - struct framebuffer_data *framebuffer) -{ - v3dv_return_if_oom(NULL, job); + case VK_FORMAT_R8G8_SNORM: + return VK_FORMAT_R8G8_UINT; - struct v3dv_cl *rcl = &job->rcl; + case VK_FORMAT_R8_SNORM: + return VK_FORMAT_R8_UINT; - const uint32_t min_y = framebuffer->min_y_supertile; - const uint32_t max_y = framebuffer->max_y_supertile; - const uint32_t min_x = framebuffer->min_x_supertile; - const uint32_t max_x = framebuffer->max_x_supertile; + case VK_FORMAT_A8B8G8R8_SNORM_PACK32: + return VK_FORMAT_A8B8G8R8_UINT_PACK32; - for (int y = min_y; y <= max_y; y++) { - for (int x = min_x; x <= max_x; x++) { - cl_emit(rcl, SUPERTILE_COORDINATES, coords) { - coords.column_number_in_supertiles = x; - coords.row_number_in_supertiles = y; - } - } - } -} + case VK_FORMAT_R16_UNORM: + case VK_FORMAT_R16_SNORM: + return VK_FORMAT_R16_UINT; -static void -emit_linear_load(struct v3dv_cl *cl, - uint32_t buffer, - struct v3dv_bo *bo, - uint32_t offset, - uint32_t stride, - uint32_t format) -{ - cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { - load.buffer_to_load = buffer; - load.address = v3dv_cl_address(bo, offset); - load.input_image_format = format; - load.memory_format = VC5_TILING_RASTER; - load.height_in_ub_or_stride = stride; - load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; - } -} + case VK_FORMAT_R16G16_UNORM: + case VK_FORMAT_R16G16_SNORM: + return VK_FORMAT_R16G16_UINT; -static void -emit_linear_store(struct v3dv_cl *cl, - uint32_t buffer, - struct v3dv_bo *bo, - uint32_t offset, - uint32_t stride, - bool msaa, - uint32_t format) 
-{ - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = RENDER_TARGET_0; - store.address = v3dv_cl_address(bo, offset); - store.clear_buffer_being_stored = false; - store.output_image_format = format; - store.memory_format = VC5_TILING_RASTER; - store.height_in_ub_or_stride = stride; - store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES : - V3D_DECIMATE_MODE_SAMPLE_0; - } -} + case VK_FORMAT_R16G16B16A16_UNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + return VK_FORMAT_R16G16B16A16_UINT; -static void -emit_image_load(struct v3dv_cl *cl, - struct framebuffer_data *framebuffer, - struct v3dv_image *image, - VkImageAspectFlags aspect, - uint32_t layer, - uint32_t mip_level, - bool is_copy_to_buffer, - bool is_copy_from_buffer) -{ - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + return VK_FORMAT_R32_SFLOAT; - /* For image to/from buffer copies we always load to and store from RT0, - * even for depth/stencil aspects, because the hardware can't do raster - * stores or loads from/to the depth/stencil tile buffers. + /* We can't render to compressed formats using the TLB so instead we use + * a compatible format with the same bpp as the compressed format. Because + * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the + * case of ETC), when we implement copies with the compatible format we + * will have to divide offsets and dimensions on the compressed image by + * the compressed block size. */ - bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer || - aspect == VK_IMAGE_ASPECT_COLOR_BIT; - - const struct v3d_resource_slice *slice = &image->slices[mip_level]; - cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { - load.buffer_to_load = load_to_color_tlb ? - RENDER_TARGET_0 : v3dv_zs_buffer_from_aspect_bits(aspect); - - load.address = v3dv_cl_address(image->mem->bo, layer_offset); - - load.input_image_format = choose_tlb_format(framebuffer, aspect, false, - is_copy_to_buffer, - is_copy_from_buffer); - load.memory_format = slice->tiling; - - /* When copying depth/stencil images to a buffer, for D24 formats Vulkan - * expects the depth value in the LSB bits of each 32-bit pixel. - * Unfortunately, the hardware seems to put the S8/X8 bits there and the - * depth bits on the MSB. To work around that we can reverse the channel - * order and then swap the R/B channels to get what we want. - * - * NOTE: reversing and swapping only gets us the behavior we want if the - * operations happen in that exact order, which seems to be the case when - * done on the tile buffer load operations. On the store, it seems the - * order is not the same. The order on the store is probably reversed so - * that reversing and swapping on both the load and the store preserves - * the original order of the channels in memory. - * - * Notice that we only need to do this when copying to a buffer, where - * depth and stencil aspects are copied as separate regions and - * the spec expects them to be tightly packed. - */ - bool needs_rb_swap = false; - bool needs_chan_reverse = false; - if (is_copy_to_buffer && - (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 || - (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && - (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) { - needs_rb_swap = true; - needs_chan_reverse = true; - } else if (!is_copy_from_buffer && !is_copy_to_buffer && - (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { - /* This is not a raw data copy (i.e. 
we are clearing the image), - * so we need to make sure we respect the format swizzle. - */ - needs_rb_swap = format_needs_rb_swap(framebuffer->vk_format); - } - - load.r_b_swap = needs_rb_swap; - load.channel_reverse = needs_chan_reverse; + case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: + case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: + case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: + case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: + case VK_FORMAT_BC2_UNORM_BLOCK: + case VK_FORMAT_BC2_SRGB_BLOCK: + case VK_FORMAT_BC3_SRGB_BLOCK: + case VK_FORMAT_BC3_UNORM_BLOCK: + case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: + case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: + case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: + case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: + case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: + case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: + case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: + case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: + case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: + case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: + case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: + case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: + case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: + case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: + case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: + case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: + case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: + case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: + case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: + case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: + case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: + case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: + case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: + case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: + case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: + case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: + case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: + case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: + return VK_FORMAT_R32G32B32A32_UINT; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - load.height_in_ub_or_stride = - slice->padded_height_of_output_image_in_uif_blocks; - } else if (slice->tiling == VC5_TILING_RASTER) { - load.height_in_ub_or_stride = slice->stride; - } + case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: + case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: + case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: + case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: + case VK_FORMAT_EAC_R11_UNORM_BLOCK: + case VK_FORMAT_EAC_R11_SNORM_BLOCK: + case VK_FORMAT_BC1_RGB_UNORM_BLOCK: + case VK_FORMAT_BC1_RGB_SRGB_BLOCK: + case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: + case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: + return VK_FORMAT_R16G16B16A16_UINT; - if (image->samples > VK_SAMPLE_COUNT_1_BIT) - load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; - else - load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + default: + return VK_FORMAT_UNDEFINED; } } -static void -emit_image_store(struct v3dv_cl *cl, - struct framebuffer_data *framebuffer, - struct v3dv_image *image, - VkImageAspectFlags aspect, - uint32_t layer, - uint32_t mip_level, - bool is_copy_to_buffer, - bool is_copy_from_buffer, - bool is_multisample_resolve) +/** + * Checks if we can implement an image copy or clear operation using the TLB + * hardware. + */ +bool +v3dv_meta_can_use_tlb(struct v3dv_image *image, + const VkOffset3D *offset, + VkFormat *compat_format) { - uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); - - bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer || - aspect == VK_IMAGE_ASPECT_COLOR_BIT; - - const struct v3d_resource_slice *slice = &image->slices[mip_level]; - cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = store_from_color_tlb ? 
- RENDER_TARGET_0 : v3dv_zs_buffer_from_aspect_bits(aspect); - - store.address = v3dv_cl_address(image->mem->bo, layer_offset); - store.clear_buffer_being_stored = false; - - /* See rationale in emit_image_load() */ - bool needs_rb_swap = false; - bool needs_chan_reverse = false; - if (is_copy_from_buffer && - (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 || - (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && - (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) { - needs_rb_swap = true; - needs_chan_reverse = true; - } else if (!is_copy_from_buffer && !is_copy_to_buffer && - (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { - needs_rb_swap = format_needs_rb_swap(framebuffer->vk_format); - } - - store.r_b_swap = needs_rb_swap; - store.channel_reverse = needs_chan_reverse; - - store.output_image_format = choose_tlb_format(framebuffer, aspect, true, - is_copy_to_buffer, - is_copy_from_buffer); - store.memory_format = slice->tiling; - if (slice->tiling == VC5_TILING_UIF_NO_XOR || - slice->tiling == VC5_TILING_UIF_XOR) { - store.height_in_ub_or_stride = - slice->padded_height_of_output_image_in_uif_blocks; - } else if (slice->tiling == VC5_TILING_RASTER) { - store.height_in_ub_or_stride = slice->stride; - } + if (offset->x != 0 || offset->y != 0) + return false; - if (image->samples > VK_SAMPLE_COUNT_1_BIT) - store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; - else if (is_multisample_resolve) - store.decimate_mode = V3D_DECIMATE_MODE_4X; - else - store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { + if (compat_format) + *compat_format = image->vk.format; + return true; } -} - -static void -emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, - struct framebuffer_data *framebuffer, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - uint32_t layer_offset, - const VkBufferImageCopy *region) -{ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(NULL, job); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - /* Load image to TLB */ - assert((image->type != VK_IMAGE_TYPE_3D && - layer_offset < region->imageSubresource.layerCount) || - layer_offset < image->extent.depth); - - const uint32_t image_layer = image->type != VK_IMAGE_TYPE_3D ? - region->imageSubresource.baseArrayLayer + layer_offset : - region->imageOffset.z + layer_offset; - - emit_image_load(cl, framebuffer, image, - region->imageSubresource.aspectMask, - image_layer, - region->imageSubresource.mipLevel, - true, false); - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - /* Store TLB to buffer */ - uint32_t width, height; - if (region->bufferRowLength == 0) - width = region->imageExtent.width; - else - width = region->bufferRowLength; - - if (region->bufferImageHeight == 0) - height = region->imageExtent.height; - else - height = region->bufferImageHeight; - /* Handle copy from compressed format */ - width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format)); - height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format)); - - /* If we are storing stencil from a combined depth/stencil format the - * Vulkan spec states that the output buffer must have packed stencil - * values, where each stencil value is 1 byte. + /* If the image format is not TLB-supported, then check if we can use + * a compatible format instead. 
*/ - uint32_t cpp = - region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? - 1 : image->cpp; - uint32_t buffer_stride = width * cpp; - uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + - height * buffer_stride * layer_offset; - - uint32_t format = choose_tlb_format(framebuffer, - region->imageSubresource.aspectMask, - true, true, false); - bool msaa = image->samples > VK_SAMPLE_COUNT_1_BIT; - - emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo, - buffer_offset, buffer_stride, msaa, format); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); + if (compat_format) { + *compat_format = get_compatible_tlb_format(image->vk.format); + if (*compat_format != VK_FORMAT_UNDEFINED) + return true; } -} -static void -emit_copy_layer_to_buffer(struct v3dv_job *job, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - struct framebuffer_data *framebuffer, - uint32_t layer, - const VkBufferImageCopy *region) -{ - emit_frame_setup(job, layer, NULL); - emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer, - image, layer, region); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_copy_image_to_buffer_rcl(struct v3dv_job *job, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - struct framebuffer_data *framebuffer, - const VkBufferImageCopy *region) -{ - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); - v3dv_return_if_oom(NULL, job); - - for (int layer = 0; layer < job->frame_tiling.layers; layer++) - emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region); - cl_emit(rcl, END_OF_RENDERING, end); + return false; } /* Implements a copy using the TLB. 
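The compatible-format scheme above is easiest to see with concrete numbers. The following is a minimal sketch, not code from the patch: it assumes an ETC2 RGBA source image and an image/region pair like the ones the copy entry points in this file receive. A 4x4 ETC2 block is 128 bits, so v3dv_meta_can_use_tlb() reports VK_FORMAT_R32G32B32A32_UINT and each block maps to a single 128-bit texel of the render target:

   VkFormat compat_format;
   if (v3dv_meta_can_use_tlb(image, &region->imageOffset, &compat_format)) {
      /* imageExtent is in texels of the compressed format; the TLB job
       * renders one compatible-format texel per compressed block, so the
       * frame dimensions are the extent divided (rounding up) by the
       * block size.
       */
      const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);  /* 4 */
      const uint32_t block_h = vk_format_get_blockheight(image->vk.format); /* 4 */
      const uint32_t rt_width = DIV_ROUND_UP(region->imageExtent.width, block_w);
      const uint32_t rt_height = DIV_ROUND_UP(region->imageExtent.height, block_h);
      /* ... start a rt_width x rt_height frame and copy using compat_format ... */
   }

This is the same divide-by-block-size step that copy_image_to_buffer_tlb() and copy_image_tlb() in the hunks below perform with DIV_ROUND_UP.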
@@ -879,19 +379,19 @@ static bool
 copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
 struct v3dv_buffer *buffer,
 struct v3dv_image *image,
- const VkBufferImageCopy *region)
+ const VkBufferImageCopy2KHR *region)
 {
 VkFormat fb_format;
- if (!can_use_tlb(image, &region->imageOffset, &fb_format))
+ if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
 return false;
 uint32_t internal_type, internal_bpp;
- get_internal_type_bpp_for_image_aspects(fb_format,
- region->imageSubresource.aspectMask,
- &internal_type, &internal_bpp);
+ v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
+ (fb_format, region->imageSubresource.aspectMask,
+ &internal_type, &internal_bpp);
 uint32_t num_layers;
- if (image->type != VK_IMAGE_TYPE_3D)
+ if (image->vk.image_type != VK_IMAGE_TYPE_3D)
 num_layers = region->imageSubresource.layerCount;
 else
 num_layers = region->imageExtent.depth;
@@ -903,19 +403,21 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
 return true;
 /* Handle copy from compressed format using a compatible format */
- const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
- const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
+ const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
+ const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
 const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
 const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false,
+ 1, internal_bpp, false);
- struct framebuffer_data framebuffer;
- setup_framebuffer_data(&framebuffer, fb_format, internal_type,
- &job->frame_tiling);
+ struct v3dv_meta_framebuffer framebuffer;
+ v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
+ internal_type, &job->frame_tiling);
- v3dv_job_emit_binning_flush(job);
- emit_copy_image_to_buffer_rcl(job, buffer, image, &framebuffer, region);
+ v3dv_X(job->device, job_emit_binning_flush)(job);
+ v3dv_X(job->device, meta_emit_copy_image_to_buffer_rcl)
+ (job, buffer, image, &framebuffer, region);
 v3dv_cmd_buffer_finish_job(cmd_buffer);
@@ -930,7 +432,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
 VkFormat src_format,
 VkColorComponentFlags cmask,
 VkComponentMapping *cswizzle,
- const VkImageBlit *region,
+ const VkImageBlit2KHR *region,
 VkFilter filter,
 bool dst_is_padded_image);
@@ -942,7 +444,7 @@ static bool
 copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
 struct v3dv_buffer *buffer,
 struct v3dv_image *image,
- const VkBufferImageCopy *region)
+ const VkBufferImageCopy2KHR *region)
 {
 bool handled = false;
@@ -991,10 +493,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
 dst_format = VK_FORMAT_R8G8B8A8_UINT;
 break;
 case VK_IMAGE_ASPECT_DEPTH_BIT:
- assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
- image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
- image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
- if (image->vk_format == VK_FORMAT_D32_SFLOAT) {
+ assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
+ image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
+ if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
 src_format = VK_FORMAT_R32_UINT;
 dst_format = VK_FORMAT_R32_UINT;
 } else {
@@ -1016,7 +518,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
 break;
 case VK_IMAGE_ASPECT_STENCIL_BIT: 
assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); - assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT); + assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); /* Copying from S8D24. We want to write 8-bit stencil values only, * so adjust the buffer bpp for that. Since the hardware stores stencil * in the LSB, we can just do a RGBA8UI to R8UI blit. @@ -1070,14 +572,14 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, buf_height = region->bufferImageHeight; /* If the image is compressed, the bpp refers to blocks, not pixels */ - uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - uint32_t block_height = vk_format_get_blockheight(image->vk_format); + uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + uint32_t block_height = vk_format_get_blockheight(image->vk.format); buf_width = buf_width / block_width; buf_height = buf_height / block_height; /* Compute layers to copy */ uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -1094,17 +596,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk_format)) { + if (vk_format_is_compressed(image->vk.format)) { VkImage uiview; VkImageCreateInfo uiview_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .imageType = VK_IMAGE_TYPE_3D, .format = dst_format, - .extent = { buf_width, buf_height, image->extent.depth }, - .mipLevels = image->levels, - .arrayLayers = image->array_size, - .samples = image->samples, - .tiling = image->tiling, + .extent = { buf_width, buf_height, image->vk.extent.depth }, + .mipLevels = image->vk.mip_levels, + .arrayLayers = image->vk.array_layers, + .samples = image->vk.samples, + .tiling = image->vk.tiling, .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, @@ -1118,9 +620,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, (uintptr_t)uiview, (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); - result = v3dv_BindImageMemory(_device, uiview, - v3dv_device_memory_to_handle(image->mem), - image->mem_offset); + result = + vk_common_BindImageMemory(_device, uiview, + v3dv_device_memory_to_handle(image->mem), + image->mem_offset); if (result != VK_SUCCESS) return handled; @@ -1158,9 +661,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, /* Bind the buffer memory to the image */ VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + i * buf_width * buf_height * buffer_bpp; - result = v3dv_BindImageMemory(_device, buffer_image, - v3dv_device_memory_to_handle(buffer->mem), - buffer_offset); + result = + vk_common_BindImageMemory(_device, buffer_image, + v3dv_device_memory_to_handle(buffer->mem), + buffer_offset); if (result != VK_SUCCESS) return handled; @@ -1172,7 +676,8 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). 
*/ - const VkImageBlit blit_region = { + const VkImageBlit2KHR blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, .srcSubresource = { .aspectMask = copy_aspect, .mipLevel = region->imageSubresource.mipLevel, @@ -1225,309 +730,26 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, return true; } -static VkFormat -get_compatible_tlb_format(VkFormat format) -{ - switch (format) { - case VK_FORMAT_R8G8B8A8_SNORM: - return VK_FORMAT_R8G8B8A8_UINT; - - case VK_FORMAT_R8G8_SNORM: - return VK_FORMAT_R8G8_UINT; - - case VK_FORMAT_R8_SNORM: - return VK_FORMAT_R8_UINT; - - case VK_FORMAT_A8B8G8R8_SNORM_PACK32: - return VK_FORMAT_A8B8G8R8_UINT_PACK32; - - case VK_FORMAT_R16_UNORM: - case VK_FORMAT_R16_SNORM: - return VK_FORMAT_R16_UINT; - - case VK_FORMAT_R16G16_UNORM: - case VK_FORMAT_R16G16_SNORM: - return VK_FORMAT_R16G16_UINT; - - case VK_FORMAT_R16G16B16A16_UNORM: - case VK_FORMAT_R16G16B16A16_SNORM: - return VK_FORMAT_R16G16B16A16_UINT; - - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: - return VK_FORMAT_R32_SFLOAT; +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2KHR *info) - /* We can't render to compressed formats using the TLB so instead we use - * a compatible format with the same bpp as the compressed format. Because - * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the - * case of ETC), when we implement copies with the compatible format we - * will have to divide offsets and dimensions on the compressed image by - * the compressed block size. - */ - case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: - case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: - case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: - case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: - case VK_FORMAT_BC2_UNORM_BLOCK: - case VK_FORMAT_BC2_SRGB_BLOCK: - case VK_FORMAT_BC3_SRGB_BLOCK: - case VK_FORMAT_BC3_UNORM_BLOCK: - return VK_FORMAT_R32G32B32A32_UINT; - - case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: - case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: - case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: - case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: - case VK_FORMAT_EAC_R11_UNORM_BLOCK: - case VK_FORMAT_EAC_R11_SNORM_BLOCK: - case VK_FORMAT_BC1_RGB_UNORM_BLOCK: - case VK_FORMAT_BC1_RGB_SRGB_BLOCK: - case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: - case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: - return VK_FORMAT_R16G16B16A16_UINT; - - default: - return VK_FORMAT_UNDEFINED; - } -} - -static inline bool -can_use_tlb(struct v3dv_image *image, - const VkOffset3D *offset, - VkFormat *compat_format) -{ - if (offset->x != 0 || offset->y != 0) - return false; - - if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { - if (compat_format) - *compat_format = image->vk_format; - return true; - } - - /* If the image format is not TLB-supported, then check if we can use - * a compatible format instead. 
- */ - if (compat_format) { - *compat_format = get_compatible_tlb_format(image->vk_format); - if (*compat_format != VK_FORMAT_UNDEFINED) - return true; - } - - return false; -} - -void -v3dv_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkBuffer destBuffer, - uint32_t regionCount, - const VkBufferImageCopy *pRegions) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, image, srcImage); - V3DV_FROM_HANDLE(v3dv_buffer, buffer, destBuffer); + V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage); + V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); - for (uint32_t i = 0; i < regionCount; i++) { - if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &pRegions[i])) + for (uint32_t i = 0; i < info->regionCount; i++) { + if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i])) continue; - if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &pRegions[i])) + if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i])) continue; unreachable("Unsupported image to buffer copy."); } } -static void -emit_copy_image_layer_per_tile_list(struct v3dv_job *job, - struct framebuffer_data *framebuffer, - struct v3dv_image *dst, - struct v3dv_image *src, - uint32_t layer_offset, - const VkImageCopy *region) -{ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(NULL, job); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - assert((src->type != VK_IMAGE_TYPE_3D && - layer_offset < region->srcSubresource.layerCount) || - layer_offset < src->extent.depth); - - const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ? - region->srcSubresource.baseArrayLayer + layer_offset : - region->srcOffset.z + layer_offset; - - emit_image_load(cl, framebuffer, src, - region->srcSubresource.aspectMask, - src_layer, - region->srcSubresource.mipLevel, - false, false); - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - assert((dst->type != VK_IMAGE_TYPE_3D && - layer_offset < region->dstSubresource.layerCount) || - layer_offset < dst->extent.depth); - - const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ? 
- region->dstSubresource.baseArrayLayer + layer_offset : - region->dstOffset.z + layer_offset; - - emit_image_store(cl, framebuffer, dst, - region->dstSubresource.aspectMask, - dst_layer, - region->dstSubresource.mipLevel, - false, false, false); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -emit_copy_image_layer(struct v3dv_job *job, - struct v3dv_image *dst, - struct v3dv_image *src, - struct framebuffer_data *framebuffer, - uint32_t layer, - const VkImageCopy *region) -{ - emit_frame_setup(job, layer, NULL); - emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_copy_image_rcl(struct v3dv_job *job, - struct v3dv_image *dst, - struct v3dv_image *src, - struct framebuffer_data *framebuffer, - const VkImageCopy *region) -{ - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); - v3dv_return_if_oom(NULL, job); - - for (int layer = 0; layer < job->frame_tiling.layers; layer++) - emit_copy_image_layer(job, dst, src, framebuffer, layer, region); - cl_emit(rcl, END_OF_RENDERING, end); -} - -/* Disable level 0 write, just write following mipmaps */ -#define V3D_TFU_IOA_DIMTW (1 << 0) -#define V3D_TFU_IOA_FORMAT_SHIFT 3 -#define V3D_TFU_IOA_FORMAT_LINEARTILE 3 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 -#define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 -#define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6 -#define V3D_TFU_IOA_FORMAT_UIF_XOR 7 - -#define V3D_TFU_ICFG_NUMMM_SHIFT 5 -#define V3D_TFU_ICFG_TTYPE_SHIFT 9 - -#define V3D_TFU_ICFG_OPAD_SHIFT 22 - -#define V3D_TFU_ICFG_FORMAT_SHIFT 18 -#define V3D_TFU_ICFG_FORMAT_RASTER 0 -#define V3D_TFU_ICFG_FORMAT_SAND_128 1 -#define V3D_TFU_ICFG_FORMAT_SAND_256 2 -#define V3D_TFU_ICFG_FORMAT_LINEARTILE 11 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 -#define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 -#define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14 -#define V3D_TFU_ICFG_FORMAT_UIF_XOR 15 - -static void -emit_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - uint32_t src_layer, - uint32_t width, - uint32_t height, - const struct v3dv_format *format) -{ - const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; - const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; - - assert(dst->mem && dst->mem->bo); - const struct v3dv_bo *dst_bo = dst->mem->bo; - - assert(src->mem && src->mem->bo); - const struct v3dv_bo *src_bo = src->mem->bo; - - struct drm_v3d_submit_tfu tfu = { - .ios = (height << 16) | width, - .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? 
src_bo->handle : 0 - }, - }; - - const uint32_t src_offset = - src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); - tfu.iia |= src_offset; - - uint32_t icfg; - if (src_slice->tiling == VC5_TILING_RASTER) { - icfg = V3D_TFU_ICFG_FORMAT_RASTER; - } else { - icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE + - (src_slice->tiling - VC5_TILING_LINEARTILE); - } - tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT; - - const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); - tfu.ioa |= dst_offset; - - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (dst_slice->tiling - VC5_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; - - switch (src_slice->tiling) { - case VC5_TILING_UIF_NO_XOR: - case VC5_TILING_UIF_XOR: - tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp)); - break; - case VC5_TILING_RASTER: - tfu.iis |= src_slice->stride / src->cpp; - break; - default: - break; - } - - /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the - * OPAD field for the destination (how many extra UIF blocks beyond - * those necessary to cover the height). - */ - if (dst_slice->tiling == VC5_TILING_UIF_NO_XOR || - dst_slice->tiling == VC5_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp); - uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (dst_slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; - } - - v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); -} - /** * Returns true if the implementation supports the requested operation (even if * it failed to process it, for example, due to an out-of-memory error). @@ -1536,17 +758,17 @@ static bool copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy *region) + const VkImageCopy2KHR *region) { /* Destination can't be raster format */ - if (dst->tiling == VK_IMAGE_TILING_LINEAR) + if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; /* We can only do full copies, so if the format is D24S8 both aspects need * to be copied. We only need to check the dst format because the spec * states that depth/stencil formats must match exactly. */ - if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) { const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) @@ -1562,8 +784,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * checking against the region dimensions, which are in units of the source * image format. 
*/ - if (vk_format_is_compressed(dst->vk_format) != - vk_format_is_compressed(src->vk_format)) { + if (vk_format_is_compressed(dst->vk.format) != + vk_format_is_compressed(src->vk.format)) { return false; } @@ -1576,8 +798,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level); - uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level); + uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); + uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); if (region->extent.width != dst_width || region->extent.height != dst_height) return false; @@ -1587,15 +809,15 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * members represent the texel dimensions of the source image and not * the destination." */ - const uint32_t block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t block_h = vk_format_get_blockheight(src->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t block_h = vk_format_get_blockheight(src->vk.format); uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); /* Account for sample count */ - assert(dst->samples == src->samples); - if (dst->samples > VK_SAMPLE_COUNT_1_BIT) { - assert(dst->samples == VK_SAMPLE_COUNT_4_BIT); + assert(dst->vk.samples == src->vk.samples); + if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) { + assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT); width *= 2; height *= 2; } @@ -1614,24 +836,24 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, */ assert(dst->cpp == src->cpp); const struct v3dv_format *format = - v3dv_get_compatible_tfu_format(&cmd_buffer->device->devinfo, + v3dv_get_compatible_tfu_format(cmd_buffer->device, dst->cpp, NULL); /* Emit a TFU job for each layer to blit */ - const uint32_t layer_count = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.layerCount : region->extent.depth; const uint32_t src_mip_level = region->srcSubresource.mipLevel; - const uint32_t base_src_layer = src->type != VK_IMAGE_TYPE_3D ? + const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? region->srcSubresource.baseArrayLayer : region->srcOffset.z; - const uint32_t base_dst_layer = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? 
region->dstSubresource.baseArrayLayer : region->dstOffset.z;
 for (uint32_t i = 0; i < layer_count; i++) {
- emit_tfu_job(cmd_buffer,
- dst, dst_mip_level, base_dst_layer + i,
- src, src_mip_level, base_src_layer + i,
- width, height, format);
+ v3dv_X(cmd_buffer->device, meta_emit_tfu_job)
+ (cmd_buffer, dst, dst_mip_level, base_dst_layer + i,
+ src, src_mip_level, base_src_layer + i,
+ width, height, format);
 }
 return true;
@@ -1645,11 +867,11 @@ static bool
 copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
 struct v3dv_image *dst,
 struct v3dv_image *src,
- const VkImageCopy *region)
+ const VkImageCopy2KHR *region)
 {
 VkFormat fb_format;
- if (!can_use_tlb(src, &region->srcOffset, &fb_format) ||
- !can_use_tlb(dst, &region->dstOffset, &fb_format)) {
+ if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, &fb_format) ||
+ !v3dv_meta_can_use_tlb(dst, &region->dstOffset, &fb_format)) {
 return false;
 }
@@ -1662,9 +884,9 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
 assert(region->dstSubresource.aspectMask ==
 region->srcSubresource.aspectMask);
 uint32_t internal_type, internal_bpp;
- get_internal_type_bpp_for_image_aspects(fb_format,
- region->dstSubresource.aspectMask,
- &internal_type, &internal_bpp);
+ v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
+ (fb_format, region->dstSubresource.aspectMask,
+ &internal_type, &internal_bpp);
 /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
 *
 * "The number of slices of the extent (for 3D) or layers of the
 * srcSubresource (for non-3D) must match the number of slices of the
 * extent (for 3D) or layers of the dstSubresource (for non-3D)."
 */
- assert((src->type != VK_IMAGE_TYPE_3D ?
+ assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
 region->srcSubresource.layerCount : region->extent.depth) ==
- (dst->type != VK_IMAGE_TYPE_3D ?
+ (dst->vk.image_type != VK_IMAGE_TYPE_3D ? 
region->dstSubresource.layerCount : region->extent.depth)); uint32_t num_layers; - if (dst->type != VK_IMAGE_TYPE_3D) + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->dstSubresource.layerCount; else num_layers = region->extent.depth; @@ -1689,20 +911,20 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed image using compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, - src->samples > VK_SAMPLE_COUNT_1_BIT); + v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp, + src->vk.samples > VK_SAMPLE_COUNT_1_BIT); - struct framebuffer_data framebuffer; - setup_framebuffer_data(&framebuffer, fb_format, internal_type, - &job->frame_tiling); + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, + internal_type, &job->frame_tiling); - v3dv_job_emit_binning_flush(job); - emit_copy_image_rcl(job, dst, src, &framebuffer, region); + v3dv_X(job->device, job_emit_binning_flush)(job); + v3dv_X(job->device, meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region); v3dv_cmd_buffer_finish_job(cmd_buffer); @@ -1734,18 +956,18 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, VkImageCreateInfo info = { .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = src->type, + .imageType = src->vk.image_type, .format = format, .extent = { - .width = src->extent.width * width_scale, - .height = src->extent.height * height_scale, - .depth = src->extent.depth, + .width = src->vk.extent.width * width_scale, + .height = src->vk.extent.height * height_scale, + .depth = src->vk.extent.depth, }, - .mipLevels = src->levels, - .arrayLayers = src->array_size, - .samples = src->samples, - .tiling = src->tiling, - .usage = src->usage, + .mipLevels = src->vk.mip_levels, + .arrayLayers = src->vk.array_layers, + .samples = src->vk.samples, + .tiling = src->vk.tiling, + .usage = src->vk.usage, }; VkImage _image; @@ -1770,12 +992,12 @@ static bool copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageCopy *region) + const VkImageCopy2KHR *region) { - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format); - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); const float block_scale_w = (float)src_block_w / (float)dst_block_w; const float block_scale_h = (float)src_block_h / (float)dst_block_h; @@ -1789,7 +1011,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, float src_scale_h = 1.0f; float dst_scale_w = block_scale_w; float dst_scale_h = block_scale_h; - if 
(vk_format_is_compressed(src->vk_format)) { + if (vk_format_is_compressed(src->vk.format)) { /* If we are copying from a compressed format we should be aware that we * are going to texture from the source image, and the texture setup * knows the actual size of the image, so we need to choose a format @@ -1813,18 +1035,13 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(src->cpp == dst->cpp); - uint32_t divisor_w, divisor_h; format = VK_FORMAT_R32G32_UINT; switch (src->cpp) { case 16: format = VK_FORMAT_R32G32B32A32_UINT; - divisor_w = 4; - divisor_h = 4; break; case 8: format = VK_FORMAT_R16G16B16A16_UINT; - divisor_w = 4; - divisor_h = 4; break; default: unreachable("Unsupported compressed format"); @@ -1833,10 +1050,10 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, /* Create image views of the src/dst images that we can interpret in * terms of the canonical format. */ - src_scale_w /= divisor_w; - src_scale_h /= divisor_h; - dst_scale_w /= divisor_w; - dst_scale_h /= divisor_h; + src_scale_w /= src_block_w; + src_scale_h /= src_block_h; + dst_scale_w /= src_block_w; + dst_scale_h /= src_block_h; src = create_image_alias(cmd_buffer, src, src_scale_w, src_scale_h, format); @@ -1845,11 +1062,11 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_scale_w, dst_scale_h, format); } else { format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? - src->vk_format : get_compatible_tlb_format(src->vk_format); + src->vk.format : get_compatible_tlb_format(src->vk.format); if (format == VK_FORMAT_UNDEFINED) return false; - const struct v3dv_format *f = v3dv_get_format(format); + const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format); if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO) return false; } @@ -1895,7 +1112,8 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_start.z + region->extent.depth, }; - const VkImageBlit blit_region = { + const VkImageBlit2KHR blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, .srcSubresource = region->srcSubresource, .srcOffsets = { src_start, src_end }, .dstSubresource = region->dstSubresource, @@ -1912,466 +1130,42 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, return handled; } -void -v3dv_CmdCopyImage(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkImage dstImage, - VkImageLayout dstImageLayout, - uint32_t regionCount, - const VkImageCopy *pRegions) +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2KHR *info) + { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, src, srcImage); - V3DV_FROM_HANDLE(v3dv_image, dst, dstImage); + V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage); + V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage); - assert(src->samples == dst->samples); + assert(src->vk.samples == dst->vk.samples); - for (uint32_t i = 0; i < regionCount; i++) { - if (copy_image_tfu(cmd_buffer, dst, src, &pRegions[i])) + for (uint32_t i = 0; i < info->regionCount; i++) { + if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i])) continue; - if (copy_image_tlb(cmd_buffer, dst, src, &pRegions[i])) + if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) continue; - if (copy_image_blit(cmd_buffer, dst, src, &pRegions[i])) + if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i])) continue; unreachable("Image copy not supported"); } } -static void -emit_clear_image_per_tile_list(struct v3dv_job *job, - struct 
framebuffer_data *framebuffer, - struct v3dv_image *image, - VkImageAspectFlags aspects, - uint32_t layer, - uint32_t level) -{ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(NULL, job); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - emit_image_store(cl, framebuffer, image, aspects, layer, level, - false, false, false); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -emit_clear_image(struct v3dv_job *job, - struct v3dv_image *image, - struct framebuffer_data *framebuffer, - VkImageAspectFlags aspects, - uint32_t layer, - uint32_t level) -{ - emit_clear_image_per_tile_list(job, framebuffer, image, aspects, layer, level); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_clear_image_rcl(struct v3dv_job *job, - struct v3dv_image *image, - struct framebuffer_data *framebuffer, - const union v3dv_clear_value *clear_value, - VkImageAspectFlags aspects, - uint32_t layer, - uint32_t level) -{ - const struct rcl_clear_info clear_info = { - .clear_value = clear_value, - .image = image, - .aspects = aspects, - .layer = layer, - .level = level, - }; - - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info); - v3dv_return_if_oom(NULL, job); - - emit_frame_setup(job, 0, clear_value); - emit_clear_image(job, image, framebuffer, aspects, layer, level); - cl_emit(rcl, END_OF_RENDERING, end); -} - -static void -get_hw_clear_color(const VkClearColorValue *color, - VkFormat fb_format, - VkFormat image_format, - uint32_t internal_type, - uint32_t internal_bpp, - uint32_t *hw_color) -{ - const uint32_t internal_size = 4 << internal_bpp; - - /* If the image format doesn't match the framebuffer format, then we are - * trying to clear an unsupported tlb format using a compatible - * format for the framebuffer. In this case, we want to make sure that - * we pack the clear value according to the original format semantics, - * not the compatible format. - */ - if (fb_format == image_format) { - v3dv_get_hw_clear_color(color, internal_type, internal_size, hw_color); - } else { - union util_color uc; - enum pipe_format pipe_image_format = - vk_format_to_pipe_format(image_format); - util_pack_color(color->float32, pipe_image_format, &uc); - memcpy(hw_color, uc.ui, internal_size); - } -} - -/* Returns true if the implementation is able to handle the case, false - * otherwise. 
-*/
-static bool
-clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_image *image,
- const VkClearValue *clear_value,
- const VkImageSubresourceRange *range)
-{
- const VkOffset3D origin = { 0, 0, 0 };
- VkFormat fb_format;
- if (!can_use_tlb(image, &origin, &fb_format))
- return false;
-
- uint32_t internal_type, internal_bpp;
- get_internal_type_bpp_for_image_aspects(fb_format, range->aspectMask,
- &internal_type, &internal_bpp);
-
- union v3dv_clear_value hw_clear_value = { 0 };
- if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
- get_hw_clear_color(&clear_value->color, fb_format, image->vk_format,
- internal_type, internal_bpp, &hw_clear_value.color[0]);
- } else {
- assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) ||
- (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT));
- hw_clear_value.z = clear_value->depthStencil.depth;
- hw_clear_value.s = clear_value->depthStencil.stencil;
- }
-
- uint32_t level_count = range->levelCount == VK_REMAINING_MIP_LEVELS ?
- image->levels - range->baseMipLevel :
- range->levelCount;
- uint32_t min_level = range->baseMipLevel;
- uint32_t max_level = range->baseMipLevel + level_count;
-
- /* For 3D images baseArrayLayer and layerCount must be 0 and 1 respectively.
- * Instead, we need to consider the full depth dimension of the image, which
- * goes from 0 up to the level's depth extent.
- */
- uint32_t min_layer;
- uint32_t max_layer;
- if (image->type != VK_IMAGE_TYPE_3D) {
- uint32_t layer_count = range->layerCount == VK_REMAINING_ARRAY_LAYERS ?
- image->array_size - range->baseArrayLayer :
- range->layerCount;
- min_layer = range->baseArrayLayer;
- max_layer = range->baseArrayLayer + layer_count;
- } else {
- min_layer = 0;
- max_layer = 0;
- }
-
- for (uint32_t level = min_level; level < max_level; level++) {
- if (image->type == VK_IMAGE_TYPE_3D)
- max_layer = u_minify(image->extent.depth, level);
- for (uint32_t layer = min_layer; layer < max_layer; layer++) {
- uint32_t width = u_minify(image->extent.width, level);
- uint32_t height = u_minify(image->extent.height, level);
-
- struct v3dv_job *job =
- v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
-
- if (!job)
- return true;
-
- /* We start a new job for each layer so the frame "depth" is 1 */
- v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp,
- image->samples > VK_SAMPLE_COUNT_1_BIT);
-
- struct framebuffer_data framebuffer;
- setup_framebuffer_data(&framebuffer, fb_format, internal_type,
- &job->frame_tiling);
-
- v3dv_job_emit_binning_flush(job);
-
- /* If this triggers it is an application bug: the spec requires
- * that any aspects to clear are present in the image. 
- */ - assert(range->aspectMask & image->aspects); - - emit_clear_image_rcl(job, image, &framebuffer, &hw_clear_value, - range->aspectMask, layer, level); - - v3dv_cmd_buffer_finish_job(cmd_buffer); - } - } - - return true; -} - -void -v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer, - VkImage _image, - VkImageLayout imageLayout, - const VkClearColorValue *pColor, - uint32_t rangeCount, - const VkImageSubresourceRange *pRanges) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, image, _image); - - const VkClearValue clear_value = { - .color = *pColor, - }; - - for (uint32_t i = 0; i < rangeCount; i++) { - if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) - continue; - unreachable("Unsupported color clear."); - } -} - -void -v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, - VkImage _image, - VkImageLayout imageLayout, - const VkClearDepthStencilValue *pDepthStencil, - uint32_t rangeCount, - const VkImageSubresourceRange *pRanges) +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2KHR *pCopyBufferInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, image, _image); - - const VkClearValue clear_value = { - .depthStencil = *pDepthStencil, - }; - - for (uint32_t i = 0; i < rangeCount; i++) { - if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i])) - continue; - unreachable("Unsupported depth/stencil clear."); - } -} - -static void -emit_copy_buffer_per_tile_list(struct v3dv_job *job, - struct v3dv_bo *dst, - struct v3dv_bo *src, - uint32_t dst_offset, - uint32_t src_offset, - uint32_t stride, - uint32_t format) -{ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(NULL, job); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format); - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - emit_linear_store(cl, RENDER_TARGET_0, - dst, dst_offset, stride, false, format); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -emit_copy_buffer(struct v3dv_job *job, - struct v3dv_bo *dst, - struct v3dv_bo *src, - uint32_t dst_offset, - uint32_t src_offset, - struct framebuffer_data *framebuffer, - uint32_t format, - uint32_t item_size) -{ - const uint32_t stride = job->frame_tiling.width * item_size; - emit_copy_buffer_per_tile_list(job, dst, src, - dst_offset, src_offset, - stride, format); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_copy_buffer_rcl(struct v3dv_job *job, - struct v3dv_bo *dst, - struct v3dv_bo *src, - uint32_t dst_offset, - uint32_t src_offset, - struct framebuffer_data *framebuffer, - uint32_t format, - uint32_t item_size) -{ - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); - v3dv_return_if_oom(NULL, job); - - emit_frame_setup(job, 0, NULL); - - emit_copy_buffer(job, dst, src, dst_offset, src_offset, - framebuffer, format, item_size); - - cl_emit(rcl, END_OF_RENDERING, end); -} - -/* Figure out a TLB size configuration for a number of pixels to process. 
- * Beware that we can't "render" more than 4096x4096 pixels in a single job, - * if the pixel count is larger than this, the caller might need to split - * the job and call this function multiple times. - */ -static void -framebuffer_size_for_pixel_count(uint32_t num_pixels, - uint32_t *width, - uint32_t *height) -{ - assert(num_pixels > 0); - - const uint32_t max_dim_pixels = 4096; - const uint32_t max_pixels = max_dim_pixels * max_dim_pixels; - - uint32_t w, h; - if (num_pixels > max_pixels) { - w = max_dim_pixels; - h = max_dim_pixels; - } else { - w = num_pixels; - h = 1; - while (w > max_dim_pixels || ((w % 2) == 0 && w > 2 * h)) { - w >>= 1; - h <<= 1; - } - } - assert(w <= max_dim_pixels && h <= max_dim_pixels); - assert(w * h <= num_pixels); - assert(w > 0 && h > 0); - - *width = w; - *height = h; -} - -static struct v3dv_job * -copy_buffer(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_bo *dst, - uint32_t dst_offset, - struct v3dv_bo *src, - uint32_t src_offset, - const VkBufferCopy *region) -{ - const uint32_t internal_bpp = V3D_INTERNAL_BPP_32; - const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI; - - /* Select appropriate pixel format for the copy operation based on the - * size to copy and the alignment of the source and destination offsets. - */ - src_offset += region->srcOffset; - dst_offset += region->dstOffset; - uint32_t item_size = 4; - while (item_size > 1 && - (src_offset % item_size != 0 || dst_offset % item_size != 0)) { - item_size /= 2; - } - - while (item_size > 1 && region->size % item_size != 0) - item_size /= 2; - - assert(region->size % item_size == 0); - uint32_t num_items = region->size / item_size; - assert(num_items > 0); - - uint32_t format; - VkFormat vk_format; - switch (item_size) { - case 4: - format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; - vk_format = VK_FORMAT_R8G8B8A8_UINT; - break; - case 2: - format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI; - vk_format = VK_FORMAT_R8G8_UINT; - break; - default: - format = V3D_OUTPUT_IMAGE_FORMAT_R8UI; - vk_format = VK_FORMAT_R8_UINT; - break; - } - - struct v3dv_job *job = NULL; - while (num_items > 0) { - job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL); - if (!job) - return NULL; - - uint32_t width, height; - framebuffer_size_for_pixel_count(num_items, &width, &height); - - v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp, false); - - struct framebuffer_data framebuffer; - setup_framebuffer_data(&framebuffer, vk_format, internal_type, - &job->frame_tiling); - - v3dv_job_emit_binning_flush(job); - - emit_copy_buffer_rcl(job, dst, src, dst_offset, src_offset, - &framebuffer, format, item_size); - - v3dv_cmd_buffer_finish_job(cmd_buffer); - - const uint32_t items_copied = width * height; - const uint32_t bytes_copied = items_copied * item_size; - num_items -= items_copied; - src_offset += bytes_copied; - dst_offset += bytes_copied; - } - - return job; -} - -void -v3dv_CmdCopyBuffer(VkCommandBuffer commandBuffer, - VkBuffer srcBuffer, - VkBuffer dstBuffer, - uint32_t regionCount, - const VkBufferCopy *pRegions) -{ - V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, srcBuffer); - V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer); - - for (uint32_t i = 0; i < regionCount; i++) { - copy_buffer(cmd_buffer, - dst_buffer->mem->bo, dst_buffer->mem_offset, - src_buffer->mem->bo, src_buffer->mem_offset, - &pRegions[i]); + V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer); + V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, 
pCopyBufferInfo->dstBuffer);
+
+ for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
+ v3dv_X(cmd_buffer->device, meta_copy_buffer)
+ (cmd_buffer,
+ dst_buffer->mem->bo, dst_buffer->mem_offset,
+ src_buffer->mem->bo, src_buffer->mem_offset,
+ &pCopyBufferInfo->pRegions[i]);
 }
 }
@@ -2385,7 +1179,7 @@ destroy_update_buffer_cb(VkDevice _device,
 v3dv_bo_free(device, bo);
 }
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
 VkBuffer dstBuffer,
 VkDeviceSize dstOffset,
@@ -2412,16 +1206,17 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
 v3dv_bo_unmap(cmd_buffer->device, src_bo);
- VkBufferCopy region = {
+ VkBufferCopy2KHR region = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR,
 .srcOffset = 0,
 .dstOffset = dstOffset,
 .size = dataSize,
 };
 struct v3dv_job *copy_job =
- copy_buffer(cmd_buffer,
- dst_buffer->mem->bo, dst_buffer->mem_offset,
- src_bo, 0,
- &region);
+ v3dv_X(cmd_buffer->device, meta_copy_buffer)
+ (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
+ src_bo, 0, &region);
+
 if (!copy_job)
 return;
@@ -2429,118 +1224,7 @@ v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
 cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
 }
-static void
-emit_fill_buffer_per_tile_list(struct v3dv_job *job,
- struct v3dv_bo *bo,
- uint32_t offset,
- uint32_t stride)
-{
- struct v3dv_cl *cl = &job->indirect;
- v3dv_cl_ensure_space(cl, 200, 1);
- v3dv_return_if_oom(NULL, job);
-
- struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
-
- cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
-
- cl_emit(cl, END_OF_LOADS, end);
-
- cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
-
- emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
- V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);
-
- cl_emit(cl, END_OF_TILE_MARKER, end);
-
- cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
-
- cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
- branch.start = tile_list_start;
- branch.end = v3dv_cl_get_address(cl);
- }
-}
-
-static void
-emit_fill_buffer(struct v3dv_job *job,
- struct v3dv_bo *bo,
- uint32_t offset,
- struct framebuffer_data *framebuffer)
-{
- const uint32_t stride = job->frame_tiling.width * 4;
- emit_fill_buffer_per_tile_list(job, bo, offset, stride);
- emit_supertile_coordinates(job, framebuffer);
-}
-
-static void
-emit_fill_buffer_rcl(struct v3dv_job *job,
- struct v3dv_bo *bo,
- uint32_t offset,
- struct framebuffer_data *framebuffer,
- uint32_t data)
-{
- const union v3dv_clear_value clear_value = {
- .color = { data, 0, 0, 0 },
- };
-
- const struct rcl_clear_info clear_info = {
- .clear_value = &clear_value,
- .image = NULL,
- .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
- .layer = 0,
- .level = 0,
- };
-
- struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
- v3dv_return_if_oom(NULL, job);
-
- emit_frame_setup(job, 0, &clear_value);
- emit_fill_buffer(job, bo, offset, framebuffer);
- cl_emit(rcl, END_OF_RENDERING, end);
-}
-
-static void
-fill_buffer(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_bo *bo,
- uint32_t offset,
- uint32_t size,
- uint32_t data)
-{
- assert(size > 0 && size % 4 == 0);
- assert(offset + size <= bo->size);
-
- const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
- const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
- uint32_t num_items = size / 4;
-
- while (num_items > 0) {
- struct v3dv_job *job =
- v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
- if (!job)
- return;
-
- uint32_t width, height;
- framebuffer_size_for_pixel_count(num_items, &width, 
&height);
-
- v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp, false);
-
- struct framebuffer_data framebuffer;
- setup_framebuffer_data(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
- internal_type, &job->frame_tiling);
-
- v3dv_job_emit_binning_flush(job);
-
- emit_fill_buffer_rcl(job, bo, offset, &framebuffer, data);
-
- v3dv_cmd_buffer_finish_job(cmd_buffer);
-
- const uint32_t items_copied = width * height;
- const uint32_t bytes_copied = items_copied * 4;
- num_items -= items_copied;
- offset += bytes_copied;
- }
-}
-
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
 VkBuffer dstBuffer,
 VkDeviceSize dstOffset,
@@ -2562,7 +1246,8 @@ v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
 size -= size % 4;
 }
- fill_buffer(cmd_buffer, bo, dstOffset, size, data);
+ v3dv_X(cmd_buffer->device, meta_fill_buffer)
+ (cmd_buffer, bo, dstOffset, size, data);
 }
 /**
@@ -2573,12 +1258,12 @@ static bool
 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 struct v3dv_image *image,
 struct v3dv_buffer *buffer,
- const VkBufferImageCopy *region)
+ const VkBufferImageCopy2KHR *region)
 {
- assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
+ assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
 /* Destination can't be raster format */
- if (image->tiling == VK_IMAGE_TILING_LINEAR)
+ if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
 return false;
 /* We can't copy D24S8 because buffer to image copies only copy one aspect
@@ -2588,8 +1273,8 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 * is not a straight copy, we would have to swizzle the channels, which the
 * TFU can't do.
 */
- if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
- image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) {
+ if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
 return false;
 }
@@ -2610,12 +1295,12 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 else
 height = region->bufferImageHeight;
- if (width != image->extent.width || height != image->extent.height)
+ if (width != image->vk.extent.width || height != image->vk.extent.height)
 return false;
 /* Handle region semantics for compressed images */
- const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
- const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
+ const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
+ const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
 width = DIV_ROUND_UP(width, block_w);
 height = DIV_ROUND_UP(height, block_h);
@@ -2625,14 +1310,14 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 * texel size instead, which expands the list of formats we can handle here. 
*/
 const struct v3dv_format *format =
- v3dv_get_compatible_tfu_format(&cmd_buffer->device->devinfo,
+ v3dv_get_compatible_tfu_format(cmd_buffer->device,
 image->cpp, NULL);
 const uint32_t mip_level = region->imageSubresource.mipLevel;
 const struct v3d_resource_slice *slice = &image->slices[mip_level];
 uint32_t num_layers;
- if (image->type != VK_IMAGE_TYPE_3D)
+ if (image->vk.image_type != VK_IMAGE_TYPE_3D)
 num_layers = region->imageSubresource.layerCount;
 else
 num_layers = region->imageExtent.depth;
@@ -2647,7 +1332,11 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 /* Emit a TFU job per layer to copy */
 const uint32_t buffer_stride = width * image->cpp;
 for (int i = 0; i < num_layers; i++) {
- uint32_t layer = region->imageSubresource.baseArrayLayer + i;
+ uint32_t layer;
+ if (image->vk.image_type != VK_IMAGE_TYPE_3D)
+ layer = region->imageSubresource.baseArrayLayer + i;
+ else
+ layer = region->imageOffset.z + i;
 struct drm_v3d_submit_tfu tfu = {
 .ios = (height << 16) | width,
@@ -2671,7 +1360,7 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 tfu.ioa |= dst_offset;
 tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
- (slice->tiling - VC5_TILING_LINEARTILE)) <<
+ (slice->tiling - V3D_TILING_LINEARTILE)) <<
 V3D_TFU_IOA_FORMAT_SHIFT;
 tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
@@ -2679,8 +1368,8 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 * OPAD field for the destination (how many extra UIF blocks beyond
 * those necessary to cover the height).
 */
- if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
- slice->tiling == VC5_TILING_UIF_XOR) {
+ if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
+ slice->tiling == V3D_TILING_UIF_XOR) {
 uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp);
 uint32_t implicit_padded_height = align(height, uif_block_h);
 uint32_t icfg =
@@ -2694,140 +1383,6 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
 return true;
 }
-static void
-emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
- struct framebuffer_data *framebuffer,
- struct v3dv_image *image,
- struct v3dv_buffer *buffer,
- uint32_t layer,
- const VkBufferImageCopy *region)
-{
- struct v3dv_cl *cl = &job->indirect;
- v3dv_cl_ensure_space(cl, 200, 1);
- v3dv_return_if_oom(NULL, job);
-
- struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
-
- cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
-
- const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
- assert((image->type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) ||
- layer < image->extent.depth);
-
- /* Load TLB from buffer */
- uint32_t width, height;
- if (region->bufferRowLength == 0)
- width = region->imageExtent.width;
- else
- width = region->bufferRowLength;
-
- if (region->bufferImageHeight == 0)
- height = region->imageExtent.height;
- else
- height = region->bufferImageHeight;
-
- /* Handle copy to compressed format using a compatible format */
- width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format));
- height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format));
-
- uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 
- 1 : image->cpp; - uint32_t buffer_stride = width * cpp; - uint32_t buffer_offset = - buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer; - - uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask, - false, false, true); - - emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo, - buffer_offset, buffer_stride, format); - - /* Because we can't do raster loads/stores of Z/S formats we need to - * use a color tile buffer with a compatible RGBA color format instead. - * However, when we are uploading a single aspect to a combined - * depth/stencil image we have the problem that our tile buffer stores don't - * allow us to mask out the other aspect, so we always write all four RGBA - * channels to the image and we end up overwriting that other aspect with - * undefined values. To work around that, we first load the aspect we are - * not copying from the image memory into a proper Z/S tile buffer. Then we - * do our store from the color buffer for the aspect we are copying, and - * after that, we do another store from the Z/S tile buffer to restore the - * other aspect to its original value. - */ - if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { - emit_image_load(cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, - false, false); - } else { - assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); - emit_image_load(cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, - false, false); - } - } - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - /* Store TLB to image */ - emit_image_store(cl, framebuffer, image, imgrsc->aspectMask, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, - false, true, false); - - if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { - emit_image_store(cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, - false, false, false); - } else { - assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); - emit_image_store(cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, - false, false, false); - } - } - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -emit_copy_buffer_to_layer(struct v3dv_job *job, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - struct framebuffer_data *framebuffer, - uint32_t layer, - const VkBufferImageCopy *region) -{ - emit_frame_setup(job, layer, NULL); - emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer, - layer, region); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_copy_buffer_to_image_rcl(struct v3dv_job *job, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - struct framebuffer_data *framebuffer, - const VkBufferImageCopy *region) -{ - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); - v3dv_return_if_oom(NULL, job); - - for (int layer = 0; layer < job->frame_tiling.layers; layer++) - emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region); - cl_emit(rcl, END_OF_RENDERING, end); -} - /** * Returns true if the implementation supports 
the requested operation (even if * it failed to process it, for example, due to an out-of-memory error). @@ -2836,19 +1391,19 @@ static bool copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy *region) + const VkBufferImageCopy2KHR *region) { VkFormat fb_format; - if (!can_use_tlb(image, &region->imageOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format)) return false; uint32_t internal_type, internal_bpp; - get_internal_type_bpp_for_image_aspects(fb_format, - region->imageSubresource.aspectMask, - &internal_type, &internal_bpp); + v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) + (fb_format, region->imageSubresource.aspectMask, + &internal_type, &internal_bpp); uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -2860,19 +1415,21 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_h = vk_format_get_blockheight(image->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_h = vk_format_get_blockheight(image->vk.format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, + 1, internal_bpp, false); - struct framebuffer_data framebuffer; - setup_framebuffer_data(&framebuffer, fb_format, internal_type, - &job->frame_tiling); + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, + internal_type, &job->frame_tiling); - v3dv_job_emit_binning_flush(job); - emit_copy_buffer_to_image_rcl(job, image, buffer, &framebuffer, region); + v3dv_X(job->device, job_emit_binning_flush)(job); + v3dv_X(job->device, meta_emit_copy_buffer_to_image_rcl) + (job, image, buffer, &framebuffer, region); v3dv_cmd_buffer_finish_job(cmd_buffer); @@ -2883,7 +1440,7 @@ static bool create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy *region) + const VkBufferImageCopy2KHR *region) { if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region)) return true; @@ -2980,6 +1537,7 @@ static void get_texel_buffer_copy_pipeline_cache_key(VkFormat format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, + bool is_layered, uint8_t *key) { memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); @@ -2992,6 +1550,12 @@ get_texel_buffer_copy_pipeline_cache_key(VkFormat format, *p = cmask; p++; + /* Note that we are using a single byte for this, so we could pack + * more data into this 32-bit slot in the future. + */ + *p = is_layered ?
1 : 0; + p++; + memcpy(p, cswizzle, sizeof(VkComponentMapping)); p += sizeof(VkComponentMapping) / sizeof(uint32_t); @@ -3011,6 +1575,7 @@ static bool create_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, struct nir_shader *vs_nir, + struct nir_shader *gs_nir, struct nir_shader *fs_nir, const VkPipelineVertexInputStateCreateInfo *vi_state, const VkPipelineDepthStencilStateCreateInfo *ds_state, @@ -3036,6 +1601,71 @@ get_texel_buffer_copy_vs() return b.shader; } +static nir_shader * +get_texel_buffer_copy_gs() +{ + /* FIXME: this creates a geometry shader that takes the index of a single + * layer to clear from push constants, so we need to emit a draw call for + * each layer that we want to clear. We could actually do better and have it + * take a range of layers however, if we were to do this, we would need to + * be careful not to exceed the maximum number of output vertices allowed in + * a geometry shader. + */ + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options, + "meta texel buffer copy gs"); + nir_shader *nir = b.shader; + nir->info.inputs_read = 1ull << VARYING_SLOT_POS; + nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | + (1ull << VARYING_SLOT_LAYER); + nir->info.gs.input_primitive = GL_TRIANGLES; + nir->info.gs.output_primitive = GL_TRIANGLE_STRIP; + nir->info.gs.vertices_in = 3; + nir->info.gs.vertices_out = 3; + nir->info.gs.invocations = 1; + nir->info.gs.active_stream_mask = 0x1; + + /* in vec4 gl_Position[3] */ + nir_variable *gs_in_pos = + nir_variable_create(b.shader, nir_var_shader_in, + glsl_array_type(glsl_vec4_type(), 3, 0), + "in_gl_Position"); + gs_in_pos->data.location = VARYING_SLOT_POS; + + /* out vec4 gl_Position */ + nir_variable *gs_out_pos = + nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(), + "out_gl_Position"); + gs_out_pos->data.location = VARYING_SLOT_POS; + + /* out float gl_Layer */ + nir_variable *gs_out_layer = + nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(), + "out_gl_Layer"); + gs_out_layer->data.location = VARYING_SLOT_LAYER; + + /* Emit output triangle */ + for (uint32_t i = 0; i < 3; i++) { + /* gl_Position from shader input */ + nir_deref_instr *in_pos_i = + nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i); + nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); + + /* gl_Layer from push constants */ + nir_ssa_def *layer = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), + .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET, + .range = 4); + nir_store_var(&b, gs_out_layer, layer, 0x1); + + nir_emit_vertex(&b, 0); + } + + nir_end_primitive(&b, 0); + + return nir; +} + static nir_ssa_def * load_frag_coord(nir_builder *b) { @@ -3101,15 +1731,21 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, * texel buffer. 
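The fragment shader above pulls the copy box, buffer stride and buffer offset from a single push-constant range, and the geometry shader reads its target layer from the same range. A minimal sketch of that layout, inferred from the .base offsets in these hunks and from the vkCmdPushConstants call later in the diff (offset 24 for the layer); the struct name is hypothetical and the real macros live in the v3dv headers:

#include <stdint.h>

/* Illustrative mirror of the push-constant layout implied by this diff. */
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET     0   /* 4 x uint32: x, y, w, h */
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET 16   /* buffer stride, in texels */
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET 20   /* buffer offset, in texels */
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET  24   /* layer emitted by the GS */

struct texel_buffer_copy_push_constants {
   uint32_t box[4];    /* destination rect read by the fragment shader */
   uint32_t stride;
   uint32_t offset;
   uint32_t gs_layer;  /* only used by layered (GS-assisted) copies */
};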
*/ nir_ssa_def *box = - nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), + .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET, + .range = 16); /* Load the buffer stride (this comes in texel units) */ nir_ssa_def *stride = - nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 16, .range = 4); + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), + .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET, + .range = 4); /* Load the buffer offset (this comes in texel units) */ nir_ssa_def *offset = - nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 20, .range = 4); + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), + .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET, + .range = 4); nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); @@ -3165,6 +1801,7 @@ create_texel_buffer_copy_pipeline(struct v3dv_device *device, VkFormat format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, + bool is_layered, VkRenderPass _pass, VkPipelineLayout pipeline_layout, VkPipeline *pipeline) @@ -3175,6 +1812,7 @@ create_texel_buffer_copy_pipeline(struct v3dv_device *device, nir_shader *vs_nir = get_texel_buffer_copy_vs(); nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format, cswizzle); + nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs() : NULL; const VkPipelineVertexInputStateCreateInfo vi_state = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, @@ -3210,7 +1848,7 @@ create_texel_buffer_copy_pipeline(struct v3dv_device *device, return create_pipeline(device, pass, - vs_nir, fs_nir, + vs_nir, gs_nir, fs_nir, &vi_state, &ds_state, &cb_state, @@ -3226,12 +1864,14 @@ get_copy_texel_buffer_pipeline( VkColorComponentFlags cmask, VkComponentMapping *cswizzle, VkImageType image_type, + bool is_layered, struct v3dv_meta_texel_buffer_copy_pipeline **pipeline) { bool ok = true; uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE]; - get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, key); + get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered, + key); mtx_lock(&device->meta.mtx); struct hash_entry *entry = @@ -3257,7 +1897,8 @@ get_copy_texel_buffer_pipeline( goto fail; ok = - create_texel_buffer_copy_pipeline(device, format, cmask, cswizzle, + create_texel_buffer_copy_pipeline(device, + format, cmask, cswizzle, is_layered, (*pipeline)->pass, device->meta.texel_buffer_copy.p_layout, &(*pipeline)->pipeline); @@ -3297,7 +1938,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy *regions) + const VkBufferImageCopy2KHR *regions) { VkResult result; bool handled = false; @@ -3320,7 +1961,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, return handled; /* FIXME: we only handle uncompressed images for now. 
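get_copy_texel_buffer_pipeline() keys its pipeline hash table on everything that changes the generated shaders, now including the new is_layered flag. A compilable sketch of the same packing scheme, with the key size and the Vulkan types replaced by stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define KEY_SIZE 32  /* stand-in for V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE */
struct component_mapping { uint32_t r, g, b, a; };  /* mimics VkComponentMapping */

static void
pack_texel_copy_key(uint32_t format, uint32_t cmask,
                    const struct component_mapping *cswizzle,
                    bool is_layered, uint8_t key[KEY_SIZE])
{
   memset(key, 0, KEY_SIZE);
   uint32_t *p = (uint32_t *)key;  /* the driver's key buffer is 4-byte aligned */
   *p++ = format;
   *p++ = cmask;
   *p++ = is_layered ? 1 : 0;     /* one byte used; room left to pack more state */
   memcpy(p, cswizzle, sizeof(*cswizzle));
}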
*/ - if (vk_format_is_compressed(image->vk_format)) + if (vk_format_is_compressed(image->vk.format)) return handled; const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT | @@ -3336,7 +1977,8 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, */ if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) { if (v3dv_buffer_format_supports_features( - src_format, VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) { + cmd_buffer->device, src_format, + VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) { buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; } else { return handled; @@ -3348,11 +1990,29 @@ */ handled = true; + + /* Compute the number of layers to copy. + * + * If we are batching (region_count > 1) all our regions have the same + * image subresource so we can take this from the first region. For 3D + * images we require the same depth extent. + */ + const VkImageSubresourceLayers *resource = &regions[0].imageSubresource; + uint32_t num_layers; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { + num_layers = resource->layerCount; + } else { + assert(region_count == 1); + num_layers = regions[0].imageExtent.depth; + } + assert(num_layers > 0); + /* Get the texel buffer copy pipeline */ struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL; bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device, dst_format, cmask, cswizzle, - image->type, &pipeline); + image->vk.image_type, num_layers > 1, + &pipeline); if (!ok) return handled; assert(pipeline && pipeline->pipeline && pipeline->pass); @@ -3422,78 +2082,58 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, 0, 1, &set, 0, NULL); - /* Compute the number of layers to copy. + /* Setup framebuffer. * - * If we are batching (region_count > 1) all our regions have the same - * image subresource so we can take this from the first region. + * For 3D images, this creates a layered framebuffer with a number of + * layers matching the depth extent of the 3D image. */ - const VkImageSubresourceLayers *resource = &regions[0].imageSubresource; - uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) { - num_layers = resource->layerCount; - } else { - assert(region_count == 1); - num_layers = regions[0].imageExtent.depth; - } - assert(num_layers > 0); - - /* Sanity check: we can only batch multiple regions together if they have - * the same framebuffer (so the same layer). - */ - assert(num_layers == 1 || region_count == 1); - - /* For each layer */ - for (uint32_t l = 0; l < num_layers; l++) { - /* Setup framebuffer for this layer. - * - * FIXME: once we support geometry shaders, we should be able to have - * one layered framebuffer and emit just one draw call for - * all layers using layered rendering. At that point, we should - * also be able to batch multi-layered regions as well.
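The rewrite replaces the old framebuffer-per-layer loop with a single layered framebuffer whose image view spans every layer being copied, so the geometry shader can route each draw to its layer. A condensed sketch of that setup, using core Vulkan calls in place of the v3dv wrappers and omitting the private-object cleanup from the hunk:

#include <vulkan/vulkan.h>

/* Condensed from this hunk: one view over all layers, one layered FB. */
static VkFramebuffer
make_layered_copy_fb(VkDevice dev, VkRenderPass pass, VkImage image,
                     VkImageViewType view_type, VkFormat format,
                     VkImageAspectFlags aspect, uint32_t mip_level,
                     uint32_t base_layer, uint32_t num_layers,
                     uint32_t fb_width, uint32_t fb_height)
{
   VkImageViewCreateInfo view_info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
      .image = image,
      .viewType = view_type,
      .format = format,
      .subresourceRange = {
         .aspectMask = aspect,
         .baseMipLevel = mip_level,
         .levelCount = 1,
         .baseArrayLayer = base_layer,
         .layerCount = num_layers,   /* one view spanning every layer */
      },
   };
   VkImageView view;
   if (vkCreateImageView(dev, &view_info, NULL, &view) != VK_SUCCESS)
      return VK_NULL_HANDLE;

   VkFramebufferCreateInfo fb_info = {
      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
      .renderPass = pass,
      .attachmentCount = 1,
      .pAttachments = &view,
      .width = fb_width,              /* u_minify(extent, mip_level) */
      .height = fb_height,
      .layers = num_layers,           /* layered framebuffer */
   };
   VkFramebuffer fb;
   if (vkCreateFramebuffer(dev, &fb_info, NULL, &fb) != VK_SUCCESS)
      return VK_NULL_HANDLE;          /* sketch: view cleanup elided */
   return fb;
}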
- */ - VkImageViewCreateInfo image_view_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .image = v3dv_image_to_handle(image), - .viewType = v3dv_image_type_to_view_type(image->type), - .format = dst_format, - .subresourceRange = { - .aspectMask = aspect, - .baseMipLevel = resource->mipLevel, - .levelCount = 1, - .baseArrayLayer = resource->baseArrayLayer + l, - .layerCount = 1 - }, - }; - VkImageView image_view; - result = v3dv_CreateImageView(_device, &image_view_info, - &cmd_buffer->device->vk.alloc, &image_view); - if (result != VK_SUCCESS) - goto fail; + uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel); + uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel); + VkImageViewCreateInfo image_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = v3dv_image_to_handle(image), + .viewType = v3dv_image_type_to_view_type(image->vk.image_type), + .format = dst_format, + .subresourceRange = { + .aspectMask = aspect, + .baseMipLevel = resource->mipLevel, + .levelCount = 1, + .baseArrayLayer = resource->baseArrayLayer, + .layerCount = num_layers, + }, + }; + VkImageView image_view; + result = v3dv_CreateImageView(_device, &image_view_info, + &cmd_buffer->device->vk.alloc, &image_view); + if (result != VK_SUCCESS) + goto fail; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)image_view, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView); + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)image_view, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView); - VkFramebufferCreateInfo fb_info = { - .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, - .renderPass = pipeline->pass, - .attachmentCount = 1, - .pAttachments = &image_view, - .width = u_minify(image->extent.width, resource->mipLevel), - .height = u_minify(image->extent.height, resource->mipLevel), - .layers = 1, - }; + VkFramebufferCreateInfo fb_info = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = pipeline->pass, + .attachmentCount = 1, + .pAttachments = &image_view, + .width = fb_width, + .height = fb_height, + .layers = num_layers, + }; - VkFramebuffer fb; - result = v3dv_CreateFramebuffer(_device, &fb_info, - &cmd_buffer->device->vk.alloc, &fb); - if (result != VK_SUCCESS) - goto fail; + VkFramebuffer fb; + result = v3dv_CreateFramebuffer(_device, &fb_info, + &cmd_buffer->device->vk.alloc, &fb); + if (result != VK_SUCCESS) + goto fail; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)fb, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer); + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)fb, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer); + /* For each layer */ + for (uint32_t l = 0; l < num_layers; l++) { /* Start render pass for this layer. 
* * If we only have one region to copy, then we might be able to @@ -3513,15 +2153,15 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, v3dv_render_pass_from_handle(pipeline->pass); can_skip_tlb_load = cmask == full_cmask && - v3dv_subpass_area_is_tile_aligned(&render_area, + v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area, v3dv_framebuffer_from_handle(fb), pipeline_pass, 0); } } else { render_area.offset.x = 0; render_area.offset.y = 0; - render_area.extent.width = fb_info.width; - render_area.extent.height = fb_info.height; + render_area.extent.width = fb_width; + render_area.extent.height = fb_height; } VkRenderPassBeginInfo rp_info = { @@ -3538,10 +2178,21 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, if (!job) goto fail; + /* If we are using a layered copy we need to specify the layer for the + * Geometry Shader. + */ + if (num_layers > 1) { + uint32_t layer = resource->baseArrayLayer + l; + v3dv_CmdPushConstants(_cmd_buffer, + cmd_buffer->device->meta.texel_buffer_copy.p_layout, + VK_SHADER_STAGE_GEOMETRY_BIT, + 24, 4, &layer); + } + /* For each region */ dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy *region = &regions[r]; + const VkBufferImageCopy2KHR *region = &regions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -3612,7 +2263,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, uint32_t region_count, - const VkBufferImageCopy *regions) + const VkBufferImageCopy2KHR *regions) { /* Since we can't sample linear images we need to upload the linear * buffer to a tiled image that we can use as a blit source, which @@ -3636,7 +2287,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .imageType = VK_IMAGE_TYPE_2D, .format = src_format, - .extent = { image->extent.width, image->extent.height, 1 }, + .extent = { image->vk.extent.width, image->vk.extent.height, 1 }, .mipLevels = 1, .arrayLayers = 1, .samples = VK_SAMPLE_COUNT_1_BIT, @@ -3653,7 +2304,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, return handled; VkMemoryRequirements reqs; - v3dv_GetImageMemoryRequirements(_device, dummy_image, &reqs); + vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs); v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc); VkDeviceMemory mem; @@ -3676,7 +2327,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * image subresource so we can take this from the first region. */ uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = regions[0].imageSubresource.layerCount; else num_layers = regions[0].imageExtent.depth; @@ -3687,14 +2338,14 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(num_layers == 1 || region_count == 1); - const uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_height = vk_format_get_blockheight(image->vk_format); + const uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_height = vk_format_get_blockheight(image->vk.format); /* Copy regions by uploading each region to a temporary tiled image using * the memory we have just allocated as storage.
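With the layered framebuffer in place, the per-layer loop only has to update one push constant (the layer the geometry shader should write) before drawing. A sketch of that loop shape; the handles are assumed to come from the surrounding function and the draw itself is illustrative, not the driver's actual draw:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* One push-constant update per layer instead of one framebuffer per layer. */
static void
draw_copy_layers(VkCommandBuffer cmd, VkPipelineLayout layout,
                 uint32_t base_layer, uint32_t num_layers)
{
   for (uint32_t l = 0; l < num_layers; l++) {
      if (num_layers > 1) {
         uint32_t layer = base_layer + l;
         /* Offset 24 matches the GS layer slot sketched earlier. */
         vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_GEOMETRY_BIT,
                            24, sizeof(layer), &layer);
      }
      vkCmdDraw(cmd, 4, 1, 0, 0);  /* illustrative full-rect draw per region */
   }
}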
*/ for (uint32_t r = 0; r < region_count; r++) { - const VkBufferImageCopy *region = &regions[r]; + const VkBufferImageCopy2KHR *region = &regions[r]; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -3741,14 +2392,15 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, (uintptr_t)buffer_image, (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); - result = v3dv_BindImageMemory(_device, buffer_image, mem, 0); + result = vk_common_BindImageMemory(_device, buffer_image, mem, 0); if (result != VK_SUCCESS) return handled; /* Upload buffer contents for the selected layer */ const VkDeviceSize buf_offset_bytes = region->bufferOffset + i * buf_height * buf_width * buffer_bpp; - const VkBufferImageCopy buffer_image_copy = { + const VkBufferImageCopy2KHR buffer_image_copy = { + .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR, .bufferOffset = buf_offset_bytes, .bufferRowLength = region->bufferRowLength / block_width, .bufferImageHeight = region->bufferImageHeight / block_height, @@ -3782,7 +2434,8 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit blit_region = { + const VkImageBlit2KHR blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, .srcSubresource = { .aspectMask = aspect, .mipLevel = 0, @@ -3840,7 +2493,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, uint32_t region_count, - const VkBufferImageCopy *regions, + const VkBufferImageCopy2KHR *regions, bool use_texel_buffer) { /* We can only call this with region_count > 1 if we can batch the regions @@ -3890,9 +2543,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, dst_format = src_format; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - assert(image->vk_format == VK_FORMAT_D32_SFLOAT || - image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32); + assert(image->vk.format == VK_FORMAT_D32_SFLOAT || + image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; aspect = VK_IMAGE_ASPECT_COLOR_BIT; @@ -3901,8 +2554,8 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * in the buffer is stored in the 24-LSB, but V3D wants it in the * 24-MSB. */ - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) { + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) { cmask = VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; @@ -3920,7 +2573,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * blit to an RGBA8UI destination masking out writes to components * GBA (which map to the D24 component of a S8D24 image).
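copy_buffer_to_image_shader() cannot render to depth/stencil directly, so single-aspect uploads are remapped to a compatible color format and the color write mask protects the other aspect's bits. A sketch of the mapping for the cases visible in this hunk; the stencil-case write mask is inferred from the "masking out GBA" comment above and should be read as an assumption:

#include <vulkan/vulkan.h>

/* Sketch of the aspect-to-color remapping shown in this hunk. */
static void
remap_ds_aspect_for_color_copy(VkFormat image_format, VkImageAspectFlags aspect,
                               VkFormat *src_format, VkFormat *dst_format,
                               VkColorComponentFlags *cmask)
{
   *cmask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
            VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT;
   if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
      /* D32, D24S8 and X8D24 all travel through an RGBA8UI view. */
      *src_format = VK_FORMAT_R8G8B8A8_UINT;
      *dst_format = *src_format;
      if (image_format == VK_FORMAT_D24_UNORM_S8_UINT ||
          image_format == VK_FORMAT_X8_D24_UNORM_PACK32) {
         /* Only write the channels holding the D24 bits. */
         *cmask = VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT |
                  VK_COLOR_COMPONENT_A_BIT;
      }
   } else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
      /* S8 is blitted into RGBA8UI with the D24 channels masked out. */
      *src_format = VK_FORMAT_R8_UINT;
      *dst_format = VK_FORMAT_R8G8B8A8_UINT;
      *cmask = VK_COLOR_COMPONENT_R_BIT;
   }
}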
*/ - assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT); + assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); buf_bpp = 1; src_format = VK_FORMAT_R8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; @@ -3970,16 +2623,16 @@ static bool copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *image, struct v3dv_buffer *buffer, - const VkBufferImageCopy *region) + const VkBufferImageCopy2KHR *region) { /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk_format)) + if (vk_format_is_depth_or_stencil(image->vk.format)) return false; - if (vk_format_is_compressed(image->vk_format)) + if (vk_format_is_compressed(image->vk.format)) return false; - if (image->tiling == VK_IMAGE_TILING_LINEAR) + if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; uint32_t buffer_width, buffer_height; @@ -3997,7 +2650,7 @@ copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t buffer_layer_stride = buffer_stride * buffer_height; uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -4028,50 +2681,55 @@ copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, return true; } -void -v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, - VkBuffer srcBuffer, - VkImage dstImage, - VkImageLayout dstImageLayout, - uint32_t regionCount, - const VkBufferImageCopy *pRegions) +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2KHR *info) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_buffer, buffer, srcBuffer); - V3DV_FROM_HANDLE(v3dv_image, image, dstImage); + V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer); + V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); uint32_t r = 0; - while (r < regionCount) { + while (r < info->regionCount) { /* The TFU and TLB paths can only copy one region at a time and the region * needs to start at the origin. We try these first for the common case * where we are copying full images, since they should be the fastest. */ uint32_t batch_size = 1; - if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &pRegions[r])) + if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r])) goto handled; - if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[r])) + if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r])) goto handled; /* Otherwise, we are copying subrects, so we fallback to copying * via shader and texel buffers and we try to batch the regions - * if possible. We can only batch copies if they target the same - * image subresource (so they have the same framebuffer spec). + * if possible. We can only batch copies if they have the same + * framebuffer spec, which is mostly determined by the image + * subresource of the region. 
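The batching walk implemented by the loop that follows now also compares depth extents for 3D images, not just the image subresource. A self-contained sketch of the same rule (VkBufferImageCopy2KHR comes with VK_KHR_copy_commands2):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

/* Count how many consecutive regions starting at `first` can be batched:
 * they must share the same image subresource and, for 3D images, the same
 * depth extent. Mirrors the loop in v3dv_CmdCopyBufferToImage2KHR below. */
static uint32_t
batchable_region_count(const VkBufferImageCopy2KHR *regions, uint32_t count,
                       uint32_t first, bool is_3d)
{
   uint32_t batch = 1;
   const VkImageSubresourceLayers *rsc = &regions[first].imageSubresource;
   for (uint32_t s = first + 1; s < count; s++) {
      if (memcmp(rsc, &regions[s].imageSubresource, sizeof(*rsc)) != 0)
         break;
      if (is_3d &&
          regions[s].imageExtent.depth != regions[first].imageExtent.depth)
         break;
      batch++;
   }
   return batch;
}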
*/ - const VkImageSubresourceLayers *rsc = &pRegions[r].imageSubresource; - if (image->type != VK_IMAGE_TYPE_3D) { - for (uint32_t s = r + 1; s < regionCount; s++) { - const VkImageSubresourceLayers *rsc_s = &pRegions[s].imageSubresource; - if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0) + const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource; + for (uint32_t s = r + 1; s < info->regionCount; s++) { + const VkImageSubresourceLayers *rsc_s = + &info->pRegions[s].imageSubresource; + + if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0) + break; + + /* For 3D images we also need to check the depth extent */ + if (image->vk.image_type == VK_IMAGE_TYPE_3D && + info->pRegions[s].imageExtent.depth != + info->pRegions[r].imageExtent.depth) { break; - batch_size++; } + + batch_size++; } if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, - batch_size, &pRegions[r], true)) { + batch_size, &info->pRegions[r], true)) { goto handled; } @@ -4081,13 +2739,14 @@ v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, * slow it might not be worth it and we should instead put more effort * in handling more cases with the other paths. */ - if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[r])) { + if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, + &info->pRegions[r])) { batch_size = 1; goto handled; } if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, - batch_size, &pRegions[r], false)) { + batch_size, &info->pRegions[r], false)) { goto handled; } @@ -4114,17 +2773,17 @@ static bool blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageBlit *region) + const VkImageBlit2KHR *region) { - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT); - assert(src->samples == VK_SAMPLE_COUNT_1_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); + assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Format must match */ - if (src->vk_format != dst->vk_format) + if (src->vk.format != dst->vk.format) return false; /* Destination can't be raster format */ - if (dst->tiling == VK_IMAGE_TILING_LINEAR) + if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; /* Source region must start at (0,0) */ @@ -4136,8 +2795,8 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - const uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level); - const uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level); + const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); + const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); if (region->dstOffsets[1].x < dst_width - 1|| region->dstOffsets[1].y < dst_height - 1) { return false; @@ -4152,7 +2811,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, /* If the format is D24S8 both aspects need to be copied, since the TFU * can't be programmed to copy only one aspect of the image. */ - if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) { const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) @@ -4165,7 +2824,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, * compatible based on its texel size. 
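blit_tfu() only takes the TFU path when the blit is effectively a full-surface copy. A condensed checklist of the early-outs visible in this hunk; the struct fields are stand-ins for values pulled from the v3dv image wrappers and the blit region, and any checks outside the hunk are omitted:

#include <stdbool.h>
#include <vulkan/vulkan.h>

struct tfu_blit_check {
   VkFormat src_format, dst_format;
   VkImageTiling dst_tiling;
   bool src_starts_at_origin;    /* srcOffsets[0] == (0, 0) */
   bool dst_covers_full_level;   /* dstOffsets[1] reaches the mip level size */
   VkImageAspectFlags dst_aspects;
};

static bool
tfu_blit_possible(const struct tfu_blit_check *c)
{
   if (c->src_format != c->dst_format)
      return false;                            /* formats must match */
   if (c->dst_tiling == VK_IMAGE_TILING_LINEAR)
      return false;                            /* destination can't be raster */
   if (!c->src_starts_at_origin || !c->dst_covers_full_level)
      return false;
   /* D24S8 must copy both aspects: the TFU can't split them. */
   if (c->dst_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       c->dst_aspects != (VK_IMAGE_ASPECT_DEPTH_BIT |
                          VK_IMAGE_ASPECT_STENCIL_BIT))
      return false;
   return true;
}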
*/ const struct v3dv_format *format = - v3dv_get_compatible_tfu_format(&cmd_buffer->device->devinfo, + v3dv_get_compatible_tfu_format(cmd_buffer->device, dst->cpp, NULL); /* Emit a TFU job for each layer to blit */ @@ -4175,7 +2834,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_dst_layer; uint32_t max_dst_layer; bool dst_mirror_z = false; - if (dst->type == VK_IMAGE_TYPE_3D) { + if (dst->vk.image_type == VK_IMAGE_TYPE_3D) { compute_blit_3d_layers(region->dstOffsets, &min_dst_layer, &max_dst_layer, &dst_mirror_z); @@ -4187,7 +2846,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_src_layer; uint32_t max_src_layer; bool src_mirror_z = false; - if (src->type == VK_IMAGE_TYPE_3D) { + if (src->vk.image_type == VK_IMAGE_TYPE_3D) { compute_blit_3d_layers(region->srcOffsets, &min_src_layer, &max_src_layer, &src_mirror_z); @@ -4212,10 +2871,10 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i; const uint32_t src_layer = src_mirror_z ? max_src_layer - i - 1: min_src_layer + i; - emit_tfu_job(cmd_buffer, - dst, dst_mip_level, dst_layer, - src, src_mip_level, src_layer, - dst_width, dst_height, format); + v3dv_X(cmd_buffer->device, meta_emit_tfu_job) + (cmd_buffer, dst, dst_mip_level, dst_layer, + src, src_mip_level, src_layer, + dst_width, dst_height, format); } return true; @@ -4657,6 +3316,7 @@ get_color_blit_fs(struct v3dv_device *device, if (dst_bit_size >= src_bit_size) continue; + assert(dst_bit_size > 0); if (util_format_is_pure_uint(dst_pformat)) { nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); c[i] = nir_umin(&b, c[i], max); @@ -4679,6 +3339,7 @@ static bool create_pipeline(struct v3dv_device *device, struct v3dv_render_pass *pass, struct nir_shader *vs_nir, + struct nir_shader *gs_nir, struct nir_shader *fs_nir, const VkPipelineVertexInputStateCreateInfo *vi_state, const VkPipelineDepthStencilStateCreateInfo *ds_state, @@ -4688,12 +3349,15 @@ create_pipeline(struct v3dv_device *device, VkPipeline *pipeline) { struct vk_shader_module vs_m; + struct vk_shader_module gs_m; struct vk_shader_module fs_m; + uint32_t num_stages = gs_nir ? 
3 : 2; + v3dv_shader_module_internal_init(device, &vs_m, vs_nir); v3dv_shader_module_internal_init(device, &fs_m, fs_nir); - VkPipelineShaderStageCreateInfo stages[2] = { + VkPipelineShaderStageCreateInfo stages[3] = { { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_VERTEX_BIT, @@ -4706,12 +3370,23 @@ create_pipeline(struct v3dv_device *device, .module = vk_shader_module_to_handle(&fs_m), .pName = "main", }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_GEOMETRY_BIT, + .module = VK_NULL_HANDLE, + .pName = "main", + }, }; + if (gs_nir) { + v3dv_shader_module_internal_init(device, &gs_m, gs_nir); + stages[2].module = vk_shader_module_to_handle(&gs_m); + } + VkGraphicsPipelineCreateInfo info = { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, - .stageCount = 2, + .stageCount = num_stages, .pStages = stages, .pVertexInputState = vi_state, @@ -4863,7 +3538,7 @@ create_blit_pipeline(struct v3dv_device *device, return create_pipeline(device, pass, - vs_nir, fs_nir, + vs_nir, NULL, fs_nir, &vi_state, &ds_state, &cb_state, @@ -5096,7 +3771,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkFormat src_format, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, - const VkImageBlit *_region, + const VkImageBlit2KHR *_region, VkFilter filter, bool dst_is_padded_image) { @@ -5107,14 +3782,14 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. */ - assert(dst->tiling != VK_IMAGE_TILING_LINEAR || + assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->tiling == VK_IMAGE_TILING_LINEAR && src->type != VK_IMAGE_TYPE_1D) + if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D) return false; - VkImageBlit region = *_region; + VkImageBlit2KHR region = *_region; /* Rewrite combined D/S blits to compatible color blits */ if (vk_format_is_depth_or_stencil(dst_format)) { assert(src_format == dst_format); @@ -5169,23 +3844,23 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * need to apply those same semantics here when we compute the size of the * destination image level. 
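create_pipeline() now sizes its shader-stage array for three stages but only counts the geometry slot when a GS module exists, as the hunk above shows. A sketch of the pattern with plain Vulkan handles in place of the driver's vk_shader_module wrappers:

#include <vulkan/vulkan.h>

/* GS goes in the third slot; stageCount covers it only when present. */
static void
fill_meta_stages(VkPipelineShaderStageCreateInfo stages[3],
                 VkShaderModule vs, VkShaderModule fs, VkShaderModule gs,
                 uint32_t *stage_count)
{
   stages[0] = (VkPipelineShaderStageCreateInfo){
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vs, .pName = "main",
   };
   stages[1] = (VkPipelineShaderStageCreateInfo){
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_FRAGMENT_BIT, .module = fs, .pName = "main",
   };
   stages[2] = (VkPipelineShaderStageCreateInfo){
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_GEOMETRY_BIT, .module = gs, .pName = "main",
   };
   *stage_count = (gs != VK_NULL_HANDLE) ? 3 : 2;
}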
*/ - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format); - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format); + const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); const uint32_t dst_level_w = - u_minify(DIV_ROUND_UP(dst->extent.width * src_block_w, dst_block_w), + u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w), region.dstSubresource.mipLevel); const uint32_t dst_level_h = - u_minify(DIV_ROUND_UP(dst->extent.height * src_block_h, dst_block_h), + u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h), region.dstSubresource.mipLevel); const uint32_t src_level_w = - u_minify(src->extent.width, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.width, region.srcSubresource.mipLevel); const uint32_t src_level_h = - u_minify(src->extent.height, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.height, region.srcSubresource.mipLevel); const uint32_t src_level_d = - u_minify(src->extent.depth, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel); uint32_t dst_x, dst_y, dst_w, dst_h; bool dst_mirror_x, dst_mirror_y; @@ -5204,7 +3879,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_dst_layer; uint32_t max_dst_layer; bool dst_mirror_z = false; - if (dst->type != VK_IMAGE_TYPE_3D) { + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) { min_dst_layer = region.dstSubresource.baseArrayLayer; max_dst_layer = min_dst_layer + region.dstSubresource.layerCount; } else { @@ -5216,7 +3891,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_src_layer; uint32_t max_src_layer; bool src_mirror_z = false; - if (src->type != VK_IMAGE_TYPE_3D) { + if (src->vk.image_type != VK_IMAGE_TYPE_3D) { min_src_layer = region.srcSubresource.baseArrayLayer; max_src_layer = min_src_layer + region.srcSubresource.layerCount; } else { @@ -5238,7 +3913,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, (float)(src_y + src_h), }; - if (src->samples == VK_SAMPLE_COUNT_1_BIT) { + if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) { coords[0] /= (float)src_level_w; coords[1] /= (float)src_level_h; coords[2] /= (float)src_level_w; @@ -5270,8 +3945,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* Get the blit pipeline */ struct v3dv_meta_blit_pipeline *pipeline = NULL; bool ok = get_blit_pipeline(cmd_buffer->device, - dst_format, src_format, cmask, src->type, - dst->samples, src->samples, + dst_format, src_format, cmask, src->vk.image_type, + dst->vk.samples, src->vk.samples, &pipeline); if (!ok) return handled; @@ -5341,7 +4016,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkImageViewCreateInfo dst_image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(dst), - .viewType = v3dv_image_type_to_view_type(dst->type), + .viewType = v3dv_image_type_to_view_type(dst->vk.image_type), .format = dst_format, .subresourceRange = { .aspectMask = aspects, @@ -5399,7 +4074,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkImageViewCreateInfo src_image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(src), 
- .viewType = v3dv_image_type_to_view_type(src->type), + .viewType = v3dv_image_type_to_view_type(src->vk.image_type), .format = src_format, .components = *cswizzle, .subresourceRange = { @@ -5407,7 +4082,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .baseMipLevel = region.srcSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = - src->type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, + src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, .layerCount = 1 }, }; @@ -5457,8 +4132,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, v3dv_render_pass_from_handle(pipeline->pass); can_skip_tlb_load = cmask == full_cmask && - v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer, - pipeline_pass, 0); + v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area, + framebuffer, pipeline_pass, 0); } /* Record blit */ @@ -5481,7 +4156,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * based on the ratio of the depth of the source and the destination * images, picking the coordinate in the middle of each step. */ - if (src->type == VK_IMAGE_TYPE_3D) { + if (src->vk.image_type == VK_IMAGE_TYPE_3D) { tex_coords[4] = !mirror_z ? (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d : @@ -5505,150 +4180,58 @@ fail: return handled; } -void -v3dv_CmdBlitImage(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkImage dstImage, - VkImageLayout dstImageLayout, - uint32_t regionCount, - const VkImageBlit* pRegions, - VkFilter filter) +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2KHR *pBlitImageInfo) { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, src, srcImage); - V3DV_FROM_HANDLE(v3dv_image, dst, dstImage); + V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage); + V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage); /* This command can only happen outside a render pass */ assert(cmd_buffer->state.pass == NULL); assert(cmd_buffer->state.job == NULL); /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */ - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT && - src->samples == VK_SAMPLE_COUNT_1_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT && + src->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */ - assert(!vk_format_is_compressed(dst->vk_format)); + assert(!vk_format_is_compressed(dst->vk.format)); - for (uint32_t i = 0; i < regionCount; i++) { - if (blit_tfu(cmd_buffer, dst, src, &pRegions[i])) + for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) { + if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i])) continue; if (blit_shader(cmd_buffer, - dst, dst->vk_format, - src, src->vk_format, + dst, dst->vk.format, + src, src->vk.format, 0, NULL, - &pRegions[i], filter, true)) { + &pBlitImageInfo->pRegions[i], + pBlitImageInfo->filter, true)) { continue; } unreachable("Unsupported blit operation"); } } -static void -emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, - struct framebuffer_data *framebuffer, - struct v3dv_image *dst, - struct v3dv_image *src, - uint32_t layer_offset, - const VkImageResolve *region) -{ - struct v3dv_cl *cl = &job->indirect; - v3dv_cl_ensure_space(cl, 200, 1); - v3dv_return_if_oom(NULL, job); - - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); - - cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - - assert((src->type != VK_IMAGE_TYPE_3D && - layer_offset < 
region->srcSubresource.layerCount) || - layer_offset < src->extent.depth); - - const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ? - region->srcSubresource.baseArrayLayer + layer_offset : - region->srcOffset.z + layer_offset; - - emit_image_load(cl, framebuffer, src, - region->srcSubresource.aspectMask, - src_layer, - region->srcSubresource.mipLevel, - false, false); - - cl_emit(cl, END_OF_LOADS, end); - - cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - assert((dst->type != VK_IMAGE_TYPE_3D && - layer_offset < region->dstSubresource.layerCount) || - layer_offset < dst->extent.depth); - - const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ? - region->dstSubresource.baseArrayLayer + layer_offset : - region->dstOffset.z + layer_offset; - - emit_image_store(cl, framebuffer, dst, - region->dstSubresource.aspectMask, - dst_layer, - region->dstSubresource.mipLevel, - false, false, true); - - cl_emit(cl, END_OF_TILE_MARKER, end); - - cl_emit(cl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(cl); - } -} - -static void -emit_resolve_image_layer(struct v3dv_job *job, - struct v3dv_image *dst, - struct v3dv_image *src, - struct framebuffer_data *framebuffer, - uint32_t layer, - const VkImageResolve *region) -{ - emit_frame_setup(job, layer, NULL); - emit_resolve_image_layer_per_tile_list(job, framebuffer, - dst, src, layer, region); - emit_supertile_coordinates(job, framebuffer); -} - -static void -emit_resolve_image_rcl(struct v3dv_job *job, - struct v3dv_image *dst, - struct v3dv_image *src, - struct framebuffer_data *framebuffer, - const VkImageResolve *region) -{ - struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); - v3dv_return_if_oom(NULL, job); - - for (int layer = 0; layer < job->frame_tiling.layers; layer++) - emit_resolve_image_layer(job, dst, src, framebuffer, layer, region); - cl_emit(rcl, END_OF_RENDERING, end); -} - static bool resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve *region) + const VkImageResolve2KHR *region) { - if (!can_use_tlb(src, &region->srcOffset, NULL) || - !can_use_tlb(dst, &region->dstOffset, NULL)) { + if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) || + !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) { return false; } - if (!v3dv_format_supports_tlb_resolve(src->format)) + if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format)) return false; const VkFormat fb_format = src->vk.format; uint32_t num_layers; - if (dst->type != VK_IMAGE_TYPE_3D) + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->dstSubresource.layerCount; else num_layers = region->extent.depth; @@ -5659,24 +4242,26 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); uint32_t internal_type, internal_bpp; - get_internal_type_bpp_for_image_aspects(fb_format, - region->srcSubresource.aspectMask, - &internal_type, &internal_bpp); + v3dv_X(cmd_buffer->device,
get_internal_type_bpp_for_image_aspects) (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, + 1, internal_bpp, true); - struct framebuffer_data framebuffer; - setup_framebuffer_data(&framebuffer, fb_format, internal_type, - &job->frame_tiling); + struct v3dv_meta_framebuffer framebuffer; + v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, + internal_type, &job->frame_tiling); - v3dv_job_emit_binning_flush(job); - emit_resolve_image_rcl(job, dst, src, &framebuffer, region); + v3dv_X(job->device, job_emit_binning_flush)(job); + v3dv_X(job->device, meta_emit_resolve_image_rcl)(job, dst, src, + &framebuffer, region); v3dv_cmd_buffer_finish_job(cmd_buffer); return true; @@ -5686,9 +4271,10 @@ static bool resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *dst, struct v3dv_image *src, - const VkImageResolve *region) + const VkImageResolve2KHR *region) { - const VkImageBlit blit_region = { + const VkImageBlit2KHR blit_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR, .srcSubresource = region->srcSubresource, .srcOffsets = { region->srcOffset, @@ -5707,36 +4293,32 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, }, }; return blit_shader(cmd_buffer, - dst, dst->vk_format, - src, src->vk_format, + dst, dst->vk.format, + src, src->vk.format, 0, NULL, &blit_region, VK_FILTER_NEAREST, true); } -void -v3dv_CmdResolveImage(VkCommandBuffer commandBuffer, - VkImage srcImage, - VkImageLayout srcImageLayout, - VkImage dstImage, - VkImageLayout dstImageLayout, - uint32_t regionCount, - const VkImageResolve *pRegions) +VKAPI_ATTR void VKAPI_CALL +v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2KHR *info) + { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - V3DV_FROM_HANDLE(v3dv_image, src, srcImage); - V3DV_FROM_HANDLE(v3dv_image, dst, dstImage); + V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage); + V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage); /* This command can only happen outside a render pass */ assert(cmd_buffer->state.pass == NULL); assert(cmd_buffer->state.job == NULL); - assert(src->samples == VK_SAMPLE_COUNT_4_BIT); - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT); + assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); - for (uint32_t i = 0; i < regionCount; i++) { - if (resolve_image_tlb(cmd_buffer, dst, src, &pRegions[i])) + for (uint32_t i = 0; i < info->regionCount; i++) { + if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) continue; - if (resolve_image_blit(cmd_buffer, dst, src, &pRegions[i])) + if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i])) continue; unreachable("Unsupported multisample resolve operation"); } } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c index 0f03dfe67..1b03c0d79 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c @@ -22,7 +22,6 @@ */ #include "v3dv_private.h" -#include "vk_format_info.h" static uint32_t num_subpass_attachments(const VkSubpassDescription *desc) @@ -34,18 +33,26 @@ num_subpass_attachments(const VkSubpassDescription *desc) } static void -set_use_tlb_resolve(struct v3dv_render_pass_attachment *att) +set_use_tlb_resolve(struct v3dv_device *device, + struct v3dv_render_pass_attachment *att) { - const
struct v3dv_format *format = v3dv_get_format(att->desc.format); - att->use_tlb_resolve = v3dv_format_supports_tlb_resolve(format); + const struct v3dv_format *format = v3dv_X(device, get_format)(att->desc.format); + att->use_tlb_resolve = v3dv_X(device, format_supports_tlb_resolve)(format); } static void -pass_find_subpass_range_for_attachments(struct v3dv_render_pass *pass) +pass_find_subpass_range_for_attachments(struct v3dv_device *device, + struct v3dv_render_pass *pass) { for (uint32_t i = 0; i < pass->attachment_count; i++) { pass->attachments[i].first_subpass = pass->subpass_count - 1; pass->attachments[i].last_subpass = 0; + if (pass->multiview_enabled) { + for (uint32_t j = 0; j < MAX_MULTIVIEW_VIEW_COUNT; j++) { + pass->attachments[i].views[j].first_subpass = pass->subpass_count - 1; + pass->attachments[i].views[j].last_subpass = 0; + } + } } for (uint32_t i = 0; i < pass->subpass_count; i++) { @@ -56,14 +63,26 @@ pass_find_subpass_range_for_attachments(struct v3dv_render_pass *pass) if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; - if (i < pass->attachments[attachment_idx].first_subpass) - pass->attachments[attachment_idx].first_subpass = i; - if (i > pass->attachments[attachment_idx].last_subpass) - pass->attachments[attachment_idx].last_subpass = i; + struct v3dv_render_pass_attachment *att = + &pass->attachments[attachment_idx]; + + if (i < att->first_subpass) + att->first_subpass = i; + if (i > att->last_subpass) + att->last_subpass = i; + + uint32_t view_mask = subpass->view_mask; + while (view_mask) { + uint32_t view_index = u_bit_scan(&view_mask); + if (i < att->views[view_index].first_subpass) + att->views[view_index].first_subpass = i; + if (i > att->views[view_index].last_subpass) + att->views[view_index].last_subpass = i; + } if (subpass->resolve_attachments && subpass->resolve_attachments[j].attachment != VK_ATTACHMENT_UNUSED) { - set_use_tlb_resolve(&pass->attachments[attachment_idx]); + set_use_tlb_resolve(device, att); } } @@ -100,7 +119,7 @@ pass_find_subpass_range_for_attachments(struct v3dv_render_pass *pass) } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateRenderPass(VkDevice _device, const VkRenderPassCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -111,6 +130,10 @@ v3dv_CreateRenderPass(VkDevice _device, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); + const VkRenderPassMultiviewCreateInfo *multiview_info = + vk_find_struct_const(pCreateInfo->pNext, RENDER_PASS_MULTIVIEW_CREATE_INFO); + bool multiview_enabled = multiview_info && multiview_info->subpassCount > 0; + size_t size = sizeof(*pass); size_t subpasses_offset = size; size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); @@ -120,8 +143,9 @@ v3dv_CreateRenderPass(VkDevice _device, pass = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_RENDER_PASS); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + pass->multiview_enabled = multiview_enabled; pass->attachment_count = pCreateInfo->attachmentCount; pass->attachments = (void *) pass + attachments_offset; pass->subpass_count = pCreateInfo->subpassCount; @@ -144,7 +168,7 @@ v3dv_CreateRenderPass(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else { 
pass->subpass_attachments = NULL; @@ -157,6 +181,8 @@ v3dv_CreateRenderPass(VkDevice _device, subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; + if (multiview_enabled) + subpass->view_mask = multiview_info->pViewMasks[i]; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; @@ -175,16 +201,10 @@ v3dv_CreateRenderPass(VkDevice _device, p += desc->colorAttachmentCount; for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - const uint32_t attachment_idx = - desc->pColorAttachments[j].attachment; subpass->color_attachments[j] = (struct v3dv_subpass_attachment) { - .attachment = attachment_idx, + .attachment = desc->pColorAttachments[j].attachment, .layout = desc->pColorAttachments[j].layout, }; - if (attachment_idx != VK_ATTACHMENT_UNUSED) { - VkFormat format = pass->attachments[attachment_idx].desc.format; - subpass->has_srgb_rt |= vk_format_is_srgb(format); - } } } @@ -230,7 +250,7 @@ v3dv_CreateRenderPass(VkDevice _device, } } - pass_find_subpass_range_for_attachments(pass); + pass_find_subpass_range_for_attachments(device, pass); /* FIXME: handle subpass dependencies */ @@ -239,7 +259,7 @@ v3dv_CreateRenderPass(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyRenderPass(VkDevice _device, VkRenderPass _pass, const VkAllocationCallbacks *pAllocator) @@ -255,7 +275,8 @@ v3dv_DestroyRenderPass(VkDevice _device, } static void -subpass_get_granularity(struct v3dv_render_pass *pass, +subpass_get_granularity(struct v3dv_device *device, + struct v3dv_render_pass *pass, uint32_t subpass_idx, VkExtent2D *granularity) { @@ -283,11 +304,11 @@ subpass_get_granularity(struct v3dv_render_pass *pass, continue; const VkAttachmentDescription *desc = &pass->attachments[attachment_idx].desc; - const struct v3dv_format *format = v3dv_get_format(desc->format); + const struct v3dv_format *format = v3dv_X(device, get_format)(desc->format); uint32_t internal_type, internal_bpp; - v3dv_get_internal_type_bpp_for_output_format(format->rt_type, - &internal_type, - &internal_bpp); + v3dv_X(device, get_internal_type_bpp_for_output_format) + (format->rt_type, &internal_type, &internal_bpp); + max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); } @@ -306,12 +327,13 @@ subpass_get_granularity(struct v3dv_render_pass *pass, }; } -void -v3dv_GetRenderAreaGranularity(VkDevice device, +VKAPI_ATTR void VKAPI_CALL +v3dv_GetRenderAreaGranularity(VkDevice _device, VkRenderPass renderPass, VkExtent2D *pGranularity) { V3DV_FROM_HANDLE(v3dv_render_pass, pass, renderPass); + V3DV_FROM_HANDLE(v3dv_device, device, _device); *pGranularity = (VkExtent2D) { .width = 64, @@ -320,7 +342,7 @@ v3dv_GetRenderAreaGranularity(VkDevice device, for (uint32_t i = 0; i < pass->subpass_count; i++) { VkExtent2D sg; - subpass_get_granularity(pass, i, &sg); + subpass_get_granularity(device, pass, i, &sg); pGranularity->width = MIN2(pGranularity->width, sg.width); pGranularity->height = MIN2(pGranularity->height, sg.height); } @@ -348,7 +370,8 @@ v3dv_GetRenderAreaGranularity(VkDevice device, * In that case, we can't flag the area as being aligned. 
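For multiview, every set bit in a subpass's view mask now gets its own first/last-subpass range, tracked with the same min/max update the pass already used per attachment. A self-contained sketch of that walk; the bit-scan helper mirrors Mesa's u_bit_scan():

#include <stdint.h>

static inline uint32_t
bit_scan(uint32_t *mask)
{
   uint32_t i = (uint32_t)__builtin_ctz(*mask);  /* index of lowest set bit */
   *mask &= *mask - 1;                           /* clear it */
   return i;
}

struct view_range { uint32_t first_subpass, last_subpass; };

/* Widen each referenced view's subpass range to include subpass_idx. */
static void
track_view_ranges(struct view_range *views, uint32_t view_mask,
                  uint32_t subpass_idx)
{
   while (view_mask) {
      uint32_t v = bit_scan(&view_mask);
      if (subpass_idx < views[v].first_subpass)
         views[v].first_subpass = subpass_idx;
      if (subpass_idx > views[v].last_subpass)
         views[v].last_subpass = subpass_idx;
   }
}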
*/ bool -v3dv_subpass_area_is_tile_aligned(const VkRect2D *area, +v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, + const VkRect2D *area, struct v3dv_framebuffer *fb, struct v3dv_render_pass *pass, uint32_t subpass_idx) @@ -356,7 +379,7 @@ v3dv_subpass_area_is_tile_aligned(const VkRect2D *area, assert(subpass_idx < pass->subpass_count); VkExtent2D granularity; - subpass_get_granularity(pass, subpass_idx, &granularity); + subpass_get_granularity(device, pass, subpass_idx, &granularity); return area->offset.x % granularity.width == 0 && area->offset.y % granularity.height == 0 && diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c index 35cf35592..daa6c7550 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c @@ -34,10 +34,13 @@ #include "nir/nir_serialize.h" #include "util/u_atomic.h" +#include "util/u_prim.h" +#include "util/os_time.h" #include "vulkan/util/vk_format.h" -#include "broadcom/cle/v3dx_pack.h" +static VkResult +compute_vpm_config(struct v3dv_pipeline *pipeline); void v3dv_print_v3d_key(struct v3d_key *key, @@ -120,11 +123,15 @@ pipeline_free_stages(struct v3dv_device *device, */ destroy_pipeline_stage(device, pipeline->vs, pAllocator); destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); + destroy_pipeline_stage(device, pipeline->gs, pAllocator); + destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator); destroy_pipeline_stage(device, pipeline->fs, pAllocator); destroy_pipeline_stage(device, pipeline->cs, pAllocator); pipeline->vs = NULL; pipeline->vs_bin = NULL; + pipeline->gs = NULL; + pipeline->gs_bin = NULL; pipeline->fs = NULL; pipeline->cs = NULL; } @@ -157,7 +164,7 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, vk_object_free(&device->vk, pAllocator, pipeline); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, const VkAllocationCallbacks *pAllocator) @@ -172,20 +179,27 @@ v3dv_DestroyPipeline(VkDevice _device, } static const struct spirv_to_nir_options default_spirv_options = { - .caps = { false }, + .caps = { + .device_group = true, + .multiview = true, + .subgroup_basic = true, + .variable_pointers = true, + }, .ubo_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = nir_address_format_32bit_index_offset, .phys_ssbo_addr_format = nir_address_format_64bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = false, }; const nir_shader_compiler_options v3dv_nir_options = { - .lower_add_sat = true, + .lower_uadd_sat = true, + .lower_iadd_sat = true, .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, .lower_bitfield_reverse = true, @@ -228,11 +242,16 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_wpos_pntc = true, .lower_rotate = true, .lower_to_scalar = true, + .lower_device_index_to_zero = true, .has_fsub = true, .has_isub = true, .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic * needs to be supported */ .lower_interpolate_at = true, + .max_unroll_iterations = 16, + .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp), + .divergence_analysis_options = + nir_divergence_multiple_workgroup_per_compute_subgroup }; const 
nir_shader_compiler_options * @@ -250,9 +269,7 @@ v3dv_pipeline_get_nir_options(void) }) static void -nir_optimize(nir_shader *nir, - struct v3dv_pipeline_stage *stage, - bool allow_copies) +nir_optimize(nir_shader *nir, bool allow_copies) { bool progress; @@ -276,7 +293,7 @@ nir_optimize(nir_shader *nir, OPT(nir_lower_alu_to_scalar, NULL, NULL); OPT(nir_copy_prop); - OPT(nir_lower_phis_to_scalar); + OPT(nir_lower_phis_to_scalar, false); OPT(nir_copy_prop); OPT(nir_opt_dce); @@ -313,9 +330,29 @@ nir_optimize(nir_shader *nir, } static void -preprocess_nir(nir_shader *nir, - struct v3dv_pipeline_stage *stage) +preprocess_nir(nir_shader *nir) { + /* We have to lower away local variable initializers right before we + * inline functions. That way they get properly initialized at the top + * of the function and not at the top of its caller. + */ + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); + + /* Pick off the single entrypoint that we want */ + foreach_list_typed_safe(nir_function, func, node, &nir->functions) { + if (func->is_entrypoint) + func->name = ralloc_strdup(func, "main"); + else + exec_node_remove(&func->node); + } + assert(exec_list_length(&nir->functions) == 1); + + /* Vulkan uses the separate-shader linking model */ + nir->info.separate_shader = true; + /* Make sure we lower variable initializers on output variables so that * nir_remove_dead_variables below sees the corresponding stores */ @@ -353,7 +390,7 @@ preprocess_nir(nir_shader *nir, nir_var_shader_out | nir_var_system_value | nir_var_mem_shared, NULL); - NIR_PASS_V(nir, nir_propagate_invariant); + NIR_PASS_V(nir, nir_propagate_invariant, false); NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, false); @@ -369,15 +406,14 @@ preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_split_var_copies); NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp); - nir_optimize(nir, stage, true); + nir_optimize(nir, true); NIR_PASS_V(nir, nir_lower_load_const_to_scalar); /* Lower a bunch of stuff */ NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in | - nir_var_shader_out, UINT32_MAX); + NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, 2); @@ -389,49 +425,7 @@ preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_lower_frexp); /* Get rid of split copies */ - nir_optimize(nir, stage, false); -} - -/* FIXME: This is basically the same code at anv, tu and radv. Move to common - * place? 
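nir_optimize() keeps the usual Mesa shape: a fixed list of passes is re-run until none of them reports progress. A sketch of that loop with a few of the passes named in this hunk; the real list is much longer, and NIR_PASS comes from Mesa's internal nir.h:

#include "nir.h"  /* Mesa-internal header */

static void
optimize_until_fixpoint(nir_shader *nir)
{
   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_cse);
   } while (progress);  /* stop once a full sweep changes nothing */
}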
- */ -static struct nir_spirv_specialization* -vk_spec_info_to_nir_spirv(const VkSpecializationInfo *spec_info, - uint32_t *out_num_spec_entries) -{ - if (spec_info == NULL || spec_info->mapEntryCount == 0) - return NULL; - - uint32_t num_spec_entries = spec_info->mapEntryCount; - struct nir_spirv_specialization *spec_entries = calloc(num_spec_entries, sizeof(*spec_entries)); - - for (uint32_t i = 0; i < num_spec_entries; i++) { - VkSpecializationMapEntry entry = spec_info->pMapEntries[i]; - const void *data = spec_info->pData + entry.offset; - assert(data + entry.size <= spec_info->pData + spec_info->dataSize); - - spec_entries[i].id = spec_info->pMapEntries[i].constantID; - switch (entry.size) { - case 8: - spec_entries[i].value.u64 = *(const uint64_t *)data; - break; - case 4: - spec_entries[i].value.u32 = *(const uint32_t *)data; - break; - case 2: - spec_entries[i].value.u16 = *(const uint16_t *)data; - break; - case 1: - spec_entries[i].value.u8 = *(const uint8_t *)data; - break; - default: - assert(!"Invalid spec constant size"); - break; - } - } - - *out_num_spec_entries = num_spec_entries; - return spec_entries; + nir_optimize(nir, false); } static nir_shader * @@ -445,7 +439,7 @@ shader_module_compile_to_nir(struct v3dv_device *device, uint32_t *spirv = (uint32_t *) stage->module->data; assert(stage->module->size % 4 == 0); - if (V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) + if (unlikely(V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV)) v3dv_print_spirv(stage->module->data, stage->module->size, stderr); uint32_t num_spec_entries = 0; @@ -472,37 +466,23 @@ shader_module_compile_to_nir(struct v3dv_device *device, } assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage)); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(stage->stage))) { + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .frag_coord = true, + .point_coord = true, + }; + NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + + if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage( + broadcom_shader_stage_to_gl(stage->stage))))) { fprintf(stderr, "Initial form: %s prog %d NIR:\n", - gl_shader_stage_name(stage->stage), + broadcom_shader_stage_name(stage->stage), stage->program_id); nir_print_shader(nir, stderr); fprintf(stderr, "\n"); } - /* We have to lower away local variable initializers right before we - * inline functions. That way they get properly initialized at the top - * of the function and not at the top of its caller. 
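/* The helper removed above unpacked VkSpecializationInfo entries into
 * typed constants keyed by entry size; the FIXME asked for it to move to
 * shared code, and dropping the local copy here suggests a common Vulkan
 * helper took over. The core of that size dispatch, as a portable
 * standalone sketch (memcpy instead of the original pointer casts):
 */
#include <stdint.h>
#include <string.h>

union spec_value {
   uint64_t u64;
   uint32_t u32;
   uint16_t u16;
   uint8_t  u8;
};

static void
read_spec_constant(union spec_value *out, const void *data, size_t size)
{
   switch (size) {   /* Vulkan specialization constants are 1, 2, 4 or 8 bytes */
   case 8: memcpy(&out->u64, data, 8); break;
   case 4: memcpy(&out->u32, data, 4); break;
   case 2: memcpy(&out->u16, data, 2); break;
   case 1: memcpy(&out->u8,  data, 1); break;
   default: break;   /* invalid per the spec; the original asserted here */
   }
}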
- */ - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); - NIR_PASS_V(nir, nir_lower_returns); - NIR_PASS_V(nir, nir_inline_functions); - NIR_PASS_V(nir, nir_opt_deref); - - /* Pick off the single entrypoint that we want */ - foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func->is_entrypoint) - func->name = ralloc_strdup(func, "main"); - else - exec_node_remove(&func->node); - } - assert(exec_list_length(&nir->functions) == 1); - - /* Vulkan uses the separate-shader linking model */ - nir->info.separate_shader = true; - - preprocess_nir(nir, stage); + preprocess_nir(nir); return nir; } @@ -567,11 +547,46 @@ lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, instr->intrinsic = nir_intrinsic_load_uniform; } +static struct v3dv_descriptor_map* +pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline, + VkDescriptorType desc_type, + gl_shader_stage gl_stage, + bool is_sampler) +{ + enum broadcom_shader_stage broadcom_stage = + gl_shader_stage_to_broadcom(gl_stage); + + assert(pipeline->shared_data && + pipeline->shared_data->maps[broadcom_stage]); + + switch(desc_type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + return &pipeline->shared_data->maps[broadcom_stage]->sampler_map; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + return &pipeline->shared_data->maps[broadcom_stage]->texture_map; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + return is_sampler ? + &pipeline->shared_data->maps[broadcom_stage]->sampler_map : + &pipeline->shared_data->maps[broadcom_stage]->texture_map; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + return &pipeline->shared_data->maps[broadcom_stage]->ubo_map; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map; + default: + unreachable("Descriptor type unknown or not having a descriptor map"); + } +} + /* Gathers info from the intrinsic (set and binding) and then lowers it so it * could be used by the v3d_compiler */ static void lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -585,13 +600,13 @@ lower_vulkan_resource_index(nir_builder *b, struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; unsigned index = 0; + const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); - switch (nir_intrinsic_desc_type(instr)) { + switch (desc_type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { struct v3dv_descriptor_map *descriptor_map = - nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ? 
- &pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map; + pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false); if (!const_val) unreachable("non-constant vulkan_resource_index array index"); @@ -601,7 +616,7 @@ lower_vulkan_resource_index(nir_builder *b, binding_layout->array_size, 32 /* return_size: doesn't really apply for this case */); - if (nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { /* skip index 0 which is used for push constants */ index++; } @@ -614,13 +629,11 @@ lower_vulkan_resource_index(nir_builder *b, } /* Since we use the deref pass, both vulkan_resource_index and - * vulkan_load_descriptor returns a vec2. But for the index the backend - * expect just one scalar (like with get_ssbo_size), so lets return here - * just it. Then on load_descriptor we would recreate the vec2, keeping the - * second component (unused right now) to zero. + * vulkan_load_descriptor return a vec2 providing an index and + * offset. Our backend compiler only cares about the index part. */ nir_ssa_def_rewrite_uses(&instr->dest.ssa, - nir_imm_int(b, index)); + nir_imm_ivec2(b, index, 0)); nir_instr_remove(&instr->instr); } @@ -629,6 +642,7 @@ lower_vulkan_resource_index(nir_builder *b, */ static uint8_t lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -704,11 +718,17 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, deref->var->data.index + base_index : base_index; - uint8_t return_size = relaxed_precision || instr->is_shadow ? 16 : 32; + uint8_t return_size; + if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT)) + return_size = 16; + else if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT)) + return_size = 32; + else + return_size = relaxed_precision || instr->is_shadow ? 16 : 32; - struct v3dv_descriptor_map *map = is_sampler ? 
- &pipeline->shared_data->sampler_map : - &pipeline->shared_data->texture_map; + struct v3dv_descriptor_map *map = + pipeline_get_descriptor_map(pipeline, binding_layout->type, + shader->info.stage, is_sampler); int desc_index = descriptor_map_add(map, deref->var->data.descriptor_set, @@ -727,6 +747,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, static bool lower_sampler(nir_builder *b, nir_tex_instr *instr, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -736,13 +757,14 @@ lower_sampler(nir_builder *b, nir_tex_instr *instr, nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); if (texture_idx >= 0) - return_size = lower_tex_src_to_offset(b, instr, texture_idx, pipeline, layout); + return_size = lower_tex_src_to_offset(b, instr, texture_idx, shader, + pipeline, layout); int sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); if (sampler_idx >= 0) - lower_tex_src_to_offset(b, instr, sampler_idx, pipeline, layout); + lower_tex_src_to_offset(b, instr, sampler_idx, shader, pipeline, layout); if (texture_idx < 0 && sampler_idx < 0) return false; @@ -762,6 +784,7 @@ lower_sampler(nir_builder *b, nir_tex_instr *instr, static void lower_image_deref(nir_builder *b, nir_intrinsic_instr *instr, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -811,8 +834,12 @@ lower_image_deref(nir_builder *b, assert(binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE || binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); + struct v3dv_descriptor_map *map = + pipeline_get_descriptor_map(pipeline, binding_layout->type, + shader->info.stage, false); + int desc_index = - descriptor_map_add(&pipeline->shared_data->texture_map, + descriptor_map_add(map, deref->var->data.descriptor_set, deref->var->data.binding, array_index, @@ -832,6 +859,7 @@ lower_image_deref(nir_builder *b, static bool lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -850,16 +878,14 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, return true; case nir_intrinsic_vulkan_resource_index: - lower_vulkan_resource_index(b, instr, pipeline, layout); + lower_vulkan_resource_index(b, instr, shader, pipeline, layout); return true; case nir_intrinsic_load_vulkan_descriptor: { - /* We are not using it, as loading the descriptor happens as part of the - * load/store instruction, so the simpler is just doing a no-op. We just - * lower the desc back to a vec2, as it is what load_ssbo/ubo expects. + /* Loading the descriptor happens as part of load/store instructions, + * so for us this is a no-op. 
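/* The return_size selection earlier in this hunk gives the V3D_DEBUG
 * overrides priority over the per-instruction heuristic (16-bit TMU
 * returns for relaxed precision or shadow comparisons, 32-bit otherwise).
 * The same decision in isolation (the debug bit values are illustrative):
 */
#include <stdbool.h>
#include <stdint.h>

#define DBG_TMU_16BIT (1u << 0)
#define DBG_TMU_32BIT (1u << 1)

static uint8_t
tmu_return_size(uint32_t debug, bool relaxed_precision, bool is_shadow)
{
   if (debug & DBG_TMU_16BIT)
      return 16;   /* force half-float returns everywhere */
   if (debug & DBG_TMU_32BIT)
      return 32;   /* force full-precision returns everywhere */
   return (relaxed_precision || is_shadow) ? 16 : 32;
}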
*/ - nir_ssa_def *desc = nir_vec2(b, instr->src[0].ssa, nir_imm_int(b, 0)); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, desc); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa); nir_instr_remove(&instr->instr); return true; } @@ -878,7 +904,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: - lower_image_deref(b, instr, pipeline, layout); + lower_image_deref(b, instr, shader, pipeline, layout); return true; default: @@ -888,6 +914,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, static bool lower_impl(nir_function_impl *impl, + nir_shader *shader, struct v3dv_pipeline *pipeline, const struct v3dv_pipeline_layout *layout) { @@ -901,11 +928,12 @@ lower_impl(nir_function_impl *impl, switch (instr->type) { case nir_instr_type_tex: progress |= - lower_sampler(&b, nir_instr_as_tex(instr), pipeline, layout); + lower_sampler(&b, nir_instr_as_tex(instr), shader, pipeline, layout); break; case nir_instr_type_intrinsic: progress |= - lower_intrinsic(&b, nir_instr_as_intrinsic(instr), pipeline, layout); + lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, + pipeline, layout); break; default: break; @@ -925,7 +953,7 @@ lower_pipeline_layout_info(nir_shader *shader, nir_foreach_function(function, shader) { if (function->impl) - progress |= lower_impl(function->impl, pipeline, layout); + progress |= lower_impl(function->impl, shader, pipeline, layout); } return progress; @@ -950,6 +978,18 @@ lower_fs_io(nir_shader *nir) } static void +lower_gs_io(struct nir_shader *nir) +{ + NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); + + nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, + MESA_SHADER_GEOMETRY); + + nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, + MESA_SHADER_GEOMETRY); +} + +static void lower_vs_io(struct nir_shader *nir) { NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); @@ -982,13 +1022,16 @@ pipeline_populate_v3d_key(struct v3d_key *key, uint32_t ucp_enables, bool robust_buffer_access) { + assert(p_stage->pipeline->shared_data && + p_stage->pipeline->shared_data->maps[p_stage->stage]); + /* The following values are default values used at pipeline create. We use * there 32 bit as default return size. */ struct v3dv_descriptor_map *sampler_map = - &p_stage->pipeline->shared_data->sampler_map; + &p_stage->pipeline->shared_data->maps[p_stage->stage]->sampler_map; struct v3dv_descriptor_map *texture_map = - &p_stage->pipeline->shared_data->texture_map; + &p_stage->pipeline->shared_data->maps[p_stage->stage]->texture_map; key->num_tex_used = texture_map->num_desc; assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS); @@ -1010,12 +1053,23 @@ pipeline_populate_v3d_key(struct v3d_key *key, key->sampler[sampler_idx].return_size == 32 ? 4 : 2; } - - - /* default value. 
Would be override on the vs/gs populate methods when GS - * gets supported - */ - key->is_last_geometry_stage = true; + switch (p_stage->stage) { + case BROADCOM_SHADER_VERTEX: + case BROADCOM_SHADER_VERTEX_BIN: + key->is_last_geometry_stage = p_stage->pipeline->gs == NULL; + break; + case BROADCOM_SHADER_GEOMETRY: + case BROADCOM_SHADER_GEOMETRY_BIN: + /* FIXME: while we don't implement tessellation shaders */ + key->is_last_geometry_stage = true; + break; + case BROADCOM_SHADER_FRAGMENT: + case BROADCOM_SHADER_COMPUTE: + key->is_last_geometry_stage = false; + break; + default: + unreachable("unsupported shader stage"); + } /* Vulkan doesn't have fixed function state for user clip planes. Instead, * shaders can write to gl_ClipDistance[], in which case the SPIR-V compiler @@ -1073,8 +1127,11 @@ static void pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct v3dv_pipeline_stage *p_stage, + bool has_geometry_shader, uint32_t ucp_enables) { + assert(p_stage->stage == BROADCOM_SHADER_FRAGMENT); + memset(key, 0, sizeof(*key)); const bool rba = p_stage->pipeline->device->features.robustBufferAccess; @@ -1087,9 +1144,11 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, key->is_points = (topology == PIPE_PRIM_POINTS); key->is_lines = (topology >= PIPE_PRIM_LINES && topology <= PIPE_PRIM_LINE_STRIP); + key->has_gs = has_geometry_shader; const VkPipelineColorBlendStateCreateInfo *cb_info = - pCreateInfo->pColorBlendState; + !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ? + pCreateInfo->pColorBlendState : NULL; key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ? vk_to_pipe_logicop[cb_info->logicOp] : @@ -1139,7 +1198,8 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, */ if (key->logicop_func != PIPE_LOGICOP_COPY) { key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format); + key->color_fmt[i].swizzle = + v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format); } const struct util_format_description *desc = @@ -1173,43 +1233,140 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, } static void -pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, +setup_stage_outputs_from_next_stage_inputs( + uint8_t next_stage_num_inputs, + struct v3d_varying_slot *next_stage_input_slots, + uint8_t *num_used_outputs, + struct v3d_varying_slot *used_output_slots, + uint32_t size_of_used_output_slots) +{ + *num_used_outputs = next_stage_num_inputs; + memcpy(used_output_slots, next_stage_input_slots, size_of_used_output_slots); +} + +static void +pipeline_populate_v3d_gs_key(struct v3d_gs_key *key, const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct v3dv_pipeline_stage *p_stage) { + assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY || + p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN); + memset(key, 0, sizeof(*key)); const bool rba = p_stage->pipeline->device->features.robustBufferAccess; pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); - /* Vulkan specifies a point size per vertex, so true for if the prim are - * points, like on ES2) - */ - const VkPipelineInputAssemblyStateCreateInfo *ia_info = - pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + struct v3dv_pipeline *pipeline = p_stage->pipeline; - /* FIXME: not enough to being PRIM_POINTS, on gallium the full check is - * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ - key->per_vertex_point_size = (topology == 
PIPE_PRIM_POINTS); + key->per_vertex_point_size = + p_stage->nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ); + + key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); - key->is_coord = p_stage->stage == BROADCOM_SHADER_VERTEX_BIN; + assert(key->base.is_last_geometry_stage); if (key->is_coord) { - /* The only output varying on coord shaders are for transform + /* Output varyings in the last binning shader are only used for transform * feedback. Set to 0 as VK_EXT_transform_feedback is not supported. */ key->num_used_outputs = 0; } else { - struct v3dv_pipeline *pipeline = p_stage->pipeline; struct v3dv_shader_variant *fs_variant = pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; - key->num_used_outputs = fs_variant->prog_data.fs->num_inputs; - STATIC_ASSERT(sizeof(key->used_outputs) == sizeof(fs_variant->prog_data.fs->input_slots)); - memcpy(key->used_outputs, fs_variant->prog_data.fs->input_slots, - sizeof(key->used_outputs)); + + setup_stage_outputs_from_next_stage_inputs( + fs_variant->prog_data.fs->num_inputs, + fs_variant->prog_data.fs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } +} + +static void +pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct v3dv_pipeline_stage *p_stage) +{ + assert(p_stage->stage == BROADCOM_SHADER_VERTEX || + p_stage->stage == BROADCOM_SHADER_VERTEX_BIN); + + memset(key, 0, sizeof(*key)); + + const bool rba = p_stage->pipeline->device->features.robustBufferAccess; + pipeline_populate_v3d_key(&key->base, p_stage, 0, rba); + + struct v3dv_pipeline *pipeline = p_stage->pipeline; + + /* Vulkan specifies a point size per vertex, so true for if the prim are + * points, like on ES2) + */ + const VkPipelineInputAssemblyStateCreateInfo *ia_info = + pCreateInfo->pInputAssemblyState; + uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + + /* FIXME: PRIM_POINTS is not enough, in gallium the full check is + * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ + key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); + + key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); + + if (key->is_coord) { /* Binning VS*/ + if (key->base.is_last_geometry_stage) { + /* Output varyings in the last binning shader are only used for + * transform feedback. Set to 0 as VK_EXT_transform_feedback is not + * supported. 
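/* Note the difference between the two keys above: the GS key derives
 * per_vertex_point_size from whether the shader actually writes
 * gl_PointSize, while the VS key still infers it from a point-list
 * topology. The GS test is a single bit in the outputs_written mask;
 * standalone sketch (the slot value is illustrative, the real one comes
 * from Mesa's shader_enums.h):
 */
#include <stdbool.h>
#include <stdint.h>

#define SLOT_PSIZ 1   /* gl_PointSize varying slot */

static bool
writes_point_size(uint64_t outputs_written)
{
   return (outputs_written & (UINT64_C(1) << SLOT_PSIZ)) != 0;
}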
+ */ + key->num_used_outputs = 0; + } else { + /* Linking against GS binning program */ + assert(pipeline->gs); + struct v3dv_shader_variant *gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(gs_bin_variant->prog_data.gs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + gs_bin_variant->prog_data.gs->num_inputs, + gs_bin_variant->prog_data.gs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } + } else { /* Render VS */ + if (pipeline->gs) { + /* Linking against GS render program */ + struct v3dv_shader_variant *gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(gs_variant->prog_data.gs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + gs_variant->prog_data.gs->num_inputs, + gs_variant->prog_data.gs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } else { + /* Linking against FS program */ + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(fs_variant->prog_data.fs->input_slots)); + + setup_stage_outputs_from_next_stage_inputs( + fs_variant->prog_data.fs->num_inputs, + fs_variant->prog_data.fs->input_slots, + &key->num_used_outputs, + key->used_outputs, + sizeof(key->used_outputs)); + } } const VkPipelineVertexInputStateCreateInfo *vi_info = @@ -1223,16 +1380,16 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, } } -/* - * Creates the pipeline_stage for the coordinate shader. Initially a clone of - * the vs pipeline_stage, with is_coord to true +/** + * Creates the initial form of the pipeline stage for a binning shader by + * cloning the render shader and flagging it as a coordinate shader. * * Returns NULL if it was not able to allocate the object, so it should be * handled as a VK_ERROR_OUT_OF_HOST_MEMORY error. */ -static struct v3dv_pipeline_stage* -pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src, - const VkAllocationCallbacks *pAllocator) +static struct v3dv_pipeline_stage * +pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src, + const VkAllocationCallbacks *pAllocator) { struct v3dv_device *device = src->pipeline->device; @@ -1243,13 +1400,25 @@ pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src, if (p_stage == NULL) return NULL; + assert(src->stage == BROADCOM_SHADER_VERTEX || + src->stage == BROADCOM_SHADER_GEOMETRY); + + enum broadcom_shader_stage bin_stage = + src->stage == BROADCOM_SHADER_VERTEX ? + BROADCOM_SHADER_VERTEX_BIN : + BROADCOM_SHADER_GEOMETRY_BIN; + p_stage->pipeline = src->pipeline; - assert(src->stage == BROADCOM_SHADER_VERTEX); - p_stage->stage = BROADCOM_SHADER_VERTEX_BIN; + p_stage->stage = bin_stage; p_stage->entrypoint = src->entrypoint; p_stage->module = src->module; - p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL; + /* For binning shaders we will clone the NIR code from the corresponding + * render shader later, when we call pipeline_compile_xxx_shader. 
This way + * we only have to run the relevant NIR lowerings once for render shaders + */ + p_stage->nir = NULL; p_stage->spec_info = src->spec_info; + p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; memcpy(p_stage->shader_sha1, src->shader_sha1, 20); return p_stage; @@ -1314,14 +1483,18 @@ pipeline_hash_graphics(const struct v3dv_pipeline *pipeline, struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); - /* We need to include both on the sha1 key as one could affect the other - * during linking (like if vertex output are constants, then the - * fragment shader would load_const intead of load_input). An - * alternative would be to use the serialized nir, but that seems like - * an overkill + /* We need to include all shader stages in the sha1 key as linking may modify + * the shader code in any stage. An alternative would be to use the + * serialized NIR, but that seems like an overkill. */ _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1, sizeof(pipeline->vs->shader_sha1)); + + if (pipeline->gs) { + _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1, + sizeof(pipeline->gs->shader_sha1)); + } + _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1, sizeof(pipeline->fs->shader_sha1)); @@ -1397,7 +1570,7 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline) */ struct v3dv_shader_variant * v3dv_shader_variant_create(struct v3dv_device *device, - broadcom_shader_stage stage, + enum broadcom_shader_stage stage, struct v3d_prog_data *prog_data, uint32_t prog_data_size, uint32_t assembly_offset, @@ -1441,22 +1614,25 @@ v3dv_shader_variant_create(struct v3dv_device *device, * VK_ERROR_UNKNOWN, even if we know that the problem was a compiler * error. */ -static struct v3dv_shader_variant* +static struct v3dv_shader_variant * pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, struct v3d_key *key, size_t key_size, const VkAllocationCallbacks *pAllocator, VkResult *out_vk_result) { + int64_t stage_start = os_time_get_nano(); + struct v3dv_pipeline *pipeline = p_stage->pipeline; struct v3dv_physical_device *physical_device = &pipeline->device->instance->physicalDevice; const struct v3d_compiler *compiler = physical_device->compiler; - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(p_stage->stage))) { + if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage + (broadcom_shader_stage_to_gl(p_stage->stage))))) { fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n", - gl_shader_stage_name(p_stage->stage), + broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); nir_print_shader(p_stage->nir, stderr); fprintf(stderr, "\n"); @@ -1495,6 +1671,8 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, * we finish it, so let's not worry about freeing the nir here. 
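/* The os_time_get_nano() bracketing added throughout this hunk implements
 * VK_EXT_pipeline_creation_feedback: each stage accumulates the
 * wall-clock time spent on it, and binning-variant time is later folded
 * into its render stage. The pattern reduced to essentials (struct and
 * callback types are illustrative):
 */
#include <stdint.h>

struct stage_feedback {
   uint64_t duration_ns;
};

static void
run_timed(struct stage_feedback *fb, int64_t (*now_ns)(void),
          void (*work)(void *), void *ctx)
{
   int64_t start = now_ns();
   work(ctx);                            /* compile, lower, etc. */
   fb->duration_ns += now_ns() - start;  /* += so multiple phases sum up */
}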
*/ + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return variant; } @@ -1525,7 +1703,7 @@ st_nir_opts(nir_shader *nir) if (nir->options->lower_to_scalar) { NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar); + NIR_PASS_V(nir, nir_lower_phis_to_scalar, false); } NIR_PASS_V(nir, nir_lower_alu); @@ -1594,6 +1772,11 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_stage *p_stage, struct v3dv_pipeline_layout *layout) { + int64_t stage_start = os_time_get_nano(); + + assert(pipeline->shared_data && + pipeline->shared_data->maps[p_stage->stage]); + nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir)); /* We add this because we need a valid sampler for nir_lower_tex to do @@ -1604,17 +1787,19 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, * another for the case we need a 32bit return size. */ UNUSED unsigned index = - descriptor_map_add(&pipeline->shared_data->sampler_map, + descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, -1, -1, -1, 0, 16); assert(index == V3DV_NO_SAMPLER_16BIT_IDX); index = - descriptor_map_add(&pipeline->shared_data->sampler_map, + descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, -2, -2, -2, 0, 32); assert(index == V3DV_NO_SAMPLER_32BIT_IDX); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout); + + p_stage->feedback.duration += os_time_get_nano() - stage_start; } /** @@ -1638,11 +1823,13 @@ get_ucp_enable_mask(struct v3dv_pipeline_stage *p_stage) return 0; } -static nir_shader* +static nir_shader * pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache) { + int64_t stage_start = os_time_get_nano(); + nir_shader *nir = NULL; nir = v3dv_pipeline_cache_search_for_nir(pipeline, cache, @@ -1651,6 +1838,14 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); + + /* A NIR cach hit doesn't avoid the large majority of pipeline stage + * creation so the cache hit is not recorded in the pipeline feedback + * flags + */ + + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return nir; } @@ -1670,6 +1865,9 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, v3dv_pipeline_cache_upload_nir(pipeline, default_cache, nir, p_stage->shader_sha1); } + + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return nir; } @@ -1706,13 +1904,6 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - struct v3dv_pipeline_stage *p_stage = pipeline->vs; - - /* Right now we only support pipelines with both vertex and fragment - * shader. 
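/* pipeline_stage_get_nir() above is a lookup-or-build cache keyed by a
 * SHA-1 of the shader and its compile state: search first, and on a miss
 * run the SPIR-V front-end and upload the result so later pipelines can
 * reuse it. The shape of that flow (callback types are illustrative):
 */
static void *
cache_get_or_build(void *cache, const unsigned char sha1[20],
                   void *(*lookup)(void *cache, const unsigned char *key),
                   void *(*build)(void *ctx), void *ctx,
                   void (*upload)(void *cache, const unsigned char *key, void *obj))
{
   void *obj = lookup(cache, sha1);
   if (obj)
      return obj;                /* hit: skip SPIR-V -> NIR entirely */
   obj = build(ctx);             /* miss: run the full front-end */
   if (obj)
      upload(cache, sha1, obj);  /* publish for later pipelines */
   return obj;
}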
- */ - assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - assert(pipeline->vs_bin != NULL); if (pipeline->vs_bin->nir == NULL) { assert(pipeline->vs->nir); @@ -1728,8 +1919,7 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, if (vk_result != VK_SUCCESS) return vk_result; - p_stage = pipeline->vs_bin; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage); + pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin); pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] = pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key), pAllocator, &vk_result); @@ -1738,6 +1928,36 @@ pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, } static VkResult +pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline, + const VkAllocationCallbacks *pAllocator, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + assert(pipeline->gs); + + assert(pipeline->gs_bin != NULL); + if (pipeline->gs_bin->nir == NULL) { + assert(pipeline->gs->nir); + pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir); + } + + VkResult vk_result; + struct v3d_gs_key key; + pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs); + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] = + pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key), + pAllocator, &vk_result); + if (vk_result != VK_SUCCESS) + return vk_result; + + pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin); + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] = + pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key), + pAllocator, &vk_result); + + return vk_result; +} + +static VkResult pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, const VkAllocationCallbacks *pAllocator, const VkGraphicsPipelineCreateInfo *pCreateInfo) @@ -1749,6 +1969,7 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, struct v3d_fs_key key; pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage, + pipeline->gs != NULL, get_ucp_enable_mask(pipeline->vs)); VkResult vk_result; @@ -1768,19 +1989,20 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, key->robust_buffer_access = pipeline->device->features.robustBufferAccess; + const bool raster_enabled = + !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; + const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; key->topology = vk_to_pipe_prim_type[ia_info->topology]; const VkPipelineColorBlendStateCreateInfo *cb_info = - pCreateInfo->pColorBlendState; + raster_enabled ? pCreateInfo->pColorBlendState : NULL; + key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ? vk_to_pipe_logicop[cb_info->logicOp] : PIPE_LOGICOP_COPY; - const bool raster_enabled = - !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; - /* Multisample rasterization state must be ignored if rasterization * is disabled. 
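/* The rasterizerDiscardEnable guard added above follows the Vulkan rule
 * that pColorBlendState (and several other create-info pointers) must be
 * ignored when rasterization is discarded, so they may be garbage. The
 * driver therefore NULLs them once up front rather than checking at each
 * use. In isolation:
 */
#include <stdbool.h>
#include <stddef.h>

static const void *
state_if_rasterizing(bool rasterizer_discard_enable, const void *state)
{
   /* never dereference state when rasterization is discarded: the
    * application is allowed to pass an invalid pointer there */
   return rasterizer_discard_enable ? NULL : state;
}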
*/ @@ -1817,7 +2039,8 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, */ if (key->logicop_func != PIPE_LOGICOP_COPY) { key->color_fmt[i].format = fb_pipe_format; - key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format); + key->color_fmt[i].swizzle = v3dv_get_format_swizzle(pipeline->device, + fb_format); } const struct util_format_description *desc = @@ -1839,6 +2062,8 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); } + assert(pipeline->subpass); + key->has_multiview = pipeline->subpass->view_mask != 0; } static void @@ -1858,25 +2083,285 @@ pipeline_populate_compute_key(struct v3dv_pipeline *pipeline, static struct v3dv_pipeline_shared_data * v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], - struct v3dv_device *device) + struct v3dv_pipeline *pipeline, + bool is_graphics_pipeline) { - size_t size = sizeof(struct v3dv_pipeline_shared_data); /* We create new_entry using the device alloc. Right now shared_data is ref * and unref by both the pipeline and the pipeline cache, so we can't * ensure that the cache or pipeline alloc will be available on the last * unref. */ struct v3dv_pipeline_shared_data *new_entry = - vk_zalloc2(&device->vk.alloc, NULL, size, 8, + vk_zalloc2(&pipeline->device->vk.alloc, NULL, + sizeof(struct v3dv_pipeline_shared_data), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (new_entry == NULL) return NULL; + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + /* We don't need specific descriptor maps for binning stages we use the + * map for the render stage. + */ + if (broadcom_shader_stage_is_binning(stage)) + continue; + + if ((is_graphics_pipeline && stage == BROADCOM_SHADER_COMPUTE) || + (!is_graphics_pipeline && stage != BROADCOM_SHADER_COMPUTE)) { + continue; + } + + if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) { + /* We always inject a custom GS if we have multiview */ + if (!pipeline->subpass->view_mask) + continue; + } + + struct v3dv_descriptor_maps *new_maps = + vk_zalloc2(&pipeline->device->vk.alloc, NULL, + sizeof(struct v3dv_descriptor_maps), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (new_maps == NULL) + goto fail; + + new_entry->maps[stage] = new_maps; + } + + new_entry->maps[BROADCOM_SHADER_VERTEX_BIN] = + new_entry->maps[BROADCOM_SHADER_VERTEX]; + + new_entry->maps[BROADCOM_SHADER_GEOMETRY_BIN] = + new_entry->maps[BROADCOM_SHADER_GEOMETRY]; + new_entry->ref_cnt = 1; memcpy(new_entry->sha1_key, sha1_key, 20); return new_entry; + +fail: + if (new_entry != NULL) { + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (new_entry->maps[stage] != NULL) + vk_free(&pipeline->device->vk.alloc, new_entry->maps[stage]); + } + } + + vk_free(&pipeline->device->vk.alloc, new_entry); + + return NULL; +} + +static void +write_creation_feedback(struct v3dv_pipeline *pipeline, + const void *next, + const VkPipelineCreationFeedbackEXT *pipeline_feedback, + uint32_t stage_count, + const VkPipelineShaderStageCreateInfo *stages) +{ + const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback = + vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); + + if (create_feedback) { + typed_memcpy(create_feedback->pPipelineCreationFeedback, + pipeline_feedback, + 1); + + assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount); + + for (uint32_t i = 0; i < stage_count; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage); + switch (s) { + case 
MESA_SHADER_VERTEX: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->vs->feedback; + + create_feedback->pPipelineStageCreationFeedbacks[i].duration += + pipeline->vs_bin->feedback.duration; + break; + + case MESA_SHADER_GEOMETRY: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->gs->feedback; + + create_feedback->pPipelineStageCreationFeedbacks[i].duration += + pipeline->gs_bin->feedback.duration; + break; + + case MESA_SHADER_FRAGMENT: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->fs->feedback; + break; + + case MESA_SHADER_COMPUTE: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->cs->feedback; + break; + + default: + unreachable("not supported shader stage"); + } + } + } +} + +static uint32_t +multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) +{ + switch (pipeline->topology) { + case PIPE_PRIM_POINTS: + return GL_POINTS; + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + return GL_LINES; + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + return GL_TRIANGLES; + default: + /* Since we don't allow GS with multiview, we can only see non-adjacency + * primitives. + */ + unreachable("Unexpected pipeline primitive type"); + } +} + +static uint32_t +multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline) +{ + switch (pipeline->topology) { + case PIPE_PRIM_POINTS: + return GL_POINTS; + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + return GL_LINE_STRIP; + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + return GL_TRIANGLE_STRIP; + default: + /* Since we don't allow GS with multiview, we can only see non-adjacency + * primitives. + */ + unreachable("Unexpected pipeline primitive type"); + } +} + +static bool +pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache, + const VkAllocationCallbacks *pAllocator) +{ + /* Create the passthrough GS from the VS output interface */ + pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); + nir_shader *vs_nir = pipeline->vs->nir; + + const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options(); + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options, + "multiview broadcast gs"); + nir_shader *nir = b.shader; + nir->info.inputs_read = vs_nir->info.outputs_written; + nir->info.outputs_written = vs_nir->info.outputs_written | + (1ull << VARYING_SLOT_LAYER); + + uint32_t vertex_count = u_vertices_per_prim(pipeline->topology); + nir->info.gs.input_primitive = + multiview_gs_input_primitive_from_pipeline(pipeline); + nir->info.gs.output_primitive = + multiview_gs_output_primitive_from_pipeline(pipeline); + nir->info.gs.vertices_in = vertex_count; + nir->info.gs.vertices_out = nir->info.gs.vertices_in; + nir->info.gs.invocations = 1; + nir->info.gs.active_stream_mask = 0x1; + + /* Make a list of GS input/output variables from the VS outputs */ + nir_variable *in_vars[100]; + nir_variable *out_vars[100]; + uint32_t var_count = 0; + nir_foreach_shader_out_variable(out_vs_var, vs_nir) { + char name[8]; + snprintf(name, ARRAY_SIZE(name), "in_%d", var_count); + + in_vars[var_count] = + nir_variable_create(nir, nir_var_shader_in, + glsl_array_type(out_vs_var->type, vertex_count, 0), + name); + in_vars[var_count]->data.location = out_vs_var->data.location; + in_vars[var_count]->data.location_frac = out_vs_var->data.location_frac; + 
in_vars[var_count]->data.interpolation = out_vs_var->data.interpolation; + + snprintf(name, ARRAY_SIZE(name), "out_%d", var_count); + out_vars[var_count] = + nir_variable_create(nir, nir_var_shader_out, out_vs_var->type, name); + out_vars[var_count]->data.location = out_vs_var->data.location; + out_vars[var_count]->data.interpolation = out_vs_var->data.interpolation; + + var_count++; + } + + /* Add the gl_Layer output variable */ + nir_variable *out_layer = + nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), + "out_Layer"); + out_layer->data.location = VARYING_SLOT_LAYER; + + /* Get the view index value that we will write to gl_Layer */ + nir_ssa_def *layer = + nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32); + + /* Emit all output vertices */ + for (uint32_t vi = 0; vi < vertex_count; vi++) { + /* Emit all output varyings */ + for (uint32_t i = 0; i < var_count; i++) { + nir_deref_instr *in_value = + nir_build_deref_array_imm(&b, nir_build_deref_var(&b, in_vars[i]), vi); + nir_copy_deref(&b, nir_build_deref_var(&b, out_vars[i]), in_value); + } + + /* Emit gl_Layer write */ + nir_store_var(&b, out_layer, layer, 0x1); + + nir_emit_vertex(&b, 0); + } + nir_end_primitive(&b, 0); + + /* Make sure we run our pre-process NIR passes so we produce NIR compatible + * with what we expect from SPIR-V modules. + */ + preprocess_nir(nir); + + /* Attach the geometry shader to the pipeline */ + struct v3dv_device *device = pipeline->device; + struct v3dv_physical_device *physical_device = + &device->instance->physicalDevice; + + struct v3dv_pipeline_stage *p_stage = + vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (p_stage == NULL) { + ralloc_free(nir); + return false; + } + + p_stage->pipeline = pipeline; + p_stage->stage = BROADCOM_SHADER_GEOMETRY; + p_stage->entrypoint = "main"; + p_stage->module = 0; + p_stage->nir = nir; + pipeline_compute_sha1_from_nir(p_stage->nir, p_stage->shader_sha1); + p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); + + pipeline->has_gs = true; + pipeline->gs = p_stage; + pipeline->active_stages |= MESA_SHADER_GEOMETRY; + + pipeline->gs_bin = + pipeline_stage_create_binning(pipeline->gs, pAllocator); + if (pipeline->gs_bin == NULL) + return false; + + return true; } /* @@ -1895,6 +2380,11 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator) { + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + }; + int64_t pipeline_start = os_time_get_nano(); + struct v3dv_device *device = pipeline->device; struct v3dv_physical_device *physical_device = &device->instance->physicalDevice; @@ -1945,14 +2435,24 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, case MESA_SHADER_VERTEX: pipeline->vs = p_stage; pipeline->vs_bin = - pipeline_stage_create_vs_bin(pipeline->vs, pAllocator); + pipeline_stage_create_binning(pipeline->vs, pAllocator); if (pipeline->vs_bin == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; + break; + case MESA_SHADER_GEOMETRY: + pipeline->has_gs = true; + pipeline->gs = p_stage; + pipeline->gs_bin = + pipeline_stage_create_binning(pipeline->gs, pAllocator); + if (pipeline->gs_bin == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; break; + case MESA_SHADER_FRAGMENT: pipeline->fs = p_stage; break; + default: unreachable("not supported shader stage"); } @@ -1984,39 +2484,85 @@ 
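/* Functionally, the NIR built by pipeline_add_multiview_gs() above is a
 * passthrough geometry shader that replays each input primitive and tags
 * it with the view's layer. Sketched as GLSL for a triangle-list
 * pipeline (the real shader is assembled directly with nir_builder, and
 * the layouts follow the pipeline topology):
 *
 *    layout(triangles) in;
 *    layout(triangle_strip, max_vertices = 3) out;
 *
 *    void main() {
 *       for (int i = 0; i < 3; i++) {
 *          out_attr = in_attr[i];     // copy each captured VS output
 *          gl_Layer = gl_ViewIndex;   // broadcast the draw to this view's layer
 *          EmitVertex();
 *       }
 *       EndPrimitive();
 *    }
 */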
pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline->active_stages |= MESA_SHADER_FRAGMENT; } - /* Now we will try to get the variants from the pipeline cache */ + /* If multiview is enabled, we inject a custom passthrough geometry shader + * to broadcast draw calls to the appropriate views. + */ + assert(!pipeline->subpass->view_mask || (!pipeline->has_gs && !pipeline->gs)); + if (pipeline->subpass->view_mask) { + if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator)) + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + /* First we try to get the variants from the pipeline cache */ struct v3dv_pipeline_key pipeline_key; pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); unsigned char pipeline_sha1[20]; pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1); + bool cache_hit = false; + pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + v3dv_pipeline_cache_search_for_pipeline(cache, + pipeline_sha1, + &cache_hit); if (pipeline->shared_data != NULL) { + /* A correct pipeline must have at least a VS and FS */ assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); + assert(!pipeline->gs || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); + assert(!pipeline->gs || + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; goto success; } - pipeline->shared_data = - v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline->device); - /* If not, we try to get the nir shaders (from the SPIR-V shader, or from - * the pipeline cache again) and compile. + if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) + return VK_PIPELINE_COMPILE_REQUIRED_EXT; + + /* Otherwise we try to get the NIR shaders (either from the original SPIR-V + * shader or the pipeline cache) and compile. 
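/* The lookup above also honors VK_EXT_pipeline_creation_cache_control:
 * when the application sets FAIL_ON_PIPELINE_COMPILE_REQUIRED, a cache
 * miss must return early instead of compiling. Condensed decision flow
 * (names illustrative):
 */
#include <stdbool.h>

enum create_result { CREATE_OK, CREATE_COMPILE_REQUIRED };

static enum create_result
get_pipeline_variants(void **out, void *cached, bool fail_on_compile,
                      void *(*compile)(void *ctx), void *ctx)
{
   if (cached) {
      *out = cached;                    /* hit: reuse shared variants */
      return CREATE_OK;
   }
   if (fail_on_compile)
      return CREATE_COMPILE_REQUIRED;   /* app forbade compilation */
   *out = compile(ctx);                 /* slow path: NIR + backend */
   return CREATE_OK;
}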
*/ + pipeline->shared_data = + v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true); + + pipeline->vs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + if (pipeline->gs) + pipeline->gs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + pipeline->fs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + if (!pipeline->vs->nir) pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); + if (pipeline->gs && !pipeline->gs->nir) + pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache); if (!pipeline->fs->nir) pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache); /* Linking + pipeline lowerings */ - link_shaders(pipeline->vs->nir, pipeline->fs->nir); + if (pipeline->gs) { + link_shaders(pipeline->gs->nir, pipeline->fs->nir); + link_shaders(pipeline->vs->nir, pipeline->gs->nir); + } else { + link_shaders(pipeline->vs->nir, pipeline->fs->nir); + } pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout); lower_fs_io(pipeline->fs->nir); + if (pipeline->gs) { + pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout); + lower_gs_io(pipeline->gs->nir); + } + pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout); lower_vs_io(pipeline->vs->nir); @@ -2029,6 +2575,16 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, if (vk_result != VK_SUCCESS) return vk_result; + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] && + !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (pipeline->gs) { + vk_result = + pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo); + if (vk_result != VK_SUCCESS) + return vk_result; + } + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] && !pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); @@ -2041,29 +2597,52 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, v3dv_pipeline_cache_upload_pipeline(pipeline, cache); - /* As we got the variants in pipeline->shared_data, after compiling we - * don't need the pipeline_stages + success: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + write_creation_feedback(pipeline, + pCreateInfo->pNext, + &pipeline_feedback, + pCreateInfo->stageCount, + pCreateInfo->pStages); + + /* Since we have the variants in the pipeline shared data we can now free + * the pipeline stages. */ pipeline_free_stages(device, pipeline, pAllocator); - success: pipeline_check_spill_size(pipeline); - /* FIXME: values below are default when non-GS is available. 
Would need to - * provide real values if GS gets supported - */ + return compute_vpm_config(pipeline); +} + +static VkResult +compute_vpm_config(struct v3dv_pipeline *pipeline) +{ struct v3dv_shader_variant *vs_variant = pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; struct v3dv_shader_variant *vs_bin_variant = - pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; - - pipeline->vpm_cfg_bin.As = 1; - pipeline->vpm_cfg_bin.Ve = 0; - pipeline->vpm_cfg_bin.Vc = vs_bin_variant->prog_data.vs->vcm_cache_size; - - pipeline->vpm_cfg.As = 1; - pipeline->vpm_cfg.Ve = 0; - pipeline->vpm_cfg.Vc = vs_variant->prog_data.vs->vcm_cache_size; + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + struct v3d_vs_prog_data *vs = vs_variant->prog_data.vs; + struct v3d_vs_prog_data *vs_bin =vs_bin_variant->prog_data.vs; + + struct v3d_gs_prog_data *gs = NULL; + struct v3d_gs_prog_data *gs_bin = NULL; + if (pipeline->has_gs) { + struct v3dv_shader_variant *gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; + struct v3dv_shader_variant *gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + gs = gs_variant->prog_data.gs; + gs_bin = gs_bin_variant->prog_data.gs; + } + + if (!v3d_compute_vpm_config(&pipeline->device->devinfo, + vs_bin, vs, gs_bin, gs, + &pipeline->vpm_cfg_bin, + &pipeline->vpm_cfg)) { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } return VK_SUCCESS; } @@ -2088,6 +2667,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) return V3DV_DYNAMIC_DEPTH_BIAS; case VK_DYNAMIC_STATE_LINE_WIDTH: return V3DV_DYNAMIC_LINE_WIDTH; + case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: + return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; /* Depth bounds testing is not available in in V3D 4.2 so here we are just * ignoring this dynamic state. We are already asserting at pipeline creation @@ -2108,7 +2689,8 @@ pipeline_init_dynamic_state( const VkPipelineViewportStateCreateInfo *pViewportState, const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState, const VkPipelineColorBlendStateCreateInfo *pColorBlendState, - const VkPipelineRasterizationStateCreateInfo *pRasterizationState) + const VkPipelineRasterizationStateCreateInfo *pRasterizationState, + const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) { pipeline->dynamic_state = default_dynamic_state; struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; @@ -2184,310 +2766,13 @@ pipeline_init_dynamic_state( dynamic->line_width = pRasterizationState->lineWidth; } - pipeline->dynamic_state.mask = dynamic_states; -} - -static uint8_t -blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) -{ - switch (factor) { - case VK_BLEND_FACTOR_ZERO: - case VK_BLEND_FACTOR_ONE: - case VK_BLEND_FACTOR_SRC_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: - case VK_BLEND_FACTOR_DST_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: - case VK_BLEND_FACTOR_SRC_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: - case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: - return factor; - case VK_BLEND_FACTOR_CONSTANT_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: - case VK_BLEND_FACTOR_CONSTANT_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: - *needs_constants = true; - return factor; - case VK_BLEND_FACTOR_DST_ALPHA: - return dst_alpha_one ? V3D_BLEND_FACTOR_ONE : - V3D_BLEND_FACTOR_DST_ALPHA; - case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: - return dst_alpha_one ? 
V3D_BLEND_FACTOR_ZERO : - V3D_BLEND_FACTOR_INV_DST_ALPHA; - case VK_BLEND_FACTOR_SRC1_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: - case VK_BLEND_FACTOR_SRC1_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - assert(!"Invalid blend factor: dual source blending not supported."); - default: - assert(!"Unknown blend factor."); + if (pColorWriteState && !(dynamic_states & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { + dynamic->color_write_enable = 0; + for (uint32_t i = 0; i < pColorWriteState->attachmentCount; i++) + dynamic->color_write_enable |= pColorWriteState->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; } - /* Should be handled by the switch, added to avoid a "end of non-void - * function" error - */ - unreachable("Unknown blend factor."); -} - -static void -pack_blend(struct v3dv_pipeline *pipeline, - const VkPipelineColorBlendStateCreateInfo *cb_info) -{ - /* By default, we are not enabling blending and all color channel writes are - * enabled. Color write enables are independent of whether blending is - * enabled or not. - * - * Vulkan specifies color write masks so that bits set correspond to - * enabled channels. Our hardware does it the other way around. - */ - pipeline->blend.enables = 0; - pipeline->blend.color_write_masks = 0; /* All channels enabled */ - - if (!cb_info) - return; - - assert(pipeline->subpass); - if (pipeline->subpass->color_count == 0) - return; - - assert(pipeline->subpass->color_count == cb_info->attachmentCount); - - pipeline->blend.needs_color_constants = false; - uint32_t color_write_masks = 0; - for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) { - const VkPipelineColorBlendAttachmentState *b_state = - &cb_info->pAttachments[i]; - - uint32_t attachment_idx = - pipeline->subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - continue; - - color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i); - - if (!b_state->blendEnable) - continue; - - VkAttachmentDescription *desc = - &pipeline->pass->attachments[attachment_idx].desc; - const struct v3dv_format *format = v3dv_get_format(desc->format); - bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1); - - uint8_t rt_mask = 1 << i; - pipeline->blend.enables |= rt_mask; - - v3dv_pack(pipeline->blend.cfg[i], BLEND_CFG, config) { - config.render_target_mask = rt_mask; - - config.color_blend_mode = b_state->colorBlendOp; - config.color_blend_dst_factor = - blend_factor(b_state->dstColorBlendFactor, dst_alpha_one, - &pipeline->blend.needs_color_constants); - config.color_blend_src_factor = - blend_factor(b_state->srcColorBlendFactor, dst_alpha_one, - &pipeline->blend.needs_color_constants); - - config.alpha_blend_mode = b_state->alphaBlendOp; - config.alpha_blend_dst_factor = - blend_factor(b_state->dstAlphaBlendFactor, dst_alpha_one, - &pipeline->blend.needs_color_constants); - config.alpha_blend_src_factor = - blend_factor(b_state->srcAlphaBlendFactor, dst_alpha_one, - &pipeline->blend.needs_color_constants); - } - } - - pipeline->blend.color_write_masks = color_write_masks; -} - -/* This requires that pack_blend() had been called before so we can set - * the overall blend enable bit in the CFG_BITS packet. 
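/* The dst_alpha_one remap in blend_factor() above handles render-target
 * formats with no stored alpha: the hardware reads destination alpha back
 * as 1.0 there, so DST_ALPHA and ONE_MINUS_DST_ALPHA fold to constants
 * per attachment. Standalone sketch (enum values illustrative):
 */
#include <stdbool.h>

enum blend_factor { BF_ZERO, BF_ONE, BF_DST_ALPHA, BF_INV_DST_ALPHA };

static enum blend_factor
fold_dst_alpha(enum blend_factor f, bool dst_alpha_one)
{
   if (!dst_alpha_one)
      return f;
   if (f == BF_DST_ALPHA)
      return BF_ONE;        /* a_dst == 1.0 */
   if (f == BF_INV_DST_ALPHA)
      return BF_ZERO;       /* 1.0 - a_dst == 0.0 */
   return f;
}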
- */ -static void -pack_cfg_bits(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info, - const VkPipelineRasterizationStateCreateInfo *rs_info, - const VkPipelineMultisampleStateCreateInfo *ms_info) -{ - assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS)); - - pipeline->msaa = - ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; - - v3dv_pack(pipeline->cfg_bits, CFG_BITS, config) { - config.enable_forward_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false; - - config.enable_reverse_facing_primitive = - rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false; - - /* Seems like the hardware is backwards regarding this setting... */ - config.clockwise_primitives = - rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false; - - config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable: false; - - /* This is required to pass line rasterization tests in CTS while - * exposing, at least, a minimum of 4-bits of subpixel precision - * (the minimum requirement). - */ - config.line_rasterization = 1; /* perp end caps */ - - if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) { - config.direct3d_wireframe_triangles_mode = true; - config.direct3d_point_fill_mode = - rs_info->polygonMode == VK_POLYGON_MODE_POINT; - } - - config.rasterizer_oversample_mode = pipeline->msaa ? 1 : 0; - - /* From the Vulkan spec: - * - * "Provoking Vertex: - * - * The vertex in a primitive from which flat shaded attribute - * values are taken. This is generally the “first” vertex in the - * primitive, and depends on the primitive topology." - * - * First vertex is the Direct3D style for provoking vertex. OpenGL uses - * the last vertex by default. - */ - config.direct3d_provoking_vertex = true; - - config.blend_enable = pipeline->blend.enables != 0; - - /* Disable depth/stencil if we don't have a D/S attachment */ - bool has_ds_attachment = - pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED; - - if (ds_info && ds_info->depthTestEnable && has_ds_attachment) { - config.z_updates_enable = ds_info->depthWriteEnable; - config.depth_test_function = ds_info->depthCompareOp; - } else { - config.depth_test_function = VK_COMPARE_OP_ALWAYS; - } - - /* EZ state will be updated at draw time based on bound pipeline state */ - config.early_z_updates_enable = false; - config.early_z_enable = false; - - config.stencil_enable = - ds_info ? 
ds_info->stencilTestEnable && has_ds_attachment: false; - - pipeline->z_updates_enable = config.z_updates_enable; - }; -} - -static uint32_t -translate_stencil_op(enum pipe_stencil_op op) -{ - switch (op) { - case VK_STENCIL_OP_KEEP: - return V3D_STENCIL_OP_KEEP; - case VK_STENCIL_OP_ZERO: - return V3D_STENCIL_OP_ZERO; - case VK_STENCIL_OP_REPLACE: - return V3D_STENCIL_OP_REPLACE; - case VK_STENCIL_OP_INCREMENT_AND_CLAMP: - return V3D_STENCIL_OP_INCR; - case VK_STENCIL_OP_DECREMENT_AND_CLAMP: - return V3D_STENCIL_OP_DECR; - case VK_STENCIL_OP_INVERT: - return V3D_STENCIL_OP_INVERT; - case VK_STENCIL_OP_INCREMENT_AND_WRAP: - return V3D_STENCIL_OP_INCWRAP; - case VK_STENCIL_OP_DECREMENT_AND_WRAP: - return V3D_STENCIL_OP_DECWRAP; - default: - unreachable("bad stencil op"); - } -} - -static void -pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, - uint8_t *stencil_cfg, - bool is_front, - bool is_back, - const VkStencilOpState *stencil_state) -{ - /* From the Vulkan spec: - * - * "Reference is an integer reference value that is used in the unsigned - * stencil comparison. The reference value used by stencil comparison - * must be within the range [0,2^s-1] , where s is the number of bits in - * the stencil framebuffer attachment, otherwise the reference value is - * considered undefined." - * - * In our case, 's' is always 8, so we clamp to that to prevent our packing - * functions to assert in debug mode if they see larger values. - * - * If we have dynamic state we need to make sure we set the corresponding - * state bits to 0, since cl_emit_with_prepacked ORs the new value with - * the old. - */ - const uint8_t write_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ? - 0 : stencil_state->writeMask & 0xff; - - const uint8_t compare_mask = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ? - 0 : stencil_state->compareMask & 0xff; - - const uint8_t reference = - pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ? - 0 : stencil_state->reference & 0xff; - - v3dv_pack(stencil_cfg, STENCIL_CFG, config) { - config.front_config = is_front; - config.back_config = is_back; - config.stencil_write_mask = write_mask; - config.stencil_test_mask = compare_mask; - config.stencil_test_function = stencil_state->compareOp; - config.stencil_pass_op = translate_stencil_op(stencil_state->passOp); - config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp); - config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp); - config.stencil_ref_value = reference; - } -} - -static void -pack_stencil_cfg(struct v3dv_pipeline *pipeline, - const VkPipelineDepthStencilStateCreateInfo *ds_info) -{ - assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG)); - - if (!ds_info || !ds_info->stencilTestEnable) - return; - - if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) - return; - - const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | - V3DV_DYNAMIC_STENCIL_WRITE_MASK | - V3DV_DYNAMIC_STENCIL_REFERENCE; - - - /* If front != back or we have dynamic stencil state we can't emit a single - * packet for both faces. - */ - bool needs_front_and_back = false; - if ((pipeline->dynamic_state.mask & dynamic_stencil_states) || - memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) - needs_front_and_back = true; - - /* If the front and back configurations are the same we can emit both with - * a single packet. 
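/* The decision above is made once at pipeline build time: identical
 * static front/back state and no dynamic stencil state means a single
 * STENCIL_CFG packet can be tagged for both faces; otherwise two packets
 * are prepacked. Reduced form:
 */
#include <stdbool.h>
#include <string.h>

static bool
needs_two_stencil_configs(const void *front, const void *back, size_t size,
                          bool has_dynamic_stencil_state)
{
   /* dynamic masks/references are patched per face at draw time, so they
    * always force separate front and back configs */
   return has_dynamic_stencil_state || memcmp(front, back, size) != 0;
}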
- */
-   pipeline->emit_stencil_cfg[0] = true;
-   if (!needs_front_and_back) {
-      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
-                              true, true, &ds_info->front);
-   } else {
-      pipeline->emit_stencil_cfg[1] = true;
-      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
-                              true, false, &ds_info->front);
-      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1],
-                              false, true, &ds_info->back);
-   }
+   pipeline->dynamic_state.mask = dynamic_states;
 }
 
 static bool
@@ -2532,25 +2817,25 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
                       const VkPipelineDepthStencilStateCreateInfo *ds_info)
 {
    if (!ds_info || !ds_info->depthTestEnable) {
-      pipeline->ez_state = VC5_EZ_DISABLED;
+      pipeline->ez_state = V3D_EZ_DISABLED;
       return;
    }
 
    switch (ds_info->depthCompareOp) {
    case VK_COMPARE_OP_LESS:
    case VK_COMPARE_OP_LESS_OR_EQUAL:
-      pipeline->ez_state = VC5_EZ_LT_LE;
+      pipeline->ez_state = V3D_EZ_LT_LE;
       break;
    case VK_COMPARE_OP_GREATER:
    case VK_COMPARE_OP_GREATER_OR_EQUAL:
-      pipeline->ez_state = VC5_EZ_GT_GE;
+      pipeline->ez_state = V3D_EZ_GT_GE;
       break;
    case VK_COMPARE_OP_NEVER:
    case VK_COMPARE_OP_EQUAL:
-      pipeline->ez_state = VC5_EZ_UNDECIDED;
+      pipeline->ez_state = V3D_EZ_UNDECIDED;
       break;
    default:
-      pipeline->ez_state = VC5_EZ_DISABLED;
+      pipeline->ez_state = V3D_EZ_DISABLED;
       break;
    }
 
@@ -2558,220 +2843,10 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
    if (ds_info->stencilTestEnable &&
       (!stencil_op_is_no_op(&ds_info->front) ||
        !stencil_op_is_no_op(&ds_info->back))) {
-      pipeline->ez_state = VC5_EZ_DISABLED;
-   }
-}
-
-static void
-pack_shader_state_record(struct v3dv_pipeline *pipeline)
-{
-   assert(sizeof(pipeline->shader_state_record) ==
-          cl_packet_length(GL_SHADER_STATE_RECORD));
-
-   struct v3d_fs_prog_data *prog_data_fs =
-      pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
-
-   struct v3d_vs_prog_data *prog_data_vs =
-      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
-
-   struct v3d_vs_prog_data *prog_data_vs_bin =
-      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
-
-
-   /* Note: we are not packing addresses, as we need the job (see
-    * cl_pack_emit_reloc). Additionally, uniforms can't be filled in at this
-    * point, as they depend on dynamic info that can be set after creating
-    * the pipeline (like the viewport), so they would need to be filled in
-    * later. We are therefore doing a partial prepacking.
-    */
-   v3dv_pack(pipeline->shader_state_record, GL_SHADER_STATE_RECORD, shader) {
-      shader.enable_clipping = true;
-
-      shader.point_size_in_shaded_vertex_data =
-         pipeline->topology == PIPE_PRIM_POINTS;
-
-      /* Must be set if the shader modifies Z, discards, or modifies
-       * the sample mask. For any of these cases, the fragment
-       * shader needs to write the Z value (even just discards).
-       */
-      shader.fragment_shader_does_z_writes = prog_data_fs->writes_z;
-      /* Set if the EZ test must be disabled (due to shader side
-       * effects and the early_z flag not being present in the
-       * shader).
-       */
-      shader.turn_off_early_z_test = prog_data_fs->disable_ez;
-
-      shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
-         prog_data_fs->uses_center_w;
-
-      /* The description for gl_SampleID states that if a fragment shader reads
-       * it, then we should automatically activate per-sample shading. However,
-       * the Vulkan spec also states that if a framebuffer has no attachments:
-       *
-       *    "The subpass continues to use the width, height, and layers of the
-       *     framebuffer to define the dimensions of the rendering area, and the
-       *     rasterizationSamples from each pipeline’s
-       *     VkPipelineMultisampleStateCreateInfo to define the number of
-       *     samples used in rasterization."
-       *
-       * So in this scenario, if the pipeline doesn't enable multiple samples
-       * but the fragment shader accesses gl_SampleID we would be requested
-       * to do per-sample shading in single sample rasterization mode, which
-       * is pointless, so just disable it in that case.
-       */
-      shader.enable_sample_rate_shading =
-         pipeline->sample_rate_shading ||
-         (pipeline->msaa && prog_data_fs->force_per_sample_msaa);
-
-      shader.any_shader_reads_hardware_written_primitive_id = false;
-
-      shader.do_scoreboard_wait_on_first_thread_switch =
-         prog_data_fs->lock_scoreboard_on_first_thrsw;
-      shader.disable_implicit_point_line_varyings =
-         !prog_data_fs->uses_implicit_point_line_varyings;
-
-      shader.number_of_varyings_in_fragment_shader =
-         prog_data_fs->num_inputs;
-
-      shader.coordinate_shader_propagate_nans = true;
-      shader.vertex_shader_propagate_nans = true;
-      shader.fragment_shader_propagate_nans = true;
-
-      /* Note: see previous note about addresses */
-      /* shader.coordinate_shader_code_address */
-      /* shader.vertex_shader_code_address */
-      /* shader.fragment_shader_code_address */
-
-      /* FIXME: Use combined input/output size flag in the common case (also
-       * on v3d, see v3dx_draw).
-       */
-      shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
-         prog_data_vs_bin->separate_segments;
-      shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
-         prog_data_vs->separate_segments;
-
-      shader.coordinate_shader_input_vpm_segment_size =
-         prog_data_vs_bin->separate_segments ?
-         prog_data_vs_bin->vpm_input_size : 1;
-      shader.vertex_shader_input_vpm_segment_size =
-         prog_data_vs->separate_segments ?
-         prog_data_vs->vpm_input_size : 1;
-
-      shader.coordinate_shader_output_vpm_segment_size =
-         prog_data_vs_bin->vpm_output_size;
-      shader.vertex_shader_output_vpm_segment_size =
-         prog_data_vs->vpm_output_size;
-
-      /* Note: see previous note about addresses */
-      /* shader.coordinate_shader_uniforms_address */
-      /* shader.vertex_shader_uniforms_address */
-      /* shader.fragment_shader_uniforms_address */
-
-      shader.min_coord_shader_input_segments_required_in_play =
-         pipeline->vpm_cfg_bin.As;
-      shader.min_vertex_shader_input_segments_required_in_play =
-         pipeline->vpm_cfg.As;
-
-      shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
-         pipeline->vpm_cfg_bin.Ve;
-      shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
-         pipeline->vpm_cfg.Ve;
-
-      shader.coordinate_shader_4_way_threadable =
-         prog_data_vs_bin->base.threads == 4;
-      shader.vertex_shader_4_way_threadable =
-         prog_data_vs->base.threads == 4;
-      shader.fragment_shader_4_way_threadable =
-         prog_data_fs->base.threads == 4;
-
-      shader.coordinate_shader_start_in_final_thread_section =
-         prog_data_vs_bin->base.single_seg;
-      shader.vertex_shader_start_in_final_thread_section =
-         prog_data_vs->base.single_seg;
-      shader.fragment_shader_start_in_final_thread_section =
-         prog_data_fs->base.single_seg;
-
-      shader.vertex_id_read_by_coordinate_shader =
-         prog_data_vs_bin->uses_vid;
-      shader.base_instance_id_read_by_coordinate_shader =
-         prog_data_vs_bin->uses_biid;
-      shader.instance_id_read_by_coordinate_shader =
-         prog_data_vs_bin->uses_iid;
-      shader.vertex_id_read_by_vertex_shader =
-         prog_data_vs->uses_vid;
-      shader.base_instance_id_read_by_vertex_shader =
-         prog_data_vs->uses_biid;
-      shader.instance_id_read_by_vertex_shader =
-         prog_data_vs->uses_iid;
-
-      /* Note: see previous note about addresses */
-      /* shader.address_of_default_attribute_values */
-   }
-}
-
-static void
-pack_vcm_cache_size(struct v3dv_pipeline *pipeline)
-{
-   assert(sizeof(pipeline->vcm_cache_size) ==
-          cl_packet_length(VCM_CACHE_SIZE));
-
-   v3dv_pack(pipeline->vcm_cache_size, VCM_CACHE_SIZE, vcm) {
-      vcm.number_of_16_vertex_batches_for_binning = pipeline->vpm_cfg_bin.Vc;
-      vcm.number_of_16_vertex_batches_for_rendering = pipeline->vpm_cfg.Vc;
+      pipeline->ez_state = V3D_EZ_DISABLED;
    }
 }
 
-/* As defined on the GL_SHADER_STATE_ATTRIBUTE_RECORD */
-static uint8_t
-get_attr_type(const struct util_format_description *desc)
-{
-   uint32_t r_size = desc->channel[0].size;
-   uint8_t attr_type = ATTRIBUTE_FLOAT;
-
-   switch (desc->channel[0].type) {
-   case UTIL_FORMAT_TYPE_FLOAT:
-      if (r_size == 32) {
-         attr_type = ATTRIBUTE_FLOAT;
-      } else {
-         assert(r_size == 16);
-         attr_type = ATTRIBUTE_HALF_FLOAT;
-      }
-      break;
-
-   case UTIL_FORMAT_TYPE_SIGNED:
-   case UTIL_FORMAT_TYPE_UNSIGNED:
-      switch (r_size) {
-      case 32:
-         attr_type = ATTRIBUTE_INT;
-         break;
-      case 16:
-         attr_type = ATTRIBUTE_SHORT;
-         break;
-      case 10:
-         attr_type = ATTRIBUTE_INT2_10_10_10;
-         break;
-      case 8:
-         attr_type = ATTRIBUTE_BYTE;
-         break;
-      default:
-         fprintf(stderr,
-                 "format %s unsupported\n",
-                 desc->name);
-         attr_type = ATTRIBUTE_BYTE;
-         abort();
-      }
-      break;
-
-   default:
-      fprintf(stderr,
-              "format %s unsupported\n",
-              desc->name);
-      abort();
-   }
-
-   return attr_type;
-}
-
 static bool
 pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
 {
@@ -2829,36 +2904,6 @@ v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
 }
 
 static void
-pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline,
-                                   uint32_t index,
-
const VkVertexInputAttributeDescription *vi_desc) -{ - const uint32_t packet_length = - cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD); - - const struct util_format_description *desc = - vk_format_description(vi_desc->format); - - uint32_t binding = vi_desc->binding; - - v3dv_pack(&pipeline->vertex_attrs[index * packet_length], - GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) { - - /* vec_size == 0 means 4 */ - attr.vec_size = desc->nr_channels & 3; - attr.signed_int_type = (desc->channel[0].type == - UTIL_FORMAT_TYPE_SIGNED); - attr.normalized_int_type = desc->channel[0].normalized; - attr.read_as_int_uint = desc->channel[0].pure_integer; - - attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor, - 0xffff); - attr.stride = pipeline->vb[binding].stride; - attr.type = get_attr_type(desc); - } -} - -static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, const VkPipelineMultisampleStateCreateInfo *ms_info) { @@ -2920,24 +2965,35 @@ pipeline_init(struct v3dv_pipeline *pipeline, const VkPipelineRasterizationStateCreateInfo *rs_info = raster_enabled ? pCreateInfo->pRasterizationState : NULL; + const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info = + rs_info ? vk_find_struct_const( + rs_info->pNext, + PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) : + NULL; + const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? pCreateInfo->pColorBlendState : NULL; const VkPipelineMultisampleStateCreateInfo *ms_info = raster_enabled ? pCreateInfo->pMultisampleState : NULL; + const VkPipelineColorWriteCreateInfoEXT *cw_info = + cb_info ? vk_find_struct_const(cb_info->pNext, + PIPELINE_COLOR_WRITE_CREATE_INFO_EXT) : + NULL; + pipeline_init_dynamic_state(pipeline, pCreateInfo->pDynamicState, - vp_info, ds_info, cb_info, rs_info); + vp_info, ds_info, cb_info, rs_info, cw_info); /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that * feature and it shouldn't be used by any pipeline. 
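    *
    * A short sketch of how an application can confirm this, assuming a
    * valid VkPhysicalDevice handle phys_dev for this driver:
    *
    *    VkPhysicalDeviceFeatures features;
    *    vkGetPhysicalDeviceFeatures(phys_dev, &features);
    *    // features.depthBounds is expected to be VK_FALSE here
    *
    * so a conformant application never sets depthBoundsTestEnable.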
*/ assert(!ds_info || !ds_info->depthBoundsTestEnable); - pack_blend(pipeline, cb_info); - pack_cfg_bits(pipeline, ds_info, rs_info, ms_info); - pack_stencil_cfg(pipeline, ds_info); + v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info, + rs_info, pv_info, ms_info); + pipeline_set_ez_state(pipeline, ds_info); enable_depth_bias(pipeline, rs_info); pipeline_set_sample_mask(pipeline, ms_info); @@ -2955,49 +3011,14 @@ pipeline_init(struct v3dv_pipeline *pipeline, return result; } - pack_shader_state_record(pipeline); - pack_vcm_cache_size(pipeline); - const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState; - pipeline->vb_count = vi_info->vertexBindingDescriptionCount; - for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { - const VkVertexInputBindingDescription *desc = - &vi_info->pVertexBindingDescriptions[i]; - - pipeline->vb[desc->binding].stride = desc->stride; - pipeline->vb[desc->binding].instance_divisor = desc->inputRate; - } - - pipeline->va_count = 0; - struct v3d_vs_prog_data *prog_data_vs = - pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; - - for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { - const VkVertexInputAttributeDescription *desc = - &vi_info->pVertexAttributeDescriptions[i]; - uint32_t location = desc->location + VERT_ATTRIB_GENERIC0; - - /* We use a custom driver_location_map instead of - * nir_find_variable_with_location because if we were able to get the - * shader variant from the cache, we would not have the nir shader - * available. - */ - uint32_t driver_location = - prog_data_vs->driver_location_map[location]; - - if (driver_location != -1) { - assert(driver_location < MAX_VERTEX_ATTRIBS); - pipeline->va[driver_location].offset = desc->offset; - pipeline->va[driver_location].binding = desc->binding; - pipeline->va[driver_location].vk_format = desc->format; - - pack_shader_state_attribute_record(pipeline, driver_location, desc); + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info = + vk_find_struct_const(vi_info->pNext, + PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); - pipeline->va_count++; - } - } + v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); if (pipeline_has_integer_vertex_attrib(pipeline)) { pipeline->default_attribute_values = @@ -3032,7 +3053,7 @@ graphics_pipeline_create(VkDevice _device, VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = pipeline_init(pipeline, device, cache, pCreateInfo, @@ -3040,6 +3061,8 @@ graphics_pipeline_create(VkDevice _device, if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); + if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + *pPipeline = VK_NULL_HANDLE; return result; } @@ -3048,7 +3071,7 @@ graphics_pipeline_create(VkDevice _device, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count, @@ -3062,7 +3085,8 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) mtx_lock(&device->pdevice->mutex); - for (uint32_t i = 0; i < count; i++) { + uint32_t i = 0; + for (; i < count; i++) { VkResult local_result; local_result = graphics_pipeline_create(_device, @@ -3074,9 +3098,16 @@ v3dv_CreateGraphicsPipelines(VkDevice _device, if (local_result != VK_SUCCESS) { result = local_result; 
pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + break; } } + for (; i < count; i++) + pPipelines[i] = VK_NULL_HANDLE; + if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) mtx_unlock(&device->pdevice->mutex); @@ -3110,6 +3141,11 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, const VkComputePipelineCreateInfo *info, const VkAllocationCallbacks *alloc) { + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + }; + int64_t pipeline_start = os_time_get_nano(); + struct v3dv_device *device = pipeline->device; struct v3dv_physical_device *physical_device = &device->instance->physicalDevice; @@ -3129,6 +3165,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; + p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; pipeline_hash_shader(p_stage->module, p_stage->entrypoint, @@ -3147,16 +3184,27 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, unsigned char pipeline_sha1[20]; pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1); + bool cache_hit = false; pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1, &cache_hit); if (pipeline->shared_data != NULL) { assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; + goto success; } + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) + return VK_PIPELINE_COMPILE_REQUIRED_EXT; + pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, - pipeline->device); + pipeline, + false); + + p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; /* If not found on cache, compile it */ p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); @@ -3183,12 +3231,21 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, return VK_ERROR_OUT_OF_DEVICE_MEMORY; v3dv_pipeline_cache_upload_pipeline(pipeline, cache); + +success: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + write_creation_feedback(pipeline, + info->pNext, + &pipeline_feedback, + 1, + &info->stage); + /* As we got the variants in pipeline->shared_data, after compiling we * don't need the pipeline_stages */ pipeline_free_stages(device, pipeline, alloc); - success: pipeline_check_spill_size(pipeline); return VK_SUCCESS; @@ -3231,12 +3288,14 @@ compute_pipeline_create(VkDevice _device, pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = compute_pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { v3dv_destroy_pipeline(pipeline, device, pAllocator); + if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) + *pPipeline = VK_NULL_HANDLE; return result; } @@ -3245,7 +3304,7 @@ compute_pipeline_create(VkDevice _device, return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, @@ -3259,7 +3318,8 @@ 
v3dv_CreateComputePipelines(VkDevice _device, if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) mtx_lock(&device->pdevice->mutex); - for (uint32_t i = 0; i < createInfoCount; i++) { + uint32_t i = 0; + for (; i < createInfoCount; i++) { VkResult local_result; local_result = compute_pipeline_create(_device, pipelineCache, @@ -3270,9 +3330,16 @@ v3dv_CreateComputePipelines(VkDevice _device, if (local_result != VK_SUCCESS) { result = local_result; pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & + VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + break; } } + for (; i < createInfoCount; i++) + pPipelines[i] = VK_NULL_HANDLE; + if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS)) mtx_unlock(&device->pdevice->mutex); diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c index 7d1d11485..c19eecc42 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -63,6 +63,20 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache) fprintf(stderr, " cache hit count: %d\n", cache->stats.hit); } +static void +pipeline_cache_lock(struct v3dv_pipeline_cache *cache) +{ + if (!cache->externally_synchronized) + pthread_mutex_lock(&cache->mutex); +} + +static void +pipeline_cache_unlock(struct v3dv_pipeline_cache *cache) +{ + if (!cache->externally_synchronized) + pthread_mutex_unlock(&cache->mutex); +} + void v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache, @@ -75,10 +89,10 @@ v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline, if (cache->nir_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES) return; - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); struct hash_entry *entry = _mesa_hash_table_search(cache->nir_cache, sha1_key); - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); if (entry) return; @@ -91,7 +105,7 @@ v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline, return; } - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); /* Because ralloc isn't thread-safe, we have to do all this inside the * lock. We could unlock for the big memcpy but it's probably not worth * the hassle. 
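The conditional locking above follows the usual double-checked insertion pattern: search the table under the lock, do the slow serialization unlocked, then search again after relocking so a concurrent insert wins. A condensed sketch of the flow in v3dv_pipeline_cache_upload_nir, where serialize_nir_to_blob and the snir setup stand in for the blob code elided by the hunk context:

   pipeline_cache_lock(cache);
   bool found = _mesa_hash_table_search(cache->nir_cache, sha1_key) != NULL;
   pipeline_cache_unlock(cache);
   if (found)
      return;
   serialize_nir_to_blob(&blob);     /* slow serialization, done unlocked */
   pipeline_cache_lock(cache);
   if (!_mesa_hash_table_search(cache->nir_cache, sha1_key))
      _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);
   pipeline_cache_unlock(cache);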
@@ -99,7 +113,7 @@ v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline, entry = _mesa_hash_table_search(cache->nir_cache, sha1_key); if (entry) { blob_finish(&blob); - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); return; } @@ -122,7 +136,7 @@ v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline, _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir); - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); } nir_shader* @@ -143,12 +157,12 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, const struct serialized_nir *snir = NULL; - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); struct hash_entry *entry = _mesa_hash_table_search(cache->nir_cache, sha1_key); if (entry) snir = entry->data; - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); if (snir) { struct blob_reader blob; @@ -185,6 +199,7 @@ v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, void v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, struct v3dv_device *device, + VkPipelineCacheCreateFlags flags, bool cache_enabled) { cache->device = device; @@ -202,6 +217,9 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, cache->stats.miss = 0; cache->stats.hit = 0; cache->stats.count = 0; + + cache->externally_synchronized = flags & + VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT; } else { cache->nir_cache = NULL; cache->cache = NULL; @@ -229,7 +247,8 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * */ struct v3dv_pipeline_shared_data * v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, - unsigned char sha1_key[20]) + unsigned char sha1_key[20], + bool *cache_hit) { if (!cache || !cache->cache) return NULL; @@ -241,7 +260,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, fprintf(stderr, "pipeline cache %p, search pipeline with key %s\n", cache, sha1buf); } - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); struct hash_entry *entry = _mesa_hash_table_search(cache->cache, sha1_key); @@ -252,6 +271,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, assert(cache_entry); cache->stats.hit++; + *cache_hit = true; if (debug_cache) { fprintf(stderr, "\tcache hit: %p\n", cache_entry); if (dump_stats) @@ -261,7 +281,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, v3dv_pipeline_shared_data_ref(cache_entry); - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); return cache_entry; } @@ -273,7 +293,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, cache_dump_stats(cache); } - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); #ifdef ENABLE_SHADER_CACHE struct v3dv_device *device = cache->device; @@ -324,6 +344,14 @@ v3dv_pipeline_shared_data_destroy(struct v3dv_device *device, for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { if (shared_data->variants[stage] != NULL) v3dv_shader_variant_destroy(device, shared_data->variants[stage]); + + /* We don't free binning descriptor maps as we are sharing them + * with the render shaders. 
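+    * The aliasing is established where the maps are created or loaded from
+    * a blob (see v3dv_pipeline_shared_data_create_from_blob below), roughly:
+    *
+    *    maps[BROADCOM_SHADER_VERTEX_BIN] = maps[BROADCOM_SHADER_VERTEX];
+    *
+    * so freeing both the binning and the render entries would double-free.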
+ */ + if (shared_data->maps[stage] != NULL && + !broadcom_shader_stage_is_binning(stage)) { + vk_free(&device->vk.alloc, shared_data->maps[stage]); + } } if (shared_data->assembly_bo) @@ -335,11 +363,8 @@ v3dv_pipeline_shared_data_destroy(struct v3dv_device *device, static struct v3dv_pipeline_shared_data * v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache, const unsigned char sha1_key[20], + struct v3dv_descriptor_maps **maps, struct v3dv_shader_variant **variants, - const struct v3dv_descriptor_map *ubo_map, - const struct v3dv_descriptor_map *ssbo_map, - const struct v3dv_descriptor_map *sampler_map, - const struct v3dv_descriptor_map *texture_map, const uint64_t *total_assembly, const uint32_t total_assembly_size) { @@ -359,13 +384,10 @@ v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache, new_entry->ref_cnt = 1; memcpy(new_entry->sha1_key, sha1_key, 20); - memcpy(&new_entry->ubo_map, ubo_map, sizeof(struct v3dv_descriptor_map)); - memcpy(&new_entry->ssbo_map, ssbo_map, sizeof(struct v3dv_descriptor_map)); - memcpy(&new_entry->sampler_map, sampler_map, sizeof(struct v3dv_descriptor_map)); - memcpy(&new_entry->texture_map, texture_map, sizeof(struct v3dv_descriptor_map)); - - for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + new_entry->maps[stage] = maps[stage]; new_entry->variants[stage] = variants[stage]; + } struct v3dv_bo *bo = v3dv_bo_alloc(cache->device, total_assembly_size, "pipeline shader assembly", true); @@ -402,12 +424,12 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, if (cache->stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES) return; - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); struct hash_entry *entry = _mesa_hash_table_search(cache->cache, shared_data->sha1_key); if (entry) { - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); return; } @@ -424,7 +446,7 @@ pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache, cache_dump_stats(cache); } - pthread_mutex_unlock(&cache->mutex); + pipeline_cache_unlock(cache); #ifdef ENABLE_SHADER_CACHE /* If we are being called from a on-disk-cache hit, we can skip writing to @@ -490,7 +512,7 @@ shader_variant_create_from_blob(struct v3dv_device *device, { VkResult result; - broadcom_shader_stage stage = blob_read_uint32(blob); + enum broadcom_shader_stage stage = blob_read_uint32(blob); uint32_t prog_data_size = blob_read_uint32(blob); /* FIXME: as we include the stage perhaps we can avoid prog_data_size? 
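    * The stage is serialized right before this field, so the size could in
    * principle be derived from it with a mapping along these lines (a
    * sketch, not existing code):
    *
    *    switch (stage) {
    *    case BROADCOM_SHADER_VERTEX:
    *    case BROADCOM_SHADER_VERTEX_BIN: return sizeof(struct v3d_vs_prog_data);
    *    case BROADCOM_SHADER_FRAGMENT:   return sizeof(struct v3d_fs_prog_data);
    *    case BROADCOM_SHADER_COMPUTE:    return sizeof(struct v3d_compute_prog_data);
    *    default: unreachable("bad stage");
    *    }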
*/ @@ -541,17 +563,32 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, { const unsigned char *sha1_key = blob_read_bytes(blob, 20); - const struct v3dv_descriptor_map *ubo_map = - blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); - const struct v3dv_descriptor_map *ssbo_map = - blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); - const struct v3dv_descriptor_map *sampler_map = - blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); - const struct v3dv_descriptor_map *texture_map = - blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); + struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES] = { 0 }; - if (blob->overrun) - return NULL; + uint8_t descriptor_maps_count = blob_read_uint8(blob); + for (uint8_t count = 0; count < descriptor_maps_count; count++) { + uint8_t stage = blob_read_uint8(blob); + + const struct v3dv_descriptor_maps *current_maps = + blob_read_bytes(blob, sizeof(struct v3dv_descriptor_maps)); + + if (blob->overrun) + return NULL; + + maps[stage] = vk_zalloc2(&cache->device->vk.alloc, NULL, + sizeof(struct v3dv_descriptor_maps), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (maps[stage] == NULL) + return NULL; + + memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps)); + if (broadcom_shader_stage_is_render_with_binning(stage)) { + enum broadcom_shader_stage bin_stage = + broadcom_binning_shader_stage_for_render_stage(stage); + maps[bin_stage] = maps[stage]; + } + } uint8_t variant_count = blob_read_uint8(blob); @@ -571,8 +608,7 @@ v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, if (blob->overrun) return NULL; - return v3dv_pipeline_shared_data_new(cache, sha1_key, variants, - ubo_map, ssbo_map, sampler_map, texture_map, + return v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants, total_assembly, total_assembly_size); } @@ -643,7 +679,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, } } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreatePipelineCache(VkDevice _device, const VkPipelineCacheCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -653,16 +689,15 @@ v3dv_CreatePipelineCache(VkDevice _device, struct v3dv_pipeline_cache *cache; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO); - assert(pCreateInfo->flags == 0); cache = vk_object_zalloc(&device->vk, pAllocator, sizeof(*cache), VK_OBJECT_TYPE_PIPELINE_CACHE); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - v3dv_pipeline_cache_init(cache, device, + v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags, device->instance->pipeline_cache_enabled); if (pCreateInfo->initialDataSize > 0) { @@ -702,7 +737,7 @@ v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache) } } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyPipelineCache(VkDevice _device, VkPipelineCache _cache, const VkAllocationCallbacks *pAllocator) @@ -718,7 +753,7 @@ v3dv_DestroyPipelineCache(VkDevice _device, vk_object_free(&device->vk, pAllocator, cache); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_MergePipelineCaches(VkDevice device, VkPipelineCache dstCache, uint32_t srcCacheCount, @@ -820,14 +855,33 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * { blob_write_bytes(blob, cache_entry->sha1_key, 20); - blob_write_bytes(blob, &cache_entry->ubo_map, - sizeof(struct v3dv_descriptor_map)); - blob_write_bytes(blob, &cache_entry->ssbo_map, - 
sizeof(struct v3dv_descriptor_map)); - blob_write_bytes(blob, &cache_entry->sampler_map, - sizeof(struct v3dv_descriptor_map)); - blob_write_bytes(blob, &cache_entry->texture_map, - sizeof(struct v3dv_descriptor_map)); + uint8_t descriptor_maps_count = 0; + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (broadcom_shader_stage_is_binning(stage)) + continue; + if (cache_entry->maps[stage] == NULL) + continue; + descriptor_maps_count++; + } + + /* Compute pipelines only have one descriptor map, + * graphics pipelines may have 2 (VS+FS) or 3 (VS+GS+FS), since the binning + * stages take the descriptor map from the render stage. + */ + assert((descriptor_maps_count >= 2 && descriptor_maps_count <= 3) || + (descriptor_maps_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE])); + blob_write_uint8(blob, descriptor_maps_count); + + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (cache_entry->maps[stage] == NULL) + continue; + if (broadcom_shader_stage_is_binning(stage)) + continue; + + blob_write_uint8(blob, stage); + blob_write_bytes(blob, cache_entry->maps[stage], + sizeof(struct v3dv_descriptor_maps)); + } uint8_t variant_count = 0; for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { @@ -836,10 +890,10 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * variant_count++; } - /* Right now we only support compute pipeline, or graphics pipeline with - * vertex, vertex bin, and fragment shader. + /* Graphics pipelines with VS+FS have 3 variants, VS+GS+FS will have 5 and + * compute pipelines only have 1. */ - assert(variant_count == 3 || + assert((variant_count == 5 || variant_count == 3) || (variant_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE])); blob_write_uint8(blob, variant_count); @@ -864,7 +918,7 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetPipelineCacheData(VkDevice _device, VkPipelineCache _cache, size_t *pDataSize, @@ -881,9 +935,9 @@ v3dv_GetPipelineCacheData(VkDevice _device, } struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - VkResult result = VK_SUCCESS; + VkResult result = VK_INCOMPLETE; - pthread_mutex_lock(&cache->mutex); + pipeline_cache_lock(cache); struct vk_pipeline_cache_header header = { .header_size = sizeof(struct vk_pipeline_cache_header), @@ -898,9 +952,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, intptr_t nir_count_offset = blob_reserve_uint32(&blob); if (nir_count_offset < 0) { *pDataSize = 0; - blob_finish(&blob); - pthread_mutex_unlock(&cache->mutex); - return VK_INCOMPLETE; + goto done; } if (cache->nir_cache) { @@ -915,9 +967,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, if (blob.out_of_memory) { blob.size = save_size; - pthread_mutex_unlock(&cache->mutex); - result = VK_INCOMPLETE; - break; + goto done; } nir_count++; @@ -929,9 +979,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, intptr_t count_offset = blob_reserve_uint32(&blob); if (count_offset < 0) { *pDataSize = 0; - blob_finish(&blob); - pthread_mutex_unlock(&cache->mutex); - return VK_INCOMPLETE; + goto done; } if (cache->cache) { @@ -942,9 +990,7 @@ v3dv_GetPipelineCacheData(VkDevice _device, if (!v3dv_pipeline_shared_data_write_to_blob(cache_entry, &blob)) { /* If it fails reset to the previous size and bail */ blob.size = save_size; - pthread_mutex_unlock(&cache->mutex); - result = VK_INCOMPLETE; - break; + goto done; } count++; @@ -955,7 +1001,7 @@ 
v3dv_GetPipelineCacheData(VkDevice _device,
 
    *pDataSize = blob.size;
 
-   blob_finish(&blob);
+   result = VK_SUCCESS;
 
    if (debug_cache) {
       assert(count <= cache->stats.count);
@@ -965,7 +1011,10 @@ v3dv_GetPipelineCacheData(VkDevice _device,
               cache, nir_count, count, (uint32_t) *pDataSize);
    }
 
-   pthread_mutex_unlock(&cache->mutex);
+ done:
+   blob_finish(&blob);
+
+   pipeline_cache_unlock(cache);
 
    return result;
 }
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_private.h b/lib/mesa/src/broadcom/vulkan/v3dv_private.h
index 36ecba130..d3c07c649 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_private.h
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_private.h
@@ -38,10 +38,15 @@
 #include "vk_device.h"
 #include "vk_instance.h"
+#include "vk_image.h"
+#include "vk_log.h"
 #include "vk_physical_device.h"
 #include "vk_shader_module.h"
 #include "vk_util.h"
 
+#include "vk_command_buffer.h"
+#include "vk_queue.h"
+
 #include <xf86drm.h>
 
 #ifdef HAVE_VALGRIND
@@ -56,6 +61,8 @@
 #include "common/v3d_device_info.h"
 #include "common/v3d_limits.h"
+#include "common/v3d_tiling.h"
+#include "common/v3d_util.h"
 
 #include "compiler/shader_enums.h"
 #include "compiler/spirv/nir_spirv.h"
@@ -69,36 +76,17 @@
 #include "u_atomic.h"
 
 #include "v3dv_entrypoints.h"
-#include "v3dv_extensions.h"
 #include "v3dv_bo.h"
 
 #include "drm-uapi/v3d_drm.h"
 
-/* FIXME: hooks for the packet definition functions. */
-static inline void
-pack_emit_reloc(void *cl, const void *reloc) {}
-
-#define __gen_user_data struct v3dv_cl
-#define __gen_address_type struct v3dv_cl_reloc
-#define __gen_address_offset(reloc) (((reloc)->bo ? (reloc)->bo->offset : 0) + \
-                                     (reloc)->offset)
-#define __gen_emit_reloc cl_pack_emit_reloc
-#define __gen_unpack_address(cl, s, e) __unpack_address(cl, s, e)
-#include "v3dv_cl.h"
-
 #include "vk_alloc.h"
 #include "simulator/v3d_simulator.h"
+#include "v3dv_cl.h"
 
-/* FIXME: pipe_box from Gallium. Needed for some v3d_tiling.c functions.
- * In the future we might want to drop that dependency, but for now it is
- * good enough.
- */
-#include "util/u_box.h"
 
 #include "wsi_common.h"
 
-#include "broadcom/cle/v3dx_pack.h"
-
 /* A non-fatal assert. Useful for debugging.
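 *
 * Usage sketch: v3dv_assert(job->bo_count > 0); unlike assert(), a failure
 * here is reported but execution continues, which is handy when debugging
 * command recording.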
*/ #ifdef DEBUG #define v3dv_assert(x) ({ \ @@ -124,6 +112,9 @@ struct v3dv_instance; struct v3d_simulator_file; +/* Minimum required by the Vulkan 1.1 spec */ +#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) + struct v3dv_physical_device { struct vk_physical_device vk; @@ -132,6 +123,15 @@ struct v3dv_physical_device { int32_t display_fd; int32_t master_fd; + /* We need these because it is not clear how to detect + * valid devids in a portable way + */ + bool has_primary; + bool has_render; + + dev_t primary_devid; + dev_t render_devid; + uint8_t driver_build_sha1[20]; uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; @@ -163,6 +163,8 @@ VkResult v3dv_physical_device_acquire_display(struct v3dv_instance *instance, VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device); void v3dv_wsi_finish(struct v3dv_physical_device *physical_device); +struct v3dv_image *v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, + uint32_t index); void v3dv_meta_clear_init(struct v3dv_device *device); void v3dv_meta_clear_finish(struct v3dv_device *device); @@ -173,6 +175,10 @@ void v3dv_meta_blit_finish(struct v3dv_device *device); void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device); void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); +bool v3dv_meta_can_use_tlb(struct v3dv_image *image, + const VkOffset3D *offset, + VkFormat *compat_format); + struct v3dv_instance { struct vk_instance vk; @@ -214,10 +220,9 @@ struct v3dv_queue_submit_wait_info { }; struct v3dv_queue { - struct vk_object_base base; + struct vk_queue vk; struct v3dv_device *device; - VkDeviceQueueCreateFlags flags; /* A list of active v3dv_queue_submit_wait_info */ struct list_head submit_wait_list; @@ -229,7 +234,7 @@ struct v3dv_queue { }; #define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) -#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (2 * sizeof(uint32_t) + \ +#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (3 * sizeof(uint32_t) + \ sizeof(VkComponentMapping)) struct v3dv_meta_color_clear_pipeline { @@ -273,6 +278,7 @@ struct v3dv_pipeline_key { } color_fmt[V3D_MAX_DRAW_BUFFERS]; uint8_t f32_color_rb; uint32_t va_swap_rb_mask; + bool has_multiview; }; struct v3dv_pipeline_cache_stats { @@ -285,22 +291,26 @@ struct v3dv_pipeline_cache_stats { * * FIXME: perhaps move to common */ -typedef enum { +enum broadcom_shader_stage { BROADCOM_SHADER_VERTEX, BROADCOM_SHADER_VERTEX_BIN, + BROADCOM_SHADER_GEOMETRY, + BROADCOM_SHADER_GEOMETRY_BIN, BROADCOM_SHADER_FRAGMENT, BROADCOM_SHADER_COMPUTE, -} broadcom_shader_stage; +}; #define BROADCOM_SHADER_STAGES (BROADCOM_SHADER_COMPUTE + 1) /* Assumes that coordinate shaders will be custom-handled by the caller */ -static inline broadcom_shader_stage +static inline enum broadcom_shader_stage gl_shader_stage_to_broadcom(gl_shader_stage stage) { switch (stage) { case MESA_SHADER_VERTEX: return BROADCOM_SHADER_VERTEX; + case MESA_SHADER_GEOMETRY: + return BROADCOM_SHADER_GEOMETRY; case MESA_SHADER_FRAGMENT: return BROADCOM_SHADER_FRAGMENT; case MESA_SHADER_COMPUTE: @@ -311,12 +321,15 @@ gl_shader_stage_to_broadcom(gl_shader_stage stage) } static inline gl_shader_stage -broadcom_shader_stage_to_gl(broadcom_shader_stage stage) +broadcom_shader_stage_to_gl(enum broadcom_shader_stage stage) { switch (stage) { case BROADCOM_SHADER_VERTEX: case BROADCOM_SHADER_VERTEX_BIN: return MESA_SHADER_VERTEX; + case BROADCOM_SHADER_GEOMETRY: + case BROADCOM_SHADER_GEOMETRY_BIN: + return MESA_SHADER_GEOMETRY; case 
BROADCOM_SHADER_FRAGMENT: return MESA_SHADER_FRAGMENT; case BROADCOM_SHADER_COMPUTE: @@ -326,6 +339,56 @@ broadcom_shader_stage_to_gl(broadcom_shader_stage stage) } } +static inline bool +broadcom_shader_stage_is_binning(enum broadcom_shader_stage stage) +{ + switch (stage) { + case BROADCOM_SHADER_VERTEX_BIN: + case BROADCOM_SHADER_GEOMETRY_BIN: + return true; + default: + return false; + } +} + +static inline bool +broadcom_shader_stage_is_render_with_binning(enum broadcom_shader_stage stage) +{ + switch (stage) { + case BROADCOM_SHADER_VERTEX: + case BROADCOM_SHADER_GEOMETRY: + return true; + default: + return false; + } +} + +static inline enum broadcom_shader_stage +broadcom_binning_shader_stage_for_render_stage(enum broadcom_shader_stage stage) +{ + switch (stage) { + case BROADCOM_SHADER_VERTEX: + return BROADCOM_SHADER_VERTEX_BIN; + case BROADCOM_SHADER_GEOMETRY: + return BROADCOM_SHADER_GEOMETRY_BIN; + default: + unreachable("Invalid shader stage"); + } +} + +static inline const char * +broadcom_shader_stage_name(enum broadcom_shader_stage stage) +{ + switch(stage) { + case BROADCOM_SHADER_VERTEX_BIN: + return "MESA_SHADER_VERTEX_BIN"; + case BROADCOM_SHADER_GEOMETRY_BIN: + return "MESA_SHADER_GEOMETRY_BIN"; + default: + return gl_shader_stage_name(broadcom_shader_stage_to_gl(stage)); + } +} + struct v3dv_pipeline_cache { struct vk_object_base base; @@ -337,6 +400,9 @@ struct v3dv_pipeline_cache { struct hash_table *cache; struct v3dv_pipeline_cache_stats stats; + + /* For VK_EXT_pipeline_creation_cache_control. */ + bool externally_synchronized; }; struct v3dv_device { @@ -441,34 +507,6 @@ struct v3dv_format { bool supports_filtering; }; -/** - * Tiling mode enum used for v3d_resource.c, which maps directly to the Memory - * Format field of render target and Z/Stencil config. - */ -enum v3d_tiling_mode { - /* Untiled resources. Not valid as texture inputs. */ - VC5_TILING_RASTER, - - /* Single line of u-tiles. */ - VC5_TILING_LINEARTILE, - - /* Departure from standard 4-UIF block column format. */ - VC5_TILING_UBLINEAR_1_COLUMN, - - /* Departure from standard 4-UIF block column format. */ - VC5_TILING_UBLINEAR_2_COLUMN, - - /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is - * split 2x2 into utiles. - */ - VC5_TILING_UIF_NO_XOR, - - /* Normal tiling format: grouped in 4x4 UIFblocks, each of which is - * split 2x2 into utiles. - */ - VC5_TILING_UIF_XOR, -}; - struct v3d_resource_slice { uint32_t offset; uint32_t stride; @@ -484,56 +522,43 @@ struct v3d_resource_slice { }; struct v3dv_image { - struct vk_object_base base; - - VkImageType type; - VkImageAspectFlags aspects; - - VkExtent3D extent; - uint32_t levels; - uint32_t array_size; - uint32_t samples; - VkImageUsageFlags usage; - VkImageCreateFlags flags; - VkImageTiling tiling; + struct vk_image vk; - VkFormat vk_format; const struct v3dv_format *format; - uint32_t cpp; - - uint64_t drm_format_mod; bool tiled; struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; uint64_t size; /* Total size in bytes */ uint32_t cube_map_stride; - uint32_t alignment; struct v3dv_device_memory *mem; VkDeviceSize mem_offset; + uint32_t alignment; }; VkImageViewType v3dv_image_type_to_view_type(VkImageType type); -struct v3dv_image_view { - struct vk_object_base base; +/* Pre-generating packets needs to consider changes in packet sizes across hw + * versions. Keep things simple and allocate enough space for any supported + * version. We ensure the size is large enough through static asserts. 
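+ *
+ * A sketch of the per-version check this relies on, assuming the
+ * version-specific code still sees cl_packet_length():
+ *
+ *    STATIC_ASSERT(V3DV_BLEND_CFG_LENGTH >= cl_packet_length(BLEND_CFG));
+ *
+ * with one such assert per prepacked packet type defined below.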
+ */
+#define V3DV_TEXTURE_SHADER_STATE_LENGTH 32
+#define V3DV_SAMPLER_STATE_LENGTH 24
+#define V3DV_BLEND_CFG_LENGTH 5
+#define V3DV_CFG_BITS_LENGTH 4
+#define V3DV_GL_SHADER_STATE_RECORD_LENGTH 36
+#define V3DV_VCM_CACHE_SIZE_LENGTH 2
+#define V3DV_GL_SHADER_STATE_ATTRIBUTE_RECORD_LENGTH 16
+#define V3DV_STENCIL_CFG_LENGTH 6
 
-   const struct v3dv_image *image;
-   VkImageAspectFlags aspects;
-   VkExtent3D extent;
-   VkImageViewType type;
+struct v3dv_image_view {
+   struct vk_image_view vk;
 
-   VkFormat vk_format;
    const struct v3dv_format *format;
    bool swap_rb;
    uint32_t internal_bpp;
    uint32_t internal_type;
-
-   uint32_t base_level;
-   uint32_t max_level;
-   uint32_t first_layer;
-   uint32_t last_layer;
    uint32_t offset;
 
    /* Precomputed (composed from createinfo->components and format swizzle)
@@ -552,7 +577,7 @@ struct v3dv_image_view {
    * we generate two states and select the one to use based on the descriptor
    * type.
    */
-   uint8_t texture_shader_state[2][cl_packet_length(TEXTURE_SHADER_STATE)];
+   uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH];
 };
 
 uint32_t v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer);
@@ -571,7 +596,7 @@ struct v3dv_buffer {
 
 struct v3dv_buffer_view {
    struct vk_object_base base;
 
-   const struct v3dv_buffer *buffer;
+   struct v3dv_buffer *buffer;
 
    VkFormat vk_format;
    const struct v3dv_format *format;
@@ -583,7 +608,7 @@ struct v3dv_buffer_view {
    uint32_t num_elements;
 
    /* Prepacked TEXTURE_SHADER_STATE. */
-   uint8_t texture_shader_state[cl_packet_length(TEXTURE_SHADER_STATE)];
+   uint8_t texture_shader_state[V3DV_TEXTURE_SHADER_STATE_LENGTH];
 };
 
 struct v3dv_subpass_attachment {
@@ -601,20 +626,33 @@ struct v3dv_subpass {
 
    struct v3dv_subpass_attachment ds_attachment;
 
-   bool has_srgb_rt;
-
    /* If we need to emit the clear of the depth/stencil attachment using
    * a draw call instead of using the TLB (GFXH-1461).
    */
    bool do_depth_clear_with_draw;
    bool do_stencil_clear_with_draw;
+
+   /* Multiview */
+   uint32_t view_mask;
 };
 
 struct v3dv_render_pass_attachment {
    VkAttachmentDescription desc;
+
    uint32_t first_subpass;
    uint32_t last_subpass;
 
+   /* When multiview is enabled, we no longer care about when a particular
+    * attachment is first or last used in a render pass, since not all views
+    * in the attachment will meet that criterion. Instead, we need to track
+    * each individual view (layer) in each attachment and emit our stores,
+    * loads and clears accordingly.
+    */
+   struct {
+      uint32_t first_subpass;
+      uint32_t last_subpass;
+   } views[MAX_MULTIVIEW_VIEW_COUNT];
+
    /* If this is a multisampled attachment that is going to be resolved,
    * whether we can use the TLB resolve on store.
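    * (TLB resolve means the samples are resolved directly as the tile
    * buffer is written back on store, avoiding a separate resolve job on
    * this tile-based hardware.)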
*/ @@ -624,6 +662,8 @@ struct v3dv_render_pass_attachment { struct v3dv_render_pass { struct vk_object_base base; + bool multiview_enabled; + uint32_t attachment_count; struct v3dv_render_pass_attachment *attachments; @@ -677,10 +717,12 @@ void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *f const struct v3dv_subpass *subpass, uint8_t *max_bpp, bool *msaa); -bool v3dv_subpass_area_is_tile_aligned(const VkRect2D *area, +bool v3dv_subpass_area_is_tile_aligned(struct v3dv_device *device, + const VkRect2D *area, struct v3dv_framebuffer *fb, struct v3dv_render_pass *pass, uint32_t subpass_idx); + struct v3dv_cmd_pool { struct vk_object_base base; @@ -711,11 +753,6 @@ struct v3dv_cmd_buffer_attachment_state { union v3dv_clear_value clear_value; }; -void v3dv_get_hw_clear_color(const VkClearColorValue *color, - uint32_t internal_type, - uint32_t internal_size, - uint32_t *hw_color); - struct v3dv_viewport_state { uint32_t count; VkViewport viewports[MAX_VIEWPORTS]; @@ -740,7 +777,8 @@ enum v3dv_dynamic_state_bits { V3DV_DYNAMIC_BLEND_CONSTANTS = 1 << 5, V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, - V3DV_DYNAMIC_ALL = (1 << 8) - 1, + V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, + V3DV_DYNAMIC_ALL = (1 << 9) - 1, }; /* Flags for dirty pipeline state. @@ -762,6 +800,8 @@ enum v3dv_cmd_dirty_bits { V3DV_CMD_DIRTY_OCCLUSION_QUERY = 1 << 13, V3DV_CMD_DIRTY_DEPTH_BIAS = 1 << 14, V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 15, + V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 16, + V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 17, }; struct v3dv_dynamic_state { @@ -799,6 +839,8 @@ struct v3dv_dynamic_state { } depth_bias; float line_width; + + uint32_t color_write_enable; }; extern const struct v3dv_dynamic_state default_dynamic_state; @@ -808,10 +850,10 @@ void v3dv_viewport_compute_xform(const VkViewport *viewport, float translate[3]); enum v3dv_ez_state { - VC5_EZ_UNDECIDED = 0, - VC5_EZ_GT_GE, - VC5_EZ_LT_LE, - VC5_EZ_DISABLED, + V3D_EZ_UNDECIDED = 0, + V3D_EZ_GT_GE, + V3D_EZ_LT_LE, + V3D_EZ_DISABLED, }; enum v3dv_job_type { @@ -824,7 +866,6 @@ enum v3dv_job_type { V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, V3DV_JOB_TYPE_CPU_SET_EVENT, V3DV_JOB_TYPE_CPU_WAIT_EVENTS, - V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS, V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, V3DV_JOB_TYPE_CPU_CSD_INDIRECT, V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, @@ -839,6 +880,9 @@ struct v3dv_reset_query_cpu_job_info { struct v3dv_end_query_cpu_job_info { struct v3dv_query_pool *pool; uint32_t query; + + /* This is one unless multiview is used */ + uint32_t count; }; struct v3dv_copy_query_results_cpu_job_info { @@ -865,13 +909,6 @@ struct v3dv_event_wait_cpu_job_info { bool sem_wait; }; -struct v3dv_clear_attachments_cpu_job_info { - uint32_t attachment_count; - VkClearAttachment attachments[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ - uint32_t rect_count; - VkClearRect *rects; -}; - struct v3dv_copy_buffer_to_image_cpu_job_info { struct v3dv_image *image; struct v3dv_buffer *buffer; @@ -897,6 +934,9 @@ struct v3dv_csd_indirect_cpu_job_info { struct v3dv_timestamp_query_cpu_job_info { struct v3dv_query_pool *pool; uint32_t query; + + /* This is one unless multiview is used */ + uint32_t count; }; struct v3dv_job { @@ -924,6 +964,7 @@ struct v3dv_job { */ struct set *bos; uint32_t bo_count; + uint64_t bo_handle_mask; struct v3dv_bo *tile_alloc; struct v3dv_bo *tile_state; @@ -975,7 +1016,6 @@ struct v3dv_job { struct v3dv_copy_query_results_cpu_job_info query_copy_results; struct v3dv_event_set_cpu_job_info event_set; struct 
v3dv_event_wait_cpu_job_info event_wait; - struct v3dv_clear_attachments_cpu_job_info clear_attachments; struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image; struct v3dv_csd_indirect_cpu_job_info csd_indirect; struct v3dv_timestamp_query_cpu_job_info query_timestamp; @@ -988,6 +1028,7 @@ struct v3dv_job { struct { struct v3dv_bo *shared_memory; uint32_t wg_count[3]; + uint32_t wg_base[3]; struct drm_v3d_submit_csd submit; } csd; }; @@ -998,20 +1039,47 @@ void v3dv_job_init(struct v3dv_job *job, struct v3dv_cmd_buffer *cmd_buffer, int32_t subpass_idx); void v3dv_job_destroy(struct v3dv_job *job); + void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo); -void v3dv_job_emit_binning_flush(struct v3dv_job *job); +void v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo); + void v3dv_job_start_frame(struct v3dv_job *job, uint32_t width, uint32_t height, uint32_t layers, + bool allocate_tile_state_for_all_layers, uint32_t render_target_count, uint8_t max_internal_bpp, bool msaa); + +struct v3dv_job * +v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job, + struct v3dv_cmd_buffer *cmd_buffer); + struct v3dv_job *v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device, enum v3dv_job_type type, struct v3dv_cmd_buffer *cmd_buffer, uint32_t subpass_idx); +void +v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t slot_size, + uint32_t used_count, + uint32_t *alloc_count, + void **ptr); + +void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer); + +/* FIXME: only used on v3dv_cmd_buffer and v3dvx_cmd_buffer, perhaps move to a + * cmd_buffer specific header? + */ +struct v3dv_draw_info { + uint32_t vertex_count; + uint32_t instance_count; + uint32_t first_vertex; + uint32_t first_instance; +}; + struct v3dv_vertex_binding { struct v3dv_buffer *buffer; VkDeviceSize offset; @@ -1043,7 +1111,10 @@ struct v3dv_cmd_buffer_state { struct v3dv_cmd_pipeline_state compute; struct v3dv_dynamic_state dynamic; + uint32_t dirty; + VkShaderStageFlagBits dirty_descriptor_stages; + VkShaderStageFlagBits dirty_push_constants_stages; /* Current clip window. We use this to check whether we have an active * scissor, since in that case we can't use TLB clears and need to fallback @@ -1075,9 +1146,14 @@ struct v3dv_cmd_buffer_state { struct { struct v3dv_cl_reloc vs_bin; struct v3dv_cl_reloc vs; + struct v3dv_cl_reloc gs_bin; + struct v3dv_cl_reloc gs; struct v3dv_cl_reloc fs; } uniforms; + /* Current view index for multiview rendering */ + uint32_t view_index; + /* Used to flag OOM conditions during command buffer recording */ bool oom; @@ -1126,10 +1202,13 @@ struct v3dv_cmd_buffer_state { struct v3dv_end_query_cpu_job_info *states; } end; - /* This is not NULL if we have an active query, that is, we have called - * vkCmdBeginQuery but not vkCmdEndQuery. + /* This BO is not NULL if we have an active query, that is, we have + * called vkCmdBeginQuery but not vkCmdEndQuery. */ - struct v3dv_bo *active_query; + struct { + struct v3dv_bo *bo; + uint32_t offset; + } active_query; } query; }; @@ -1160,44 +1239,24 @@ struct v3dv_descriptor { }; }; -/* The following v3dv_xxx_descriptor structs represent descriptor info that we - * upload to a bo, specifically a subregion of the descriptor pool bo. - * - * The general rule that we apply right now to decide which info goes to such - * bo is that we upload those that are referenced by an address when emitting - * a packet, so needed to be uploaded to an bo in any case. 
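- * (cl_aligned_packet_length(P, 32), used in these structs, is in effect
- * align(cl_packet_length(P), 32), i.e. (len + 31) & ~31, padding each
- * packed state to a 32-byte boundary; see the alignment note below.)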
- * - * Note that these structs are mostly helpers that improve the semantics when - * doing all that, but we could do as other mesa vulkan drivers and just - * upload the info we know it is expected based on the context. - * - * Also note that the sizes are aligned, as there is an alignment requirement - * for addresses. - */ -struct v3dv_sampled_image_descriptor { - uint8_t texture_state[cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32)]; -}; - -struct v3dv_sampler_descriptor { - uint8_t sampler_state[cl_aligned_packet_length(SAMPLER_STATE, 32)]; -}; - -struct v3dv_combined_image_sampler_descriptor { - uint8_t texture_state[cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32)]; - uint8_t sampler_state[cl_aligned_packet_length(SAMPLER_STATE, 32)]; -}; - struct v3dv_query { bool maybe_available; union { - struct v3dv_bo *bo; /* Used by GPU queries (occlusion) */ - uint64_t value; /* Used by CPU queries (timestamp) */ + /* Used by GPU queries (occlusion) */ + struct { + struct v3dv_bo *bo; + uint32_t offset; + }; + /* Used by CPU queries (timestamp) */ + uint64_t value; }; }; struct v3dv_query_pool { struct vk_object_base base; + struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */ + VkQueryType query_type; uint32_t query_count; struct v3dv_query *queries; @@ -1221,7 +1280,7 @@ struct v3dv_cmd_buffer_private_obj { }; struct v3dv_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct v3dv_device *device; @@ -1293,12 +1352,6 @@ void v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dirty_dynamic_state, bool needs_subpass_resume); -void v3dv_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp); - void v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_query_pool *pool, uint32_t first, @@ -1338,8 +1391,8 @@ struct v3dv_semaphore { /* A syncobject handle associated with this semaphore */ uint32_t sync; - /* The file handle of a fence that we imported into our syncobject */ - int32_t fd; + /* A temporary syncobject handle produced from a vkImportSemaphoreFd. */ + uint32_t temp_sync; }; struct v3dv_fence { @@ -1348,8 +1401,8 @@ struct v3dv_fence { /* A syncobject handle associated with this fence */ uint32_t sync; - /* The file handle of a fence that we imported into our syncobject */ - int32_t fd; + /* A temporary syncobject handle produced from a vkImportFenceFd. */ + uint32_t temp_sync; }; struct v3dv_event { @@ -1358,11 +1411,12 @@ struct v3dv_event { }; struct v3dv_shader_variant { - broadcom_shader_stage stage; + enum broadcom_shader_stage stage; union { struct v3d_prog_data *base; struct v3d_vs_prog_data *vs; + struct v3d_gs_prog_data *gs; struct v3d_fs_prog_data *fs; struct v3d_compute_prog_data *cs; } prog_data; @@ -1397,7 +1451,7 @@ struct v3dv_shader_variant { struct v3dv_pipeline_stage { struct v3dv_pipeline *pipeline; - broadcom_shader_stage stage; + enum broadcom_shader_stage stage; const struct vk_shader_module *module; const char *entrypoint; @@ -1410,20 +1464,8 @@ struct v3dv_pipeline_stage { /** A name for this program, so you can track it in shader-db output. */ uint32_t program_id; -}; -/* FIXME: although the full vpm_config is not required at this point, as we - * don't plan to initially support GS, it is more readable and serves as a - * placeholder, to have the struct and fill it with default values. 
- */ -struct vpm_config { - uint32_t As; - uint32_t Vc; - uint32_t Gs; - uint32_t Gd; - uint32_t Gv; - uint32_t Ve; - uint32_t gs_width; + VkPipelineCreationFeedbackEXT feedback; }; /* We are using the descriptor pool entry for two things: @@ -1590,9 +1632,48 @@ struct v3dv_sampler { * configuration. If needed it will be copied to the descriptor info during * UpdateDescriptorSets */ - uint8_t sampler_state[cl_packet_length(SAMPLER_STATE)]; + uint8_t sampler_state[V3DV_SAMPLER_STATE_LENGTH]; +}; + +struct v3dv_descriptor_template_entry { + /* The type of descriptor in this entry */ + VkDescriptorType type; + + /* Binding in the descriptor set */ + uint32_t binding; + + /* Offset at which to write into the descriptor set binding */ + uint32_t array_element; + + /* Number of elements to write into the descriptor set binding */ + uint32_t array_count; + + /* Offset into the user provided data */ + size_t offset; + + /* Stride between elements into the user provided data */ + size_t stride; }; +struct v3dv_descriptor_update_template { + struct vk_object_base base; + + VkPipelineBindPoint bind_point; + + /* The descriptor set this template corresponds to. This value is only + * valid if the template was created with the templateType + * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET. + */ + uint8_t set; + + /* Number of entries in this template */ + uint32_t entry_count; + + /* Entries of the template */ + struct v3dv_descriptor_template_entry entries[0]; +}; + + /* We keep two special values for the sampler idx that represents exactly when a * sampler is not needed/provided. The main use is that even if we don't have * sampler, we still need to do the output unpacking (through @@ -1633,6 +1714,13 @@ v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key, *sampler_index = sampler; } +struct v3dv_descriptor_maps { + struct v3dv_descriptor_map ubo_map; + struct v3dv_descriptor_map ssbo_map; + struct v3dv_descriptor_map sampler_map; + struct v3dv_descriptor_map texture_map; +}; + /* The structure represents data shared between different objects, like the * pipeline and the pipeline cache, so we ref count it to know when it should * be freed. @@ -1642,11 +1730,7 @@ struct v3dv_pipeline_shared_data { unsigned char sha1_key[20]; - struct v3dv_descriptor_map ubo_map; - struct v3dv_descriptor_map ssbo_map; - struct v3dv_descriptor_map sampler_map; - struct v3dv_descriptor_map texture_map; - + struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES]; struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES]; struct v3dv_bo *assembly_bo; @@ -1662,14 +1746,20 @@ struct v3dv_pipeline { struct v3dv_render_pass *pass; struct v3dv_subpass *subpass; - /* Note: We can't use just a MESA_SHADER_STAGES array as we need to track - * too the coordinate shader + /* Note: We can't use just a MESA_SHADER_STAGES array because we also need + * to track binning shaders. Note these will be freed once the pipeline + * has been compiled. 
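+    * (The *_bin entries hold the binning-pass counterparts of vs/gs: the
+    * binner runs its own coordinate-shader variant of the vertex pipeline,
+    * which is why those stages are tracked separately here.)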
*/ struct v3dv_pipeline_stage *vs; struct v3dv_pipeline_stage *vs_bin; + struct v3dv_pipeline_stage *gs; + struct v3dv_pipeline_stage *gs_bin; struct v3dv_pipeline_stage *fs; struct v3dv_pipeline_stage *cs; + /* Flags for whether optional pipeline stages are present, for convenience */ + bool has_gs; + /* Spilling memory requirements */ struct { struct v3dv_bo *bo; @@ -1736,7 +1826,7 @@ struct v3dv_pipeline { /* Per-RT bit mask with blend enables */ uint8_t enables; /* Per-RT prepacked blend config packets */ - uint8_t cfg[V3D_MAX_DRAW_BUFFERS][cl_packet_length(BLEND_CFG)]; + uint8_t cfg[V3D_MAX_DRAW_BUFFERS][V3DV_BLEND_CFG_LENGTH]; /* Flag indicating whether the blend factors in use require * color constants. */ @@ -1753,12 +1843,12 @@ struct v3dv_pipeline { /* Packets prepacked during pipeline creation */ - uint8_t cfg_bits[cl_packet_length(CFG_BITS)]; - uint8_t shader_state_record[cl_packet_length(GL_SHADER_STATE_RECORD)]; - uint8_t vcm_cache_size[cl_packet_length(VCM_CACHE_SIZE)]; - uint8_t vertex_attrs[cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD) * + uint8_t cfg_bits[V3DV_CFG_BITS_LENGTH]; + uint8_t shader_state_record[V3DV_GL_SHADER_STATE_RECORD_LENGTH]; + uint8_t vcm_cache_size[V3DV_VCM_CACHE_SIZE_LENGTH]; + uint8_t vertex_attrs[V3DV_GL_SHADER_STATE_ATTRIBUTE_RECORD_LENGTH * MAX_VERTEX_ATTRIBS]; - uint8_t stencil_cfg[2][cl_packet_length(STENCIL_CFG)]; + uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; static inline VkPipelineBindPoint @@ -1782,82 +1872,9 @@ v3dv_cmd_buffer_get_descriptor_state(struct v3dv_cmd_buffer *cmd_buffer, const nir_shader_compiler_options *v3dv_pipeline_get_nir_options(void); -static inline uint32_t -v3dv_zs_buffer_from_aspect_bits(VkImageAspectFlags aspects) -{ - const VkImageAspectFlags zs_aspects = - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - const VkImageAspectFlags filtered_aspects = aspects & zs_aspects; - - if (filtered_aspects == zs_aspects) - return ZSTENCIL; - else if (filtered_aspects == VK_IMAGE_ASPECT_DEPTH_BIT) - return Z; - else if (filtered_aspects == VK_IMAGE_ASPECT_STENCIL_BIT) - return STENCIL; - else - return NONE; -} - -static inline uint32_t -v3dv_zs_buffer_from_vk_format(VkFormat format) -{ - switch (format) { - case VK_FORMAT_D16_UNORM_S8_UINT: - case VK_FORMAT_D24_UNORM_S8_UINT: - case VK_FORMAT_D32_SFLOAT_S8_UINT: - return ZSTENCIL; - case VK_FORMAT_D16_UNORM: - case VK_FORMAT_D32_SFLOAT: - case VK_FORMAT_X8_D24_UNORM_PACK32: - return Z; - case VK_FORMAT_S8_UINT: - return STENCIL; - default: - return NONE; - } -} - -static inline uint32_t -v3dv_zs_buffer(bool depth, bool stencil) -{ - if (depth && stencil) - return ZSTENCIL; - else if (depth) - return Z; - else if (stencil) - return STENCIL; - return NONE; -} - -static inline uint8_t -v3dv_get_internal_depth_type(VkFormat format) -{ - switch (format) { - case VK_FORMAT_D16_UNORM: - return V3D_INTERNAL_TYPE_DEPTH_16; - case VK_FORMAT_D32_SFLOAT: - return V3D_INTERNAL_TYPE_DEPTH_32F; - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: - return V3D_INTERNAL_TYPE_DEPTH_24; - default: - unreachable("Invalid depth format"); - break; - } -} - -uint32_t v3dv_physical_device_api_version(struct v3dv_physical_device *dev); uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev); uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev); -VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error, - const char *file, int line, - const char *format, ...); - -#define vk_error(instance, error) 
__vk_errorf(instance, error, __FILE__, __LINE__, NULL); -#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__); - #ifdef DEBUG #define v3dv_debug_ignored_stype(sType) \ fprintf(stderr, "%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) @@ -1865,33 +1882,14 @@ VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error, #define v3dv_debug_ignored_stype(sType) #endif -const struct v3dv_format *v3dv_get_format(VkFormat); -const uint8_t *v3dv_get_format_swizzle(VkFormat f); -void v3dv_get_internal_type_bpp_for_output_format(uint32_t format, uint32_t *type, uint32_t *bpp); +const uint8_t *v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f); uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable); -bool v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo, - uint32_t tex_format); const struct v3dv_format * -v3dv_get_compatible_tfu_format(const struct v3d_device_info *devinfo, +v3dv_get_compatible_tfu_format(struct v3dv_device *device, uint32_t bpp, VkFormat *out_vk_format); -bool v3dv_buffer_format_supports_features(VkFormat vk_format, +bool v3dv_buffer_format_supports_features(struct v3dv_device *device, + VkFormat vk_format, VkFormatFeatureFlags features); -bool v3dv_format_supports_tlb_resolve(const struct v3dv_format *format); - -uint32_t v3d_utile_width(int cpp); -uint32_t v3d_utile_height(int cpp); - -void v3d_load_tiled_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - enum v3d_tiling_mode tiling_format, - int cpp, uint32_t image_h, - const struct pipe_box *box); - -void v3d_store_tiled_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - enum v3d_tiling_mode tiling_format, - int cpp, uint32_t image_h, - const struct pipe_box *box); struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, @@ -1912,7 +1910,7 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, struct v3dv_shader_variant * v3dv_shader_variant_create(struct v3dv_device *device, - broadcom_shader_stage stage, + enum broadcom_shader_stage stage, struct v3d_prog_data *prog_data, uint32_t prog_data_size, uint32_t assembly_offset, @@ -1958,13 +1956,15 @@ v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state, uint32_t index); struct v3dv_cl_reloc -v3dv_descriptor_map_get_sampler_state(struct v3dv_descriptor_state *descriptor_state, +v3dv_descriptor_map_get_sampler_state(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index); struct v3dv_cl_reloc -v3dv_descriptor_map_get_texture_shader_state(struct v3dv_descriptor_state *descriptor_state, +v3dv_descriptor_map_get_texture_shader_state(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, struct v3dv_pipeline_layout *pipeline_layout, uint32_t index); @@ -1992,6 +1992,7 @@ v3dv_immutable_samplers(const struct v3dv_descriptor_set_layout *set, void v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, struct v3dv_device *device, + VkPipelineCacheCreateFlags, bool cache_enabled); void v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache); @@ -2008,7 +2009,8 @@ nir_shader* v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_shared_data * 
v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, - unsigned char sha1_key[20]); + unsigned char sha1_key[20], + bool *cache_hit); void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, @@ -2022,73 +2024,58 @@ void v3dv_shader_module_internal_init(struct v3dv_device *device, struct vk_shader_module *module, nir_shader *nir); -#define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType) _obj; \ - } - -#define V3DV_DEFINE_NONDISP_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *)(uintptr_t) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType)(uintptr_t) _obj; \ - } - #define V3DV_FROM_HANDLE(__v3dv_type, __name, __handle) \ - struct __v3dv_type *__name = __v3dv_type ## _from_handle(__handle) - -V3DV_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, VkCommandBuffer) -V3DV_DEFINE_HANDLE_CASTS(v3dv_device, VkDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_instance, VkInstance) -V3DV_DEFINE_HANDLE_CASTS(v3dv_physical_device, VkPhysicalDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_queue, VkQueue) - -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, VkCommandPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, VkBuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, VkBufferView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, VkDeviceMemory) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, VkDescriptorPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, VkDescriptorSet) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, VkDescriptorSetLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, VkEvent) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, VkFence) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, VkFramebuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, VkImage) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, VkImageView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, VkPipeline) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, VkPipelineCache) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, VkPipelineLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, VkQueryPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, VkRenderPass) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, VkSampler) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, VkSemaphore) - -/* This is defined as a macro so that it works for both - * VkImageSubresourceRange and VkImageSubresourceLayers - */ -#define v3dv_layer_count(_image, _range) \ - ((_range)->layerCount == VK_REMAINING_ARRAY_LAYERS ? \ - (_image)->array_size - (_range)->baseArrayLayer : (_range)->layerCount) - -#define v3dv_level_count(_image, _range) \ - ((_range)->levelCount == VK_REMAINING_MIP_LEVELS ? 
\
- (_image)->levels - (_range)->baseMipLevel : (_range)->levelCount)
+ VK_FROM_HANDLE(__v3dv_type, __name, __handle)
+
+VK_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, vk.base, VkCommandBuffer,
+ VK_OBJECT_TYPE_COMMAND_BUFFER)
+VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
+VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance,
+ VK_OBJECT_TYPE_INSTANCE)
+VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice,
+ VK_OBJECT_TYPE_PHYSICAL_DEVICE)
+VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, base, VkCommandPool,
+ VK_OBJECT_TYPE_COMMAND_POOL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer,
+ VK_OBJECT_TYPE_BUFFER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView,
+ VK_OBJECT_TYPE_BUFFER_VIEW)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, base, VkDeviceMemory,
+ VK_OBJECT_TYPE_DEVICE_MEMORY)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool,
+ VK_OBJECT_TYPE_DESCRIPTOR_POOL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, base, VkDescriptorSet,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, base,
+ VkDescriptorSetLayout,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, base,
+ VkDescriptorUpdateTemplate,
+ VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer,
+ VK_OBJECT_TYPE_FRAMEBUFFER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage,
+ VK_OBJECT_TYPE_IMAGE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, vk.base, VkImageView,
+ VK_OBJECT_TYPE_IMAGE_VIEW)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, base, VkPipeline,
+ VK_OBJECT_TYPE_PIPELINE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, base, VkPipelineCache,
+ VK_OBJECT_TYPE_PIPELINE_CACHE)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, base, VkPipelineLayout,
+ VK_OBJECT_TYPE_PIPELINE_LAYOUT)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, base, VkQueryPool,
+ VK_OBJECT_TYPE_QUERY_POOL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass,
+ VK_OBJECT_TYPE_RENDER_PASS)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler,
+ VK_OBJECT_TYPE_SAMPLER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, base, VkSemaphore,
+ VK_OBJECT_TYPE_SEMAPHORE)

 static inline int
 v3dv_ioctl(int fd, unsigned long request, void *arg)
@@ -2136,4 +2123,31 @@ u64_compare(const void *key1, const void *key2)
 return memcmp(key1, key2, sizeof(uint64_t)) == 0;
 }

+/* Helper to call hw ver specific functions */
+#define v3dv_X(device, thing) ({ \
+ __typeof(&v3d42_##thing) v3d_X_thing; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ v3d_X_thing = &v3d42_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ v3d_X_thing; \
+})
+
+
+/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
+ * define v3dX for each version supported, because when we compile code that
+ * is not version-specific, all version-specific macros need to be already
+ * defined.
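+ *
+ * As an illustration: on a ver 42 device (the only generation handled by
+ * v3dv_X above), v3dv_X(device, job_emit_noop) resolves to
+ * &v3d42_job_emit_noop at run time, while the compile-time fallback below
+ * makes v3dX(job_emit_noop) spell v3d42_job_emit_noop directly.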
+ */
+#ifdef v3dX
+# include "v3dvx_private.h"
+#else
+# define v3dX(x) v3d42_##x
+# include "v3dvx_private.h"
+# undef v3dX
+#endif
+
 #endif /* V3DV_PRIVATE_H */
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_query.c b/lib/mesa/src/broadcom/vulkan/v3dv_query.c
index d3100498c..5e4b92fb1 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_query.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_query.c
@@ -23,7 +23,7 @@
 #include "v3dv_private.h"
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateQueryPool(VkDevice _device,
 const VkQueryPoolCreateInfo *pCreateInfo,
 const VkAllocationCallbacks *pAllocator,
@@ -35,14 +35,11 @@ v3dv_CreateQueryPool(VkDevice _device,
 pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
 assert(pCreateInfo->queryCount > 0);
- /* FIXME: the hw allows us to allocate up to 16 queries in a single block
- * for occlussion queries so we should try to use that.
- */
 struct v3dv_query_pool *pool =
 vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
 VK_OBJECT_TYPE_QUERY_POOL);
 if (pool == NULL)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 pool->query_type = pCreateInfo->queryType;
 pool->query_count = pCreateInfo->queryCount;
@@ -53,26 +50,39 @@ v3dv_CreateQueryPool(VkDevice _device,
 pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 if (pool->queries == NULL) {
- result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- goto fail_alloc_bo_list;
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail;
+ }
+
+ if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ /* The hardware allows us to set up groups of 16 queries in consecutive
+ * 4-byte addresses, requiring only that each group of 16 queries is
+ * aligned to a 1024-byte boundary.
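+ * (For example, with this layout query index 37 would land in group
+ * 37 / 16 = 2, at byte offset 2 * 1024 + (37 % 16) * 4 = 2068 into the
+ * pool BO, matching the query_group / query_offset computation below.)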
+ */ + const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16); + const uint32_t bo_size = query_groups * 1024; + pool->bo = v3dv_bo_alloc(device, bo_size, "query", true); + if (!pool->bo) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } + if (!v3dv_bo_map(device, pool->bo, bo_size)) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail; + } } uint32_t i; for (i = 0; i < pool->query_count; i++) { pool->queries[i].maybe_available = false; switch (pool->query_type) { - case VK_QUERY_TYPE_OCCLUSION: - pool->queries[i].bo = v3dv_bo_alloc(device, 4096, "query", true); - if (!pool->queries[i].bo) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); - goto fail_alloc_bo; - } - /* For occlusion queries we only need a 4-byte counter */ - if (!v3dv_bo_map(device, pool->queries[i].bo, 4)) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); - goto fail_alloc_bo; - } + case VK_QUERY_TYPE_OCCLUSION: { + const uint32_t query_group = i / 16; + const uint32_t query_offset = query_group * 1024 + (i % 16) * 4; + pool->queries[i].bo = pool->bo; + pool->queries[i].offset = query_offset; break; + } case VK_QUERY_TYPE_TIMESTAMP: pool->queries[i].value = 0; break; @@ -85,18 +95,17 @@ v3dv_CreateQueryPool(VkDevice _device, return VK_SUCCESS; -fail_alloc_bo: - for (uint32_t j = 0; j < i; j++) - v3dv_bo_free(device, pool->queries[j].bo); - vk_free2(&device->vk.alloc, pAllocator, pool->queries); - -fail_alloc_bo_list: +fail: + if (pool->bo) + v3dv_bo_free(device, pool->bo); + if (pool->queries) + vk_free2(&device->vk.alloc, pAllocator, pool->queries); vk_object_free(&device->vk, pAllocator, pool); return result; } -void +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyQueryPool(VkDevice _device, VkQueryPool queryPool, const VkAllocationCallbacks *pAllocator) @@ -107,12 +116,12 @@ v3dv_DestroyQueryPool(VkDevice _device, if (!pool) return; - if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { - for (uint32_t i = 0; i < pool->query_count; i++) - v3dv_bo_free(device, pool->queries[i].bo); - } + if (pool->bo) + v3dv_bo_free(device, pool->bo); + + if (pool->queries) + vk_free2(&device->vk.alloc, pAllocator, pool->queries); - vk_free2(&device->vk.alloc, pAllocator, pool->queries); vk_object_free(&device->vk, pAllocator, pool); } @@ -128,12 +137,13 @@ write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value) } } -static uint64_t +static VkResult get_occlusion_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_wait, - bool *available) + bool *available, + uint64_t *value) { assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); @@ -149,25 +159,28 @@ get_occlusion_query_result(struct v3dv_device *device, * error may occur." 
*/ if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); if (!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull)) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); *available = true; } else { *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0); } - return (uint64_t) *((uint32_t *) q->bo->map); + const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset; + *value = (uint64_t) *((uint32_t *)query_addr); + return VK_SUCCESS; } -static uint64_t +static VkResult get_timestamp_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_wait, - bool *available) + bool *available, + uint64_t *value) { assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP); @@ -182,28 +195,32 @@ get_timestamp_query_result(struct v3dv_device *device, * error may occur." */ if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); *available = true; } else { *available = q->maybe_available; } - return q->value; + *value = q->value; + return VK_SUCCESS; } -static uint64_t +static VkResult get_query_result(struct v3dv_device *device, struct v3dv_query_pool *pool, uint32_t query, bool do_wait, - bool *available) + bool *available, + uint64_t *value) { switch (pool->query_type) { case VK_QUERY_TYPE_OCCLUSION: - return get_occlusion_query_result(device, pool, query, do_wait, available); + return get_occlusion_query_result(device, pool, query, do_wait, + available, value); case VK_QUERY_TYPE_TIMESTAMP: - return get_timestamp_query_result(device, pool, query, do_wait, available); + return get_timestamp_query_result(device, pool, query, do_wait, + available, value); default: unreachable("Unsupported query type"); } @@ -229,7 +246,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, VkResult result = VK_SUCCESS; for (uint32_t i = first; i < first + count; i++) { bool available = false; - uint64_t value = get_query_result(device, pool, i, do_wait, &available); + uint64_t value = 0; + VkResult query_result = + get_query_result(device, pool, i, do_wait, &available, &value); + if (query_result == VK_ERROR_DEVICE_LOST) + result = VK_ERROR_DEVICE_LOST; /** * From the Vulkan 1.0 spec: @@ -251,7 +272,7 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) write_query_result(data, slot++, do_64bit, available ? 
1u : 0u); - if (!write_result) + if (!write_result && result != VK_ERROR_DEVICE_LOST) result = VK_NOT_READY; data += stride; @@ -260,7 +281,7 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, return result; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, @@ -277,7 +298,7 @@ v3dv_GetQueryPoolResults(VkDevice _device, pData, stride, flags); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, @@ -289,7 +310,7 @@ v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, @@ -308,7 +329,7 @@ v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, dst, dstOffset, stride, flags); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, @@ -320,7 +341,7 @@ v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags); } -void +VKAPI_ATTR void VKAPI_CALL v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query) diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c index 6ea6d1acf..1209031d5 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c @@ -34,19 +34,28 @@ v3dv_clif_dump(struct v3dv_device *device, struct v3dv_job *job, struct drm_v3d_submit_cl *submit) { - if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF))) + if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN | + V3D_DEBUG_CLIF)))) return; struct clif_dump *clif = clif_dump_init(&device->devinfo, stderr, - V3D_DEBUG & V3D_DEBUG_CL); + V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN), + V3D_DEBUG & V3D_DEBUG_CL_NO_BIN); set_foreach(job->bos, entry) { struct v3dv_bo *bo = (void *)entry->key; char *name = ralloc_asprintf(NULL, "%s_0x%x", bo->name, bo->offset); - v3dv_bo_map(device, bo, bo->size); + bool ok = v3dv_bo_map(device, bo, bo->size); + if (!ok) { + fprintf(stderr, "failed to map BO for clif_dump.\n"); + ralloc_free(name); + goto free_clif; + } clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map); ralloc_free(name); @@ -54,6 +63,7 @@ v3dv_clif_dump(struct v3dv_device *device, clif_dump(clif, submit); + free_clif: clif_dump_destroy(clif); } @@ -136,7 +146,7 @@ gpu_queue_wait_idle(struct v3dv_queue *queue) return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_QueueWaitIdle(VkQueue _queue) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); @@ -163,19 +173,22 @@ handle_reset_query_cpu_job(struct v3dv_job *job) * FIXME: we could avoid blocking the main thread for this if we use * submission thread. 
*/ + if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) + v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); + for (uint32_t i = info->first; i < info->first + info->count; i++) { assert(i < info->pool->query_count); - struct v3dv_query *query = &info->pool->queries[i]; - query->maybe_available = false; + struct v3dv_query *q = &info->pool->queries[i]; + q->maybe_available = false; switch (info->pool->query_type) { case VK_QUERY_TYPE_OCCLUSION: { - v3dv_bo_wait(job->device, query->bo, PIPE_TIMEOUT_INFINITE); - uint32_t *counter = (uint32_t *) query->bo->map; + const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset; + uint32_t *counter = (uint32_t *) q_addr; *counter = 0; break; } case VK_QUERY_TYPE_TIMESTAMP: - query->value = 0; + q->value = 0; break; default: unreachable("Unsupported query type"); @@ -189,9 +202,11 @@ static VkResult handle_end_query_cpu_job(struct v3dv_job *job) { struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end; - assert(info->query < info->pool->query_count); - struct v3dv_query *query = &info->pool->queries[info->query]; - query->maybe_available = true; + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; + } return VK_SUCCESS; } @@ -208,17 +223,19 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) /* Map the entire dst buffer for the CPU copy if needed */ assert(!bo->map || bo->map_size == bo->size); if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a * sync wait on the CPU for the corresponding GPU jobs to finish. We might * want to use a submission thread to avoid blocking on the main thread. 
*/ + uint8_t *offset = ((uint8_t *) bo->map) + + info->offset + info->dst->mem_offset; v3dv_get_query_pool_results_cpu(job->device, info->pool, info->first, info->count, - bo->map + info->dst->mem_offset, + offset, info->stride, info->flags); @@ -343,7 +360,7 @@ spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread) assert(wait_thread != NULL); if (pthread_create(wait_thread, NULL, event_wait_thread_func, job)) - return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(job->device, VK_ERROR_DEVICE_LOST); return VK_NOT_READY; } @@ -396,13 +413,13 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) struct v3dv_bo *dst_bo = info->image->mem->bo; assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); void *dst_ptr = dst_bo->map; struct v3dv_bo *src_bo = info->buffer->mem->bo; assert(!src_bo->map || src_bo->map_size == src_bo->size); if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); void *src_ptr = src_bo->map; const struct v3d_resource_slice *slice = @@ -441,10 +458,14 @@ handle_timestamp_query_cpu_job(struct v3dv_job *job) /* Compute timestamp */ struct timespec t; clock_gettime(CLOCK_MONOTONIC, &t); - assert(info->query < info->pool->query_count); - struct v3dv_query *query = &info->pool->queries[info->query]; - query->maybe_available = true; - query->value = t.tv_sec * 1000000000ull + t.tv_nsec; + + for (uint32_t i = 0; i < info->count; i++) { + assert(info->query + i < info->pool->query_count); + struct v3dv_query *query = &info->pool->queries[info->query + i]; + query->maybe_available = true; + if (i == 0) + query->value = t.tv_sec * 1000000000ull + t.tv_nsec; + } return VK_SUCCESS; } @@ -471,7 +492,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); struct v3dv_bo *bo = info->buffer->mem->bo; if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); assert(bo->map); const uint32_t offset = info->buffer->mem_offset + info->offset; @@ -503,23 +524,28 @@ process_semaphores_to_signal(struct v3dv_device *device, drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd); mtx_unlock(&device->mutex); if (fd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < count; i++) { struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]); - if (sem->fd >= 0) - close(sem->fd); - sem->fd = -1; - - int ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd); - if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + int ret; + if (!sem->temp_sync) + ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd); + else + ret = drmSyncobjImportSyncFile(render_fd, sem->temp_sync, fd); - sem->fd = fd; + if (ret) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + break; + } } - return VK_SUCCESS; + assert(fd >= 0); + close(fd); + + return result; } static VkResult @@ -530,10 +556,6 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence) struct 
v3dv_fence *fence = v3dv_fence_from_handle(_fence); - if (fence->fd >= 0) - close(fence->fd); - fence->fd = -1; - int render_fd = device->pdevice->render_fd; int fd; @@ -541,15 +563,18 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence) drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd); mtx_unlock(&device->mutex); if (fd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - int ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd); - if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + int ret; + if (!fence->temp_sync) + ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd); + else + ret = drmSyncobjImportSyncFile(render_fd, fence->temp_sync, fd); - fence->fd = fd; + assert(fd >= 0); + close(fd); - return VK_SUCCESS; + return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS; } static VkResult @@ -559,7 +584,7 @@ handle_cl_job(struct v3dv_queue *queue, { struct v3dv_device *device = queue->device; - struct drm_v3d_submit_cl submit; + struct drm_v3d_submit_cl submit = { 0 }; /* Sanity check: we should only flag a bcl sync on a job that needs to be * serialized. @@ -636,7 +661,7 @@ handle_cl_job(struct v3dv_queue *queue, free(bo_handles); if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -659,7 +684,7 @@ handle_tfu_job(struct v3dv_queue *queue, if (ret != 0) { fprintf(stderr, "Failed to submit TFU job: %d\n", ret); - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); } return VK_SUCCESS; @@ -704,7 +729,7 @@ handle_csd_job(struct v3dv_queue *queue, free(bo_handles); if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -745,91 +770,6 @@ queue_submit_job(struct v3dv_queue *queue, } } -static void -emit_noop_bin(struct v3dv_job *job) -{ - v3dv_job_start_frame(job, 1, 1, 1, 1, V3D_INTERNAL_BPP_32, false); - v3dv_job_emit_binning_flush(job); -} - -static void -emit_noop_render(struct v3dv_job *job) -{ - struct v3dv_cl *rcl = &job->rcl; - v3dv_cl_ensure_space_with_branch(rcl, 200 + 1 * 256 * - cl_packet_length(SUPERTILE_COORDINATES)); - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { - config.early_z_disable = true; - config.image_width_pixels = 1; - config.image_height_pixels = 1; - config.number_of_render_targets = 1; - config.multisample_mode_4x = false; - config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; - rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; - rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; - } - - cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { - clear.z_clear_value = 1.0f; - clear.stencil_clear_value = 0; - }; - - cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { - init.use_auto_chained_tile_lists = true; - init.size_of_first_block_in_chained_tile_lists = - TILE_ALLOCATION_BLOCK_SIZE_64B; - } - - cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { - list.address = v3dv_cl_address(job->tile_alloc, 0); - } - - cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { - config.number_of_bin_tile_lists = 1; - config.total_frame_width_in_tiles = 1; - config.total_frame_height_in_tiles = 1; - config.supertile_width_in_tiles = 1; - 
config.supertile_height_in_tiles = 1; - config.total_frame_width_in_supertiles = 1; - config.total_frame_height_in_supertiles = 1; - } - - struct v3dv_cl *icl = &job->indirect; - v3dv_cl_ensure_space(icl, 200, 1); - struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(icl); - - cl_emit(icl, TILE_COORDINATES_IMPLICIT, coords); - - cl_emit(icl, END_OF_LOADS, end); - - cl_emit(icl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - - cl_emit(icl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - - cl_emit(icl, END_OF_TILE_MARKER, end); - - cl_emit(icl, RETURN_FROM_SUB_LIST, ret); - - cl_emit(rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { - branch.start = tile_list_start; - branch.end = v3dv_cl_get_address(icl); - } - - cl_emit(rcl, SUPERTILE_COORDINATES, coords) { - coords.column_number_in_supertiles = 0; - coords.row_number_in_supertiles = 0; - } - - cl_emit(rcl, END_OF_RENDERING, end); -} - static VkResult queue_create_noop_job(struct v3dv_queue *queue) { @@ -837,11 +777,10 @@ queue_create_noop_job(struct v3dv_queue *queue) queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!queue->noop_job) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1); - emit_noop_bin(queue->noop_job); - emit_noop_render(queue->noop_job); + v3dv_X(device, job_emit_noop)(queue->noop_job); return VK_SUCCESS; } @@ -1060,7 +999,7 @@ spawn_master_wait_thread(struct v3dv_queue *queue, mtx_lock(&queue->mutex); if (pthread_create(&wait_info->master_wait_thread, NULL, master_wait_thread_func, wait_info)) { - result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); + result = vk_error(queue, VK_ERROR_DEVICE_LOST); goto done; } @@ -1071,7 +1010,7 @@ done: return result; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_QueueSubmit(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, @@ -1106,7 +1045,15 @@ done: return result; } -VkResult +static void +destroy_syncobj(uint32_t device_fd, uint32_t *sync) +{ + assert(sync); + drmSyncobjDestroy(device_fd, *sync); + *sync = 0; +} + +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -1120,14 +1067,12 @@ v3dv_CreateSemaphore(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore), VK_OBJECT_TYPE_SEMAPHORE); if (sem == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - sem->fd = -1; + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync); if (ret) { vk_object_free(&device->vk, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pSemaphore = v3dv_semaphore_to_handle(sem); @@ -1135,7 +1080,158 @@ v3dv_CreateSemaphore(VkDevice _device, return VK_SUCCESS; } -void +VKAPI_ATTR void VKAPI_CALL +v3dv_GetPhysicalDeviceExternalSemaphoreProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo, + VkExternalSemaphoreProperties *pExternalSemaphoreProperties) +{ + switch (pExternalSemaphoreInfo->handleType) { + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: + 
pExternalSemaphoreProperties->exportFromImportedHandleTypes = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; + pExternalSemaphoreProperties->compatibleHandleTypes = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; + + /* FIXME: we can't import external semaphores until we improve the kernel + * submit interface to handle multiple in syncobjs, because once we have + * an imported semaphore in our list of semaphores to wait on, we can no + * longer use the workaround of waiting on the last syncobj fence produced + * from the device, since the imported semaphore may not (and in fact, it + * would typically not) have been produced from same device. + * + * This behavior is exercised via dEQP-VK.synchronization.cross_instance.*. + * Particularly, this test: + * dEQP-VK.synchronization.cross_instance.dedicated. + * write_ssbo_compute_read_vertex_input.buffer_16384_binary_semaphore_fd + * fails consistently because of this, so it'll be a good reference to + * verify the implementation when the kernel bits are in place. + */ + pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; + + /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties + * for details on why we can't export to SYNC_FD. + */ + if (pExternalSemaphoreInfo->handleType != + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { + pExternalSemaphoreProperties->externalSemaphoreFeatures |= + VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT; + } + break; + default: + pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; + pExternalSemaphoreProperties->compatibleHandleTypes = 0; + pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; + break; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_ImportSemaphoreFdKHR( + VkDevice _device, + const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); + + assert(pImportSemaphoreFdInfo->sType == + VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR); + + int fd = pImportSemaphoreFdInfo->fd; + int render_fd = device->pdevice->render_fd; + + bool is_temporary = + pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT || + (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT); + + uint32_t new_sync; + switch (pImportSemaphoreFdInfo->handleType) { + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { + /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the + * special value -1 for fd is treated like a valid sync file descriptor + * referring to an object that has already signaled. The import + * operation will succeed and the VkSemaphore will have a temporarily + * imported payload as if a valid file descriptor had been provided." + */ + unsigned flags = fd == -1 ? 
DRM_SYNCOBJ_CREATE_SIGNALED : 0; + if (drmSyncobjCreate(render_fd, flags, &new_sync)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (fd != -1) { + if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { + drmSyncobjDestroy(render_fd, new_sync); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + } + break; + } + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { + if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + break; + } + default: + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + destroy_syncobj(render_fd, &sem->temp_sync); + if (is_temporary) { + sem->temp_sync = new_sync; + } else { + destroy_syncobj(render_fd, &sem->sync); + sem->sync = new_sync; + } + + /* From the Vulkan 1.0.53 spec: + * + * "Importing a semaphore payload from a file descriptor transfers + * ownership of the file descriptor from the application to the + * Vulkan implementation. The application must not perform any + * operations on the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. + */ + if (fd != -1) + close(fd); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetSemaphoreFdKHR(VkDevice _device, + const VkSemaphoreGetFdInfoKHR *pGetFdInfo, + int *pFd) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore); + + assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR); + + *pFd = -1; + int render_fd = device->pdevice->render_fd; + switch (pGetFdInfo->handleType) { + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { + drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); + if (*pFd == -1) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + break; + case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: + drmSyncobjHandleToFD(render_fd, sem->sync, pFd); + if (*pFd == -1) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + break; + } + default: + unreachable("Unsupported external semaphore handle type"); + } + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL v3dv_DestroySemaphore(VkDevice _device, VkSemaphore semaphore, const VkAllocationCallbacks *pAllocator) @@ -1146,15 +1242,13 @@ v3dv_DestroySemaphore(VkDevice _device, if (sem == NULL) return; - drmSyncobjDestroy(device->pdevice->render_fd, sem->sync); - - if (sem->fd != -1) - close(sem->fd); + destroy_syncobj(device->pdevice->render_fd, &sem->sync); + destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync); vk_object_free(&device->vk, pAllocator, sem); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateFence(VkDevice _device, const VkFenceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -1168,7 +1262,7 @@ v3dv_CreateFence(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), VK_OBJECT_TYPE_FENCE); if (fence == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); unsigned flags = 0; if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) @@ -1176,17 +1270,136 @@ v3dv_CreateFence(VkDevice _device, int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); if (ret) { vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } - fence->fd = -1; - *pFence = v3dv_fence_to_handle(fence); return VK_SUCCESS; } -void 
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceExternalFenceProperties(
+ VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo,
+ VkExternalFenceProperties *pExternalFenceProperties)
+
+{
+ switch (pExternalFenceInfo->handleType) {
+ case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
+ case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
+ pExternalFenceProperties->exportFromImportedHandleTypes =
+ VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
+ pExternalFenceProperties->compatibleHandleTypes =
+ VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
+ pExternalFenceProperties->externalFenceFeatures =
+ VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;
+
+ /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not
+ * the syncobj itself, and that fence is only created after we have
+ * submitted to the kernel and updated the syncobj for the fence to import
+ * the actual DRM fence created with the submission. Unfortunately, if the
+ * queue submission has a 'wait for events' we may hold any jobs after the
+ * wait in a user-space thread until the events are signaled, and in that
+ * case we don't update the out fence of the submit until the events are
+ * signaled and we can submit all the jobs involved with the vkQueueSubmit
+ * call. This means that if the application submits with an out fence and
+ * a wait for events, trying to export the out fence to a SYNC_FD right
+ * after the submission and before the events are signaled will fail,
+ * because the actual DRM fence won't exist yet. This is not a problem
+ * with OPAQUE_FD because in this case we export the entire syncobj, not
+ * the underlying DRM fence. To fix this we need to rework our kernel
+ * interface to be more flexible and accept multiple in/out syncobjs so
+ * we can implement event waits as regular fence waits on the kernel side;
+ * until then, we can only reliably export OPAQUE_FD.
+ */
+ if (pExternalFenceInfo->handleType !=
+ VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) {
+ pExternalFenceProperties->externalFenceFeatures |=
+ VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
+ }
+ break;
+ default:
+ pExternalFenceProperties->exportFromImportedHandleTypes = 0;
+ pExternalFenceProperties->compatibleHandleTypes = 0;
+ pExternalFenceProperties->externalFenceFeatures = 0;
+ break;
+ }
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_ImportFenceFdKHR(VkDevice _device,
+ const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
+{
+ V3DV_FROM_HANDLE(v3dv_device, device, _device);
+ V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);
+
+ assert(pImportFenceFdInfo->sType ==
+ VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
+
+ int fd = pImportFenceFdInfo->fd;
+ int render_fd = device->pdevice->render_fd;
+
+ bool is_temporary =
+ pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT ||
+ (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);
+
+ uint32_t new_sync;
+ switch (pImportFenceFdInfo->handleType) {
+ case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
+ /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
+ * special value -1 for fd is treated like a valid sync file descriptor
+ * referring to an object that has already signaled. The import
+ * operation will succeed and the VkFence will have a temporarily
+ * imported payload as if a valid file descriptor had been provided."
+ */
+ unsigned flags = fd == -1 ?
DRM_SYNCOBJ_CREATE_SIGNALED : 0; + if (drmSyncobjCreate(render_fd, flags, &new_sync)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (fd != -1) { + if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { + drmSyncobjDestroy(render_fd, new_sync); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + } + break; + } + case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: { + if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + break; + } + default: + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + destroy_syncobj(render_fd, &fence->temp_sync); + if (is_temporary) { + fence->temp_sync = new_sync; + } else { + destroy_syncobj(render_fd, &fence->sync); + fence->sync = new_sync; + } + + /* From the Vulkan 1.0.53 spec: + * + * "Importing a fence payload from a file descriptor transfers + * ownership of the file descriptor from the application to the + * Vulkan implementation. The application must not perform any + * operations on the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. + */ + if (fd != -1) + close(fd); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL v3dv_DestroyFence(VkDevice _device, VkFence _fence, const VkAllocationCallbacks *pAllocator) @@ -1197,15 +1410,13 @@ v3dv_DestroyFence(VkDevice _device, if (fence == NULL) return; - drmSyncobjDestroy(device->pdevice->render_fd, fence->sync); - - if (fence->fd != -1) - close(fence->fd); + destroy_syncobj(device->pdevice->render_fd, &fence->sync); + destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync); vk_object_free(&device->vk, pAllocator, fence); } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) { V3DV_FROM_HANDLE(v3dv_device, device, _device); @@ -1216,11 +1427,42 @@ v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) if (ret == -ETIME) return VK_NOT_READY; else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetFenceFdKHR(VkDevice _device, + const VkFenceGetFdInfoKHR *pGetFdInfo, + int *pFd) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence); + + assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR); + + *pFd = -1; + int render_fd = device->pdevice->render_fd; + switch (pGetFdInfo->handleType) { + case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { + drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); + if (*pFd == -1) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + break; + case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: + drmSyncobjHandleToFD(render_fd, fence->sync, pFd); + if (*pFd == -1) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + break; + } + default: + unreachable("Unsupported external fence handle type"); + } + return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) { V3DV_FROM_HANDLE(v3dv_device, device, _device); @@ -1229,23 +1471,41 @@ v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + int render_fd = device->pdevice->render_fd; + uint32_t 
reset_count = 0; for (uint32_t i = 0; i < fenceCount; i++) { struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence->sync; + /* From the Vulkan spec, section 'Importing Fence Payloads': + * + * "If the import is temporary, the fence will be restored to its + * permanent state the next time that fence is passed to + * vkResetFences. + * + * Note: Restoring a fence to its prior permanent payload is a + * distinct operation from resetting a fence payload." + * + * To restore the previous state, we just need to destroy the temporary. + */ + if (fence->temp_sync) + destroy_syncobj(render_fd, &fence->temp_sync); + else + syncobjs[reset_count++] = fence->sync; } - int ret = drmSyncobjReset(device->pdevice->render_fd, syncobjs, fenceCount); + int ret = 0; + if (reset_count > 0) + ret = drmSyncobjReset(render_fd, syncobjs, reset_count); vk_free(&device->vk.alloc, syncobjs); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_WaitForFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences, @@ -1260,11 +1520,11 @@ v3dv_WaitForFences(VkDevice _device, sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < fenceCount; i++) { struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); - syncobjs[i] = fence->sync; + syncobjs[i] = fence->temp_sync ? fence->temp_sync : fence->sync; } unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; @@ -1282,16 +1542,16 @@ v3dv_WaitForFences(VkDevice _device, if (ret == -ETIME) return VK_TIMEOUT; else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } -VkResult +VKAPI_ATTR VkResult VKAPI_CALL v3dv_QueueBindSparse(VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo *pBindInfo, VkFence fence) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - return vk_error(queue->device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c index 8dd085862..47bc3a0b1 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c @@ -28,6 +28,52 @@ #include "v3dv_private.h" #include "vk_format_info.h" +/* The only version specific structure that we need is + * TMU_CONFIG_PARAMETER_1. This didn't seem to change significantly from + * previous V3D versions and we don't expect that to change, so for now let's + * just hardcode the V3D version here. + */ +#define V3D_VERSION 41 +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" + +/* Our Vulkan resource indices represent indices in descriptor maps which + * include all shader stages, so we need to size the arrays below + * accordingly. For now we only support a maximum of 3 stages: VS, GS, FS. + */ +#define MAX_STAGES 3 + +#define MAX_TOTAL_TEXTURE_SAMPLERS (V3D_MAX_TEXTURE_SAMPLERS * MAX_STAGES) +struct texture_bo_list { + struct v3dv_bo *tex[MAX_TOTAL_TEXTURE_SAMPLERS]; +}; + +/* This tracks state BOs for both textures and samplers, so we + * multiply by 2. 
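+ * With MAX_STAGES = 3, MAX_TOTAL_STATES below works out to
+ * 6 * V3D_MAX_TEXTURE_SAMPLERS entries: one texture state and one
+ * sampler state per texture/sampler unit per stage.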
+ */ +#define MAX_TOTAL_STATES (2 * V3D_MAX_TEXTURE_SAMPLERS * MAX_STAGES) +struct state_bo_list { + uint32_t count; + struct v3dv_bo *states[MAX_TOTAL_STATES]; +}; + +#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES) +#define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES) +struct buffer_bo_list { + struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS]; + struct v3dv_bo *ssbo[MAX_TOTAL_STORAGE_BUFFERS]; +}; + +static bool +state_bo_in_list(struct state_bo_list *list, struct v3dv_bo *bo) +{ + for (int i = 0; i < list->count; i++) { + if (list->states[i] == bo) + return true; + } + return false; +} + /* * This method checks if the ubo used for push constants is needed to be * updated or not. @@ -87,43 +133,56 @@ check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer, static void write_tmu_p0(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, struct v3dv_cl_out **uniforms, - uint32_t data) + uint32_t data, + struct texture_bo_list *tex_bos, + struct state_bo_list *state_bos) { uint32_t texture_idx = v3d_unit_data_get_unit(data); - struct v3dv_job *job = cmd_buffer->state.job; + struct v3dv_descriptor_state *descriptor_state = v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); /* We need to ensure that the texture bo is added to the job */ struct v3dv_bo *texture_bo = v3dv_descriptor_map_get_texture_bo(descriptor_state, - &pipeline->shared_data->texture_map, + &pipeline->shared_data->maps[stage]->texture_map, pipeline->layout, texture_idx); assert(texture_bo); - v3dv_job_add_bo(job, texture_bo); + assert(texture_idx < V3D_MAX_TEXTURE_SAMPLERS); + tex_bos->tex[texture_idx] = texture_bo; struct v3dv_cl_reloc state_reloc = - v3dv_descriptor_map_get_texture_shader_state(descriptor_state, - &pipeline->shared_data->texture_map, + v3dv_descriptor_map_get_texture_shader_state(cmd_buffer->device, descriptor_state, + &pipeline->shared_data->maps[stage]->texture_map, pipeline->layout, texture_idx); - cl_aligned_reloc(&job->indirect, uniforms, - state_reloc.bo, - state_reloc.offset + - v3d_unit_data_get_offset(data)); + cl_aligned_u32(uniforms, state_reloc.bo->offset + + state_reloc.offset + + v3d_unit_data_get_offset(data)); + + /* Texture and Sampler states are typically suballocated, so they are + * usually the same BO: only flag them once to avoid trying to add them + * multiple times to the job later. 
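+ * (The same check, via state_bo_in_list above, is applied to the sampler
+ * state BO in write_tmu_p1 below.)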
+ */ + if (!state_bo_in_list(state_bos, state_reloc.bo)) { + assert(state_bos->count < 2 * V3D_MAX_TEXTURE_SAMPLERS); + state_bos->states[state_bos->count++] = state_reloc.bo; + } } /** V3D 4.x TMU configuration parameter 1 (sampler) */ static void write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, struct v3dv_cl_out **uniforms, - uint32_t data) + uint32_t data, + struct state_bo_list *state_bos) { uint32_t sampler_idx = v3d_unit_data_get_unit(data); - struct v3dv_job *job = cmd_buffer->state.job; struct v3dv_descriptor_state *descriptor_state = v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); @@ -131,13 +190,13 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer, sampler_idx != V3DV_NO_SAMPLER_32BIT_IDX); struct v3dv_cl_reloc sampler_state_reloc = - v3dv_descriptor_map_get_sampler_state(descriptor_state, - &pipeline->shared_data->sampler_map, + v3dv_descriptor_map_get_sampler_state(cmd_buffer->device, descriptor_state, + &pipeline->shared_data->maps[stage]->sampler_map, pipeline->layout, sampler_idx); const struct v3dv_sampler *sampler = v3dv_descriptor_map_get_sampler(descriptor_state, - &pipeline->shared_data->sampler_map, + &pipeline->shared_data->maps[stage]->sampler_map, pipeline->layout, sampler_idx); assert(sampler); @@ -151,26 +210,36 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer, &p1_unpacked); } - cl_aligned_reloc(&job->indirect, uniforms, - sampler_state_reloc.bo, - sampler_state_reloc.offset + - p1_packed); + cl_aligned_u32(uniforms, sampler_state_reloc.bo->offset + + sampler_state_reloc.offset + + p1_packed); + + /* Texture and Sampler states are typically suballocated, so they are + * usually the same BO: only flag them once to avoid trying to add them + * multiple times to the job later. + */ + if (!state_bo_in_list(state_bos, sampler_state_reloc.bo)) { + assert(state_bos->count < 2 * V3D_MAX_TEXTURE_SAMPLERS); + state_bos->states[state_bos->count++] = sampler_state_reloc.bo; + } } static void write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, struct v3dv_cl_out **uniforms, enum quniform_contents content, - uint32_t data) + uint32_t data, + struct buffer_bo_list *buffer_bos) { - struct v3dv_job *job = cmd_buffer->state.job; struct v3dv_descriptor_state *descriptor_state = v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); struct v3dv_descriptor_map *map = content == QUNIFORM_UBO_ADDR || content == QUNIFORM_GET_UBO_SIZE ? - &pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map; + &pipeline->shared_data->maps[stage]->ubo_map : + &pipeline->shared_data->maps[stage]->ssbo_map; uint32_t offset = content == QUNIFORM_UBO_ADDR ? @@ -193,10 +262,10 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, &cmd_buffer->push_constants_resource; assert(resource->bo); - cl_aligned_reloc(&job->indirect, uniforms, - resource->bo, - resource->offset + offset + dynamic_offset); - + cl_aligned_u32(uniforms, resource->bo->offset + + resource->offset + + offset + dynamic_offset); + buffer_bos->ubo[0] = resource->bo; } else { uint32_t index = content == QUNIFORM_UBO_ADDR ? 
@@ -216,10 +285,18 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, content == QUNIFORM_GET_UBO_SIZE) { cl_aligned_u32(uniforms, descriptor->range); } else { - cl_aligned_reloc(&job->indirect, uniforms, - descriptor->buffer->mem->bo, - descriptor->buffer->mem_offset + - descriptor->offset + offset + dynamic_offset); + cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset + + descriptor->buffer->mem_offset + + descriptor->offset + + offset + dynamic_offset); + + if (content == QUNIFORM_UBO_ADDR) { + assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS); + buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo; + } else { + assert(index < MAX_TOTAL_STORAGE_BUFFERS); + buffer_bos->ssbo[index] = descriptor->buffer->mem->bo; + } } } } @@ -235,26 +312,26 @@ get_texture_size_from_image_view(struct v3dv_image_view *image_view, /* We don't u_minify the values, as we are using the image_view * extents */ - return image_view->extent.width; + return image_view->vk.extent.width; case QUNIFORM_IMAGE_HEIGHT: case QUNIFORM_TEXTURE_HEIGHT: - return image_view->extent.height; + return image_view->vk.extent.height; case QUNIFORM_IMAGE_DEPTH: case QUNIFORM_TEXTURE_DEPTH: - return image_view->extent.depth; + return image_view->vk.extent.depth; case QUNIFORM_IMAGE_ARRAY_SIZE: case QUNIFORM_TEXTURE_ARRAY_SIZE: - if (image_view->type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { - return image_view->last_layer - image_view->first_layer + 1; + if (image_view->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { + return image_view->vk.layer_count; } else { - assert((image_view->last_layer - image_view->first_layer + 1) % 6 == 0); - return (image_view->last_layer - image_view->first_layer + 1) / 6; + assert(image_view->vk.layer_count % 6 == 0); + return image_view->vk.layer_count / 6; } case QUNIFORM_TEXTURE_LEVELS: - return image_view->max_level - image_view->base_level + 1; + return image_view->vk.level_count; case QUNIFORM_TEXTURE_SAMPLES: - assert(image_view->image); - return image_view->image->samples; + assert(image_view->vk.image); + return image_view->vk.image->samples; default: unreachable("Bad texture size field"); } @@ -279,16 +356,18 @@ get_texture_size_from_buffer_view(struct v3dv_buffer_view *buffer_view, static uint32_t get_texture_size(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage, enum quniform_contents contents, uint32_t data) { - uint32_t texture_idx = v3d_unit_data_get_unit(data); + uint32_t texture_idx = data; + struct v3dv_descriptor_state *descriptor_state = v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); struct v3dv_descriptor *descriptor = v3dv_descriptor_map_get_descriptor(descriptor_state, - &pipeline->shared_data->texture_map, + &pipeline->shared_data->maps[stage]->texture_map, pipeline->layout, texture_idx, NULL); @@ -322,6 +401,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job = cmd_buffer->state.job; assert(job); + assert(job->cmd_buffer == cmd_buffer); + + struct texture_bo_list tex_bos = { 0 }; + struct state_bo_list state_bos = { 0 }; + struct buffer_bo_list buffer_bos = { 0 }; /* The hardware always pre-fetches the next uniform (also when there * aren't any), so we always allocate space for an extra slot. 
This @@ -369,17 +453,20 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, case QUNIFORM_UBO_ADDR: case QUNIFORM_GET_SSBO_SIZE: case QUNIFORM_GET_UBO_SIZE: - write_ubo_ssbo_uniforms(cmd_buffer, pipeline, &uniforms, - uinfo->contents[i], data); + write_ubo_ssbo_uniforms(cmd_buffer, pipeline, variant->stage, &uniforms, + uinfo->contents[i], data, &buffer_bos); + break; case QUNIFORM_IMAGE_TMU_CONFIG_P0: case QUNIFORM_TMU_CONFIG_P0: - write_tmu_p0(cmd_buffer, pipeline, &uniforms, data); + write_tmu_p0(cmd_buffer, pipeline, variant->stage, + &uniforms, data, &tex_bos, &state_bos); break; case QUNIFORM_TMU_CONFIG_P1: - write_tmu_p1(cmd_buffer, pipeline, &uniforms, data); + write_tmu_p1(cmd_buffer, pipeline, variant->stage, + &uniforms, data, &state_bos); break; case QUNIFORM_IMAGE_WIDTH: @@ -395,10 +482,66 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cl_aligned_u32(&uniforms, get_texture_size(cmd_buffer, pipeline, + variant->stage, uinfo->contents[i], data)); break; + /* We generate this from geometry shaders to cap the generated gl_Layer + * to be within the number of layers of the framebuffer so we prevent the + * binner from trying to access tile state memory out of bounds (for + * layers that don't exist). + * + * Unfortunately, for secondary command buffers we may not know the + * number of layers in the framebuffer at this stage. Since we are + * only using this to sanitize the shader and it should not have any + * impact on correct shaders that emit valid values for gl_Layer, + * we just work around it by using the largest number of layers we + * support. + * + * FIXME: we could do better than this by recording in the job that + * the value at this uniform offset is not correct, and patching it when + * we execute the secondary command buffer into a primary, since we do + * have the correct number of layers at that point, but again, since this + * is only for sanitizing the shader and it only affects the specific case + * of secondary command buffers without framebuffer info available, it + * might not be worth the trouble. + * + * With multiview the number of layers is dictated by the view mask + * and not by the framebuffer layers. We do set the job's frame tiling + * information correctly from the view mask in that case, however, + * secondary command buffers may not have valid frame tiling data, + * so when multiview is enabled, we always set the number of layers + * from the subpass view mask.
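+ * + * Note: util_last_bit(view_mask) below returns one plus the index of the + * highest bit set in the view mask, which is exactly the layer count we + * want here.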
+ */ + case QUNIFORM_FB_LAYERS: { + const struct v3dv_cmd_buffer_state *state = &job->cmd_buffer->state; + const uint32_t view_mask = + state->pass->subpasses[state->subpass_idx].view_mask; + + uint32_t num_layers; + if (view_mask != 0) { + num_layers = util_last_bit(view_mask); + } else if (job->frame_tiling.layers != 0) { + num_layers = job->frame_tiling.layers; + } else if (cmd_buffer->state.framebuffer) { + num_layers = cmd_buffer->state.framebuffer->layers; + } else { + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + num_layers = 2048; +#if DEBUG + fprintf(stderr, "Skipping gl_LayerID shader sanity check for " + "secondary command buffer\n"); +#endif + } + cl_aligned_u32(&uniforms, num_layers); + break; + } + + case QUNIFORM_VIEW_INDEX: + cl_aligned_u32(&uniforms, job->cmd_buffer->state.view_index); + break; + case QUNIFORM_NUM_WORK_GROUPS: assert(job->type == V3DV_JOB_TYPE_GPU_CSD); assert(job->csd.wg_count[data] > 0); @@ -407,15 +550,20 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cl_aligned_u32(&uniforms, job->csd.wg_count[data]); break; + case QUNIFORM_WORK_GROUP_BASE: + assert(job->type == V3DV_JOB_TYPE_GPU_CSD); + cl_aligned_u32(&uniforms, job->csd.wg_base[data]); + break; + case QUNIFORM_SHARED_OFFSET: assert(job->type == V3DV_JOB_TYPE_GPU_CSD); assert(job->csd.shared_memory); - cl_aligned_reloc(&job->indirect, &uniforms, job->csd.shared_memory, 0); + cl_aligned_u32(&uniforms, job->csd.shared_memory->offset); break; case QUNIFORM_SPILL_OFFSET: assert(pipeline->spill.bo); - cl_aligned_reloc(&job->indirect, &uniforms, pipeline->spill.bo, 0); + cl_aligned_u32(&uniforms, pipeline->spill.bo->offset); break; case QUNIFORM_SPILL_SIZE_PER_THREAD: @@ -430,6 +578,30 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cl_end(&job->indirect, uniforms); + for (int i = 0; i < MAX_TOTAL_TEXTURE_SAMPLERS; i++) { + if (tex_bos.tex[i]) + v3dv_job_add_bo(job, tex_bos.tex[i]); + } + + for (int i = 0; i < state_bos.count; i++) + v3dv_job_add_bo(job, state_bos.states[i]); + + for (int i = 0; i < MAX_TOTAL_UNIFORM_BUFFERS; i++) { + if (buffer_bos.ubo[i]) + v3dv_job_add_bo(job, buffer_bos.ubo[i]); + } + + for (int i = 0; i < MAX_TOTAL_STORAGE_BUFFERS; i++) { + if (buffer_bos.ssbo[i]) + v3dv_job_add_bo(job, buffer_bos.ssbo[i]); + } + + if (job->csd.shared_memory) + v3dv_job_add_bo(job, job->csd.shared_memory); + + if (pipeline->spill.bo) + v3dv_job_add_bo(job, pipeline->spill.bo); + return uniform_stream; } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c index 25bb4636a..154adf3a7 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c @@ -25,11 +25,12 @@ #include "v3dv_private.h" #include "drm-uapi/drm_fourcc.h" +#include "wsi_common_entrypoints.h" #include "vk_format_info.h" #include "vk_util.h" #include "wsi_common.h" -static PFN_vkVoidFunction +static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice); @@ -46,6 +47,31 @@ v3dv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) return vk_device_dispatch_table_get(&vk_device_trampolines, pName); } +static bool +v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd) +{ + V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice); + + drmDevicePtr fd_devinfo, display_devinfo; + int ret; + + ret = drmGetDevice2(fd, 0, &fd_devinfo); + if (ret) + return 
false; + + ret = drmGetDevice2(pdevice->display_fd, 0, &display_devinfo); + if (ret) { + drmFreeDevice(&fd_devinfo); + return false; + } + + bool result = drmDevicesEqual(fd_devinfo, display_devinfo); + + drmFreeDevice(&fd_devinfo); + drmFreeDevice(&display_devinfo); + return result; +} + VkResult v3dv_wsi_init(struct v3dv_physical_device *physical_device) { @@ -61,6 +87,10 @@ v3dv_wsi_init(struct v3dv_physical_device *physical_device) return result; physical_device->wsi_device.supports_modifiers = true; + physical_device->wsi_device.can_present_on_device = + v3dv_wsi_can_present_on_device; + + physical_device->vk.wsi_device = &physical_device->wsi_device; return VK_SUCCESS; } @@ -68,38 +98,11 @@ v3dv_wsi_init(struct v3dv_physical_device *physical_device) void v3dv_wsi_finish(struct v3dv_physical_device *physical_device) { + physical_device->vk.wsi_device = NULL; wsi_device_finish(&physical_device->wsi_device, &physical_device->vk.instance->alloc); } -void v3dv_DestroySurfaceKHR( - VkInstance _instance, - VkSurfaceKHR _surface, - const VkAllocationCallbacks* pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface); - - if (!surface) - return; - - vk_free2(&instance->vk.alloc, pAllocator, surface); -} - -VkResult v3dv_GetPhysicalDeviceSurfaceSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - VkSurfaceKHR surface, - VkBool32* pSupported) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_support(&device->wsi_device, - queueFamilyIndex, - surface, - pSupported); -} - static void constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps) { @@ -114,74 +117,36 @@ constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps) caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT; } -VkResult v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities(&device->wsi_device, - surface, - pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, + surface, + pSurfaceCapabilities); constraint_surface_capabilities(pSurfaceCapabilities); return result; } -VkResult v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, VkSurfaceCapabilities2KHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities2(&device->wsi_device, - pSurfaceInfo, - pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, + pSurfaceInfo, + pSurfaceCapabilities); constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); return result; } -VkResult v3dv_GetPhysicalDeviceSurfaceFormatsKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormatKHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats(&device->wsi_device, surface, - pSurfaceFormatCount, pSurfaceFormats); -} - -VkResult 
v3dv_GetPhysicalDeviceSurfaceFormats2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormat2KHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, - pSurfaceFormatCount, pSurfaceFormats); -} - -VkResult v3dv_GetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes(&device->wsi_device, surface, - pPresentModeCount, - pPresentModes); -} - -VkResult v3dv_CreateSwapchainKHR( +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_CreateSwapchainKHR( VkDevice _device, const VkSwapchainCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, @@ -190,7 +155,6 @@ VkResult v3dv_CreateSwapchainKHR( V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_instance *instance = device->instance; struct v3dv_physical_device *pdevice = &instance->physicalDevice; - struct wsi_device *wsi_device = &pdevice->wsi_device; ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); VkResult result = @@ -198,64 +162,29 @@ VkResult v3dv_CreateSwapchainKHR( if (result != VK_SUCCESS) return result; - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(wsi_device, _device, - pCreateInfo, alloc, pSwapchain); + return wsi_CreateSwapchainKHR(_device, pCreateInfo, pAllocator, pSwapchain); } -void v3dv_DestroySwapchainKHR( - VkDevice _device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks* pAllocator) +struct v3dv_image * +v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) { - V3DV_FROM_HANDLE(v3dv_device, device, _device); - const VkAllocationCallbacks *alloc; + uint32_t n_images = index + 1; + VkImage *images = malloc(sizeof(*images) * n_images); + VkResult result = wsi_common_get_images(swapchain, &n_images, images); - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; + if (result != VK_SUCCESS && result != VK_INCOMPLETE) { + free(images); + return NULL; + } - wsi_common_destroy_swapchain(_device, swapchain, alloc); -} + V3DV_FROM_HANDLE(v3dv_image, image, images[index]); + free(images); -VkResult v3dv_GetSwapchainImagesKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint32_t* pSwapchainImageCount, - VkImage* pSwapchainImages) -{ - return wsi_common_get_images(swapchain, - pSwapchainImageCount, - pSwapchainImages); + return image; } -VkResult v3dv_AcquireNextImageKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t* pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return v3dv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - -VkResult v3dv_AcquireNextImage2KHR( +VKAPI_ATTR VkResult VKAPI_CALL +v3dv_AcquireNextImage2KHR( VkDevice _device, const VkAcquireNextImageInfoKHR* pAcquireInfo, uint32_t* pImageIndex) @@ -279,52 +208,3 @@ VkResult v3dv_AcquireNextImage2KHR( return result; } - -VkResult v3dv_QueuePresentKHR( - VkQueue _queue, - const VkPresentInfoKHR* 
pPresentInfo) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - struct v3dv_physical_device *pdevice = - &queue->device->instance->physicalDevice; - - return wsi_common_queue_present(&pdevice->wsi_device, - v3dv_device_to_handle(queue->device), - _queue, 0, - pPresentInfo); -} - -VkResult v3dv_GetDeviceGroupPresentCapabilitiesKHR( - VkDevice device, - VkDeviceGroupPresentCapabilitiesKHR* pCapabilities) -{ - memset(pCapabilities->presentMask, 0, - sizeof(pCapabilities->presentMask)); - pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VkResult v3dv_GetDeviceGroupSurfacePresentModesKHR( - VkDevice device, - VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR* pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VkResult v3dv_GetPhysicalDevicePresentRectanglesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pRectCount, - VkRect2D* pRects) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, - surface, - pRectCount, pRects); -} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c new file mode 100644 index 000000000..c2f2c7786 --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -0,0 +1,2281 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" + +#include "util/half_float.h" +#include "vulkan/util/vk_format.h" +#include "util/u_pack_color.h" + +#include "vk_format_info.h" + +void +v3dX(job_emit_binning_flush)(struct v3dv_job *job) +{ + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH)); + v3dv_return_if_oom(NULL, job); + + cl_emit(&job->bcl, FLUSH, flush); +} + +void +v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + const struct v3dv_frame_tiling *tiling, + uint32_t layers) +{ + /* This must go before the binning mode configuration. It is + * required for layered framebuffers to work. 
+ */ + cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) { + config.number_of_layers = layers; + } + + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = tiling->width; + config.height_in_pixels = tiling->height; + config.number_of_render_targets = MAX2(tiling->render_target_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + } + + /* There's definitely nothing in the VCD cache we want. */ + cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); + + /* "Binning mode lists must have a Start Tile Binning item (6) after + * any prefix state data before the binning list proper starts." + */ + cl_emit(&job->bcl, START_TILE_BINNING, bin); +} + +void +v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer) +{ + assert(cmd_buffer->state.job); + v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl, + cl_packet_length(RETURN_FROM_SUB_LIST)); + v3dv_return_if_oom(cmd_buffer, NULL); + cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret); +} + +void +v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect) +{ + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW)); + v3dv_return_if_oom(NULL, job); + + cl_emit(&job->bcl, CLIP_WINDOW, clip) { + clip.clip_window_left_pixel_coordinate = rect->offset.x; + clip.clip_window_bottom_pixel_coordinate = rect->offset.y; + clip.clip_window_width_in_pixels = rect->extent.width; + clip.clip_window_height_in_pixels = rect->extent.height; + } +} + +static void +cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl *cl, + struct v3dv_image_view *iview, + uint32_t layer, + uint32_t buffer) +{ + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; + uint32_t layer_offset = + v3dv_layer_offset(image, iview->vk.base_mip_level, + iview->vk.base_array_layer + layer); + + cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { + load.buffer_to_load = buffer; + load.address = v3dv_cl_address(image->mem->bo, layer_offset); + + load.input_image_format = iview->format->rt_type; + load.r_b_swap = iview->swap_rb; + load.memory_format = slice->tiling; + + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + load.height_in_ub_or_stride = + slice->padded_height_of_output_image_in_uif_blocks; + } else if (slice->tiling == V3D_TILING_RASTER) { + load.height_in_ub_or_stride = slice->stride; + } + + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) + load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; + else + load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +static bool +check_needs_load(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) load operations apply on the first subpass that + * uses the attachment (or view), otherwise we always need to load. + */ + if (state->job->first_subpass > first_subpass_idx) + return true; + + /* If the job is continuing a subpass started in another job, we always + * need to load. 
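+ * The job that started the subpass will have stored its tile buffer + * contents when it finished, so we need to load them back here.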
+ */ + if (state->job->is_subpass_continue) + return true; + + /* If the area is not aligned to tile boundaries, we always need to load */ + if (!state->tile_aligned_render_area) + return true; + + /* The attachment load operations must be LOAD */ + return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; +} + +static inline uint32_t +v3dv_zs_buffer(bool depth, bool stencil) +{ + if (depth && stencil) + return ZSTENCIL; + else if (depth) + return Z; + else if (stencil) + return STENCIL; + return NONE; +} + +static void +cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl *cl, + uint32_t layer) +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + + assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT); + + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + const struct v3dv_render_pass_attachment *attachment = + &state->pass->attachments[attachment_idx]; + + /* According to the Vulkan spec: + * + * "The load operation for each sample in an attachment happens before + * any recorded command which accesses the sample in the first subpass + * where the attachment is used." + * + * If the load operation is CLEAR, we must only clear once on the first + * subpass that uses the attachment (and in that case we don't LOAD). + * After that, we always want to load so we don't lose any rendering done + * by a previous subpass to the same attachment. We also want to load + * if the current job is continuing subpass work started by a previous + * job, for the same reason. + * + * If the render area is not aligned to tile boundaries then we have + * tiles which are partially covered by it. In this case, we need to + * load the tiles so we can preserve the pixels that are outside the + * render area for any such tiles. + */ + uint32_t first_subpass = !pass->multiview_enabled ? + attachment->first_subpass : + attachment->views[layer].first_subpass; + + bool needs_load = check_needs_load(state, + VK_IMAGE_ASPECT_COLOR_BIT, + first_subpass, + attachment->desc.loadOp); + if (needs_load) { + struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; + cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview, + layer, RENDER_TARGET_0 + i); + } + } + + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_render_pass_attachment *ds_attachment = + &state->pass->attachments[ds_attachment_idx]; + + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + + uint32_t ds_first_subpass = !pass->multiview_enabled ? 
+ ds_attachment->first_subpass : + ds_attachment->views[layer].first_subpass; + + const bool needs_depth_load = + check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_first_subpass, + ds_attachment->desc.loadOp); + + const bool needs_stencil_load = + check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_first_subpass, + ds_attachment->desc.stencilLoadOp); + + if (needs_depth_load || needs_stencil_load) { + struct v3dv_image_view *iview = + framebuffer->attachments[ds_attachment_idx]; + /* From the Vulkan spec: + * + * "When an image view of a depth/stencil image is used as a + * depth/stencil framebuffer attachment, the aspectMask is ignored + * and both depth and stencil image subresources are used." + * + * So we ignore the aspects from the subresource range of the image + * view for the depth/stencil attachment, but we still need to restrict + * to the aspects compatible with the render pass and the image. + */ + const uint32_t zs_buffer = + v3dv_zs_buffer(needs_depth_load, needs_stencil_load); + cmd_buffer_render_pass_emit_load(cmd_buffer, cl, + iview, layer, zs_buffer); + } + } + + cl_emit(cl, END_OF_LOADS, end); +} + +static void +cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl *cl, + uint32_t attachment_idx, + uint32_t layer, + uint32_t buffer, + bool clear, + bool is_multisample_resolve) +{ + const struct v3dv_image_view *iview = + cmd_buffer->state.framebuffer->attachments[attachment_idx]; + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; + uint32_t layer_offset = v3dv_layer_offset(image, + iview->vk.base_mip_level, + iview->vk.base_array_layer + layer); + + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = buffer; + store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.clear_buffer_being_stored = clear; + + store.output_image_format = iview->format->rt_type; + store.r_b_swap = iview->swap_rb; + store.memory_format = slice->tiling; + + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + store.height_in_ub_or_stride = + slice->padded_height_of_output_image_in_uif_blocks; + } else if (slice->tiling == V3D_TILING_RASTER) { + store.height_in_ub_or_stride = slice->stride; + } + + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) + store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; + else if (is_multisample_resolve) + store.decimate_mode = V3D_DECIMATE_MODE_4X; + else + store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +static bool +check_needs_clear(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t first_subpass_idx, + VkAttachmentLoadOp load_op, + bool do_clear_with_draw) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* If the aspect needs to be cleared with a draw call then we won't emit + * the clear here. + */ + if (do_clear_with_draw) + return false; + + /* If this is resuming a subpass started with another job, then attachment + * load operations don't apply. + */ + if (state->job->is_subpass_continue) + return false; + + /* If the render area is not aligned to tile boundaries we can't use the + * TLB for a clear.
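+ * A TLB clear always affects whole tiles, so it would also wipe the pixels + * of partially covered tiles that fall outside the render area.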
+ */ + if (!state->tile_aligned_render_area) + return false; + + /* If this job is running in a subpass other than the first subpass in + * which this attachment (or view) is used then attachment load operations + * don't apply. + */ + if (state->job->first_subpass != first_subpass_idx) + return false; + + /* The attachment load operation must be CLEAR */ + return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR; +} + +static bool +check_needs_store(const struct v3dv_cmd_buffer_state *state, + VkImageAspectFlags aspect, + uint32_t last_subpass_idx, + VkAttachmentStoreOp store_op) +{ + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are + * testing does not exist in the image. + */ + if (!aspect) + return false; + + /* Attachment (or view) store operations only apply on the last subpass + * where the attachment (or view) is used; in other subpasses we always + * need to store. + */ + if (state->subpass_idx < last_subpass_idx) + return true; + + /* Attachment store operations only apply on the last job we emit on the + * last subpass where the attachment is used; otherwise we always need to + * store. + */ + if (!state->job->is_subpass_finish) + return true; + + /* The attachment store operation must be STORE */ + return store_op == VK_ATTACHMENT_STORE_OP_STORE; +} + +static void +cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_cl *cl, + uint32_t layer) +{ + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = + &pass->subpasses[state->subpass_idx]; + + bool has_stores = false; + bool use_global_zs_clear = false; + bool use_global_rt_clear = false; + + assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT); + + /* FIXME: separate stencil */ + uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_render_pass_attachment *ds_attachment = + &state->pass->attachments[ds_attachment_idx]; + + assert(state->job->first_subpass >= ds_attachment->first_subpass); + assert(state->subpass_idx >= ds_attachment->first_subpass); + assert(state->subpass_idx <= ds_attachment->last_subpass); + + /* From the Vulkan spec, VkImageSubresourceRange: + * + * "When an image view of a depth/stencil image is used as a + * depth/stencil framebuffer attachment, the aspectMask is ignored + * and both depth and stencil image subresources are used." + * + * So we ignore the aspects from the subresource range of the image + * view for the depth/stencil attachment, but we still need to restrict + * to the aspects compatible with the render pass and the image. + */ + const VkImageAspectFlags aspects = + vk_format_aspects(ds_attachment->desc.format); + + /* Only clear once on the first subpass that uses the attachment */ + uint32_t ds_first_subpass = !state->pass->multiview_enabled ? + ds_attachment->first_subpass : + ds_attachment->views[layer].first_subpass; + + bool needs_depth_clear = + check_needs_clear(state, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_first_subpass, + ds_attachment->desc.loadOp, + subpass->do_depth_clear_with_draw); + + bool needs_stencil_clear = + check_needs_clear(state, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_first_subpass, + ds_attachment->desc.stencilLoadOp, + subpass->do_stencil_clear_with_draw); + + /* Skip the last store if it is not required */ + uint32_t ds_last_subpass = !pass->multiview_enabled ?
+ ds_attachment->last_subpass : + ds_attachment->views[layer].last_subpass; + + bool needs_depth_store = + check_needs_store(state, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_last_subpass, + ds_attachment->desc.storeOp); + + bool needs_stencil_store = + check_needs_store(state, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_last_subpass, + ds_attachment->desc.stencilStoreOp); + + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); + if (needs_depth_store || needs_stencil_store) { + const uint32_t zs_buffer = + v3dv_zs_buffer(needs_depth_store, needs_stencil_store); + cmd_buffer_render_pass_emit_store(cmd_buffer, cl, + ds_attachment_idx, layer, + zs_buffer, false, false); + has_stores = true; + } + } + + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + const struct v3dv_render_pass_attachment *attachment = + &state->pass->attachments[attachment_idx]; + + assert(state->job->first_subpass >= attachment->first_subpass); + assert(state->subpass_idx >= attachment->first_subpass); + assert(state->subpass_idx <= attachment->last_subpass); + + /* Only clear once on the first subpass that uses the attachment */ + uint32_t first_subpass = !pass->multiview_enabled ? + attachment->first_subpass : + attachment->views[layer].first_subpass; + + bool needs_clear = + check_needs_clear(state, + VK_IMAGE_ASPECT_COLOR_BIT, + first_subpass, + attachment->desc.loadOp, + false); + + /* Skip the last store if it is not required */ + uint32_t last_subpass = !pass->multiview_enabled ? + attachment->last_subpass : + attachment->views[layer].last_subpass; + + bool needs_store = + check_needs_store(state, + VK_IMAGE_ASPECT_COLOR_BIT, + last_subpass, + attachment->desc.storeOp); + + /* If we need to resolve this attachment, emit that store first. Notice + * that we must not request a tile buffer clear here in that case, since + * that would clear the tile buffer before we get to emit the actual + * color attachment store below, since the clear happens after the + * store is completed. + * + * If the attachment doesn't support TLB resolves then we will have to + * fall back to doing the resolve in a shader separately after this + * job, so we will need to store the multisampled attachment even if that + * wasn't requested by the client.
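+ * (The shader-based resolve reads the multisampled image from memory, so + * its contents must have been stored.)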
+ */ + const bool needs_resolve = + subpass->resolve_attachments && + subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED; + if (needs_resolve && attachment->use_tlb_resolve) { + const uint32_t resolve_attachment_idx = + subpass->resolve_attachments[i].attachment; + cmd_buffer_render_pass_emit_store(cmd_buffer, cl, + resolve_attachment_idx, layer, + RENDER_TARGET_0 + i, + false, true); + has_stores = true; + } else if (needs_resolve) { + needs_store = true; + } + + /* Emit the color attachment store if needed */ + if (needs_store) { + cmd_buffer_render_pass_emit_store(cmd_buffer, cl, + attachment_idx, layer, + RENDER_TARGET_0 + i, + needs_clear && !use_global_rt_clear, + false); + has_stores = true; + } else if (needs_clear) { + use_global_rt_clear = true; + } + } + + /* We always need to emit at least one dummy store */ + if (!has_stores) { + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + } + + /* If we have any depth/stencil clears we can't use the per-buffer clear + * bit and instead we have to emit a single clear of all tile buffers. + */ + if (use_global_zs_clear || use_global_rt_clear) { + cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = use_global_zs_clear; + clear.clear_all_render_targets = use_global_rt_clear; + } + } +} + +static void +cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t layer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + /* Emit the generic list in our indirect state -- the rcl will just + * have pointers into it. + */ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(cmd_buffer, NULL); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer); + + /* The binner starts out writing tiles assuming that the initial mode + * is triangles, so make sure that's the case. + */ + cl_emit(cl, PRIM_LIST_FORMAT, fmt) { + fmt.primitive_type = LIST_TRIANGLES; + } + + /* The PTB assumes that value is 0, but the HW will not set it. */ + cl_emit(cl, SET_INSTANCEID, set) { + set.instance_id = 0; + } + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t layer) +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + struct v3dv_job *job = cmd_buffer->state.job; + struct v3dv_cl *rcl = &job->rcl; + + /* If doing multicore binning, we would need to initialize each + * core's tile list here.
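+ * So far the devices we support expose a single core, so one tile list + * is enough.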
+ */ + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + const uint32_t tile_alloc_offset = + 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y; + cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset); + } + + cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer); + + uint32_t supertile_w_in_pixels = + tiling->tile_width * tiling->supertile_width; + uint32_t supertile_h_in_pixels = + tiling->tile_height * tiling->supertile_height; + const uint32_t min_x_supertile = + state->render_area.offset.x / supertile_w_in_pixels; + const uint32_t min_y_supertile = + state->render_area.offset.y / supertile_h_in_pixels; + + uint32_t max_render_x = state->render_area.offset.x; + if (state->render_area.extent.width > 0) + max_render_x += state->render_area.extent.width - 1; + uint32_t max_render_y = state->render_area.offset.y; + if (state->render_area.extent.height > 0) + max_render_y += state->render_area.extent.height - 1; + const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels; + const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels; + + for (int y = min_y_supertile; y <= max_y_supertile; y++) { + for (int x = min_x_supertile; x <= max_x_supertile; x++) { + cl_emit(rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = x; + coords.row_number_in_supertiles = y; + } + } + } +} + +static void +set_rcl_early_z_config(struct v3dv_job *job, + bool *early_z_disable, + uint32_t *early_z_test_and_update_direction) +{ + /* If this is true then we have not emitted any draw calls in this job + * and we don't get any benefits from early Z. + */ + if (!job->decided_global_ez_enable) { + assert(job->draw_count == 0); + *early_z_disable = true; + return; + } + + switch (job->first_ez_state) { + case V3D_EZ_UNDECIDED: + case V3D_EZ_LT_LE: + *early_z_disable = false; + *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE; + break; + case V3D_EZ_GT_GE: + *early_z_disable = false; + *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE; + break; + case V3D_EZ_DISABLED: + *early_z_disable = true; + break; + } +} + +void +v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + + /* We can't emit the RCL until we have a framebuffer, which we may not have + * if we are recording a secondary command buffer. In that case, we will + * have to wait until vkCmdExecuteCommands is called from a primary command + * buffer. + */ + if (!framebuffer) { + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + return; + } + + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + + const uint32_t fb_layers = job->frame_tiling.layers; + + v3dv_cl_ensure_space_with_branch(&job->rcl, 200 + + MAX2(fb_layers, 1) * 256 * + cl_packet_length(SUPERTILE_COORDINATES)); + v3dv_return_if_oom(cmd_buffer, NULL); + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_render_pass *pass = state->pass; + const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; + struct v3dv_cl *rcl = &job->rcl; + + /* Common config must be the first TILE_RENDERING_MODE_CFG and + * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional + * updates to the previous HW state.
+ */ + bool do_early_zs_clear = false; + const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { + config.image_width_pixels = framebuffer->width; + config.image_height_pixels = framebuffer->height; + config.number_of_render_targets = MAX2(subpass->color_count, 1); + config.multisample_mode_4x = tiling->msaa; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + const struct v3dv_image_view *iview = + framebuffer->attachments[ds_attachment_idx]; + config.internal_depth_type = iview->internal_type; + + set_rcl_early_z_config(job, + &config.early_z_disable, + &config.early_z_test_and_update_direction); + + /* Early-Z/S clear can be enabled if the job is clearing and not + * storing (or loading) depth. If a stencil aspect is also present + * we have the same requirements for it, however, in this case we + * can accept stencil loadOp DONT_CARE as well, so instead of + * checking that stencil is cleared we check that it is not loaded. + * + * Early-Z/S clearing is independent of Early Z/S testing, so it is + * possible to enable one but not the other so long as their + * respective requirements are met. + */ + struct v3dv_render_pass_attachment *ds_attachment = + &pass->attachments[ds_attachment_idx]; + + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + + bool needs_depth_clear = + check_needs_clear(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + subpass->do_depth_clear_with_draw); + + bool needs_depth_store = + check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + + do_early_zs_clear = needs_depth_clear && !needs_depth_store; + if (do_early_zs_clear && + vk_format_has_stencil(ds_attachment->desc.format)) { + bool needs_stencil_load = + check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.stencilLoadOp); + + bool needs_stencil_store = + check_needs_store(state, + ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + ds_attachment->last_subpass, + ds_attachment->desc.stencilStoreOp); + + do_early_zs_clear = !needs_stencil_load && !needs_stencil_store; + } + + config.early_depth_stencil_clear = do_early_zs_clear; + } else { + config.early_z_disable = true; + } + } + + /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers" + * commands with the Z/S bit set, so keep track of whether we enabled this + * in the job so we can skip these later.
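+ * (cmd_buffer_render_pass_emit_stores() checks job->early_zs_clear before + * setting use_global_zs_clear, and the initial tile buffer clear below + * does the same.)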
+ */ + job->early_zs_clear = do_early_zs_clear; + + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t attachment_idx = subpass->color_attachments[i].attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + continue; + + struct v3dv_image_view *iview = + state->framebuffer->attachments[attachment_idx]; + + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; + + const uint32_t *clear_color = + &state->attachments[attachment_idx].clear_value.color[0]; + + uint32_t clear_pad = 0; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_height = v3d_utile_height(image->cpp) * 2; + + uint32_t implicit_padded_height = + align(framebuffer->height, uif_block_height) / uif_block_height; + + if (slice->padded_height_of_output_image_in_uif_blocks - + implicit_padded_height >= 15) { + clear_pad = slice->padded_height_of_output_image_in_uif_blocks; + } + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = clear_color[0]; + clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; + clear.render_target_number = i; + }; + + if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((clear_color[1] >> 24) | (clear_color[2] << 8)); + clear.clear_color_mid_high_24_bits = + ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); + clear.render_target_number = i; + }; + } + + if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = clear_color[3] >> 16; + clear.render_target_number = i; + }; + } + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + v3dX(cmd_buffer_render_pass_setup_render_target) + (cmd_buffer, 0, &rt.render_target_0_internal_bpp, + &rt.render_target_0_internal_type, &rt.render_target_0_clamp); + v3dX(cmd_buffer_render_pass_setup_render_target) + (cmd_buffer, 1, &rt.render_target_1_internal_bpp, + &rt.render_target_1_internal_type, &rt.render_target_1_clamp); + v3dX(cmd_buffer_render_pass_setup_render_target) + (cmd_buffer, 2, &rt.render_target_2_internal_bpp, + &rt.render_target_2_internal_type, &rt.render_target_2_clamp); + v3dX(cmd_buffer_render_pass_setup_render_target) + (cmd_buffer, 3, &rt.render_target_3_internal_bpp, + &rt.render_target_3_internal_type, &rt.render_target_3_clamp); + } + + /* Ends rendering mode config. */ + if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = + state->attachments[ds_attachment_idx].clear_value.z; + clear.stencil_clear_value = + state->attachments[ds_attachment_idx].clear_value.s; + }; + } else { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; + clear.stencil_clear_value = 0; + }; + } + + /* Always set initial block size before the first branch, which needs + * to match the value from binning mode config. 
+ */ + cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { + init.use_auto_chained_tile_lists = true; + init.size_of_first_block_in_chained_tile_lists = + TILE_ALLOCATION_BLOCK_SIZE_64B; + } + + cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { + config.number_of_bin_tile_lists = 1; + config.total_frame_width_in_tiles = tiling->draw_tiles_x; + config.total_frame_height_in_tiles = tiling->draw_tiles_y; + + config.supertile_width_in_tiles = tiling->supertile_width; + config.supertile_height_in_tiles = tiling->supertile_height; + + config.total_frame_width_in_supertiles = + tiling->frame_width_in_supertiles; + config.total_frame_height_in_supertiles = + tiling->frame_height_in_supertiles; + } + + /* Start by clearing the tile buffer. */ + cl_emit(rcl, TILE_COORDINATES, coords) { + coords.tile_column_number = 0; + coords.tile_row_number = 0; + } + + /* Emit an initial clear of the tile buffers. This is necessary + * for any buffers that should be cleared (since clearing + * normally happens at the *end* of the generic tile list), but + * it's also nice to clear everything so the first tile doesn't + * inherit any contents from some previous frame. + * + * Also, implement the GFXH-1742 workaround. There's a race in + * the HW between the RCL updating the TLB's internal type/size + * and the spawning of the QPU instances using the TLB's current + * internal type/size. To make sure the QPUs get the right + * state, we need 1 dummy store in between internal type/size + * changes on V3D 3.x, and 2 dummy stores on 4.x. + */ + for (int i = 0; i < 2; i++) { + if (i > 0) + cl_emit(rcl, TILE_COORDINATES, coords); + cl_emit(rcl, END_OF_LOADS, end); + cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + if (i == 0 && cmd_buffer->state.tile_aligned_render_area) { + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = !job->early_zs_clear; + clear.clear_all_render_targets = true; + } + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } + + cl_emit(rcl, FLUSH_VCD_CACHE, flush); + + for (int layer = 0; layer < MAX2(1, fb_layers); layer++) { + if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer))) + cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer); + } + + cl_emit(rcl, END_OF_RENDERING, end); +} + +void +v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + /* FIXME: right now we only support one viewport. 
viewports[0] would work now; this would need to change if we allow multiple viewports. + */ + float *vptranslate = dynamic->viewport.translate[0]; + float *vpscale = dynamic->viewport.scale[0]; + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + const uint32_t required_cl_size = + cl_packet_length(CLIPPER_XY_SCALING) + + cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) + + cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) + + cl_packet_length(VIEWPORT_OFFSET); + v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; + clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; + } + + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = vptranslate[2]; + clip.viewport_z_scale_zc_to_zs = vpscale[2]; + } + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { + /* Vulkan's Z NDC is [0..1], unlike OpenGL, which is [-1, 1] */ + float z1 = vptranslate[2]; + float z2 = vptranslate[2] + vpscale[2]; + clip.minimum_zw = MIN2(z1, z2); + clip.maximum_zw = MAX2(z1, z2); + } + + cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) { + vp.viewport_centre_x_coordinate = vptranslate[0]; + vp.viewport_centre_y_coordinate = vptranslate[1]; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT; +} + +void +v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic; + + const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | + V3DV_DYNAMIC_STENCIL_WRITE_MASK | + V3DV_DYNAMIC_STENCIL_REFERENCE; + + v3dv_cl_ensure_space_with_branch(&job->bcl, + 2 * cl_packet_length(STENCIL_CFG)); + v3dv_return_if_oom(cmd_buffer, NULL); + + bool emitted_stencil = false; + for (uint32_t i = 0; i < 2; i++) { + if (pipeline->emit_stencil_cfg[i]) { + if (dynamic_state->mask & dynamic_stencil_states) { + cl_emit_with_prepacked(&job->bcl, STENCIL_CFG, + pipeline->stencil_cfg[i], config) { + if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) { + config.stencil_test_mask = + i == 0 ? dynamic_state->stencil_compare_mask.front : + dynamic_state->stencil_compare_mask.back; + } + if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) { + config.stencil_write_mask = + i == 0 ? dynamic_state->stencil_write_mask.front : + dynamic_state->stencil_write_mask.back; + } + if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) { + config.stencil_ref_value = + i == 0 ?
dynamic_state->stencil_reference.front : + dynamic_state->stencil_reference.back; + } + } + } else { + cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]); + } + + emitted_stencil = true; + } + } + + if (emitted_stencil) { + const uint32_t dynamic_stencil_dirty_flags = + V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | + V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | + V3DV_CMD_DIRTY_STENCIL_REFERENCE; + cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags; + } +} + +void +v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + if (!pipeline->depth_bias.enabled) + return; + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET)); + v3dv_return_if_oom(cmd_buffer, NULL); + + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + cl_emit(&job->bcl, DEPTH_OFFSET, bias) { + bias.depth_offset_factor = dynamic->depth_bias.slope_factor; + bias.depth_offset_units = dynamic->depth_bias.constant_factor; + if (pipeline->depth_bias.is_z16) + bias.depth_offset_units *= 256.0f; + bias.limit = dynamic->depth_bias.depth_bias_clamp; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; +} + +void +v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, LINE_WIDTH, line) { + line.line_width = cmd_buffer->state.dynamic.line_width; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH; +} + +void +v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, SAMPLE_STATE, state) { + state.coverage = 1.0f; + state.mask = pipeline->sample_mask; + } +} + +void +v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + const uint32_t blend_packets_size = + cl_packet_length(BLEND_ENABLES) + + cl_packet_length(BLEND_CONSTANT_COLOR) + + cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; + + v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); + v3dv_return_if_oom(cmd_buffer, NULL); + + if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) { + if (pipeline->blend.enables) { + cl_emit(&job->bcl, BLEND_ENABLES, enables) { + enables.mask = pipeline->blend.enables; + } + } + + for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { + if (pipeline->blend.enables & (1 << i)) + cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); + } + } + + if (pipeline->blend.needs_color_constants && + cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) { + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) { + color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]); + color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]); + color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]); + color.alpha_f16 = 
_mesa_float_to_half(dynamic->blend_constants[3]); + } + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS; + } +} + +void +v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS)); + + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { + mask.mask = (~dynamic->color_write_enable | + pipeline->blend.color_write_masks) & 0xffff; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; +} + +static void +emit_flat_shade_flags(struct v3dv_job *job, + int varying_offset, + uint32_t varyings, + enum V3DX(Varying_Flags_Action) lower, + enum V3DX(Varying_Flags_Action) higher) +{ + v3dv_cl_ensure_space_with_branch(&job->bcl, + cl_packet_length(FLAT_SHADE_FLAGS)); + v3dv_return_if_oom(NULL, job); + + cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) { + flags.varying_offset_v0 = varying_offset; + flags.flat_shade_flags_for_varyings_v024 = varyings; + flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower; + flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher; + } +} + +static void +emit_noperspective_flags(struct v3dv_job *job, + int varying_offset, + uint32_t varyings, + enum V3DX(Varying_Flags_Action) lower, + enum V3DX(Varying_Flags_Action) higher) +{ + v3dv_cl_ensure_space_with_branch(&job->bcl, + cl_packet_length(NON_PERSPECTIVE_FLAGS)); + v3dv_return_if_oom(NULL, job); + + cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) { + flags.varying_offset_v0 = varying_offset; + flags.non_perspective_flags_for_varyings_v024 = varyings; + flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower; + flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher; + } +} + +static void +emit_centroid_flags(struct v3dv_job *job, + int varying_offset, + uint32_t varyings, + enum V3DX(Varying_Flags_Action) lower, + enum V3DX(Varying_Flags_Action) higher) +{ + v3dv_cl_ensure_space_with_branch(&job->bcl, + cl_packet_length(CENTROID_FLAGS)); + v3dv_return_if_oom(NULL, job); + + cl_emit(&job->bcl, CENTROID_FLAGS, flags) { + flags.varying_offset_v0 = varying_offset; + flags.centroid_flags_for_varyings_v024 = varyings; + flags.action_for_centroid_flags_of_lower_numbered_varyings = lower; + flags.action_for_centroid_flags_of_higher_numbered_varyings = higher; + } +} + +static bool +emit_varying_flags(struct v3dv_job *job, + uint32_t num_flags, + const uint32_t *flags, + void (*flag_emit_callback)(struct v3dv_job *job, + int varying_offset, + uint32_t flags, + enum V3DX(Varying_Flags_Action) lower, + enum V3DX(Varying_Flags_Action) higher)) +{ + bool emitted_any = false; + for (int i = 0; i < num_flags; i++) { + if (!flags[i]) + continue; + + if (emitted_any) { + flag_emit_callback(job, i, flags[i], + V3D_VARYING_FLAGS_ACTION_UNCHANGED, + V3D_VARYING_FLAGS_ACTION_UNCHANGED); + } else if (i == 0) { + flag_emit_callback(job, i, flags[i], + V3D_VARYING_FLAGS_ACTION_UNCHANGED, + V3D_VARYING_FLAGS_ACTION_ZEROED); + } else { + flag_emit_callback(job, i, flags[i], + V3D_VARYING_FLAGS_ACTION_ZEROED, + V3D_VARYING_FLAGS_ACTION_ZEROED); + } + + emitted_any = true; + } + + return emitted_any; +} + +void +v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + struct v3dv_pipeline *pipeline = 
cmd_buffer->state.gfx.pipeline;
+
+ struct v3d_fs_prog_data *prog_data_fs =
+ pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
+
+ const uint32_t num_flags =
+ ARRAY_SIZE(prog_data_fs->flat_shade_flags);
+ const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
+ const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags;
+ const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
+
+ if (!emit_varying_flags(job, num_flags, flat_shade_flags,
+ emit_flat_shade_flags)) {
+ v3dv_cl_ensure_space_with_branch(
+ &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
+ }
+
+ if (!emit_varying_flags(job, num_flags, noperspective_flags,
+ emit_noperspective_flags)) {
+ v3dv_cl_ensure_space_with_branch(
+ &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
+ }
+
+ if (!emit_varying_flags(job, num_flags, centroid_flags,
+ emit_centroid_flags)) {
+ v3dv_cl_ensure_space_with_branch(
+ &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
+ }
+}
+
+static void
+job_update_ez_state(struct v3dv_job *job,
+ struct v3dv_pipeline *pipeline,
+ struct v3dv_cmd_buffer *cmd_buffer)
+{
+ /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
+ * determined that we should disable EZ completely for all draw calls in
+ * this job. This will cause us to disable EZ for the entire job in the
+ * Tile Rendering Mode RCL packet and when we do that we need to make sure
+ * we never emit a draw call in the job with EZ enabled in the CFG_BITS
+ * packet, so ez_state must also be V3D_EZ_DISABLED.
+ */
+ if (job->first_ez_state == V3D_EZ_DISABLED) {
+ assert(job->ez_state == V3D_EZ_DISABLED);
+ return;
+ }
+
+ /* This is part of the pre draw call handling, so we should be inside a
+ * render pass.
+ */
+ assert(cmd_buffer->state.pass);
+
+ /* If this is the first time we update EZ state for this job we first check
+ * if there is anything that requires disabling it completely for the entire
+ * job (based on state that is not related to the current draw call and
+ * pipeline state).
+ */
+ if (!job->decided_global_ez_enable) {
+ job->decided_global_ez_enable = true;
+
+ struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+ assert(state->subpass_idx < state->pass->subpass_count);
+ struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
+ if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
+ job->first_ez_state = V3D_EZ_DISABLED;
+ job->ez_state = V3D_EZ_DISABLED;
+ return;
+ }
+
+ /* GFXH-1918: the early-z buffer may load incorrect depth values
+ * if the frame has odd width or height.
+ *
+ * So we need to disable EZ in this case.
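+ *
+ * For example, loading depth for a 1921x1080 framebuffer would hit the
+ * bug while a 1920x1080 one would not, which is why the width/height
+ * parity checks below disable EZ for the whole job.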
+ */ + const struct v3dv_render_pass_attachment *ds_attachment = + &state->pass->attachments[subpass->ds_attachment.attachment]; + + const VkImageAspectFlags ds_aspects = + vk_format_aspects(ds_attachment->desc.format); + + bool needs_depth_load = + check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp); + + if (needs_depth_load) { + struct v3dv_framebuffer *fb = state->framebuffer; + + if (!fb) { + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + perf_debug("Loading depth aspect in a secondary command buffer " + "without framebuffer info disables early-z tests.\n"); + job->first_ez_state = V3D_EZ_DISABLED; + job->ez_state = V3D_EZ_DISABLED; + return; + } + + if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) { + perf_debug("Loading depth aspect for framebuffer with odd width " + "or height disables early-Z tests.\n"); + job->first_ez_state = V3D_EZ_DISABLED; + job->ez_state = V3D_EZ_DISABLED; + return; + } + } + } + + /* Otherwise, we can decide to selectively enable or disable EZ for draw + * calls using the CFG_BITS packet based on the bound pipeline state. + */ + + /* If the FS writes Z, then it may update against the chosen EZ direction */ + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + if (fs_variant->prog_data.fs->writes_z) { + job->ez_state = V3D_EZ_DISABLED; + return; + } + + switch (pipeline->ez_state) { + case V3D_EZ_UNDECIDED: + /* If the pipeline didn't pick a direction but didn't disable, then go + * along with the current EZ state. This allows EZ optimization for Z + * func == EQUAL or NEVER. + */ + break; + + case V3D_EZ_LT_LE: + case V3D_EZ_GT_GE: + /* If the pipeline picked a direction, then it needs to match the current + * direction if we've decided on one. + */ + if (job->ez_state == V3D_EZ_UNDECIDED) + job->ez_state = pipeline->ez_state; + else if (job->ez_state != pipeline->ez_state) + job->ez_state = V3D_EZ_DISABLED; + break; + + case V3D_EZ_DISABLED: + /* If the pipeline disables EZ because of a bad Z func or stencil + * operation, then we can't do any more EZ in this frame. 
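+ * (For instance, a pipeline whose depth compare op is
+ * VK_COMPARE_OP_NOT_EQUAL would presumably have been created with
+ * ez_state == V3D_EZ_DISABLED.)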
+ */
+ job->ez_state = V3D_EZ_DISABLED;
+ break;
+ }
+
+ if (job->first_ez_state == V3D_EZ_UNDECIDED &&
+ job->ez_state != V3D_EZ_DISABLED) {
+ job->first_ez_state = job->ez_state;
+ }
+}
+
+void
+v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ assert(pipeline);
+
+ job_update_ez_state(job, pipeline, cmd_buffer);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+ config.early_z_enable = job->ez_state != V3D_EZ_DISABLED;
+ config.early_z_updates_enable = config.early_z_enable &&
+ pipeline->z_updates_enable;
+ }
+}
+
+void
+v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl,
+ cl_packet_length(OCCLUSION_QUERY_COUNTER));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
+ if (cmd_buffer->state.query.active_query.bo) {
+ counter.address =
+ v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
+ cmd_buffer->state.query.active_query.offset);
+ }
+ }
+
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+}
+
+static struct v3dv_job *
+cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
+ bool is_bcl_barrier)
+{
+ assert(cmd_buffer->state.subpass_idx != -1);
+ v3dv_cmd_buffer_finish_job(cmd_buffer);
+ struct v3dv_job *job =
+ v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+ cmd_buffer->state.subpass_idx);
+ if (!job)
+ return NULL;
+
+ job->serialize = true;
+ job->needs_bcl_sync = is_bcl_barrier;
+ return job;
+}
+
+static void
+cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
+ struct v3dv_cmd_buffer *secondary)
+{
+ struct v3dv_cmd_buffer_state *p_state = &primary->state;
+ struct v3dv_cmd_buffer_state *s_state = &secondary->state;
+
+ const uint32_t total_state_count =
+ p_state->query.end.used_count + s_state->query.end.used_count;
+ v3dv_cmd_buffer_ensure_array_state(primary,
+ sizeof(struct v3dv_end_query_cpu_job_info),
+ total_state_count,
+ &p_state->query.end.alloc_count,
+ (void **) &p_state->query.end.states);
+ v3dv_return_if_oom(primary, NULL);
+
+ for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
+ const struct v3dv_end_query_cpu_job_info *s_qstate =
+ &secondary->state.query.end.states[i];
+
+ struct v3dv_end_query_cpu_job_info *p_qstate =
+ &p_state->query.end.states[p_state->query.end.used_count++];
+
+ p_qstate->pool = s_qstate->pool;
+ p_qstate->query = s_qstate->query;
+ }
+}
+
+void
+v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
+ uint32_t cmd_buffer_count,
+ const VkCommandBuffer *cmd_buffers)
+{
+ assert(primary->state.job);
+
+ /* Emit occlusion query state if needed so the draw calls inside our
+ * secondaries update the counters.
+ */
+ bool has_occlusion_query =
+ primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ if (has_occlusion_query)
+ v3dX(cmd_buffer_emit_occlusion_query)(primary);
+
+ /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
+ * pipelines used by the secondaries do, we need to re-start the primary
+ * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
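+ * (That helper covers the analogous case for draws recorded directly
+ * into the primary command buffer.)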
+ */ + bool pending_barrier = false; + bool pending_bcl_barrier = false; + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]); + + assert(secondary->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT); + + list_for_each_entry(struct v3dv_job, secondary_job, + &secondary->jobs, list_link) { + if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) { + /* If the job is a CL, then we branch to it from the primary BCL. + * In this case the secondary's BCL is finished with a + * RETURN_FROM_SUB_LIST command to return back to the primary BCL + * once we are done executing it. + */ + assert(v3dv_cl_offset(&secondary_job->rcl) == 0); + assert(secondary_job->bcl.bo); + + /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */ + STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1); + assert(v3dv_cl_offset(&secondary_job->bcl) >= 1); + assert(*(((uint8_t *)secondary_job->bcl.next) - 1) == + V3DX(RETURN_FROM_SUB_LIST_opcode)); + + /* If this secondary has any barriers (or we had any pending barrier + * to apply), then we can't just branch to it from the primary, we + * need to split the primary to create a new job that can consume + * the barriers first. + * + * FIXME: in this case, maybe just copy the secondary BCL without + * the RETURN_FROM_SUB_LIST into the primary job to skip the + * branch? + */ + struct v3dv_job *primary_job = primary->state.job; + if (!primary_job || secondary_job->serialize || pending_barrier) { + const bool needs_bcl_barrier = + secondary_job->needs_bcl_sync || pending_bcl_barrier; + primary_job = + cmd_buffer_subpass_split_for_barrier(primary, + needs_bcl_barrier); + v3dv_return_if_oom(primary, NULL); + + /* Since we have created a new primary we need to re-emit + * occlusion query state. + */ + if (has_occlusion_query) + v3dX(cmd_buffer_emit_occlusion_query)(primary); + } + + /* Make sure our primary job has all required BO references */ + set_foreach(secondary_job->bos, entry) { + struct v3dv_bo *bo = (struct v3dv_bo *)entry->key; + v3dv_job_add_bo(primary_job, bo); + } + + /* Emit required branch instructions. We expect each of these + * to end with a corresponding 'return from sub list' item. + */ + list_for_each_entry(struct v3dv_bo, bcl_bo, + &secondary_job->bcl.bo_list, list_link) { + v3dv_cl_ensure_space_with_branch(&primary_job->bcl, + cl_packet_length(BRANCH_TO_SUB_LIST)); + v3dv_return_if_oom(primary, NULL); + cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) { + branch.address = v3dv_cl_address(bcl_bo, 0); + } + } + + primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl; + } else { + /* This is a regular job (CPU or GPU), so just finish the current + * primary job (if any) and then add the secondary job to the + * primary's job list right after it. + */ + v3dv_cmd_buffer_finish_job(primary); + v3dv_job_clone_in_cmd_buffer(secondary_job, primary); + if (pending_barrier) { + secondary_job->serialize = true; + if (pending_bcl_barrier) + secondary_job->needs_bcl_sync = true; + } + } + + pending_barrier = false; + pending_bcl_barrier = false; + } + + /* If the secondary has recorded any vkCmdEndQuery commands, we need to + * copy this state to the primary so it is processed properly when the + * current primary job is finished. + */ + cmd_buffer_copy_secondary_end_query_state(primary, secondary); + + /* If this secondary had any pending barrier state we will need that + * barrier state consumed with whatever comes next in the primary. 
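+ * (These flags either force a split of the primary before the next
+ * secondary CL job, or are folded into the primary's own barrier state
+ * once the last command buffer has been processed.)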
+ */ + assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); + pending_barrier = secondary->state.has_barrier; + pending_bcl_barrier = secondary->state.has_bcl_barrier; + } + + if (pending_barrier) { + primary->state.has_barrier = true; + primary->state.has_bcl_barrier |= pending_bcl_barrier; + } +} + +static void +emit_gs_shader_state_record(struct v3dv_job *job, + struct v3dv_bo *assembly_bo, + struct v3dv_shader_variant *gs_bin, + struct v3dv_cl_reloc gs_bin_uniforms, + struct v3dv_shader_variant *gs, + struct v3dv_cl_reloc gs_render_uniforms) +{ + cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) { + shader.geometry_bin_mode_shader_code_address = + v3dv_cl_address(assembly_bo, gs_bin->assembly_offset); + shader.geometry_bin_mode_shader_4_way_threadable = + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; + shader.geometry_bin_mode_shader_propagate_nans = true; + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + + shader.geometry_render_mode_shader_code_address = + v3dv_cl_address(assembly_bo, gs->assembly_offset); + shader.geometry_render_mode_shader_4_way_threadable = + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; + shader.geometry_render_mode_shader_propagate_nans = true; + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +} + +static uint8_t +v3d_gs_output_primitive(uint32_t prim_type) +{ + switch (prim_type) { + case GL_POINTS: + return GEOMETRY_SHADER_POINTS; + case GL_LINE_STRIP: + return GEOMETRY_SHADER_LINE_STRIP; + case GL_TRIANGLE_STRIP: + return GEOMETRY_SHADER_TRI_STRIP; + default: + unreachable("Unsupported primitive type"); + } +} + +static void +emit_tes_gs_common_params(struct v3dv_job *job, + uint8_t gs_out_prim_type, + uint8_t gs_num_invocations) +{ + cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) { + shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE; + shader.tessellation_point_mode = false; + shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN; + shader.tessellation_clockwise = true; + shader.tessellation_invocations = 1; + + shader.geometry_shader_output_format = + v3d_gs_output_primitive(gs_out_prim_type); + shader.geometry_shader_instances = gs_num_invocations & 0x1F; + } +} + +static uint8_t +simd_width_to_gs_pack_mode(uint32_t width) +{ + switch (width) { + case 16: + return V3D_PACK_MODE_16_WAY; + case 8: + return V3D_PACK_MODE_8_WAY; + case 4: + return V3D_PACK_MODE_4_WAY; + case 1: + return V3D_PACK_MODE_1_WAY; + default: + unreachable("Invalid SIMD width"); + }; +} + +static void +emit_tes_gs_shader_params(struct v3dv_job *job, + uint32_t gs_simd, + uint32_t gs_vpm_output_size, + uint32_t gs_max_vpm_input_size_per_batch) +{ + cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) { + shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED; + shader.per_patch_data_column_depth = 1; + shader.tcs_output_segment_size_in_sectors = 1; + shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; + shader.tes_output_segment_size_in_sectors = 1; + shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; + shader.gs_output_segment_size_in_sectors = gs_vpm_output_size; + shader.gs_output_segment_pack_mode = + simd_width_to_gs_pack_mode(gs_simd); + shader.tbg_max_patches_per_tcs_batch = 1; + 
shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0; + shader.tbg_min_tcs_output_segments_required_in_play = 1; + shader.tbg_min_per_patch_data_segments_required_in_play = 1; + shader.tpg_max_patches_per_tes_batch = 1; + shader.tpg_max_vertex_segments_per_tes_batch = 0; + shader.tpg_max_tcs_output_segments_per_tes_batch = 1; + shader.tpg_min_tes_output_segments_required_in_play = 1; + shader.gbg_max_tes_output_vertex_segments_per_gs_batch = + gs_max_vpm_input_size_per_batch; + shader.gbg_min_gs_output_segments_required_in_play = 1; + } +} + +void +v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_pipeline *pipeline = state->gfx.pipeline; + assert(pipeline); + + struct v3dv_shader_variant *vs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs; + + struct v3dv_shader_variant *vs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs; + + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs; + + struct v3dv_shader_variant *gs_variant = NULL; + struct v3dv_shader_variant *gs_bin_variant = NULL; + struct v3d_gs_prog_data *prog_data_gs = NULL; + struct v3d_gs_prog_data *prog_data_gs_bin = NULL; + if (pipeline->has_gs) { + gs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; + prog_data_gs = gs_variant->prog_data.gs; + + gs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; + prog_data_gs_bin = gs_bin_variant->prog_data.gs; + } + + /* Update the cache dirty flag based on the shader progs data */ + job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl; + if (pipeline->has_gs) { + job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl; + } + + /* See GFXH-930 workaround below */ + uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1); + + uint32_t shader_state_record_length = + cl_packet_length(GL_SHADER_STATE_RECORD); + if (pipeline->has_gs) { + shader_state_record_length += + cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) + + cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) + + 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS); + } + + uint32_t shader_rec_offset = + v3dv_cl_ensure_space(&job->indirect, + shader_state_record_length + + num_elements_to_emit * + cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD), + 32); + v3dv_return_if_oom(cmd_buffer, NULL); + + struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo; + + if (pipeline->has_gs) { + emit_gs_shader_state_record(job, + assembly_bo, + gs_bin_variant, + cmd_buffer->state.uniforms.gs_bin, + gs_variant, + cmd_buffer->state.uniforms.gs); + + emit_tes_gs_common_params(job, + prog_data_gs->out_prim_type, + prog_data_gs->num_invocations); + + emit_tes_gs_shader_params(job, + pipeline->vpm_cfg_bin.gs_width, + pipeline->vpm_cfg_bin.Gd, + pipeline->vpm_cfg_bin.Gv); + + emit_tes_gs_shader_params(job, + pipeline->vpm_cfg.gs_width, + pipeline->vpm_cfg.Gd, + pipeline->vpm_cfg.Gv); + } + + struct v3dv_bo *default_attribute_values = 
+ pipeline->default_attribute_values != NULL ?
+ pipeline->default_attribute_values :
+ pipeline->device->default_attribute_float;
+
+ cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
+ pipeline->shader_state_record, shader) {
+
+ /* FIXME: we are setting these values here and during the
+ * prepacking. This is because both cl_emit_with_prepacked and v3dvx_pack
+ * assert for minimum values of these. It would be good to get
+ * v3dvx_pack to assert on the final value if possible.
+ */
+ shader.min_coord_shader_input_segments_required_in_play =
+ pipeline->vpm_cfg_bin.As;
+ shader.min_vertex_shader_input_segments_required_in_play =
+ pipeline->vpm_cfg.As;
+
+ shader.coordinate_shader_code_address =
+ v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
+ shader.vertex_shader_code_address =
+ v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
+ shader.fragment_shader_code_address =
+ v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
+
+ shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
+ shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
+ shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+
+ shader.address_of_default_attribute_values =
+ v3dv_cl_address(default_attribute_values, 0);
+
+ shader.any_shader_reads_hardware_written_primitive_id =
+ (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
+ shader.insert_primitive_id_as_first_varying_to_fragment_shader =
+ !pipeline->has_gs && prog_data_fs->uses_pid;
+ }
+
+ /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
+ bool cs_loaded_any = false;
+ const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
+ prog_data_vs_bin->uses_biid ||
+ prog_data_vs_bin->uses_vid;
+ const uint32_t packet_length =
+ cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
+
+ uint32_t emitted_va_count = 0;
+ for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
+ assert(i < MAX_VERTEX_ATTRIBS);
+
+ if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
+ continue;
+
+ const uint32_t binding = pipeline->va[i].binding;
+
+ /* We store each vertex attribute in the array using its driver location
+ * as index.
+ */
+ const uint32_t location = i;
+
+ struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
+
+ cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
+ &pipeline->vertex_attrs[i * packet_length], attr) {
+
+ assert(c_vb->buffer->mem->bo);
+ attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
+ c_vb->buffer->mem_offset +
+ pipeline->va[i].offset +
+ c_vb->offset);
+
+ attr.number_of_values_read_by_coordinate_shader =
+ prog_data_vs_bin->vattr_sizes[location];
+ attr.number_of_values_read_by_vertex_shader =
+ prog_data_vs->vattr_sizes[location];
+
+ /* GFXH-930: At least one attribute must be enabled and read by CS
+ * and VS. If we have attributes being consumed by the VS but not
+ * the CS, then set up a dummy load of the last attribute into the
+ * CS's VPM inputs. (Since CS is just dead-code-elimination compared
+ * to VS, we can't have CS loading but not VS).
+ *
+ * GFXH-1602: first attribute must be active if using builtins.
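+ *
+ * For example, if the VS reads attributes 0 and 1 but the CS reads
+ * neither, the record for attribute 1 (the last one) is given a dummy
+ * number_of_values_read_by_coordinate_shader of 1 below.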
+ */ + if (prog_data_vs_bin->vattr_sizes[location]) + cs_loaded_any = true; + + if (i == 0 && cs_uses_builtins && !cs_loaded_any) { + attr.number_of_values_read_by_coordinate_shader = 1; + cs_loaded_any = true; + } else if (i == pipeline->va_count - 1 && !cs_loaded_any) { + attr.number_of_values_read_by_coordinate_shader = 1; + cs_loaded_any = true; + } + + attr.maximum_index = 0xffffff; + } + + emitted_va_count++; + } + + if (pipeline->va_count == 0) { + /* GFXH-930: At least one attribute must be enabled and read + * by CS and VS. If we have no attributes being consumed by + * the shader, set up a dummy to be loaded into the VPM. + */ + cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) { + /* Valid address of data whose value will be unused. */ + attr.address = v3dv_cl_address(job->indirect.bo, 0); + + attr.type = ATTRIBUTE_FLOAT; + attr.stride = 0; + attr.vec_size = 1; + + attr.number_of_values_read_by_coordinate_shader = 1; + attr.number_of_values_read_by_vertex_shader = 1; + } + } + + if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) { + v3dv_cl_ensure_space_with_branch(&job->bcl, + sizeof(pipeline->vcm_cache_size)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size); + } + + v3dv_cl_ensure_space_with_branch(&job->bcl, + cl_packet_length(GL_SHADER_STATE)); + v3dv_return_if_oom(cmd_buffer, NULL); + + if (pipeline->has_gs) { + cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) { + state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset); + state.number_of_attribute_arrays = num_elements_to_emit; + } + } else { + cl_emit(&job->bcl, GL_SHADER_STATE, state) { + state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset); + state.number_of_attribute_arrays = num_elements_to_emit; + } + } + + cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER | + V3DV_CMD_DIRTY_DESCRIPTOR_SETS | + V3DV_CMD_DIRTY_PUSH_CONSTANTS); + cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS; + cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS; +} + +/* FIXME: C&P from v3dx_draw. Refactor to common place? 
*/ +static uint32_t +v3d_hw_prim_type(enum pipe_prim_type prim_type) +{ + switch (prim_type) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_LOOP: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + return prim_type; + + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY); + + default: + unreachable("Unsupported primitive type"); + } +} + +void +v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_draw_info *info) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + struct v3dv_pipeline *pipeline = state->gfx.pipeline; + + assert(pipeline); + + uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + + if (info->first_instance > 0) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) { + base.base_instance = info->first_instance; + base.base_vertex = 0; + } + } + + if (info->instance_count > 1) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) { + prim.mode = hw_prim_type; + prim.index_of_first_vertex = info->first_vertex; + prim.number_of_instances = info->instance_count; + prim.instance_length = info->vertex_count; + } + } else { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS)); + v3dv_return_if_oom(cmd_buffer, NULL); + cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) { + prim.mode = hw_prim_type; + prim.length = info->vertex_count; + prim.index_of_first_vertex = info->first_vertex; + } + } +} + +void +v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + /* We flag all state as dirty when we create a new job so make sure we + * have a valid index buffer before attempting to emit state for it. 
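+ * (state.index_buffer.buffer is VK_NULL_HANDLE until
+ * vkCmdBindIndexBuffer has been recorded, so ibuffer below may
+ * legitimately be NULL.)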
+ */ + struct v3dv_buffer *ibuffer = + v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer); + if (ibuffer) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP)); + v3dv_return_if_oom(cmd_buffer, NULL); + + const uint32_t offset = cmd_buffer->state.index_buffer.offset; + cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) { + ib.address = v3dv_cl_address(ibuffer->mem->bo, + ibuffer->mem_offset + offset); + ib.size = ibuffer->mem->bo->size; + } + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER; +} + +void +v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1; + uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size; + + if (vertexOffset != 0 || firstInstance != 0) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) { + base.base_instance = firstInstance; + base.base_vertex = vertexOffset; + } + } + + if (instanceCount == 1) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(INDEXED_PRIM_LIST)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) { + prim.index_type = index_type; + prim.length = indexCount; + prim.index_offset = index_offset; + prim.mode = hw_prim_type; + prim.enable_primitive_restarts = pipeline->primitive_restart; + } + } else if (instanceCount > 1) { + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) { + prim.index_type = index_type; + prim.index_offset = index_offset; + prim.mode = hw_prim_type; + prim.enable_primitive_restarts = pipeline->primitive_restart; + prim.number_of_instances = instanceCount; + prim.instance_length = indexCount; + } + } +} + +void +v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) { + prim.mode = hw_prim_type; + prim.number_of_draw_indirect_array_records = drawCount; + prim.stride_in_multiples_of_4_bytes = stride >> 2; + prim.address = v3dv_cl_address(buffer->mem->bo, + buffer->mem_offset + offset); + } +} + +void +v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology); + uint8_t index_type 
= ffs(cmd_buffer->state.index_buffer.index_size) - 1; + + v3dv_cl_ensure_space_with_branch( + &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST)); + v3dv_return_if_oom(cmd_buffer, NULL); + + cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) { + prim.index_type = index_type; + prim.mode = hw_prim_type; + prim.enable_primitive_restarts = pipeline->primitive_restart; + prim.number_of_draw_indirect_indexed_records = drawCount; + prim.stride_in_multiples_of_4_bytes = stride >> 2; + prim.address = v3dv_cl_address(buffer->mem->bo, + buffer->mem_offset + offset); + } +} + +void +v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, + uint32_t *rt_type, + uint32_t *rt_clamp) +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + const struct v3dv_framebuffer *framebuffer = state->framebuffer; + assert(attachment_idx < framebuffer->attachment_count); + struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; + assert(iview->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT); + + *rt_bpp = iview->internal_bpp; + *rt_type = iview->internal_type; + if (vk_format_is_int(iview->vk.format)) + *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(iview->vk.format)) + *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; + else + *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_descriptor_set.c b/lib/mesa/src/broadcom/vulkan/v3dvx_descriptor_set.c new file mode 100644 index 000000000..2c28ce46a --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_descriptor_set.c @@ -0,0 +1,98 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" + +/* + * Returns how much space a given descriptor type needs on a bo (GPU + * memory). 
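+ *
+ * For example, a COMBINED_IMAGE_SAMPLER occupies two 32-byte aligned
+ * records (texture shader state followed by sampler state), while plain
+ * buffer descriptors need no BO space and return 0.
+ *
+ * A minimal sketch of how a caller could size a pool BO with this
+ * helper (hypothetical loop, not the actual pool-creation code):
+ *
+ *    uint32_t bo_size = 0;
+ *    for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++)
+ *       bo_size += pCreateInfo->pPoolSizes[i].descriptorCount *
+ *                  v3dX(descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type);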
+ */
+uint32_t
+v3dX(descriptor_bo_size)(VkDescriptorType type)
+{
+ switch(type) {
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ return cl_aligned_packet_length(SAMPLER_STATE, 32);
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ return cl_aligned_packet_length(SAMPLER_STATE, 32) +
+ cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32);
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ return cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32);
+ default:
+ return 0;
+ }
+}
+
+/* To compute the max_bo_size we want to iterate through the descriptor
+ * types. Unfortunately we can't just use the descriptor type enum values, as
+ * the values are not defined consecutively (so extensions could add new
+ * descriptor types), and VK_DESCRIPTOR_TYPE_MAX_ENUM is also a really big
+ * number.
+ */
+static const uint32_t supported_descriptor_types[] = {
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
+ VK_DESCRIPTOR_TYPE_SAMPLER,
+ VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+ VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
+ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+ VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
+};
+
+uint32_t
+v3dX(max_descriptor_bo_size)(void)
+{
+ static uint32_t max = 0;
+
+ if (max == 0) {
+ for (uint32_t i = 0; i < ARRAY_SIZE(supported_descriptor_types); i++)
+ max = MAX2(max, v3dX(descriptor_bo_size)(supported_descriptor_types[i]));
+ }
+ assert(max != 0);
+
+ return max;
+}
+
+
+uint32_t
+v3dX(combined_image_sampler_texture_state_offset)(void)
+{
+ return 0;
+}
+
+uint32_t
+v3dX(combined_image_sampler_sampler_state_offset)(void)
+{
+ return cl_aligned_packet_length(TEXTURE_SHADER_STATE, 32);
+}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_device.c b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c
new file mode 100644
index 000000000..a48738aec
--- /dev/null
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2021 Raspberry Pi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#include "v3dv_private.h" + +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" +#include "vk_format_info.h" +#include "util/u_pack_color.h" +#include "util/half_float.h" + +static const enum V3DX(Wrap_Mode) vk_to_v3d_wrap_mode[] = { + [VK_SAMPLER_ADDRESS_MODE_REPEAT] = V3D_WRAP_MODE_REPEAT, + [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = V3D_WRAP_MODE_MIRROR, + [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = V3D_WRAP_MODE_CLAMP, + [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = V3D_WRAP_MODE_MIRROR_ONCE, + [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = V3D_WRAP_MODE_BORDER, +}; + +static const enum V3DX(Compare_Function) +vk_to_v3d_compare_func[] = { + [VK_COMPARE_OP_NEVER] = V3D_COMPARE_FUNC_NEVER, + [VK_COMPARE_OP_LESS] = V3D_COMPARE_FUNC_LESS, + [VK_COMPARE_OP_EQUAL] = V3D_COMPARE_FUNC_EQUAL, + [VK_COMPARE_OP_LESS_OR_EQUAL] = V3D_COMPARE_FUNC_LEQUAL, + [VK_COMPARE_OP_GREATER] = V3D_COMPARE_FUNC_GREATER, + [VK_COMPARE_OP_NOT_EQUAL] = V3D_COMPARE_FUNC_NOTEQUAL, + [VK_COMPARE_OP_GREATER_OR_EQUAL] = V3D_COMPARE_FUNC_GEQUAL, + [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, +}; + + +static union pipe_color_union encode_border_color( + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) +{ + const struct util_format_description *desc = + vk_format_description(bc_info->format); + + const struct v3dv_format *format = v3dX(get_format)(bc_info->format); + + union pipe_color_union border; + for (int i = 0; i < 4; i++) { + if (format->swizzle[i] <= 3) + border.ui[i] = bc_info->customBorderColor.uint32[format->swizzle[i]]; + else + border.ui[i] = 0; + } + + /* handle clamping */ + if (vk_format_has_depth(bc_info->format) && + vk_format_has_stencil(bc_info->format)) { + border.f[0] = CLAMP(border.f[0], 0, 1); + border.ui[1] = CLAMP(border.ui[1], 0, 0xff); + } else if (vk_format_is_unorm(bc_info->format)) { + for (int i = 0; i < 4; i++) + border.f[i] = CLAMP(border.f[i], 0, 1); + } else if (vk_format_is_snorm(bc_info->format)) { + for (int i = 0; i < 4; i++) + border.f[i] = CLAMP(border.f[i], -1, 1); + } else if (vk_format_is_uint(bc_info->format) && + desc->channel[0].size < 32) { + for (int i = 0; i < 4; i++) + border.ui[i] = CLAMP(border.ui[i], 0, (1 << desc->channel[i].size)); + } else if (vk_format_is_sint(bc_info->format) && + desc->channel[0].size < 32) { + for (int i = 0; i < 4; i++) + border.i[i] = CLAMP(border.i[i], + -(1 << (desc->channel[i].size - 1)), + (1 << (desc->channel[i].size - 1)) - 1); + } + + /* convert from float to expected format */ + if (vk_format_is_srgb(bc_info->format) || + vk_format_is_compressed(bc_info->format)) { + for (int i = 0; i < 4; i++) + border.ui[i] = _mesa_float_to_half(border.f[i]); + } else if (vk_format_is_unorm(bc_info->format)) { + for (int i = 0; i < 4; i++) { + switch (desc->channel[i].size) { + case 8: + case 16: + /* expect u16 for non depth values */ + if (!vk_format_has_depth(bc_info->format)) + border.ui[i] = (uint32_t) (border.f[i] * (float) 0xffff); + break; + case 24: + case 32: + /* uses full f32; no conversion needed */ + break; + default: + border.ui[i] = _mesa_float_to_half(border.f[i]); + break; + } + } + } else if (vk_format_is_snorm(bc_info->format)) { + for (int i = 0; i < 4; i++) { + switch (desc->channel[i].size) { + case 8: + border.ui[i] = (int32_t) (border.f[i] * (float) 0x3fff); + break; + case 16: + border.i[i] = (int32_t) (border.f[i] * (float) 0x7fff); + break; + case 24: + case 32: + /* uses full f32; no conversion needed */ + break; + default: + 
border.ui[i] = _mesa_float_to_half(border.f[i]); + break; + } + } + } else if (vk_format_is_float(bc_info->format)) { + for (int i = 0; i < 4; i++) { + switch(desc->channel[i].size) { + case 16: + border.ui[i] = _mesa_float_to_half(border.f[i]); + break; + default: + break; + } + } + } + + return border; +} + +void +v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) +{ + enum V3DX(Border_Color_Mode) border_color_mode; + + switch (pCreateInfo->borderColor) { + case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK: + case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK: + border_color_mode = V3D_BORDER_COLOR_0000; + break; + case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK: + case VK_BORDER_COLOR_INT_OPAQUE_BLACK: + border_color_mode = V3D_BORDER_COLOR_0001; + break; + case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE: + case VK_BORDER_COLOR_INT_OPAQUE_WHITE: + border_color_mode = V3D_BORDER_COLOR_1111; + break; + case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT: + case VK_BORDER_COLOR_INT_CUSTOM_EXT: + border_color_mode = V3D_BORDER_COLOR_FOLLOWS; + break; + default: + unreachable("Unknown border color"); + break; + } + + /* For some texture formats, when clamping to transparent black border the + * CTS expects alpha to be set to 1 instead of 0, but the border color mode + * will take priority over the texture state swizzle, so the only way to + * fix that is to apply a swizzle in the shader. Here we keep track of + * whether we are activating that mode and we will decide if we need to + * activate the texture swizzle lowering in the shader key at compile time + * depending on the actual texture format. + */ + if ((pCreateInfo->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || + pCreateInfo->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || + pCreateInfo->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER) && + border_color_mode == V3D_BORDER_COLOR_0000) { + sampler->clamp_to_transparent_black_border = true; + } + + v3dvx_pack(sampler->sampler_state, SAMPLER_STATE, s) { + if (pCreateInfo->anisotropyEnable) { + s.anisotropy_enable = true; + if (pCreateInfo->maxAnisotropy > 8) + s.maximum_anisotropy = 3; + else if (pCreateInfo->maxAnisotropy > 4) + s.maximum_anisotropy = 2; + else if (pCreateInfo->maxAnisotropy > 2) + s.maximum_anisotropy = 1; + } + + s.border_color_mode = border_color_mode; + + if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { + union pipe_color_union border = encode_border_color(bc_info); + + s.border_color_word_0 = border.ui[0]; + s.border_color_word_1 = border.ui[1]; + s.border_color_word_2 = border.ui[2]; + s.border_color_word_3 = border.ui[3]; + } + + s.wrap_i_border = false; /* Also hardcoded on v3d */ + s.wrap_s = vk_to_v3d_wrap_mode[pCreateInfo->addressModeU]; + s.wrap_t = vk_to_v3d_wrap_mode[pCreateInfo->addressModeV]; + s.wrap_r = vk_to_v3d_wrap_mode[pCreateInfo->addressModeW]; + s.fixed_bias = pCreateInfo->mipLodBias; + s.max_level_of_detail = MIN2(MAX2(0, pCreateInfo->maxLod), 15); + s.min_level_of_detail = MIN2(MAX2(0, pCreateInfo->minLod), 15); + s.srgb_disable = 0; /* Not even set by v3d */ + s.depth_compare_function = + vk_to_v3d_compare_func[pCreateInfo->compareEnable ? 
+ pCreateInfo->compareOp : VK_COMPARE_OP_NEVER]; + s.mip_filter_nearest = pCreateInfo->mipmapMode == VK_SAMPLER_MIPMAP_MODE_NEAREST; + s.min_filter_nearest = pCreateInfo->minFilter == VK_FILTER_NEAREST; + s.mag_filter_nearest = pCreateInfo->magFilter == VK_FILTER_NEAREST; + } +} + +/** + * This computes the maximum bpp used by any of the render targets used by + * a particular subpass and checks if any of those render targets are + * multisampled. If we don't have a subpass (when we are not inside a + * render pass), then we assume that all framebuffer attachments are used. + */ +void +v3dX(framebuffer_compute_internal_bpp_msaa)( + const struct v3dv_framebuffer *framebuffer, + const struct v3dv_subpass *subpass, + uint8_t *max_bpp, + bool *msaa) +{ + STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); + *max_bpp = V3D_INTERNAL_BPP_32; + *msaa = false; + + if (subpass) { + for (uint32_t i = 0; i < subpass->color_count; i++) { + uint32_t att_idx = subpass->color_attachments[i].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + + const struct v3dv_image_view *att = framebuffer->attachments[att_idx]; + assert(att); + + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) + *max_bpp = MAX2(*max_bpp, att->internal_bpp); + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } + + if (!*msaa && subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + const struct v3dv_image_view *att = + framebuffer->attachments[subpass->ds_attachment.attachment]; + assert(att); + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } + + return; + } + + assert(framebuffer->attachment_count <= 4); + for (uint32_t i = 0; i < framebuffer->attachment_count; i++) { + const struct v3dv_image_view *att = framebuffer->attachments[i]; + assert(att); + + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) + *max_bpp = MAX2(*max_bpp, att->internal_bpp); + + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) + *msaa = true; + } + + return; +} + +uint32_t +v3dX(zs_buffer_from_aspect_bits)(VkImageAspectFlags aspects) +{ + const VkImageAspectFlags zs_aspects = + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + const VkImageAspectFlags filtered_aspects = aspects & zs_aspects; + + if (filtered_aspects == zs_aspects) + return ZSTENCIL; + else if (filtered_aspects == VK_IMAGE_ASPECT_DEPTH_BIT) + return Z; + else if (filtered_aspects == VK_IMAGE_ASPECT_STENCIL_BIT) + return STENCIL; + else + return NONE; +} + +void +v3dX(get_hw_clear_color)(const VkClearColorValue *color, + uint32_t internal_type, + uint32_t internal_size, + uint32_t *hw_color) +{ + union util_color uc; + switch (internal_type) { + case V3D_INTERNAL_TYPE_8: + util_pack_color(color->float32, PIPE_FORMAT_R8G8B8A8_UNORM, &uc); + memcpy(hw_color, uc.ui, internal_size); + break; + case V3D_INTERNAL_TYPE_8I: + case V3D_INTERNAL_TYPE_8UI: + hw_color[0] = ((color->uint32[0] & 0xff) | + (color->uint32[1] & 0xff) << 8 | + (color->uint32[2] & 0xff) << 16 | + (color->uint32[3] & 0xff) << 24); + break; + case V3D_INTERNAL_TYPE_16F: + util_pack_color(color->float32, PIPE_FORMAT_R16G16B16A16_FLOAT, &uc); + memcpy(hw_color, uc.ui, internal_size); + break; + case V3D_INTERNAL_TYPE_16I: + case V3D_INTERNAL_TYPE_16UI: + hw_color[0] = ((color->uint32[0] & 0xffff) | color->uint32[1] << 16); + hw_color[1] = ((color->uint32[2] & 0xffff) | color->uint32[3] << 16); + break; + case V3D_INTERNAL_TYPE_32F: + case V3D_INTERNAL_TYPE_32I: + case V3D_INTERNAL_TYPE_32UI: + memcpy(hw_color, color->uint32, internal_size); + break; + } +} + +#ifdef 
DEBUG +void +v3dX(device_check_prepacked_sizes)(void) +{ + STATIC_ASSERT(V3DV_SAMPLER_STATE_LENGTH >= + cl_packet_length(SAMPLER_STATE)); + STATIC_ASSERT(V3DV_TEXTURE_SHADER_STATE_LENGTH >= + cl_packet_length(TEXTURE_SHADER_STATE)); + STATIC_ASSERT(V3DV_SAMPLER_STATE_LENGTH >= + cl_packet_length(SAMPLER_STATE)); + STATIC_ASSERT(V3DV_BLEND_CFG_LENGTH>= + cl_packet_length(BLEND_CFG)); + STATIC_ASSERT(V3DV_CFG_BITS_LENGTH>= + cl_packet_length(CFG_BITS)); + STATIC_ASSERT(V3DV_GL_SHADER_STATE_RECORD_LENGTH >= + cl_packet_length(GL_SHADER_STATE_RECORD)); + STATIC_ASSERT(V3DV_VCM_CACHE_SIZE_LENGTH>= + cl_packet_length(VCM_CACHE_SIZE)); + STATIC_ASSERT(V3DV_GL_SHADER_STATE_ATTRIBUTE_RECORD_LENGTH >= + cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD)); + STATIC_ASSERT(V3DV_STENCIL_CFG_LENGTH >= + cl_packet_length(STENCIL_CFG)); +} +#endif diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c new file mode 100644 index 000000000..4f77dd008 --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c @@ -0,0 +1,465 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" + +#include "util/format/u_format.h" + +#define SWIZ(x,y,z,w) { \ + PIPE_SWIZZLE_##x, \ + PIPE_SWIZZLE_##y, \ + PIPE_SWIZZLE_##z, \ + PIPE_SWIZZLE_##w \ +} + +#define FORMAT(vk, rt, tex, swiz, return_size, supports_filtering) \ + [VK_FORMAT_##vk] = { \ + true, \ + V3D_OUTPUT_IMAGE_FORMAT_##rt, \ + TEXTURE_DATA_FORMAT_##tex, \ + swiz, \ + return_size, \ + supports_filtering, \ + } + +#define SWIZ_X001 SWIZ(X, 0, 0, 1) +#define SWIZ_XY01 SWIZ(X, Y, 0, 1) +#define SWIZ_XYZ1 SWIZ(X, Y, Z, 1) +#define SWIZ_XYZW SWIZ(X, Y, Z, W) +#define SWIZ_YZWX SWIZ(Y, Z, W, X) +#define SWIZ_YZW1 SWIZ(Y, Z, W, 1) +#define SWIZ_ZYXW SWIZ(Z, Y, X, W) +#define SWIZ_ZYX1 SWIZ(Z, Y, X, 1) +#define SWIZ_XXXY SWIZ(X, X, X, Y) +#define SWIZ_XXX1 SWIZ(X, X, X, 1) +#define SWIZ_XXXX SWIZ(X, X, X, X) +#define SWIZ_000X SWIZ(0, 0, 0, X) +#define SWIZ_WXYZ SWIZ(W, X, Y, Z) + +/* FIXME: expand format table to describe whether the format is supported + * for buffer surfaces (texel buffers, vertex buffers, etc). 
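+ *
+ * Each FORMAT() entry expands to a v3dv_format initializer. For example,
+ *
+ *    FORMAT(R8G8B8A8_UNORM, RGBA8, RGBA8, SWIZ_XYZW, 16, true)
+ *
+ * marks VK_FORMAT_R8G8B8A8_UNORM as supported, selects the RGBA8
+ * render-target output and RGBA8 texture data formats, applies an
+ * identity swizzle, and records a 16-bit TMU return size with filtering
+ * supported.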
+ */ +static const struct v3dv_format format_table[] = { + /* Color, 4 channels */ + FORMAT(B8G8R8A8_SRGB, SRGB8_ALPHA8, RGBA8, SWIZ_ZYXW, 16, true), + FORMAT(B8G8R8A8_UNORM, RGBA8, RGBA8, SWIZ_ZYXW, 16, true), + + FORMAT(R8G8B8A8_SRGB, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), + FORMAT(R8G8B8A8_UNORM, RGBA8, RGBA8, SWIZ_XYZW, 16, true), + FORMAT(R8G8B8A8_SNORM, NO, RGBA8_SNORM, SWIZ_XYZW, 16, true), + FORMAT(R8G8B8A8_SINT, RGBA8I, RGBA8I, SWIZ_XYZW, 16, false), + FORMAT(R8G8B8A8_UINT, RGBA8UI, RGBA8UI, SWIZ_XYZW, 16, false), + + FORMAT(R16G16B16A16_SFLOAT, RGBA16F, RGBA16F, SWIZ_XYZW, 16, true), + FORMAT(R16G16B16A16_UNORM, NO, RGBA16, SWIZ_XYZW, 32, true), + FORMAT(R16G16B16A16_SNORM, NO, RGBA16_SNORM, SWIZ_XYZW, 32, true), + FORMAT(R16G16B16A16_SINT, RGBA16I, RGBA16I, SWIZ_XYZW, 16, false), + FORMAT(R16G16B16A16_UINT, RGBA16UI, RGBA16UI, SWIZ_XYZW, 16, false), + + FORMAT(R32G32B32A32_SFLOAT, RGBA32F, RGBA32F, SWIZ_XYZW, 32, false), + FORMAT(R32G32B32A32_SINT, RGBA32I, RGBA32I, SWIZ_XYZW, 32, false), + FORMAT(R32G32B32A32_UINT, RGBA32UI, RGBA32UI, SWIZ_XYZW, 32, false), + + /* Color, 3 channels */ + FORMAT(R32G32B32_SFLOAT, NO, NO, SWIZ_XYZ1, 0, false), + FORMAT(R32G32B32_UINT, NO, NO, SWIZ_XYZ1, 0, false), + FORMAT(R32G32B32_SINT, NO, NO, SWIZ_XYZ1, 0, false), + + /* Color, 2 channels */ + FORMAT(R8G8_UNORM, RG8, RG8, SWIZ_XY01, 16, true), + FORMAT(R8G8_SNORM, NO, RG8_SNORM, SWIZ_XY01, 16, true), + FORMAT(R8G8_SINT, RG8I, RG8I, SWIZ_XY01, 16, false), + FORMAT(R8G8_UINT, RG8UI, RG8UI, SWIZ_XY01, 16, false), + + FORMAT(R16G16_UNORM, NO, RG16, SWIZ_XY01, 32, true), + FORMAT(R16G16_SNORM, NO, RG16_SNORM, SWIZ_XY01, 32, true), + FORMAT(R16G16_SFLOAT, RG16F, RG16F, SWIZ_XY01, 16, true), + FORMAT(R16G16_SINT, RG16I, RG16I, SWIZ_XY01, 16, false), + FORMAT(R16G16_UINT, RG16UI, RG16UI, SWIZ_XY01, 16, false), + + FORMAT(R32G32_SFLOAT, RG32F, RG32F, SWIZ_XY01, 32, false), + FORMAT(R32G32_SINT, RG32I, RG32I, SWIZ_XY01, 32, false), + FORMAT(R32G32_UINT, RG32UI, RG32UI, SWIZ_XY01, 32, false), + + /* Color, 1 channel */ + FORMAT(R8_UNORM, R8, R8, SWIZ_X001, 16, true), + FORMAT(R8_SNORM, NO, R8_SNORM, SWIZ_X001, 16, true), + FORMAT(R8_SINT, R8I, R8I, SWIZ_X001, 16, false), + FORMAT(R8_UINT, R8UI, R8UI, SWIZ_X001, 16, false), + + FORMAT(R16_UNORM, NO, R16, SWIZ_X001, 32, true), + FORMAT(R16_SNORM, NO, R16_SNORM, SWIZ_X001, 32, true), + FORMAT(R16_SFLOAT, R16F, R16F, SWIZ_X001, 16, true), + FORMAT(R16_SINT, R16I, R16I, SWIZ_X001, 16, false), + FORMAT(R16_UINT, R16UI, R16UI, SWIZ_X001, 16, false), + + FORMAT(R32_SFLOAT, R32F, R32F, SWIZ_X001, 32, false), + FORMAT(R32_SINT, R32I, R32I, SWIZ_X001, 32, false), + FORMAT(R32_UINT, R32UI, R32UI, SWIZ_X001, 32, false), + + /* Color, packed */ + FORMAT(R4G4B4A4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_XYZW, 16, true), + FORMAT(B4G4R4A4_UNORM_PACK16, ABGR4444, RGBA4, SWIZ_ZYXW, 16, true), /* Swap RB */ + FORMAT(R5G6B5_UNORM_PACK16, BGR565, RGB565, SWIZ_XYZ1, 16, true), + FORMAT(R5G5B5A1_UNORM_PACK16, ABGR1555, RGB5_A1, SWIZ_XYZW, 16, true), + FORMAT(A1R5G5B5_UNORM_PACK16, RGBA5551, A1_RGB5, SWIZ_ZYXW, 16, true), /* Swap RB */ + FORMAT(A8B8G8R8_UNORM_PACK32, RGBA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 UNORM */ + FORMAT(A8B8G8R8_SNORM_PACK32, NO, RGBA8_SNORM, SWIZ_XYZW, 16, true), /* RGBA8 SNORM */ + FORMAT(A8B8G8R8_UINT_PACK32, RGBA8UI, RGBA8UI, SWIZ_XYZW, 16, false), /* RGBA8 UINT */ + FORMAT(A8B8G8R8_SINT_PACK32, RGBA8I, RGBA8I, SWIZ_XYZW, 16, false), /* RGBA8 SINT */ + FORMAT(A8B8G8R8_SRGB_PACK32, SRGB8_ALPHA8, RGBA8, SWIZ_XYZW, 16, true), /* RGBA8 sRGB */ + 
FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, true), + FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, false), + FORMAT(E5B9G9R9_UFLOAT_PACK32, NO, RGB9_E5, SWIZ_XYZ1, 16, true), + FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true), + + /* Depth */ + FORMAT(D16_UNORM, D16, DEPTH_COMP16, SWIZ_X001, 32, false), + FORMAT(D32_SFLOAT, D32F, DEPTH_COMP32F, SWIZ_X001, 32, false), + FORMAT(X8_D24_UNORM_PACK32, D24S8, DEPTH24_X8, SWIZ_X001, 32, false), + + /* Depth + Stencil */ + FORMAT(D24_UNORM_S8_UINT, D24S8, DEPTH24_X8, SWIZ_X001, 32, false), + + /* Compressed: ETC2 / EAC */ + FORMAT(ETC2_R8G8B8_UNORM_BLOCK, NO, RGB8_ETC2, SWIZ_XYZ1, 16, true), + FORMAT(ETC2_R8G8B8_SRGB_BLOCK, NO, RGB8_ETC2, SWIZ_XYZ1, 16, true), + FORMAT(ETC2_R8G8B8A1_UNORM_BLOCK, NO, RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, true), + FORMAT(ETC2_R8G8B8A1_SRGB_BLOCK, NO, RGB8_PUNCHTHROUGH_ALPHA1, SWIZ_XYZW, 16, true), + FORMAT(ETC2_R8G8B8A8_UNORM_BLOCK, NO, RGBA8_ETC2_EAC, SWIZ_XYZW, 16, true), + FORMAT(ETC2_R8G8B8A8_SRGB_BLOCK, NO, RGBA8_ETC2_EAC, SWIZ_XYZW, 16, true), + FORMAT(EAC_R11_UNORM_BLOCK, NO, R11_EAC, SWIZ_X001, 16, true), + FORMAT(EAC_R11_SNORM_BLOCK, NO, SIGNED_R11_EAC, SWIZ_X001, 16, true), + FORMAT(EAC_R11G11_UNORM_BLOCK, NO, RG11_EAC, SWIZ_XY01, 16, true), + FORMAT(EAC_R11G11_SNORM_BLOCK, NO, SIGNED_RG11_EAC, SWIZ_XY01, 16, true), + + /* Compressed: BC1-3 */ + FORMAT(BC1_RGB_UNORM_BLOCK, NO, BC1, SWIZ_XYZ1, 16, true), + FORMAT(BC1_RGB_SRGB_BLOCK, NO, BC1, SWIZ_XYZ1, 16, true), + FORMAT(BC1_RGBA_UNORM_BLOCK, NO, BC1, SWIZ_XYZW, 16, true), + FORMAT(BC1_RGBA_SRGB_BLOCK, NO, BC1, SWIZ_XYZW, 16, true), + FORMAT(BC2_UNORM_BLOCK, NO, BC2, SWIZ_XYZW, 16, true), + FORMAT(BC2_SRGB_BLOCK, NO, BC2, SWIZ_XYZW, 16, true), + FORMAT(BC3_UNORM_BLOCK, NO, BC3, SWIZ_XYZW, 16, true), + FORMAT(BC3_SRGB_BLOCK, NO, BC3, SWIZ_XYZW, 16, true), + + /* Compressed: ASTC */ + FORMAT(ASTC_4x4_UNORM_BLOCK, NO, ASTC_4X4, SWIZ_XYZW, 16, true), + FORMAT(ASTC_4x4_SRGB_BLOCK, NO, ASTC_4X4, SWIZ_XYZW, 16, true), + FORMAT(ASTC_5x4_UNORM_BLOCK, NO, ASTC_5X4, SWIZ_XYZW, 16, true), + FORMAT(ASTC_5x4_SRGB_BLOCK, NO, ASTC_5X4, SWIZ_XYZW, 16, true), + FORMAT(ASTC_5x5_UNORM_BLOCK, NO, ASTC_5X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_5x5_SRGB_BLOCK, NO, ASTC_5X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_6x5_UNORM_BLOCK, NO, ASTC_6X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_6x5_SRGB_BLOCK, NO, ASTC_6X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_6x6_UNORM_BLOCK, NO, ASTC_6X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_6x6_SRGB_BLOCK, NO, ASTC_6X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x5_UNORM_BLOCK, NO, ASTC_8X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x5_SRGB_BLOCK, NO, ASTC_8X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x6_UNORM_BLOCK, NO, ASTC_8X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x6_SRGB_BLOCK, NO, ASTC_8X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x8_UNORM_BLOCK, NO, ASTC_8X8, SWIZ_XYZW, 16, true), + FORMAT(ASTC_8x8_SRGB_BLOCK, NO, ASTC_8X8, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x5_UNORM_BLOCK, NO, ASTC_10X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x5_SRGB_BLOCK, NO, ASTC_10X5, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x6_UNORM_BLOCK, NO, ASTC_10X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x6_SRGB_BLOCK, NO, ASTC_10X6, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x8_UNORM_BLOCK, NO, ASTC_10X8, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x8_SRGB_BLOCK, NO, ASTC_10X8, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x10_UNORM_BLOCK, NO, ASTC_10X10, SWIZ_XYZW, 16, true), + FORMAT(ASTC_10x10_SRGB_BLOCK, NO, ASTC_10X10, SWIZ_XYZW, 16, true), + 
FORMAT(ASTC_12x10_UNORM_BLOCK, NO, ASTC_12X10, SWIZ_XYZW, 16, true), + FORMAT(ASTC_12x10_SRGB_BLOCK, NO, ASTC_12X10, SWIZ_XYZW, 16, true), + FORMAT(ASTC_12x12_UNORM_BLOCK, NO, ASTC_12X12, SWIZ_XYZW, 16, true), + FORMAT(ASTC_12x12_SRGB_BLOCK, NO, ASTC_12X12, SWIZ_XYZW, 16, true), +}; + +const struct v3dv_format * +v3dX(get_format)(VkFormat format) +{ + if (format < ARRAY_SIZE(format_table) && format_table[format].supported) + return &format_table[format]; + else + return NULL; +} + +void +v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, + uint32_t *type, + uint32_t *bpp) +{ + switch (format) { + case V3D_OUTPUT_IMAGE_FORMAT_RGBA8: + case V3D_OUTPUT_IMAGE_FORMAT_RGB8: + case V3D_OUTPUT_IMAGE_FORMAT_RG8: + case V3D_OUTPUT_IMAGE_FORMAT_R8: + case V3D_OUTPUT_IMAGE_FORMAT_ABGR4444: + case V3D_OUTPUT_IMAGE_FORMAT_BGR565: + case V3D_OUTPUT_IMAGE_FORMAT_ABGR1555: + *type = V3D_INTERNAL_TYPE_8; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA8I: + case V3D_OUTPUT_IMAGE_FORMAT_RG8I: + case V3D_OUTPUT_IMAGE_FORMAT_R8I: + *type = V3D_INTERNAL_TYPE_8I; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI: + case V3D_OUTPUT_IMAGE_FORMAT_RG8UI: + case V3D_OUTPUT_IMAGE_FORMAT_R8UI: + *type = V3D_INTERNAL_TYPE_8UI; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_SRGB8_ALPHA8: + case V3D_OUTPUT_IMAGE_FORMAT_SRGB: + case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2: + case V3D_OUTPUT_IMAGE_FORMAT_R11F_G11F_B10F: + case V3D_OUTPUT_IMAGE_FORMAT_RGBA16F: + /* Note that sRGB RTs are stored in the tile buffer at 16F, + * and the conversion to sRGB happens at tilebuffer load/store. + */ + *type = V3D_INTERNAL_TYPE_16F; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG16F: + case V3D_OUTPUT_IMAGE_FORMAT_R16F: + *type = V3D_INTERNAL_TYPE_16F; + /* Use 64bpp to make sure the TLB doesn't throw away the alpha + * channel before alpha test happens. 
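+       *
+       * A minimal usage illustration of this helper (the values follow
+       * directly from this case):
+       *
+       *    uint32_t type, bpp;
+       *    v3dX(get_internal_type_bpp_for_output_format)(
+       *       V3D_OUTPUT_IMAGE_FORMAT_R16F, &type, &bpp);
+       *
+       * yields type == V3D_INTERNAL_TYPE_16F and bpp == V3D_INTERNAL_BPP_64,
+       * even though the stored image is only 16 bits per pixel.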
+ */ + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA16I: + *type = V3D_INTERNAL_TYPE_16I; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG16I: + case V3D_OUTPUT_IMAGE_FORMAT_R16I: + *type = V3D_INTERNAL_TYPE_16I; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGB10_A2UI: + case V3D_OUTPUT_IMAGE_FORMAT_RGBA16UI: + *type = V3D_INTERNAL_TYPE_16UI; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG16UI: + case V3D_OUTPUT_IMAGE_FORMAT_R16UI: + *type = V3D_INTERNAL_TYPE_16UI; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA32I: + *type = V3D_INTERNAL_TYPE_32I; + *bpp = V3D_INTERNAL_BPP_128; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG32I: + *type = V3D_INTERNAL_TYPE_32I; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_R32I: + *type = V3D_INTERNAL_TYPE_32I; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA32UI: + *type = V3D_INTERNAL_TYPE_32UI; + *bpp = V3D_INTERNAL_BPP_128; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG32UI: + *type = V3D_INTERNAL_TYPE_32UI; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_R32UI: + *type = V3D_INTERNAL_TYPE_32UI; + *bpp = V3D_INTERNAL_BPP_32; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RGBA32F: + *type = V3D_INTERNAL_TYPE_32F; + *bpp = V3D_INTERNAL_BPP_128; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_RG32F: + *type = V3D_INTERNAL_TYPE_32F; + *bpp = V3D_INTERNAL_BPP_64; + break; + + case V3D_OUTPUT_IMAGE_FORMAT_R32F: + *type = V3D_INTERNAL_TYPE_32F; + *bpp = V3D_INTERNAL_BPP_32; + break; + + default: + /* Provide some default values, as we'll be called at RB + * creation time, even if an RB with this format isn't supported. + */ + *type = V3D_INTERNAL_TYPE_8; + *bpp = V3D_INTERNAL_BPP_32; + break; + } +} + +bool +v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format) +{ + uint32_t type, bpp; + v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp); + return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F; +} + +bool +v3dX(format_supports_blending)(const struct v3dv_format *format) +{ + /* Hardware blending is only supported on render targets that are configured + * 4x8-bit unorm, 2x16-bit float or 4x16-bit float. 
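+    *
+    * For example, per the mapping in the helper above: RGBA8 resolves to
+    * internal type 8 at 32bpp and RG16F/RGBA16F resolve to 16F at 64bpp,
+    * so all of those blend; RGBA32F resolves to 32F and does not. A caller
+    * sketch:
+    *
+    *    const struct v3dv_format *f =
+    *       v3dX(get_format)(VK_FORMAT_R8G8B8A8_UNORM);
+    *    assert(f && v3dX(format_supports_blending)(f));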
+    */
+   uint32_t type, bpp;
+   v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, &type, &bpp);
+   switch (type) {
+   case V3D_INTERNAL_TYPE_8:
+      return bpp == V3D_INTERNAL_BPP_32;
+   case V3D_INTERNAL_TYPE_16F:
+      return bpp == V3D_INTERNAL_BPP_32 || bpp == V3D_INTERNAL_BPP_64;
+   default:
+      return false;
+   }
+}
+
+bool
+v3dX(tfu_supports_tex_format)(uint32_t tex_format)
+{
+   switch (tex_format) {
+   case TEXTURE_DATA_FORMAT_R8:
+   case TEXTURE_DATA_FORMAT_R8_SNORM:
+   case TEXTURE_DATA_FORMAT_RG8:
+   case TEXTURE_DATA_FORMAT_RG8_SNORM:
+   case TEXTURE_DATA_FORMAT_RGBA8:
+   case TEXTURE_DATA_FORMAT_RGBA8_SNORM:
+   case TEXTURE_DATA_FORMAT_RGB565:
+   case TEXTURE_DATA_FORMAT_RGBA4:
+   case TEXTURE_DATA_FORMAT_RGB5_A1:
+   case TEXTURE_DATA_FORMAT_RGB10_A2:
+   case TEXTURE_DATA_FORMAT_R16:
+   case TEXTURE_DATA_FORMAT_R16_SNORM:
+   case TEXTURE_DATA_FORMAT_RG16:
+   case TEXTURE_DATA_FORMAT_RG16_SNORM:
+   case TEXTURE_DATA_FORMAT_RGBA16:
+   case TEXTURE_DATA_FORMAT_RGBA16_SNORM:
+   case TEXTURE_DATA_FORMAT_R16F:
+   case TEXTURE_DATA_FORMAT_RG16F:
+   case TEXTURE_DATA_FORMAT_RGBA16F:
+   case TEXTURE_DATA_FORMAT_R11F_G11F_B10F:
+   case TEXTURE_DATA_FORMAT_R4:
+   case TEXTURE_DATA_FORMAT_RGB9_E5:
+   case TEXTURE_DATA_FORMAT_R32F:
+   case TEXTURE_DATA_FORMAT_RG32F:
+   case TEXTURE_DATA_FORMAT_RGBA32F:
+   case TEXTURE_DATA_FORMAT_RGB8_ETC2:
+   case TEXTURE_DATA_FORMAT_RGB8_PUNCHTHROUGH_ALPHA1:
+   case TEXTURE_DATA_FORMAT_RGBA8_ETC2_EAC:
+   case TEXTURE_DATA_FORMAT_R11_EAC:
+   case TEXTURE_DATA_FORMAT_SIGNED_R11_EAC:
+   case TEXTURE_DATA_FORMAT_RG11_EAC:
+   case TEXTURE_DATA_FORMAT_SIGNED_RG11_EAC:
+      return true;
+   default:
+      return false;
+   }
+}
+
+uint8_t
+v3dX(get_internal_depth_type)(VkFormat format)
+{
+   switch (format) {
+   case VK_FORMAT_D16_UNORM:
+      return V3D_INTERNAL_TYPE_DEPTH_16;
+   case VK_FORMAT_D32_SFLOAT:
+      return V3D_INTERNAL_TYPE_DEPTH_32F;
+   case VK_FORMAT_X8_D24_UNORM_PACK32:
+   case VK_FORMAT_D24_UNORM_S8_UINT:
+      return V3D_INTERNAL_TYPE_DEPTH_24;
+   default:
+      unreachable("Invalid depth format");
+      break;
+   }
+}
+
+void
+v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format,
+                                              VkImageAspectFlags aspect_mask,
+                                              uint32_t *internal_type,
+                                              uint32_t *internal_bpp)
+{
+   const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
+                                         VK_IMAGE_ASPECT_STENCIL_BIT;
+
+   /* We can't store depth/stencil pixel formats to a raster format, so
+    * instead we load our depth/stencil aspects to a compatible color
+    * format.
+    */
+   /* FIXME: pre-compute this at image creation time? */
+   if (aspect_mask & ds_aspects) {
+      switch (vk_format) {
+      case VK_FORMAT_D16_UNORM:
+         *internal_type = V3D_INTERNAL_TYPE_16UI;
+         *internal_bpp = V3D_INTERNAL_BPP_64;
+         break;
+      case VK_FORMAT_D32_SFLOAT:
+         *internal_type = V3D_INTERNAL_TYPE_32F;
+         *internal_bpp = V3D_INTERNAL_BPP_128;
+         break;
+      case VK_FORMAT_X8_D24_UNORM_PACK32:
+      case VK_FORMAT_D24_UNORM_S8_UINT:
+         /* Use RGBA8 format so we can relocate the X/S bits in the
+          * appropriate place to match Vulkan expectations. See the comment
+          * on the tile load command for more details.
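+          *
+          * Summarizing the switch above, the depth/stencil aspects map to
+          * color-compatible layouts as (a restatement, not additional
+          * hardware state):
+          *
+          *    D16_UNORM        -> 16UI at V3D_INTERNAL_BPP_64
+          *    D32_SFLOAT       -> 32F  at V3D_INTERNAL_BPP_128
+          *    X8_D24 / D24S8   -> 8UI  at V3D_INTERNAL_BPP_32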
+ */ + *internal_type = V3D_INTERNAL_TYPE_8UI; + *internal_bpp = V3D_INTERNAL_BPP_32; + break; + default: + assert(!"unsupported format"); + break; + } + } else { + const struct v3dv_format *format = v3dX(get_format)(vk_format); + v3dX(get_internal_type_bpp_for_output_format)(format->rt_type, + internal_type, internal_bpp); + } +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_image.c b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c new file mode 100644 index 000000000..a9aa0fb97 --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c @@ -0,0 +1,198 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3dv_private.h" +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" + +#include "vk_format_info.h" + +/* + * This method translates pipe_swizzle to the swizzle values used at the + * packet TEXTURE_SHADER_STATE + * + * FIXME: C&P from v3d, common place? + */ +static uint32_t +translate_swizzle(unsigned char pipe_swizzle) +{ + switch (pipe_swizzle) { + case PIPE_SWIZZLE_0: + return 0; + case PIPE_SWIZZLE_1: + return 1; + case PIPE_SWIZZLE_X: + case PIPE_SWIZZLE_Y: + case PIPE_SWIZZLE_Z: + case PIPE_SWIZZLE_W: + return 2 + pipe_swizzle; + default: + unreachable("unknown swizzle"); + } +} + +/* + * Packs and ensure bo for the shader state (the latter can be temporal). + */ +static void +pack_texture_shader_state_helper(struct v3dv_device *device, + struct v3dv_image_view *image_view, + bool for_cube_map_array_storage) +{ + assert(!for_cube_map_array_storage || + image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY); + const uint32_t index = for_cube_map_array_storage ? 1 : 0; + + assert(image_view->vk.image); + const struct v3dv_image *image = (struct v3dv_image *) image_view->vk.image; + + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || + image->vk.samples == VK_SAMPLE_COUNT_4_BIT); + const uint32_t msaa_scale = image->vk.samples == VK_SAMPLE_COUNT_1_BIT ? 
1 : 2; + + v3dvx_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { + + tex.level_0_is_strictly_uif = + (image->slices[0].tiling == V3D_TILING_UIF_XOR || + image->slices[0].tiling == V3D_TILING_UIF_NO_XOR); + + tex.level_0_xor_enable = (image->slices[0].tiling == V3D_TILING_UIF_XOR); + + if (tex.level_0_is_strictly_uif) + tex.level_0_ub_pad = image->slices[0].ub_pad; + + /* FIXME: v3d never sets uif_xor_disable, but uses it on the following + * check so let's set the default value + */ + tex.uif_xor_disable = false; + if (tex.uif_xor_disable || + tex.level_0_is_strictly_uif) { + tex.extended = true; + } + + tex.base_level = image_view->vk.base_mip_level; + tex.max_level = image_view->vk.base_mip_level + + image_view->vk.level_count - 1; + + tex.swizzle_r = translate_swizzle(image_view->swizzle[0]); + tex.swizzle_g = translate_swizzle(image_view->swizzle[1]); + tex.swizzle_b = translate_swizzle(image_view->swizzle[2]); + tex.swizzle_a = translate_swizzle(image_view->swizzle[3]); + + tex.texture_type = image_view->format->tex_type; + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + tex.image_depth = image->vk.extent.depth; + } else { + tex.image_depth = image_view->vk.layer_count; + } + + /* Empirical testing with CTS shows that when we are sampling from cube + * arrays we want to set image depth to layers / 6, but not when doing + * image load/store. + */ + if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && + !for_cube_map_array_storage) { + assert(tex.image_depth % 6 == 0); + tex.image_depth /= 6; + } + + tex.image_height = image->vk.extent.height * msaa_scale; + tex.image_width = image->vk.extent.width * msaa_scale; + + /* On 4.x, the height of a 1D texture is redefined to be the + * upper 14 bits of the width (which is only usable with txf). + */ + if (image->vk.image_type == VK_IMAGE_TYPE_1D) { + tex.image_height = tex.image_width >> 14; + } + tex.image_width &= (1 << 14) - 1; + tex.image_height &= (1 << 14) - 1; + + tex.array_stride_64_byte_aligned = image->cube_map_stride / 64; + + tex.srgb = vk_format_is_srgb(image_view->vk.format); + + /* At this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to + * add the bo to the job. This also means that we need to add manually + * the image bo to the job using the texture. 
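+       *
+       * A consumer sketch (the helper name v3dv_job_add_bo is assumed from
+       * the rest of the driver; the exact call sites live in the command
+       * buffer code): before submitting any job that reads this prepacked
+       * state, something equivalent to
+       *
+       *    v3dv_job_add_bo(job, image->mem->bo);
+       *
+       * must run so the BO ends up in the job's BO list.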
+ */ + const uint32_t base_offset = + image->mem->bo->offset + + v3dv_layer_offset(image, 0, image_view->vk.base_array_layer); + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + } +} + +void +v3dX(pack_texture_shader_state)(struct v3dv_device *device, + struct v3dv_image_view *iview) +{ + pack_texture_shader_state_helper(device, iview, false); + if (iview->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) + pack_texture_shader_state_helper(device, iview, true); +} + +void +v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + struct v3dv_buffer_view *buffer_view) +{ + assert(buffer_view->buffer); + const struct v3dv_buffer *buffer = buffer_view->buffer; + + v3dvx_pack(buffer_view->texture_shader_state, TEXTURE_SHADER_STATE, tex) { + tex.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X); + tex.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y); + tex.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z); + tex.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W); + + tex.image_depth = 1; + + /* On 4.x, the height of a 1D texture is redefined to be the upper 14 + * bits of the width (which is only usable with txf) (or in other words, + * we are providing a 28 bit field for size, but split on the usual + * 14bit height/width). + */ + tex.image_width = buffer_view->num_elements; + tex.image_height = tex.image_width >> 14; + tex.image_width &= (1 << 14) - 1; + tex.image_height &= (1 << 14) - 1; + + tex.texture_type = buffer_view->format->tex_type; + tex.srgb = vk_format_is_srgb(buffer_view->vk_format); + + /* At this point we don't have the job. That's the reason the first + * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to + * add the bo to the job. This also means that we need to add manually + * the image bo to the job using the texture. + */ + const uint32_t base_offset = + buffer->mem->bo->offset + + buffer->mem_offset + + buffer_view->offset; + + tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + } +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c new file mode 100644 index 000000000..2f79e4e9c --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c @@ -0,0 +1,1357 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "v3dv_private.h" +#include "v3dv_meta_common.h" + +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" + +#include "vk_format_info.h" + +struct rcl_clear_info { + const union v3dv_clear_value *clear_value; + struct v3dv_image *image; + VkImageAspectFlags aspects; + uint32_t level; +}; + +static struct v3dv_cl * +emit_rcl_prologue(struct v3dv_job *job, + struct v3dv_meta_framebuffer *fb, + const struct rcl_clear_info *clear_info) +{ + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + + struct v3dv_cl *rcl = &job->rcl; + v3dv_cl_ensure_space_with_branch(rcl, 200 + + tiling->layers * 256 * + cl_packet_length(SUPERTILE_COORDINATES)); + if (job->cmd_buffer->state.oom) + return NULL; + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { + config.early_z_disable = true; + config.image_width_pixels = tiling->width; + config.image_height_pixels = tiling->height; + config.number_of_render_targets = 1; + config.multisample_mode_4x = tiling->msaa; + config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; + config.internal_depth_type = fb->internal_depth_type; + } + + if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { + uint32_t clear_pad = 0; + if (clear_info->image) { + const struct v3dv_image *image = clear_info->image; + const struct v3d_resource_slice *slice = + &image->slices[clear_info->level]; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_height = v3d_utile_height(image->cpp) * 2; + + uint32_t implicit_padded_height = + align(tiling->height, uif_block_height) / uif_block_height; + + if (slice->padded_height_of_output_image_in_uif_blocks - + implicit_padded_height >= 15) { + clear_pad = slice->padded_height_of_output_image_in_uif_blocks; + } + } + } + + const uint32_t *color = &clear_info->clear_value->color[0]; + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { + clear.clear_color_low_32_bits = color[0]; + clear.clear_color_next_24_bits = color[1] & 0x00ffffff; + clear.render_target_number = 0; + }; + + if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { + clear.clear_color_mid_low_32_bits = + ((color[1] >> 24) | (color[2] << 8)); + clear.clear_color_mid_high_24_bits = + ((color[2] >> 24) | ((color[3] & 0xffff) << 8)); + clear.render_target_number = 0; + }; + } + + if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { + clear.uif_padded_height_in_uif_blocks = clear_pad; + clear.clear_color_high_16_bits = color[3] >> 16; + clear.render_target_number = 0; + }; + } + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = tiling->internal_bpp; + rt.render_target_0_internal_type = fb->internal_type; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; + clear.stencil_clear_value = clear_info ? 
clear_info->clear_value->s : 0; + }; + + cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { + init.use_auto_chained_tile_lists = true; + init.size_of_first_block_in_chained_tile_lists = + TILE_ALLOCATION_BLOCK_SIZE_64B; + } + + return rcl; +} + +static void +emit_frame_setup(struct v3dv_job *job, + uint32_t min_layer, + const union v3dv_clear_value *clear_value) +{ + v3dv_return_if_oom(NULL, job); + + const struct v3dv_frame_tiling *tiling = &job->frame_tiling; + + struct v3dv_cl *rcl = &job->rcl; + + const uint32_t tile_alloc_offset = + 64 * min_layer * tiling->draw_tiles_x * tiling->draw_tiles_y; + cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset); + } + + cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { + config.number_of_bin_tile_lists = 1; + config.total_frame_width_in_tiles = tiling->draw_tiles_x; + config.total_frame_height_in_tiles = tiling->draw_tiles_y; + + config.supertile_width_in_tiles = tiling->supertile_width; + config.supertile_height_in_tiles = tiling->supertile_height; + + config.total_frame_width_in_supertiles = + tiling->frame_width_in_supertiles; + config.total_frame_height_in_supertiles = + tiling->frame_height_in_supertiles; + } + + /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do + * it here. + */ + for (int i = 0; i < 2; i++) { + cl_emit(rcl, TILE_COORDINATES, coords); + cl_emit(rcl, END_OF_LOADS, end); + cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + if (clear_value && i == 0) { + cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } + } + cl_emit(rcl, END_OF_TILE_MARKER, end); + } + + cl_emit(rcl, FLUSH_VCD_CACHE, flush); +} + +static void +emit_supertile_coordinates(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer) +{ + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl *rcl = &job->rcl; + + const uint32_t min_y = framebuffer->min_y_supertile; + const uint32_t max_y = framebuffer->max_y_supertile; + const uint32_t min_x = framebuffer->min_x_supertile; + const uint32_t max_x = framebuffer->max_x_supertile; + + for (int y = min_y; y <= max_y; y++) { + for (int x = min_x; x <= max_x; x++) { + cl_emit(rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = x; + coords.row_number_in_supertiles = y; + } + } + } +} + +static void +emit_linear_load(struct v3dv_cl *cl, + uint32_t buffer, + struct v3dv_bo *bo, + uint32_t offset, + uint32_t stride, + uint32_t format) +{ + cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { + load.buffer_to_load = buffer; + load.address = v3dv_cl_address(bo, offset); + load.input_image_format = format; + load.memory_format = V3D_TILING_RASTER; + load.height_in_ub_or_stride = stride; + load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +static void +emit_linear_store(struct v3dv_cl *cl, + uint32_t buffer, + struct v3dv_bo *bo, + uint32_t offset, + uint32_t stride, + bool msaa, + uint32_t format) +{ + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = RENDER_TARGET_0; + store.address = v3dv_cl_address(bo, offset); + store.clear_buffer_being_stored = false; + store.output_image_format = format; + store.memory_format = V3D_TILING_RASTER; + store.height_in_ub_or_stride = stride; + store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES : + V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +/* This chooses a tile buffer format that is appropriate for the copy operation. 
+ * Typically, this is the image render target type, however, if we are copying + * depth/stencil to/from a buffer the hardware can't do raster loads/stores, so + * we need to load and store to/from a tile color buffer using a compatible + * color format. + */ +static uint32_t +choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, + VkImageAspectFlags aspect, + bool for_store, + bool is_copy_to_buffer, + bool is_copy_from_buffer) +{ + if (is_copy_to_buffer || is_copy_from_buffer) { + switch (framebuffer->vk_format) { + case VK_FORMAT_D16_UNORM: + return V3D_OUTPUT_IMAGE_FORMAT_R16UI; + case VK_FORMAT_D32_SFLOAT: + return V3D_OUTPUT_IMAGE_FORMAT_R32F; + case VK_FORMAT_X8_D24_UNORM_PACK32: + return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; + case VK_FORMAT_D24_UNORM_S8_UINT: + /* When storing the stencil aspect of a combined depth/stencil image + * to a buffer, the Vulkan spec states that the output buffer must + * have packed stencil values, so we choose an R8UI format for our + * store outputs. For the load input we still want RGBA8UI since the + * source image contains 4 channels (including the 3 channels + * containing the 24-bit depth value). + * + * When loading the stencil aspect of a combined depth/stencil image + * from a buffer, we read packed 8-bit stencil values from the buffer + * that we need to put into the LSB of the 32-bit format (the R + * channel), so we use R8UI. For the store, if we used R8UI then we + * would write 8-bit stencil values consecutively over depth channels, + * so we need to use RGBA8UI. This will write each stencil value in + * its correct position, but will overwrite depth values (channels G + * B,A) with undefined values. To fix this, we will have to restore + * the depth aspect from the Z tile buffer, which we should pre-load + * from the image before the store). + */ + if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) { + return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; + } else { + assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT); + if (is_copy_to_buffer) { + return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI : + V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; + } else { + assert(is_copy_from_buffer); + return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI : + V3D_OUTPUT_IMAGE_FORMAT_R8UI; + } + } + default: /* Color formats */ + return framebuffer->format->rt_type; + break; + } + } else { + return framebuffer->format->rt_type; + } +} + +static inline bool +format_needs_rb_swap(struct v3dv_device *device, + VkFormat format) +{ + const uint8_t *swizzle = v3dv_get_format_swizzle(device, format); + return swizzle[0] == PIPE_SWIZZLE_Z; +} + +static void +emit_image_load(struct v3dv_device *device, + struct v3dv_cl *cl, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *image, + VkImageAspectFlags aspect, + uint32_t layer, + uint32_t mip_level, + bool is_copy_to_buffer, + bool is_copy_from_buffer) +{ + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + + /* For image to/from buffer copies we always load to and store from RT0, + * even for depth/stencil aspects, because the hardware can't do raster + * stores or loads from/to the depth/stencil tile buffers. + */ + bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + aspect == VK_IMAGE_ASPECT_COLOR_BIT; + + const struct v3d_resource_slice *slice = &image->slices[mip_level]; + cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { + load.buffer_to_load = load_to_color_tlb ? 
+ RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); + + load.address = v3dv_cl_address(image->mem->bo, layer_offset); + + load.input_image_format = choose_tlb_format(framebuffer, aspect, false, + is_copy_to_buffer, + is_copy_from_buffer); + load.memory_format = slice->tiling; + + /* When copying depth/stencil images to a buffer, for D24 formats Vulkan + * expects the depth value in the LSB bits of each 32-bit pixel. + * Unfortunately, the hardware seems to put the S8/X8 bits there and the + * depth bits on the MSB. To work around that we can reverse the channel + * order and then swap the R/B channels to get what we want. + * + * NOTE: reversing and swapping only gets us the behavior we want if the + * operations happen in that exact order, which seems to be the case when + * done on the tile buffer load operations. On the store, it seems the + * order is not the same. The order on the store is probably reversed so + * that reversing and swapping on both the load and the store preserves + * the original order of the channels in memory. + * + * Notice that we only need to do this when copying to a buffer, where + * depth and stencil aspects are copied as separate regions and + * the spec expects them to be tightly packed. + */ + bool needs_rb_swap = false; + bool needs_chan_reverse = false; + if (is_copy_to_buffer && + (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 || + (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && + (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) { + needs_rb_swap = true; + needs_chan_reverse = true; + } else if (!is_copy_from_buffer && !is_copy_to_buffer && + (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { + /* This is not a raw data copy (i.e. we are clearing the image), + * so we need to make sure we respect the format swizzle. + */ + needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + } + + load.r_b_swap = needs_rb_swap; + load.channel_reverse = needs_chan_reverse; + + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + load.height_in_ub_or_stride = + slice->padded_height_of_output_image_in_uif_blocks; + } else if (slice->tiling == V3D_TILING_RASTER) { + load.height_in_ub_or_stride = slice->stride; + } + + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) + load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; + else + load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +static void +emit_image_store(struct v3dv_device *device, + struct v3dv_cl *cl, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *image, + VkImageAspectFlags aspect, + uint32_t layer, + uint32_t mip_level, + bool is_copy_to_buffer, + bool is_copy_from_buffer, + bool is_multisample_resolve) +{ + uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer); + + bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer || + aspect == VK_IMAGE_ASPECT_COLOR_BIT; + + const struct v3d_resource_slice *slice = &image->slices[mip_level]; + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = store_from_color_tlb ? 
+ RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect); + + store.address = v3dv_cl_address(image->mem->bo, layer_offset); + store.clear_buffer_being_stored = false; + + /* See rationale in emit_image_load() */ + bool needs_rb_swap = false; + bool needs_chan_reverse = false; + if (is_copy_from_buffer && + (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 || + (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && + (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) { + needs_rb_swap = true; + needs_chan_reverse = true; + } else if (!is_copy_from_buffer && !is_copy_to_buffer && + (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) { + needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format); + } + + store.r_b_swap = needs_rb_swap; + store.channel_reverse = needs_chan_reverse; + + store.output_image_format = choose_tlb_format(framebuffer, aspect, true, + is_copy_to_buffer, + is_copy_from_buffer); + store.memory_format = slice->tiling; + if (slice->tiling == V3D_TILING_UIF_NO_XOR || + slice->tiling == V3D_TILING_UIF_XOR) { + store.height_in_ub_or_stride = + slice->padded_height_of_output_image_in_uif_blocks; + } else if (slice->tiling == V3D_TILING_RASTER) { + store.height_in_ub_or_stride = slice->stride; + } + + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) + store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; + else if (is_multisample_resolve) + store.decimate_mode = V3D_DECIMATE_MODE_4X; + else + store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; + } +} + +static void +emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + uint32_t layer_offset, + const VkBufferImageCopy2KHR *region) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + /* Load image to TLB */ + assert((image->vk.image_type != VK_IMAGE_TYPE_3D && + layer_offset < region->imageSubresource.layerCount) || + layer_offset < image->vk.extent.depth); + + const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ? + region->imageSubresource.baseArrayLayer + layer_offset : + region->imageOffset.z + layer_offset; + + emit_image_load(job->device, cl, framebuffer, image, + region->imageSubresource.aspectMask, + image_layer, + region->imageSubresource.mipLevel, + true, false); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + /* Store TLB to buffer */ + uint32_t width, height; + if (region->bufferRowLength == 0) + width = region->imageExtent.width; + else + width = region->bufferRowLength; + + if (region->bufferImageHeight == 0) + height = region->imageExtent.height; + else + height = region->bufferImageHeight; + + /* Handle copy from compressed format */ + width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); + height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); + + /* If we are storing stencil from a combined depth/stencil format the + * Vulkan spec states that the output buffer must have packed stencil + * values, where each stencil value is 1 byte. + */ + uint32_t cpp = + region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 
+ 1 : image->cpp; + uint32_t buffer_stride = width * cpp; + uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + + height * buffer_stride * layer_offset; + + uint32_t format = choose_tlb_format(framebuffer, + region->imageSubresource.aspectMask, + true, true, false); + bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT; + + emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo, + buffer_offset, buffer_stride, msaa, format); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_copy_layer_to_buffer(struct v3dv_job *job, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t layer, + const VkBufferImageCopy2KHR *region) +{ + emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer, + image, layer, region); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + const VkBufferImageCopy2KHR *region) +{ + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, NULL); + for (int layer = 0; layer < job->frame_tiling.layers; layer++) + emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region); + cl_emit(rcl, END_OF_RENDERING, end); +} + +static void +emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *dst, + struct v3dv_image *src, + uint32_t layer_offset, + const VkImageResolve2KHR *region) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + assert((src->vk.image_type != VK_IMAGE_TYPE_3D && + layer_offset < region->srcSubresource.layerCount) || + layer_offset < src->vk.extent.depth); + + const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? + region->srcSubresource.baseArrayLayer + layer_offset : + region->srcOffset.z + layer_offset; + + emit_image_load(job->device, cl, framebuffer, src, + region->srcSubresource.aspectMask, + src_layer, + region->srcSubresource.mipLevel, + false, false); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + assert((dst->vk.image_type != VK_IMAGE_TYPE_3D && + layer_offset < region->dstSubresource.layerCount) || + layer_offset < dst->vk.extent.depth); + + const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? 
+ region->dstSubresource.baseArrayLayer + layer_offset : + region->dstOffset.z + layer_offset; + + emit_image_store(job->device, cl, framebuffer, dst, + region->dstSubresource.aspectMask, + dst_layer, + region->dstSubresource.mipLevel, + false, false, true); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_resolve_image_layer(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t layer, + const VkImageResolve2KHR *region) +{ + emit_resolve_image_layer_per_tile_list(job, framebuffer, + dst, src, layer, region); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + const VkImageResolve2KHR *region) +{ + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, NULL); + for (int layer = 0; layer < job->frame_tiling.layers; layer++) + emit_resolve_image_layer(job, dst, src, framebuffer, layer, region); + cl_emit(rcl, END_OF_RENDERING, end); +} + +static void +emit_copy_buffer_per_tile_list(struct v3dv_job *job, + struct v3dv_bo *dst, + struct v3dv_bo *src, + uint32_t dst_offset, + uint32_t src_offset, + uint32_t stride, + uint32_t format) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + emit_linear_store(cl, RENDER_TARGET_0, + dst, dst_offset, stride, false, format); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +void +v3dX(meta_emit_copy_buffer)(struct v3dv_job *job, + struct v3dv_bo *dst, + struct v3dv_bo *src, + uint32_t dst_offset, + uint32_t src_offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t format, + uint32_t item_size) +{ + const uint32_t stride = job->frame_tiling.width * item_size; + emit_copy_buffer_per_tile_list(job, dst, src, + dst_offset, src_offset, + stride, format); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job, + struct v3dv_bo *dst, + struct v3dv_bo *src, + uint32_t dst_offset, + uint32_t src_offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t format, + uint32_t item_size) +{ + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, NULL); + + v3dX(meta_emit_copy_buffer)(job, dst, src, dst_offset, src_offset, + framebuffer, format, item_size); + + cl_emit(rcl, END_OF_RENDERING, end); +} + +static void +emit_copy_image_layer_per_tile_list(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *dst, + struct v3dv_image *src, + uint32_t layer_offset, + const VkImageCopy2KHR *region) +{ + struct v3dv_cl *cl = &job->indirect; + 
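+
+   /* Like the other *_per_tile_list() emitters in this file, this writes a
+    * generic tile list into the job's indirect CL and branches to it from
+    * the RCL: TILE_COORDINATES_IMPLICIT, the load(s), END_OF_LOADS, a branch
+    * to the implicit tile list, the store(s), END_OF_TILE_MARKER and
+    * RETURN_FROM_SUB_LIST, bracketed by a START_ADDRESS_OF_GENERIC_TILE_LIST
+    * branch emitted into job->rcl.
+    */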
v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + assert((src->vk.image_type != VK_IMAGE_TYPE_3D && + layer_offset < region->srcSubresource.layerCount) || + layer_offset < src->vk.extent.depth); + + const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? + region->srcSubresource.baseArrayLayer + layer_offset : + region->srcOffset.z + layer_offset; + + emit_image_load(job->device, cl, framebuffer, src, + region->srcSubresource.aspectMask, + src_layer, + region->srcSubresource.mipLevel, + false, false); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + assert((dst->vk.image_type != VK_IMAGE_TYPE_3D && + layer_offset < region->dstSubresource.layerCount) || + layer_offset < dst->vk.extent.depth); + + const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? + region->dstSubresource.baseArrayLayer + layer_offset : + region->dstOffset.z + layer_offset; + + emit_image_store(job->device, cl, framebuffer, dst, + region->dstSubresource.aspectMask, + dst_layer, + region->dstSubresource.mipLevel, + false, false, false); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_copy_image_layer(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t layer, + const VkImageCopy2KHR *region) +{ + emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + const VkImageCopy2KHR *region) +{ + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, NULL); + for (int layer = 0; layer < job->frame_tiling.layers; layer++) + emit_copy_image_layer(job, dst, src, framebuffer, layer, region); + cl_emit(rcl, END_OF_RENDERING, end); +} + +void +v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + uint32_t dst_mip_level, + uint32_t dst_layer, + struct v3dv_image *src, + uint32_t src_mip_level, + uint32_t src_layer, + uint32_t width, + uint32_t height, + const struct v3dv_format *format) +{ + const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; + const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; + + assert(dst->mem && dst->mem->bo); + const struct v3dv_bo *dst_bo = dst->mem->bo; + + assert(src->mem && src->mem->bo); + const struct v3dv_bo *src_bo = src->mem->bo; + + struct drm_v3d_submit_tfu tfu = { + .ios = (height << 16) | width, + .bo_handles = { + dst_bo->handle, + src_bo->handle != dst_bo->handle ? 
src_bo->handle : 0 + }, + }; + + const uint32_t src_offset = + src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); + tfu.iia |= src_offset; + + uint32_t icfg; + if (src_slice->tiling == V3D_TILING_RASTER) { + icfg = V3D_TFU_ICFG_FORMAT_RASTER; + } else { + icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE + + (src_slice->tiling - V3D_TILING_LINEARTILE); + } + tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT; + + const uint32_t dst_offset = + dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); + tfu.ioa |= dst_offset; + + tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + + (dst_slice->tiling - V3D_TILING_LINEARTILE)) << + V3D_TFU_IOA_FORMAT_SHIFT; + tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; + + switch (src_slice->tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp)); + break; + case V3D_TILING_RASTER: + tfu.iis |= src_slice->stride / src->cpp; + break; + default: + break; + } + + /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the + * OPAD field for the destination (how many extra UIF blocks beyond + * those necessary to cover the height). + */ + if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR || + dst_slice->tiling == V3D_TILING_UIF_XOR) { + uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp); + uint32_t implicit_padded_height = align(height, uif_block_h); + uint32_t icfg = + (dst_slice->padded_height - implicit_padded_height) / uif_block_h; + tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; + } + + v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); +} + +static void +emit_clear_image_layer_per_tile_list(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *image, + VkImageAspectFlags aspects, + uint32_t layer, + uint32_t level) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + emit_image_store(job->device, cl, framebuffer, image, aspects, + layer, level, false, false, false); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_clear_image_layers(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + VkImageAspectFlags aspects, + uint32_t min_layer, + uint32_t max_layer, + uint32_t level) +{ + for (uint32_t layer = min_layer; layer < max_layer; layer++) { + emit_clear_image_layer_per_tile_list(job, framebuffer, image, aspects, + layer, level); + emit_supertile_coordinates(job, framebuffer); + } +} + +void +v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + const union v3dv_clear_value *clear_value, + VkImageAspectFlags aspects, + uint32_t min_layer, + uint32_t max_layer, + uint32_t level) +{ + const struct rcl_clear_info clear_info = { + .clear_value = clear_value, + .image = image, + .aspects = aspects, + .level = level, + }; + + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, clear_value); + emit_clear_image_layers(job, image, framebuffer, aspects, + min_layer, 
max_layer, level); + cl_emit(rcl, END_OF_RENDERING, end); +} + +static void +emit_fill_buffer_per_tile_list(struct v3dv_job *job, + struct v3dv_bo *bo, + uint32_t offset, + uint32_t stride) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false, + V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI); + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_fill_buffer(struct v3dv_job *job, + struct v3dv_bo *bo, + uint32_t offset, + struct v3dv_meta_framebuffer *framebuffer) +{ + const uint32_t stride = job->frame_tiling.width * 4; + emit_fill_buffer_per_tile_list(job, bo, offset, stride); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job, + struct v3dv_bo *bo, + uint32_t offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t data) +{ + const union v3dv_clear_value clear_value = { + .color = { data, 0, 0, 0 }, + }; + + const struct rcl_clear_info clear_info = { + .clear_value = &clear_value, + .image = NULL, + .aspects = VK_IMAGE_ASPECT_COLOR_BIT, + .level = 0, + }; + + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, &clear_value); + emit_fill_buffer(job, bo, offset, framebuffer); + cl_emit(rcl, END_OF_RENDERING, end); +} + + +static void +emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, + struct v3dv_meta_framebuffer *framebuffer, + struct v3dv_image *image, + struct v3dv_buffer *buffer, + uint32_t layer, + const VkBufferImageCopy2KHR *region) +{ + struct v3dv_cl *cl = &job->indirect; + v3dv_cl_ensure_space(cl, 200, 1); + v3dv_return_if_oom(NULL, job); + + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); + + cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); + + const VkImageSubresourceLayers *imgrsc = ®ion->imageSubresource; + assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) || + layer < image->vk.extent.depth); + + /* Load TLB from buffer */ + uint32_t width, height; + if (region->bufferRowLength == 0) + width = region->imageExtent.width; + else + width = region->bufferRowLength; + + if (region->bufferImageHeight == 0) + height = region->imageExtent.height; + else + height = region->bufferImageHeight; + + /* Handle copy to compressed format using a compatible format */ + width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); + height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); + + uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 
+ 1 : image->cpp; + uint32_t buffer_stride = width * cpp; + uint32_t buffer_offset = + buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer; + + uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask, + false, false, true); + + emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo, + buffer_offset, buffer_stride, format); + + /* Because we can't do raster loads/stores of Z/S formats we need to + * use a color tile buffer with a compatible RGBA color format instead. + * However, when we are uploading a single aspect to a combined + * depth/stencil image we have the problem that our tile buffer stores don't + * allow us to mask out the other aspect, so we always write all four RGBA + * channels to the image and we end up overwriting that other aspect with + * undefined values. To work around that, we first load the aspect we are + * not copying from the image memory into a proper Z/S tile buffer. Then we + * do our store from the color buffer for the aspect we are copying, and + * after that, we do another store from the Z/S tile buffer to restore the + * other aspect to its original value. + */ + if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + emit_image_load(job->device, cl, framebuffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + false, false); + } else { + assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); + emit_image_load(job->device, cl, framebuffer, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + false, false); + } + } + + cl_emit(cl, END_OF_LOADS, end); + + cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + /* Store TLB to image */ + emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask, + imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + false, true, false); + + if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + emit_image_store(job->device, cl, framebuffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + false, false, false); + } else { + assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); + emit_image_store(job->device, cl, framebuffer, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + false, false, false); + } + } + + cl_emit(cl, END_OF_TILE_MARKER, end); + + cl_emit(cl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(cl); + } +} + +static void +emit_copy_buffer_to_layer(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_buffer *buffer, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t layer, + const VkBufferImageCopy2KHR *region) +{ + emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer, + layer, region); + emit_supertile_coordinates(job, framebuffer); +} + +void +v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_buffer *buffer, + struct v3dv_meta_framebuffer *framebuffer, + const VkBufferImageCopy2KHR *region) +{ + struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL); + v3dv_return_if_oom(NULL, job); + + emit_frame_setup(job, 0, NULL); + for (int layer = 0; layer < job->frame_tiling.layers; layer++) + emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region); + 
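+
+   /* Each layer above emitted its own per-tile list and supertile
+    * coordinates; END_OF_RENDERING then closes the RCL for the whole job.
+    */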
cl_emit(rcl, END_OF_RENDERING, end); +} + +/* Figure out a TLB size configuration for a number of pixels to process. + * Beware that we can't "render" more than 4096x4096 pixels in a single job, + * if the pixel count is larger than this, the caller might need to split + * the job and call this function multiple times. + */ +static void +framebuffer_size_for_pixel_count(uint32_t num_pixels, + uint32_t *width, + uint32_t *height) +{ + assert(num_pixels > 0); + + const uint32_t max_dim_pixels = 4096; + const uint32_t max_pixels = max_dim_pixels * max_dim_pixels; + + uint32_t w, h; + if (num_pixels > max_pixels) { + w = max_dim_pixels; + h = max_dim_pixels; + } else { + w = num_pixels; + h = 1; + while (w > max_dim_pixels || ((w % 2) == 0 && w > 2 * h)) { + w >>= 1; + h <<= 1; + } + } + assert(w <= max_dim_pixels && h <= max_dim_pixels); + assert(w * h <= num_pixels); + assert(w > 0 && h > 0); + + *width = w; + *height = h; +} + +struct v3dv_job * +v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_bo *dst, + uint32_t dst_offset, + struct v3dv_bo *src, + uint32_t src_offset, + const VkBufferCopy2KHR *region) +{ + const uint32_t internal_bpp = V3D_INTERNAL_BPP_32; + const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI; + + /* Select appropriate pixel format for the copy operation based on the + * size to copy and the alignment of the source and destination offsets. + */ + src_offset += region->srcOffset; + dst_offset += region->dstOffset; + uint32_t item_size = 4; + while (item_size > 1 && + (src_offset % item_size != 0 || dst_offset % item_size != 0)) { + item_size /= 2; + } + + while (item_size > 1 && region->size % item_size != 0) + item_size /= 2; + + assert(region->size % item_size == 0); + uint32_t num_items = region->size / item_size; + assert(num_items > 0); + + uint32_t format; + VkFormat vk_format; + switch (item_size) { + case 4: + format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI; + vk_format = VK_FORMAT_R8G8B8A8_UINT; + break; + case 2: + format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI; + vk_format = VK_FORMAT_R8G8_UINT; + break; + default: + format = V3D_OUTPUT_IMAGE_FORMAT_R8UI; + vk_format = VK_FORMAT_R8_UINT; + break; + } + + struct v3dv_job *job = NULL; + while (num_items > 0) { + job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL); + if (!job) + return NULL; + + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + + v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, + &job->frame_tiling); + + v3dX(job_emit_binning_flush)(job); + + v3dX(meta_emit_copy_buffer_rcl)(job, dst, src, dst_offset, src_offset, + &framebuffer, format, item_size); + + v3dv_cmd_buffer_finish_job(cmd_buffer); + + const uint32_t items_copied = width * height; + const uint32_t bytes_copied = items_copied * item_size; + num_items -= items_copied; + src_offset += bytes_copied; + dst_offset += bytes_copied; + } + + return job; +} + +void +v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_bo *bo, + uint32_t offset, + uint32_t size, + uint32_t data) +{ + assert(size > 0 && size % 4 == 0); + assert(offset + size <= bo->size); + + const uint32_t internal_bpp = V3D_INTERNAL_BPP_32; + const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI; + uint32_t num_items = size / 4; + + while (num_items > 0) { + struct v3dv_job *job = + v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL); + if 
(!job) + return; + + uint32_t width, height; + framebuffer_size_for_pixel_count(num_items, &width, &height); + + v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false); + + struct v3dv_meta_framebuffer framebuffer; + v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, + internal_type, &job->frame_tiling); + + v3dX(job_emit_binning_flush)(job); + + v3dX(meta_emit_fill_buffer_rcl)(job, bo, offset, &framebuffer, data); + + v3dv_cmd_buffer_finish_job(cmd_buffer); + + const uint32_t items_copied = width * height; + const uint32_t bytes_copied = items_copied * 4; + num_items -= items_copied; + offset += bytes_copied; + } +} + +void +v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb, + VkFormat vk_format, + uint32_t internal_type, + const struct v3dv_frame_tiling *tiling) +{ + fb->internal_type = internal_type; + + /* Supertile coverage always starts at 0,0 */ + uint32_t supertile_w_in_pixels = + tiling->tile_width * tiling->supertile_width; + uint32_t supertile_h_in_pixels = + tiling->tile_height * tiling->supertile_height; + + fb->min_x_supertile = 0; + fb->min_y_supertile = 0; + fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels; + fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels; + + fb->vk_format = vk_format; + fb->format = v3dX(get_format)(vk_format); + + fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F; + if (vk_format_is_depth_or_stencil(vk_format)) + fb->internal_depth_type = v3dX(get_internal_depth_type)(vk_format); +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c new file mode 100644 index 000000000..8623a4537 --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c @@ -0,0 +1,654 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "v3dv_private.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+#include "broadcom/compiler/v3d_compiler.h"
+
+#include "vk_format_info.h"
+
+static uint8_t
+blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants)
+{
+   switch (factor) {
+   case VK_BLEND_FACTOR_ZERO:
+   case VK_BLEND_FACTOR_ONE:
+   case VK_BLEND_FACTOR_SRC_COLOR:
+   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+   case VK_BLEND_FACTOR_DST_COLOR:
+   case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+   case VK_BLEND_FACTOR_SRC_ALPHA:
+   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
+      return factor;
+   case VK_BLEND_FACTOR_CONSTANT_COLOR:
+   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+   case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+      *needs_constants = true;
+      return factor;
+   case VK_BLEND_FACTOR_DST_ALPHA:
+      return dst_alpha_one ? V3D_BLEND_FACTOR_ONE :
+                             V3D_BLEND_FACTOR_DST_ALPHA;
+   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+      return dst_alpha_one ? V3D_BLEND_FACTOR_ZERO :
+                             V3D_BLEND_FACTOR_INV_DST_ALPHA;
+   case VK_BLEND_FACTOR_SRC1_COLOR:
+   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
+   case VK_BLEND_FACTOR_SRC1_ALPHA:
+   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
+      assert(!"Invalid blend factor: dual source blending not supported.");
+   default:
+      assert(!"Unknown blend factor.");
+   }
+
+   /* This should be handled by the switch above; it is only here to avoid
+    * an "end of non-void function" error.
+    */
+   unreachable("Unknown blend factor.");
+}
+
+static void
+pack_blend(struct v3dv_pipeline *pipeline,
+           const VkPipelineColorBlendStateCreateInfo *cb_info)
+{
+   /* By default, we are not enabling blending and all color channel writes are
+    * enabled. Color write enables are independent of whether blending is
+    * enabled or not.
+    *
+    * Vulkan specifies color write masks so that bits set correspond to
+    * enabled channels. Our hardware does it the other way around.
+    */
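+   /* Worked illustration (editor's addition; the values are hypothetical):
+    * for attachment i = 1 with colorWriteMask = R | B = 0x5, the
+    * per-attachment masking below computes
+    *
+    *    (~0x5 & 0xf) << (4 * 1) = 0xa << 4 = 0xa0
+    *
+    * so bits set in color_write_masks name the channels the hardware must
+    * NOT write (here G and A of render target 1).
+    */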
+   pipeline->blend.enables = 0;
+   pipeline->blend.color_write_masks = 0; /* All channels enabled */
+
+   if (!cb_info)
+      return;
+
+   assert(pipeline->subpass);
+   if (pipeline->subpass->color_count == 0)
+      return;
+
+   assert(pipeline->subpass->color_count == cb_info->attachmentCount);
+
+   pipeline->blend.needs_color_constants = false;
+   uint32_t color_write_masks = 0;
+   for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) {
+      const VkPipelineColorBlendAttachmentState *b_state =
+         &cb_info->pAttachments[i];
+
+      uint32_t attachment_idx =
+         pipeline->subpass->color_attachments[i].attachment;
+      if (attachment_idx == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i);
+
+      if (!b_state->blendEnable)
+         continue;
+
+      VkAttachmentDescription *desc =
+         &pipeline->pass->attachments[attachment_idx].desc;
+      const struct v3dv_format *format = v3dX(get_format)(desc->format);
+      bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1);
+
+      uint8_t rt_mask = 1 << i;
+      pipeline->blend.enables |= rt_mask;
+
+      v3dvx_pack(pipeline->blend.cfg[i], BLEND_CFG, config) {
+         config.render_target_mask = rt_mask;
+
+         config.color_blend_mode = b_state->colorBlendOp;
+         config.color_blend_dst_factor =
+            blend_factor(b_state->dstColorBlendFactor, dst_alpha_one,
+                         &pipeline->blend.needs_color_constants);
+         config.color_blend_src_factor =
+            blend_factor(b_state->srcColorBlendFactor, dst_alpha_one,
+                         &pipeline->blend.needs_color_constants);
+
+         config.alpha_blend_mode = b_state->alphaBlendOp;
+         config.alpha_blend_dst_factor =
+            blend_factor(b_state->dstAlphaBlendFactor, dst_alpha_one,
+                         &pipeline->blend.needs_color_constants);
+         config.alpha_blend_src_factor =
+            blend_factor(b_state->srcAlphaBlendFactor, dst_alpha_one,
+                         &pipeline->blend.needs_color_constants);
+      }
+   }
+
+   pipeline->blend.color_write_masks = color_write_masks;
+}
+
+/* This requires that pack_blend() has been called before so we can set
+ * the overall blend enable bit in the CFG_BITS packet.
+ */
+static void
+pack_cfg_bits(struct v3dv_pipeline *pipeline,
+              const VkPipelineDepthStencilStateCreateInfo *ds_info,
+              const VkPipelineRasterizationStateCreateInfo *rs_info,
+              const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info,
+              const VkPipelineMultisampleStateCreateInfo *ms_info)
+{
+   assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS));
+
+   pipeline->msaa =
+      ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
+
+   v3dvx_pack(pipeline->cfg_bits, CFG_BITS, config) {
+      config.enable_forward_facing_primitive =
+         rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false;
+
+      config.enable_reverse_facing_primitive =
+         rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false;
+
+      /* Seems like the hardware is backwards regarding this setting... */
+      config.clockwise_primitives =
+         rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false;
+
+      config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable : false;
+
+      /* This is required to pass line rasterization tests in CTS while
+       * exposing at least 4 bits of subpixel precision (the minimum
+       * requirement).
+       */
+      config.line_rasterization = 1; /* perp end caps */
+
+      if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) {
+         config.direct3d_wireframe_triangles_mode = true;
+         config.direct3d_point_fill_mode =
+            rs_info->polygonMode == VK_POLYGON_MODE_POINT;
+      }
+
+      config.rasterizer_oversample_mode = pipeline->msaa ? 1 : 0;
+
+      /* From the Vulkan spec:
+       *
+       *   "Provoking Vertex:
+       *
+       *       The vertex in a primitive from which flat shaded attribute
+       *       values are taken. This is generally the “first” vertex in the
+       *       primitive, and depends on the primitive topology."
+       *
+       * First vertex is the Direct3D style for provoking vertex. OpenGL uses
+       * the last vertex by default.
+       */
+      if (pv_info) {
+         config.direct3d_provoking_vertex =
+            pv_info->provokingVertexMode ==
+               VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
+      } else {
+         config.direct3d_provoking_vertex = true;
+      }
+
+      config.blend_enable = pipeline->blend.enables != 0;
+
+      /* Disable depth/stencil if we don't have a D/S attachment */
+      bool has_ds_attachment =
+         pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED;
+
+      if (ds_info && ds_info->depthTestEnable && has_ds_attachment) {
+         config.z_updates_enable = ds_info->depthWriteEnable;
+         config.depth_test_function = ds_info->depthCompareOp;
+      } else {
+         config.depth_test_function = VK_COMPARE_OP_ALWAYS;
+      }
+
+      /* EZ state will be updated at draw time based on bound pipeline state */
+      config.early_z_updates_enable = false;
+      config.early_z_enable = false;
+
+      config.stencil_enable =
+         ds_info ? ds_info->stencilTestEnable && has_ds_attachment : false;
+
+      pipeline->z_updates_enable = config.z_updates_enable;
+   };
+}
+
+static uint32_t
+translate_stencil_op(VkStencilOp op)
+{
+   switch (op) {
+   case VK_STENCIL_OP_KEEP:
+      return V3D_STENCIL_OP_KEEP;
+   case VK_STENCIL_OP_ZERO:
+      return V3D_STENCIL_OP_ZERO;
+   case VK_STENCIL_OP_REPLACE:
+      return V3D_STENCIL_OP_REPLACE;
+   case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
+      return V3D_STENCIL_OP_INCR;
+   case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
+      return V3D_STENCIL_OP_DECR;
+   case VK_STENCIL_OP_INVERT:
+      return V3D_STENCIL_OP_INVERT;
+   case VK_STENCIL_OP_INCREMENT_AND_WRAP:
+      return V3D_STENCIL_OP_INCWRAP;
+   case VK_STENCIL_OP_DECREMENT_AND_WRAP:
+      return V3D_STENCIL_OP_DECWRAP;
+   default:
+      unreachable("bad stencil op");
+   }
+}
+
+static void
+pack_single_stencil_cfg(struct v3dv_pipeline *pipeline,
+                        uint8_t *stencil_cfg,
+                        bool is_front,
+                        bool is_back,
+                        const VkStencilOpState *stencil_state)
+{
+   /* From the Vulkan spec:
+    *
+    *   "Reference is an integer reference value that is used in the unsigned
+    *    stencil comparison. The reference value used by stencil comparison
+    *    must be within the range [0,2^s-1], where s is the number of bits in
+    *    the stencil framebuffer attachment, otherwise the reference value is
+    *    considered undefined."
+    *
+    * In our case, 's' is always 8, so we clamp to that to keep our packing
+    * functions from asserting in debug mode if they see larger values.
+    *
+    * If we have dynamic state we need to make sure we set the corresponding
+    * state bits to 0, since cl_emit_with_prepacked ORs the new value with
+    * the old.
+    */
+   const uint8_t write_mask =
+      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ?
+         0 : stencil_state->writeMask & 0xff;
+
+   const uint8_t compare_mask =
+      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
+         0 : stencil_state->compareMask & 0xff;
+
+   const uint8_t reference =
+      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_REFERENCE ?
+         0 : stencil_state->reference & 0xff;
+
+   v3dvx_pack(stencil_cfg, STENCIL_CFG, config) {
+      config.front_config = is_front;
+      config.back_config = is_back;
+      config.stencil_write_mask = write_mask;
+      config.stencil_test_mask = compare_mask;
+      config.stencil_test_function = stencil_state->compareOp;
+      config.stencil_pass_op = translate_stencil_op(stencil_state->passOp);
+      config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp);
+      config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp);
+      config.stencil_ref_value = reference;
+   }
+}
+
+static void
+pack_stencil_cfg(struct v3dv_pipeline *pipeline,
+                 const VkPipelineDepthStencilStateCreateInfo *ds_info)
+{
+   assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG));
+
+   if (!ds_info || !ds_info->stencilTestEnable)
+      return;
+
+   if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED)
+      return;
+
+   const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
+                                           V3DV_DYNAMIC_STENCIL_WRITE_MASK |
+                                           V3DV_DYNAMIC_STENCIL_REFERENCE;
+
+   /* If front != back or we have dynamic stencil state we can't emit a single
+    * packet for both faces.
+    */
+   bool needs_front_and_back = false;
+   if ((pipeline->dynamic_state.mask & dynamic_stencil_states) ||
+       memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front)))
+      needs_front_and_back = true;
+
+   /* If the front and back configurations are the same we can emit both with
+    * a single packet.
+    */
+   pipeline->emit_stencil_cfg[0] = true;
+   if (!needs_front_and_back) {
+      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
+                              true, true, &ds_info->front);
+   } else {
+      pipeline->emit_stencil_cfg[1] = true;
+      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
+                              true, false, &ds_info->front);
+      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1],
+                              false, true, &ds_info->back);
+   }
+}
+
+void
+v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
+                          const VkPipelineColorBlendStateCreateInfo *cb_info,
+                          const VkPipelineDepthStencilStateCreateInfo *ds_info,
+                          const VkPipelineRasterizationStateCreateInfo *rs_info,
+                          const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info,
+                          const VkPipelineMultisampleStateCreateInfo *ms_info)
+{
+   pack_blend(pipeline, cb_info);
+   pack_cfg_bits(pipeline, ds_info, rs_info, pv_info, ms_info);
+   pack_stencil_cfg(pipeline, ds_info);
+}
+
+static void
+pack_shader_state_record(struct v3dv_pipeline *pipeline)
+{
+   assert(sizeof(pipeline->shader_state_record) ==
+          cl_packet_length(GL_SHADER_STATE_RECORD));
+
+   struct v3d_fs_prog_data *prog_data_fs =
+      pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
+
+   struct v3d_vs_prog_data *prog_data_vs =
+      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
+
+   struct v3d_vs_prog_data *prog_data_vs_bin =
+      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
+
+   /* Note: we are not packing addresses, as we need the job (see
+    * cl_pack_emit_reloc). Additionally, uniforms can't be filled in at this
+    * point, as they depend on dynamic info that can be set after the
+    * pipeline is created (like the viewport). They will need to be filled
+    * in later, so we are doing a partial prepacking.
+    */
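+   /* Editor's sketch of the prepack-then-OR idea mentioned above and in
+    * pack_single_stencil_cfg() (illustrative only, not the real emit path):
+    * at draw time the prepacked bytes are combined with the dynamic state
+    * roughly as
+    *
+    *    for (int b = 0; b < cl_packet_length(GL_SHADER_STATE_RECORD); b++)
+    *       out[b] = prepacked[b] | dynamic[b];
+    *
+    * which is why any field that will be provided dynamically has to be
+    * packed as zero here.
+    */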
+   v3dvx_pack(pipeline->shader_state_record, GL_SHADER_STATE_RECORD, shader) {
+      shader.enable_clipping = true;
+
+      if (!pipeline->has_gs) {
+         shader.point_size_in_shaded_vertex_data =
+            pipeline->topology == PIPE_PRIM_POINTS;
+      } else {
+         struct v3d_gs_prog_data *prog_data_gs =
+            pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs;
+         shader.point_size_in_shaded_vertex_data = prog_data_gs->writes_psiz;
+      }
+
+      /* Must be set if the shader modifies Z, discards, or modifies
+       * the sample mask. For any of these cases, the fragment
+       * shader needs to write the Z value (even if it just discards).
+       */
+      shader.fragment_shader_does_z_writes = prog_data_fs->writes_z;
+      /* Set if the EZ test must be disabled (due to shader side
+       * effects and the early_z flag not being present in the
+       * shader).
+       */
+      shader.turn_off_early_z_test = prog_data_fs->disable_ez;
+
+      shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
+         prog_data_fs->uses_center_w;
+
+      /* The description for gl_SampleID states that if a fragment shader reads
+       * it, then we should automatically activate per-sample shading. However,
+       * the Vulkan spec also states that if a framebuffer has no attachments:
+       *
+       *    "The subpass continues to use the width, height, and layers of the
+       *     framebuffer to define the dimensions of the rendering area, and the
+       *     rasterizationSamples from each pipeline’s
+       *     VkPipelineMultisampleStateCreateInfo to define the number of
+       *     samples used in rasterization."
+       *
+       * So in this scenario, if the pipeline doesn't enable multiple samples
+       * but the fragment shader accesses gl_SampleID we would be requested
+       * to do per-sample shading in single sample rasterization mode, which
+       * is pointless, so just disable it in that case.
+       */
+      shader.enable_sample_rate_shading =
+         pipeline->sample_rate_shading ||
+         (pipeline->msaa && prog_data_fs->force_per_sample_msaa);
+
+      shader.any_shader_reads_hardware_written_primitive_id = false;
+
+      shader.do_scoreboard_wait_on_first_thread_switch =
+         prog_data_fs->lock_scoreboard_on_first_thrsw;
+      shader.disable_implicit_point_line_varyings =
+         !prog_data_fs->uses_implicit_point_line_varyings;
+
+      shader.number_of_varyings_in_fragment_shader =
+         prog_data_fs->num_inputs;
+
+      shader.coordinate_shader_propagate_nans = true;
+      shader.vertex_shader_propagate_nans = true;
+      shader.fragment_shader_propagate_nans = true;
+
+      /* Note: see previous note about addresses */
+      /* shader.coordinate_shader_code_address */
+      /* shader.vertex_shader_code_address */
+      /* shader.fragment_shader_code_address */
+
+      /* FIXME: Use combined input/output size flag in the common case (also
+       * on v3d, see v3dx_draw).
+       */
+      shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
+         prog_data_vs_bin->separate_segments;
+      shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+         prog_data_vs->separate_segments;
+
+      shader.coordinate_shader_input_vpm_segment_size =
+         prog_data_vs_bin->separate_segments ?
+            prog_data_vs_bin->vpm_input_size : 1;
+      shader.vertex_shader_input_vpm_segment_size =
+         prog_data_vs->separate_segments ?
+            prog_data_vs->vpm_input_size : 1;
+
+      shader.coordinate_shader_output_vpm_segment_size =
+         prog_data_vs_bin->vpm_output_size;
+      shader.vertex_shader_output_vpm_segment_size =
+         prog_data_vs->vpm_output_size;
+
+      /* Note: see previous note about addresses */
+      /* shader.coordinate_shader_uniforms_address */
+      /* shader.vertex_shader_uniforms_address */
+      /* shader.fragment_shader_uniforms_address */
+
+      shader.min_coord_shader_input_segments_required_in_play =
+         pipeline->vpm_cfg_bin.As;
+      shader.min_vertex_shader_input_segments_required_in_play =
+         pipeline->vpm_cfg.As;
+
+      shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
+         pipeline->vpm_cfg_bin.Ve;
+      shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
+         pipeline->vpm_cfg.Ve;
+
+      shader.coordinate_shader_4_way_threadable =
+         prog_data_vs_bin->base.threads == 4;
+      shader.vertex_shader_4_way_threadable =
+         prog_data_vs->base.threads == 4;
+      shader.fragment_shader_4_way_threadable =
+         prog_data_fs->base.threads == 4;
+
+      shader.coordinate_shader_start_in_final_thread_section =
+         prog_data_vs_bin->base.single_seg;
+      shader.vertex_shader_start_in_final_thread_section =
+         prog_data_vs->base.single_seg;
+      shader.fragment_shader_start_in_final_thread_section =
+         prog_data_fs->base.single_seg;
+
+      shader.vertex_id_read_by_coordinate_shader =
+         prog_data_vs_bin->uses_vid;
+      shader.base_instance_id_read_by_coordinate_shader =
+         prog_data_vs_bin->uses_biid;
+      shader.instance_id_read_by_coordinate_shader =
+         prog_data_vs_bin->uses_iid;
+      shader.vertex_id_read_by_vertex_shader =
+         prog_data_vs->uses_vid;
+      shader.base_instance_id_read_by_vertex_shader =
+         prog_data_vs->uses_biid;
+      shader.instance_id_read_by_vertex_shader =
+         prog_data_vs->uses_iid;
+
+      /* Note: see previous note about addresses */
+      /* shader.address_of_default_attribute_values */
+   }
+}
+
+static void
+pack_vcm_cache_size(struct v3dv_pipeline *pipeline)
+{
+   assert(sizeof(pipeline->vcm_cache_size) ==
+          cl_packet_length(VCM_CACHE_SIZE));
+
+   v3dvx_pack(pipeline->vcm_cache_size, VCM_CACHE_SIZE, vcm) {
+      vcm.number_of_16_vertex_batches_for_binning = pipeline->vpm_cfg_bin.Vc;
+      vcm.number_of_16_vertex_batches_for_rendering = pipeline->vpm_cfg.Vc;
+   }
+}
+
+/* As defined in the GL_SHADER_STATE_ATTRIBUTE_RECORD */
+static uint8_t
+get_attr_type(const struct util_format_description *desc)
+{
+   uint32_t r_size = desc->channel[0].size;
+   uint8_t attr_type = ATTRIBUTE_FLOAT;
+
+   switch (desc->channel[0].type) {
+   case UTIL_FORMAT_TYPE_FLOAT:
+      if (r_size == 32) {
+         attr_type = ATTRIBUTE_FLOAT;
+      } else {
+         assert(r_size == 16);
+         attr_type = ATTRIBUTE_HALF_FLOAT;
+      }
+      break;
+
+   case UTIL_FORMAT_TYPE_SIGNED:
+   case UTIL_FORMAT_TYPE_UNSIGNED:
+      switch (r_size) {
+      case 32:
+         attr_type = ATTRIBUTE_INT;
+         break;
+      case 16:
+         attr_type = ATTRIBUTE_SHORT;
+         break;
+      case 10:
+         attr_type = ATTRIBUTE_INT2_10_10_10;
+         break;
+      case 8:
+         attr_type = ATTRIBUTE_BYTE;
+         break;
+      default:
+         fprintf(stderr,
+                 "format %s unsupported\n",
+                 desc->name);
+         attr_type = ATTRIBUTE_BYTE;
+         abort();
+      }
+      break;
+
+   default:
+      fprintf(stderr,
+              "format %s unsupported\n",
+              desc->name);
+      abort();
+   }
+
+   return attr_type;
+}
+
+static void
+pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline,
+                                   uint32_t index,
+                                   const VkVertexInputAttributeDescription *vi_desc)
+{
+   const uint32_t packet_length =
+      cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
+
+   const struct util_format_description *desc =
vk_format_description(vi_desc->format); + + uint32_t binding = vi_desc->binding; + + v3dvx_pack(&pipeline->vertex_attrs[index * packet_length], + GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) { + + /* vec_size == 0 means 4 */ + attr.vec_size = desc->nr_channels & 3; + attr.signed_int_type = (desc->channel[0].type == + UTIL_FORMAT_TYPE_SIGNED); + attr.normalized_int_type = desc->channel[0].normalized; + attr.read_as_int_uint = desc->channel[0].pure_integer; + + attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor, + 0xffff); + attr.stride = pipeline->vb[binding].stride; + attr.type = get_attr_type(desc); + } +} + +void +v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info) +{ + pack_shader_state_record(pipeline); + pack_vcm_cache_size(pipeline); + + pipeline->vb_count = vi_info->vertexBindingDescriptionCount; + for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { + const VkVertexInputBindingDescription *desc = + &vi_info->pVertexBindingDescriptions[i]; + + pipeline->vb[desc->binding].stride = desc->stride; + pipeline->vb[desc->binding].instance_divisor = desc->inputRate; + } + + if (vd_info) { + for (uint32_t i = 0; i < vd_info->vertexBindingDivisorCount; i++) { + const VkVertexInputBindingDivisorDescriptionEXT *desc = + &vd_info->pVertexBindingDivisors[i]; + + pipeline->vb[desc->binding].instance_divisor = desc->divisor; + } + } + + pipeline->va_count = 0; + struct v3d_vs_prog_data *prog_data_vs = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; + + for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription *desc = + &vi_info->pVertexAttributeDescriptions[i]; + uint32_t location = desc->location + VERT_ATTRIB_GENERIC0; + + /* We use a custom driver_location_map instead of + * nir_find_variable_with_location because if we were able to get the + * shader variant from the cache, we would not have the nir shader + * available. + */ + uint32_t driver_location = + prog_data_vs->driver_location_map[location]; + + if (driver_location != -1) { + assert(driver_location < MAX_VERTEX_ATTRIBS); + pipeline->va[driver_location].offset = desc->offset; + pipeline->va[driver_location].binding = desc->binding; + pipeline->va[driver_location].vk_format = desc->format; + + pack_shader_state_attribute_record(pipeline, driver_location, desc); + + pipeline->va_count++; + } + } +} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_private.h b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h new file mode 100644 index 000000000..ab134225a --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h @@ -0,0 +1,314 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* This file generates the per-v3d-version function prototypes. It must only + * be included from v3dv_private.h. + */ + +#ifndef V3DV_PRIVATE_H +#error This file is included by means other than v3dv_private.h +#endif + +/* Used at v3dv_cmd_buffer */ +void +v3dX(job_emit_binning_flush)(struct v3dv_job *job); + +void +v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect); + +void +v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(job_emit_binning_prolog)(struct v3dv_job *job, + const struct v3dv_frame_tiling *tiling, + uint32_t layers); + +void +v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, + uint32_t cmd_buffer_count, + const VkCommandBuffer *cmd_buffers); + +void +v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer); + + +void +v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_draw_info *info); + + +void +v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer); + +void +v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance); + +void +v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride); + +void +v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride); + +void +v3dX(get_hw_clear_color)(const VkClearColorValue *color, + uint32_t internal_type, + uint32_t internal_size, + uint32_t *hw_color); + +void +v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, + uint32_t *rt_type, + uint32_t *rt_clamp); + +/* Used at v3dv_device */ + +void +v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo, + const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); + +void +v3dX(framebuffer_compute_internal_bpp_msaa)(const struct 
v3dv_framebuffer *framebuffer, + const struct v3dv_subpass *subpass, + uint8_t *max_bpp, bool *msaa); + +#ifdef DEBUG +void +v3dX(device_check_prepacked_sizes)(void); +#endif + +/* Used at v3dv_format */ +const struct v3dv_format * +v3dX(get_format)(VkFormat); + +void +v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, + uint32_t *type, + uint32_t *bpp); + +bool +v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format); + +bool +v3dX(format_supports_blending)(const struct v3dv_format *format); + +bool +v3dX(tfu_supports_tex_format)(uint32_t tex_format); + +/* Used at v3dv_image */ + +void +v3dX(pack_texture_shader_state)(struct v3dv_device *device, + struct v3dv_image_view *iview); + +void +v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, + struct v3dv_buffer_view *buffer_view); + +/* Used at v3dv_meta_* */ + +uint32_t +v3dX(zs_buffer_from_aspect_bits)(VkImageAspectFlags aspects); + +uint8_t +v3dX(get_internal_depth_type)(VkFormat format); + +struct v3dv_meta_framebuffer; + +void +v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + const VkBufferImageCopy2KHR *region); + +void +v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + const VkImageResolve2KHR *region); + +void +v3dX(meta_emit_copy_buffer)(struct v3dv_job *job, + struct v3dv_bo *dst, + struct v3dv_bo *src, + uint32_t dst_offset, + uint32_t src_offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t format, + uint32_t item_size); + +void +v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job, + struct v3dv_bo *dst, + struct v3dv_bo *src, + uint32_t dst_offset, + uint32_t src_offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t format, + uint32_t item_size); + +void +v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, + struct v3dv_image *dst, + struct v3dv_image *src, + struct v3dv_meta_framebuffer *framebuffer, + const VkImageCopy2KHR *region); + +void +v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + uint32_t dst_mip_level, + uint32_t dst_layer, + struct v3dv_image *src, + uint32_t src_mip_level, + uint32_t src_layer, + uint32_t width, + uint32_t height, + const struct v3dv_format *format); + +void +v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_meta_framebuffer *framebuffer, + const union v3dv_clear_value *clear_value, + VkImageAspectFlags aspects, + uint32_t min_layer, + uint32_t max_layer, + uint32_t level); + +void +v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job, + struct v3dv_bo *bo, + uint32_t offset, + struct v3dv_meta_framebuffer *framebuffer, + uint32_t data); + +void +v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job, + struct v3dv_image *image, + struct v3dv_buffer *buffer, + struct v3dv_meta_framebuffer *framebuffer, + const VkBufferImageCopy2KHR *region); + +void +v3dX(get_internal_type_bpp_for_image_aspects)(VkFormat vk_format, + VkImageAspectFlags aspect_mask, + uint32_t *internal_type, + uint32_t *internal_bpp); + +struct v3dv_job * +v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_bo *dst, + uint32_t dst_offset, + struct v3dv_bo *src, + uint32_t src_offset, + const VkBufferCopy2KHR *region); + +void +v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_bo *bo, + uint32_t offset, + 
uint32_t size, + uint32_t data); + +void +v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb, + VkFormat vk_format, + uint32_t internal_type, + const struct v3dv_frame_tiling *tiling); + +/* Used at v3dv_pipeline */ +void +v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, + const VkPipelineColorBlendStateCreateInfo *cb_info, + const VkPipelineDepthStencilStateCreateInfo *ds_info, + const VkPipelineRasterizationStateCreateInfo *rs_info, + const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info, + const VkPipelineMultisampleStateCreateInfo *ms_info); +void +v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); +/* Used at v3dv_queue */ +void +v3dX(job_emit_noop)(struct v3dv_job *job); + +/* Used at v3dv_descriptor_set, and other descriptor set utils */ +uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); + +uint32_t v3dX(max_descriptor_bo_size)(void); + +uint32_t v3dX(combined_image_sampler_texture_state_offset)(void); + +uint32_t v3dX(combined_image_sampler_sampler_state_offset)(void); diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c new file mode 100644 index 000000000..38f9efbfa --- /dev/null +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c @@ -0,0 +1,108 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "v3dv_private.h" +#include "broadcom/common/v3d_macros.h" +#include "broadcom/cle/v3dx_pack.h" +#include "broadcom/compiler/v3d_compiler.h" + +void +v3dX(job_emit_noop)(struct v3dv_job *job) +{ + v3dv_job_start_frame(job, 1, 1, 1, true, 1, V3D_INTERNAL_BPP_32, false); + v3dX(job_emit_binning_flush)(job); + + struct v3dv_cl *rcl = &job->rcl; + v3dv_cl_ensure_space_with_branch(rcl, 200 + 1 * 256 * + cl_packet_length(SUPERTILE_COORDINATES)); + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { + config.early_z_disable = true; + config.image_width_pixels = 1; + config.image_height_pixels = 1; + config.number_of_render_targets = 1; + config.multisample_mode_4x = false; + config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { + rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; + rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; + rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + } + + cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { + clear.z_clear_value = 1.0f; + clear.stencil_clear_value = 0; + }; + + cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { + init.use_auto_chained_tile_lists = true; + init.size_of_first_block_in_chained_tile_lists = + TILE_ALLOCATION_BLOCK_SIZE_64B; + } + + cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = v3dv_cl_address(job->tile_alloc, 0); + } + + cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { + config.number_of_bin_tile_lists = 1; + config.total_frame_width_in_tiles = 1; + config.total_frame_height_in_tiles = 1; + config.supertile_width_in_tiles = 1; + config.supertile_height_in_tiles = 1; + config.total_frame_width_in_supertiles = 1; + config.total_frame_height_in_supertiles = 1; + } + + struct v3dv_cl *icl = &job->indirect; + v3dv_cl_ensure_space(icl, 200, 1); + struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(icl); + + cl_emit(icl, TILE_COORDINATES_IMPLICIT, coords); + + cl_emit(icl, END_OF_LOADS, end); + + cl_emit(icl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); + + cl_emit(icl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + + cl_emit(icl, END_OF_TILE_MARKER, end); + + cl_emit(icl, RETURN_FROM_SUB_LIST, ret); + + cl_emit(rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { + branch.start = tile_list_start; + branch.end = v3dv_cl_get_address(icl); + } + + cl_emit(rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = 0; + coords.row_number_in_supertiles = 0; + } + + cl_emit(rcl, END_OF_RENDERING, end); +} diff --git a/lib/mesa/src/broadcom/vulkan/vk_format_info.h b/lib/mesa/src/broadcom/vulkan/vk_format_info.h index 3490ededf..da85cb5b5 100644 --- a/lib/mesa/src/broadcom/vulkan/vk_format_info.h +++ b/lib/mesa/src/broadcom/vulkan/vk_format_info.h @@ -50,6 +50,24 @@ vk_format_is_uint(VkFormat format) } static inline bool +vk_format_is_unorm(VkFormat format) +{ + return util_format_is_unorm(vk_format_to_pipe_format(format)); +} + +static inline bool +vk_format_is_snorm(VkFormat format) +{ + return util_format_is_snorm(vk_format_to_pipe_format(format)); +} + +static inline bool +vk_format_is_float(VkFormat format) +{ + return util_format_is_float(vk_format_to_pipe_format(format)); +} + +static inline bool vk_format_is_srgb(VkFormat format) { return util_format_is_srgb(vk_format_to_pipe_format(format)); |
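The buffer copy and fill paths above turn a linear byte range into a series of
2D "render" jobs: meta_copy_buffer() first picks the widest texel size (4, 2
or 1 bytes, i.e. RGBA8UI, RG8UI or R8UI) that the offsets and copy size
allow, and each loop iteration then shapes the remaining items into a
width x height grid no larger than 4096x4096. The following self-contained
sketch mirrors just that arithmetic so the chunking can be checked on the
host. It is an editor's illustration of the logic above, not driver code;
the helper names are invented for the example and nothing here touches the
hardware.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of framebuffer_size_for_pixel_count(): shape num_pixels into a
 * W x H grid with W, H <= 4096 and W * H <= num_pixels, halving the width
 * while it is even and more than twice the height so the result stays
 * roughly balanced.
 */
static void
size_for_pixel_count(uint32_t num_pixels, uint32_t *width, uint32_t *height)
{
   assert(num_pixels > 0);
   const uint32_t max_dim = 4096;
   uint32_t w, h;
   if (num_pixels > max_dim * max_dim) {
      w = h = max_dim;
   } else {
      w = num_pixels;
      h = 1;
      while (w > max_dim || ((w % 2) == 0 && w > 2 * h)) {
         w >>= 1;
         h <<= 1;
      }
   }
   *width = w;
   *height = h;
}

/* Mirror of the item-size selection in meta_copy_buffer(): the largest
 * power-of-two texel size (4, 2 or 1 bytes) that divides both offsets and
 * the copy size.
 */
static uint32_t
item_size_for_copy(uint32_t src_offset, uint32_t dst_offset, uint32_t size)
{
   uint32_t item_size = 4;
   while (item_size > 1 &&
          (src_offset % item_size != 0 || dst_offset % item_size != 0))
      item_size /= 2;
   while (item_size > 1 && size % item_size != 0)
      item_size /= 2;
   return item_size;
}

int
main(void)
{
   /* A 64 MB copy whose offsets are only 2-byte aligned falls back to
    * 2-byte items (RG8UI).
    */
   const uint32_t size = 64 * 1024 * 1024;
   uint32_t item_size = item_size_for_copy(2, 6, size);
   uint32_t num_items = size / item_size;

   /* Walk the same loop as meta_copy_buffer(), printing each job; this
    * copy needs two full 4096x4096 jobs.
    */
   while (num_items > 0) {
      uint32_t w, h;
      size_for_pixel_count(num_items, &w, &h);
      printf("job: %" PRIu32 " x %" PRIu32 " items of %" PRIu32 " bytes\n",
             w, h, item_size);
      num_items -= w * h;
   }
   return 0;
}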