Merge Mesa 22.1.7

author: Jonathan Gray <jsg@cvs.openbsd.org> 2022-09-02 05:47:02 +0000
committer: Jonathan Gray <jsg@cvs.openbsd.org> 2022-09-02 05:47:02 +0000
commit: 0dbbf1e0708df85a357d70e2708c0a11aeb5480e (patch)
tree: 6656ff8eb8b15a2fc1c02888973caf618388cfd0 /lib/mesa/src/freedreno
parent: 5f66494d31f735486b8222ecfa0a0c9046e92543 (diff)
24 files changed, 1384 insertions, 4098 deletions
diff --git a/lib/mesa/src/freedreno/afuc/Makefile b/lib/mesa/src/freedreno/afuc/Makefile
deleted file mode 100644
index 12e6f3aeb..000000000
--- a/lib/mesa/src/freedreno/afuc/Makefile
+++ /dev/null
@@ -1,368 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.17
-
-# Default target executed when no arguments are given to make.
-default_target: all
-
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-
-# A target that is always out of date.
-cmake_force:
-
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/robclark/src/envytools
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/robclark/src/envytools
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
-	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-
-.PHONY : edit_cache/fast
-
-# Special rule for the target test
-test:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests..."
-	/usr/bin/ctest --force-new-ctest-process $(ARGS)
-.PHONY : test
-
-# Special rule for the target test
-test/fast: test
-
-.PHONY : test/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-
-.PHONY : list_install_components/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-
-.PHONY : rebuild_cache/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/robclark/src/envytools && $(CMAKE_COMMAND) -E cmake_progress_start /home/robclark/src/envytools/CMakeFiles /home/robclark/src/envytools/afuc/CMakeFiles/progress.marks
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/robclark/src/envytools/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/robclark/src/envytools && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-afuc/CMakeFiles/asm.dir/rule:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/CMakeFiles/asm.dir/rule
-.PHONY : afuc/CMakeFiles/asm.dir/rule
-
-# Convenience name for target.
-asm: afuc/CMakeFiles/asm.dir/rule
-
-.PHONY : asm
-
-# fast build rule for target.
-asm/fast:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/build
-.PHONY : asm/fast
-
-# Convenience name for target.
-afuc/CMakeFiles/disasm.dir/rule:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 afuc/CMakeFiles/disasm.dir/rule
-.PHONY : afuc/CMakeFiles/disasm.dir/rule
-
-# Convenience name for target.
-disasm: afuc/CMakeFiles/disasm.dir/rule
-
-.PHONY : disasm
-
-# fast build rule for target.
-disasm/fast:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/disasm.dir/build.make afuc/CMakeFiles/disasm.dir/build
-.PHONY : disasm/fast
-
-asm.o: asm.c.o
-
-.PHONY : asm.o
-
-# target to build an object file
-asm.c.o:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/asm.c.o
-.PHONY : asm.c.o
-
-asm.i: asm.c.i
-
-.PHONY : asm.i
-
-# target to preprocess a source file
-asm.c.i:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/asm.c.i
-.PHONY : asm.c.i
-
-asm.s: asm.c.s
-
-.PHONY : asm.s
-
-# target to generate assembly for a file
-asm.c.s:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/asm.c.s
-.PHONY : asm.c.s
-
-disasm.o: disasm.c.o
-
-.PHONY : disasm.o
-
-# target to build an object file
-disasm.c.o:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/disasm.dir/build.make afuc/CMakeFiles/disasm.dir/disasm.c.o
-.PHONY : disasm.c.o
-
-disasm.i: disasm.c.i
-
-.PHONY : disasm.i
-
-# target to preprocess a source file
-disasm.c.i:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/disasm.dir/build.make afuc/CMakeFiles/disasm.dir/disasm.c.i
-.PHONY : disasm.c.i
-
-disasm.s: disasm.c.s
-
-.PHONY : disasm.s
-
-# target to generate assembly for a file
-disasm.c.s:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/disasm.dir/build.make afuc/CMakeFiles/disasm.dir/disasm.c.s
-.PHONY : disasm.c.s
-
-lexer.o: lexer.c.o
-
-.PHONY : lexer.o
-
-# target to build an object file
-lexer.c.o:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/lexer.c.o
-.PHONY : lexer.c.o
-
-lexer.i: lexer.c.i
-
-.PHONY : lexer.i
-
-# target to preprocess a source file
-lexer.c.i:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/lexer.c.i
-.PHONY : lexer.c.i
-
-lexer.s: lexer.c.s
-
-.PHONY : lexer.s
-
-# target to generate assembly for a file
-lexer.c.s:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/lexer.c.s
-.PHONY : lexer.c.s
-
-parser.o: parser.c.o
-
-.PHONY : parser.o
-
-# target to build an object file
-parser.c.o:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/parser.c.o
-.PHONY : parser.c.o
-
-parser.i: parser.c.i
-
-.PHONY : parser.i
-
-# target to preprocess a source file
-parser.c.i:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/parser.c.i
-.PHONY : parser.c.i
-
-parser.s: parser.c.s
-
-.PHONY : parser.s
-
-# target to generate assembly for a file
-parser.c.s:
-	cd /home/robclark/src/envytools && $(MAKE) $(MAKESILENT) -f afuc/CMakeFiles/asm.dir/build.make afuc/CMakeFiles/asm.dir/parser.c.s
-.PHONY : parser.c.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... test"
-	@echo "... asm"
-	@echo "... disasm"
-	@echo "... asm.o"
-	@echo "... asm.i"
-	@echo "... asm.s"
-	@echo "... disasm.o"
-	@echo "... disasm.i"
-	@echo "... disasm.s"
-	@echo "... lexer.o"
-	@echo "... lexer.i"
-	@echo "... lexer.s"
-	@echo "... parser.o"
-	@echo "... parser.i"
-	@echo "... parser.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/robclark/src/envytools && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/lib/mesa/src/freedreno/computerator/a6xx.c b/lib/mesa/src/freedreno/computerator/a6xx.c
index 67104a6db..a0ce6f986 100644
--- a/lib/mesa/src/freedreno/computerator/a6xx.c
+++ b/lib/mesa/src/freedreno/computerator/a6xx.c
@@ -158,6 +158,12 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
    OUT_RING(ring, 0x41);
 
+   if (a6xx_backend->info->a6xx.has_lpac) {
+      OUT_PKT4(ring, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
+      OUT_RING(ring, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(1) |
+                        A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
+   }
+
    uint32_t local_invocation_id, work_group_id;
    local_invocation_id =
       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
@@ -171,6 +177,16 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                      A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
 
+   if (a6xx_backend->info->a6xx.has_lpac) {
+      OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
+      OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+                        A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
+                        A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
+                        A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+      OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
+                        A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
+   }
+
    OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
    OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */
 
@@ -180,12 +196,14 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
    OUT_RELOC(ring, v->bo, 0, 0, 0);
 
+   uint32_t shader_preload_size =
+      MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
    OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
    OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen));
+                     CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
    OUT_RELOC(ring, v->bo, 0, 0, 0);
 
    if (v->pvtmem_size > 0) {
@@ -296,11 +314,11 @@ cs_ibo_emit(struct fd_ringbuffer *ring, struct fd_submit *submit,
       unsigned width = sz & MASK(15);
       unsigned height = sz >> 15;
 
-      OUT_RING(state, A6XX_IBO_0_FMT(FMT6_32_UINT) | A6XX_IBO_0_TILE_MODE(0));
-      OUT_RING(state, A6XX_IBO_1_WIDTH(width) | A6XX_IBO_1_HEIGHT(height));
-      OUT_RING(state, A6XX_IBO_2_PITCH(0) | A6XX_IBO_2_UNK4 | A6XX_IBO_2_UNK31 |
-                         A6XX_IBO_2_TYPE(A6XX_TEX_1D));
-      OUT_RING(state, A6XX_IBO_3_ARRAY_PITCH(0));
+      OUT_RING(state, A6XX_TEX_CONST_0_FMT(FMT6_32_UINT) | A6XX_TEX_CONST_0_TILE_MODE(0));
+      OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height));
+      OUT_RING(state, A6XX_TEX_CONST_2_PITCH(0) | A6XX_TEX_CONST_2_BUFFER |
+                         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER));
+      OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(0));
       OUT_RELOC(state, kernel->bufs[i], 0, 0, 0);
       OUT_RING(state, 0x00000000);
       OUT_RING(state, 0x00000000);
@@ -519,7 +537,8 @@ a6xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id)
       .read_perfcntrs = a6xx_read_perfcntrs,
    };
 
-   a6xx_backend->compiler = ir3_compiler_create(dev, dev_id, false);
+   a6xx_backend->compiler = ir3_compiler_create(dev, dev_id,
+                                                &(struct ir3_compiler_options){});
    a6xx_backend->dev = dev;
 
    a6xx_backend->info = fd_dev_info(dev_id);
diff --git a/lib/mesa/src/freedreno/computerator/ir3_asm.c b/lib/mesa/src/freedreno/computerator/ir3_asm.c
index b9c295adf..e0f3c9bc5 100644
--- a/lib/mesa/src/freedreno/computerator/ir3_asm.c
+++ b/lib/mesa/src/freedreno/computerator/ir3_asm.c
@@ -35,8 +35,6 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
       errx(-1, "assembler failed");
    struct ir3_shader_variant *v = shader->variants;
 
-   v->mergedregs = true;
-
    kernel->v = v;
    kernel->bin = v->bin;
 
@@ -55,6 +53,9 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
 
    memcpy(fd_bo_map(v->bo), kernel->bin, sz);
 
+   /* Always include shaders in kernel crash dumps. */
+   fd_bo_mark_for_dump(v->bo);
+
    return kernel;
 }
 
diff --git a/lib/mesa/src/freedreno/computerator/main.c b/lib/mesa/src/freedreno/computerator/main.c
index 0468380be..6c4f14534 100644
--- a/lib/mesa/src/freedreno/computerator/main.c
+++ b/lib/mesa/src/freedreno/computerator/main.c
@@ -236,11 +236,7 @@ main(int argc, char **argv)
       }
    }
 
-   int fd = drmOpenWithType("msm", NULL, DRM_NODE_RENDER);
-   if (fd < 0)
-      err(1, "could not open drm device");
-
-   struct fd_device *dev = fd_device_new(fd);
+   struct fd_device *dev = fd_device_open();
    struct fd_pipe *pipe = fd_pipe_new(dev, FD_PIPE_3D);
 
    const struct fd_dev_id *dev_id = fd_pipe_dev_id(pipe);
diff --git a/lib/mesa/src/freedreno/drm/msm_bo.c b/lib/mesa/src/freedreno/drm/msm_bo.c
deleted file mode 100644
index da2609903..000000000
--- a/lib/mesa/src/freedreno/drm/msm_bo.c
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "msm_priv.h"
-
-static int bo_allocate(struct msm_bo *msm_bo)
-{
-	struct fd_bo *bo = &msm_bo->base;
-	if (!msm_bo->offset) {
-		struct drm_msm_gem_info req = {
-				.handle = bo->handle,
-				.info = MSM_INFO_GET_OFFSET,
-		};
-		int ret;
-
-		/* if the buffer is already backed by pages then this
-		 * doesn't actually do anything (other than giving us
-		 * the offset)
-		 */
-		ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_INFO,
-				&req, sizeof(req));
-		if (ret) {
-			ERROR_MSG("alloc failed: %s", strerror(errno));
-			return ret;
-		}
-
-		msm_bo->offset = req.value;
-	}
-
-	return 0;
-}
-
-static int msm_bo_offset(struct fd_bo *bo, uint64_t *offset)
-{
-	struct msm_bo *msm_bo = to_msm_bo(bo);
-	int ret = bo_allocate(msm_bo);
-	if (ret)
-		return ret;
-	*offset = msm_bo->offset;
-	return 0;
-}
-
-static int msm_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
-{
-	struct drm_msm_gem_cpu_prep req = {
-			.handle = bo->handle,
-			.op = op,
-	};
-
-	get_abs_timeout(&req.timeout, 5000000000);
-
-	return drmCommandWrite(bo->dev->fd, DRM_MSM_GEM_CPU_PREP, &req, sizeof(req));
-}
-
-static void msm_bo_cpu_fini(struct fd_bo *bo)
-{
-	struct drm_msm_gem_cpu_fini req = {
-			.handle = bo->handle,
-	};
-
-	drmCommandWrite(bo->dev->fd, DRM_MSM_GEM_CPU_FINI, &req, sizeof(req));
-}
-
-static int msm_bo_madvise(struct fd_bo *bo, int willneed)
-{
-	struct drm_msm_gem_madvise req = {
-			.handle = bo->handle,
-			.madv = willneed ? MSM_MADV_WILLNEED : MSM_MADV_DONTNEED,
-	};
-	int ret;
-
-	/* older kernels do not support this: */
-	if (bo->dev->version < FD_VERSION_MADVISE)
-		return willneed;
-
-	ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_MADVISE, &req, sizeof(req));
-	if (ret)
-		return ret;
-
-	return req.retained;
-}
-
-static uint64_t msm_bo_iova(struct fd_bo *bo)
-{
-	struct drm_msm_gem_info req = {
-			.handle = bo->handle,
-			.info = MSM_INFO_GET_IOVA,
-	};
-	int ret;
-
-	ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
-	debug_assert(ret == 0);
-
-	return req.value;
-}
-
-static void msm_bo_set_name(struct fd_bo *bo, const char *fmt, va_list ap)
-{
-	struct drm_msm_gem_info req = {
-			.handle = bo->handle,
-			.info = MSM_INFO_SET_NAME,
-	};
-	char buf[32];
-	int sz;
-
-	if (bo->dev->version < FD_VERSION_SOFTPIN)
-		return;
-
-	sz = vsnprintf(buf, sizeof(buf), fmt, ap);
-
-	req.value = VOID2U64(buf);
-	req.len = MIN2(sz, sizeof(buf));
-
-	drmCommandWrite(bo->dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
-}
-
-static void msm_bo_destroy(struct fd_bo *bo)
-{
-	struct msm_bo *msm_bo = to_msm_bo(bo);
-	free(msm_bo);
-}
-
-static const struct fd_bo_funcs funcs = {
-		.offset = msm_bo_offset,
-		.cpu_prep = msm_bo_cpu_prep,
-		.cpu_fini = msm_bo_cpu_fini,
-		.madvise = msm_bo_madvise,
-		.iova = msm_bo_iova,
-		.set_name = msm_bo_set_name,
-		.destroy = msm_bo_destroy,
-};
-
-/* allocate a buffer handle: */
-int msm_bo_new_handle(struct fd_device *dev,
-		uint32_t size, uint32_t flags, uint32_t *handle)
-{
-	struct drm_msm_gem_new req = {
-			.size = size,
-			.flags = MSM_BO_WC,  // TODO figure out proper flags..
-	};
-	int ret;
-
-	if (flags & DRM_FREEDRENO_GEM_SCANOUT)
-		req.flags |= MSM_BO_SCANOUT;
-
-	if (flags & DRM_FREEDRENO_GEM_GPUREADONLY)
-		req.flags |= MSM_BO_GPU_READONLY;
-
-	ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW,
-			&req, sizeof(req));
-	if (ret)
-		return ret;
-
-	*handle = req.handle;
-
-	return 0;
-}
-
-/* allocate a new buffer object */
-struct fd_bo * msm_bo_from_handle(struct fd_device *dev,
-		uint32_t size, uint32_t handle)
-{
-	struct msm_bo *msm_bo;
-	struct fd_bo *bo;
-
-	msm_bo = calloc(1, sizeof(*msm_bo));
-	if (!msm_bo)
-		return NULL;
-
-	bo = &msm_bo->base;
-	bo->funcs = &funcs;
-
-	return bo;
-}
diff --git a/lib/mesa/src/freedreno/drm/msm_device.c b/lib/mesa/src/freedreno/drm/msm_device.c
deleted file mode 100644
index d391ef013..000000000
--- a/lib/mesa/src/freedreno/drm/msm_device.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "msm_priv.h"
-
-static void msm_device_destroy(struct fd_device *dev)
-{
-	struct msm_device *msm_dev = to_msm_device(dev);
-	free(msm_dev);
-}
-
-static const struct fd_device_funcs funcs = {
-		.bo_new_handle = msm_bo_new_handle,
-		.bo_from_handle = msm_bo_from_handle,
-		.pipe_new = msm_pipe_new,
-		.destroy = msm_device_destroy,
-};
-
-struct fd_device * msm_device_new(int fd)
-{
-	struct msm_device *msm_dev;
-	struct fd_device *dev;
-
-	msm_dev = calloc(1, sizeof(*msm_dev));
-	if (!msm_dev)
-		return NULL;
-
-	dev = &msm_dev->base;
-	dev->funcs = &funcs;
-
-	dev->bo_size = sizeof(struct msm_bo);
-
-	return dev;
-}
diff --git a/lib/mesa/src/freedreno/drm/msm_pipe.c b/lib/mesa/src/freedreno/drm/msm_pipe.c
deleted file mode 100644
index 7d5b9fcd7..000000000
--- a/lib/mesa/src/freedreno/drm/msm_pipe.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/slab.h"
-
-#include "msm_priv.h"
-
-static int query_param(struct fd_pipe *pipe, uint32_t param,
-		uint64_t *value)
-{
-	struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
-	struct drm_msm_param req = {
-			.pipe = msm_pipe->pipe,
-			.param = param,
-	};
-	int ret;
-
-	ret = drmCommandWriteRead(pipe->dev->fd, DRM_MSM_GET_PARAM,
-			&req, sizeof(req));
-	if (ret)
-		return ret;
-
-	*value = req.value;
-
-	return 0;
-}
-
-static int msm_pipe_get_param(struct fd_pipe *pipe,
-		enum fd_param_id param, uint64_t *value)
-{
-	struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
-	switch(param) {
-	case FD_DEVICE_ID: // XXX probably get rid of this..
-	case FD_GPU_ID:
-		*value = msm_pipe->gpu_id;
-		return 0;
-	case FD_GMEM_SIZE:
-		*value = msm_pipe->gmem;
-		return 0;
-	case FD_CHIP_ID:
-		*value = msm_pipe->chip_id;
-		return 0;
-	case FD_MAX_FREQ:
-		return query_param(pipe, MSM_PARAM_MAX_FREQ, value);
-	case FD_TIMESTAMP:
-		return query_param(pipe, MSM_PARAM_TIMESTAMP, value);
-	case FD_NR_RINGS:
-		return query_param(pipe, MSM_PARAM_NR_RINGS, value);
-	default:
-		ERROR_MSG("invalid param id: %d", param);
-		return -1;
-	}
-}
-
-static int msm_pipe_wait(struct fd_pipe *pipe, uint32_t timestamp,
-		uint64_t timeout)
-{
-	struct fd_device *dev = pipe->dev;
-	struct drm_msm_wait_fence req = {
-			.fence = timestamp,
-			.queueid = to_msm_pipe(pipe)->queue_id,
-	};
-	int ret;
-
-	get_abs_timeout(&req.timeout, timeout);
-
-	ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
-	if (ret) {
-		ERROR_MSG("wait-fence failed! %d (%s)", ret, strerror(errno));
-		return ret;
-	}
-
-	return 0;
-}
-
-static int open_submitqueue(struct fd_pipe *pipe, uint32_t prio)
-{
-	struct drm_msm_submitqueue req = {
-		.flags = 0,
-		.prio = prio,
-	};
-	uint64_t nr_rings = 1;
-	int ret;
-
-	if (fd_device_version(pipe->dev) < FD_VERSION_SUBMIT_QUEUES) {
-		to_msm_pipe(pipe)->queue_id = 0;
-		return 0;
-	}
-
-	msm_pipe_get_param(pipe, FD_NR_RINGS, &nr_rings);
-
-	req.prio = MIN2(req.prio, MAX2(nr_rings, 1) - 1);
-
-	ret = drmCommandWriteRead(pipe->dev->fd, DRM_MSM_SUBMITQUEUE_NEW,
-			&req, sizeof(req));
-	if (ret) {
-		ERROR_MSG("could not create submitqueue! %d (%s)", ret, strerror(errno));
-		return ret;
-	}
-
-	to_msm_pipe(pipe)->queue_id = req.id;
-	return 0;
-}
-
-static void close_submitqueue(struct fd_pipe *pipe, uint32_t queue_id)
-{
-	if (fd_device_version(pipe->dev) < FD_VERSION_SUBMIT_QUEUES)
-		return;
-
-	drmCommandWrite(pipe->dev->fd, DRM_MSM_SUBMITQUEUE_CLOSE,
-			&queue_id, sizeof(queue_id));
-}
-
-static void msm_pipe_destroy(struct fd_pipe *pipe)
-{
-	struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
-	close_submitqueue(pipe, msm_pipe->queue_id);
-	free(msm_pipe);
-}
-
-static const struct fd_pipe_funcs sp_funcs = {
-		.ringbuffer_new_object = msm_ringbuffer_sp_new_object,
-		.submit_new = msm_submit_sp_new,
-		.get_param = msm_pipe_get_param,
-		.wait = msm_pipe_wait,
-		.destroy = msm_pipe_destroy,
-};
-
-static const struct fd_pipe_funcs legacy_funcs = {
-		.ringbuffer_new_object = msm_ringbuffer_new_object,
-		.submit_new = msm_submit_new,
-		.get_param = msm_pipe_get_param,
-		.wait = msm_pipe_wait,
-		.destroy = msm_pipe_destroy,
-};
-
-static uint64_t get_param(struct fd_pipe *pipe, uint32_t param)
-{
-	uint64_t value;
-	int ret = query_param(pipe, param, &value);
-	if (ret) {
-		ERROR_MSG("get-param failed! %d (%s)", ret, strerror(errno));
-		return 0;
-	}
-	return value;
-}
-
-struct fd_pipe * msm_pipe_new(struct fd_device *dev,
-		enum fd_pipe_id id, uint32_t prio)
-{
-	static const uint32_t pipe_id[] = {
-			[FD_PIPE_3D] = MSM_PIPE_3D0,
-			[FD_PIPE_2D] = MSM_PIPE_2D0,
-	};
-	struct msm_pipe *msm_pipe = NULL;
-	struct fd_pipe *pipe = NULL;
-
-	msm_pipe = calloc(1, sizeof(*msm_pipe));
-	if (!msm_pipe) {
-		ERROR_MSG("allocation failed");
-		goto fail;
-	}
-
-	pipe = &msm_pipe->base;
-
-	if (fd_device_version(dev) >= FD_VERSION_SOFTPIN) {
-		pipe->funcs = &sp_funcs;
-	} else {
-		pipe->funcs = &legacy_funcs;
-	}
-
-	/* initialize before get_param(): */
-	pipe->dev = dev;
-	msm_pipe->pipe = pipe_id[id];
-
-	/* these params should be supported since the first version of drm/msm: */
-	msm_pipe->gpu_id = get_param(pipe, MSM_PARAM_GPU_ID);
-	msm_pipe->gmem   = get_param(pipe, MSM_PARAM_GMEM_SIZE);
-	msm_pipe->chip_id = get_param(pipe, MSM_PARAM_CHIP_ID);
-
-	if (! msm_pipe->gpu_id)
-		goto fail;
-
-	INFO_MSG("Pipe Info:");
-	INFO_MSG(" GPU-id:          %d", msm_pipe->gpu_id);
-	INFO_MSG(" Chip-id:         0x%08x", msm_pipe->chip_id);
-	INFO_MSG(" GMEM size:       0x%08x", msm_pipe->gmem);
-
-	if (open_submitqueue(pipe, prio))
-		goto fail;
-
-	return pipe;
-fail:
-	if (pipe)
-		fd_pipe_del(pipe);
-	return NULL;
-}
diff --git a/lib/mesa/src/freedreno/drm/msm_priv.h b/lib/mesa/src/freedreno/drm/msm_priv.h
deleted file mode 100644
index 9cb60bc1d..000000000
--- a/lib/mesa/src/freedreno/drm/msm_priv.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef MSM_PRIV_H_
-#define MSM_PRIV_H_
-
-#include "freedreno_priv.h"
-
-#ifndef __user
-#  define __user
-#endif
-
-#include "msm_drm.h"
-
-struct msm_device {
-	struct fd_device base;
-	struct fd_bo_cache ring_cache;
-};
-FD_DEFINE_CAST(fd_device, msm_device);
-
-struct fd_device * msm_device_new(int fd);
-
-struct msm_pipe {
-	struct fd_pipe base;
-	uint32_t pipe;
-	uint32_t gpu_id;
-	uint32_t gmem;
-	uint32_t chip_id;
-	uint32_t queue_id;
-};
-FD_DEFINE_CAST(fd_pipe, msm_pipe);
-
-struct fd_pipe * msm_pipe_new(struct fd_device *dev,
-		enum fd_pipe_id id, uint32_t prio);
-
-struct fd_ringbuffer * msm_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size);
-struct fd_ringbuffer * msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size);
-
-struct fd_submit * msm_submit_new(struct fd_pipe *pipe);
-struct fd_submit * msm_submit_sp_new(struct fd_pipe *pipe);
-
-struct msm_bo {
-	struct fd_bo base;
-	uint64_t offset;
-	/* to avoid excess hashtable lookups, cache the ring this bo was
-	 * last emitted on (since that will probably also be the next ring
-	 * it is emitted on)
-	 */
-	unsigned current_submit_seqno;
-	uint32_t idx;
-};
-FD_DEFINE_CAST(fd_bo, msm_bo);
-
-int msm_bo_new_handle(struct fd_device *dev,
-		uint32_t size, uint32_t flags, uint32_t *handle);
-struct fd_bo * msm_bo_from_handle(struct fd_device *dev,
-		uint32_t size, uint32_t handle);
-
-static inline void
-msm_dump_submit(struct drm_msm_gem_submit *req)
-{
-	for (unsigned i = 0; i < req->nr_bos; i++) {
-		struct drm_msm_gem_submit_bo *bos = U642VOID(req->bos);
-		struct drm_msm_gem_submit_bo *bo = &bos[i];
-		ERROR_MSG("  bos[%d]: handle=%u, flags=%x", i, bo->handle, bo->flags);
-	}
-	for (unsigned i = 0; i < req->nr_cmds; i++) {
-		struct drm_msm_gem_submit_cmd *cmds = U642VOID(req->cmds);
-		struct drm_msm_gem_submit_cmd *cmd = &cmds[i];
-		struct drm_msm_gem_submit_reloc *relocs = U642VOID(cmd->relocs);
-		ERROR_MSG("  cmd[%d]: type=%u, submit_idx=%u, submit_offset=%u, size=%u",
-				i, cmd->type, cmd->submit_idx, cmd->submit_offset, cmd->size);
-		for (unsigned j = 0; j < cmd->nr_relocs; j++) {
-			struct drm_msm_gem_submit_reloc *r = &relocs[j];
-			ERROR_MSG("    reloc[%d]: submit_offset=%u, or=%08x, shift=%d, reloc_idx=%u"
-					", reloc_offset=%"PRIu64, j, r->submit_offset, r->or, r->shift,
-					r->reloc_idx, r->reloc_offset);
-		}
-	}
-}
-
-static inline void get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
-{
-	struct timespec t;
-	uint32_t s = ns / 1000000000;
-	clock_gettime(CLOCK_MONOTONIC, &t);
-	tv->tv_sec = t.tv_sec + s;
-	tv->tv_nsec = t.tv_nsec + ns - (s * 1000000000);
-}
-
-/*
- * Stupid/simple growable array implementation:
- */
-
-static inline void *
-grow(void *ptr, uint16_t nr, uint16_t *max, uint16_t sz)
-{
-	if ((nr + 1) > *max) {
-		if ((*max * 2) < (nr + 1))
-			*max = nr + 5;
-		else
-			*max = *max * 2;
-		ptr = realloc(ptr, *max * sz);
-	}
-	return ptr;
-}
-
-#define DECLARE_ARRAY(type, name) \
-	unsigned short nr_ ## name, max_ ## name; \
-	type * name;
-
-#define APPEND(x, name) ({ \
-	(x)->name = grow((x)->name, (x)->nr_ ## name, &(x)->max_ ## name, sizeof((x)->name[0])); \
-	(x)->nr_ ## name ++; \
-})
-
-#endif /* MSM_PRIV_H_ */
diff --git a/lib/mesa/src/freedreno/drm/msm_ringbuffer.c b/lib/mesa/src/freedreno/drm/msm_ringbuffer.c
deleted file mode 100644
index 369f26f98..000000000
--- a/lib/mesa/src/freedreno/drm/msm_ringbuffer.c
+++ /dev/null
@@ -1,722 +0,0 @@
-/*
- * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <assert.h>
-#include <inttypes.h>
-
-#include "util/hash_table.h"
-#include "util/set.h"
-#include "util/slab.h"
-
-#include "drm/freedreno_ringbuffer.h"
-#include "msm_priv.h"
-
-/* The legacy implementation of submit/ringbuffer, which still does the
- * traditional reloc and cmd tracking
- */
-
-
-#define INIT_SIZE 0x1000
-
-static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;
-
-
-struct msm_submit {
-	struct fd_submit base;
-
-	DECLARE_ARRAY(struct drm_msm_gem_submit_bo, submit_bos);
-	DECLARE_ARRAY(struct fd_bo *, bos);
-
-	unsigned seqno;
-
-	/* maps fd_bo to idx in bos table: */
-	struct hash_table *bo_table;
-
-	struct slab_mempool ring_pool;
-
-	/* hash-set of associated rings: */
-	struct set *ring_set;
-
-	struct fd_ringbuffer *primary;
-
-	/* Allow for sub-allocation of stateobj ring buffers (ie. sharing
-	 * the same underlying bo)..
-	 *
-	 * We also rely on previous stateobj having been fully constructed
-	 * so we can reclaim extra space at it's end.
-	 */
-	struct fd_ringbuffer *suballoc_ring;
-};
-FD_DEFINE_CAST(fd_submit, msm_submit);
-
-/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers
- * and sizes.  Ie. a finalized buffer can have no more commands appended to
- * it.
- */
-struct msm_cmd {
-	struct fd_bo *ring_bo;
-	unsigned size;
-	DECLARE_ARRAY(struct drm_msm_gem_submit_reloc, relocs);
-};
-
-static struct msm_cmd *
-cmd_new(struct fd_bo *ring_bo)
-{
-	struct msm_cmd *cmd = malloc(sizeof(*cmd));
-	cmd->ring_bo = fd_bo_ref(ring_bo);
-	cmd->size = 0;
-	cmd->nr_relocs = cmd->max_relocs = 0;
-	cmd->relocs = NULL;
-	return cmd;
-}
-
-static void
-cmd_free(struct msm_cmd *cmd)
-{
-	fd_bo_del(cmd->ring_bo);
-	free(cmd->relocs);
-	free(cmd);
-}
-
-/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to
- * later copy into the submit when the stateobj rb is later referenced by
- * a regular rb:
- */
-struct msm_reloc_bo {
-	struct fd_bo *bo;
-	unsigned flags;
-};
-
-struct msm_ringbuffer {
-	struct fd_ringbuffer base;
-
-	/* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */
-	unsigned offset;
-
-	union {
-		/* for _FD_RINGBUFFER_OBJECT case: */
-		struct {
-			struct fd_pipe *pipe;
-			DECLARE_ARRAY(struct msm_reloc_bo, reloc_bos);
-			struct set *ring_set;
-		};
-		/* for other cases: */
-		struct {
-			struct fd_submit *submit;
-			DECLARE_ARRAY(struct msm_cmd *, cmds);
-		};
-	} u;
-
-	struct msm_cmd *cmd;          /* current cmd */
-	struct fd_bo *ring_bo;
-};
-FD_DEFINE_CAST(fd_ringbuffer, msm_ringbuffer);
-
-static void finalize_current_cmd(struct fd_ringbuffer *ring);
-static struct fd_ringbuffer * msm_ringbuffer_init(
-		struct msm_ringbuffer *msm_ring,
-		uint32_t size, enum fd_ringbuffer_flags flags);
-
-/* add (if needed) bo to submit and return index: */
-static uint32_t
-append_bo(struct msm_submit *submit, struct fd_bo *bo, uint32_t flags)
-{
-	struct msm_bo *msm_bo = to_msm_bo(bo);
-	uint32_t idx;
-	pthread_mutex_lock(&idx_lock);
-	if (likely(msm_bo->current_submit_seqno == submit->seqno)) {
-		idx = msm_bo->idx;
-	} else {
-		uint32_t hash = _mesa_hash_pointer(bo);
-		struct hash_entry *entry;
-
-		entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo);
-		if (entry) {
-			/* found */
-			idx = (uint32_t)(uintptr_t)entry->data;
-		} else {
-			idx = APPEND(submit, submit_bos);
-			idx = APPEND(submit, bos);
-
-			submit->submit_bos[idx].flags = 0;
-			submit->submit_bos[idx].handle = bo->handle;
-			submit->submit_bos[idx].presumed = 0;
-
-			submit->bos[idx] = fd_bo_ref(bo);
-
-			_mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo,
-					(void *)(uintptr_t)idx);
-		}
-		msm_bo->current_submit_seqno = submit->seqno;
-		msm_bo->idx = idx;
-	}
-	pthread_mutex_unlock(&idx_lock);
-	if (flags & FD_RELOC_READ)
-		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_READ;
-	if (flags & FD_RELOC_WRITE)
-		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_WRITE;
-	return idx;
-}
-
-static void
-append_ring(struct set *set, struct fd_ringbuffer *ring)
-{
-	uint32_t hash = _mesa_hash_pointer(ring);
-
-	if (!_mesa_set_search_pre_hashed(set, hash, ring)) {
-		fd_ringbuffer_ref(ring);
-		_mesa_set_add_pre_hashed(set, hash, ring);
-	}
-}
-
-static void
-msm_submit_suballoc_ring_bo(struct fd_submit *submit,
-		struct msm_ringbuffer *msm_ring, uint32_t size)
-{
-	struct msm_submit *msm_submit = to_msm_submit(submit);
-	unsigned suballoc_offset = 0;
-	struct fd_bo *suballoc_bo = NULL;
-
-	if (msm_submit->suballoc_ring) {
-		struct msm_ringbuffer *suballoc_ring =
-				to_msm_ringbuffer(msm_submit->suballoc_ring);
-
-		suballoc_bo = suballoc_ring->ring_bo;
-		suballoc_offset = fd_ringbuffer_size(msm_submit->suballoc_ring) +
-				suballoc_ring->offset;
-
-		suballoc_offset = align(suballoc_offset, 0x10);
-
-		if ((size + suballoc_offset) > suballoc_bo->size) {
-			suballoc_bo = NULL;
-		}
-	}
-
-	if (!suballoc_bo) {
-		// TODO possibly larger size for streaming bo?
-		msm_ring->ring_bo = fd_bo_new_ring(
-				submit->pipe->dev, 0x8000, 0);
-		msm_ring->offset = 0;
-	} else {
-		msm_ring->ring_bo = fd_bo_ref(suballoc_bo);
-		msm_ring->offset = suballoc_offset;
-	}
-
-	struct fd_ringbuffer *old_suballoc_ring = msm_submit->suballoc_ring;
-
-	msm_submit->suballoc_ring = fd_ringbuffer_ref(&msm_ring->base);
-
-	if (old_suballoc_ring)
-		fd_ringbuffer_del(old_suballoc_ring);
-}
-
-static struct fd_ringbuffer *
-msm_submit_new_ringbuffer(struct fd_submit *submit, uint32_t size,
-		enum fd_ringbuffer_flags flags)
-{
-	struct msm_submit *msm_submit = to_msm_submit(submit);
-	struct msm_ringbuffer *msm_ring;
-
-	msm_ring = slab_alloc_st(&msm_submit->ring_pool);
-
-	msm_ring->u.submit = submit;
-
-	/* NOTE: needs to be before _suballoc_ring_bo() since it could
-	 * increment the refcnt of the current ring
-	 */
-	msm_ring->base.refcnt = 1;
-
-	if (flags & FD_RINGBUFFER_STREAMING) {
-		msm_submit_suballoc_ring_bo(submit, msm_ring, size);
-	} else {
-		if (flags & FD_RINGBUFFER_GROWABLE)
-			size = INIT_SIZE;
-
-		msm_ring->offset = 0;
-		msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size, 0);
-	}
-
-	if (!msm_ringbuffer_init(msm_ring, size, flags))
-		return NULL;
-
-	if (flags & FD_RINGBUFFER_PRIMARY) {
-		debug_assert(!msm_submit->primary);
-		msm_submit->primary = fd_ringbuffer_ref(&msm_ring->base);
-	}
-
-	return &msm_ring->base;
-}
-
-static struct drm_msm_gem_submit_reloc *
-handle_stateobj_relocs(struct msm_submit *submit, struct msm_ringbuffer *ring)
-{
-	struct msm_cmd *cmd = ring->cmd;
-	struct drm_msm_gem_submit_reloc *relocs;
-
-	relocs = malloc(cmd->nr_relocs * sizeof(*relocs));
-
-	for (unsigned i = 0; i < cmd->nr_relocs; i++) {
-		unsigned idx = cmd->relocs[i].reloc_idx;
-		struct fd_bo *bo = ring->u.reloc_bos[idx].bo;
-		unsigned flags = 0;
-
-		if (ring->u.reloc_bos[idx].flags & MSM_SUBMIT_BO_READ)
-			flags |= FD_RELOC_READ;
-		if (ring->u.reloc_bos[idx].flags & MSM_SUBMIT_BO_WRITE)
-			flags |= FD_RELOC_WRITE;
-
-		relocs[i] = cmd->relocs[i];
-		relocs[i].reloc_idx = append_bo(submit, bo, flags);
-	}
-
-	return relocs;
-}
-
-static int
-msm_submit_flush(struct fd_submit *submit, int in_fence_fd,
-		int *out_fence_fd, uint32_t *out_fence)
-{
-	struct msm_submit *msm_submit = to_msm_submit(submit);
-	struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
-	struct drm_msm_gem_submit req = {
-			.flags = msm_pipe->pipe,
-			.queueid = msm_pipe->queue_id,
-	};
-	int ret;
-
-	debug_assert(msm_submit->primary);
-
-	finalize_current_cmd(msm_submit->primary);
-	append_ring(msm_submit->ring_set, msm_submit->primary);
-
-	unsigned nr_cmds = 0;
-	unsigned nr_objs = 0;
-
-	set_foreach(msm_submit->ring_set, entry) {
-		struct fd_ringbuffer *ring = (void *)entry->key;
-		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-			nr_cmds += 1;
-			nr_objs += 1;
-		} else {
-			if (ring != msm_submit->primary)
-				finalize_current_cmd(ring);
-			nr_cmds += to_msm_ringbuffer(ring)->u.nr_cmds;
-		}
-	}
-
-	void *obj_relocs[nr_objs];
-	struct drm_msm_gem_submit_cmd cmds[nr_cmds];
-	unsigned i = 0, o = 0;
-
-	set_foreach(msm_submit->ring_set, entry) {
-		struct fd_ringbuffer *ring = (void *)entry->key;
-		struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-
-		debug_assert(i < nr_cmds);
-
-		// TODO handle relocs:
-		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-
-			debug_assert(o < nr_objs);
-
-			void *relocs = handle_stateobj_relocs(msm_submit, msm_ring);
-			obj_relocs[o++] = relocs;
-
-			cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
-			cmds[i].submit_idx =
-				append_bo(msm_submit, msm_ring->ring_bo, FD_RELOC_READ);
-			cmds[i].submit_offset = msm_ring->offset;
-			cmds[i].size = offset_bytes(ring->cur, ring->start);
-			cmds[i].pad = 0;
-			cmds[i].nr_relocs = msm_ring->cmd->nr_relocs;
-			cmds[i].relocs = VOID2U64(relocs);
-
-			i++;
-		} else {
-			for (unsigned j = 0; j < msm_ring->u.nr_cmds; j++) {
-				if (ring->flags & FD_RINGBUFFER_PRIMARY) {
-					cmds[i].type = MSM_SUBMIT_CMD_BUF;
-				} else {
-					cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
-				}
-				cmds[i].submit_idx = append_bo(msm_submit,
-						msm_ring->u.cmds[j]->ring_bo, FD_RELOC_READ);
-				cmds[i].submit_offset = msm_ring->offset;
-				cmds[i].size = msm_ring->u.cmds[j]->size;
-				cmds[i].pad = 0;
-				cmds[i].nr_relocs = msm_ring->u.cmds[j]->nr_relocs;
-				cmds[i].relocs = VOID2U64(msm_ring->u.cmds[j]->relocs);
-
-				i++;
-			}
-		}
-	}
-
-	if (in_fence_fd != -1) {
-		req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
-		req.fence_fd = in_fence_fd;
-	}
-
-	if (out_fence_fd) {
-		req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
-	}
-
-	/* needs to be after get_cmd() as that could create bos/cmds table: */
-	req.bos = VOID2U64(msm_submit->submit_bos),
-	req.nr_bos = msm_submit->nr_submit_bos;
-	req.cmds = VOID2U64(cmds),
-	req.nr_cmds = nr_cmds;
-
-	DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
-
-	ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT,
-			&req, sizeof(req));
-	if (ret) {
-		ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
-		msm_dump_submit(&req);
-	} else if (!ret) {
-		if (out_fence)
-			*out_fence = req.fence;
-
-		if (out_fence_fd)
-			*out_fence_fd = req.fence_fd;
-	}
-
-	for (unsigned o = 0; o < nr_objs; o++)
-		free(obj_relocs[o]);
-
-	return ret;
-}
-
-static void
-unref_rings(struct set_entry *entry)
-{
-	struct fd_ringbuffer *ring = (void *)entry->key;
-	fd_ringbuffer_del(ring);
-}
-
-static void
-msm_submit_destroy(struct fd_submit *submit)
-{
-	struct msm_submit *msm_submit = to_msm_submit(submit);
-
-	if (msm_submit->primary)
-		fd_ringbuffer_del(msm_submit->primary);
-	if (msm_submit->suballoc_ring)
-		fd_ringbuffer_del(msm_submit->suballoc_ring);
-
-	_mesa_hash_table_destroy(msm_submit->bo_table, NULL);
-	_mesa_set_destroy(msm_submit->ring_set, unref_rings);
-
-	// TODO it would be nice to have a way to debug_assert() if all
-	// rb's haven't been free'd back to the slab, because that is
-	// an indication that we are leaking bo's
-	slab_destroy(&msm_submit->ring_pool);
-
-	for (unsigned i = 0; i < msm_submit->nr_bos; i++)
-		fd_bo_del(msm_submit->bos[i]);
-
-	free(msm_submit->submit_bos);
-	free(msm_submit->bos);
-	free(msm_submit);
-}
-
-static const struct fd_submit_funcs submit_funcs = {
-		.new_ringbuffer = msm_submit_new_ringbuffer,
-		.flush = msm_submit_flush,
-		.destroy = msm_submit_destroy,
-};
-
-struct fd_submit *
-msm_submit_new(struct fd_pipe *pipe)
-{
-	struct msm_submit *msm_submit = calloc(1, sizeof(*msm_submit));
-	struct fd_submit *submit;
-	static unsigned submit_cnt = 0;
-
-	msm_submit->seqno = ++submit_cnt;
-	msm_submit->bo_table = _mesa_hash_table_create(NULL,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	msm_submit->ring_set = _mesa_set_create(NULL,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	// TODO tune size:
-	slab_create(&msm_submit->ring_pool, sizeof(struct msm_ringbuffer), 16);
-
-	submit = &msm_submit->base;
-	submit->pipe = pipe;
-	submit->funcs = &submit_funcs;
-
-	return submit;
-}
-
-
-static void
-finalize_current_cmd(struct fd_ringbuffer *ring)
-{
-	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-
-	debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
-
-	if (!msm_ring->cmd)
-		return;
-
-	debug_assert(msm_ring->cmd->ring_bo == msm_ring->ring_bo);
-
-	unsigned idx = APPEND(&msm_ring->u, cmds);
-
-	msm_ring->u.cmds[idx] = msm_ring->cmd;
-	msm_ring->cmd = NULL;
-
-	msm_ring->u.cmds[idx]->size = offset_bytes(ring->cur, ring->start);
-}
-
-static void
-msm_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t size)
-{
-	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-	struct fd_pipe *pipe = msm_ring->u.submit->pipe;
-
-	debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE);
-
-	finalize_current_cmd(ring);
-
-	fd_bo_del(msm_ring->ring_bo);
-	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
-	msm_ring->cmd = cmd_new(msm_ring->ring_bo);
-
-	ring->start = fd_bo_map(msm_ring->ring_bo);
-	ring->end = &(ring->start[size/4]);
-	ring->cur = ring->start;
-	ring->size = size;
-}
-
-static void
-msm_ringbuffer_emit_reloc(struct fd_ringbuffer *ring,
-		const struct fd_reloc *reloc)
-{
-	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-	struct fd_pipe *pipe;
-	unsigned reloc_idx;
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		unsigned idx = APPEND(&msm_ring->u, reloc_bos);
-
-		msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo);
-		msm_ring->u.reloc_bos[idx].flags = reloc->flags;
-
-		/* this gets fixed up at submit->flush() time, since this state-
-		 * object rb can be used with many different submits
-		 */
-		reloc_idx = idx;
-
-		pipe = msm_ring->u.pipe;
-	} else {
-		struct msm_submit *msm_submit =
-				to_msm_submit(msm_ring->u.submit);
-
-		reloc_idx = append_bo(msm_submit, reloc->bo, reloc->flags);
-
-		pipe = msm_ring->u.submit->pipe;
-	}
-
-	struct drm_msm_gem_submit_reloc *r;
-	unsigned idx = APPEND(msm_ring->cmd, relocs);
-
-	r = &msm_ring->cmd->relocs[idx];
-
-	r->reloc_idx = reloc_idx;
-	r->reloc_offset = reloc->offset;
-	r->or = reloc->or;
-	r->shift = reloc->shift;
-	r->submit_offset = offset_bytes(ring->cur, ring->start) +
-			msm_ring->offset;
-
-	ring->cur++;
-
-	if (pipe->gpu_id >= 500) {
-		idx = APPEND(msm_ring->cmd, relocs);
-		r = &msm_ring->cmd->relocs[idx];
-
-		r->reloc_idx = reloc_idx;
-		r->reloc_offset = reloc->offset;
-		r->or = reloc->orhi;
-		r->shift = reloc->shift - 32;
-		r->submit_offset = offset_bytes(ring->cur, ring->start) +
-				msm_ring->offset;
-
-		ring->cur++;
-	}
-}
-
-static void
-append_stateobj_rings(struct msm_submit *submit, struct fd_ringbuffer *target)
-{
-	struct msm_ringbuffer *msm_target = to_msm_ringbuffer(target);
-
-	debug_assert(target->flags & _FD_RINGBUFFER_OBJECT);
-
-	set_foreach(msm_target->u.ring_set, entry) {
-		struct fd_ringbuffer *ring = (void *)entry->key;
-
-		append_ring(submit->ring_set, ring);
-
-		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-			append_stateobj_rings(submit, ring);
-		}
-	}
-}
-
-static uint32_t
-msm_ringbuffer_emit_reloc_ring(struct fd_ringbuffer *ring,
-		struct fd_ringbuffer *target, uint32_t cmd_idx)
-{
-	struct msm_ringbuffer *msm_target = to_msm_ringbuffer(target);
-	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-	struct fd_bo *bo;
-	uint32_t size;
-
-	if ((target->flags & FD_RINGBUFFER_GROWABLE) &&
-			(cmd_idx < msm_target->u.nr_cmds)) {
-		bo   = msm_target->u.cmds[cmd_idx]->ring_bo;
-		size = msm_target->u.cmds[cmd_idx]->size;
-	} else {
-		bo   = msm_target->ring_bo;
-		size = offset_bytes(target->cur, target->start);
-	}
-
-	msm_ringbuffer_emit_reloc(ring, &(struct fd_reloc){
-		.bo     = bo,
-		.flags  = FD_RELOC_READ,
-		.offset = msm_target->offset,
-	});
-
-	if ((target->flags & _FD_RINGBUFFER_OBJECT) &&
-			!(ring->flags & _FD_RINGBUFFER_OBJECT)) {
-		struct msm_submit *msm_submit = to_msm_submit(msm_ring->u.submit);
-
-		append_stateobj_rings(msm_submit, target);
-	}
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		append_ring(msm_ring->u.ring_set, target);
-	} else {
-		struct msm_submit *msm_submit = to_msm_submit(msm_ring->u.submit);
-		append_ring(msm_submit->ring_set, target);
-	}
-
-	return size;
-}
-
-static uint32_t
-msm_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
-{
-	if (ring->flags & FD_RINGBUFFER_GROWABLE)
-		return to_msm_ringbuffer(ring)->u.nr_cmds + 1;
-	return 1;
-}
-
-static void
-msm_ringbuffer_destroy(struct fd_ringbuffer *ring)
-{
-	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
-
-	fd_bo_del(msm_ring->ring_bo);
-	if (msm_ring->cmd)
-		cmd_free(msm_ring->cmd);
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++) {
-			fd_bo_del(msm_ring->u.reloc_bos[i].bo);
-		}
-
-		_mesa_set_destroy(msm_ring->u.ring_set, unref_rings);
-
-		free(msm_ring->u.reloc_bos);
-		free(msm_ring);
-	} else {
-		struct fd_submit *submit = msm_ring->u.submit;
-
-		for (unsigned i = 0; i < msm_ring->u.nr_cmds; i++) {
-			cmd_free(msm_ring->u.cmds[i]);
-		}
-
-		free(msm_ring->u.cmds);
-		slab_free_st(&to_msm_submit(submit)->ring_pool, msm_ring);
-	}
-}
-
-static const struct fd_ringbuffer_funcs ring_funcs = {
-		.grow = msm_ringbuffer_grow,
-		.emit_reloc = msm_ringbuffer_emit_reloc,
-		.emit_reloc_ring = msm_ringbuffer_emit_reloc_ring,
-		.cmd_count = msm_ringbuffer_cmd_count,
-		.destroy = msm_ringbuffer_destroy,
-};
-
-static inline struct fd_ringbuffer *
-msm_ringbuffer_init(struct msm_ringbuffer *msm_ring, uint32_t size,
-		enum fd_ringbuffer_flags flags)
-{
-	struct fd_ringbuffer *ring = &msm_ring->base;
-
-	debug_assert(msm_ring->ring_bo);
-
-	uint8_t *base = fd_bo_map(msm_ring->ring_bo);
-	ring->start = (void *)(base + msm_ring->offset);
-	ring->end = &(ring->start[size/4]);
-	ring->cur = ring->start;
-
-	ring->size = size;
-	ring->flags = flags;
-
-	ring->funcs = &ring_funcs;
-
-	msm_ring->u.cmds = NULL;
-	msm_ring->u.nr_cmds = msm_ring->u.max_cmds = 0;
-
-	msm_ring->cmd = cmd_new(msm_ring->ring_bo);
-
-	return ring;
-}
-
-struct fd_ringbuffer *
-msm_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size)
-{
-	struct msm_ringbuffer *msm_ring = malloc(sizeof(*msm_ring));
-
-	msm_ring->u.pipe = pipe;
-	msm_ring->offset = 0;
-	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
-	msm_ring->base.refcnt = 1;
-
-	msm_ring->u.reloc_bos = NULL;
-	msm_ring->u.nr_reloc_bos = msm_ring->u.max_reloc_bos = 0;
-
-	msm_ring->u.ring_set = _mesa_set_create(NULL,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-
-	return msm_ringbuffer_init(msm_ring, size, _FD_RINGBUFFER_OBJECT);
-}
diff --git a/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.c b/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.c
deleted file mode 100644
index 2b8f53172..000000000
--- a/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
- * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <assert.h>
-#include <inttypes.h>
-
-#include "util/hash_table.h"
-#include "util/slab.h"
-
-#include "drm/freedreno_ringbuffer.h"
-#include "msm_priv.h"
-
-/* A "softpin" implementation of submit/ringbuffer, which lowers CPU overhead
- * by avoiding the additional tracking necessary to build cmds/relocs tables
- * (but still builds a bos table)
- */
-
-
-#define INIT_SIZE 0x1000
-
-static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;
-
-
-struct msm_submit_sp {
-	struct fd_submit base;
-
-	DECLARE_ARRAY(struct drm_msm_gem_submit_bo, submit_bos);
-	DECLARE_ARRAY(struct fd_bo *, bos);
-
-	unsigned seqno;
-
-	/* maps fd_bo to idx in bos table: */
-	struct hash_table *bo_table;
-
-	struct slab_mempool ring_pool;
-
-	struct fd_ringbuffer *primary;
-
-	/* Allow for sub-allocation of stateobj ring buffers (ie. sharing
-	 * the same underlying bo)..
-	 *
-	 * We also rely on previous stateobj having been fully constructed
-	 * so we can reclaim extra space at it's end.
-	 */
-	struct fd_ringbuffer *suballoc_ring;
-};
-FD_DEFINE_CAST(fd_submit, msm_submit_sp);
-
-/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers
- * and sizes.  Ie. a finalized buffer can have no more commands appended to
- * it.
- */
-struct msm_cmd_sp {
-	struct fd_bo *ring_bo;
-	unsigned size;
-};
-
-/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to
- * later copy into the submit when the stateobj rb is later referenced by
- * a regular rb:
- */
-struct msm_reloc_bo_sp {
-	struct fd_bo *bo;
-	unsigned flags;
-};
-
-struct msm_ringbuffer_sp {
-	struct fd_ringbuffer base;
-
-	/* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */
-	unsigned offset;
-
-// TODO check disasm.. hopefully compilers CSE can realize that
-// reloc_bos and cmds are at the same offsets and optimize some
-// divergent cases into single case
-	union {
-		/* for _FD_RINGBUFFER_OBJECT case: */
-		struct {
-			struct fd_pipe *pipe;
-			DECLARE_ARRAY(struct msm_reloc_bo_sp, reloc_bos);
-		};
-		/* for other cases: */
-		struct {
-			struct fd_submit *submit;
-			DECLARE_ARRAY(struct msm_cmd_sp, cmds);
-		};
-	} u;
-
-	struct fd_bo *ring_bo;
-};
-FD_DEFINE_CAST(fd_ringbuffer, msm_ringbuffer_sp);
-
-static void finalize_current_cmd(struct fd_ringbuffer *ring);
-static struct fd_ringbuffer * msm_ringbuffer_sp_init(
-		struct msm_ringbuffer_sp *msm_ring,
-		uint32_t size, enum fd_ringbuffer_flags flags);
-
-/* add (if needed) bo to submit and return index: */
-static uint32_t
-append_bo(struct msm_submit_sp *submit, struct fd_bo *bo, uint32_t flags)
-{
-	struct msm_bo *msm_bo = to_msm_bo(bo);
-	uint32_t idx;
-	pthread_mutex_lock(&idx_lock);
-	if (likely(msm_bo->current_submit_seqno == submit->seqno)) {
-		idx = msm_bo->idx;
-	} else {
-		uint32_t hash = _mesa_hash_pointer(bo);
-		struct hash_entry *entry;
-
-		entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo);
-		if (entry) {
-			/* found */
-			idx = (uint32_t)(uintptr_t)entry->data;
-		} else {
-			idx = APPEND(submit, submit_bos);
-			idx = APPEND(submit, bos);
-
-			submit->submit_bos[idx].flags = 0;
-			submit->submit_bos[idx].handle = bo->handle;
-			submit->submit_bos[idx].presumed = 0;
-
-			submit->bos[idx] = fd_bo_ref(bo);
-
-			_mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo,
-					(void *)(uintptr_t)idx);
-		}
-		msm_bo->current_submit_seqno = submit->seqno;
-		msm_bo->idx = idx;
-	}
-	pthread_mutex_unlock(&idx_lock);
-	if (flags & FD_RELOC_READ)
-		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_READ;
-	if (flags & FD_RELOC_WRITE)
-		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_WRITE;
-	if (flags & FD_RELOC_DUMP)
-		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_DUMP;
-	return idx;
-}
-
-static void
-msm_submit_suballoc_ring_bo(struct fd_submit *submit,
-		struct msm_ringbuffer_sp *msm_ring, uint32_t size)
-{
-	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
-	unsigned suballoc_offset = 0;
-	struct fd_bo *suballoc_bo = NULL;
-
-	if (msm_submit->suballoc_ring) {
-		struct msm_ringbuffer_sp *suballoc_ring =
-				to_msm_ringbuffer_sp(msm_submit->suballoc_ring);
-
-		suballoc_bo = suballoc_ring->ring_bo;
-		suballoc_offset = fd_ringbuffer_size(msm_submit->suballoc_ring) +
-				suballoc_ring->offset;
-
-		suballoc_offset = align(suballoc_offset, 0x10);
-
-		if ((size + suballoc_offset) > suballoc_bo->size) {
-			suballoc_bo = NULL;
-		}
-	}
-
-	if (!suballoc_bo) {
-		// TODO possibly larger size for streaming bo?
-		msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev,
-				0x8000, DRM_FREEDRENO_GEM_GPUREADONLY);
-		msm_ring->offset = 0;
-	} else {
-		msm_ring->ring_bo = fd_bo_ref(suballoc_bo);
-		msm_ring->offset = suballoc_offset;
-	}
-
-	struct fd_ringbuffer *old_suballoc_ring = msm_submit->suballoc_ring;
-
-	msm_submit->suballoc_ring = fd_ringbuffer_ref(&msm_ring->base);
-
-	if (old_suballoc_ring)
-		fd_ringbuffer_del(old_suballoc_ring);
-}
-
-static struct fd_ringbuffer *
-msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size,
-		enum fd_ringbuffer_flags flags)
-{
-	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
-	struct msm_ringbuffer_sp *msm_ring;
-
-	msm_ring = slab_alloc_st(&msm_submit->ring_pool);
-
-	msm_ring->u.submit = submit;
-
-	/* NOTE: needs to be before _suballoc_ring_bo() since it could
-	 * increment the refcnt of the current ring
-	 */
-	msm_ring->base.refcnt = 1;
-
-	if (flags & FD_RINGBUFFER_STREAMING) {
-		msm_submit_suballoc_ring_bo(submit, msm_ring, size);
-	} else {
-		if (flags & FD_RINGBUFFER_GROWABLE)
-			size = INIT_SIZE;
-
-		msm_ring->offset = 0;
-		msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size,
-				DRM_FREEDRENO_GEM_GPUREADONLY);
-	}
-
-	if (!msm_ringbuffer_sp_init(msm_ring, size, flags))
-		return NULL;
-
-	if (flags & FD_RINGBUFFER_PRIMARY) {
-		debug_assert(!msm_submit->primary);
-		msm_submit->primary = fd_ringbuffer_ref(&msm_ring->base);
-	}
-
-	return &msm_ring->base;
-}
-
-static int
-msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
-		int *out_fence_fd, uint32_t *out_fence)
-{
-	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
-	struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
-	struct drm_msm_gem_submit req = {
-			.flags = msm_pipe->pipe,
-			.queueid = msm_pipe->queue_id,
-	};
-	int ret;
-
-	debug_assert(msm_submit->primary);
-	finalize_current_cmd(msm_submit->primary);
-
-	struct msm_ringbuffer_sp *primary = to_msm_ringbuffer_sp(msm_submit->primary);
-	struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds];
-
-	for (unsigned i = 0; i < primary->u.nr_cmds; i++) {
-		cmds[i].type = MSM_SUBMIT_CMD_BUF;
-		cmds[i].submit_idx = append_bo(msm_submit,
-				primary->u.cmds[i].ring_bo, FD_RELOC_READ | FD_RELOC_DUMP);
-		cmds[i].submit_offset = primary->offset;
-		cmds[i].size = primary->u.cmds[i].size;
-		cmds[i].pad = 0;
-		cmds[i].nr_relocs = 0;
-	}
-
-	if (in_fence_fd != -1) {
-		req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
-		req.fence_fd = in_fence_fd;
-	}
-
-	if (out_fence_fd) {
-		req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
-	}
-
-	/* needs to be after get_cmd() as that could create bos/cmds table: */
-	req.bos = VOID2U64(msm_submit->submit_bos),
-	req.nr_bos = msm_submit->nr_submit_bos;
-	req.cmds = VOID2U64(cmds),
-	req.nr_cmds = primary->u.nr_cmds;
-
-	DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
-
-	ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT,
-			&req, sizeof(req));
-	if (ret) {
-		ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
-		msm_dump_submit(&req);
-	} else if (!ret) {
-		if (out_fence)
-			*out_fence = req.fence;
-
-		if (out_fence_fd)
-			*out_fence_fd = req.fence_fd;
-	}
-
-	return ret;
-}
-
-static void
-msm_submit_sp_destroy(struct fd_submit *submit)
-{
-	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
-
-	if (msm_submit->primary)
-		fd_ringbuffer_del(msm_submit->primary);
-	if (msm_submit->suballoc_ring)
-		fd_ringbuffer_del(msm_submit->suballoc_ring);
-
-	_mesa_hash_table_destroy(msm_submit->bo_table, NULL);
-
-	// TODO it would be nice to have a way to debug_assert() if all
-	// rb's haven't been free'd back to the slab, because that is
-	// an indication that we are leaking bo's
-	slab_destroy(&msm_submit->ring_pool);
-
-	for (unsigned i = 0; i < msm_submit->nr_bos; i++)
-		fd_bo_del(msm_submit->bos[i]);
-
-	free(msm_submit->submit_bos);
-	free(msm_submit->bos);
-	free(msm_submit);
-}
-
-static const struct fd_submit_funcs submit_funcs = {
-		.new_ringbuffer = msm_submit_sp_new_ringbuffer,
-		.flush = msm_submit_sp_flush,
-		.destroy = msm_submit_sp_destroy,
-};
-
-struct fd_submit *
-msm_submit_sp_new(struct fd_pipe *pipe)
-{
-	struct msm_submit_sp *msm_submit = calloc(1, sizeof(*msm_submit));
-	struct fd_submit *submit;
-	static unsigned submit_cnt = 0;
-
-	msm_submit->seqno = ++submit_cnt;
-	msm_submit->bo_table = _mesa_hash_table_create(NULL,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	// TODO tune size:
-	slab_create(&msm_submit->ring_pool, sizeof(struct msm_ringbuffer_sp), 16);
-
-	submit = &msm_submit->base;
-	submit->pipe = pipe;
-	submit->funcs = &submit_funcs;
-
-	return submit;
-}
-
-
-static void
-finalize_current_cmd(struct fd_ringbuffer *ring)
-{
-	debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
-
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-	unsigned idx = APPEND(&msm_ring->u, cmds);
-
-	msm_ring->u.cmds[idx].ring_bo = fd_bo_ref(msm_ring->ring_bo);
-	msm_ring->u.cmds[idx].size = offset_bytes(ring->cur, ring->start);
-}
-
-static void
-msm_ringbuffer_sp_grow(struct fd_ringbuffer *ring, uint32_t size)
-{
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-	struct fd_pipe *pipe = msm_ring->u.submit->pipe;
-
-	debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE);
-
-	finalize_current_cmd(ring);
-
-	fd_bo_del(msm_ring->ring_bo);
-	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size,
-			DRM_FREEDRENO_GEM_GPUREADONLY);
-
-	ring->start = fd_bo_map(msm_ring->ring_bo);
-	ring->end = &(ring->start[size/4]);
-	ring->cur = ring->start;
-	ring->size = size;
-}
-
-static void
-msm_ringbuffer_sp_emit_reloc(struct fd_ringbuffer *ring,
-		const struct fd_reloc *reloc)
-{
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-	struct fd_pipe *pipe;
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		unsigned idx = APPEND(&msm_ring->u, reloc_bos);
-
-		msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo);
-		msm_ring->u.reloc_bos[idx].flags = reloc->flags;
-
-		pipe = msm_ring->u.pipe;
-	} else {
-		struct msm_submit_sp *msm_submit =
-				to_msm_submit_sp(msm_ring->u.submit);
-
-		append_bo(msm_submit, reloc->bo, reloc->flags);
-
-		pipe = msm_ring->u.submit->pipe;
-	}
-
-	uint64_t iova = fd_bo_get_iova(reloc->bo) + reloc->offset;
-	uint32_t dword = iova;
-	int shift = reloc->shift;
-
-	if (shift < 0)
-		dword >>= -shift;
-	else
-		dword <<= shift;
-
-	(*ring->cur++) = dword | reloc->or;
-
-	if (pipe->gpu_id >= 500) {
-		dword = iova >> 32;
-		shift -= 32;
-
-		if (shift < 0)
-			dword >>= -shift;
-		else
-			dword <<= shift;
-
-		(*ring->cur++) = dword | reloc->orhi;
-	}
-}
-
-static uint32_t
-msm_ringbuffer_sp_emit_reloc_ring(struct fd_ringbuffer *ring,
-		struct fd_ringbuffer *target, uint32_t cmd_idx)
-{
-	struct msm_ringbuffer_sp *msm_target = to_msm_ringbuffer_sp(target);
-	struct fd_bo *bo;
-	uint32_t size;
-
-	if ((target->flags & FD_RINGBUFFER_GROWABLE) &&
-			(cmd_idx < msm_target->u.nr_cmds)) {
-		bo   = msm_target->u.cmds[cmd_idx].ring_bo;
-		size = msm_target->u.cmds[cmd_idx].size;
-	} else {
-		bo   = msm_target->ring_bo;
-		size = offset_bytes(target->cur, target->start);
-	}
-
-	msm_ringbuffer_sp_emit_reloc(ring, &(struct fd_reloc){
-		.bo     = bo,
-		.flags  = FD_RELOC_READ | FD_RELOC_DUMP,
-		.offset = msm_target->offset,
-	});
-
-	if (!(target->flags & _FD_RINGBUFFER_OBJECT))
-		return size;
-
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) {
-			unsigned idx = APPEND(&msm_ring->u, reloc_bos);
-
-			msm_ring->u.reloc_bos[idx].bo =
-				fd_bo_ref(msm_target->u.reloc_bos[i].bo);
-			msm_ring->u.reloc_bos[idx].flags =
-				msm_target->u.reloc_bos[i].flags;
-		}
-	} else {
-		// TODO it would be nice to know whether we have already
-		// seen this target before.  But hopefully we hit the
-		// append_bo() fast path enough for this to not matter:
-		struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit);
-
-		for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) {
-			append_bo(msm_submit, msm_target->u.reloc_bos[i].bo,
-					msm_target->u.reloc_bos[i].flags);
-		}
-	}
-
-	return size;
-}
-
-static uint32_t
-msm_ringbuffer_sp_cmd_count(struct fd_ringbuffer *ring)
-{
-	if (ring->flags & FD_RINGBUFFER_GROWABLE)
-		return to_msm_ringbuffer_sp(ring)->u.nr_cmds + 1;
-	return 1;
-}
-
-static void
-msm_ringbuffer_sp_destroy(struct fd_ringbuffer *ring)
-{
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-
-	fd_bo_del(msm_ring->ring_bo);
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++) {
-			fd_bo_del(msm_ring->u.reloc_bos[i].bo);
-		}
-
-		free(msm_ring);
-	} else {
-		struct fd_submit *submit = msm_ring->u.submit;
-
-		for (unsigned i = 0; i < msm_ring->u.nr_cmds; i++) {
-			fd_bo_del(msm_ring->u.cmds[i].ring_bo);
-		}
-
-		slab_free_st(&to_msm_submit_sp(submit)->ring_pool, msm_ring);
-	}
-}
-
-static const struct fd_ringbuffer_funcs ring_funcs = {
-		.grow = msm_ringbuffer_sp_grow,
-		.emit_reloc = msm_ringbuffer_sp_emit_reloc,
-		.emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring,
-		.cmd_count = msm_ringbuffer_sp_cmd_count,
-		.destroy = msm_ringbuffer_sp_destroy,
-};
-
-static inline struct fd_ringbuffer *
-msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size,
-		enum fd_ringbuffer_flags flags)
-{
-	struct fd_ringbuffer *ring = &msm_ring->base;
-
-	debug_assert(msm_ring->ring_bo);
-
-	uint8_t *base = fd_bo_map(msm_ring->ring_bo);
-	ring->start = (void *)(base + msm_ring->offset);
-	ring->end = &(ring->start[size/4]);
-	ring->cur = ring->start;
-
-	ring->size = size;
-	ring->flags = flags;
-
-	ring->funcs = &ring_funcs;
-
-	// TODO initializing these could probably be conditional on flags
-	// since unneed for FD_RINGBUFFER_STAGING case..
-	msm_ring->u.cmds = NULL;
-	msm_ring->u.nr_cmds = msm_ring->u.max_cmds = 0;
-
-	msm_ring->u.reloc_bos = NULL;
-	msm_ring->u.nr_reloc_bos = msm_ring->u.max_reloc_bos = 0;
-
-	return ring;
-}
-
-struct fd_ringbuffer *
-msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size)
-{
-	struct msm_ringbuffer_sp *msm_ring = malloc(sizeof(*msm_ring));
-
-	msm_ring->u.pipe = pipe;
-	msm_ring->offset = 0;
-	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size,
-			DRM_FREEDRENO_GEM_GPUREADONLY);
-	msm_ring->base.refcnt = 1;
-
-	return msm_ringbuffer_sp_init(msm_ring, size, _FD_RINGBUFFER_OBJECT);
-}
diff --git a/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.h b/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.h
deleted file mode 100644
index 79dba3b84..000000000
--- a/lib/mesa/src/freedreno/drm/msm_ringbuffer_sp.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright © 2021 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifdef X
-#  undef X
-#endif
-
-#if PTRSZ == 32
-#  define X(n) n ## _32
-#else
-#  define X(n) n ## _64
-#endif
-
-
-static void
-X(emit_reloc_common)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc)
-{
-	(*ring->cur++) = (uint32_t)reloc->iova;
-#if PTRSZ == 64
-	(*ring->cur++) = (uint32_t)(reloc->iova >> 32);
-#endif
-}
-
-static void
-X(msm_ringbuffer_sp_emit_reloc_nonobj)(struct fd_ringbuffer *ring,
-		const struct fd_reloc *reloc)
-{
-	X(emit_reloc_common)(ring, reloc);
-
-	assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
-
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-
-	struct msm_submit_sp *msm_submit =
-			to_msm_submit_sp(msm_ring->u.submit);
-
-	msm_submit_append_bo(msm_submit, reloc->bo);
-}
-
-static void
-X(msm_ringbuffer_sp_emit_reloc_obj)(struct fd_ringbuffer *ring,
-		const struct fd_reloc *reloc)
-{
-	X(emit_reloc_common)(ring, reloc);
-
-	assert(ring->flags & _FD_RINGBUFFER_OBJECT);
-
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-
-	/* Avoid emitting duplicate BO references into the list.  Ringbuffer
-	 * objects are long-lived, so this saves ongoing work at draw time in
-	 * exchange for a bit at context setup/first draw.  And the number of
-	 * relocs per ringbuffer object is fairly small, so the O(n^2) doesn't
-	 * hurt much.
-	 */
-	bool found = false;
-	for (int i = 0; i < msm_ring->u.nr_reloc_bos; i++) {
-		if (msm_ring->u.reloc_bos[i] == reloc->bo) {
-			found = true;
-			break;
-		}
-	}
-	if (!found) {
-		APPEND(&msm_ring->u, reloc_bos, fd_bo_ref(reloc->bo));
-	}
-}
-
-static uint32_t
-X(msm_ringbuffer_sp_emit_reloc_ring)(struct fd_ringbuffer *ring,
-		struct fd_ringbuffer *target, uint32_t cmd_idx)
-{
-	struct msm_ringbuffer_sp *msm_target = to_msm_ringbuffer_sp(target);
-	struct fd_bo *bo;
-	uint32_t size;
-
-	if ((target->flags & FD_RINGBUFFER_GROWABLE) &&
-			(cmd_idx < msm_target->u.nr_cmds)) {
-		bo   = msm_target->u.cmds[cmd_idx].ring_bo;
-		size = msm_target->u.cmds[cmd_idx].size;
-	} else {
-		bo   = msm_target->ring_bo;
-		size = offset_bytes(target->cur, target->start);
-	}
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		X(msm_ringbuffer_sp_emit_reloc_obj)(ring, &(struct fd_reloc){
-			.bo     = bo,
-			.iova   = bo->iova + msm_target->offset,
-			.offset = msm_target->offset,
-		});
-	} else {
-		X(msm_ringbuffer_sp_emit_reloc_nonobj)(ring, &(struct fd_reloc){
-			.bo     = bo,
-			.iova   = bo->iova + msm_target->offset,
-			.offset = msm_target->offset,
-		});
-	}
-
-	if (!(target->flags & _FD_RINGBUFFER_OBJECT))
-		return size;
-
-	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
-
-	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
-		for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) {
-			APPEND(&msm_ring->u, reloc_bos, fd_bo_ref(msm_target->u.reloc_bos[i]));
-		}
-	} else {
-		// TODO it would be nice to know whether we have already
-		// seen this target before.  But hopefully we hit the
-		// append_bo() fast path enough for this to not matter:
-		struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit);
-
-		for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) {
-			msm_submit_append_bo(msm_submit, msm_target->u.reloc_bos[i]);
-		}
-	}
-
-	return size;
-}
diff --git a/lib/mesa/src/freedreno/fdl/fd6_layout_test.c b/lib/mesa/src/freedreno/fdl/fd6_layout_test.c
index 91639843d..f4eda1135 100644
--- a/lib/mesa/src/freedreno/fdl/fd6_layout_test.c
+++ b/lib/mesa/src/freedreno/fdl/fd6_layout_test.c
@@ -683,6 +683,243 @@ static const struct testcase
                      },
                },
          },
+
+         /* Easy 32x32x32 3d case */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 32,
+                  .height0 = 32,
+                  .depth0 = 32,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 256, .size0 = 8192},
+                        {.offset = 262144, .pitch = 256, .size0 = 4096},
+                        {.offset = 327680, .pitch = 256, .size0 = 4096},
+                        {.offset = 360448, .pitch = 256, .size0 = 4096},
+                        {.offset = 376832, .pitch = 256, .size0 = 4096},
+                        {.offset = 385024, .pitch = 256},
+                     },
+               },
+         },
+
+         /* Scale up a bit to 128x128x32 3d */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 128,
+                  .height0 = 128,
+                  .depth0 = 32,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 512, .size0 = 65536},
+                        {.offset = 2097152, .pitch = 256, .size0 = 16384},
+                        {.offset = 2359296, .pitch = 256, .size0 = 8192},
+                        {.offset = 2424832, .pitch = 256, .size0 = 8192},
+                        {.offset = 2457600, .pitch = 256, .size0 = 8192},
+                        {.offset = 2473984, .pitch = 256},
+                        {.offset = 2482176, .pitch = 256},
+                        {.offset = 2490368, .pitch = 256},
+                     },
+               },
+         },
+
+         /* Changing width to 1 changes where minimum layer size happens. */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_LINEAR,
+                  .ubwc = false,
+                  .width0 = 1,
+                  .height0 = 128,
+                  .depth0 = 32,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 256, .size0 = 32768},
+                        {.offset = 1048576, .pitch = 256, .size0 = 16384},
+                        {.offset = 1310720, .pitch = 256, .size0 = 16384},
+                        {.offset = 1441792, .pitch = 256, .size0 = 16384},
+                        {.offset = 1507328, .pitch = 256, .size0 = 16384},
+                        {.offset = 1540096, .pitch = 256},
+                        {.offset = 1556480, .pitch = 256},
+                        {.offset = 1572864, .pitch = 256},
+                     },
+               },
+         },
+
+         /* And increasing width makes it happen later. */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 1024,
+                  .height0 = 128,
+                  .depth0 = 32,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 4096, .size0 = 524288},
+                        {.offset = 16777216, .pitch = 2048, .size0 = 131072},
+                        {.offset = 18874368, .pitch = 1024, .size0 = 32768},
+                        {.offset = 19136512, .pitch = 512, .size0 = 8192},
+                        {.offset = 19169280, .pitch = 256, .size0 = 4096},
+                        {.offset = 19177472, .pitch = 256},
+                        {.offset = 19181568, .pitch = 256},
+                        {.offset = 19185664, .pitch = 256},
+                        {.offset = 19189760, .pitch = 256},
+                        {.offset = 19193856, .pitch = 256},
+                        {.offset = 19197952, .pitch = 256},
+                     },
+               },
+         },
+
+         /* NPOT height case that piglit was catching 3d texture failure in, we
+          * use a higher depth though to get more slice pitches detected from
+          * the blob.
+          */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 128,
+                  .height0 = 129,
+                  .depth0 = 16,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 512, .size0 = 73728},
+                        {.offset = 1179648, .pitch = 256, .size0 = 20480},
+                        {.offset = 1343488, .pitch = 256, .size0 = 20480},
+                        {.offset = 1425408, .pitch = 256, .size0 = 20480},
+                        {.offset = 1466368, .pitch = 256},
+                        {.offset = 1486848, .pitch = 256},
+                        {.offset = 1507328, .pitch = 256},
+                        {.offset = 1527808, .pitch = 256},
+                     },
+               },
+         },
+
+         /* NPOT height case that my first 3d layout ideas failed on. */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 128,
+                  .height0 = 132,
+                  .depth0 = 16,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 512, .size0 = 73728},
+                        {.offset = 1179648, .pitch = 256, .size0 = 20480},
+                        {.offset = 1343488, .pitch = 256, .size0 = 20480},
+                        {.offset = 1425408, .pitch = 256, .size0 = 20480},
+                        {.offset = 1466368, .pitch = 256},
+                        {.offset = 1486848, .pitch = 256},
+                        {.offset = 1507328, .pitch = 256},
+                        {.offset = 1527808, .pitch = 256},
+                     },
+               },
+         },
+
+         /* blob used MIN_LAYERSZ = 0x3000 here.
+          *
+          * This is an interesting case for 3d layout, since pitch stays NPOT for a while.
+          */
+         {
+            .format = PIPE_FORMAT_R9G9B9E5_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 768,
+                  .height0 = 32,
+                  .depth0 = 128,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 3072, .size0 = 98304},
+                        {.offset = 12582912, .pitch = 1536, .size0 = 24576},
+                        {.offset = 14155776, .pitch = 768, .size0 = 12288},
+                        {.offset = 14548992, .pitch = 512, .size0 = 12288},
+                        {.offset = 14745600, .pitch = 256, .size0 = 12288},
+                        {.offset = 14843904, .pitch = 256, .size0 = 12288},
+                        {.offset = 14893056, .pitch = 256, .size0 = 12288},
+                        {.offset = 14917632, .pitch = 256},
+                        {.offset = 14929920, .pitch = 256},
+                        {.offset = 14942208, .pitch = 256},
+                     },
+               },
+         },
+
+         /* dEQP-GLES31.functional.copy_image.mixed.viewclass_128_bits_mixed.rgba32f_rg11_eac.texture3d_to_texture2d */
+#if 0 /* XXX: We disagree with the blob about level 0 size0, but the testcase passes. */
+         {
+            .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 129,
+                  .height0 = 129,
+                  .depth0 = 17,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 3072, .size0 = 524288},
+                        {.offset = 8912896, .pitch = 2048, .size0 = 131072},
+                        {.offset = 9961472, .pitch = 1024, .size0 = 32768},
+                        {.offset = 10092544, .pitch = 1024, .size0 = 16384},
+                        {.offset = 10125312, .pitch = 1024},
+                        {.offset = 10141696, .pitch = 1024},
+                        {.offset = 10158080, .pitch = 1024},
+                        {.offset = 10174464, .pitch = 1024},
+                     },
+               },
+         },
+#endif
+
+         /* Size minification issue found while looking at the above test. */
+         {
+            .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
+            .is_3d = true,
+            .layout =
+               {
+                  .tile_mode = TILE6_3,
+                  .ubwc = false,
+                  .width0 = 129,
+                  .height0 = 9,
+                  .depth0 = 8,
+                  .slices =
+                     {
+                        {.offset = 0, .pitch = 3072, .size0 = 49152},
+                        {.offset = 393216, .pitch = 2048, .size0 = 32768},
+                        {.offset = 524288, .pitch = 1024, .size0 = 32768},
+                        {.offset = 589824, .pitch = 1024},
+                        {.offset = 622592, .pitch = 1024},
+                        {.offset = 655360, .pitch = 1024},
+                        {.offset = 688128, .pitch = 1024},
+                        {.offset = 720896, .pitch = 1024},
+                     },
+               },
+         },
+
 };
 
 int
diff --git a/lib/mesa/src/freedreno/ir3/ir3_dce.c b/lib/mesa/src/freedreno/ir3/ir3_dce.c
index 76298e64a..a3ddbe802 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_dce.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_dce.c
@@ -53,8 +53,10 @@ instr_dce(struct ir3_instruction *instr, bool falsedep)
    if (ir3_instr_check_mark(instr))
       return;
 
-   if (writes_gpr(instr))
-      mark_array_use(instr, instr->dsts[0]); /* dst */
+   foreach_dst (dst, instr) {
+      if (is_dest_gpr(dst))
+         mark_array_use(instr, dst);
+   }
 
    foreach_src (reg, instr)
       mark_array_use(instr, reg); /* src */
diff --git a/lib/mesa/src/freedreno/ir3/ir3_delay.c b/lib/mesa/src/freedreno/ir3/ir3_delay.c
index 14bb403b9..054f4c831 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_delay.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_delay.c
@@ -30,19 +30,6 @@
  */
 #define MAX_NOPS 6
 
-/* The soft delay for approximating the cost of (ss). On a6xx, it takes the
- * number of delay slots to get a SFU result back (ie. using nop's instead of
- * (ss) is:
- *
- *     8 - single warp
- *     9 - two warps
- *    10 - four warps
- *
- * and so on. Not quite sure where it tapers out (ie. how many warps share an
- * SFU unit). But 10 seems like a reasonable # to choose:
- */
-#define SOFT_SS_NOPS 10
-
 /*
  * Helpers to figure out the necessary delay slots between instructions.  Used
  * both in scheduling pass(es) and the final pass to insert any required nop's
@@ -76,11 +63,11 @@ ir3_delayslots(struct ir3_instruction *assigner,
    if (writes_addr0(assigner) || writes_addr1(assigner))
       return 6;
 
-   if (soft && is_sfu(assigner))
-      return SOFT_SS_NOPS;
+   if (soft && is_ss_producer(assigner))
+      return soft_ss_delay(assigner);
 
    /* handled via sync flags: */
-   if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+   if (is_ss_producer(assigner) || is_sy_producer(assigner))
       return 0;
 
    /* As far as we know, shader outputs don't need any delay. */
@@ -89,7 +76,7 @@ ir3_delayslots(struct ir3_instruction *assigner,
 
    /* assigner must be alu: */
    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-       is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
+       is_mem(consumer)) {
       return 6;
    } else {
       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
@@ -119,74 +106,6 @@ count_instruction(struct ir3_instruction *n)
           (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
 }
 
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
-{
-   unsigned d = 0;
-
-   /* Note that this relies on incrementally building up the block's
-    * instruction list.. but this is how scheduling and nopsched
-    * work.
-    */
-   foreach_instr_rev (n, &block->instr_list) {
-      if ((n == instr) || (d >= maxd))
-         return MIN2(maxd, d + n->nop);
-      if (count_instruction(n))
-         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
-   }
-
-   return maxd;
-}
-
-static unsigned
-delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
-                      struct ir3_instruction *consumer, unsigned srcn)
-{
-   unsigned delay = 0;
-
-   if (assigner->opc == OPC_META_PHI)
-      return 0;
-
-   if (is_meta(assigner)) {
-      foreach_src_n (src, n, assigner) {
-         unsigned d;
-
-         if (!src->def)
-            continue;
-
-         d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
-         delay = MAX2(delay, d);
-      }
-   } else {
-      delay = ir3_delayslots(assigner, consumer, srcn, false);
-      delay -= distance(block, assigner, delay);
-   }
-
-   return delay;
-}
-
-/**
- * Calculate delay for instruction before register allocation, using SSA
- * source pointers. This can't handle inter-block dependencies.
- */
-unsigned
-ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
-{
-   unsigned delay = 0;
-
-   foreach_src_n (src, i, instr) {
-      unsigned d = 0;
-
-      if (src->def && src->def->instr->block == block) {
-         d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
-      }
-
-      delay = MAX2(delay, d);
-   }
-
-   return delay;
-}
-
 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
  * and have to handle relative accesses specially.
  */
@@ -207,35 +126,21 @@ post_ra_reg_num(struct ir3_register *reg)
    return reg->num;
 }
 
-static unsigned
-delay_calc_srcn_postra(struct ir3_instruction *assigner,
-                       struct ir3_instruction *consumer, unsigned assigner_n,
-                       unsigned consumer_n, bool soft, bool mergedregs)
+unsigned
+ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+                           struct ir3_instruction *consumer,
+                           unsigned assigner_n, unsigned consumer_n)
 {
+   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
+
    struct ir3_register *src = consumer->srcs[consumer_n];
    struct ir3_register *dst = assigner->dsts[assigner_n];
-   bool mismatched_half =
-      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
 
-   /* In the mergedregs case or when the register is a special register,
-    * half-registers do not alias with full registers.
-    */
-   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
-       mismatched_half)
-      return 0;
+   if (assigner->repeat == 0 && consumer->repeat == 0)
+      return delay;
 
    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
-   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
-   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
-
-   if (dst_start >= src_end || src_start >= dst_end)
-      return 0;
-
-   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
-
-   if (assigner->repeat == 0 && consumer->repeat == 0)
-      return delay;
 
    /* If either side is a relative access, we can't really apply most of the
     * reasoning below because we don't know which component aliases which.
@@ -250,6 +155,9 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner,
    if (assigner->opc == OPC_MOVMSK)
       return delay;
 
+   bool mismatched_half =
+      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
    /* TODO: Handle the combination of (rpt) and different component sizes
     * better like below. This complicates things significantly because the
     * components don't line up.
@@ -303,10 +211,41 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner,
 }
 
 static unsigned
-delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
-                  struct ir3_instruction *consumer, unsigned distance,
-                  bool soft, bool pred, bool mergedregs)
+delay_calc_srcn(struct ir3_instruction *assigner,
+                struct ir3_instruction *consumer, unsigned assigner_n,
+                unsigned consumer_n, bool mergedregs)
+{
+   struct ir3_register *src = consumer->srcs[consumer_n];
+   struct ir3_register *dst = assigner->dsts[assigner_n];
+   bool mismatched_half =
+      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
+   /* In the mergedregs case or when the register is a special register,
+    * half-registers do not alias with full registers.
+    */
+   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
+       mismatched_half)
+      return 0;
+
+   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
+   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
+   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
+   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
+
+   if (dst_start >= src_end || src_start >= dst_end)
+      return 0;
+
+   return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
+}
+
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *start,
+           struct ir3_instruction *consumer, unsigned distance,
+           regmask_t *in_mask, bool mergedregs)
 {
+   regmask_t mask;
+   memcpy(&mask, in_mask, sizeof(mask));
+
    unsigned delay = 0;
    /* Search backwards starting at the instruction before start, unless it's
     * NULL then search backwards from the block end.
@@ -318,7 +257,7 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
       if (count_instruction(assigner))
          distance += assigner->nop;
 
-      if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
+      if (distance + delay >= MAX_NOPS)
          return delay;
 
       if (is_meta(assigner))
@@ -329,14 +268,17 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
       foreach_dst_n (dst, dst_n, assigner) {
          if (dst->wrmask == 0)
             continue;
+         if (!regmask_get(&mask, dst))
+            continue;
          foreach_src_n (src, src_n, consumer) {
             if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
                continue;
 
-            unsigned src_delay = delay_calc_srcn_postra(
-               assigner, consumer, dst_n, src_n, soft, mergedregs);
+            unsigned src_delay = delay_calc_srcn(
+               assigner, consumer, dst_n, src_n, mergedregs);
             new_delay = MAX2(new_delay, src_delay);
          }
+         regmask_clear(&mask, dst);
       }
 
       new_delay = new_delay > distance ? new_delay - distance : 0;
@@ -360,13 +302,13 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
     * However any other recursion would be unnecessary.
     */
 
-   if (pred && block->data != block) {
+   if (block->data != block) {
       block->data = block;
 
       for (unsigned i = 0; i < block->predecessors_count; i++) {
          struct ir3_block *pred = block->predecessors[i];
-         unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
-                                                 soft, pred, mergedregs);
+         unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
+                                          &mask, mergedregs);
          delay = MAX2(delay, pred_delay);
       }
 
@@ -377,50 +319,19 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
 }
 
 /**
- * Calculate delay for post-RA scheduling based on physical registers but not
- * exact (i.e. don't recurse into predecessors, and make it possible to
- * estimate impact of sync flags).
- *
- * @soft:  If true, add additional delay for situations where they
- *    would not be strictly required because a sync flag would be
- *    used (but scheduler would prefer to schedule some other
- *    instructions first to avoid stalling on sync flag)
- * @mergedregs: True if mergedregs is enabled.
- */
-unsigned
-ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
-                      bool soft, bool mergedregs)
-{
-   return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
-}
-
-/**
  * Calculate delay for nop insertion. This must exactly match hardware
  * requirements, including recursing into predecessor blocks.
  */
 unsigned
-ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
-                     bool mergedregs)
+ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool mergedregs)
 {
-   return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
-}
-
-/**
- * Remove nop instructions.  The scheduler can insert placeholder nop's
- * so that ir3_delay_calc() can account for nop's that won't be needed
- * due to nop's triggered by a previous instruction.  However, before
- * legalize, we want to remove these.  The legalize pass can insert
- * some nop's if needed to hold (for example) sync flags.  This final
- * remaining nops are inserted by legalize after this.
- */
-void
-ir3_remove_nops(struct ir3 *ir)
-{
-   foreach_block (block, &ir->block_list) {
-      foreach_instr_safe (instr, &block->instr_list) {
-         if (instr->opc == OPC_NOP) {
-            list_del(&instr->node);
-         }
-      }
+   regmask_t mask;
+   regmask_init(&mask, mergedregs);
+   foreach_src (src, instr) {
+      if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
+         regmask_set(&mask, src);
    }
+   /* delay_calc() skips any assigner destination not set in this mask. */
+   return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
 }
diff --git a/lib/mesa/src/freedreno/ir3/ir3_lexer.l b/lib/mesa/src/freedreno/ir3/ir3_lexer.l
index 2d5582e5b..52b977896 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_lexer.l
+++ b/lib/mesa/src/freedreno/ir3/ir3_lexer.l
@@ -72,16 +72,6 @@ static int parse_reg(const char *str)
 	return num;
 }
 
-static int parse_w(const char *str)
-{
-	str++;
-	unsigned num = strtol(str, NULL, 10);
-	if ((num % 32) != 0)
-		yy_fatal_error("w# must be multiple of 32");
-	if (num < 32)
-		yy_fatal_error("w# must be at least 32");
-	return num / 32;
-}
 %}
 
 %option noyywrap
@@ -139,7 +129,7 @@ static int parse_w(const char *str)
 "a0.x"                            return T_A0;
 "a1.x"                            return T_A1;
 "p0."[xyzw]                       ir3_yylval.num = parse_reg(yytext); return T_P0;
-"w"[0-9]+                         ir3_yylval.num = parse_w(yytext);   return T_W;
+"w"[0-9]+                         ir3_yylval.num = strtol(yytext+1, NULL, 10); return T_W;
 "s#"[0-9]+                        ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_SAMP;
 "t#"[0-9]+                        ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_TEX;
 
@@ -167,6 +157,7 @@ static int parse_w(const char *str)
 "stkr"                            return TOKEN(T_OP_STKR);
 "xset"                            return TOKEN(T_OP_XSET);
 "xclr"                            return TOKEN(T_OP_XCLR);
+"getlast"                         return TOKEN(T_OP_GETLAST);
 "getone"                          return TOKEN(T_OP_GETONE);
 "dbg"                             return TOKEN(T_OP_DBG);
 "shps"                            return TOKEN(T_OP_SHPS);
@@ -228,6 +219,7 @@ static int parse_w(const char *str)
 "shr.b"                           return TOKEN(T_OP_SHR_B);
 "ashr.b"                          return TOKEN(T_OP_ASHR_B);
 "bary.f"                          return TOKEN(T_OP_BARY_F);
+"flat.b"                          return TOKEN(T_OP_FLAT_B);
 "mgen.b"                          return TOKEN(T_OP_MGEN_B);
 "getbit.b"                        return TOKEN(T_OP_GETBIT_B);
 "setrm"                           return TOKEN(T_OP_SETRM);
@@ -252,7 +244,15 @@ static int parse_w(const char *str)
 "sel.f32"                         return TOKEN(T_OP_SEL_F32);
 "sad.s16"                         return TOKEN(T_OP_SAD_S16);
 "sad.s32"                         return TOKEN(T_OP_SAD_S32);
-"shlg.b16"                        return TOKEN(T_OP_SHLG_B16);
+"shrm"                            return TOKEN(T_OP_SHRM);
+"shlm"                            return TOKEN(T_OP_SHLM);
+"shrg"                            return TOKEN(T_OP_SHRG);
+"shlg"                            return TOKEN(T_OP_SHLG);
+"andg"                            return TOKEN(T_OP_ANDG);
+"dp2acc"                          return TOKEN(T_OP_DP2ACC);
+"dp4acc"                          return TOKEN(T_OP_DP4ACC);
+"wmm"                             return TOKEN(T_OP_WMM);
+"wmm.accu"                        return TOKEN(T_OP_WMM_ACCU);
 
                                   /* category 4: */
 "rcp"                             return TOKEN(T_OP_RCP);
@@ -295,6 +295,11 @@ static int parse_w(const char *str)
 "dsypp.1"                         return TOKEN(T_OP_DSYPP_1);
 "rgetpos"                         return TOKEN(T_OP_RGETPOS);
 "rgetinfo"                        return TOKEN(T_OP_RGETINFO);
+"brcst.active"                    return TOKEN(T_OP_BRCST_A);
+"quad_shuffle.brcst"              return TOKEN(T_OP_QSHUFFLE_BRCST);
+"quad_shuffle.horiz"              return TOKEN(T_OP_QSHUFFLE_H);
+"quad_shuffle.vert"               return TOKEN(T_OP_QSHUFFLE_V);
+"quad_shuffle.diag"               return TOKEN(T_OP_QSHUFFLE_DIAG);
 
                                   /* category 6: */
 "ldg"                             return TOKEN(T_OP_LDG);
@@ -338,6 +343,29 @@ static int parse_w(const char *str)
 "atomic.b.and"                    return TOKEN(T_OP_ATOMIC_B_AND);
 "atomic.b.or"                     return TOKEN(T_OP_ATOMIC_B_OR);
 "atomic.b.xor"                    return TOKEN(T_OP_ATOMIC_B_XOR);
+"atomic.s.add"                    return TOKEN(T_OP_ATOMIC_S_ADD);
+"atomic.s.sub"                    return TOKEN(T_OP_ATOMIC_S_SUB);
+"atomic.s.xchg"                   return TOKEN(T_OP_ATOMIC_S_XCHG);
+"atomic.s.inc"                    return TOKEN(T_OP_ATOMIC_S_INC);
+"atomic.s.dec"                    return TOKEN(T_OP_ATOMIC_S_DEC);
+"atomic.s.cmpxchg"                return TOKEN(T_OP_ATOMIC_S_CMPXCHG);
+"atomic.s.min"                    return TOKEN(T_OP_ATOMIC_S_MIN);
+"atomic.s.max"                    return TOKEN(T_OP_ATOMIC_S_MAX);
+"atomic.s.and"                    return TOKEN(T_OP_ATOMIC_S_AND);
+"atomic.s.or"                     return TOKEN(T_OP_ATOMIC_S_OR);
+"atomic.s.xor"                    return TOKEN(T_OP_ATOMIC_S_XOR);
+"atomic.g.add"                    return TOKEN(T_OP_ATOMIC_G_ADD);
+"atomic.g.sub"                    return TOKEN(T_OP_ATOMIC_G_SUB);
+"atomic.g.xchg"                   return TOKEN(T_OP_ATOMIC_G_XCHG);
+"atomic.g.inc"                    return TOKEN(T_OP_ATOMIC_G_INC);
+"atomic.g.dec"                    return TOKEN(T_OP_ATOMIC_G_DEC);
+"atomic.g.cmpxchg"                return TOKEN(T_OP_ATOMIC_G_CMPXCHG);
+"atomic.g.min"                    return TOKEN(T_OP_ATOMIC_G_MIN);
+"atomic.g.max"                    return TOKEN(T_OP_ATOMIC_G_MAX);
+"atomic.g.and"                    return TOKEN(T_OP_ATOMIC_G_AND);
+"atomic.g.or"                     return TOKEN(T_OP_ATOMIC_G_OR);
+"atomic.g.xor"                    return TOKEN(T_OP_ATOMIC_G_XOR);
+
 "ldgb"                            return TOKEN(T_OP_LDGB);
 "stgb"                            return TOKEN(T_OP_STGB);
 "stib"                            return TOKEN(T_OP_STIB);
@@ -345,6 +373,8 @@ static int parse_w(const char *str)
 "ldlv"                            return TOKEN(T_OP_LDLV);
 "getspid"                         return TOKEN(T_OP_GETSPID);
 "getwid"                          return TOKEN(T_OP_GETWID);
+"getfiberid"                      return TOKEN(T_OP_GETFIBERID);
+"stc"                             return TOKEN(T_OP_STC);
 
                                   /* category 7: */
 "bar"                             return TOKEN(T_OP_BAR);
@@ -362,6 +392,11 @@ static int parse_w(const char *str)
 "untyped"                         return TOKEN(T_UNTYPED);
 "typed"                           return TOKEN(T_TYPED);
 
+"unsigned"                        return TOKEN(T_UNSIGNED);
+"mixed"                           return TOKEN(T_MIXED);
+"low"                             return TOKEN(T_LOW);
+"high"                            return TOKEN(T_HIGH);
+
 "1d"                              return TOKEN(T_1D);
 "2d"                              return TOKEN(T_2D);
 "3d"                              return TOKEN(T_3D);
@@ -379,6 +414,7 @@ static int parse_w(const char *str)
 "p"                               return 'p';
 "s2en"                            return TOKEN(T_S2EN);
 "s"                               return 's';
+"k"                               return 'k';
 "base"[0-9]+                      ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_BASE;
 "offset"[0-9]+                    ir3_yylval.num = strtol(yytext+6, NULL, 10); return T_OFFSET;
 "uniform"                         return T_UNIFORM;
diff --git a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
deleted file mode 100644
index 37a3dcb26..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright © 2017 Ilia Mirkin
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "ir3_nir.h"
-#include "compiler/nir/nir_builder.h"
-
-/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
- * gather results, rather than before. As a result, it must be emulated with
- * direct texture calls.
- */
-
-static bool
-lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
-{
-	bool progress = false;
-
-	static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
-
-	nir_foreach_instr_safe(instr, block) {
-		if (instr->type != nir_instr_type_tex)
-			continue;
-
-		nir_tex_instr *tg4 = (nir_tex_instr *)instr;
-
-		if (tg4->op != nir_texop_tg4)
-			continue;
-
-		b->cursor = nir_before_instr(&tg4->instr);
-
-		nir_ssa_def *results[4];
-		int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
-		for (int i = 0; i < 4; i++) {
-			int num_srcs = tg4->num_srcs + 1 /* lod */;
-			if (offset_index < 0 && i < 3)
-				num_srcs++;
-
-			nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
-			tex->op = nir_texop_txl;
-			tex->sampler_dim = tg4->sampler_dim;
-			tex->coord_components = tg4->coord_components;
-			tex->is_array = tg4->is_array;
-			tex->is_shadow = tg4->is_shadow;
-			tex->is_new_style_shadow = tg4->is_new_style_shadow;
-			tex->texture_index = tg4->texture_index;
-			tex->sampler_index = tg4->sampler_index;
-			tex->dest_type = tg4->dest_type;
-
-			for (int j = 0; j < tg4->num_srcs; j++) {
-				nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
-				tex->src[j].src_type = tg4->src[j].src_type;
-			}
-			if (i != 3) {
-				nir_ssa_def *offset =
-					nir_vec2(b, nir_imm_int(b, offsets[i][0]),
-							 nir_imm_int(b, offsets[i][1]));
-				if (offset_index < 0) {
-					tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
-					tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
-				} else {
-					assert(nir_tex_instr_src_size(tex, offset_index) == 2);
-					nir_ssa_def *orig = nir_ssa_for_src(
-							b, tex->src[offset_index].src, 2);
-					tex->src[offset_index].src =
-						nir_src_for_ssa(nir_iadd(b, orig, offset));
-				}
-			}
-			tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
-			tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
-
-			nir_ssa_dest_init(&tex->instr, &tex->dest,
-							  nir_tex_instr_dest_size(tex), 32, NULL);
-			nir_builder_instr_insert(b, &tex->instr);
-
-			results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
-		}
-
-		nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
-		nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
-
-		nir_instr_remove(&tg4->instr);
-
-		progress = true;
-	}
-
-	return progress;
-}
-
-static bool
-lower_tg4_func(nir_function_impl *impl)
-{
-	void *mem_ctx = ralloc_parent(impl);
-	nir_builder b;
-	nir_builder_init(&b, impl);
-
-	bool progress = false;
-	nir_foreach_block_safe(block, impl) {
-		progress |= lower_tg4(block, &b, mem_ctx);
-	}
-
-	if (progress)
-		nir_metadata_preserve(impl, nir_metadata_block_index |
-									nir_metadata_dominance);
-
-	return progress;
-}
-
-bool
-ir3_nir_lower_tg4_to_tex(nir_shader *shader)
-{
-	bool progress = false;
-
-	nir_foreach_function(function, shader) {
-		if (function->impl)
-			progress |= lower_tg4_func(function->impl);
-	}
-
-	return progress;
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_parser.y b/lib/mesa/src/freedreno/ir3/ir3_parser.y
index acd94b35a..fd29c639d 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_parser.y
+++ b/lib/mesa/src/freedreno/ir3/ir3_parser.y
@@ -399,6 +399,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_STKR
 %token <tok> T_OP_XSET
 %token <tok> T_OP_XCLR
+%token <tok> T_OP_GETLAST
 %token <tok> T_OP_GETONE
 %token <tok> T_OP_DBG
 %token <tok> T_OP_SHPS
@@ -458,6 +459,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_SHR_B
 %token <tok> T_OP_ASHR_B
 %token <tok> T_OP_BARY_F
+%token <tok> T_OP_FLAT_B
 %token <tok> T_OP_MGEN_B
 %token <tok> T_OP_GETBIT_B
 %token <tok> T_OP_SETRM
@@ -482,7 +484,15 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_SEL_F32
 %token <tok> T_OP_SAD_S16
 %token <tok> T_OP_SAD_S32
-%token <tok> T_OP_SHLG_B16
+%token <tok> T_OP_SHRM
+%token <tok> T_OP_SHLM
+%token <tok> T_OP_SHRG
+%token <tok> T_OP_SHLG
+%token <tok> T_OP_ANDG
+%token <tok> T_OP_DP2ACC
+%token <tok> T_OP_DP4ACC
+%token <tok> T_OP_WMM
+%token <tok> T_OP_WMM_ACCU
 
 /* category 4: */
 %token <tok> T_OP_RCP
@@ -525,6 +535,11 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_DSYPP_1
 %token <tok> T_OP_RGETPOS
 %token <tok> T_OP_RGETINFO
+%token <tok> T_OP_BRCST_A
+%token <tok> T_OP_QSHUFFLE_BRCST
+%token <tok> T_OP_QSHUFFLE_H
+%token <tok> T_OP_QSHUFFLE_V
+%token <tok> T_OP_QSHUFFLE_DIAG
 
 /* category 6: */
 %token <tok> T_OP_LDG
@@ -568,6 +583,28 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_ATOMIC_B_AND
 %token <tok> T_OP_ATOMIC_B_OR
 %token <tok> T_OP_ATOMIC_B_XOR
+%token <tok> T_OP_ATOMIC_S_ADD
+%token <tok> T_OP_ATOMIC_S_SUB
+%token <tok> T_OP_ATOMIC_S_XCHG
+%token <tok> T_OP_ATOMIC_S_INC
+%token <tok> T_OP_ATOMIC_S_DEC
+%token <tok> T_OP_ATOMIC_S_CMPXCHG
+%token <tok> T_OP_ATOMIC_S_MIN
+%token <tok> T_OP_ATOMIC_S_MAX
+%token <tok> T_OP_ATOMIC_S_AND
+%token <tok> T_OP_ATOMIC_S_OR
+%token <tok> T_OP_ATOMIC_S_XOR
+%token <tok> T_OP_ATOMIC_G_ADD
+%token <tok> T_OP_ATOMIC_G_SUB
+%token <tok> T_OP_ATOMIC_G_XCHG
+%token <tok> T_OP_ATOMIC_G_INC
+%token <tok> T_OP_ATOMIC_G_DEC
+%token <tok> T_OP_ATOMIC_G_CMPXCHG
+%token <tok> T_OP_ATOMIC_G_MIN
+%token <tok> T_OP_ATOMIC_G_MAX
+%token <tok> T_OP_ATOMIC_G_AND
+%token <tok> T_OP_ATOMIC_G_OR
+%token <tok> T_OP_ATOMIC_G_XOR
 %token <tok> T_OP_LDGB
 %token <tok> T_OP_STGB
 %token <tok> T_OP_STIB
@@ -575,6 +612,8 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_OP_LDLV
 %token <tok> T_OP_GETSPID
 %token <tok> T_OP_GETWID
+%token <tok> T_OP_GETFIBERID
+%token <tok> T_OP_STC
 
 /* category 7: */
 %token <tok> T_OP_BAR
@@ -593,6 +632,11 @@ static void print_token(FILE *file, int type, YYSTYPE value)
 %token <tok> T_UNTYPED
 %token <tok> T_TYPED
 
+%token <tok> T_MIXED
+%token <tok> T_UNSIGNED
+%token <tok> T_LOW
+%token <tok> T_HIGH
+
 %token <tok> T_1D
 %token <tok> T_2D
 %token <tok> T_3D
@@ -746,7 +790,7 @@ iflag:             T_SY   { iflags.flags |= IR3_INSTR_SY; }
 iflags:
 |                  iflag iflags
 
-instrs:            instr instrs
+instrs:            instrs instr
 |                  instr
 
 instr:             iflags cat0_instr
@@ -800,6 +844,7 @@ cat0_instr:        T_OP_NOP        { new_instr(OPC_NOP); }
 |                  T_OP_PREDT      { new_instr(OPC_PREDT); }    cat0_src1
 |                  T_OP_PREDF      { new_instr(OPC_PREDF); }    cat0_src1
 |                  T_OP_PREDE      { new_instr(OPC_PREDE); }
+|                  T_OP_GETLAST '.' T_W { new_instr(OPC_GETLAST); }   cat0_immed
 
 cat1_opc:          T_OP_MOV '.' T_CAT1_TYPE_TYPE {
                        parse_type_type(new_instr(OPC_MOV), $3);
@@ -815,9 +860,16 @@ cat1_movmsk:       T_OP_MOVMSK '.' T_W {
                        new_instr(OPC_MOVMSK);
                        instr->cat1.src_type = TYPE_U32;
                        instr->cat1.dst_type = TYPE_U32;
-                       instr->repeat = $3 - 1;
                    } dst_reg {
-                       instr->dsts[0]->wrmask = (1 << $3) - 1;
+                       if (($3 % 32) != 0)
+                          yyerror("w# must be multiple of 32");
+                       if ($3 < 32)
+                          yyerror("w# must be at least 32");
+
+                       int num = $3 / 32;
+
+                       instr->repeat = num - 1;
+                       instr->dsts[0]->wrmask = (1 << num) - 1;
                    }
 
 cat1_mova1:        T_OP_MOVA1 T_A1 ',' {
@@ -894,6 +946,7 @@ cat2_opc_2src:     T_OP_ADD_F     { new_instr(OPC_ADD_F); }
 |                  T_OP_SHR_B     { new_instr(OPC_SHR_B); }
 |                  T_OP_ASHR_B    { new_instr(OPC_ASHR_B); }
 |                  T_OP_BARY_F    { new_instr(OPC_BARY_F); }
+|                  T_OP_FLAT_B    { new_instr(OPC_FLAT_B); }
 |                  T_OP_MGEN_B    { new_instr(OPC_MGEN_B); }
 |                  T_OP_GETBIT_B  { new_instr(OPC_GETBIT_B); }
 |                  T_OP_SHB       { new_instr(OPC_SHB); }
@@ -910,6 +963,12 @@ cat2_instr:        cat2_opc_1src dst_reg ',' src_reg_or_const_or_rel_or_imm
 |                  cat2_opc_2src_cnd '.' cond dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
 |                  cat2_opc_2src dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
 
+cat3_dp_signedness:'.' T_MIXED   { instr->cat3.signedness = IR3_SRC_MIXED; }
+|                  '.' T_UNSIGNED{ instr->cat3.signedness = IR3_SRC_UNSIGNED; }
+
+cat3_dp_pack:      '.' T_LOW     { instr->cat3.packed = IR3_SRC_PACKED_LOW; }
+|                  '.' T_HIGH    { instr->cat3.packed = IR3_SRC_PACKED_HIGH; }
+
 cat3_opc:          T_OP_MAD_U16   { new_instr(OPC_MAD_U16); }
 |                  T_OP_MADSH_U16 { new_instr(OPC_MADSH_U16); }
 |                  T_OP_MAD_S16   { new_instr(OPC_MAD_S16); }
@@ -927,8 +986,22 @@ cat3_opc:          T_OP_MAD_U16   { new_instr(OPC_MAD_U16); }
 |                  T_OP_SAD_S16   { new_instr(OPC_SAD_S16); }
 |                  T_OP_SAD_S32   { new_instr(OPC_SAD_S32); }
 
+cat3_imm_reg_opc:  T_OP_SHRM      { new_instr(OPC_SHRM); }
+|                  T_OP_SHLM      { new_instr(OPC_SHLM); }
+|                  T_OP_SHRG      { new_instr(OPC_SHRG); }
+|                  T_OP_SHLG      { new_instr(OPC_SHLG); }
+|                  T_OP_ANDG      { new_instr(OPC_ANDG); }
+
+cat3_wmm:          T_OP_WMM       { new_instr(OPC_WMM); }
+|                  T_OP_WMM_ACCU  { new_instr(OPC_WMM_ACCU); }
+
+cat3_dp:           T_OP_DP2ACC    { new_instr(OPC_DP2ACC); }
+|                  T_OP_DP4ACC    { new_instr(OPC_DP4ACC); }
+
 cat3_instr:        cat3_opc dst_reg ',' src_reg_or_const_or_rel ',' src_reg_or_const ',' src_reg_or_const_or_rel
-|                  T_OP_SHLG_B16 { new_instr(OPC_SHLG_B16); } dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
+|                  cat3_imm_reg_opc dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
+|                  cat3_wmm         dst_reg ',' src_reg_gpr ',' src_reg ',' immediate
+|                  cat3_dp cat3_dp_signedness cat3_dp_pack dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
 
 cat4_opc:          T_OP_RCP       { new_instr(OPC_RCP); }
 |                  T_OP_RSQ       { new_instr(OPC_RSQ); }
@@ -972,6 +1045,11 @@ cat5_opc:          T_OP_ISAM      { new_instr(OPC_ISAM); }
 |                  T_OP_SAMGP3    { new_instr(OPC_SAMGP3); }
 |                  T_OP_RGETPOS   { new_instr(OPC_RGETPOS); }
 |                  T_OP_RGETINFO  { new_instr(OPC_RGETINFO); }
+|                  T_OP_BRCST_A   { new_instr(OPC_BRCST_ACTIVE); }
+|                  T_OP_QSHUFFLE_BRCST { new_instr(OPC_QUAD_SHUFFLE_BRCST); }
+|                  T_OP_QSHUFFLE_H     { new_instr(OPC_QUAD_SHUFFLE_HORIZ); }
+|                  T_OP_QSHUFFLE_V     { new_instr(OPC_QUAD_SHUFFLE_VERT); }
+|                  T_OP_QSHUFFLE_DIAG  { new_instr(OPC_QUAD_SHUFFLE_DIAG); }
 
 cat5_flag:         '.' T_3D       { instr->flags |= IR3_INSTR_3D; }
 |                  '.' 'a'        { instr->flags |= IR3_INSTR_A; }
@@ -979,13 +1057,15 @@ cat5_flag:         '.' T_3D       { instr->flags |= IR3_INSTR_3D; }
 |                  '.' 'p'        { instr->flags |= IR3_INSTR_P; }
 |                  '.' 's'        { instr->flags |= IR3_INSTR_S; }
 |                  '.' T_S2EN     { instr->flags |= IR3_INSTR_S2EN; }
+|                  '.' T_UNIFORM  { }
 |                  '.' T_NONUNIFORM  { instr->flags |= IR3_INSTR_NONUNIF; }
 |                  '.' T_BASE     { instr->flags |= IR3_INSTR_B; instr->cat5.tex_base = $2; }
+|                  '.' T_W        { instr->cat5.cluster_size = $2; }
 cat5_flags:
 |                  cat5_flag cat5_flags
 
 cat5_samp:         T_SAMP         { instr->cat5.samp = $1; }
-cat5_tex:          T_TEX          { if (instr->flags & IR3_INSTR_B) instr->cat5.samp |= ($1 << 4); else instr->cat5.tex = $1; }
+cat5_tex:          T_TEX          { instr->cat5.tex = $1; }
 cat5_type:         '(' type ')'   { instr->cat5.type = $2; }
 cat5_a1:           src_reg        { instr->flags |= IR3_INSTR_A1EN; }
 
@@ -1018,7 +1098,7 @@ cat6_imm_offset:   offset    { new_src(0, IR3_REG_IMMED)->iim_val = $1; }
 cat6_offset:       cat6_imm_offset
 |                  '+' src
 cat6_dst_offset:   offset    { instr->cat6.dst_offset = $1; }
-|                  '+' src   { instr->flags |= IR3_INSTR_G; }
+|                  '+' src
 
 cat6_immed:        integer   { instr->cat6.iim_val = $1; }
 
@@ -1066,14 +1146,39 @@ cat6_atomic_opc:   T_OP_ATOMIC_ADD     { new_instr(OPC_ATOMIC_ADD); }
 |                  T_OP_ATOMIC_OR      { new_instr(OPC_ATOMIC_OR); }
 |                  T_OP_ATOMIC_XOR     { new_instr(OPC_ATOMIC_XOR); }
 
-cat6_atomic_g:     cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src {
-                       instr->flags |= IR3_INSTR_G;
-                   }
+cat6_a3xx_atomic_opc:   T_OP_ATOMIC_S_ADD     { new_instr(OPC_ATOMIC_S_ADD); }
+|                       T_OP_ATOMIC_S_SUB     { new_instr(OPC_ATOMIC_S_SUB); }
+|                       T_OP_ATOMIC_S_XCHG    { new_instr(OPC_ATOMIC_S_XCHG); }
+|                       T_OP_ATOMIC_S_INC     { new_instr(OPC_ATOMIC_S_INC); }
+|                       T_OP_ATOMIC_S_DEC     { new_instr(OPC_ATOMIC_S_DEC); }
+|                       T_OP_ATOMIC_S_CMPXCHG { new_instr(OPC_ATOMIC_S_CMPXCHG); }
+|                       T_OP_ATOMIC_S_MIN     { new_instr(OPC_ATOMIC_S_MIN); }
+|                       T_OP_ATOMIC_S_MAX     { new_instr(OPC_ATOMIC_S_MAX); }
+|                       T_OP_ATOMIC_S_AND     { new_instr(OPC_ATOMIC_S_AND); }
+|                       T_OP_ATOMIC_S_OR      { new_instr(OPC_ATOMIC_S_OR); }
+|                       T_OP_ATOMIC_S_XOR     { new_instr(OPC_ATOMIC_S_XOR); }
+
+cat6_a6xx_atomic_opc:   T_OP_ATOMIC_G_ADD     { new_instr(OPC_ATOMIC_G_ADD); }
+|                       T_OP_ATOMIC_G_SUB     { new_instr(OPC_ATOMIC_G_SUB); }
+|                       T_OP_ATOMIC_G_XCHG    { new_instr(OPC_ATOMIC_G_XCHG); }
+|                       T_OP_ATOMIC_G_INC     { new_instr(OPC_ATOMIC_G_INC); }
+|                       T_OP_ATOMIC_G_DEC     { new_instr(OPC_ATOMIC_G_DEC); }
+|                       T_OP_ATOMIC_G_CMPXCHG { new_instr(OPC_ATOMIC_G_CMPXCHG); }
+|                       T_OP_ATOMIC_G_MIN     { new_instr(OPC_ATOMIC_G_MIN); }
+|                       T_OP_ATOMIC_G_MAX     { new_instr(OPC_ATOMIC_G_MAX); }
+|                       T_OP_ATOMIC_G_AND     { new_instr(OPC_ATOMIC_G_AND); }
+|                       T_OP_ATOMIC_G_OR      { new_instr(OPC_ATOMIC_G_OR); }
+|                       T_OP_ATOMIC_G_XOR     { new_instr(OPC_ATOMIC_G_XOR); }
+
+cat6_a3xx_atomic_s: cat6_a3xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src
+
+cat6_a6xx_atomic_g: cat6_a6xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' src ',' src
 
 cat6_atomic_l:     cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'l' dst_reg ',' 'l' '[' cat6_reg_or_immed ']' ',' src
 
-cat6_atomic:       cat6_atomic_g
-|                  cat6_atomic_l
+cat6_atomic:       cat6_atomic_l
+|                  cat6_a3xx_atomic_s
+|                  cat6_a6xx_atomic_g
 
 cat6_ibo_opc_1src: T_OP_RESINFO   { new_instr(OPC_RESINFO); }
 
@@ -1087,6 +1192,7 @@ cat6_ibo:          cat6_ibo_opc_1src cat6_type cat6_dim dst_reg ',' 'g' '[' cat6
 cat6_id_opc:
                    T_OP_GETSPID { new_instr(OPC_GETSPID); }
 |                  T_OP_GETWID  { new_instr(OPC_GETWID); }
+|                  T_OP_GETFIBERID { new_instr(OPC_GETFIBERID); }
 
 cat6_id:           cat6_id_opc cat6_type dst_reg
 
@@ -1102,17 +1208,17 @@ cat6_reg_or_immed: src
 
 cat6_bindless_ibo_opc_1src: T_OP_RESINFO_B       { new_instr(OPC_RESINFO); }
 
-cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD        { new_instr(OPC_ATOMIC_ADD)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_SUB        { new_instr(OPC_ATOMIC_SUB)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_XCHG       { new_instr(OPC_ATOMIC_XCHG)->flags |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_INC        { new_instr(OPC_ATOMIC_INC)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_DEC        { new_instr(OPC_ATOMIC_DEC)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_CMPXCHG    { new_instr(OPC_ATOMIC_CMPXCHG)->flags |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_MIN        { new_instr(OPC_ATOMIC_MIN)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_MAX        { new_instr(OPC_ATOMIC_MAX)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_AND        { new_instr(OPC_ATOMIC_AND)->flags  |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_OR         { new_instr(OPC_ATOMIC_OR)->flags   |= IR3_INSTR_G; dummy_dst(); }
-|                  T_OP_ATOMIC_B_XOR        { new_instr(OPC_ATOMIC_XOR)->flags  |= IR3_INSTR_G; dummy_dst(); }
+cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD        { new_instr(OPC_ATOMIC_B_ADD); dummy_dst(); }
+|                  T_OP_ATOMIC_B_SUB        { new_instr(OPC_ATOMIC_B_SUB); dummy_dst(); }
+|                  T_OP_ATOMIC_B_XCHG       { new_instr(OPC_ATOMIC_B_XCHG); dummy_dst(); }
+|                  T_OP_ATOMIC_B_INC        { new_instr(OPC_ATOMIC_B_INC); dummy_dst(); }
+|                  T_OP_ATOMIC_B_DEC        { new_instr(OPC_ATOMIC_B_DEC); dummy_dst(); }
+|                  T_OP_ATOMIC_B_CMPXCHG    { new_instr(OPC_ATOMIC_B_CMPXCHG); dummy_dst(); }
+|                  T_OP_ATOMIC_B_MIN        { new_instr(OPC_ATOMIC_B_MIN); dummy_dst(); }
+|                  T_OP_ATOMIC_B_MAX        { new_instr(OPC_ATOMIC_B_MAX); dummy_dst(); }
+|                  T_OP_ATOMIC_B_AND        { new_instr(OPC_ATOMIC_B_AND); dummy_dst(); }
+|                  T_OP_ATOMIC_B_OR         { new_instr(OPC_ATOMIC_B_OR); dummy_dst(); }
+|                  T_OP_ATOMIC_B_XOR        { new_instr(OPC_ATOMIC_B_XOR); dummy_dst(); }
 |                  T_OP_STIB_B              { new_instr(OPC_STIB); dummy_dst(); }
 
 cat6_bindless_ibo_opc_2src_dst: T_OP_LDIB_B              { new_instr(OPC_LDIB); }
@@ -1123,13 +1229,23 @@ cat6_bindless_ibo: cat6_bindless_ibo_opc_1src cat6_typed cat6_dim cat6_type '.'
 
 cat6_bindless_ldc_opc: T_OP_LDC  { new_instr(OPC_LDC); }
 
-cat6_bindless_ldc: cat6_bindless_ldc_opc '.' T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg ',' cat6_reg_or_immed ',' cat6_reg_or_immed {
-                      instr->cat6.d = $3;
+/* This is separated from the opcode to avoid lookahead/shift-reduce conflicts */
+cat6_bindless_ldc_middle:
+                        T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg { instr->cat6.d = $1; }
+|                       cat6_immed '.' 'k' '.' cat6_bindless_mode 'c' '[' T_A1 ']' { instr->opc = OPC_LDC_K; }
+
+cat6_bindless_ldc: cat6_bindless_ldc_opc '.' cat6_bindless_ldc_middle ',' cat6_reg_or_immed ',' cat6_reg_or_immed {
                       instr->cat6.type = TYPE_U32;
                       /* TODO cleanup ir3 src order: */
                       swap(instr->srcs[0], instr->srcs[1]);
                    }
 
+stc_dst:          integer { new_src(0, IR3_REG_IMMED)->iim_val = $1; }
+|                 T_A1 { new_src(0, IR3_REG_IMMED)->iim_val = 0; instr->flags |= IR3_INSTR_A1EN; }
+|                 T_A1 '+' integer { new_src(0, IR3_REG_IMMED)->iim_val = $3; instr->flags |= IR3_INSTR_A1EN; }
+
+cat6_stc: T_OP_STC { new_instr(OPC_STC); } cat6_type 'c' '[' stc_dst ']' ',' src_reg ',' cat6_immed
+
 cat6_todo:         T_OP_G2L                 { new_instr(OPC_G2L); }
 |                  T_OP_L2G                 { new_instr(OPC_L2G); }
 |                  T_OP_RESFMT              { new_instr(OPC_RESFMT); }
@@ -1144,6 +1260,7 @@ cat6_instr:        cat6_load
 |                  cat6_id
 |                  cat6_bindless_ldc
 |                  cat6_bindless_ibo
+|                  cat6_stc
 |                  cat6_todo
 
 cat7_scope:        '.' 'w'  { instr->cat7.w = true; }
@@ -1195,6 +1312,9 @@ src_reg_flags:     src_reg_flag
 src_reg:           src
 |                  src_reg_flags src
 
+src_reg_gpr:       src_reg
+|                  relative_gpr_src
+
 src_const:         const
 |                  src_reg_flags const
 
diff --git a/lib/mesa/src/freedreno/ir3/ir3_postsched.c b/lib/mesa/src/freedreno/ir3/ir3_postsched.c
index 507302a00..39de84add 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_postsched.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_postsched.c
@@ -68,8 +68,10 @@ struct ir3_postsched_ctx {
 
    struct list_head unscheduled_list; /* unscheduled instructions */
 
-   int sfu_delay;
-   int tex_delay;
+   unsigned ip;
+
+   int ss_delay;
+   int sy_delay;
 };
 
 struct ir3_postsched_node {
@@ -77,7 +79,9 @@ struct ir3_postsched_node {
    struct ir3_instruction *instr;
    bool partially_evaluated_path;
 
-   bool has_tex_src, has_sfu_src;
+   unsigned earliest_ip;
+
+   bool has_sy_src, has_ss_src;
 
    unsigned delay;
    unsigned max_delay;
@@ -87,17 +91,17 @@ struct ir3_postsched_node {
    list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)
 
 static bool
-has_tex_src(struct ir3_instruction *instr)
+has_sy_src(struct ir3_instruction *instr)
 {
    struct ir3_postsched_node *node = instr->data;
-   return node->has_tex_src;
+   return node->has_sy_src;
 }
 
 static bool
-has_sfu_src(struct ir3_instruction *instr)
+has_ss_src(struct ir3_instruction *instr)
 {
    struct ir3_postsched_node *node = instr->data;
-   return node->has_sfu_src;
+   return node->has_ss_src;
 }
 
 static void
@@ -111,28 +115,45 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 
    di(instr, "schedule");
 
-   list_addtail(&instr->node, &instr->block->instr_list);
+   bool counts_for_delay = is_alu(instr) || is_flow(instr);
+
+   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;
 
    struct ir3_postsched_node *n = instr->data;
+
+   /* We insert any nop's needed to get to earliest_ip, then advance
+    * delay_cycles by scheduling the instruction.
+    */
+   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;
+
+   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+      unsigned delay = (unsigned)(uintptr_t)edge->data;
+      struct ir3_postsched_node *child =
+         container_of(edge->child, struct ir3_postsched_node, dag);
+      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
+   }
+
+   list_addtail(&instr->node, &instr->block->instr_list);
+
    dag_prune_head(ctx->dag, &n->dag);
 
    if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
       return;
 
-   if (is_sfu(instr)) {
-      ctx->sfu_delay = 8;
-   } else if (has_sfu_src(instr)) {
-      ctx->sfu_delay = 0;
-   } else if (ctx->sfu_delay > 0) {
-      ctx->sfu_delay--;
+   if (is_ss_producer(instr)) {
+      ctx->ss_delay = soft_ss_delay(instr);
+   } else if (has_ss_src(instr)) {
+      ctx->ss_delay = 0;
+   } else if (ctx->ss_delay > 0) {
+      ctx->ss_delay--;
    }
 
-   if (is_tex_or_prefetch(instr)) {
-      ctx->tex_delay = 10;
-   } else if (has_tex_src(instr)) {
-      ctx->tex_delay = 0;
-   } else if (ctx->tex_delay > 0) {
-      ctx->tex_delay--;
+   if (is_sy_producer(instr)) {
+      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
+   } else if (has_sy_src(instr)) {
+      ctx->sy_delay = 0;
+   } else if (ctx->sy_delay > 0) {
+      ctx->sy_delay--;
    }
 }
 
@@ -154,25 +175,26 @@ dump_state(struct ir3_postsched_ctx *ctx)
    }
 }
 
-/* Determine if this is an instruction that we'd prefer not to schedule
- * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
- * counter, ie. the more cycles it has been since the last SFU, the less
- * costly a sync would be.
- */
-static bool
-would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+static unsigned
+node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
 {
-   if (ctx->sfu_delay) {
-      if (has_sfu_src(instr))
-         return true;
-   }
+   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
+}
 
-   if (ctx->tex_delay) {
-      if (has_tex_src(instr))
-         return true;
-   }
+static unsigned
+node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
+{
+   unsigned delay = node_delay(ctx, n);
+
+   /* This takes into account that when we schedule multiple tex or sfu, the
+    * first user has to wait for all of them to complete.
+    */
+   if (n->has_ss_src)
+      delay = MAX2(delay, ctx->ss_delay);
+   if (n->has_sy_src)
+      delay = MAX2(delay, ctx->sy_delay);
 
-   return false;
+   return delay;
 }
 
 /* find instruction to schedule: */
@@ -215,8 +237,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
    /* Next prioritize discards: */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay(ctx, n);
 
       if (d > 0)
          continue;
@@ -235,13 +256,12 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
    /* Next prioritize expensive instructions: */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay_soft(ctx, n);
 
       if (d > 0)
          continue;
 
-      if (!(is_sfu(n->instr) || is_tex(n->instr)))
+      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
          continue;
 
       if (!chosen || (chosen->max_delay < n->max_delay))
@@ -249,53 +269,36 @@ choose_instr(struct ir3_postsched_ctx *ctx)
    }
 
    if (chosen) {
-      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
+      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
       return chosen->instr;
    }
 
-   /*
-    * Sometimes be better to take a nop, rather than scheduling an
-    * instruction that would require an (ss) shortly after another
-    * SFU..  ie. if last SFU was just one or two instr ago, and we
-    * could choose between taking a nop and then scheduling
-    * something else, vs scheduling the immed avail instruction that
-    * would require (ss), we are better with the nop.
-    */
-   for (unsigned delay = 0; delay < 4; delay++) {
-      foreach_sched_node (n, &ctx->dag->heads) {
-         if (would_sync(ctx, n->instr))
-            continue;
-
-         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
-                                            ctx->v->mergedregs);
-
-         if (d > delay)
-            continue;
-
-         if (!chosen || (chosen->max_delay < n->max_delay))
-            chosen = n;
-      }
-
-      if (chosen) {
-         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
-         return chosen->instr;
-      }
-   }
-
    /* Next try to find a ready leader w/ soft delay (ie. including extra
     * delay for things like tex fetch which can be synchronized w/ sync
     * bit (but we probably do want to schedule some other instructions
-    * while we wait)
+    * while we wait)). We also allow a small number of nops, to prefer now-nops
+    * over future-nops up to a point, as that gives better results.
     */
+   unsigned chosen_delay = 0;
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
+      unsigned d = node_delay_soft(ctx, n);
 
-      if (d > 0)
+      if (d > 3)
          continue;
 
-      if (!chosen || (chosen->max_delay < n->max_delay))
+      if (!chosen || d < chosen_delay) {
+         chosen = n;
+         chosen_delay = d;
+         continue;
+      }
+
+      if (d > chosen_delay)
+         continue;
+
+      if (chosen->max_delay < n->max_delay) {
          chosen = n;
+         chosen_delay = d;
+      }
    }
 
    if (chosen) {
@@ -308,8 +311,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
     * stalls.. but we've already decided there is not a better option.
     */
    foreach_sched_node (n, &ctx->dag->heads) {
-      unsigned d =
-         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+      unsigned d = node_delay(ctx, n);
 
       if (d > 0)
          continue;
@@ -324,9 +326,6 @@ choose_instr(struct ir3_postsched_ctx *ctx)
    }
 
    /* Otherwise choose leader with maximum cost:
-    *
-    * TODO should we try to balance cost and delays?  I guess it is
-    * a balance between now-nop's and future-nop's?
     */
    foreach_sched_node (n, &ctx->dag->heads) {
       if (!chosen || chosen->max_delay < n->max_delay)
@@ -361,6 +360,7 @@ struct ir3_postsched_deps_state {
     * for full precision and 2nd half for half-precision.
     */
    struct ir3_postsched_node *regs[2 * 256];
+   unsigned dst_n[2 * 256];
 };
 
 /* bounds checking read/write accessors, since OoB access to stuff on
@@ -374,7 +374,8 @@ struct ir3_postsched_deps_state {
 
 static void
 add_dep(struct ir3_postsched_deps_state *state,
-        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
+        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
+        unsigned d)
 {
    if (!before || !after)
       return;
@@ -382,30 +383,36 @@ add_dep(struct ir3_postsched_deps_state *state,
    assert(before != after);
 
    if (state->direction == F) {
-      dag_add_edge(&before->dag, &after->dag, NULL);
+      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
    } else {
-      dag_add_edge(&after->dag, &before->dag, NULL);
+      dag_add_edge_max_data(&after->dag, &before->dag, 0);
    }
 }
 
 static void
 add_single_reg_dep(struct ir3_postsched_deps_state *state,
-                   struct ir3_postsched_node *node, unsigned num, int src_n)
+                   struct ir3_postsched_node *node, unsigned num, int src_n,
+                   int dst_n)
 {
    struct ir3_postsched_node *dep = dep_reg(state, num);
 
+   unsigned d = 0;
    if (src_n >= 0 && dep && state->direction == F) {
-      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
-      node->delay = MAX2(node->delay, d);
-      if (is_tex_or_prefetch(dep->instr))
-         node->has_tex_src = true;
-      if (is_tex_or_prefetch(dep->instr))
-         node->has_sfu_src = true;
-   }
-
-   add_dep(state, dep, node);
+      /* get the dst_n this corresponds to */
+      unsigned dst_n = state->dst_n[num];
+      unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
+      d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
+      node->delay = MAX2(node->delay, d_soft);
+      if (is_sy_producer(dep->instr))
+         node->has_sy_src = true;
+      if (is_ss_producer(dep->instr))
+         node->has_ss_src = true;
+   }
+
+   add_dep(state, dep, node, d);
    if (src_n < 0) {
       dep_reg(state, num) = node;
+      state->dst_n[num] = dst_n;
    }
 }
 
@@ -413,15 +420,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state,
  * between half and full precision that result in additional dependencies.
  * The 'reg' arg is really just to know half vs full precision.
  *
- * If non-negative, then this adds a dependency on a source register, and
+ * If src_n is non-negative, this adds a dependency on a source register, and
  * src_n is the index passed into ir3_delayslots() for calculating the delay:
- * If positive, corresponds to node->instr->regs[src_n]. If negative, then
- * this is for a destination register.
+ * it corresponds to node->instr->srcs[src_n]. If src_n is negative, then
+ * this is for the destination register corresponding to dst_n.
  */
 static void
 add_reg_dep(struct ir3_postsched_deps_state *state,
             struct ir3_postsched_node *node, const struct ir3_register *reg,
-            unsigned num, int src_n)
+            unsigned num, int src_n, int dst_n)
 {
    if (state->merged) {
       /* Make sure that special registers like a0.x that are written as
@@ -430,16 +437,16 @@ add_reg_dep(struct ir3_postsched_deps_state *state,
        */
       if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
          /* single conflict in half-reg space: */
-         add_single_reg_dep(state, node, num, src_n);
+         add_single_reg_dep(state, node, num, src_n, dst_n);
       } else {
          /* two conflicts in half-reg space: */
-         add_single_reg_dep(state, node, 2 * num + 0, src_n);
-         add_single_reg_dep(state, node, 2 * num + 1, src_n);
+         add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n);
+         add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n);
       }
    } else {
       if (reg->flags & IR3_REG_HALF)
          num += ARRAY_SIZE(state->regs) / 2;
-      add_single_reg_dep(state, node, num, src_n);
+      add_single_reg_dep(state, node, num, src_n, dst_n);
    }
 }
 
@@ -457,12 +464,12 @@ calculate_deps(struct ir3_postsched_deps_state *state,
       if (reg->flags & IR3_REG_RELATIV) {
          /* mark entire array as read: */
          for (unsigned j = 0; j < reg->size; j++) {
-            add_reg_dep(state, node, reg, reg->array.base + j, i);
+            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
          }
       } else {
          assert(reg->wrmask >= 1);
          u_foreach_bit (b, reg->wrmask) {
-            add_reg_dep(state, node, reg, reg->num + b, i);
+            add_reg_dep(state, node, reg, reg->num + b, i, -1);
          }
       }
    }
@@ -470,18 +477,18 @@ calculate_deps(struct ir3_postsched_deps_state *state,
    /* And then after we update the state for what this instruction
     * wrote:
     */
-   foreach_dst (reg, node->instr) {
+   foreach_dst_n (reg, i, node->instr) {
       if (reg->wrmask == 0)
          continue;
       if (reg->flags & IR3_REG_RELATIV) {
          /* mark the entire array as written: */
-         for (unsigned i = 0; i < reg->size; i++) {
-            add_reg_dep(state, node, reg, reg->array.base + i, -1);
+         for (unsigned j = 0; j < reg->size; j++) {
+            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
          }
       } else {
          assert(reg->wrmask >= 1);
          u_foreach_bit (b, reg->wrmask) {
-            add_reg_dep(state, node, reg, reg->num + b, -1);
+            add_reg_dep(state, node, reg, reg->num + b, -1, i);
          }
       }
    }
@@ -593,7 +600,7 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
          if (src->block != instr->block)
             continue;
 
-         dag_add_edge(&sn->dag, &n->dag, NULL);
+         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
       }
 
       if (is_input(instr)) {
@@ -602,14 +609,14 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
          util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
             struct ir3_instruction *input = *instrp;
             struct ir3_postsched_node *in = input->data;
-            dag_add_edge(&in->dag, &n->dag, NULL);
+            dag_add_edge_max_data(&in->dag, &n->dag, 0);
          }
          util_dynarray_append(&kills, struct ir3_instruction *, instr);
       } else if (is_tex(instr) || is_mem(instr)) {
          util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
             struct ir3_instruction *kill = *instrp;
             struct ir3_postsched_node *kn = kill->data;
-            dag_add_edge(&kn->dag, &n->dag, NULL);
+            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
          }
       }
    }
@@ -630,8 +637,8 @@ static void
 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 {
    ctx->block = block;
-   ctx->tex_delay = 0;
-   ctx->sfu_delay = 0;
+   ctx->sy_delay = 0;
+   ctx->ss_delay = 0;
 
    /* move all instructions to the unscheduled list, and
     * empty the block's instruction list (to which we will
@@ -677,18 +684,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
    while (!list_is_empty(&ctx->unscheduled_list)) {
       struct ir3_instruction *instr = choose_instr(ctx);
 
-      unsigned delay =
-         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
+      unsigned delay = node_delay(ctx, instr->data);
       d("delay=%u", delay);
 
-      /* and if we run out of instructions that can be scheduled,
-       * then it is time for nop's:
-       */
       debug_assert(delay <= 6);
-      while (delay > 0) {
-         ir3_NOP(block);
-         delay--;
-      }
 
       schedule(ctx, instr);
    }
@@ -750,7 +749,6 @@ ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
       .v = v,
    };
 
-   ir3_remove_nops(ir);
    cleanup_self_movs(ir);
 
    foreach_block (block, &ir->block_list) {
diff --git a/lib/mesa/src/freedreno/ir3/ir3_ra.h b/lib/mesa/src/freedreno/ir3/ir3_ra.h
index 259341eaa..c6837aaae 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_ra.h
+++ b/lib/mesa/src/freedreno/ir3/ir3_ra.h
@@ -124,7 +124,7 @@ ra_reg_is_dst(const struct ir3_register *reg)
          if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
 
 #define ra_foreach_dst_n(__dstreg, __n, __instr)                               \
-   foreach_dst_n(__dstreg, __n, instr)                                         \
+   foreach_dst_n(__dstreg, __n, __instr)                                         \
       if (ra_reg_is_dst(__dstreg))
 
 #define ra_foreach_dst(__dstreg, __instr)                                      \
diff --git a/lib/mesa/src/freedreno/ir3/tests/disasm.c b/lib/mesa/src/freedreno/ir3/tests/disasm.c
index 542469aa1..2f1b89f0d 100644
--- a/lib/mesa/src/freedreno/ir3/tests/disasm.c
+++ b/lib/mesa/src/freedreno/ir3/tests/disasm.c
@@ -43,6 +43,8 @@
 #include "isa/isa.h"
 
 /* clang-format off */
+/* Note: @anholt's 4xx disasm was done on an a418 Nexus 5x */
+#define INSTR_4XX(i, d, ...) { .gpu_id = 420, .instr = #i, .expected = d, __VA_ARGS__ }
 #define INSTR_5XX(i, d, ...) { .gpu_id = 540, .instr = #i, .expected = d, __VA_ARGS__ }
 #define INSTR_6XX(i, d, ...) { .gpu_id = 630, .instr = #i, .expected = d, __VA_ARGS__ }
 /* clang-format on */
@@ -58,153 +60,185 @@ static const struct test {
    bool parse_fail;
 } tests[] = {
    /* clang-format off */
-	/* cat0 */
-	INSTR_6XX(00000000_00000000, "nop"),
-	INSTR_6XX(00000200_00000000, "(rpt2)nop"),
-	INSTR_6XX(03000000_00000000, "end"),
-	INSTR_6XX(00800000_00000004, "br p0.x, #4"),
-	INSTR_6XX(00900000_00000003, "br !p0.x, #3"),
-	INSTR_6XX(03820000_00000015, "shps #21"), /* emit */
-	INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */
-	INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */
-	INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"),
-	INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"),
-	INSTR_6XX(07820000_00000000, "prede"),
-	INSTR_6XX(00800063_0000001e, "brac.3 #30"),
-	INSTR_6XX(06820000_00000000, "predt p0.x"),
-	INSTR_6XX(07020000_00000000, "predf p0.x"),
-	INSTR_6XX(07820000_00000000, "prede"),
-
-	/* cat1 */
-	INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"),
-	INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"),
-	INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"),
-	INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"),
-	INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"),
-	INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"),
-	INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"),
-	INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"),
-	INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"),
-	/* dEQP-VK.subgroups.ballot.compute.compute */
-	INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */
-
-	INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"),
-	INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"),
-	INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"),
-	INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"),
-
-	INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"),
-	INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"),
-	INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"),
-	INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"),
-
-	/* cat2 */
-	INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"),
-	INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"),
-	INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"),
-	INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"),
-	INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"),
-	INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"),
-	INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"),
-	INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"),
-	INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"),
-	INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"),
-	INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"),
-	INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"),
-	INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"),
-	INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"),
-
-	/* cat3 */
-	INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"),
-	INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"),
-	INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"),
-	INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"),
-	INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"),
-	INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"),
-	INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"),
-	INSTR_6XX(65900820_100cb008, "(nop3) shlg.b16 hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */
-	INSTR_6XX(65ae085c_0002a001, "(nop3) shlg.b16 hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */
-	INSTR_6XX(65900820_0c0aac05, "(nop3) shlg.b16 hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */
-
-	/* cat4 */
-	INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"),
-
-	/* cat5 */
-	/* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */
-	INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */
-	/* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */
-	INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */
-	/* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */
-	INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
-	INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
-
-	INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"),
-	INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"),  /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */
-	INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"),  /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */
-	INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"),
-	INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"),
-	INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"),
-	INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"),
-	INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"),
-	/* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */
-	INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"),
-	INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"),
-
-
-	/* cat6 */
-
-	INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */
-	INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1.  r1.x is offset in ibo, r0.x is value*/
-	/* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */
-	INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0.  r0.z is offset in ibo as src.  r1.z */
-	/* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */
-	INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0.  r0.w is offset in ibo as src, and dst */
-	/* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */
-	INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */
-	/* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */
-	INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */
-	/* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */
-	INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */
-
-	// TODO is this a real instruction?  Or float -6.0 ?
-	// INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true),
-	/* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */
-	INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */
-	INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */
-	INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"),
-	INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"),
-	INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"),
-
-	/* Customely crafted */
-	INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
-	INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
-
-	INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
-	INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
-	INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
-	INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
-	INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
-
-	/* Found in TCS/TES shaders of GTA V */
-	INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
-
-	/* Customely crafted */
-	INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
-
-	INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
-	INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
-	INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
-	INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
-
-	/* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */
-	INSTR_6XX(c7020020_01800000, "stc c[32], r0.x, 1", .parse_fail=true),
-	/* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
-	INSTR_6XX(c7060020_03800000, "stc c[32], r0.x, 3", .parse_fail=true),
-
-	/* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
-	INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */
-
-	INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 r0.z, r0.x, 2"),
+   /* cat0 */
+   INSTR_6XX(00000000_00000000, "nop"),
+   INSTR_6XX(00000200_00000000, "(rpt2)nop"),
+   INSTR_6XX(03000000_00000000, "end"),
+   INSTR_6XX(00800000_00000004, "br p0.x, #4"),
+   INSTR_6XX(00800000_fffffffc, "br p0.x, #-4"),
+   INSTR_6XX(00900000_00000003, "br !p0.x, #3"),
+   INSTR_6XX(03820000_00000015, "shps #21"), /* emit */
+   INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */
+   INSTR_6XX(02220000_00000004, "getlast.w8 #4"),
+   INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */
+   INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"),
+   INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"),
+   INSTR_6XX(07820000_00000000, "prede"),
+   INSTR_6XX(00800063_0000001e, "brac.3 #30"),
+   INSTR_6XX(06820000_00000000, "predt p0.x"),
+   INSTR_6XX(07020000_00000000, "predf p0.x"),
+   INSTR_6XX(07820000_00000000, "prede"),
+
+   /* cat1 */
+   INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"),
+   INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"),
+   INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"),
+   INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"),
+   INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"),
+   INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"),
+   INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"),
+   INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"),
+   INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"),
+   /* dEQP-VK.subgroups.ballot.compute.compute */
+   INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */
+
+   INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"),
+   INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"),
+   INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"),
+   INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"),
+
+   INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"),
+   INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"),
+   INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"),
+   INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"),
+
+   /* cat2 */
+   INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"),
+   INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"),
+   INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"),
+   INSTR_6XX(47348000_00002000, "flat.b (ei)r0.x, 0, r0.x"),
+   INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"),
+   INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"),
+   INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"),
+   INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"),
+   INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"),
+   INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"),
+   INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"),
+   INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"),
+   INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"),
+   INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"),
+   INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"),
+
+   /* cat3 */
+   INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"),
+   INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"),
+   INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"),
+   INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"),
+   INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"),
+   INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"),
+   INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"),
+   INSTR_6XX(65900820_100cb008, "(nop3) shlg hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */
+   INSTR_6XX(65ae085c_0002a001, "(nop3) shlg hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */
+   INSTR_6XX(65900820_0c0aac05, "(nop3) shlg hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */
+   INSTR_6XX(65ae0c5c_0002a001, "(nop3) shlg r23.x, r0.y, r23.x, r0.z"), /* (nop3) shlg.b32 r23.x, (r)r0.y, (r)r23.x, r0.z */
+   INSTR_6XX(64018802_0002e003, "(nop3) shrm hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+   INSTR_6XX(64818802_0002e003, "(nop3) shlm hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+   INSTR_6XX(65018802_0002e003, "(nop3) shrg hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+   INSTR_6XX(66018802_0002e003, "(nop3) andg hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+   INSTR_6XX(67018802_1002e003, "(nop3) wmm hr0.z, (neg)hr0.w, hr0.w, 2"), /* (nop3) wmm.f16f16 hr0.z, (abs)(r)hr0.w, (r)hr0.w, 2 */
+   INSTR_6XX(67018c02_1002e003, "(nop3) wmm.accu hr0.z, (neg)hr0.w, hr0.w, 2"),
+   INSTR_6XX(6701c802_9002a003, "(nop3) wmm r0.z, r0.w, r0.w, 2"), /* (nop3) wmm.f32f32 r0.z, (r)r0.w, (r)r0.w, 2 */
+   /* custom test with qcom_dot8 function from cl_qcom_dot_product8 */
+   INSTR_6XX(66818c02_0002e003, "(sat)(nop3) dp2acc.mixed.low r0.z, r0.w, r0.w, r0.z"), /* (nop3) dp2acc (sat)r0.z, (signed)(low)(r)r0.w, (low)(r)r0.w, r0.z */
+   INSTR_6XX(6681c802_8002a003, "(nop3) dp4acc.unsigned.low r0.z, r0.w, r0.w, (neg)r0.z"), /* (nop3) dp4acc r0.z, (unsigned)(r)r0.w, (r)r0.w, (neg)r0.z */
+
+   /* cat4 */
+   INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"),
+
+   /* cat5 */
+   /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */
+   INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */
+   /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */
+   INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */
+   /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */
+   INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
+   INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
+
+   INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"),
+   INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"),  /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */
+   INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"),  /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */
+   INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"),
+   INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"),
+   INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"),
+   INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"),
+   INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"),
+   /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */
+   INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"),
+   INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"),
+   INSTR_6XX(a0c81108_e2000001, "sam.base0 (f32)(x)r2.x, r0.x, s#16, a1.x"),
+   INSTR_6XX(a048d107_cc080a07, "isaml.base3 (s32)(x)r1.w, r0.w, r1.y, s#0, t#6"),
+
+
+   /* dEQP-VK.subgroups.arithmetic.compute.subgroupadd_float */
+   INSTR_6XX(a7c03102_00100003, "brcst.active.w8 (u32)(x)r0.z, r0.y"), /* brcst.active.w8 (u32)(xOOO)r0.z, r0.y */
+   /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */
+   INSTR_6XX(b7e03107_00000401, "(sy)quad_shuffle.brcst (u32)(x)r1.w, r0.x, r0.z"), /* (sy)quad_shuffle.brcst (u32)(xOOO)r1.w, r0.x, r0.z */
+   /* dEQP-VK.subgroups.quad.graphics.subgroupquadswapdiagonal_int */
+   INSTR_6XX(b7e03104_00180001, "(sy)quad_shuffle.diag (u32)(x)r1.x, r0.x"), /* (sy)quad_shuffle.diag (u32)(xOOO)r1.x, r0.x */
+
+   /* cat6 */
+
+   INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */
+   INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1.  r1.x is offset in ibo, r0.x is value*/
+   /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */
+   INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0.  r0.z is offset in ibo as src.  r1.z */
+   /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */
+   INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0.  r0.w is offset in ibo as src, and dst */
+   /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */
+   INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */
+   /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */
+   INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */
+   /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */
+   INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */
+
+   // TODO is this a real instruction?  Or float -6.0 ?
+   // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true),
+   /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */
+   INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */
+   INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */
+   INSTR_6XX(c0dc052e_01800042, "stg.a.u8 g[r0.z+(r11.z)<<2], hr8.y, 1"),
+   INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"),
+   INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"),
+   INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"),
+   INSTR_5XX(c0ce0100_02800000, "stg.s8 g[r0.x], hr0.x, 2"),
+   INSTR_5XX(c0c00100_02800000, "stg.f16 g[r0.x], hr0.x, 2"),
+
+   /* Custom crafted */
+   INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
+   INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
+
+   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+   INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
+   INSTR_6XX(c0040003_0180c269, "ldg.u16 hr0.w, g[r0.w+308], 1"),
+
+   /* Found in TCS/TES shaders of GTA V */
+   INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
+
+   /* Custom crafted */
+   INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
+
+   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+   INSTR_6XX(c0000006_01c18017, "ldg.a.f16 hr1.z, g[r1.z+(r2.w)<<2], 1"),
+   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+
+   /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */
+   INSTR_6XX(c7020020_01800000, "stc.f32 c[32], r0.x, 1"), /* stc c[32], r0.x, 1 */
+   /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
+   INSTR_6XX(c7060020_03800000, "stc.u32 c[32], r0.x, 3"), /* stc c[32], r0.x, 3 */
+
+   /* custom */
+   INSTR_6XX(c7060100_03800000, "stc.u32 c[a1.x], r0.x, 3"), /* stc c[a1.x], r0.x, 3 */
+   INSTR_6XX(c7060120_03800000, "stc.u32 c[a1.x+32], r0.x, 3"), /* stc c[a1.x+32], r0.x, 3 */
+
+   /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
+   INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */
+
+   INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 hr0.z, r0.x, 2"),
 #if 0
    /* TODO blob sometimes/frequently sets b0, although there does not seem
     * to be an obvious pattern and our encoding never sets it.  AFAICT it
@@ -298,6 +332,13 @@ static const struct test {
    INSTR_6XX(c0260000_00c78080, "ldc.offset0.1.nonuniform r0.x, 0, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */
    INSTR_6XX(c0260201_00c78080, "ldc.offset0.1.nonuniform r0.y, 0, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */
 
+   /* a4xx-a5xx has the exact same instrs in
+    * dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.(dynamically_)uniform_fragment
+    * with no change based on the mode. Note that we can't decode this yet.
+    */
+   /* INSTR_4XX(c7860000_00810001), */ /* ldc.1 r0.x, g[r1.x], 0, r0.x */
+   /* INSTR_5XX(c7860000_00800000), */ /* ldc.a.1 r0.x, g[r0.x], 0, r0.x */
+
    /* custom */
    INSTR_6XX(c0260201_ffc78080, "ldc.offset0.1.nonuniform r0.y, 255, r0.y"), /* ldc.1.mode2.base0 r0.y, 255, r0.y */
 
@@ -307,6 +348,11 @@ static const struct test {
    INSTR_6XX(c0260000_00478400, "ldc.offset2.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */
    INSTR_6XX(c0260000_00478600, "ldc.offset3.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */
 
+   /* dEQP-VK.glsl.conditionals.if.if_else_vertex */
+   INSTR_6XX(c0360000_00c78100, "ldc.1.k.imm.base0 c[a1.x], 0, 0"), /* ldc.1.k.mode4.base0 c[a1.x], 0, 0 */
+   /* custom */
+   INSTR_6XX(c0360003_00c78100, "ldc.4.k.imm.base0 c[a1.x], 0, 0"), /* ldc.4.k.mode4.base0 c[a1.x], 0, 0 */
+
    /* dEQP-VK.glsl.struct.local.nested_struct_array_dynamic_index_fragment */
    INSTR_6XX(c1425b50_01803e02, "stp.f32 p[r11.y-176], r0.y, 1"),
    INSTR_6XX(c1425b98_02803e14, "stp.f32 p[r11.y-104], r2.z, 2"),
@@ -318,14 +364,17 @@ static const struct test {
    /* Atomic: */
 #if 0
    /* TODO our encoding differs in b53 for these two */
-   INSTR_5XX(c4d60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
-   INSTR_5XX(c4160205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
+   INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
+   INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
 #else
-   INSTR_5XX(c4f60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
-   INSTR_5XX(c4360205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
+   INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
+   INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
 #endif
    INSTR_6XX(d5c60003_03008001, "(sy)atomic.max.untyped.1d.u32.1.l r0.w, l[r0.z], r0.w"),
 
+   /* dEQP-VK.glsl.atomic_operations.add_unsigned_compute_reference */
+   INSTR_6XX(c4160002_02000001, "atomic.g.add.untyped.1d.u32.1.g r0.z, r0.x, r0.z"),
+
    /* Bindless atomic: */
    INSTR_6XX(c03a0003_01640000, "atomic.b.add.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.add.g.s32.1d.mode0.base0 r0.w,r0.y,0 */
    INSTR_6XX(c03a0003_01660000, "atomic.b.and.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.and.g.s32.1d.mode0.base0 r0.w,r0.y,0 */
@@ -333,10 +382,14 @@ static const struct test {
 
    /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_literal.fragment.sampler2d */
    INSTR_6XX(a0c01f04_0cc00005, "sam (f32)(xyzw)r1.x, r0.z, s#6, t#6"),
-   /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d (looks like maybe the compiler didn't figure out */
-   INSTR_6XX(a0c81f07_0100000b, "sam.s2en (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */
+
+   /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d */
+   INSTR_4XX(a0c81f02_00800001, "sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.mode0 (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */
+   INSTR_6XX(a0c81f07_0100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */
+
    /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.dynamically_uniform.fragment.sampler2d */
-   INSTR_6XX(a0c81f07_8100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x", .parse_fail=true), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */
+   INSTR_4XX(a0c81f02_80800001, "sam.s2en.nonuniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */
+   INSTR_6XX(a0c81f07_8100000b, "sam.s2en.nonuniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */
 
    /* NonUniform: */
    /* dEQP-VK.descriptor_indexing.storage_buffer */
@@ -349,6 +402,9 @@ static const struct test {
    /* dEQP-VK.descriptor_indexing.sampler */
    INSTR_6XX(a0c81f00_40000005, "sam.s2en.nonuniform.base0 (f32)(xyzw)r0.x, r0.z, r0.x"),
 
+   /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */
+   INSTR_6XX(c0260001_00c98000, "getfiberid.u32 r0.y"),
+
    /* Custom test since we've never seen the blob emit these. */
    INSTR_6XX(c0260004_00490000, "getspid.u32 r1.x"),
    INSTR_6XX(c0260005_00494000, "getwid.u32 r1.y"),
@@ -416,7 +472,6 @@ main(int argc, char **argv)
          printf("  Got:      \"%s\"\n", disasm_output);
          retval = 1;
          decode_fails++;
-         continue;
       }
 
       /*
@@ -426,7 +481,8 @@ main(int argc, char **argv)
       unsigned gen = test->gpu_id / 100;
       if (!compilers[gen]) {
          dev_ids[gen].gpu_id = test->gpu_id;
-         compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen], false);
+         compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen],
+                                              &(struct ir3_compiler_options){});
       }
 
       FILE *fasm =
diff --git a/lib/mesa/src/freedreno/registers/dsi/dsi_phy_5nm.xml b/lib/mesa/src/freedreno/registers/dsi/dsi_phy_5nm.xml
deleted file mode 100644
index 7e3505b9e..000000000
--- a/lib/mesa/src/freedreno/registers/dsi/dsi_phy_5nm.xml
+++ /dev/null
@@ -1,228 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<database xmlns="http://nouveau.freedesktop.org/"
-xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
-<import file="freedreno_copyright.xml"/>
-
-<domain name="DSI_5nm_PHY_CMN" width="32">
-	<reg32 offset="0x00000" name="REVISION_ID0"/>
-	<reg32 offset="0x00004" name="REVISION_ID1"/>
-	<reg32 offset="0x00008" name="REVISION_ID2"/>
-	<reg32 offset="0x0000c" name="REVISION_ID3"/>
-	<reg32 offset="0x00010" name="CLK_CFG0"/>
-	<reg32 offset="0x00014" name="CLK_CFG1"/>
-	<reg32 offset="0x00018" name="GLBL_CTRL"/>
-	<reg32 offset="0x0001c" name="RBUF_CTRL"/>
-	<reg32 offset="0x00020" name="VREG_CTRL_0"/>
-	<reg32 offset="0x00024" name="CTRL_0"/>
-	<reg32 offset="0x00028" name="CTRL_1"/>
-	<reg32 offset="0x0002c" name="CTRL_2"/>
-	<reg32 offset="0x00030" name="CTRL_3"/>
-	<reg32 offset="0x00034" name="LANE_CFG0"/>
-	<reg32 offset="0x00038" name="LANE_CFG1"/>
-	<reg32 offset="0x0003c" name="PLL_CNTRL"/>
-	<reg32 offset="0x00040" name="DPHY_SOT"/>
-	<reg32 offset="0x000a0" name="LANE_CTRL0"/>
-	<reg32 offset="0x000a4" name="LANE_CTRL1"/>
-	<reg32 offset="0x000a8" name="LANE_CTRL2"/>
-	<reg32 offset="0x000ac" name="LANE_CTRL3"/>
-	<reg32 offset="0x000b0" name="LANE_CTRL4"/>
-	<reg32 offset="0x000b4" name="TIMING_CTRL_0"/>
-	<reg32 offset="0x000b8" name="TIMING_CTRL_1"/>
-	<reg32 offset="0x000bc" name="TIMING_CTRL_2"/>
-	<reg32 offset="0x000c0" name="TIMING_CTRL_3"/>
-	<reg32 offset="0x000c4" name="TIMING_CTRL_4"/>
-	<reg32 offset="0x000c8" name="TIMING_CTRL_5"/>
-	<reg32 offset="0x000cc" name="TIMING_CTRL_6"/>
-	<reg32 offset="0x000d0" name="TIMING_CTRL_7"/>
-	<reg32 offset="0x000d4" name="TIMING_CTRL_8"/>
-	<reg32 offset="0x000d8" name="TIMING_CTRL_9"/>
-	<reg32 offset="0x000dc" name="TIMING_CTRL_10"/>
-	<reg32 offset="0x000e0" name="TIMING_CTRL_11"/>
-	<reg32 offset="0x000e4" name="TIMING_CTRL_12"/>
-	<reg32 offset="0x000e8" name="TIMING_CTRL_13"/>
-	<reg32 offset="0x000ec" name="GLBL_HSTX_STR_CTRL_0"/>
-	<reg32 offset="0x000f0" name="GLBL_HSTX_STR_CTRL_1"/>
-	<reg32 offset="0x000f4" name="GLBL_RESCODE_OFFSET_TOP_CTRL"/>
-	<reg32 offset="0x000f8" name="GLBL_RESCODE_OFFSET_BOT_CTRL"/>
-	<reg32 offset="0x000fc" name="GLBL_RESCODE_OFFSET_MID_CTRL"/>
-	<reg32 offset="0x00100" name="GLBL_LPTX_STR_CTRL"/>
-	<reg32 offset="0x00104" name="GLBL_PEMPH_CTRL_0"/>
-	<reg32 offset="0x00108" name="GLBL_PEMPH_CTRL_1"/>
-	<reg32 offset="0x0010c" name="GLBL_STR_SWI_CAL_SEL_CTRL"/>
-	<reg32 offset="0x00110" name="VREG_CTRL_1"/>
-	<reg32 offset="0x00114" name="CTRL_4"/>
-	<reg32 offset="0x00140" name="PHY_STATUS"/>
-	<reg32 offset="0x00148" name="LANE_STATUS0"/>
-	<reg32 offset="0x0014c" name="LANE_STATUS1"/>
-</domain>
-
-<domain name="DSI_5nm_PHY" width="32">
-	<array offset="0x00000" name="LN" length="5" stride="0x80">
-		<reg32 offset="0x00" name="CFG0"/>
-		<reg32 offset="0x04" name="CFG1"/>
-		<reg32 offset="0x08" name="CFG2"/>
-		<reg32 offset="0x0c" name="TEST_DATAPATH"/>
-		<reg32 offset="0x10" name="PIN_SWAP"/>
-		<reg32 offset="0x14" name="LPRX_CTRL"/>
-		<reg32 offset="0x18" name="TX_DCTRL"/>
-	</array>
-</domain>
-
-<domain name="DSI_5nm_PHY_PLL" width="32">
-	<reg32 offset="0x0000" name="ANALOG_CONTROLS_ONE"/>
-	<reg32 offset="0x0004" name="ANALOG_CONTROLS_TWO"/>
-	<reg32 offset="0x0008" name="INT_LOOP_SETTINGS"/>
-	<reg32 offset="0x000c" name="INT_LOOP_SETTINGS_TWO"/>
-	<reg32 offset="0x0010" name="ANALOG_CONTROLS_THREE"/>
-	<reg32 offset="0x0014" name="ANALOG_CONTROLS_FOUR"/>
-	<reg32 offset="0x0018" name="ANALOG_CONTROLS_FIVE"/>
-	<reg32 offset="0x001c" name="INT_LOOP_CONTROLS"/>
-	<reg32 offset="0x0020" name="DSM_DIVIDER"/>
-	<reg32 offset="0x0024" name="FEEDBACK_DIVIDER"/>
-	<reg32 offset="0x0028" name="SYSTEM_MUXES"/>
-	<reg32 offset="0x002c" name="FREQ_UPDATE_CONTROL_OVERRIDES"/>
-	<reg32 offset="0x0030" name="CMODE"/>
-	<reg32 offset="0x0034" name="PSM_CTRL"/>
-	<reg32 offset="0x0038" name="RSM_CTRL"/>
-	<reg32 offset="0x003c" name="VCO_TUNE_MAP"/>
-	<reg32 offset="0x0040" name="PLL_CNTRL"/>
-	<reg32 offset="0x0044" name="CALIBRATION_SETTINGS"/>
-	<reg32 offset="0x0048" name="BAND_SEL_CAL_TIMER_LOW"/>
-	<reg32 offset="0x004c" name="BAND_SEL_CAL_TIMER_HIGH"/>
-	<reg32 offset="0x0050" name="BAND_SEL_CAL_SETTINGS"/>
-	<reg32 offset="0x0054" name="BAND_SEL_MIN"/>
-	<reg32 offset="0x0058" name="BAND_SEL_MAX"/>
-	<reg32 offset="0x005c" name="BAND_SEL_PFILT"/>
-	<reg32 offset="0x0060" name="BAND_SEL_IFILT"/>
-	<reg32 offset="0x0064" name="BAND_SEL_CAL_SETTINGS_TWO"/>
-	<reg32 offset="0x0068" name="BAND_SEL_CAL_SETTINGS_THREE"/>
-	<reg32 offset="0x006c" name="BAND_SEL_CAL_SETTINGS_FOUR"/>
-	<reg32 offset="0x0070" name="BAND_SEL_ICODE_HIGH"/>
-	<reg32 offset="0x0074" name="BAND_SEL_ICODE_LOW"/>
-	<reg32 offset="0x0078" name="FREQ_DETECT_SETTINGS_ONE"/>
-	<reg32 offset="0x007c" name="FREQ_DETECT_THRESH"/>
-	<reg32 offset="0x0080" name="FREQ_DET_REFCLK_HIGH"/>
-	<reg32 offset="0x0084" name="FREQ_DET_REFCLK_LOW"/>
-	<reg32 offset="0x0088" name="FREQ_DET_PLLCLK_HIGH"/>
-	<reg32 offset="0x008c" name="FREQ_DET_PLLCLK_LOW"/>
-	<reg32 offset="0x0090" name="PFILT"/>
-	<reg32 offset="0x0094" name="IFILT"/>
-	<reg32 offset="0x0098" name="PLL_GAIN"/>
-	<reg32 offset="0x009c" name="ICODE_LOW"/>
-	<reg32 offset="0x00a0" name="ICODE_HIGH"/>
-	<reg32 offset="0x00a4" name="LOCKDET"/>
-	<reg32 offset="0x00a8" name="OUTDIV"/>
-	<reg32 offset="0x00ac" name="FASTLOCK_CONTROL"/>
-	<reg32 offset="0x00b0" name="PASS_OUT_OVERRIDE_ONE"/>
-	<reg32 offset="0x00b4" name="PASS_OUT_OVERRIDE_TWO"/>
-	<reg32 offset="0x00b8" name="CORE_OVERRIDE"/>
-	<reg32 offset="0x00bc" name="CORE_INPUT_OVERRIDE"/>
-	<reg32 offset="0x00c0" name="RATE_CHANGE"/>
-	<reg32 offset="0x00c4" name="PLL_DIGITAL_TIMERS"/>
-	<reg32 offset="0x00c8" name="PLL_DIGITAL_TIMERS_TWO"/>
-	<reg32 offset="0x00cc" name="DECIMAL_DIV_START"/>
-	<reg32 offset="0x00d0" name="FRAC_DIV_START_LOW"/>
-	<reg32 offset="0x00d4" name="FRAC_DIV_START_MID"/>
-	<reg32 offset="0x00d8" name="FRAC_DIV_START_HIGH"/>
-	<reg32 offset="0x00dc" name="DEC_FRAC_MUXES"/>
-	<reg32 offset="0x00e0" name="DECIMAL_DIV_START_1"/>
-	<reg32 offset="0x00e4" name="FRAC_DIV_START_LOW_1"/>
-	<reg32 offset="0x00e8" name="FRAC_DIV_START_MID_1"/>
-	<reg32 offset="0x00ec" name="FRAC_DIV_START_HIGH_1"/>
-	<reg32 offset="0x00f0" name="DECIMAL_DIV_START_2"/>
-	<reg32 offset="0x00f4" name="FRAC_DIV_START_LOW_2"/>
-	<reg32 offset="0x00f8" name="FRAC_DIV_START_MID_2"/>
-	<reg32 offset="0x00fc" name="FRAC_DIV_START_HIGH_2"/>
-	<reg32 offset="0x0100" name="MASH_CONTROL"/>
-	<reg32 offset="0x0104" name="SSC_STEPSIZE_LOW"/>
-	<reg32 offset="0x0108" name="SSC_STEPSIZE_HIGH"/>
-	<reg32 offset="0x010c" name="SSC_DIV_PER_LOW"/>
-	<reg32 offset="0x0110" name="SSC_DIV_PER_HIGH"/>
-	<reg32 offset="0x0114" name="SSC_ADJPER_LOW"/>
-	<reg32 offset="0x0118" name="SSC_ADJPER_HIGH"/>
-	<reg32 offset="0x011c" name="SSC_MUX_CONTROL"/>
-	<reg32 offset="0x0120" name="SSC_STEPSIZE_LOW_1"/>
-	<reg32 offset="0x0124" name="SSC_STEPSIZE_HIGH_1"/>
-	<reg32 offset="0x0128" name="SSC_DIV_PER_LOW_1"/>
-	<reg32 offset="0x012c" name="SSC_DIV_PER_HIGH_1"/>
-	<reg32 offset="0x0130" name="SSC_ADJPER_LOW_1"/>
-	<reg32 offset="0x0134" name="SSC_ADJPER_HIGH_1"/>
-	<reg32 offset="0x0138" name="SSC_STEPSIZE_LOW_2"/>
-	<reg32 offset="0x013c" name="SSC_STEPSIZE_HIGH_2"/>
-	<reg32 offset="0x0140" name="SSC_DIV_PER_LOW_2"/>
-	<reg32 offset="0x0144" name="SSC_DIV_PER_HIGH_2"/>
-	<reg32 offset="0x0148" name="SSC_ADJPER_LOW_2"/>
-	<reg32 offset="0x014c" name="SSC_ADJPER_HIGH_2"/>
-	<reg32 offset="0x0150" name="SSC_CONTROL"/>
-	<reg32 offset="0x0154" name="PLL_OUTDIV_RATE"/>
-	<reg32 offset="0x0158" name="PLL_LOCKDET_RATE_1"/>
-	<reg32 offset="0x015c" name="PLL_LOCKDET_RATE_2"/>
-	<reg32 offset="0x0160" name="PLL_PROP_GAIN_RATE_1"/>
-	<reg32 offset="0x0164" name="PLL_PROP_GAIN_RATE_2"/>
-	<reg32 offset="0x0168" name="PLL_BAND_SEL_RATE_1"/>
-	<reg32 offset="0x016c" name="PLL_BAND_SEL_RATE_2"/>
-	<reg32 offset="0x0170" name="PLL_INT_GAIN_IFILT_BAND_1"/>
-	<reg32 offset="0x0174" name="PLL_INT_GAIN_IFILT_BAND_2"/>
-	<reg32 offset="0x0178" name="PLL_FL_INT_GAIN_PFILT_BAND_1"/>
-	<reg32 offset="0x017c" name="PLL_FL_INT_GAIN_PFILT_BAND_2"/>
-	<reg32 offset="0x0180" name="PLL_FASTLOCK_EN_BAND"/>
-	<reg32 offset="0x0184" name="FREQ_TUNE_ACCUM_INIT_MID"/>
-	<reg32 offset="0x0188" name="FREQ_TUNE_ACCUM_INIT_HIGH"/>
-	<reg32 offset="0x018c" name="FREQ_TUNE_ACCUM_INIT_MUX"/>
-	<reg32 offset="0x0190" name="PLL_LOCK_OVERRIDE"/>
-	<reg32 offset="0x0194" name="PLL_LOCK_DELAY"/>
-	<reg32 offset="0x0198" name="PLL_LOCK_MIN_DELAY"/>
-	<reg32 offset="0x019c" name="CLOCK_INVERTERS"/>
-	<reg32 offset="0x01a0" name="SPARE_AND_JPC_OVERRIDES"/>
-	<reg32 offset="0x01a4" name="BIAS_CONTROL_1"/>
-	<reg32 offset="0x01a8" name="BIAS_CONTROL_2"/>
-	<reg32 offset="0x01ac" name="ALOG_OBSV_BUS_CTRL_1"/>
-	<reg32 offset="0x01b0" name="COMMON_STATUS_ONE"/>
-	<reg32 offset="0x01b4" name="COMMON_STATUS_TWO"/>
-	<reg32 offset="0x01b8" name="BAND_SEL_CAL"/>
-	<reg32 offset="0x01bc" name="ICODE_ACCUM_STATUS_LOW"/>
-	<reg32 offset="0x01c0" name="ICODE_ACCUM_STATUS_HIGH"/>
-	<reg32 offset="0x01c4" name="FD_OUT_LOW"/>
-	<reg32 offset="0x01c8" name="FD_OUT_HIGH"/>
-	<reg32 offset="0x01cc" name="ALOG_OBSV_BUS_STATUS_1"/>
-	<reg32 offset="0x01d0" name="PLL_MISC_CONFIG"/>
-	<reg32 offset="0x01d4" name="FLL_CONFIG"/>
-	<reg32 offset="0x01d8" name="FLL_FREQ_ACQ_TIME"/>
-	<reg32 offset="0x01dc" name="FLL_CODE0"/>
-	<reg32 offset="0x01e0" name="FLL_CODE1"/>
-	<reg32 offset="0x01e4" name="FLL_GAIN0"/>
-	<reg32 offset="0x01e8" name="FLL_GAIN1"/>
-	<reg32 offset="0x01ec" name="SW_RESET"/>
-	<reg32 offset="0x01f0" name="FAST_PWRUP"/>
-	<reg32 offset="0x01f4" name="LOCKTIME0"/>
-	<reg32 offset="0x01f8" name="LOCKTIME1"/>
-	<reg32 offset="0x01fc" name="DEBUG_BUS_SEL"/>
-	<reg32 offset="0x0200" name="DEBUG_BUS0"/>
-	<reg32 offset="0x0204" name="DEBUG_BUS1"/>
-	<reg32 offset="0x0208" name="DEBUG_BUS2"/>
-	<reg32 offset="0x020c" name="DEBUG_BUS3"/>
-	<reg32 offset="0x0210" name="ANALOG_FLL_CONTROL_OVERRIDES"/>
-	<reg32 offset="0x0214" name="VCO_CONFIG"/>
-	<reg32 offset="0x0218" name="VCO_CAL_CODE1_MODE0_STATUS"/>
-	<reg32 offset="0x021c" name="VCO_CAL_CODE1_MODE1_STATUS"/>
-	<reg32 offset="0x0220" name="RESET_SM_STATUS"/>
-	<reg32 offset="0x0224" name="TDC_OFFSET"/>
-	<reg32 offset="0x0228" name="PS3_PWRDOWN_CONTROLS"/>
-	<reg32 offset="0x022c" name="PS4_PWRDOWN_CONTROLS"/>
-	<reg32 offset="0x0230" name="PLL_RST_CONTROLS"/>
-	<reg32 offset="0x0234" name="GEAR_BAND_SELECT_CONTROLS"/>
-	<reg32 offset="0x0238" name="PSM_CLK_CONTROLS"/>
-	<reg32 offset="0x023c" name="SYSTEM_MUXES_2"/>
-	<reg32 offset="0x0240" name="VCO_CONFIG_1"/>
-	<reg32 offset="0x0244" name="VCO_CONFIG_2"/>
-	<reg32 offset="0x0248" name="CLOCK_INVERTERS_1"/>
-	<reg32 offset="0x024c" name="CLOCK_INVERTERS_2"/>
-	<reg32 offset="0x0250" name="CMODE_1"/>
-	<reg32 offset="0x0254" name="CMODE_2"/>
-	<reg32 offset="0x0258" name="ANALOG_CONTROLS_FIVE_1"/>
-	<reg32 offset="0x025c" name="ANALOG_CONTROLS_FIVE_2"/>
-	<reg32 offset="0x0260" name="PERF_OPTIMIZE"/>
-</domain>
-
-</database>
diff --git a/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c b/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c
index 8d38a8fd0..6caa31beb 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c
@@ -30,27 +30,27 @@ tu_pack_float32_for_unorm(float val, int bits)
 /* r2d_ = BLIT_OP_SCALE operations */
 
 static enum a6xx_2d_ifmt
-format_to_ifmt(VkFormat format)
+format_to_ifmt(enum pipe_format format)
 {
-   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
-       format == VK_FORMAT_X8_D24_UNORM_PACK32)
+   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+       format == PIPE_FORMAT_Z24X8_UNORM)
       return R2D_UNORM8;
 
    /* get_component_bits doesn't work with depth/stencil formats: */
-   if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
+   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
       return R2D_FLOAT32;
-   if (format == VK_FORMAT_S8_UINT)
+   if (format == PIPE_FORMAT_S8_UINT)
       return R2D_INT8;
 
    /* use the size of the red channel to find the corresponding "ifmt" */
-   bool is_int = vk_format_is_int(format);
-   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
+   bool is_int = util_format_is_pure_integer(format);
+   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
    case 4: case 5: case 8:
       return is_int ? R2D_INT8 : R2D_UNORM8;
    case 10: case 11:
       return is_int ? R2D_INT16 : R2D_FLOAT16;
    case 16:
-      if (vk_format_is_float(format))
+      if (util_format_is_float(format))
          return R2D_FLOAT16;
       return is_int ? R2D_INT16 : R2D_FLOAT32;
    case 32:
@@ -82,38 +82,38 @@ r2d_coords(struct tu_cs *cs,
 }
 
 static void
-r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
+r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
 {
    uint32_t clear_value[4] = {};
 
    switch (format) {
-   case VK_FORMAT_X8_D24_UNORM_PACK32:
-   case VK_FORMAT_D24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24X8_UNORM:
       /* cleared as r8g8b8a8_unorm using special format */
       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
       clear_value[1] = clear_value[0] >> 8;
       clear_value[2] = clear_value[0] >> 16;
       clear_value[3] = val->depthStencil.stencil;
       break;
-   case VK_FORMAT_D16_UNORM:
-   case VK_FORMAT_D32_SFLOAT:
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_Z32_FLOAT:
       /* R2D_FLOAT32 */
       clear_value[0] = fui(val->depthStencil.depth);
       break;
-   case VK_FORMAT_S8_UINT:
+   case PIPE_FORMAT_S8_UINT:
       clear_value[0] = val->depthStencil.stencil;
       break;
-   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+   case PIPE_FORMAT_R9G9B9E5_FLOAT:
       /* cleared as UINT32 */
       clear_value[0] = float3_to_rgb9e5(val->color.float32);
       break;
    default:
-      assert(!vk_format_is_depth_or_stencil(format));
-      const struct util_format_description *desc = vk_format_description(format);
+      assert(!util_format_is_depth_or_stencil(format));
+      const struct util_format_description *desc = util_format_description(format);
       enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
 
       assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
-                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
+                      format == PIPE_FORMAT_R11G11B10_FLOAT));
 
       for (unsigned i = 0; i < desc->nr_channels; i++) {
          const struct util_format_channel_description *ch = &desc->channel[i];
@@ -144,7 +144,7 @@ r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
 static void
 r2d_src(struct tu_cmd_buffer *cmd,
         struct tu_cs *cs,
-        const struct tu_image_view *iview,
+        const struct fdl6_view *iview,
         uint32_t layer,
         VkFilter filter)
 {
@@ -162,6 +162,24 @@ r2d_src(struct tu_cmd_buffer *cmd,
 }
 
 static void
+r2d_src_depth(struct tu_cmd_buffer *cmd, /* not referenced in this body */
+                struct tu_cs *cs, /* command stream every packet below is emitted into */
+                const struct tu_image_view *iview, /* supplies the depth_* fields and the embedded view */
+                uint32_t layer, /* multiplies depth_layer_size to pick the layer's address */
+                VkFilter filter) /* not referenced in this body */
+{ /* Emit the 2D source registers (SP_PS_2D_SRC_INFO.., SP_PS_2D_SRC_FLAGS..) from the view's depth_* fields. */
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); /* 5 presumably = info + size + 64-bit address (2 dwords) + pitch */
+   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO)); /* SRC_INFO value obtained through tu_image_view_depth() */
+   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); /* size is taken from the embedded view unchanged */
+   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); /* address of the requested layer */
+   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
+   tu_cs_emit(cs, iview->depth_PITCH << 9); /* hence the shift by 9 before emitting */
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
+   tu_cs_image_flag_ref(cs, &iview->view, layer); /* flag reference comes from the embedded view, not the depth_* fields */
+}
+
+static void
 r2d_src_stencil(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 const struct tu_image_view *iview,
@@ -170,7 +188,7 @@ r2d_src_stencil(struct tu_cmd_buffer *cmd,
 {
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
    tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
-   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
+   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
    tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
    /* SP_PS_2D_SRC_PITCH has shifted pitch field */
    tu_cs_emit(cs, iview->stencil_PITCH << 9);
@@ -179,17 +197,17 @@ r2d_src_stencil(struct tu_cmd_buffer *cmd,
 static void
 r2d_src_buffer(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
-               VkFormat vk_format,
+               enum pipe_format format,
                uint64_t va, uint32_t pitch,
                uint32_t width, uint32_t height)
 {
-   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
+   struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
 
    tu_cs_emit_regs(cs,
                    A6XX_SP_PS_2D_SRC_INFO(
-                      .color_format = format.fmt,
-                      .color_swap = format.swap,
-                      .srgb = vk_format_is_srgb(vk_format),
+                      .color_format = fmt.fmt,
+                      .color_swap = fmt.swap,
+                      .srgb = util_format_is_srgb(format),
                       .unk20 = 1,
                       .unk22 = 1),
                    A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
@@ -198,7 +216,7 @@ r2d_src_buffer(struct tu_cmd_buffer *cmd,
 }
 
 static void
-r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
+r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer)
 {
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
    tu_cs_emit(cs, iview->RB_2D_DST_INFO);
@@ -209,6 +227,18 @@ r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
 }
 
 static void
+r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
+{
+   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
+   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
+   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
+   tu_cs_emit(cs, iview->depth_PITCH);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
+   tu_cs_image_flag_ref(cs, &iview->view, layer);
+}
+
+static void
 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
 {
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
@@ -218,15 +248,15 @@ r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t la
 }
 
 static void
-r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
+r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch)
 {
-   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
+   struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
 
    tu_cs_emit_regs(cs,
                    A6XX_RB_2D_DST_INFO(
-                      .color_format = format.fmt,
-                      .color_swap = format.swap,
-                      .srgb = vk_format_is_srgb(vk_format)),
+                      .color_format = fmt.fmt,
+                      .color_swap = fmt.swap,
+                      .srgb = util_format_is_srgb(format)),
                    A6XX_RB_2D_DST(.qword = va),
                    A6XX_RB_2D_DST_PITCH(pitch));
 }
@@ -234,24 +264,25 @@ r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch
 static void
 r2d_setup_common(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
-                 VkFormat vk_format,
+                 enum pipe_format format,
                  VkImageAspectFlags aspect_mask,
                  unsigned blit_param,
                  bool clear,
                  bool ubwc,
                  bool scissor)
 {
-   enum a6xx_format format = tu6_base_format(vk_format);
-   enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
+   enum a6xx_format fmt = tu6_base_format(format);
+   enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
+
    uint32_t unknown_8c01 = 0;
 
-   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
-        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
-      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
+   if ((format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+       format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
+      fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
    }
 
    /* note: the only format with partial clearing is D24S8 */
-   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
+   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
       /* preserve stencil channel */
       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
          unknown_8c01 = 0x08000041;
@@ -267,10 +298,10 @@ r2d_setup_common(struct tu_cmd_buffer *cmd,
          .scissor = scissor,
          .rotate = blit_param,
          .solid_color = clear,
-         .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
-         .color_format = format,
+         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
+         .color_format = fmt,
          .mask = 0xf,
-         .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
+         .ifmt = util_format_is_srgb(format) ? R2D_UNORM8_SRGB : ifmt,
       ).value;
 
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
@@ -279,21 +310,21 @@ r2d_setup_common(struct tu_cmd_buffer *cmd,
    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
    tu_cs_emit(cs, blit_cntl);
 
-   if (format == FMT6_10_10_10_2_UNORM_DEST)
-      format = FMT6_16_16_16_16_FLOAT;
+   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
+      fmt = FMT6_16_16_16_16_FLOAT;
 
    tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
-         .sint = vk_format_is_sint(vk_format),
-         .uint = vk_format_is_uint(vk_format),
-         .color_format = format,
-         .srgb = vk_format_is_srgb(vk_format),
+         .sint = util_format_is_pure_sint(format),
+         .uint = util_format_is_pure_uint(format),
+         .color_format = fmt,
+         .srgb = util_format_is_srgb(format),
          .mask = 0xf));
 }
 
 static void
 r2d_setup(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
-          VkFormat vk_format,
+          enum pipe_format format,
           VkImageAspectFlags aspect_mask,
           unsigned blit_param,
           bool clear,
@@ -302,9 +333,11 @@ r2d_setup(struct tu_cmd_buffer *cmd,
 {
    assert(samples == VK_SAMPLE_COUNT_1_BIT);
 
-   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
+   if (!cmd->state.pass) {
+      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
+   }
 
-   r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false);
+   r2d_setup_common(cmd, cs, format, aspect_mask, blit_param, clear, ubwc, false);
 }
 
 static void
@@ -546,21 +579,25 @@ compile_shader(struct tu_device *dev, struct nir_shader *nir,
 
    ir3_finalize_nir(dev->compiler, nir);
 
-   struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir,
-                                               align(consts, 4), NULL);
+   struct ir3_shader *sh =
+      ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
+                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
+                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
+                              .reserved_user_consts = align(consts, 4),
+                          }, NULL);
 
    struct ir3_shader_key key = {};
    bool created;
    struct ir3_shader_variant *so =
       ir3_shader_get_variant(sh, &key, false, false, &created);
 
-   struct tu6_global *global = dev->global_bo.map;
+   struct tu6_global *global = dev->global_bo->map;
 
    assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
    dev->global_shaders[idx] = so;
    memcpy(&global->shaders[*offset], so->bin,
           sizeof(uint32_t) * so->info.sizedwords);
-   dev->global_shader_va[idx] = dev->global_bo.iova +
+   dev->global_shader_va[idx] = dev->global_bo->iova +
       gb_offset(shaders[*offset]);
    *offset += align(so->info.sizedwords, 32);
 }
@@ -749,7 +786,7 @@ r3d_coords(struct tu_cs *cs,
 }
 
 static void
-r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
+r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
 {
    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
@@ -760,8 +797,8 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
    switch (format) {
-   case VK_FORMAT_X8_D24_UNORM_PACK32:
-   case VK_FORMAT_D24_UNORM_S8_UINT: {
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
       /* cleared as r8g8b8a8_unorm using special format */
       uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
       tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
@@ -769,14 +806,14 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
       tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
       tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
    } break;
-   case VK_FORMAT_D16_UNORM:
-   case VK_FORMAT_D32_SFLOAT:
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_Z32_FLOAT:
       tu_cs_emit(cs, fui(val->depthStencil.depth));
       tu_cs_emit(cs, 0);
       tu_cs_emit(cs, 0);
       tu_cs_emit(cs, 0);
       break;
-   case VK_FORMAT_S8_UINT:
+   case PIPE_FORMAT_S8_UINT:
       tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
       tu_cs_emit(cs, 0);
       tu_cs_emit(cs, 0);
@@ -784,7 +821,7 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
       break;
    default:
       /* as color formats use clear value as-is */
-      assert(!vk_format_is_depth_or_stencil(format));
+      assert(!util_format_is_depth_or_stencil(format));
       tu_cs_emit_array(cs, val->color.uint32, 4);
       break;
    }
@@ -823,7 +860,6 @@ r3d_src_common(struct tu_cmd_buffer *cmd,
       A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
       0x60000; /* XXX used by blob, doesn't seem necessary */
    texture.map[A6XX_TEX_CONST_DWORDS + 1] =
-      0x1 | /* XXX used by blob, doesn't seem necessary */
       A6XX_TEX_SAMP_1_UNNORM_COORDS |
       A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
    texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
@@ -854,7 +890,7 @@ r3d_src_common(struct tu_cmd_buffer *cmd,
 static void
 r3d_src(struct tu_cmd_buffer *cmd,
         struct tu_cs *cs,
-        const struct tu_image_view *iview,
+        const struct fdl6_view *iview,
         uint32_t layer,
         VkFilter filter)
 {
@@ -867,23 +903,23 @@ r3d_src(struct tu_cmd_buffer *cmd,
 static void
 r3d_src_buffer(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
-               VkFormat vk_format,
+               enum pipe_format format,
                uint64_t va, uint32_t pitch,
                uint32_t width, uint32_t height)
 {
    uint32_t desc[A6XX_TEX_CONST_DWORDS];
 
-   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
+   struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
 
    desc[0] =
-      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
-      A6XX_TEX_CONST_0_FMT(format.fmt) |
-      A6XX_TEX_CONST_0_SWAP(format.swap) |
+      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
+      A6XX_TEX_CONST_0_FMT(fmt.fmt) |
+      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
       // XXX to swizzle into .w for stencil buffer_to_image
-      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
-      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
-      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
+      A6XX_TEX_CONST_0_SWIZ_Y(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
+      A6XX_TEX_CONST_0_SWIZ_Z(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
+      A6XX_TEX_CONST_0_SWIZ_W(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
    desc[2] =
       A6XX_TEX_CONST_2_PITCH(pitch) |
@@ -901,16 +937,22 @@ static void
 r3d_src_gmem(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
-             VkFormat format,
+             enum pipe_format format,
              uint32_t gmem_offset,
              uint32_t cpp)
 {
    uint32_t desc[A6XX_TEX_CONST_DWORDS];
-   memcpy(desc, iview->descriptor, sizeof(desc));
-
-   /* patch the format so that depth/stencil get the right format */
-   desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
-   desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt);
+   memcpy(desc, iview->view.descriptor, sizeof(desc));
+
+   /* patch the format so that depth/stencil get the right format and swizzle */
+   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
+                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
+                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
+   desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt) |
+               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
+               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
+               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
+               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
 
    /* patched for gmem */
    desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
@@ -928,7 +970,7 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
 }
 
 static void
-r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
+r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer)
 {
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
    tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
@@ -938,10 +980,29 @@ r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
    tu_cs_image_flag_ref(cs, iview, layer);
 
+   /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
+    * FMT6_NV12_Y.
+    */
+   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = iview->RB_MRT_BUF_INFO & 0xff));
+
    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
 }
 
 static void
+r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
+{
+   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
+   tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
+   tu_cs_image_depth_ref(cs, iview, layer);
+   tu_cs_emit(cs, 0);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
+   tu_cs_image_flag_ref(cs, &iview->view, layer);
+
+   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
+}
+
+static void
 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
 {
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
@@ -953,12 +1014,12 @@ r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t la
 }
 
 static void
-r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
+r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch)
 {
-   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
+   struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
 
    tu_cs_emit_regs(cs,
-                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
+                   A6XX_RB_MRT_BUF_INFO(0, .color_format = fmt.fmt, .color_swap = fmt.swap),
                    A6XX_RB_MRT_PITCH(0, pitch),
                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                    A6XX_RB_MRT_BASE(0, .qword = va),
@@ -968,14 +1029,14 @@ r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch
 }
 
 static uint8_t
-aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
+aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
 {
    uint8_t mask = 0xf;
    assert(aspect_mask);
    /* note: the only format with partial writing is D24S8,
     * clear/blit uses the _AS_R8G8B8A8 format to access it
     */
-   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
+   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
          mask = 0x7;
       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
@@ -987,18 +1048,18 @@ aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
 static void
 r3d_setup(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
-          VkFormat vk_format,
+          enum pipe_format format,
           VkImageAspectFlags aspect_mask,
           unsigned blit_param,
           bool clear,
           bool ubwc,
           VkSampleCountFlagBits samples)
 {
-   enum a6xx_format format = tu6_base_format(vk_format);
+   enum a6xx_format fmt = tu6_base_format(format);
 
-   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
-        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
-      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
+   if ((format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+        format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
+      fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
    }
 
    if (!cmd->state.pass) {
@@ -1036,14 +1097,14 @@ r3d_setup(struct tu_cmd_buffer *cmd,
    tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
 
    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
-                        .color_format = format,
-                        .color_sint = vk_format_is_sint(vk_format),
-                        .color_uint = vk_format_is_uint(vk_format)));
+                        .color_format = fmt,
+                        .color_sint = util_format_is_pure_sint(format),
+                        .color_uint = util_format_is_pure_uint(format)));
 
    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
-      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
-   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
-   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
+      .component_enable = aspect_write_mask(format, aspect_mask)));
+   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(format)));
+   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(format)));
 
    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
@@ -1084,22 +1145,24 @@ struct blit_ops {
                   const VkOffset2D *dst,
                   const VkOffset2D *src,
                   const VkExtent2D *extent);
-   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
+   void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val);
    void (*src)(
         struct tu_cmd_buffer *cmd,
         struct tu_cs *cs,
-        const struct tu_image_view *iview,
+        const struct fdl6_view *iview,
         uint32_t layer,
         VkFilter filter);
    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      VkFormat vk_format,
+                      enum pipe_format format,
                       uint64_t va, uint32_t pitch,
                       uint32_t width, uint32_t height);
-   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
-   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
+   void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
+   void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
+   void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
+   void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch);
    void (*setup)(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
-                 VkFormat vk_format,
+                 enum pipe_format format,
                  VkImageAspectFlags aspect_mask,
                  unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
                  bool clear,
@@ -1116,6 +1179,8 @@ static const struct blit_ops r2d_ops = {
    .src = r2d_src,
    .src_buffer = r2d_src_buffer,
    .dst = r2d_dst,
+   .dst_depth = r2d_dst_depth,
+   .dst_stencil = r2d_dst_stencil,
    .dst_buffer = r2d_dst_buffer,
    .setup = r2d_setup,
    .run = r2d_run,
@@ -1128,6 +1193,8 @@ static const struct blit_ops r3d_ops = {
    .src = r3d_src,
    .src_buffer = r3d_src_buffer,
    .dst = r3d_dst,
+   .dst_depth = r3d_dst_depth,
+   .dst_stencil = r3d_dst_stencil,
    .dst_buffer = r3d_dst_buffer,
    .setup = r3d_setup,
    .run = r3d_run,
@@ -1150,76 +1217,53 @@ coords(const struct blit_ops *ops,
  * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for
  * everything.
  */
-static VkFormat
-copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
-{
-   if (vk_format_is_compressed(format)) {
-      switch (vk_format_get_blocksize(format)) {
-      case 1: return VK_FORMAT_R8_UINT;
-      case 2: return VK_FORMAT_R16_UINT;
-      case 4: return VK_FORMAT_R32_UINT;
-      case 8: return VK_FORMAT_R32G32_UINT;
-      case 16:return VK_FORMAT_R32G32B32A32_UINT;
+static enum pipe_format
+copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask, bool copy_buffer)
+{
+   if (vk_format_is_compressed(vk_format)) {
+      switch (vk_format_get_blocksize(vk_format)) {
+      case 1: return PIPE_FORMAT_R8_UINT;
+      case 2: return PIPE_FORMAT_R16_UINT;
+      case 4: return PIPE_FORMAT_R32_UINT;
+      case 8: return PIPE_FORMAT_R32G32_UINT;
+      case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
       default:
          unreachable("unhandled format size");
       }
    }
 
-   switch (format) {
+   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
+
    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
     * (also -1.0), when we're supposed to be memcpying the bits. See
     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
     */
-   case VK_FORMAT_R8_SNORM:
-      return VK_FORMAT_R8_UNORM;
-   case VK_FORMAT_R8G8_SNORM:
-      return VK_FORMAT_R8G8_UNORM;
-   case VK_FORMAT_R8G8B8_SNORM:
-      return VK_FORMAT_R8G8B8_UNORM;
-   case VK_FORMAT_B8G8R8_SNORM:
-      return VK_FORMAT_B8G8R8_UNORM;
-   case VK_FORMAT_R8G8B8A8_SNORM:
-      return VK_FORMAT_R8G8B8A8_UNORM;
-   case VK_FORMAT_B8G8R8A8_SNORM:
-      return VK_FORMAT_B8G8R8A8_UNORM;
-   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-      return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
-   case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
-      return VK_FORMAT_A2R10G10B10_UNORM_PACK32;
-   case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
-      return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
-   case VK_FORMAT_R16_SNORM:
-      return VK_FORMAT_R16_UNORM;
-   case VK_FORMAT_R16G16_SNORM:
-      return VK_FORMAT_R16G16_UNORM;
-   case VK_FORMAT_R16G16B16_SNORM:
-      return VK_FORMAT_R16G16B16_UNORM;
-   case VK_FORMAT_R16G16B16A16_SNORM:
-      return VK_FORMAT_R16G16B16A16_UNORM;
-
-   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-      return VK_FORMAT_R32_UINT;
-
-   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
+   format = util_format_snorm_to_unorm(format);
+
+   switch (format) {
+   case PIPE_FORMAT_R9G9B9E5_FLOAT:
+      return PIPE_FORMAT_R32_UINT;
+
+   case PIPE_FORMAT_G8_B8R8_420_UNORM:
       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
-         return VK_FORMAT_R8G8_UNORM;
+         return PIPE_FORMAT_R8G8_UNORM;
       else
-         return VK_FORMAT_R8_UNORM;
-   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
-      return VK_FORMAT_R8_UNORM;
+         return PIPE_FORMAT_Y8_UNORM;
+   case PIPE_FORMAT_G8_B8_R8_420_UNORM:
+      return PIPE_FORMAT_R8_UNORM;
 
-   case VK_FORMAT_D24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
-         return VK_FORMAT_R8_UNORM;
+         return PIPE_FORMAT_R8_UNORM;
       else
          return format;
 
-   case VK_FORMAT_D32_SFLOAT_S8_UINT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
-         return VK_FORMAT_S8_UINT;
+         return PIPE_FORMAT_S8_UINT;
       assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
-      return VK_FORMAT_D32_SFLOAT;
+      return PIPE_FORMAT_Z32_FLOAT;
 
    default:
       return format;
@@ -1234,11 +1278,11 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
 {
    const struct blit_ops *ops = &r2d_ops;
 
-   ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
+   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
               VK_SAMPLE_COUNT_1_BIT);
-   ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
-   ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
-                   image->bo->iova + image->bo_offset + image->lrz_offset,
+   ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value);
+   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
+                   image->iova + image->lrz_offset,
                    image->lrz_pitch * 2);
    ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
    ops->run(cmd, cs);
@@ -1246,9 +1290,9 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
 }
 
 static void
-tu_image_view_copy_blit(struct tu_image_view *iview,
+tu_image_view_copy_blit(struct fdl6_view *iview,
                         struct tu_image *image,
-                        VkFormat format,
+                        enum pipe_format format,
                         const VkImageSubresourceLayers *subres,
                         uint32_t layer,
                         bool stencil_read,
@@ -1257,53 +1301,58 @@ tu_image_view_copy_blit(struct tu_image_view *iview,
    VkImageAspectFlags aspect_mask = subres->aspectMask;
 
    /* always use the AS_R8G8B8A8 format for these */
-   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
-       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
+   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+       format == PIPE_FORMAT_Z24X8_UNORM) {
       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
    }
 
-   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
-      .image = tu_image_to_handle(image),
-      .viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D,
-      .format = format,
-      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
-      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
-      .subresourceRange = {
-         .aspectMask = aspect_mask,
-         .baseMipLevel = subres->mipLevel,
-         .levelCount = 1,
-         .baseArrayLayer = subres->baseArrayLayer + layer,
-         .layerCount = 1,
+   const struct fdl_layout *layout =
+      &image->layout[tu6_plane_index(image->vk_format, aspect_mask)];
+
+   fdl6_view_init(iview, &layout, &(struct fdl_view_args) {
+      .iova = image->iova,
+      .base_array_layer = subres->baseArrayLayer + layer,
+      .layer_count = 1,
+      .base_miplevel = subres->mipLevel,
+      .level_count = 1,
+      .format = tu_format_for_aspect(format, aspect_mask),
+      .swiz = {
+         /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
+         stencil_read ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_X,
+         PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
       },
+      .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
    }, false);
 }
 
 static void
-tu_image_view_copy(struct tu_image_view *iview,
+tu_image_view_copy(struct fdl6_view *iview,
                    struct tu_image *image,
-                   VkFormat format,
+                   enum pipe_format format,
                    const VkImageSubresourceLayers *subres,
                    uint32_t layer,
                    bool stencil_read)
 {
-   format = copy_format(format, subres->aspectMask, false);
    tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false);
 }
 
 static void
-tu_image_view_blit(struct tu_image_view *iview,
+tu_image_view_blit(struct fdl6_view *iview,
                    struct tu_image *image,
                    const VkImageSubresourceLayers *subres,
                    uint32_t layer)
 {
-   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false);
+   enum pipe_format format =
+      tu6_plane_format(image->vk_format, tu6_plane_index(image->vk_format,
+                                                         subres->aspectMask));
+   tu_image_view_copy_blit(iview, image, format, subres, layer, false, false);
 }
 
 static void
 tu6_blit_image(struct tu_cmd_buffer *cmd,
                struct tu_image *src_image,
                struct tu_image *dst_image,
-               const VkImageBlit *info,
+               const VkImageBlit2KHR *info,
                VkFilter filter)
 {
    const struct blit_ops *ops = &r2d_ops;
@@ -1375,7 +1424,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
 
    trace_start_blit(&cmd->trace, cs);
 
-   ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
+   ops->setup(cmd, cs, tu_vk_format_to_pipe_format(format), info->dstSubresource.aspectMask,
               blit_param, false, dst_image->layout[0].ubwc,
               dst_image->layout[0].nr_samples);
 
@@ -1399,12 +1448,16 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
    }
 
-   struct tu_image_view dst, src;
+   struct fdl6_view dst, src;
    tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
                       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
 
    if (z_scale) {
-      tu_image_view_copy_blit(&src, src_image, src_image->vk_format,
+      enum pipe_format src_format =
+         tu6_plane_format(src_image->vk_format,
+                          tu6_plane_index(src_image->vk_format,
+                                          info->srcSubresource.aspectMask));
+      tu_image_view_copy_blit(&src, src_image, src_format,
                               &info->srcSubresource, 0, false, true);
       ops->src(cmd, cs, &src, 0, filter);
    } else {
@@ -1432,35 +1485,30 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdBlitImage(VkCommandBuffer commandBuffer,
-                VkImage srcImage,
-                VkImageLayout srcImageLayout,
-                VkImage dstImage,
-                VkImageLayout dstImageLayout,
-                uint32_t regionCount,
-                const VkImageBlit *pRegions,
-                VkFilter filter)
+tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
+                    const VkBlitImageInfo2KHR* pBlitImageInfo)
 
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_image, src_image, srcImage);
-   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
+   TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
+   TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
 
-   for (uint32_t i = 0; i < regionCount; ++i) {
+   for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
       /* can't blit both depth and stencil at once with D32_S8
        * TODO: more advanced 3D blit path to support it instead?
        */
       if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
           dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-         VkImageBlit region = pRegions[i];
-         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
+         VkImageBlit2KHR region = pBlitImageInfo->pRegions[i];
+         u_foreach_bit(b, region.dstSubresource.aspectMask) {
             region.srcSubresource.aspectMask = BIT(b);
             region.dstSubresource.aspectMask = BIT(b);
-            tu6_blit_image(cmd, src_image, dst_image, &region, filter);
+            tu6_blit_image(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
          }
          continue;
       }
-      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
+      tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
+                     pBlitImageInfo->filter);
    }
 }
 
@@ -1494,12 +1542,14 @@ static void
 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                         struct tu_buffer *src_buffer,
                         struct tu_image *dst_image,
-                        const VkBufferImageCopy *info)
+                        const VkBufferImageCopy2KHR *info)
 {
    struct tu_cs *cs = &cmd->cs;
    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
-   VkFormat src_format =
+   enum pipe_format src_format =
       copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
+   enum pipe_format dst_format =
+      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false);
    const struct blit_ops *ops = &r2d_ops;
 
    /* special case for buffer to stencil */
@@ -1508,9 +1558,9 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
       ops = &r3d_ops;
    }
 
-   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
-    * which matters for UBWC. buffer_to_image/etc can fail because of this
-    */
+   /* note: could use "R8_UNORM" when no UBWC */
+   if (src_format == PIPE_FORMAT_Y8_UNORM)
+      ops = &r3d_ops;
 
    VkOffset3D offset = info->imageOffset;
    VkExtent3D extent = info->imageExtent;
@@ -1519,24 +1569,23 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
 
    copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
 
-   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
+   uint32_t pitch = src_width * util_format_get_blocksize(src_format);
    uint32_t layer_size = src_height * pitch;
 
-   ops->setup(cmd, cs,
-              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
+   ops->setup(cmd, cs, dst_format,
               info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
               dst_image->layout[0].nr_samples);
 
-   struct tu_image_view dst;
-   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
+   struct fdl6_view dst;
+   tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
 
    for (uint32_t i = 0; i < layers; i++) {
       ops->dst(cs, &dst, i);
 
-      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
+      uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
       if ((src_va & 63) || (pitch & 63)) {
          for (uint32_t y = 0; y < extent.height; y++) {
-            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
+            uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                             x + extent.width, 1);
             ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y},  &(VkOffset2D){x},
@@ -1555,39 +1604,43 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
-                        VkBuffer srcBuffer,
-                        VkImage dstImage,
-                        VkImageLayout dstImageLayout,
-                        uint32_t regionCount,
-                        const VkBufferImageCopy *pRegions)
+tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
+                            const VkCopyBufferToImageInfo2KHR *pCopyBufferToImageInfo)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
-   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
+   TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
+   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
 
-   for (unsigned i = 0; i < regionCount; ++i)
-      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
+   for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
+      tu_copy_buffer_to_image(cmd, src_buffer, dst_image,
+                              pCopyBufferToImageInfo->pRegions + i);
 }
 
 static void
 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                         struct tu_image *src_image,
                         struct tu_buffer *dst_buffer,
-                        const VkBufferImageCopy *info)
+                        const VkBufferImageCopy2KHR *info)
 {
    struct tu_cs *cs = &cmd->cs;
    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
-   VkFormat dst_format =
+   enum pipe_format dst_format =
       copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
+   enum pipe_format src_format =
+      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, false);
+   const struct blit_ops *ops = &r2d_ops;
    bool stencil_read = false;
 
    if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+      ops = &r3d_ops;
       stencil_read = true;
    }
 
-   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
+   /* note: could use "R8_UNORM" when no UBWC */
+   if (dst_format == PIPE_FORMAT_Y8_UNORM)
+      ops = &r3d_ops;
+
    VkOffset3D offset = info->imageOffset;
    VkExtent3D extent = info->imageExtent;
    uint32_t dst_width = info->bufferRowLength ?: extent.width;
@@ -1595,22 +1648,22 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
 
    copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
 
-   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
+   uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
    uint32_t layer_size = pitch * dst_height;
 
    ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
               VK_SAMPLE_COUNT_1_BIT);
 
-   struct tu_image_view src;
-   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
+   struct fdl6_view src;
+   tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
 
    for (uint32_t i = 0; i < layers; i++) {
       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
 
-      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
+      uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
       if ((dst_va & 63) || (pitch & 63)) {
          for (uint32_t y = 0; y < extent.height; y++) {
-            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
+            uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
             ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
                         &(VkExtent2D) {extent.width, 1});
@@ -1628,19 +1681,16 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
-                        VkImage srcImage,
-                        VkImageLayout srcImageLayout,
-                        VkBuffer dstBuffer,
-                        uint32_t regionCount,
-                        const VkBufferImageCopy *pRegions)
+tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
+                            const VkCopyImageToBufferInfo2KHR* pCopyImageToBufferInfo)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_image, src_image, srcImage);
-   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
+   TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
+   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
 
-   for (unsigned i = 0; i < regionCount; ++i)
-      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
+   for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
+      tu_copy_image_to_buffer(cmd, src_image, dst_buffer,
+                              pCopyImageToBufferInfo->pRegions + i);
 }
 
 /* Tiled formats don't support swapping, which means that we can't support
@@ -1654,7 +1704,7 @@ tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
  */
 
 static bool
-is_swapped_format(VkFormat format)
+is_swapped_format(enum pipe_format format)
 {
    struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
    struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
@@ -1676,7 +1726,7 @@ static void
 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_image *dst_image,
-                       const VkImageCopy *info)
+                       const VkImageCopy2KHR *info)
 {
    const struct blit_ops *ops = &r2d_ops;
    struct tu_cs *cs = &cmd->cs;
@@ -1684,7 +1734,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
    if (dst_image->layout[0].nr_samples > 1)
       ops = &r3d_ops;
 
-   VkFormat format = VK_FORMAT_UNDEFINED;
+   enum pipe_format format = PIPE_FORMAT_NONE;
    VkOffset3D src_offset = info->srcOffset;
    VkOffset3D dst_offset = info->dstOffset;
    VkExtent3D extent = info->extent;
@@ -1709,8 +1759,13 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
    copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
    copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
 
-   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
-   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
+   enum pipe_format dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
+   enum pipe_format src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
+
+   /* note: could use "R8_UNORM" when no UBWC */
+   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
+       src_format == PIPE_FORMAT_Y8_UNORM)
+      ops = &r3d_ops;
 
    bool use_staging_blit = false;
 
@@ -1748,54 +1803,50 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
       use_staging_blit = true;
    }
 
-   struct tu_image_view dst, src;
+   struct fdl6_view dst, src;
 
    if (use_staging_blit) {
       tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
       tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
 
-      struct tu_image staging_image = {
-         .base.type = VK_OBJECT_TYPE_IMAGE,
-         .vk_format = src_format,
-         .level_count = 1,
-         .layer_count = info->srcSubresource.layerCount,
-         .bo_offset = 0,
-      };
-
-      VkImageSubresourceLayers staging_subresource = {
-         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
-         .mipLevel = 0,
-         .baseArrayLayer = 0,
-         .layerCount = info->srcSubresource.layerCount,
-      };
-
+      struct fdl_layout staging_layout = { 0 };
       VkOffset3D staging_offset = { 0 };
 
-      staging_image.layout[0].tile_mode = TILE6_LINEAR;
-      staging_image.layout[0].ubwc = false;
+      staging_layout.tile_mode = TILE6_LINEAR;
+      staging_layout.ubwc = false;
 
-      fdl6_layout(&staging_image.layout[0],
-                  vk_format_to_pipe_format(staging_image.vk_format),
+      fdl6_layout(&staging_layout,
+                  src_format,
                   src_image->layout[0].nr_samples,
                   extent.width,
                   extent.height,
                   extent.depth,
-                  staging_image.level_count,
-                  staging_image.layer_count,
+                  1,
+                  info->srcSubresource.layerCount,
                   extent.depth > 1,
                   NULL);
 
+      struct tu_bo *staging_bo;
       VkResult result = tu_get_scratch_bo(cmd->device,
-                                          staging_image.layout[0].size,
-                                          &staging_image.bo);
+                                          staging_layout.size,
+                                          &staging_bo);
       if (result != VK_SUCCESS) {
          cmd->record_result = result;
          return;
       }
 
-      struct tu_image_view staging;
-      tu_image_view_copy(&staging, &staging_image, src_format,
-                         &staging_subresource, 0, false);
+      struct fdl6_view staging;
+      const struct fdl_layout *staging_layout_ptr = &staging_layout;
+      fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
+         .iova = staging_bo->iova,
+         .base_array_layer = 0,
+         .layer_count = 1,
+         .base_miplevel = 0,
+         .level_count = info->srcSubresource.layerCount,
+         .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
+         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
+         .type = FDL_VIEW_TYPE_2D,
+      }, false);
 
       ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
                  dst_image->layout[0].nr_samples);
@@ -1814,8 +1865,16 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
       tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
       tu_cs_emit_wfi(cs);
 
-      tu_image_view_copy(&staging, &staging_image, dst_format,
-                         &staging_subresource, 0, false);
+      fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
+         .iova = staging_bo->iova,
+         .base_array_layer = 0,
+         .layer_count = 1,
+         .base_miplevel = 0,
+         .level_count = info->srcSubresource.layerCount,
+         .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
+         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
+         .type = FDL_VIEW_TYPE_2D,
+      }, false);
 
       ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
                  0, false, dst_image->layout[0].ubwc,
@@ -1847,22 +1906,17 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdCopyImage(VkCommandBuffer commandBuffer,
-                VkImage srcImage,
-                VkImageLayout srcImageLayout,
-                VkImage destImage,
-                VkImageLayout destImageLayout,
-                uint32_t regionCount,
-                const VkImageCopy *pRegions)
+tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
+                    const VkCopyImageInfo2KHR* pCopyImageInfo)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_image, src_image, srcImage);
-   TU_FROM_HANDLE(tu_image, dst_image, destImage);
+   TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
+   TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
 
-   for (uint32_t i = 0; i < regionCount; ++i) {
+   for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
       if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-         VkImageCopy info = pRegions[i];
-         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
+         VkImageCopy2KHR info = pCopyImageInfo->pRegions[i];
+         u_foreach_bit(b, info.dstSubresource.aspectMask) {
             info.srcSubresource.aspectMask = BIT(b);
             info.dstSubresource.aspectMask = BIT(b);
             tu_copy_image_to_image(cmd, src_image, dst_image, &info);
@@ -1870,7 +1924,8 @@ tu_CmdCopyImage(VkCommandBuffer commandBuffer,
          continue;
       }
 
-      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
+      tu_copy_image_to_image(cmd, src_image, dst_image,
+                             pCopyImageInfo->pRegions + i);
    }
 }
 
@@ -1883,7 +1938,7 @@ copy_buffer(struct tu_cmd_buffer *cmd,
 {
    const struct blit_ops *ops = &r2d_ops;
    struct tu_cs *cs = &cmd->cs;
-   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
+   enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
    uint64_t blocks = size / block_size;
 
    ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
@@ -1908,21 +1963,19 @@ copy_buffer(struct tu_cmd_buffer *cmd,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
-                 VkBuffer srcBuffer,
-                 VkBuffer dstBuffer,
-                 uint32_t regionCount,
-                 const VkBufferCopy *pRegions)
+tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
+                     const VkCopyBufferInfo2KHR *pCopyBufferInfo)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
-   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
+   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
+   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
 
-   for (unsigned i = 0; i < regionCount; ++i) {
+   for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
+      const VkBufferCopy2KHR *region = &pCopyBufferInfo->pRegions[i];
       copy_buffer(cmd,
-                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
-                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
-                  pRegions[i].size, 1);
+                  dst_buffer->iova + region->dstOffset,
+                  src_buffer->iova + region->srcOffset,
+                  region->size, 1);
    }
 }
 
@@ -1944,7 +1997,7 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
    }
 
    memcpy(tmp.map, pData, dataSize);
-   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
+   copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -1962,18 +2015,18 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
    if (fillSize == VK_WHOLE_SIZE)
       fillSize = buffer->size - dstOffset;
 
-   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
+   uint64_t dst_va = buffer->iova + dstOffset;
    uint32_t blocks = fillSize / 4;
 
-   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
+   ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
               VK_SAMPLE_COUNT_1_BIT);
-   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
+   ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
 
    while (blocks) {
       uint32_t dst_x = (dst_va & 63) / 4;
       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
 
-      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
+      ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0);
       ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
       ops->run(cmd, cs);
 
@@ -1985,25 +2038,21 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
 }
 
 VKAPI_ATTR void VKAPI_CALL
-tu_CmdResolveImage(VkCommandBuffer commandBuffer,
-                   VkImage srcImage,
-                   VkImageLayout srcImageLayout,
-                   VkImage dstImage,
-                   VkImageLayout dstImageLayout,
-                   uint32_t regionCount,
-                   const VkImageResolve *pRegions)
+tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
+                       const VkResolveImageInfo2KHR* pResolveImageInfo)
 {
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-   TU_FROM_HANDLE(tu_image, src_image, srcImage);
-   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
+   TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
+   TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
    const struct blit_ops *ops = &r2d_ops;
    struct tu_cs *cs = &cmd->cs;
 
-   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
-              0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT);
+   ops->setup(cmd, cs, tu_vk_format_to_pipe_format(dst_image->vk_format),
+              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc, 
+              VK_SAMPLE_COUNT_1_BIT);
 
-   for (uint32_t i = 0; i < regionCount; ++i) {
-      const VkImageResolve *info = &pRegions[i];
+   for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
+      const VkImageResolve2KHR *info = &pResolveImageInfo->pRegions[i];
       uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
 
       assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
@@ -2011,7 +2060,7 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer,
 
       coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
 
-      struct tu_image_view dst, src;
+      struct fdl6_view dst, src;
       tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
       tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
 
@@ -2040,23 +2089,29 @@ resolve_sysmem(struct tu_cmd_buffer *cmd,
                uint32_t layer_mask,
                uint32_t layers,
                const VkRect2D *rect,
-               bool separate_stencil)
+               bool separate_ds)
 {
    const struct blit_ops *ops = &r2d_ops;
 
    trace_start_sysmem_resolve(&cmd->trace, cs);
 
-   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
-              0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
+   ops->setup(cmd, cs, tu_vk_format_to_pipe_format(format),
+              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
+              VK_SAMPLE_COUNT_1_BIT);
    ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
 
    for_each_layer(i, layer_mask, layers) {
-      if (separate_stencil) {
-         r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
-         r2d_dst_stencil(cs, dst, i);
+      if (separate_ds) {
+         if (format == VK_FORMAT_D32_SFLOAT) {
+            r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST);
+            ops->dst_depth(cs, dst, i);
+         } else {
+            r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
+            ops->dst_stencil(cs, dst, i);
+         }
       } else {
-         ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
-         ops->dst(cs, dst, i);
+         ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST);
+         ops->dst(cs, &dst->view, i);
       }
       ops->run(cmd, cs);
    }
@@ -2079,7 +2134,7 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
 
    if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,
-                     src, dst, layer_mask, layers, rect, false);
+                     src, dst, layer_mask, layers, rect, true);
       resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,
                      src, dst, layer_mask, layers, rect, true);
    } else {
@@ -2098,9 +2153,14 @@ clear_image(struct tu_cmd_buffer *cmd,
    uint32_t level_count = tu_get_levelCount(image, range);
    uint32_t layer_count = tu_get_layerCount(image, range);
    struct tu_cs *cs = &cmd->cs;
-   VkFormat format = image->vk_format;
-   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
-      format = copy_format(format, aspect_mask, false);
+   enum pipe_format format;
+   if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
+      format = PIPE_FORMAT_R32_UINT;
+   } else {
+      format = tu6_plane_format(image->vk_format,
+                                tu6_plane_index(image->vk_format,
+                                                aspect_mask));
+   }
 
    if (image->layout[0].depth0 > 1) {
       assert(layer_count == 1);
@@ -2112,7 +2172,7 @@ clear_image(struct tu_cmd_buffer *cmd,
    ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc,
               image->layout[0].nr_samples);
    if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
-      ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
+      ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
    else
       ops->clear_value(cs, format, clear_value);
 
@@ -2125,7 +2185,7 @@ clear_image(struct tu_cmd_buffer *cmd,
                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
                   });
 
-      struct tu_image_view dst;
+      struct fdl6_view dst;
       tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
          .aspectMask = aspect_mask,
          .mipLevel = range->baseMipLevel + j,
@@ -2338,21 +2398,21 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
 }
 
 static void
-pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
+pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
 {
    switch (format) {
-   case VK_FORMAT_X8_D24_UNORM_PACK32:
-   case VK_FORMAT_D24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                        val->depthStencil.stencil << 24;
       return;
-   case VK_FORMAT_D16_UNORM:
+   case PIPE_FORMAT_Z16_UNORM:
       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
       return;
-   case VK_FORMAT_D32_SFLOAT:
+   case PIPE_FORMAT_Z32_FLOAT:
       clear_value[0] = fui(val->depthStencil.depth);
       return;
-   case VK_FORMAT_S8_UINT:
+   case PIPE_FORMAT_S8_UINT:
       clear_value[0] = val->depthStencil.stencil;
       return;
    default:
@@ -2361,33 +2421,33 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v
 
    float tmp[4];
    memcpy(tmp, val->color.float32, 4 * sizeof(float));
-   if (vk_format_is_srgb(format)) {
+   if (util_format_is_srgb(format)) {
       for (int i = 0; i < 3; i++)
          tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
    }
 
 #define PACK_F(type) util_format_##type##_pack_rgba_float \
    ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
-   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
+   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
    case 4:
       PACK_F(r4g4b4a4_unorm);
       break;
    case 5:
-      if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
+      if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
          PACK_F(r5g6b5_unorm);
       else
          PACK_F(r5g5b5a1_unorm);
       break;
    case 8:
-      if (vk_format_is_snorm(format))
+      if (util_format_is_snorm(format))
          PACK_F(r8g8b8a8_snorm);
-      else if (vk_format_is_unorm(format))
+      else if (util_format_is_unorm(format))
          PACK_F(r8g8b8a8_unorm);
       else
          pack_int8(clear_value, val->color.uint32);
       break;
    case 10:
-      if (vk_format_is_int(format))
+      if (util_format_is_pure_integer(format))
          pack_int10_2(clear_value, val->color.uint32);
       else
          PACK_F(r10g10b10a2_unorm);
@@ -2396,11 +2456,11 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v
       clear_value[0] = float3_to_r11g11b10f(val->color.float32);
       break;
    case 16:
-      if (vk_format_is_snorm(format))
+      if (util_format_is_snorm(format))
          PACK_F(r16g16b16a16_snorm);
-      else if (vk_format_is_unorm(format))
+      else if (util_format_is_unorm(format))
          PACK_F(r16g16b16a16_unorm);
-      else if (vk_format_is_float(format))
+      else if (util_format_is_float(format))
          PACK_F(r16g16b16a16_float);
       else
          pack_int16(clear_value, val->color.uint32);
@@ -2417,7 +2477,7 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v
 static void
 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
-                      VkFormat format,
+                      enum pipe_format format,
                       uint8_t clear_mask,
                       uint32_t gmem_offset,
                       const VkClearValue *value)
@@ -2454,15 +2514,16 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
 
    trace_start_gmem_clear(&cmd->trace, cs);
 
+   enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
-         clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
+         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, att->gmem_offset, value);
       if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
-         clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
+         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
       return;
    }
 
-   clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
+   clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), att->gmem_offset, value);
 
    trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
 }
@@ -2554,12 +2615,13 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
 static void
 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
-                        VkFormat format,
+                        VkFormat vk_format,
                         VkImageAspectFlags clear_mask,
                         const VkRenderPassBeginInfo *info,
                         uint32_t a,
-                        bool separate_stencil)
+                        bool separate_ds)
 {
+   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    const struct tu_image_view *iview = cmd->state.attachments[a];
    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
@@ -2569,19 +2631,20 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
 
    trace_start_sysmem_clear(&cmd->trace, cs);
 
-   ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled,
+   ops->setup(cmd, cs, format, clear_mask, 0, true, iview->view.ubwc_enabled,
               cmd->state.pass->attachments[a].samples);
    ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
    ops->clear_value(cs, format, &info->pClearValues[a]);
 
    for_each_layer(i, clear_views, fb->layers) {
-      if (separate_stencil) {
-         if (ops == &r3d_ops)
-            r3d_dst_stencil(cs, iview, i);
-         else
-            r2d_dst_stencil(cs, iview, i);
+      if (separate_ds) {
+         if (vk_format == VK_FORMAT_D32_SFLOAT) {
+            ops->dst_depth(cs, iview, i);
+         } else {
+            ops->dst_stencil(cs, iview, i);
+         }
       } else {
-         ops->dst(cs, iview, i);
+         ops->dst(cs, &iview->view, i);
       }
       ops->run(cmd, cs);
    }
@@ -2589,7 +2652,7 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
    ops->teardown(cmd, cs);
 
    trace_end_sysmem_clear(&cmd->trace, cs,
-                          format, ops == &r3d_ops,
+                          vk_format, ops == &r3d_ops,
                           cmd->state.pass->attachments[a].samples);
 }
 
@@ -2608,7 +2671,7 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
          clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
-                                 info, a, false);
+                                 info, a, true);
       }
       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
          clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
@@ -2630,6 +2693,7 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
     */
    if (vk_format_is_depth_or_stencil(attachment->format)) {
       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
       tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
    } else {
       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
@@ -2672,23 +2736,35 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
       .unk0 = !resolve,
       .gmem = !resolve,
-      .sample_0 = vk_format_is_int(attachment->format) |
+      .sample_0 = vk_format_is_int(attachment->format) ||
          vk_format_is_depth_or_stencil(attachment->format)));
 
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
-   if (separate_stencil) {
-      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
-      tu_cs_emit_qw(cs, iview->stencil_base_addr);
-      tu_cs_emit(cs, iview->stencil_PITCH);
+   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (!separate_stencil) {
+         tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
+         tu_cs_emit_qw(cs, iview->depth_base_addr);
+         tu_cs_emit(cs, iview->depth_PITCH);
 
-      tu_cs_emit_regs(cs,
-                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
+         tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
+         tu_cs_image_flag_ref(cs, &iview->view, 0);
+
+         tu_cs_emit_regs(cs,
+                        A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
+      } else {
+         tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
+         tu_cs_emit_qw(cs, iview->stencil_base_addr);
+         tu_cs_emit(cs, iview->stencil_PITCH);
+
+         tu_cs_emit_regs(cs,
+                        A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
+      }
    } else {
-      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
-      tu_cs_image_ref_2d(cs, iview, 0, false);
+      tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
+      tu_cs_image_ref_2d(cs, &iview->view, 0, false);
 
       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
-      tu_cs_image_flag_ref(cs, iview, 0);
+      tu_cs_image_flag_ref(cs, &iview->view, 0);
 
       tu_cs_emit_regs(cs,
                       A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
@@ -2759,25 +2835,31 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
               const struct tu_image_view *iview,
               uint32_t samples,
               bool separate_stencil,
-              VkFormat format,
+              enum pipe_format format,
               uint32_t gmem_offset,
               uint32_t cpp)
 {
    r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
-                    iview->ubwc_enabled, true);
-   if (separate_stencil)
-      r2d_dst_stencil(cs, iview, 0);
-   else
-      r2d_dst(cs, iview, 0);
+                    iview->view.ubwc_enabled, true);
+
+   if (iview->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (!separate_stencil) {
+         r2d_dst_depth(cs, iview, 0);
+      } else {
+         r2d_dst_stencil(cs, iview, 0);
+      }
+   } else {
+      r2d_dst(cs, &iview->view, 0);
+   }
 
    tu_cs_emit_regs(cs,
                    A6XX_SP_PS_2D_SRC_INFO(
                       .color_format = tu6_format_texture(format, TILE6_2).fmt,
                       .tile_mode = TILE6_2,
-                      .srgb = vk_format_is_srgb(format),
+                      .srgb = util_format_is_srgb(format),
                       .samples = tu_msaa_samples(samples),
-                      .samples_average = !vk_format_is_int(format) &&
-                                         !vk_format_is_depth_or_stencil(format),
+                      .samples_average = !util_format_is_pure_integer(format) &&
+                                         !util_format_is_depth_or_stencil(format),
                       .unk20 = 1,
                       .unk22 = 1),
                    /* note: src size does not matter when not scaling */
@@ -2807,26 +2889,45 @@ store_3d_blit(struct tu_cmd_buffer *cmd,
               const struct tu_image_view *iview,
               uint32_t dst_samples,
               bool separate_stencil,
-              VkFormat format,
+              enum pipe_format format,
               const VkRect2D *render_area,
               uint32_t gmem_offset,
               uint32_t cpp)
 {
+   /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
+    * aren't set until we know whether we're HW binning or not, and we want to
+    * avoid a dependence on that here to be able to store attachments before
+    * the end of the renderpass in the future. Use the scratch space to
+    * save/restore them dynamically.
+    */
+   tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
+   tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
+                  CP_REG_TO_SCRATCH_0_SCRATCH(0) |
+                  CP_REG_TO_SCRATCH_0_CNT(1 - 1));
+
    r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
-             iview->ubwc_enabled, dst_samples);
+             iview->view.ubwc_enabled, dst_samples);
 
    r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
 
-   if (separate_stencil)
-      r3d_dst_stencil(cs, iview, 0);
-   else
-      r3d_dst(cs, iview, 0);
+   if (iview->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (!separate_stencil) {
+         r3d_dst_depth(cs, iview, 0);
+      } else {
+         r3d_dst_stencil(cs, iview, 0);
+      }
+   } else {
+      r3d_dst(cs, &iview->view, 0);
+   }
 
    r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp);
 
    /* sync GMEM writes with CACHE. */
    tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
 
+   /* Wait for CACHE_INVALIDATE to land */
+   tu_cs_emit_wfi(cs);
+
    r3d_run(cmd, cs);
 
    /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
@@ -2835,6 +2936,17 @@ store_3d_blit(struct tu_cmd_buffer *cmd,
     * writes to depth images as a color RT, so there's no need to flush depth.
     */
    tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+
+   /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
+   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
+   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
+                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
+                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));
+
+   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
+   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
+                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
+                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));
 }
 
 void
@@ -2862,13 +2974,17 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
     * required y padding in the layout (except for the last level)
     */
    bool need_y2_align =
-      y2 != iview->extent.height || iview->need_y2_align;
+      y2 != iview->view.height || iview->view.need_y2_align;
 
    bool unaligned =
       x1 % phys_dev->info->gmem_align_w ||
-      (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||
+      (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
       y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);
 
+   /* Unaligned store is incredibly rare in CTS, we have to force it to test. */
+   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
+      unaligned = true;
+
    /* D32_SFLOAT_S8_UINT is quite special format: it has two planes,
     * one for depth and other for stencil. When resolving a MSAA
     * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
@@ -2877,22 +2993,25 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
       dst->format == VK_FORMAT_S8_UINT;
 
+   bool store_common = dst->store && !resolve_d32s8_s8;
+   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
+
    trace_start_gmem_store(&cmd->trace, cs);
 
    /* use fast path when render area is aligned, except for unsupported resolve cases */
    if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
-      if (dst->store)
-         tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);
-      if (dst->store_stencil)
+      if (store_common)
+         tu_emit_blit(cmd, cs, iview, src, true, false);
+      if (store_separate_stencil)
          tu_emit_blit(cmd, cs, iview, src, true, true);
 
       trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
       return;
    }
 
-   VkFormat format = src->format;
-   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
-      format = VK_FORMAT_D32_SFLOAT;
+   enum pipe_format format = tu_vk_format_to_pipe_format(src->format);
+   if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+      format = PIPE_FORMAT_Z32_FLOAT;
 
    if (dst->samples > 1) {
       /* If we hit this path, we have to disable draw states after every tile
@@ -2902,26 +3021,26 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
        * TODO: store a flag somewhere so we don't do this more than once and
        * don't do it after the renderpass when this happens.
        */
-      if (dst->store || dst->store_stencil)
+      if (store_common || store_separate_stencil)
          tu_disable_draw_states(cmd, cs);
 
-      if (dst->store) {
-         store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format,
+      if (store_common) {
+         store_3d_blit(cmd, cs, iview, dst->samples, false, format,
                        render_area, src->gmem_offset, src->cpp);
       }
-      if (dst->store_stencil) {
-         store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT,
-                       render_area, src->gmem_offset, src->samples);
+      if (store_separate_stencil) {
+         store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
+                       render_area, src->gmem_offset_stencil, src->samples);
       }
    } else {
       r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
 
-      if (dst->store) {
-         store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,
+      if (store_common) {
+         store_cp_blit(cmd, cs, iview, src->samples, false, format,
                        src->gmem_offset, src->cpp);
       }
-      if (dst->store_stencil) {
-         store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
+      if (store_separate_stencil) {
+         store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                        src->gmem_offset_stencil, src->samples);
       }
    }
diff --git a/lib/mesa/src/freedreno/vulkan/tu_legacy.c b/lib/mesa/src/freedreno/vulkan/tu_legacy.c
deleted file mode 100644
index 8209a96b0..000000000
--- a/lib/mesa/src/freedreno/vulkan/tu_legacy.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2020 Valve Corporation
- * SPDX-License-Identifier: MIT
- *
- * Authors:
- *    Jonathan Marek <jonathan@marek.ca>
- */
-
-#include <vulkan/vulkan.h>
-#include <vulkan/vk_android_native_buffer.h> /* android tu_entrypoints.h depends on this */
-#include <assert.h>
-
-#include "tu_entrypoints.h"
-#include "vk_util.h"
-
-void
-tu_GetPhysicalDeviceQueueFamilyProperties(VkPhysicalDevice pdev,
-                                          uint32_t *count,
-                                          VkQueueFamilyProperties *props)
-{
-   if (!props)
-      return tu_GetPhysicalDeviceQueueFamilyProperties2(pdev, count, NULL);
-
-   VkQueueFamilyProperties2 props2[*count];
-   for (uint32_t i = 0; i < *count; i++) {
-      props2[i].sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2;
-      props2[i].pNext = NULL;
-   }
-   tu_GetPhysicalDeviceQueueFamilyProperties2(pdev, count, props2);
-   for (uint32_t i = 0; i < *count; i++)
-      props[i] = props2[i].queueFamilyProperties;
-}
-
-void
-tu_GetPhysicalDeviceSparseImageFormatProperties(VkPhysicalDevice pdev,
-                                                VkFormat format,
-                                                VkImageType type,
-                                                VkSampleCountFlagBits samples,
-                                                VkImageUsageFlags usage,
-                                                VkImageTiling tiling,
-                                                uint32_t *count,
-                                                VkSparseImageFormatProperties *props)
-{
-   const VkPhysicalDeviceSparseImageFormatInfo2 info = {
-      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
-      .format = format,
-      .type = type,
-      .samples = samples,
-      .usage = usage,
-      .tiling = tiling,
-   };
-
-   if (!props)
-      return tu_GetPhysicalDeviceSparseImageFormatProperties2(pdev, &info, count, NULL);
-
-   VkSparseImageFormatProperties2 props2[*count];
-   for (uint32_t i = 0; i < *count; i++) {
-      props2[i].sType = VK_STRUCTURE_TYPE_SPARSE_IMAGE_FORMAT_PROPERTIES_2;
-      props2[i].pNext = NULL;
-   }
-   tu_GetPhysicalDeviceSparseImageFormatProperties2(pdev, &info, count, props2);
-   for (uint32_t i = 0; i < *count; i++)
-      props[i] = props2[i].properties;
-}
-
-void
-tu_GetImageSparseMemoryRequirements(VkDevice device,
-                                    VkImage image,
-                                    uint32_t *count,
-                                    VkSparseImageMemoryRequirements *reqs)
-{
-   const VkImageSparseMemoryRequirementsInfo2 info = {
-      .sType = VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2,
-      .image = image
-   };
-
-   if (!reqs)
-      return tu_GetImageSparseMemoryRequirements2(device, &info, count, NULL);
-
-   VkSparseImageMemoryRequirements2 reqs2[*count];
-   for (uint32_t i = 0; i < *count; i++) {
-      reqs2[i].sType = VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2;
-      reqs2[i].pNext = NULL;
-   }
-   tu_GetImageSparseMemoryRequirements2(device, &info, count, reqs2);
-   for (uint32_t i = 0; i < *count; i++)
-      reqs[i] = reqs2[i].memoryRequirements;
-}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_wsi_display.c b/lib/mesa/src/freedreno/vulkan/tu_wsi_display.c
deleted file mode 100644
index 9a9696d93..000000000
--- a/lib/mesa/src/freedreno/vulkan/tu_wsi_display.c
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Copyright © 2017 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that copyright
- * notice and this permission notice appear in supporting documentation, and
- * that the name of the copyright holders not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  The copyright holders make no representations
- * about the suitability of this software for any purpose.  It is provided "as
- * is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include "tu_private.h"
-#include "tu_cs.h"
-#include "util/disk_cache.h"
-#include "util/strtod.h"
-#include "vk_util.h"
-#include <xf86drm.h>
-#include <xf86drmMode.h>
-#include "vk_format.h"
-#include "util/debug.h"
-#include "wsi_common_display.h"
-
-VkResult
-tu_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
-                                         uint32_t *property_count,
-                                         VkDisplayPropertiesKHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_physical_device_display_properties(
-             physical_device,
-             &pdevice->wsi_device,
-             property_count,
-             properties);
-}
-
-VkResult
-tu_GetPhysicalDeviceDisplayProperties2KHR(VkPhysicalDevice physical_device,
-                                          uint32_t *property_count,
-                                          VkDisplayProperties2KHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_physical_device_display_properties2(
-             physical_device,
-             &pdevice->wsi_device,
-             property_count,
-             properties);
-}
-
-VkResult
-tu_GetPhysicalDeviceDisplayPlanePropertiesKHR(
-   VkPhysicalDevice physical_device,
-   uint32_t *property_count,
-   VkDisplayPlanePropertiesKHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_physical_device_display_plane_properties(
-             physical_device,
-             &pdevice->wsi_device,
-             property_count,
-             properties);
-}
-
-VkResult
-tu_GetPhysicalDeviceDisplayPlaneProperties2KHR(
-   VkPhysicalDevice physical_device,
-   uint32_t *property_count,
-   VkDisplayPlaneProperties2KHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_physical_device_display_plane_properties2(
-             physical_device,
-             &pdevice->wsi_device,
-             property_count,
-             properties);
-}
-
-VkResult
-tu_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
-                                       uint32_t plane_index,
-                                       uint32_t *display_count,
-                                       VkDisplayKHR *displays)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_display_plane_supported_displays(
-             physical_device,
-             &pdevice->wsi_device,
-             plane_index,
-             display_count,
-             displays);
-}
-
-
-VkResult
-tu_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
-                               VkDisplayKHR display,
-                               uint32_t *property_count,
-                               VkDisplayModePropertiesKHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_display_mode_properties(physical_device,
-                                                  &pdevice->wsi_device,
-                                                  display,
-                                                  property_count,
-                                                  properties);
-}
-
-VkResult
-tu_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device,
-                                VkDisplayKHR display,
-                                uint32_t *property_count,
-                                VkDisplayModeProperties2KHR *properties)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_get_display_mode_properties2(physical_device,
-                                                   &pdevice->wsi_device,
-                                                   display,
-                                                   property_count,
-                                                   properties);
-}
-
-VkResult
-tu_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
-                        VkDisplayKHR display,
-                        const VkDisplayModeCreateInfoKHR *create_info,
-                        const VkAllocationCallbacks *allocator,
-                        VkDisplayModeKHR *mode)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_display_create_display_mode(physical_device,
-                                          &pdevice->wsi_device,
-                                          display,
-                                          create_info,
-                                          allocator,
-                                          mode);
-}
-
-VkResult
-tu_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
-                                  VkDisplayModeKHR mode_khr,
-                                  uint32_t plane_index,
-                                  VkDisplayPlaneCapabilitiesKHR *capabilities)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_get_display_plane_capabilities(physical_device,
-                                             &pdevice->wsi_device,
-                                             mode_khr,
-                                             plane_index,
-                                             capabilities);
-}
-
-VkResult
-tu_GetDisplayPlaneCapabilities2KHR(VkPhysicalDevice physical_device,
-                                   const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo,
-                                   VkDisplayPlaneCapabilities2KHR *capabilities)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_get_display_plane_capabilities2(physical_device,
-                                              &pdevice->wsi_device,
-                                              pDisplayPlaneInfo,
-                                              capabilities);
-}
-
-VkResult
-tu_CreateDisplayPlaneSurfaceKHR(
-   VkInstance _instance,
-   const VkDisplaySurfaceCreateInfoKHR *create_info,
-   const VkAllocationCallbacks *allocator,
-   VkSurfaceKHR *surface)
-{
-   TU_FROM_HANDLE(tu_instance, instance, _instance);
-   const VkAllocationCallbacks *alloc;
-
-   if (allocator)
-      alloc = allocator;
-   else
-      alloc = &instance->alloc;
-
-   return wsi_create_display_surface(_instance, alloc,
-                                     create_info, surface);
-}
-
-VkResult
-tu_ReleaseDisplayEXT(VkPhysicalDevice physical_device,
-                     VkDisplayKHR     display)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_release_display(physical_device,
-                              &pdevice->wsi_device,
-                              display);
-}
-
-#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
-VkResult
-tu_AcquireXlibDisplayEXT(VkPhysicalDevice     physical_device,
-                         Display              *dpy,
-                         VkDisplayKHR         display)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_acquire_xlib_display(physical_device,
-                                   &pdevice->wsi_device,
-                                   dpy,
-                                   display);
-}
-
-VkResult
-tu_GetRandROutputDisplayEXT(VkPhysicalDevice  physical_device,
-                            Display           *dpy,
-                            RROutput          output,
-                            VkDisplayKHR      *display)
-{
-   TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device);
-
-   return wsi_get_randr_output_display(physical_device,
-                                       &pdevice->wsi_device,
-                                       dpy,
-                                       output,
-                                       display);
-}
-#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
-
-/* VK_EXT_display_control */
-
-VkResult
-tu_DisplayPowerControlEXT(VkDevice                    _device,
-                          VkDisplayKHR                display,
-                          const VkDisplayPowerInfoEXT *display_power_info)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-
-   return wsi_display_power_control(_device,
-                                    &device->physical_device->wsi_device,
-                                    display,
-                                    display_power_info);
-}
-
-VkResult
-tu_RegisterDeviceEventEXT(VkDevice                    _device,
-                          const VkDeviceEventInfoEXT  *device_event_info,
-                          const VkAllocationCallbacks *allocator,
-                          VkFence                     *_fence)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-   struct tu_fence            *fence;
-   VkResult                     ret;
-
-   fence = vk_alloc2(&device->instance->alloc, allocator, sizeof (*fence),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (!fence)
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
-
-   tu_fence_init(fence, false);
-
-   ret = wsi_register_device_event(_device,
-                                   &device->physical_device->wsi_device,
-                                   device_event_info,
-                                   allocator,
-                                   &fence->fence_wsi);
-   if (ret == VK_SUCCESS)
-      *_fence = tu_fence_to_handle(fence);
-   else
-      vk_free2(&device->instance->alloc, allocator, fence);
-   return ret;
-}
-
-VkResult
-tu_RegisterDisplayEventEXT(VkDevice                           _device,
-                           VkDisplayKHR                       display,
-                           const VkDisplayEventInfoEXT        *display_event_info,
-                           const VkAllocationCallbacks        *allocator,
-                           VkFence                            *_fence)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-
-   struct tu_fence            *fence;
-   VkResult                     ret;
-
-   fence = vk_alloc2(&device->instance->alloc, allocator, sizeof (*fence),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (!fence)
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
-
-   tu_fence_init(fence, false);
-
-   ret = wsi_register_display_event(_device,
-                                    &device->physical_device->wsi_device,
-                                    display,
-                                    display_event_info,
-                                    allocator,
-                                    &fence->fence_wsi);
-
-   if (ret == VK_SUCCESS)
-      *_fence = tu_fence_to_handle(fence);
-   else
-      vk_free2(&device->instance->alloc, allocator, fence);
-   return ret;
-}
-
-VkResult
-tu_GetSwapchainCounterEXT(VkDevice                    _device,
-                          VkSwapchainKHR              swapchain,
-                          VkSurfaceCounterFlagBitsEXT flag_bits,
-                          uint64_t                    *value)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-
-   return wsi_get_swapchain_counter(_device,
-                                    &device->physical_device->wsi_device,
-                                    swapchain,
-                                    flag_bits,
-                                    value);
-}
-
author	Jonathan Gray <jsg@cvs.openbsd.org>	2022-09-02 05:47:02 +0000
committer	Jonathan Gray <jsg@cvs.openbsd.org>	2022-09-02 05:47:02 +0000
commit	0dbbf1e0708df85a357d70e2708c0a11aeb5480e (patch)
tree	6656ff8eb8b15a2fc1c02888973caf618388cfd0 /lib/mesa/src/freedreno
parent	5f66494d31f735486b8222ecfa0a0c9046e92543 (diff)