summaryrefslogtreecommitdiff
path: root/lib/mesa/src/freedreno
diff options
context:
space:
mode:
authorJonathan Gray <jsg@cvs.openbsd.org>2020-09-22 02:09:17 +0000
committerJonathan Gray <jsg@cvs.openbsd.org>2020-09-22 02:09:17 +0000
commit865c23c9c56f47f6cf8d73e8a6060a0c33a28b93 (patch)
treeaeed22bc39ce87dd6f09ff173c8273beaef65fe7 /lib/mesa/src/freedreno
parent27e7bb02bd0f89f96d9e3b402b46c2c97ee4defe (diff)
Merge Mesa 20.0.8
With Mesa 20.1 even after the kernel change to do wbinvd on all cpus sthen@ reported that hard hangs still occurred on his Haswell system with inteldrm. Mark Kane also reported seeing hangs on Ivy Bridge on bugs@. Some systems/workloads seem to be more prone to triggering this than others as I have not seen any hangs on Ivy Bridge and the only hangs I saw on Haswell when running piglit went away with the wbinvd change. It seems something is wrong with drm memory attributes or coherency in the kernel and newer Mesa versions expect behaviour we don't have.
Diffstat (limited to 'lib/mesa/src/freedreno')
-rw-r--r--lib/mesa/src/freedreno/computerator/README.rst54
-rw-r--r--lib/mesa/src/freedreno/computerator/a6xx.c483
-rw-r--r--lib/mesa/src/freedreno/computerator/examples/invocationid.asm13
-rw-r--r--lib/mesa/src/freedreno/computerator/examples/simple.asm28
-rwxr-xr-xlib/mesa/src/freedreno/computerator/examples/test-opcodes.sh297
-rw-r--r--lib/mesa/src/freedreno/computerator/ir3_asm.c74
-rw-r--r--lib/mesa/src/freedreno/computerator/ir3_asm.h44
-rw-r--r--lib/mesa/src/freedreno/computerator/main.c315
-rw-r--r--lib/mesa/src/freedreno/computerator/main.h182
-rw-r--r--lib/mesa/src/freedreno/computerator/meson.build54
-rw-r--r--lib/mesa/src/freedreno/fdl/fd6_layout_test.c470
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_cf.c160
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_delay.c364
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_depth.c (renamed from lib/mesa/src/freedreno/ir3/ir3_dce.c)186
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_lexer.l323
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_parser.y905
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_postsched.c715
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_ra.h372
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_ra_regset.c270
-rw-r--r--lib/mesa/src/freedreno/ir3/tests/disasm.c145
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_clear_blit.c2330
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_meta_blit.c91
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_meta_buffer.c75
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_meta_clear.c172
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_meta_copy.c215
-rw-r--r--lib/mesa/src/freedreno/vulkan/tu_meta_resolve.c67
26 files changed, 754 insertions, 7650 deletions
diff --git a/lib/mesa/src/freedreno/computerator/README.rst b/lib/mesa/src/freedreno/computerator/README.rst
deleted file mode 100644
index 4e5f81c5b..000000000
--- a/lib/mesa/src/freedreno/computerator/README.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-Overview
-========
-
-Computerator is a tool to launch compute shaders, written in assembly.
-The main purpose is to have an easy way to experiment with instructions
-without dealing with the entire compiler stack (which makes controlling
-the order of instructions, the registers chosen, etc, difficult). The
-choice of compute shaders is simply because there is far less state
-setup required.
-
-Headers
--------
-
-The shader assembly can be prefixed with headers to control state setup:
-
-* ``@localsize X, Y, Z`` - configures local workgroup size
-* ``@buf SZ`` - configures an SSBO of the specified size (in dwords).
- The order of the ``@buf`` headers determines the index, ie the first
- ``@buf`` header is ``g[0]``, the second ``g[1]``, and so on
-* ``@const(cN.c)`` configures a const vec4 starting at specified
- const register, ie ``@const(c1.x) 1.0, 2.0, 3.0, 4.0`` will populate
- ``c1.xyzw`` with ``vec4(1.0, 2.0, 3.0, 4.0)``
-* ``@invocationid(rN.c)`` will populate a vec3 starting at the specified
- register with the local invocation-id
-* ``@wgid(rN.c)`` will populate a vec3 starting at the specified register
- with the workgroup-id (must be a high-reg, ie. ``r48.x`` and above)
-* ``@numwg(cN.c)`` will populate a vec3 starting at the specified const
- register
-
-Example
--------
-
-```
-@localsize 32, 1, 1
-@buf 32 ; g[0]
-@const(c0.x) 0.0, 0.0, 0.0, 0.0
-@const(c1.x) 1.0, 2.0, 3.0, 4.0
-@wgid(r48.x) ; r48.xyz
-@invocationid(r0.x) ; r0.xyz
-@numwg(c2.x) ; c2.xyz
-mov.u32u32 r0.y, r0.x
-(rpt5)nop
-stib.untyped.1d.u32.1 g[0] + r0.y, r0.x
-end
-nop
-```
-
-Usage
------
-
-```
-cat myshader.asm | ./computerator --disasm --groups=4,4,4
-```
-
diff --git a/lib/mesa/src/freedreno/computerator/a6xx.c b/lib/mesa/src/freedreno/computerator/a6xx.c
deleted file mode 100644
index b02718bc8..000000000
--- a/lib/mesa/src/freedreno/computerator/a6xx.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright © 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir3/ir3_compiler.h"
-
-#include "util/u_math.h"
-
-#include "registers/adreno_pm4.xml.h"
-#include "registers/adreno_common.xml.h"
-#include "registers/a6xx.xml.h"
-
-#include "main.h"
-#include "ir3_asm.h"
-
-struct a6xx_backend {
- struct backend base;
-
- struct ir3_compiler *compiler;
- struct fd_device *dev;
-
- unsigned seqno;
- struct fd_bo *control_mem;
-
- struct fd_bo *query_mem;
- const struct perfcntr *perfcntrs;
- unsigned num_perfcntrs;
-};
-define_cast(backend, a6xx_backend);
-
-/*
- * Data structures shared with GPU:
- */
-
-/* This struct defines the layout of the fd6_context::control buffer: */
-struct fd6_control {
- uint32_t seqno; /* seqno for async CP_EVENT_WRITE, etc */
- uint32_t _pad0;
- volatile uint32_t vsc_overflow;
- uint32_t _pad1;
- /* flag set from cmdstream when VSC overflow detected: */
- uint32_t vsc_scratch;
- uint32_t _pad2;
- uint32_t _pad3;
- uint32_t _pad4;
-
- /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
- struct {
- uint32_t offset;
- uint32_t pad[7];
- } flush_base[4];
-};
-
-#define control_ptr(a6xx_backend, member) \
- (a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 0
-
-
-struct PACKED fd6_query_sample {
- uint64_t start;
- uint64_t result;
- uint64_t stop;
-};
-
-
-/* offset of a single field of an array of fd6_query_sample: */
-#define query_sample_idx(a6xx_backend, idx, field) \
- (a6xx_backend)->query_mem, \
- (idx * sizeof(struct fd6_query_sample)) + \
- offsetof(struct fd6_query_sample, field), \
- 0, 0
-
-
-/*
- * Backend implementation:
- */
-
-static struct kernel *
-a6xx_assemble(struct backend *b, FILE *in)
-{
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);
- struct ir3_kernel *ir3_kernel =
- ir3_asm_assemble(a6xx_backend->compiler, in);
- ir3_kernel->backend = b;
- return &ir3_kernel->base;
-}
-
-static void
-a6xx_disassemble(struct kernel *kernel, FILE *out)
-{
- ir3_asm_disassemble(to_ir3_kernel(kernel), out);
-}
-
-static void
-cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
-{
- struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
- struct ir3_shader_variant *v = ir3_kernel->v;
- const struct ir3_info *i = &v->info;
- enum a3xx_threadsize thrsz = FOUR_QUADS;
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
- OUT_RING(ring, 0xff);
-
- unsigned constlen = align(v->constlen, 4);
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1);
- OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) |
- A6XX_HLSQ_CS_CNTL_ENABLED);
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);
- OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |
- A6XX_SP_CS_CONFIG_NIBO(kernel->num_bufs) |
- A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
- A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */
- OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1);
- OUT_RING(ring, A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) |
- A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
- A6XX_SP_CS_CTRL_REG0_MERGEDREGS |
- A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
- COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
- OUT_RING(ring, 0x41);
-
- uint32_t local_invocation_id, work_group_id;
- local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
- work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);
- OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- OUT_RING(ring, 0x2fc); /* HLSQ_CS_UNKNOWN_B998 */
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START_LO, 2);
- OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_INSTRLEN, 1);
- OUT_RING(ring, v->instrlen);
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START_LO, 2);
- OUT_RELOC(ring, v->bo, 0, 0, 0);
-
- OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
- OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen));
- OUT_RELOCD(ring, v->bo, 0, 0, 0);
-}
-
-static void
-emit_const(struct fd_ringbuffer *ring, uint32_t regid,
- uint32_t sizedwords, const uint32_t *dwords)
-{
- uint32_t align_sz;
-
- debug_assert((regid % 4) == 0);
-
- align_sz = align(sizedwords, 4);
-
- OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3 + align_sz);
- OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
- OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
- for (uint32_t i = 0; i < sizedwords; i++) {
- OUT_RING(ring, dwords[i]);
- }
-
- /* Zero-pad to multiple of 4 dwords */
- for (uint32_t i = sizedwords; i < align_sz; i++) {
- OUT_RING(ring, 0);
- }
-}
-
-
-static void
-cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, uint32_t grid[3])
-{
- struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
- struct ir3_shader_variant *v = ir3_kernel->v;
-
- const struct ir3_const_state *const_state = &v->shader->const_state;
- uint32_t base = const_state->offsets.immediate;
- int size = const_state->immediates_count;
-
- if (ir3_kernel->info.numwg != INVALID_REG) {
- assert((ir3_kernel->info.numwg & 0x3) == 0);
- int idx = ir3_kernel->info.numwg >> 2;
- const_state->immediates[idx].val[0] = grid[0];
- const_state->immediates[idx].val[1] = grid[1];
- const_state->immediates[idx].val[2] = grid[2];
- }
-
- /* truncate size to avoid writing constants that shader
- * does not use:
- */
- size = MIN2(size + base, v->constlen) - base;
-
- /* convert out of vec4: */
- base *= 4;
- size *= 4;
-
- if (size > 0) {
- emit_const(ring, base, size, const_state->immediates[0].val);
- }
-}
-
-static void
-cs_ibo_emit(struct fd_ringbuffer *ring, struct fd_submit *submit,
- struct kernel *kernel)
-{
- struct fd_ringbuffer *state =
- fd_submit_new_ringbuffer(submit,
- kernel->num_bufs * 16 * 4,
- FD_RINGBUFFER_STREAMING);
-
- for (unsigned i = 0; i < kernel->num_bufs; i++) {
- /* size is encoded with low 15b in WIDTH and high bits in HEIGHT,
- * in units of elements:
- */
- unsigned sz = kernel->buf_sizes[i];
- unsigned width = sz & MASK(15);
- unsigned height = sz >> 15;
-
- OUT_RING(state, A6XX_IBO_0_FMT(FMT6_32_UINT) |
- A6XX_IBO_0_TILE_MODE(0));
- OUT_RING(state, A6XX_IBO_1_WIDTH(width) |
- A6XX_IBO_1_HEIGHT(height));
- OUT_RING(state, A6XX_IBO_2_PITCH(0) |
- A6XX_IBO_2_UNK4 | A6XX_IBO_2_UNK31 |
- A6XX_IBO_2_TYPE(A6XX_TEX_1D));
- OUT_RING(state, A6XX_IBO_3_ARRAY_PITCH(0));
- OUT_RELOCW(state, kernel->bufs[i], 0, 0, 0);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- OUT_RING(state, 0x00000000);
- }
-
- OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
- OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(kernel->num_bufs));
- OUT_RB(ring, state);
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_LO, 2);
- OUT_RB(ring, state);
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
- OUT_RING(ring, kernel->num_bufs);
-
- fd_ringbuffer_del(state);
-}
-
-static inline unsigned
-event_write(struct fd_ringbuffer *ring, struct kernel *kernel,
- enum vgt_event_type evt, bool timestamp)
-{
- unsigned seqno = 0;
-
- OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1);
- OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt));
- if (timestamp) {
- struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
- seqno = ++a6xx_backend->seqno;
- OUT_RELOCW(ring, control_ptr(a6xx_backend, seqno)); /* ADDR_LO/HI */
- OUT_RING(ring, seqno);
- }
-
- return seqno;
-}
-
-static inline void
-cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)
-{
- struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
- unsigned seqno;
-
- seqno = event_write(ring, kernel, RB_DONE_TS, true);
-
- OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
- OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
- CP_WAIT_REG_MEM_0_POLL_MEMORY);
- OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
- OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
- OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
- OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
-
- seqno = event_write(ring, kernel, CACHE_FLUSH_TS, true);
-
- OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
- OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
- OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
- OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));
-}
-
-static void
-a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit)
-{
- struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
- struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0,
- FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
-
- cs_program_emit(ring, kernel);
- cs_const_emit(ring, kernel, grid);
- cs_ibo_emit(ring, submit, kernel);
-
- OUT_PKT7(ring, CP_SET_MARKER, 1);
- OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
-
- const unsigned *local_size = kernel->local_size;
- const unsigned *num_groups = grid;
-
- unsigned work_dim = 0;
- for (int i = 0; i < 3; i++) {
- if (!grid[i])
- break;
- work_dim++;
- }
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
- OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
- OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
- OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
- OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
-
- if (a6xx_backend->num_perfcntrs > 0) {
- a6xx_backend->query_mem = fd_bo_new(a6xx_backend->dev,
- a6xx_backend->num_perfcntrs * sizeof(struct fd6_query_sample),
- DRM_FREEDRENO_GEM_TYPE_KMEM, "query");
-
- /* configure the performance counters to count the requested
- * countables:
- */
- for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
- const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
-
- OUT_PKT4(ring, counter->select_reg, 1);
- OUT_RING(ring, counter->selector);
- }
-
- OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
-
- /* and snapshot the start values: */
- for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
- const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
-
- OUT_PKT7(ring, CP_REG_TO_MEM, 3);
- OUT_RING(ring, CP_REG_TO_MEM_0_64B |
- CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
- OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, start));
- }
- }
-
- OUT_PKT7(ring, CP_EXEC_CS, 4);
- OUT_RING(ring, 0x00000000);
- OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));
- OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1]));
- OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2]));
-
- OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
-
- if (a6xx_backend->num_perfcntrs > 0) {
- /* snapshot the end values: */
- for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
- const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
-
- OUT_PKT7(ring, CP_REG_TO_MEM, 3);
- OUT_RING(ring, CP_REG_TO_MEM_0_64B |
- CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
- OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, stop));
- }
-
- /* and compute the result: */
- for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
- /* result += stop - start: */
- OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
- OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
- CP_MEM_TO_MEM_0_NEG_C);
- OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, result)); /* dst */
- OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* srcA */
- OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop)); /* srcB */
- OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start)); /* srcC */
- }
- }
-
- cache_flush(ring, kernel);
-}
-
-static void
-a6xx_set_perfcntrs(struct backend *b, const struct perfcntr *perfcntrs,
- unsigned num_perfcntrs)
-{
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);
-
- a6xx_backend->perfcntrs = perfcntrs;
- a6xx_backend->num_perfcntrs = num_perfcntrs;
-}
-
-static void
-a6xx_read_perfcntrs(struct backend *b, uint64_t *results)
-{
- struct a6xx_backend *a6xx_backend = to_a6xx_backend(b);
-
- fd_bo_cpu_prep(a6xx_backend->query_mem, NULL, DRM_FREEDRENO_PREP_READ);
- struct fd6_query_sample *samples = fd_bo_map(a6xx_backend->query_mem);
-
- for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
- results[i] = samples[i].result;
- }
-}
-
-struct backend *
-a6xx_init(struct fd_device *dev, uint32_t gpu_id)
-{
- struct a6xx_backend *a6xx_backend = calloc(1, sizeof(*a6xx_backend));
-
- a6xx_backend->base = (struct backend) {
- .assemble = a6xx_assemble,
- .disassemble = a6xx_disassemble,
- .emit_grid = a6xx_emit_grid,
- .set_perfcntrs = a6xx_set_perfcntrs,
- .read_perfcntrs = a6xx_read_perfcntrs,
- };
-
- a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id);
- a6xx_backend->dev = dev;
-
- a6xx_backend->control_mem = fd_bo_new(dev, 0x1000,
- DRM_FREEDRENO_GEM_TYPE_KMEM, "control");
-
- return &a6xx_backend->base;
-}
diff --git a/lib/mesa/src/freedreno/computerator/examples/invocationid.asm b/lib/mesa/src/freedreno/computerator/examples/invocationid.asm
deleted file mode 100644
index e79421836..000000000
--- a/lib/mesa/src/freedreno/computerator/examples/invocationid.asm
+++ /dev/null
@@ -1,13 +0,0 @@
-@localsize 32, 1, 1
-@buf 32 ; g[0]
-@const(c0.x) 0.0, 0.0, 0.0, 0.0
-@const(c1.x) 1.0, 2.0, 3.0, 4.0
-@wgid(r48.x) ; r48.xyz
-@invocationid(r0.x) ; r0.xyz
-@numwg(c2.x) ; c2.xyz
-mov.u32u32 r0.y, r0.x
-(rpt5)nop
-stib.untyped.1d.u32.1 g[0] + r0.y, r0.x
-end
-nop
-
diff --git a/lib/mesa/src/freedreno/computerator/examples/simple.asm b/lib/mesa/src/freedreno/computerator/examples/simple.asm
deleted file mode 100644
index 05350a98c..000000000
--- a/lib/mesa/src/freedreno/computerator/examples/simple.asm
+++ /dev/null
@@ -1,28 +0,0 @@
-@localsize 1, 1, 1
-@buf 4 ; g[0]
-@const(c0.x) 0.0, 0.0, 0.0, 0.0
-@const(c1.x) 1.0, 2.0, 3.0, 4.0
-@wgid(r48.x) ; r48.xyz
-@invocationid(r0.x) ; r0.xyz
-@numwg(c2.x) ; c2.xyz
-mov.f32f32 r2.x, c0.y
-mov.u32u32 r0.x, 0x12345678
-mov.u32u32 r0.y, 0x12345678
-mov.u32u32 r0.z, 0x12345678
-add.u r2.x, c0.x, r2.x
-mov.u32u32 r0.w, 0x12345678
-mov.u32u32 r1.x, 0x12345678
-mov.u32u32 r1.y, 0x12345678
-cov.u32s16 hr4.x, r2.x
-mov.u32u32 r1.z, 0x12345678
-mov.u32u32 r1.w, 0x12345678
-nop
-mova a0.x, hr4.x
-(rpt5)nop
-(ul)mov.u32u32 r0.x, r<a0.x>
-mov.u32u32 r0.y, 0x00000000
-(rpt5)nop
-stib.untyped.1d.u32.1 g[0] + r0.y, r0.x
-end
-nop
-
diff --git a/lib/mesa/src/freedreno/computerator/examples/test-opcodes.sh b/lib/mesa/src/freedreno/computerator/examples/test-opcodes.sh
deleted file mode 100755
index e6f7e73e5..000000000
--- a/lib/mesa/src/freedreno/computerator/examples/test-opcodes.sh
+++ /dev/null
@@ -1,297 +0,0 @@
-#!/bin/bash
-#
-# Test various instructions to check whether half<->full widening/narrowing
-# works. The basic premise is to perform the same instruction with and
-# without the widening/narrowing folded in and check if the results match.
-#
-# Note this doesn't currently diferentiate between signed/unsigned/bool,
-# and just assumes int is signed (since unsigned is basically(ish) like
-# signed but without sign extension)
-#
-# TODO probably good pick numeric src values that are better at triggering
-# edge cases, while still not loosing precision in a full->half->full
-# seqeuence.. but some instructions like absneg don't even appear to be
-# subtlely wrong when you try to fold in a precision conversion.
-#
-# add '-v' arg to see the result values
-
-set -e
-
-#
-# Templates for float->float instructions:
-#
-f2f_instrs=(
- 'add.f $dst, $src1, $src2'
- 'min.f $dst, $src1, $src2'
- 'min.f $dst, $src2, $src1'
- 'max.f $dst, $src1, $src2'
- 'max.f $dst, $src2, $src1'
- 'mul.f $dst, $src1, $src2'
- 'sign.f $dst, $src1'
- 'absneg.f $dst, \(neg\)$src1'
- 'absneg.f $dst, \(abs\)$src1'
- 'floor.f $dst, $src1'
- 'ceil.f $dst, $src1'
- 'rndne.f $dst, $src1'
- 'rndaz.f $dst, $src1'
- 'trunc.f $dst, $src1'
-)
-
-#
-# Templates for float->int instructions:
-#
-f2i_instrs=(
- 'cmps.f.gt $dst, $src1, $src2'
- 'cmps.f.lt $dst, $src1, $src2'
- 'cmpv.f.gt $dst, $src1, $src2'
- 'cmpv.f.lt $dst, $src1, $src2'
-)
-
-#
-# Templates for int->int instructions:
-#
-i2i_instrs=(
- 'add.u $dst, $src1, $src2'
- 'add.s $dst, $src1, $src2'
- 'sub.u $dst, $src1, $src2'
- 'sub.s $dst, $src1, $src2'
- 'cmps.f.gt $dst, $src1, $src2'
- 'cmps.f.lt $dst, $src1, $src2'
- 'min.u $dst, $src1, $src2'
- 'min.u $dst, $src2, $src1'
- 'min.s $dst, $src1, $src2'
- 'min.s $dst, $src2, $src1'
- 'max.u $dst, $src1, $src2'
- 'max.u $dst, $src2, $src1'
- 'max.s $dst, $src1, $src2'
- 'max.s $dst, $src2, $src1'
- 'absneg.s $dst, \(neg\)$src1'
- 'absneg.s $dst, \(abs\)$src1'
- 'and.b $dst, $src2, $src3'
- 'or.b $dst, $src1, $src2'
- 'not.b $dst, $src1'
- 'xor.b $dst, $src1, $src2'
- 'cmpv.u.gt $dst, $src1, $src2'
- 'cmpv.u.lt $dst, $src1, $src2'
- 'cmpv.s.gt $dst, $src1, $src2'
- 'cmpv.s.lt $dst, $src1, $src2'
- 'mul.u24 $dst, $src1, $src2'
- 'mul.s24 $dst, $src1, $src2'
- 'mull.u $dst, $src1, $src2'
- 'bfrev.b $dst, $src1'
- 'clz.s $dst, $src2'
- 'clz.b $dst, $src2'
- 'shl.b $dst, $src1, $src2'
- 'shr.b $dst, $src3, $src1'
- 'ashr.b $dst, $src3, $src1'
- 'mgen.b $dst, $src1, $src2'
- 'getbit.b $dst, $src3, $src2'
- 'setrm $dst, $src1'
- 'cbits.b $dst, $src3'
- 'shb $dst, $src1, $src2'
- 'msad $dst, $src1, $src2'
-)
-
-#
-# Helper to expand instruction template:
-#
-expand() {
- instr=$1
- dst=$2
- src1=$3
- src2=$4
- src3=$5
- eval echo $instr
-}
-
-expand_test() {
- instr=$1
-
- echo '; control, half->half:'
- expand $instr "hr1.x" "hr0.x" "hr0.y" "hr0.z"
- echo '; test, full->half:'
- expand $instr "hr1.y" "r1.x" "r1.y" "r1.z"
-
- echo '; control, full->full:'
- expand $instr "r2.x" "r1.x" "r1.y" "r1.z"
- echo '; test, half->full:'
- expand $instr "r2.y" "hr0.x" "hr0.y" "hr0.z"
-
- echo "(rpt5)nop"
-}
-
-#
-# Helpers to construct test program assembly:
-#
-header_asm() {
- cat <<EOF
-@localsize 1, 1, 1
-@buf 4 ; g[0]
-EOF
-}
-
-footer_asm() {
- cat <<EOF
-; dest offsets:
-mov.u32u32 r3.x, 0
-mov.u32u32 r3.y, 1
-mov.u32u32 r3.z, 2
-mov.u32u32 r3.w, 3
-(rpt5)nop
-
-; and store results:
-stib.untyped.1d.u32.1 g[0] + r3.x, r2.x ; control: full->full
-stib.untyped.1d.u32.1 g[0] + r3.y, r2.y ; test: half->full
-stib.untyped.1d.u32.1 g[0] + r3.z, r2.z ; control: half->half
-stib.untyped.1d.u32.1 g[0] + r3.w, r2.w ; test: full->half
-(sy)nop
-end
-EOF
-}
-
-setup_asm_float() {
- cat <<EOF
-; hr0->hr1 (r0) avail for half, hr0 for src, hr1 for dst
-; r1->r2 avail for full, r1 for src, r2 for dst
-cov.f32f16 hr0.x, (1.0)
-cov.f32f16 hr0.y, (2.0)
-cov.f32f16 hr0.z, (3.0)
-mov.f32f32 r1.x, (1.0)
-mov.f32f32 r1.y, (2.0)
-mov.f32f32 r1.z, (3.0)
-(rpt5)nop
-EOF
-}
-
-setup_asm_int() {
- cat <<EOF
-; hr0->hr1 (r0) avail for half, hr0 for src, hr1 for dst
-; r1->r2 avail for full, r1 for src, r2 for dst
-cov.s32s16 hr0.x, 1
-cov.s32s16 hr0.y, -2
-cov.s32s16 hr0.z, 3
-mov.s32s32 r1.x, 1
-mov.s32s32 r1.y, -2
-mov.s32s32 r1.z, 3
-(rpt5)nop
-EOF
-}
-
-#
-# Generate assembly code to test float->float opcode
-#
-f2f_asm() {
- instr=$1
-
- header_asm
- setup_asm_float
- expand_test $instr
-
- cat <<EOF
-; convert half results back to full:
-cov.f16f32 r2.z, hr1.x
-cov.f16f32 r2.w, hr1.y
-EOF
-
- footer_asm
-}
-
-#
-# Generate assembly code to test float->int opcode
-#
-f2i_asm() {
- instr=$1
-
- header_asm
- setup_asm_float
- expand_test $instr
-
- cat <<EOF
-; convert half results back to full:
-cov.s16s32 r2.z, hr1.x
-cov.s16s32 r2.w, hr1.y
-EOF
-
- footer_asm
-}
-
-#
-# Generate assembly code to test int->int opcode
-#
-i2i_asm() {
- instr=$1
-
- header_asm
- setup_asm_int
- expand_test $instr
-
- cat <<EOF
-; convert half results back to full:
-cov.s16s32 r2.z, hr1.x
-cov.s16s32 r2.w, hr1.y
-EOF
-
- footer_asm
-}
-
-
-#
-# Helper to parse computerator output and print results:
-#
-check_results() {
- str=`cat - | grep " " | head -1 | xargs`
-
- if [ "$verbose" = "true" ]; then
- echo $str
- fi
-
- # Split components of result buffer:
- cf=$(echo $str | cut -f1 -d' ')
- tf=$(echo $str | cut -f2 -d' ')
- ch=$(echo $str | cut -f3 -d' ')
- th=$(echo $str | cut -f4 -d' ')
-
- # Sanity test, make sure the control results match:
- if [ $cf != $ch ]; then
- echo " FAIL: control results do not match! Half vs full op is not equivalent!"
- echo " full=$cf half=$ch"
- fi
-
- # Compare test (with conversion folded) to control:
- if [ $cf != $tf ]; then
- echo " FAIL: half -> full widening result does not match control!"
- echo " control=$cf result=$tf"
- fi
- if [ $ch != $th ]; then
- echo " FAIL: full -> half narrowing result does not match control!"
- echo " control=$ch result=$th"
- fi
-
- # HACK without a delay different invocations
- # of computerator seem to somehow clobber each
- # other.. which isn't great..
- sleep 0.1
-}
-
-#
-# Run the tests!
-#
-
-if [ "$1" = "-v" ]; then
- verbose="true"
-fi
-
-IFS=""
-for instr in ${f2f_instrs[@]}; do
- echo "TEST: $instr"
- f2f_asm $instr | ./computerator -g 1,1,1 | check_results
-done
-for instr in ${f2i_instrs[@]}; do
- echo "TEST: $instr"
- f2i_asm $instr | ./computerator -g 1,1,1 | check_results
-done
-for instr in ${i2i_instrs[@]}; do
- echo "TEST: $instr"
- i2i_asm $instr | ./computerator -g 1,1,1 | check_results
-done
-
diff --git a/lib/mesa/src/freedreno/computerator/ir3_asm.c b/lib/mesa/src/freedreno/computerator/ir3_asm.c
deleted file mode 100644
index cbab6e8e1..000000000
--- a/lib/mesa/src/freedreno/computerator/ir3_asm.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright © 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir3/ir3_compiler.h"
-#include "ir3/ir3_parser.h"
-
-#include "ir3_asm.h"
-
-struct ir3_kernel *
-ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
-{
- struct ir3_kernel *kernel = calloc(1, sizeof(*kernel));
-
- struct ir3_shader *shader = calloc(1, sizeof(*shader));
- shader->compiler = c;
- shader->type = MESA_SHADER_COMPUTE;
-
- struct ir3_shader_variant *v = calloc(1, sizeof(*v));
- v->type = MESA_SHADER_COMPUTE;
- v->shader = shader;
-
- kernel->v = v;
-
- kernel->info.numwg = INVALID_REG;
-
- v->ir = ir3_parse(v, &kernel->info, in);
- if (!v->ir)
- errx(-1, "parse failed");
-
- ir3_debug_print(v->ir, "AFTER PARSING");
-
- memcpy(kernel->base.local_size, kernel->info.local_size, sizeof(kernel->base.local_size));
- kernel->base.num_bufs = kernel->info.num_bufs;
- memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
-
- kernel->bin = ir3_shader_assemble(v, c->gpu_id);
-
- unsigned sz = v->info.sizedwords * 4;
-
- v->bo = fd_bo_new(c->dev, sz,
- DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
- DRM_FREEDRENO_GEM_TYPE_KMEM,
- "%s", ir3_shader_stage(v));
-
- memcpy(fd_bo_map(v->bo), kernel->bin, sz);
-
- return kernel;
-}
-
-void
-ir3_asm_disassemble(struct ir3_kernel *k, FILE *out)
-{
- ir3_shader_disasm(k->v, k->bin, out);
-}
diff --git a/lib/mesa/src/freedreno/computerator/ir3_asm.h b/lib/mesa/src/freedreno/computerator/ir3_asm.h
deleted file mode 100644
index 1a03eb254..000000000
--- a/lib/mesa/src/freedreno/computerator/ir3_asm.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright © 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __IR3_ASM_H__
-#define __IR3_ASM_H__
-
-#include "main.h"
-
-#include "ir3/ir3_shader.h"
-#include "ir3/ir3_parser.h"
-
-struct ir3_kernel {
- struct kernel base;
- struct ir3_kernel_info info;
- struct backend *backend;
- struct ir3_shader_variant *v;
- void *bin;
-};
-define_cast(kernel, ir3_kernel);
-
-struct ir3_kernel *ir3_asm_assemble(struct ir3_compiler *c, FILE *in);
-void ir3_asm_disassemble(struct ir3_kernel *k, FILE *out);
-
-#endif /* __IR3_ASM_H__ */
diff --git a/lib/mesa/src/freedreno/computerator/main.c b/lib/mesa/src/freedreno/computerator/main.c
deleted file mode 100644
index 5224bf0cf..000000000
--- a/lib/mesa/src/freedreno/computerator/main.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright © 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <getopt.h>
-#include <inttypes.h>
-#include <locale.h>
-#include <xf86drm.h>
-#include <stdlib.h>
-
-#include "util/u_math.h"
-
-#include "perfcntrs/freedreno_perfcntr.h"
-
-#include "main.h"
-
-
-static void
-dump_float(void *buf, int sz)
-{
- uint8_t *ptr = (uint8_t *)buf;
- uint8_t *end = ptr + sz - 3;
- int i = 0;
-
- while (ptr < end) {
- uint32_t d = 0;
-
- printf((i % 8) ? " " : "\t");
-
- d |= *(ptr++) << 0;
- d |= *(ptr++) << 8;
- d |= *(ptr++) << 16;
- d |= *(ptr++) << 24;
-
- printf("%8f", uif(d));
-
- if ((i % 8) == 7) {
- printf("\n");
- }
-
- i++;
- }
-
- if (i % 8) {
- printf("\n");
- }
-}
-
-static void
-dump_hex(void *buf, int sz)
-{
- uint8_t *ptr = (uint8_t *)buf;
- uint8_t *end = ptr + sz;
- int i = 0;
-
- while (ptr < end) {
- uint32_t d = 0;
-
- printf((i % 8) ? " " : "\t");
-
- d |= *(ptr++) << 0;
- d |= *(ptr++) << 8;
- d |= *(ptr++) << 16;
- d |= *(ptr++) << 24;
-
- printf("%08x", d);
-
- if ((i % 8) == 7) {
- printf("\n");
- }
-
- i++;
- }
-
- if (i % 8) {
- printf("\n");
- }
-}
-
-static const char *shortopts = "df:g:hp:";
-
-static const struct option longopts[] = {
- {"disasm", no_argument, 0, 'd'},
- {"file", required_argument, 0, 'f'},
- {"groups", required_argument, 0, 'g'},
- {"help", no_argument, 0, 'h'},
- {"perfcntr", required_argument, 0, 'p'},
- {0, 0, 0, 0}
-};
-
-static void
-usage(const char *name)
-{
- printf("Usage: %s [-dfgh]\n"
- "\n"
- "options:\n"
- " -d, --disasm print disassembled shader\n"
- " -f, --file=FILE read shader from file (instead of stdin)\n"
- " -g, --groups=X,Y,Z use specified group size\n"
- " -h, --help show this message\n"
- " -p, --perfcntr=LIST sample specified performance counters (comma\n"
- " separated list)\n"
- ,
- name);
-}
-
-/* performance counter description: */
-static unsigned num_groups;
-static const struct fd_perfcntr_group *groups;
-
-/* Track enabled counters per group: */
-static unsigned *enabled_counters;
-
-static void
-setup_counter(const char *name, struct perfcntr *c)
-{
- for (int i = 0; i < num_groups; i++) {
- const struct fd_perfcntr_group *group = &groups[i];
-
- for (int j = 0; j < group->num_countables; j++) {
- const struct fd_perfcntr_countable *countable = &group->countables[j];
-
- if (strcmp(name, countable->name) != 0)
- continue;
-
- /*
- * Allocate a counter to use to monitor the requested countable:
- */
- if (enabled_counters[i] >= group->num_counters) {
- errx(-1, "Too many counters selected in group: %s", group->name);
- }
-
- unsigned idx = enabled_counters[i]++;
- const struct fd_perfcntr_counter *counter = &group->counters[idx];
-
- /*
- * And initialize the perfcntr struct, pulling together the info
- * about selected counter and countable, to simplify life for the
- * backend:
- */
- c->name = name;
- c->select_reg = counter->select_reg;
- c->counter_reg_lo = counter->counter_reg_lo;
- c->counter_reg_hi = counter->counter_reg_hi;
- c->selector = countable->selector;
-
- return;
- }
- }
-
- errx(-1, "could not find countable: %s", name);
-}
-
-static struct perfcntr *
-parse_perfcntrs(uint32_t gpu_id, const char *perfcntrstr, unsigned *num_perfcntrs)
-{
- struct perfcntr *counters = NULL;
- char *cnames, *s;
- unsigned cnt = 0;
-
- groups = fd_perfcntrs(gpu_id, &num_groups);
- enabled_counters = calloc(num_groups, sizeof(enabled_counters[0]));
-
- cnames = strdup(perfcntrstr);
- while ((s = strstr(cnames, ","))) {
- char *name = cnames;
- s[0] = '\0';
- cnames = &s[1];
-
- counters = realloc(counters, ++cnt * sizeof(counters[0]));
- setup_counter(name, &counters[cnt-1]);
- }
-
- char * name = cnames;
- counters = realloc(counters, ++cnt * sizeof(counters[0]));
- setup_counter(name, &counters[cnt-1]);
-
- *num_perfcntrs = cnt;
-
- return counters;
-}
-
-int
-main(int argc, char **argv)
-{
- FILE *in = stdin;
- const char *perfcntrstr = NULL;
- struct perfcntr *perfcntrs = NULL;
- unsigned num_perfcntrs = 0;
- bool disasm = false;
- uint32_t grid[3] = {0};
- int opt, ret;
-
- setlocale(LC_NUMERIC, "en_US.UTF-8");
-
- while ((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
- switch (opt) {
- case 'd':
- disasm = true;
- break;
- case 'f':
- in = fopen(optarg, "r");
- if (!in)
- err(1, "could not open '%s'", optarg);
- break;
- case 'g':
- ret = sscanf(optarg, "%u,%u,%u", &grid[0], &grid[1], &grid[2]);
- if (ret != 3)
- goto usage;
- break;
- case 'h':
- goto usage;
- case 'p':
- perfcntrstr = optarg;
- break;
- default:
- printf("unrecognized arg: %c\n", opt);
- goto usage;
- }
- }
-
- int fd = drmOpen("msm", NULL);
- if (fd < 0)
- err(1, "could not open drm device");
-
- struct fd_device *dev = fd_device_new(fd);
- struct fd_pipe *pipe = fd_pipe_new(dev, FD_PIPE_3D);
-
- uint64_t val;
- fd_pipe_get_param(pipe, FD_GPU_ID, &val);
- uint32_t gpu_id = val;
-
- printf("got gpu_id: %u\n", gpu_id);
-
- struct backend *backend;
- switch (gpu_id) {
- case 600 ... 699:
- backend = a6xx_init(dev, gpu_id);
- break;
- default:
- err(1, "unsupported gpu: a%u", gpu_id);
- }
-
- struct kernel *kernel = backend->assemble(backend, in);
- printf("localsize: %dx%dx%d\n", kernel->local_size[0],
- kernel->local_size[1], kernel->local_size[2]);
- for (int i = 0; i < kernel->num_bufs; i++) {
- printf("buf[%d]: size=%u\n", i, kernel->buf_sizes[i]);
- kernel->bufs[i] = fd_bo_new(dev, kernel->buf_sizes[i] * 4,
- DRM_FREEDRENO_GEM_TYPE_KMEM, "buf[%d]", i);
- }
-
- if (disasm)
- backend->disassemble(kernel, stdout);
-
- if (grid[0] == 0)
- return 0;
-
- struct fd_submit *submit = fd_submit_new(pipe);
-
- if (perfcntrstr) {
- if (!backend->set_perfcntrs) {
- err(1, "performance counters not supported");
- }
- perfcntrs = parse_perfcntrs(gpu_id, perfcntrstr, &num_perfcntrs);
- backend->set_perfcntrs(backend, perfcntrs, num_perfcntrs);
- }
-
- backend->emit_grid(kernel, grid, submit);
-
- fd_submit_flush(submit, -1, NULL, NULL);
-
- for (int i = 0; i < kernel->num_bufs; i++) {
- fd_bo_cpu_prep(kernel->bufs[i], pipe, DRM_FREEDRENO_PREP_READ);
- void *map = fd_bo_map(kernel->bufs[i]);
-
- printf("buf[%d]:\n", i);
- dump_hex(map, kernel->buf_sizes[i] * 4);
- dump_float(map, kernel->buf_sizes[i] * 4);
- }
-
- if (perfcntrstr) {
- uint64_t results[num_perfcntrs];
- backend->read_perfcntrs(backend, results);
-
- for (unsigned i = 0; i < num_perfcntrs; i++) {
- printf("%s:\t%'"PRIu64"\n", perfcntrs[i].name, results[i]);
- }
- }
-
- return 0;
-
-usage:
- usage(argv[0]);
- return -1;
-}
diff --git a/lib/mesa/src/freedreno/computerator/main.h b/lib/mesa/src/freedreno/computerator/main.h
deleted file mode 100644
index 57b1ac07c..000000000
--- a/lib/mesa/src/freedreno/computerator/main.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright © 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __MAIN_H__
-#define __MAIN_H__
-
-#include <err.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include "drm/freedreno_drmif.h"
-#include "drm/freedreno_ringbuffer.h"
-
-#include "registers/adreno_pm4.xml.h"
-#include "registers/adreno_common.xml.h"
-
-#define MAX_BUFS 4
-
-struct kernel {
- /* filled in by backend when shader is assembled: */
- uint32_t local_size[3];
- uint32_t num_bufs;
- uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
-
- /* filled in by frontend before launching grid: */
- struct fd_bo *bufs[MAX_BUFS];
-};
-
-struct perfcntr {
- const char *name;
-
- /* for backend to configure/read the counter, describes
- * the selected counter:
- */
- unsigned select_reg;
- unsigned counter_reg_lo;
- unsigned counter_reg_hi;
- /* and selected countable:
- */
- unsigned selector;
-};
-
-/* per-generation entry-points: */
-struct backend {
- struct kernel *(*assemble)(struct backend *b, FILE *in);
- void (*disassemble)(struct kernel *kernel, FILE *out);
- void (*emit_grid)(struct kernel *kernel, uint32_t grid[3],
- struct fd_submit *submit);
-
- /* performance-counter API: */
- void (*set_perfcntrs)(struct backend *b, const struct perfcntr *perfcntrs,
- unsigned num_perfcntrs);
- void (*read_perfcntrs)(struct backend *b, uint64_t *results);
-};
-
-#define define_cast(_from, _to) \
-static inline struct _to * \
-to_ ## _to(struct _from *f) \
-{ return (struct _to *)f; }
-
-struct backend *a6xx_init(struct fd_device *dev, uint32_t gpu_id);
-
-/*
- * cmdstream helpers:
- */
-
-static inline void
-BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
-{
- if (ring->cur + ndwords > ring->end)
- fd_ringbuffer_grow(ring, ndwords);
-}
-
-static inline void
-OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
-{
- fd_ringbuffer_emit(ring, data);
-}
-
-static inline unsigned
-_odd_parity_bit(unsigned val)
-{
- /* See: http://graphics.stanford.edu/~seander/bithacks.html#ParityParallel
- * note that we want odd parity so 0x6996 is inverted.
- */
- val ^= val >> 16;
- val ^= val >> 8;
- val ^= val >> 4;
- val &= 0xf;
- return (~0x6996 >> val) & 1;
-}
-
-static inline void
-OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
-{
- BEGIN_RING(ring, cnt+1);
- OUT_RING(ring, CP_TYPE4_PKT | cnt |
- (_odd_parity_bit(cnt) << 7) |
- ((regindx & 0x3ffff) << 8) |
- ((_odd_parity_bit(regindx) << 27)));
-}
-
-static inline void
-OUT_PKT7(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
-{
- BEGIN_RING(ring, cnt+1);
- OUT_RING(ring, CP_TYPE7_PKT | cnt |
- (_odd_parity_bit(cnt) << 15) |
- ((opcode & 0x7f) << 16) |
- ((_odd_parity_bit(opcode) << 23)));
-}
-
-/*
- * NOTE: OUT_RELOC*() is 2 dwords (64b) on a5xx+
- */
-
-static inline void
-__out_reloc(struct fd_ringbuffer *ring, struct fd_bo *bo,
- uint32_t offset, uint64_t or, int32_t shift, uint32_t flags)
-{
- debug_assert(offset < fd_bo_size(bo));
- fd_ringbuffer_reloc(ring, &(struct fd_reloc){
- .bo = bo,
- .flags = flags,
- .offset = offset,
- .or = or,
- .shift = shift,
- .orhi = or >> 32,
- });
-}
-
-static inline void
-OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
- uint32_t offset, uint64_t or, int32_t shift)
-{
- __out_reloc(ring, bo, offset, or, shift, FD_RELOC_READ);
-}
-
-static inline void
-OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo,
- uint32_t offset, uint64_t or, int32_t shift)
-{
- __out_reloc(ring, bo, offset, or, shift, FD_RELOC_READ | FD_RELOC_WRITE);
-}
-
-static inline void
-OUT_RELOCD(struct fd_ringbuffer *ring, struct fd_bo *bo,
- uint32_t offset, uint64_t or, int32_t shift)
-{
- __out_reloc(ring, bo, offset, or, shift, FD_RELOC_READ | FD_RELOC_DUMP);
-}
-
-static inline void
-OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
-{
- fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
-}
-
-/* for conditionally setting boolean flag(s): */
-#define COND(bool, val) ((bool) ? (val) : 0)
-
-#endif /* __MAIN_H__ */
diff --git a/lib/mesa/src/freedreno/computerator/meson.build b/lib/mesa/src/freedreno/computerator/meson.build
deleted file mode 100644
index 3203e48f3..000000000
--- a/lib/mesa/src/freedreno/computerator/meson.build
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright © 2020 Google, Inc
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-computerator_files = [
- 'a6xx.c',
- 'ir3_asm.c',
- 'main.c',
- freedreno_xml_header_files,
- ir3_parser,
-]
-
-computerator = executable(
- 'computerator',
- computerator_files,
- include_directories : [
- inc_freedreno,
- inc_include,
- inc_src,
- inc_mapi,
- inc_mesa,
- inc_gallium,
- inc_gallium_aux,
- ],
- link_with : [
- libfreedreno_drm,
- libfreedreno_ir3,
- libfreedreno_perfcntrs,
- ],
- dependencies : [
- dep_libdrm,
- idep_mesautil,
- # We don't actually use nir, but ir3 wants some nir headers:
- idep_nir,
- ],
- build_by_default : with_tools.contains('freedreno'),
- install : with_tools.contains('freedreno'),
-)
diff --git a/lib/mesa/src/freedreno/fdl/fd6_layout_test.c b/lib/mesa/src/freedreno/fdl/fd6_layout_test.c
deleted file mode 100644
index 378e6f10a..000000000
--- a/lib/mesa/src/freedreno/fdl/fd6_layout_test.c
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright © 2020 Google LLC
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "freedreno_layout.h"
-#include "adreno_common.xml.h"
-#include "a6xx.xml.h"
-
-#include <stdio.h>
-
-struct testcase {
- enum pipe_format format;
-
- int array_size; /* Size for array textures, or 0 otherwise. */
- bool is_3d;
-
- /* Partially filled layout of input parameters and expected results. */
- struct fdl_layout layout;
-};
-
-static const struct testcase testcases[] = {
- /* A straightforward first testcase, linear, with an obvious format. */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 32, .height0 = 32,
- .slices = {
- { .offset = 0, .pitch = 256 },
- { .offset = 8192, .pitch = 256 },
- { .offset = 12288, .pitch = 256 },
- { .offset = 14336, .pitch = 256 },
- { .offset = 15360, .pitch = 256 },
- { .offset = 15872, .pitch = 256 },
- },
- },
- },
-
- /* A tiled/ubwc layout from the blob driver, at a size where the a630 blob
- * driver does something interesting for linear.
- */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .tile_mode = TILE6_3,
- .ubwc = true,
- .width0 = 1024, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 4096 },
- { .offset = 65536, .pitch = 2048 },
- { .offset = 98304, .pitch = 1024 },
- { .offset = 114688, .pitch = 512 },
- { .offset = 122880, .pitch = 256 },
- { .offset = 126976, .pitch = 256 },
- { .offset = 131072, .pitch = 256 },
- { .offset = 135168, .pitch = 256 },
- { .offset = 139264, .pitch = 256 },
- { .offset = 143360, .pitch = 256 },
- { .offset = 147456, .pitch = 256 },
- },
- .ubwc_slices = {
- { .offset = 0, .pitch = 64 },
- { .offset = 4096, .pitch = 64 },
- { .offset = 8192, .pitch = 64 },
- { .offset = 12288, .pitch = 64 },
- { .offset = 16384, .pitch = 64 },
- { .offset = 20480, .pitch = 64 },
- { .offset = 24576, .pitch = 64 },
- { .offset = 28672, .pitch = 64 },
- { .offset = 32768, .pitch = 64 },
- { .offset = 36864, .pitch = 64 },
- { .offset = 40960, .pitch = 64 },
- },
- },
- },
-
- /* An interesting layout from the blob driver on a630, showing that
- * per-level pitch must be derived from level 0's pitch, not width0. We
- * don't do this level 0 pitch disalignment (we pick 4096), so disabled
- * this test for now.
- */
-#if 0
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 1024, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 5120 },
- { .offset = 5120, .pitch = 2560 },
- { .offset = 7680, .pitch = 1280 },
- { .offset = 8960, .pitch = 768 },
- { .offset = 9728, .pitch = 512 },
- { .offset = 10240, .pitch = 256 },
- { .offset = 10496, .pitch = 256 },
- { .offset = 10752, .pitch = 256 },
- { .offset = 11008, .pitch = 256 },
- { .offset = 11264, .pitch = 256 },
- { .offset = 11520, .pitch = 256 },
- },
- },
- },
-#endif
-
- /* A layout that we failed on (129 wide has a surprise level 1 pitch
- * increase), and the sizes bracketing it.
- */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 128, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 512 },
- { .offset = 512, .pitch = 256 },
- { .offset = 768, .pitch = 256 },
- { .offset = 1024, .pitch = 256 },
- { .offset = 1280, .pitch = 256 },
- { .offset = 1536, .pitch = 256 },
- { .offset = 1792, .pitch = 256 },
- { .offset = 2048, .pitch = 256 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 129, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 768 },
- { .offset = 768, .pitch = 512 },
- { .offset = 1280, .pitch = 256 },
- { .offset = 1536, .pitch = 256 },
- { .offset = 1792, .pitch = 256 },
- { .offset = 2048, .pitch = 256 },
- { .offset = 2304, .pitch = 256 },
- { .offset = 2560, .pitch = 256 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 130, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 768 },
- { .offset = 768, .pitch = 512 },
- { .offset = 1280, .pitch = 256 },
- { .offset = 1536, .pitch = 256 },
- { .offset = 1792, .pitch = 256 },
- { .offset = 2048, .pitch = 256 },
- { .offset = 2304, .pitch = 256 },
- { .offset = 2560, .pitch = 256 },
- },
- },
- },
-
- /* The 129 failure seems to be across formats, let's test some cpps */
- {
- .format = PIPE_FORMAT_R8_UNORM,
- .layout = {
- .width0 = 129, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 192 },
- { .offset = 192, .pitch = 128 },
- { .offset = 320, .pitch = 64 },
- { .offset = 384, .pitch = 64 },
- { .offset = 448, .pitch = 64 },
- { .offset = 512, .pitch = 64 },
- { .offset = 576, .pitch = 64 },
- { .offset = 640, .pitch = 64 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R16_UINT,
- .layout = {
- .width0 = 129, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 384 },
- { .offset = 384, .pitch = 256 },
- { .offset = 640, .pitch = 128 },
- { .offset = 768, .pitch = 128 },
- { .offset = 896, .pitch = 128 },
- { .offset = 1024, .pitch = 128 },
- { .offset = 1152, .pitch = 128 },
- { .offset = 1280, .pitch = 128 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
- .layout = {
- .width0 = 129, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 3072 },
- { .offset = 3072, .pitch = 2048 },
- { .offset = 5120, .pitch = 1024 },
- { .offset = 6144, .pitch = 1024 },
- { .offset = 7168, .pitch = 1024 },
- { .offset = 8192, .pitch = 1024 },
- { .offset = 9216, .pitch = 1024 },
- { .offset = 10240, .pitch = 1024 },
- },
- },
- },
-
- /* The 129 failure replicated at every +256 pixels wide. Pick one of
- * them, and this time increase the height as a new variable as well.
- */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 385, .height0 = 128,
- .slices = {
- { .offset = 0, .pitch = 1792 },
- { .offset = 229376, .pitch = 1024 },
- { .offset = 294912, .pitch = 512 },
- { .offset = 311296, .pitch = 256 },
- { .offset = 315392, .pitch = 256 },
- { .offset = 317440, .pitch = 256 },
- { .offset = 318464, .pitch = 256 },
- { .offset = 318976, .pitch = 256 },
- { .offset = 319232, .pitch = 256 },
- },
- },
- },
-
- /* At 257-259 (and replicated every +256 pixels) we had another failure. */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 257, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 1280 },
- { .offset = 1280, .pitch = 768 },
- { .offset = 2048, .pitch = 512 },
- { .offset = 2560, .pitch = 256 },
- { .offset = 2816, .pitch = 256 },
- { .offset = 3072, .pitch = 256 },
- { .offset = 3328, .pitch = 256 },
- { .offset = 3584, .pitch = 256 },
- { .offset = 3840, .pitch = 256 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 258, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 1280 },
- { .offset = 1280, .pitch = 768 },
- { .offset = 2048, .pitch = 512 },
- { .offset = 2560, .pitch = 256 },
- { .offset = 2816, .pitch = 256 },
- { .offset = 3072, .pitch = 256 },
- { .offset = 3328, .pitch = 256 },
- { .offset = 3584, .pitch = 256 },
- { .offset = 3840, .pitch = 256 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 259, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 1280 },
- { .offset = 1280, .pitch = 768 },
- { .offset = 2048, .pitch = 512 },
- { .offset = 2560, .pitch = 256 },
- { .offset = 2816, .pitch = 256 },
- { .offset = 3072, .pitch = 256 },
- { .offset = 3328, .pitch = 256 },
- { .offset = 3584, .pitch = 256 },
- { .offset = 3840, .pitch = 256 },
- },
- },
- },
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 260, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 1280 },
- { .offset = 1280, .pitch = 768 },
- { .offset = 2048, .pitch = 512 },
- { .offset = 2560, .pitch = 256 },
- { .offset = 2816, .pitch = 256 },
- { .offset = 3072, .pitch = 256 },
- { .offset = 3328, .pitch = 256 },
- { .offset = 3584, .pitch = 256 },
- { .offset = 3840, .pitch = 256 },
- },
- },
- },
-
- /* And, again for the 257-9 failure, test a replica with a larger size*/
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 513, .height0 = 32,
- .slices = {
- { .offset = 0, .pitch = 2304 },
- { .offset = 73728, .pitch = 1280 },
- { .offset = 94208, .pitch = 768 },
- { .offset = 100352, .pitch = 512 },
- { .offset = 102400, .pitch = 256 },
- { .offset = 102912, .pitch = 256 },
- { .offset = 103168, .pitch = 256 },
- { .offset = 103424, .pitch = 256 },
- { .offset = 103680, .pitch = 256 },
- { .offset = 103936, .pitch = 256 },
- },
- },
- },
-
- /* Oh, look. The 513-517 failure extends up to 518 at the next texture
- * level!
- */
- {
- .format = PIPE_FORMAT_R8G8B8A8_UNORM,
- .layout = {
- .width0 = 518, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 2304 },
- { .offset = 2304, .pitch = 1280 },
- { .offset = 3584, .pitch = 768 },
- { .offset = 4352, .pitch = 512 },
- { .offset = 4864, .pitch = 256 },
- { .offset = 5120, .pitch = 256 },
- { .offset = 5376, .pitch = 256 },
- { .offset = 5632, .pitch = 256 },
- { .offset = 5888, .pitch = 256 },
- { .offset = 6144, .pitch = 256 },
- },
- },
- },
-
- /* Tiled mode testing of the unusual 1/2-bytes-per-pixel pitch alignment */
- {
- .format = PIPE_FORMAT_R8_UNORM,
- .layout = {
- .tile_mode = TILE6_3,
- .width0 = 129, .height0 = 1,
- .slices = {
- { .offset = 0, .pitch = 256 },
- { .offset = 8192, .pitch = 128 },
- { .offset = 12288, .pitch = 128 },
- { .offset = 16384, .pitch = 128 },
- { .offset = 20480, .pitch = 64 },
- { .offset = 20544, .pitch = 64 },
- { .offset = 20608, .pitch = 64 },
- { .offset = 20672, .pitch = 64 },
- },
- },
- },
-
-};
-
-static bool test_layout(const struct testcase *testcase)
-{
- struct fdl_layout layout = {
- .ubwc = testcase->layout.ubwc,
- .tile_mode = testcase->layout.tile_mode,
- };
- bool ok = true;
-
- int max_size = MAX2(testcase->layout.width0, testcase->layout.height0);
- int mip_levels = 1;
- while (max_size > 1) {
- mip_levels++;
- max_size = u_minify(max_size, 1);
- }
-
- fdl6_layout(&layout,
- testcase->format,
- MAX2(testcase->layout.nr_samples, 1),
- testcase->layout.width0,
- MAX2(testcase->layout.height0, 1),
- MAX2(testcase->layout.depth0, 1),
- mip_levels,
- MAX2(testcase->array_size, 1),
- testcase->is_3d);
-
- /* fdl lays out UBWC data before the color data, while all we have
- * recorded in this testcase are the color offsets. Shift the fdl layout
- * down so we can compare color offsets.
- */
- if (layout.ubwc) {
- for (int l = 1; l < mip_levels; l++)
- layout.slices[l].offset -= layout.slices[0].offset;
- layout.slices[0].offset = 0;
- }
-
- for (int l = 0; l < mip_levels; l++) {
- if (layout.slices[l].offset != testcase->layout.slices[l].offset) {
- fprintf(stderr, "%s %dx%dx%d@%dx lvl%d: offset 0x%x != 0x%x\n",
- util_format_short_name(testcase->format),
- layout.width0, layout.height0, layout.depth0,
- layout.nr_samples, l,
- layout.slices[l].offset,
- testcase->layout.slices[l].offset);
- ok = false;
- }
- if (layout.slices[l].pitch != testcase->layout.slices[l].pitch) {
- fprintf(stderr, "%s %dx%dx%d@%dx lvl%d: pitch %d != %d\n",
- util_format_short_name(testcase->format),
- layout.width0, layout.height0, layout.depth0,
- layout.nr_samples, l,
- layout.slices[l].pitch,
- testcase->layout.slices[l].pitch);
- ok = false;
- }
-
- if (layout.ubwc_slices[l].offset != testcase->layout.ubwc_slices[l].offset) {
- fprintf(stderr, "%s %dx%dx%d@%dx lvl%d: offset 0x%x != 0x%x\n",
- util_format_short_name(testcase->format),
- layout.width0, layout.height0, layout.depth0,
- layout.nr_samples, l,
- layout.ubwc_slices[l].offset,
- testcase->layout.ubwc_slices[l].offset);
- ok = false;
- }
- if (layout.ubwc_slices[l].pitch != testcase->layout.ubwc_slices[l].pitch) {
- fprintf(stderr, "%s %dx%dx%d@%dx lvl%d: pitch %d != %d\n",
- util_format_short_name(testcase->format),
- layout.width0, layout.height0, layout.depth0,
- layout.nr_samples, l,
- layout.ubwc_slices[l].pitch,
- testcase->layout.ubwc_slices[l].pitch);
- ok = false;
- }
- }
-
- if (!ok)
- fprintf(stderr, "\n");
-
- return ok;
-}
-
-int
-main(int argc, char **argv)
-{
- int ret = 0;
-
- for (int i = 0; i < ARRAY_SIZE(testcases); i++) {
- if (!test_layout(&testcases[i]))
- ret = 1;
- }
-
- return ret;
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_cf.c b/lib/mesa/src/freedreno/ir3/ir3_cf.c
deleted file mode 100644
index db9b5de00..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_cf.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (C) 2019 Google.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "util/ralloc.h"
-
-#include "ir3.h"
-
-static bool
-is_fp16_conv(struct ir3_instruction *instr)
-{
- if (instr->opc != OPC_MOV)
- return false;
-
- struct ir3_register *dst = instr->regs[0];
- struct ir3_register *src = instr->regs[1];
-
- /* disallow conversions that cannot be folded into
- * alu instructions:
- */
- if (dst->flags & (IR3_REG_EVEN | IR3_REG_POS_INF))
- return false;
-
- if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
- return false;
- if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
- return false;
-
- if (instr->cat1.src_type == TYPE_F32 &&
- instr->cat1.dst_type == TYPE_F16)
- return true;
-
- if (instr->cat1.src_type == TYPE_F16 &&
- instr->cat1.dst_type == TYPE_F32)
- return true;
-
- return false;
-}
-
-static bool
-all_uses_fp16_conv(struct ir3_instruction *conv_src)
-{
- foreach_ssa_use (use, conv_src)
- if (!is_fp16_conv(use))
- return false;
- return true;
-}
-
-static void
-rewrite_uses(struct ir3_instruction *conv, struct ir3_instruction *replace)
-{
- foreach_ssa_use (use, conv) {
- struct ir3_instruction *src;
- foreach_ssa_src_n (src, n, use) {
- if (src == conv)
- use->regs[n]->instr = replace;
- }
- }
-}
-
-static void
-try_conversion_folding(struct ir3_instruction *conv)
-{
- struct ir3_instruction *src;
-
- if (!is_fp16_conv(conv))
- return;
-
- src = ssa(conv->regs[1]);
- if (!is_alu(src))
- return;
-
- /* avoid folding f2f32(f2f16) together, in cases where this is legal to
- * do (glsl) nir should have handled that for us already:
- */
- if (is_fp16_conv(src))
- return;
-
- switch (src->opc) {
- case OPC_SEL_B32:
- case OPC_SEL_B16:
- case OPC_MAX_F:
- case OPC_MIN_F:
- case OPC_SIGN_F:
- case OPC_ABSNEG_F:
- return;
- case OPC_MOV:
- /* if src is a "cov" and type doesn't match, then it can't be folded
- * for example cov.u32u16+cov.f16f32 can't be folded to cov.u32f32
- */
- if (src->cat1.dst_type != src->cat1.src_type &&
- conv->cat1.src_type != src->cat1.dst_type)
- return;
- default:
- break;
- }
-
- if (!all_uses_fp16_conv(src))
- return;
-
- if (src->opc == OPC_MOV) {
- if (src->cat1.dst_type == src->cat1.src_type) {
- /* If we're folding a conversion into a bitwise move, we need to
- * change the dst type to F32 to get the right behavior, since we
- * could be moving a float with a u32.u32 move.
- */
- src->cat1.dst_type = conv->cat1.dst_type;
- src->cat1.src_type = conv->cat1.src_type;
- } else {
- /* Otherwise, for typechanging movs, we can just change the dst
- * type to F16 to collaps the two conversions. For example
- * cov.s32f32 follwed by cov.f32f16 becomes cov.s32f16.
- */
- src->cat1.dst_type = conv->cat1.dst_type;
- }
- }
-
- if (conv->regs[0]->flags & IR3_REG_HALF) {
- src->regs[0]->flags |= IR3_REG_HALF;
- } else {
- src->regs[0]->flags &= ~IR3_REG_HALF;
- }
-
- rewrite_uses(conv, src);
-}
-
-void
-ir3_cf(struct ir3 *ir)
-{
- void *mem_ctx = ralloc_context(NULL);
-
- ir3_find_ssa_uses(ir, mem_ctx, false);
-
- foreach_block (block, &ir->block_list) {
- foreach_instr_safe (instr, &block->instr_list) {
- try_conversion_folding(instr);
- }
- }
-
- ralloc_free(mem_ctx);
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_delay.c b/lib/mesa/src/freedreno/ir3/ir3_delay.c
deleted file mode 100644
index 3fc4d911f..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_delay.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (C) 2019 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-#include "ir3.h"
-
-/*
- * Helpers to figure out the necessary delay slots between instructions. Used
- * both in scheduling pass(es) and the final pass to insert any required nop's
- * so that the shader program is valid.
- *
- * Note that this needs to work both pre and post RA, so we can't assume ssa
- * src iterators work.
- */
-
-/* generally don't count false dependencies, since this can just be
- * something like a barrier, or SSBO store. The exception is array
- * dependencies if the assigner is an array write and the consumer
- * reads the same array.
- */
-static bool
-ignore_dep(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n)
-{
- if (!__is_false_dep(consumer, n))
- return false;
-
- if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
- struct ir3_register *dst = assigner->regs[0];
- struct ir3_register *src;
-
- debug_assert(dst->flags & IR3_REG_ARRAY);
-
- foreach_src (src, consumer) {
- if ((src->flags & IR3_REG_ARRAY) &&
- (dst->array.id == src->array.id)) {
- return false;
- }
- }
- }
-
- return true;
-}
-
-/* calculate required # of delay slots between the instruction that
- * assigns a value and the one that consumes
- */
-int
-ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n, bool soft)
-{
- if (ignore_dep(assigner, consumer, n))
- return 0;
-
- /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
- * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
- * handled with sync bits
- */
-
- if (is_meta(assigner) || is_meta(consumer))
- return 0;
-
- if (writes_addr0(assigner) || writes_addr1(assigner))
- return 6;
-
- /* On a6xx, it takes the number of delay slots to get a SFU result
- * back (ie. using nop's instead of (ss) is:
- *
- * 8 - single warp
- * 9 - two warps
- * 10 - four warps
- *
- * and so on. Not quite sure where it tapers out (ie. how many
- * warps share an SFU unit). But 10 seems like a reasonable #
- * to choose:
- */
- if (soft && is_sfu(assigner))
- return 10;
-
- /* handled via sync flags: */
- if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
- return 0;
-
- /* assigner must be alu: */
- if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
- is_mem(consumer)) {
- return 6;
- } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
- (n == 3)) {
- /* special case, 3rd src to cat3 not required on first cycle */
- return 1;
- } else {
- return 3;
- }
-}
-
-static bool
-count_instruction(struct ir3_instruction *n)
-{
- /* NOTE: don't count branch/jump since we don't know yet if they will
- * be eliminated later in resolve_jumps().. really should do that
- * earlier so we don't have this constraint.
- */
- return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
-}
-
-/**
- * @block: the block to search in, starting from end; in first pass,
- * this will be the block the instruction would be inserted into
- * (but has not yet, ie. it only contains already scheduled
- * instructions). For intra-block scheduling (second pass), this
- * would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd: max distance, bail after searching this # of instruction
- * slots, since it means the instruction we are looking for is
- * far enough away
- * @pred: if true, recursively search into predecessor blocks to
- * find the worst case (shortest) distance (only possible after
- * individual blocks are all scheduled)
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
- unsigned maxd, bool pred)
-{
- unsigned d = 0;
-
- /* Note that this relies on incrementally building up the block's
- * instruction list.. but this is how scheduling and nopsched
- * work.
- */
- foreach_instr_rev (n, &block->instr_list) {
- if ((n == instr) || (d >= maxd))
- return MIN2(maxd, d + n->nop);
- if (count_instruction(n))
- d = MIN2(maxd, d + 1 + n->repeat + n->nop);
- }
-
- /* if coming from a predecessor block, assume it is assigned far
- * enough away.. we'll fix up later.
- */
- if (!pred)
- return maxd;
-
- if (pred && (block->data != block)) {
- /* Search into predecessor blocks, finding the one with the
- * shortest distance, since that will be the worst case
- */
- unsigned min = maxd - d;
-
- /* (ab)use block->data to prevent recursion: */
- block->data = block;
-
- set_foreach (block->predecessors, entry) {
- struct ir3_block *pred = (struct ir3_block *)entry->key;
- unsigned n;
-
- n = distance(pred, instr, min, pred);
-
- min = MIN2(min, n);
- }
-
- block->data = NULL;
- d += min;
- }
-
- return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
- struct ir3_instruction *assigner,
- struct ir3_instruction *consumer,
- unsigned srcn, bool soft, bool pred)
-{
- unsigned delay = 0;
-
- if (is_meta(assigner)) {
- struct ir3_register *src;
- foreach_src (src, assigner) {
- unsigned d;
-
- if (!src->instr)
- continue;
-
- d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
- delay = MAX2(delay, d);
- }
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn, soft);
- delay -= distance(block, assigner, delay, pred);
- }
-
- return delay;
-}
-
-static struct ir3_instruction *
-find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
-{
- unsigned d = 0;
-
- /* Note that this relies on incrementally building up the block's
- * instruction list.. but this is how scheduling and nopsched
- * work.
- */
- foreach_instr_rev (n, &block->instr_list) {
- if (d >= maxd)
- return NULL;
- if (count_instruction(n))
- d++;
- if (dest_regs(n) == 0)
- continue;
-
- /* note that a dest reg will never be an immediate */
- if (n->regs[0]->array.id == array_id)
- return n;
- }
-
- return NULL;
-}
-
-/* like list_length() but only counts instructions which count in the
- * delay determination:
- */
-static unsigned
-count_block_delay(struct ir3_block *block)
-{
- unsigned delay = 0;
- foreach_instr (n, &block->instr_list) {
- if (!count_instruction(n))
- continue;
- delay++;
- }
- return delay;
-}
-
-static unsigned
-delay_calc_array(struct ir3_block *block, unsigned array_id,
- struct ir3_instruction *consumer, unsigned srcn,
- bool soft, bool pred, unsigned maxd)
-{
- struct ir3_instruction *assigner;
-
- assigner = find_array_write(block, array_id, maxd);
- if (assigner)
- return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
-
- if (!pred)
- return 0;
-
- unsigned len = count_block_delay(block);
- if (maxd <= len)
- return 0;
-
- maxd -= len;
-
- if (block->data == block) {
- /* we have a loop, return worst case: */
- return maxd;
- }
-
- /* If we need to search into predecessors, find the one with the
- * max delay.. the resulting delay is that minus the number of
- * counted instructions in this block:
- */
- unsigned max = 0;
-
- /* (ab)use block->data to prevent recursion: */
- block->data = block;
-
- set_foreach (block->predecessors, entry) {
- struct ir3_block *pred = (struct ir3_block *)entry->key;
- unsigned delay =
- delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
-
- max = MAX2(max, delay);
- }
-
- block->data = NULL;
-
- if (max < len)
- return 0;
-
- return max - len;
-}
-
-/**
- * Calculate delay for instruction (maximum of delay for all srcs):
- *
- * @soft: If true, add additional delay for situations where they
- * would not be strictly required because a sync flag would be
- * used (but scheduler would prefer to schedule some other
- * instructions first to avoid stalling on sync flag)
- * @pred: If true, recurse into predecessor blocks
- */
-unsigned
-ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
- bool soft, bool pred)
-{
- unsigned delay = 0;
- struct ir3_register *src;
-
- foreach_src_n (src, i, instr) {
- unsigned d = 0;
-
- if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
- d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
- } else if (src->instr) {
- d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
- }
-
- delay = MAX2(delay, d);
- }
-
- if (instr->address) {
- unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
- delay = MAX2(delay, d);
- }
-
- return delay;
-}
-
-/**
- * Remove nop instructions. The scheduler can insert placeholder nop's
- * so that ir3_delay_calc() can account for nop's that won't be needed
- * due to nop's triggered by a previous instruction. However, before
- * legalize, we want to remove these. The legalize pass can insert
- * some nop's if needed to hold (for example) sync flags. This final
- * remaining nops are inserted by legalize after this.
- */
-void
-ir3_remove_nops(struct ir3 *ir)
-{
- foreach_block (block, &ir->block_list) {
- foreach_instr_safe (instr, &block->instr_list) {
- if (instr->opc == OPC_NOP) {
- list_del(&instr->node);
- }
- }
- }
-
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_dce.c b/lib/mesa/src/freedreno/ir3/ir3_depth.c
index 0bd8af537..f1f7b94b2 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_dce.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_depth.c
@@ -30,11 +30,109 @@
#include "ir3_shader.h"
/*
- * Dead code elimination:
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST). That is to say, an instructions depth is:
+ *
+ * depth(instr) {
+ * d = 0;
+ * // for each src register:
+ * foreach (src in instr->regs[1..n])
+ * d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ * return d + 1;
+ * }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * blocks depth sorted list, which is used by the scheduling pass.
+ */
+
+/* generally don't count false dependencies, since this can just be
+ * something like a barrier, or SSBO store. The exception is array
+ * dependencies if the assigner is an array write and the consumer
+ * reads the same array.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n)
+{
+ if (!__is_false_dep(consumer, n))
+ return false;
+
+ if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
+ struct ir3_register *dst = assigner->regs[0];
+ struct ir3_register *src;
+
+ debug_assert(dst->flags & IR3_REG_ARRAY);
+
+ foreach_src(src, consumer) {
+ if ((src->flags & IR3_REG_ARRAY) &&
+ (dst->array.id == src->array.id)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes
*/
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n)
+{
+ if (ignore_dep(assigner, consumer, n))
+ return 0;
+
+ /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+ * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+ * handled with sync bits
+ */
+
+ if (is_meta(assigner) || is_meta(consumer))
+ return 0;
+
+ if (writes_addr(assigner))
+ return 6;
+
+ /* handled via sync flags: */
+ if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+ return 0;
+
+ /* assigner must be alu: */
+ if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+ is_mem(consumer)) {
+ return 6;
+ } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
+ (n == 3)) {
+ /* special case, 3rd src to cat3 not required on first cycle */
+ return 1;
+ } else {
+ return 3;
+ }
+}
+
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
+{
+ /* remove from existing spot in list: */
+ list_delinit(&instr->node);
+
+ /* find where to re-insert instruction: */
+ foreach_instr (pos, list) {
+ if (pos->depth > instr->depth) {
+ list_add(&instr->node, &pos->node);
+ return;
+ }
+ }
+ /* if we get here, we didn't find an insertion spot: */
+ list_addtail(&instr->node, list);
+}
static void
-instr_dce(struct ir3_instruction *instr, bool falsedep)
+ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
{
struct ir3_instruction *src;
@@ -45,9 +143,28 @@ instr_dce(struct ir3_instruction *instr, bool falsedep)
if (ir3_instr_check_mark(instr))
return;
- foreach_ssa_src_n (src, i, instr) {
- instr_dce(src, __is_false_dep(instr, i));
+ instr->depth = 0;
+
+ foreach_ssa_src_n(src, i, instr) {
+ unsigned sd;
+
+ /* visit child to compute it's depth: */
+ ir3_instr_depth(src, boost, __is_false_dep(instr, i));
+
+ /* for array writes, no need to delay on previous write: */
+ if (i == 0)
+ continue;
+
+ sd = ir3_delayslots(src, instr, i) + src->depth;
+ sd += boost;
+
+ instr->depth = MAX2(instr->depth, sd);
}
+
+ if (!is_meta(instr))
+ instr->depth++;
+
+ ir3_insert_by_depth(instr, &instr->block->instr_list);
}
static bool
@@ -63,7 +180,7 @@ remove_unused_by_block(struct ir3_block *block)
/* tex (cat5) instructions have a writemask, so we can
* mask off unused components. Other instructions do not.
*/
- if (src && is_tex_or_prefetch(src) && (src->regs[0]->wrmask > 1)) {
+ if (is_tex(src) && (src->regs[0]->wrmask > 1)) {
src->regs[0]->wrmask &= ~(1 << instr->split.off);
/* prune no-longer needed right-neighbors. We could
@@ -82,13 +199,6 @@ remove_unused_by_block(struct ir3_block *block)
}
}
}
-
- /* prune false-deps, etc: */
- foreach_ssa_use (use, instr)
- foreach_ssa_srcp_n (srcp, n, use)
- if (*srcp == instr)
- *srcp = NULL;
-
list_delinit(&instr->node);
progress = true;
}
@@ -97,7 +207,7 @@ remove_unused_by_block(struct ir3_block *block)
}
static bool
-find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
+compute_depth_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
{
unsigned i;
bool progress = false;
@@ -121,57 +231,35 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
}
struct ir3_instruction *out;
- foreach_output (out, ir)
- instr_dce(out, false);
+ foreach_output(out, ir)
+ ir3_instr_depth(out, 0, false);
foreach_block (block, &ir->block_list) {
for (i = 0; i < block->keeps_count; i++)
- instr_dce(block->keeps[i], false);
+ ir3_instr_depth(block->keeps[i], 0, false);
/* We also need to account for if-condition: */
if (block->condition)
- instr_dce(block->condition, false);
+ ir3_instr_depth(block->condition, 6, false);
}
- /* remove un-used instructions: */
+ /* mark un-used instructions: */
foreach_block (block, &ir->block_list) {
progress |= remove_unused_by_block(block);
}
- /* fixup wrmask of split instructions to account for adjusted tex
- * wrmask's:
- */
- foreach_block (block, &ir->block_list) {
- foreach_instr (instr, &block->instr_list) {
- if (instr->opc != OPC_META_SPLIT)
- continue;
-
- struct ir3_instruction *src = ssa(instr->regs[1]);
- if (!is_tex_or_prefetch(src))
- continue;
-
- instr->regs[1]->wrmask = src->regs[0]->wrmask;
- }
- }
-
/* note that we can end up with unused indirects, but we should
* not end up with unused predicates.
*/
- for (i = 0; i < ir->a0_users_count; i++) {
- struct ir3_instruction *instr = ir->a0_users[i];
- if (instr && (instr->flags & IR3_INSTR_UNUSED))
- ir->a0_users[i] = NULL;
- }
-
- for (i = 0; i < ir->a1_users_count; i++) {
- struct ir3_instruction *instr = ir->a1_users[i];
+ for (i = 0; i < ir->indirects_count; i++) {
+ struct ir3_instruction *instr = ir->indirects[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
- ir->a1_users[i] = NULL;
+ ir->indirects[i] = NULL;
}
/* cleanup unused inputs: */
struct ir3_instruction *in;
- foreach_input_n (in, n, ir)
+ foreach_input_n(in, n, ir)
if (in->flags & IR3_INSTR_UNUSED)
ir->inputs[n] = NULL;
@@ -179,16 +267,10 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
}
void
-ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so)
+ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so)
{
- void *mem_ctx = ralloc_context(NULL);
bool progress;
-
- ir3_find_ssa_uses(ir, mem_ctx, true);
-
do {
- progress = find_and_remove_unused(ir, so);
+ progress = compute_depth_and_remove_unused(ir, so);
} while (progress);
-
- ralloc_free(mem_ctx);
}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_lexer.l b/lib/mesa/src/freedreno/ir3/ir3_lexer.l
deleted file mode 100644
index 823cb1587..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_lexer.l
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-%{
-#include <stdlib.h>
-#include "ir3/ir3.h"
-#include "ir3_parser.h"
-
-#define YY_NO_INPUT
-#define YY_NO_UNPUT
-#define TOKEN(t) (ir3_yylval.tok = t)
-extern YYSTYPE ir3_yylval;
-
-static int parse_wrmask(const char *src)
-{
- int i, num = 0;
- for (i = 0; i < 4; i++) {
- if ("xyzw"[i] == src[1]) {
- num |= (1 << i);
- src++;
- }
- }
- return num;
-}
-
-static int parse_reg(const char *str)
-{
- int num = 0;
- if (str[0] == 'h') {
- str++;
- num++;
- }
- str++;
- num += strtol(str, (char **)&str, 10) << 3;
- switch (str[1]) {
- case 'x': num += 0; break;
- case 'y': num += 2; break;
- case 'z': num += 4; break;
- case 'w': num += 6; break;
- default: assert(0); break;
- }
- return num;
-}
-%}
-
-%option noyywrap
-%option prefix="ir3_yy"
-
-%%
-"\n" yylineno++;
-[ \t] ; /* ignore whitespace */
-";"[^\n]*"\n" yylineno++; /* ignore comments */
-[0-9]+"."[0-9]+ ir3_yylval.flt = strtod(yytext, NULL); return T_FLOAT;
-[0-9]* ir3_yylval.num = strtoul(yytext, NULL, 0); return T_INT;
-"0x"[0-9a-fA-F]* ir3_yylval.num = strtoul(yytext, NULL, 0); return T_HEX;
-"@localsize" return TOKEN(T_A_LOCALSIZE);
-"@const" return TOKEN(T_A_CONST);
-"@buf" return TOKEN(T_A_BUF);
-"@invocationid" return TOKEN(T_A_INVOCATIONID);
-"@wgid" return TOKEN(T_A_WGID);
-"@numwg" return TOKEN(T_A_NUMWG);
-"@in" return TOKEN(T_A_IN);
-"@out" return TOKEN(T_A_OUT);
-"@tex" return TOKEN(T_A_TEX);
-"(sy)" return TOKEN(T_SY);
-"(ss)" return TOKEN(T_SS);
-"(absneg)" return TOKEN(T_ABSNEG);
-"(neg)" return TOKEN(T_NEG);
-"(abs)" return TOKEN(T_ABS);
-"(r)" return TOKEN(T_R);
-"(ul)" return TOKEN(T_UL);
-"(even)" return TOKEN(T_EVEN);
-"(pos_infinity)" return TOKEN(T_POS_INFINITY);
-"(ei)" return TOKEN(T_EI);
-"(jp)" return TOKEN(T_JP);
-"(rpt"[0-7]")" ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_RPT;
-"(nop"[0-7]")" ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_NOP;
-"("[x]?[y]?[z]?[w]?")" ir3_yylval.num = parse_wrmask(yytext); return T_WRMASK;
-
-[h]?"r"[0-9]+"."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_REGISTER;
-[h]?"c"[0-9]+"."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_CONSTANT;
-"a0."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_A0;
-"p0."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_P0;
-"s#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_SAMP;
-"t#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_TEX;
-
- /* category 0: */
-"nop" return TOKEN(T_OP_NOP);
-"br" return TOKEN(T_OP_BR);
-"jump" return TOKEN(T_OP_JUMP);
-"call" return TOKEN(T_OP_CALL);
-"ret" return TOKEN(T_OP_RET);
-"kill" return TOKEN(T_OP_KILL);
-"end" return TOKEN(T_OP_END);
-"emit" return TOKEN(T_OP_EMIT);
-"cut" return TOKEN(T_OP_CUT);
-"chmask" return TOKEN(T_OP_CHMASK);
-"chsh" return TOKEN(T_OP_CHSH);
-"flow_rev" return TOKEN(T_OP_FLOW_REV);
-
- /* category 1: */
-"mova" return TOKEN(T_OP_MOVA);
-"mov" return TOKEN(T_OP_MOV);
-"cov" return TOKEN(T_OP_COV);
-
-("f16"|"f32"|"u16"|"u32"|"s16"|"s32"|"u8"|"s8"){2} ir3_yylval.str = yytext; return T_CAT1_TYPE_TYPE;
-
- /* category 2: */
-"add.f" return TOKEN(T_OP_ADD_F);
-"min.f" return TOKEN(T_OP_MIN_F);
-"max.f" return TOKEN(T_OP_MAX_F);
-"mul.f" return TOKEN(T_OP_MUL_F);
-"sign.f" return TOKEN(T_OP_SIGN_F);
-"cmps.f" return TOKEN(T_OP_CMPS_F);
-"absneg.f" return TOKEN(T_OP_ABSNEG_F);
-"cmpv.f" return TOKEN(T_OP_CMPV_F);
-"floor.f" return TOKEN(T_OP_FLOOR_F);
-"ceil.f" return TOKEN(T_OP_CEIL_F);
-"rndne.f" return TOKEN(T_OP_RNDNE_F);
-"rndaz.f" return TOKEN(T_OP_RNDAZ_F);
-"trunc.f" return TOKEN(T_OP_TRUNC_F);
-"add.u" return TOKEN(T_OP_ADD_U);
-"add.s" return TOKEN(T_OP_ADD_S);
-"sub.u" return TOKEN(T_OP_SUB_U);
-"sub.s" return TOKEN(T_OP_SUB_S);
-"cmps.u" return TOKEN(T_OP_CMPS_U);
-"cmps.s" return TOKEN(T_OP_CMPS_S);
-"min.u" return TOKEN(T_OP_MIN_U);
-"min.s" return TOKEN(T_OP_MIN_S);
-"max.u" return TOKEN(T_OP_MAX_U);
-"max.s" return TOKEN(T_OP_MAX_S);
-"absneg.s" return TOKEN(T_OP_ABSNEG_S);
-"and.b" return TOKEN(T_OP_AND_B);
-"or.b" return TOKEN(T_OP_OR_B);
-"not.b" return TOKEN(T_OP_NOT_B);
-"xor.b" return TOKEN(T_OP_XOR_B);
-"cmpv.u" return TOKEN(T_OP_CMPV_U);
-"cmpv.s" return TOKEN(T_OP_CMPV_S);
-"mul.u24" return TOKEN(T_OP_MUL_U24);
-"mul.s24" return TOKEN(T_OP_MUL_S24);
-"mull.u" return TOKEN(T_OP_MULL_U);
-"bfrev.b" return TOKEN(T_OP_BFREV_B);
-"clz.s" return TOKEN(T_OP_CLZ_S);
-"clz.b" return TOKEN(T_OP_CLZ_B);
-"shl.b" return TOKEN(T_OP_SHL_B);
-"shr.b" return TOKEN(T_OP_SHR_B);
-"ashr.b" return TOKEN(T_OP_ASHR_B);
-"bary.f" return TOKEN(T_OP_BARY_F);
-"mgen.b" return TOKEN(T_OP_MGEN_B);
-"getbit.b" return TOKEN(T_OP_GETBIT_B);
-"setrm" return TOKEN(T_OP_SETRM);
-"cbits.b" return TOKEN(T_OP_CBITS_B);
-"shb" return TOKEN(T_OP_SHB);
-"msad" return TOKEN(T_OP_MSAD);
-
- /* category 3: */
-"mad.u16" return TOKEN(T_OP_MAD_U16);
-"madsh.u16" return TOKEN(T_OP_MADSH_U16);
-"mad.s16" return TOKEN(T_OP_MAD_S16);
-"madsh.m16" return TOKEN(T_OP_MADSH_M16);
-"mad.u24" return TOKEN(T_OP_MAD_U24);
-"mad.s24" return TOKEN(T_OP_MAD_S24);
-"mad.f16" return TOKEN(T_OP_MAD_F16);
-"mad.f32" return TOKEN(T_OP_MAD_F32);
-"sel.b16" return TOKEN(T_OP_SEL_B16);
-"sel.b32" return TOKEN(T_OP_SEL_B32);
-"sel.s16" return TOKEN(T_OP_SEL_S16);
-"sel.s32" return TOKEN(T_OP_SEL_S32);
-"sel.f16" return TOKEN(T_OP_SEL_F16);
-"sel.f32" return TOKEN(T_OP_SEL_F32);
-"sad.s16" return TOKEN(T_OP_SAD_S16);
-"sad.s32" return TOKEN(T_OP_SAD_S32);
-
- /* category 4: */
-"rcp" return TOKEN(T_OP_RCP);
-"rsq" return TOKEN(T_OP_RSQ);
-"log2" return TOKEN(T_OP_LOG2);
-"exp2" return TOKEN(T_OP_EXP2);
-"sin" return TOKEN(T_OP_SIN);
-"cos" return TOKEN(T_OP_COS);
-"sqrt" return TOKEN(T_OP_SQRT);
-"hrsq" return TOKEN(T_OP_HRSQ);
-"hlog2" return TOKEN(T_OP_HLOG2);
-"hexp2" return TOKEN(T_OP_HEXP2);
-
- /* category 5: */
-"isam" return TOKEN(T_OP_ISAM);
-"isaml" return TOKEN(T_OP_ISAML);
-"isamm" return TOKEN(T_OP_ISAMM);
-"sam" return TOKEN(T_OP_SAM);
-"samb" return TOKEN(T_OP_SAMB);
-"saml" return TOKEN(T_OP_SAML);
-"samgq" return TOKEN(T_OP_SAMGQ);
-"getlod" return TOKEN(T_OP_GETLOD);
-"conv" return TOKEN(T_OP_CONV);
-"convm" return TOKEN(T_OP_CONVM);
-"getsize" return TOKEN(T_OP_GETSIZE);
-"getbuf" return TOKEN(T_OP_GETBUF);
-"getpos" return TOKEN(T_OP_GETPOS);
-"getinfo" return TOKEN(T_OP_GETINFO);
-"dsx" return TOKEN(T_OP_DSX);
-"dsy" return TOKEN(T_OP_DSY);
-"gather4r" return TOKEN(T_OP_GATHER4R);
-"gather4g" return TOKEN(T_OP_GATHER4G);
-"gather4b" return TOKEN(T_OP_GATHER4B);
-"gather4a" return TOKEN(T_OP_GATHER4A);
-"samgp0" return TOKEN(T_OP_SAMGP0);
-"samgp1" return TOKEN(T_OP_SAMGP1);
-"samgp2" return TOKEN(T_OP_SAMGP2);
-"samgp3" return TOKEN(T_OP_SAMGP3);
-"dsxpp.1" return TOKEN(T_OP_DSXPP_1);
-"dsypp.1" return TOKEN(T_OP_DSYPP_1);
-"rgetpos" return TOKEN(T_OP_RGETPOS);
-"rgetinfo" return TOKEN(T_OP_RGETINFO);
-
- /* category 6: */
-"ldg" return TOKEN(T_OP_LDG);
-"ldl" return TOKEN(T_OP_LDL);
-"ldp" return TOKEN(T_OP_LDP);
-"stg" return TOKEN(T_OP_STG);
-"stl" return TOKEN(T_OP_STL);
-"stp" return TOKEN(T_OP_STP);
-"ldib" return TOKEN(T_OP_LDIB);
-"g2l" return TOKEN(T_OP_G2L);
-"l2g" return TOKEN(T_OP_L2G);
-"prefetch" return TOKEN(T_OP_PREFETCH);
-"ldlw" return TOKEN(T_OP_LDLW);
-"stlw" return TOKEN(T_OP_STLW);
-"resfmt" return TOKEN(T_OP_RESFMT);
-"resinf" return TOKEN(T_OP_RESINF);
-"atomic.add" return TOKEN(T_OP_ATOMIC_ADD);
-"atomic.sub" return TOKEN(T_OP_ATOMIC_SUB);
-"atomic.xchg" return TOKEN(T_OP_ATOMIC_XCHG);
-"atomic.inc" return TOKEN(T_OP_ATOMIC_INC);
-"atomic.dec" return TOKEN(T_OP_ATOMIC_DEC);
-"atomic.cmpxchg" return TOKEN(T_OP_ATOMIC_CMPXCHG);
-"atomic.min" return TOKEN(T_OP_ATOMIC_MIN);
-"atomic.max" return TOKEN(T_OP_ATOMIC_MAX);
-"atomic.and" return TOKEN(T_OP_ATOMIC_AND);
-"atomic.or" return TOKEN(T_OP_ATOMIC_OR);
-"atomic.xor" return TOKEN(T_OP_ATOMIC_XOR);
-"ldgb" return TOKEN(T_OP_LDGB);
-"stgb" return TOKEN(T_OP_STGB);
-"stib" return TOKEN(T_OP_STIB);
-"ldc" return TOKEN(T_OP_LDC);
-"ldlv" return TOKEN(T_OP_LDLV);
-
-"f16" return TOKEN(T_TYPE_F16);
-"f32" return TOKEN(T_TYPE_F32);
-"u16" return TOKEN(T_TYPE_U16);
-"u32" return TOKEN(T_TYPE_U32);
-"s16" return TOKEN(T_TYPE_S16);
-"s32" return TOKEN(T_TYPE_S32);
-"u8" return TOKEN(T_TYPE_U8);
-"s8" return TOKEN(T_TYPE_S8);
-
-"untyped" return TOKEN(T_UNTYPED);
-"typed" return TOKEN(T_TYPED);
-
-"1d" return TOKEN(T_1D);
-"2d" return TOKEN(T_2D);
-"3d" return TOKEN(T_3D);
-"4d" return TOKEN(T_4D);
-
-"lt" return TOKEN(T_LT);
-"le" return TOKEN(T_LE);
-"gt" return TOKEN(T_GT);
-"ge" return TOKEN(T_GE);
-"eq" return TOKEN(T_EQ);
-"ne" return TOKEN(T_NE);
-
-"a" return 'a';
-"o" return 'o';
-"p" return 'p';
-"s2en" return TOKEN(T_S2EN);
-"s" return 's';
-"base"[0-9]+ ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_BASE;
-
-"=" return '=';
-"(" return '(';
-")" return ')';
-"[" return '[';
-"]" return ']';
-"," return ',';
-"." return '.';
-"-" return '-';
-"+" return '+';
-"|" return '|';
-"c" return 'c';
-"r" return 'r';
-"g" return 'g';
-"l" return 'l';
-"<" return '<';
-">" return '>';
-"!" return '!';
-"#" return '#';
-
-"nan" return TOKEN(T_NAN);
-"inf" return TOKEN(T_INF);
-
-[a-zA-Z_][a-zA-Z_0-9]* ir3_yylval.str = yytext; return T_IDENTIFIER;
-. fprintf(stderr, "error at line %d: Unknown token: %s\n", ir3_yyget_lineno(), yytext); yyterminate();
-%%
diff --git a/lib/mesa/src/freedreno/ir3/ir3_parser.y b/lib/mesa/src/freedreno/ir3/ir3_parser.y
deleted file mode 100644
index c9cede4b7..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_parser.y
+++ /dev/null
@@ -1,905 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-%code requires {
-
-#define MAX_BUFS 4
-
-struct ir3_kernel_info {
- uint32_t local_size[3];
- uint32_t num_bufs;
- uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
-
- /* driver-param uniforms: */
- unsigned numwg;
-};
-
-struct ir3 * ir3_parse(struct ir3_shader_variant *v,
- struct ir3_kernel_info *k, FILE *f);
-}
-
-%{
-#define YYDEBUG 0
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "util/u_math.h"
-
-#include "ir3/ir3.h"
-#include "ir3/ir3_shader.h"
-#include "ir3/instr-a3xx.h"
-
-#include "ir3_parser.h"
-
-/* ir3 treats the abs/neg flags as separate flags for float vs integer,
- * but in the instruction encoding they are the same thing. Tracking
- * them separately is only for the benefit of ir3 opt passes, and not
- * required here, so just use the float versions:
- */
-#define IR3_REG_ABS IR3_REG_FABS
-#define IR3_REG_NEGATE IR3_REG_FNEG
-
-static struct ir3_kernel_info *info;
-static struct ir3_shader_variant *variant;
-/* NOTE the assembler doesn't really use the ir3_block construction
- * like the compiler does. Everything is treated as one large block.
- * Which might happen to contain flow control. But since we don't
- * use any of the ir3 backend passes (sched, RA, etc) this doesn't
- * really matter.
- */
-static struct ir3_block *block; /* current shader block */
-static struct ir3_instruction *instr; /* current instruction */
-
-static struct {
- unsigned flags;
- unsigned repeat;
- unsigned nop;
-} iflags;
-
-static struct {
- unsigned flags;
- unsigned wrmask;
-} rflags;
-
-int ir3_yyget_lineno(void);
-
-static struct ir3_instruction * new_instr(opc_t opc)
-{
- instr = ir3_instr_create(block, opc);
- instr->flags = iflags.flags;
- instr->repeat = iflags.repeat;
- instr->nop = iflags.nop;
- instr->line = ir3_yyget_lineno();
- iflags.flags = iflags.repeat = iflags.nop = 0;
- return instr;
-}
-
-static void new_shader(void)
-{
- variant->ir = ir3_create(variant->shader->compiler, variant->shader->type);
- block = ir3_block_create(variant->ir);
- list_addtail(&block->node, &variant->ir->block_list);
-}
-
-static type_t parse_type(const char **type)
-{
- if (!strncmp("f16", *type, 3)) {
- *type += 3;
- return TYPE_F16;
- } else if (!strncmp("f32", *type, 3)) {
- *type += 3;
- return TYPE_F32;
- } else if (!strncmp("u16", *type, 3)) {
- *type += 3;
- return TYPE_U16;
- } else if (!strncmp("u32", *type, 3)) {
- *type += 3;
- return TYPE_U32;
- } else if (!strncmp("s16", *type, 3)) {
- *type += 3;
- return TYPE_S16;
- } else if (!strncmp("s32", *type, 3)) {
- *type += 3;
- return TYPE_S32;
- } else if (!strncmp("u8", *type, 2)) {
- *type += 2;
- return TYPE_U8;
- } else if (!strncmp("s8", *type, 2)) {
- *type += 2;
- return TYPE_S8;
- } else {
- assert(0); /* shouldn't get here */
- return ~0;
- }
-}
-
-static struct ir3_instruction * parse_type_type(struct ir3_instruction *instr,
- const char *type_type)
-{
- instr->cat1.src_type = parse_type(&type_type);
- instr->cat1.dst_type = parse_type(&type_type);
- return instr;
-}
-
-static struct ir3_register * new_reg(int num, unsigned flags)
-{
- struct ir3_register *reg;
- flags |= rflags.flags;
- if (num & 0x1)
- flags |= IR3_REG_HALF;
- reg = ir3_reg_create(instr, num>>1, flags);
- reg->wrmask = MAX2(1, rflags.wrmask);
- rflags.flags = rflags.wrmask = 0;
- return reg;
-}
-
-static struct ir3_register * dummy_dst(void)
-{
- return new_reg(0, 0);
-}
-
-static void add_const(unsigned reg, unsigned c0, unsigned c1, unsigned c2, unsigned c3)
-{
- struct ir3_const_state *const_state = &variant->shader->const_state;
- assert((reg & 0x7) == 0);
- int idx = reg >> (1 + 2); /* low bit is half vs full, next two bits are swiz */
- if (const_state->immediate_idx == const_state->immediates_size * 4) {
- const_state->immediates_size += 4;
- const_state->immediates = realloc (const_state->immediates,
- const_state->immediates_size * sizeof(const_state->immediates[0]));
- }
- const_state->immediates[idx].val[0] = c0;
- const_state->immediates[idx].val[1] = c1;
- const_state->immediates[idx].val[2] = c2;
- const_state->immediates[idx].val[3] = c3;
- const_state->immediates_count = idx + 1;
- const_state->immediate_idx++;
-}
-
-static void add_sysval(unsigned reg, unsigned compmask, gl_system_value sysval)
-{
- unsigned n = variant->inputs_count++;
- variant->inputs[n].regid = reg;
- variant->inputs[n].sysval = true;
- variant->inputs[n].slot = sysval;
- variant->inputs[n].compmask = compmask;
- variant->inputs[n].interpolate = INTERP_MODE_FLAT;
- variant->total_in++;
-}
-
-#ifdef YYDEBUG
-int yydebug;
-#endif
-
-extern int yylex(void);
-extern FILE *ir3_yyin;
-void ir3_yyset_lineno(int _line_number);
-
-int yyparse(void);
-
-static void yyerror(const char *error)
-{
- fprintf(stderr, "error at line %d: %s\n", ir3_yyget_lineno(), error);
-}
-
-struct ir3 * ir3_parse(struct ir3_shader_variant *v,
- struct ir3_kernel_info *k, FILE *f)
-{
- ir3_yyset_lineno(1);
- ir3_yyin = f;
-#ifdef YYDEBUG
- yydebug = 1;
-#endif
- info = k;
- variant = v;
- if (yyparse()) {
- ir3_destroy(variant->ir);
- variant->ir = NULL;
- }
- return variant->ir;
-}
-%}
-
-%union {
- int tok;
- int num;
- uint32_t unum;
- double flt;
- const char *str;
- struct ir3_register *reg;
- struct {
- int start;
- int num;
- } range;
- type_t type;
-}
-
-%{
-#if YYDEBUG
-static void print_token(FILE *file, int type, YYSTYPE value)
-{
- fprintf(file, "\ntype: %d\n", type);
-}
-
-#define YYPRINT(file, type, value) print_token(file, type, value)
-#endif
-%}
-
-%token <num> T_INT
-%token <unum> T_HEX
-%token <flt> T_FLOAT
-%token <str> T_IDENTIFIER
-%token <num> T_REGISTER
-%token <num> T_CONSTANT
-
-/* @ headers (@const/@sampler/@uniform/@varying) */
-%token <tok> T_A_LOCALSIZE
-%token <tok> T_A_CONST
-%token <tok> T_A_BUF
-%token <tok> T_A_INVOCATIONID
-%token <tok> T_A_WGID
-%token <tok> T_A_NUMWG
-%token <tok> T_A_IN
-%token <tok> T_A_OUT
-%token <tok> T_A_TEX
-/* todo, re-add @sampler/@uniform/@varying if needed someday */
-
-/* src register flags */
-%token <tok> T_ABSNEG
-%token <tok> T_NEG
-%token <tok> T_ABS
-%token <tok> T_R
-
-/* dst register flags */
-%token <tok> T_EVEN
-%token <tok> T_POS_INFINITY
-%token <tok> T_EI
-%token <num> T_WRMASK
-
-/* instruction flags */
-%token <tok> T_SY
-%token <tok> T_SS
-%token <tok> T_JP
-%token <num> T_RPT
-%token <tok> T_UL
-%token <tok> T_NOP
-
-/* category 0: */
-%token <tok> T_OP_NOP
-%token <tok> T_OP_BR
-%token <tok> T_OP_JUMP
-%token <tok> T_OP_CALL
-%token <tok> T_OP_RET
-%token <tok> T_OP_KILL
-%token <tok> T_OP_END
-%token <tok> T_OP_EMIT
-%token <tok> T_OP_CUT
-%token <tok> T_OP_CHMASK
-%token <tok> T_OP_CHSH
-%token <tok> T_OP_FLOW_REV
-
-/* category 1: */
-%token <tok> T_OP_MOVA
-%token <tok> T_OP_MOV
-%token <tok> T_OP_COV
-
-/* category 2: */
-%token <tok> T_OP_ADD_F
-%token <tok> T_OP_MIN_F
-%token <tok> T_OP_MAX_F
-%token <tok> T_OP_MUL_F
-%token <tok> T_OP_SIGN_F
-%token <tok> T_OP_CMPS_F
-%token <tok> T_OP_ABSNEG_F
-%token <tok> T_OP_CMPV_F
-%token <tok> T_OP_FLOOR_F
-%token <tok> T_OP_CEIL_F
-%token <tok> T_OP_RNDNE_F
-%token <tok> T_OP_RNDAZ_F
-%token <tok> T_OP_TRUNC_F
-%token <tok> T_OP_ADD_U
-%token <tok> T_OP_ADD_S
-%token <tok> T_OP_SUB_U
-%token <tok> T_OP_SUB_S
-%token <tok> T_OP_CMPS_U
-%token <tok> T_OP_CMPS_S
-%token <tok> T_OP_MIN_U
-%token <tok> T_OP_MIN_S
-%token <tok> T_OP_MAX_U
-%token <tok> T_OP_MAX_S
-%token <tok> T_OP_ABSNEG_S
-%token <tok> T_OP_AND_B
-%token <tok> T_OP_OR_B
-%token <tok> T_OP_NOT_B
-%token <tok> T_OP_XOR_B
-%token <tok> T_OP_CMPV_U
-%token <tok> T_OP_CMPV_S
-%token <tok> T_OP_MUL_U24
-%token <tok> T_OP_MUL_S24
-%token <tok> T_OP_MULL_U
-%token <tok> T_OP_BFREV_B
-%token <tok> T_OP_CLZ_S
-%token <tok> T_OP_CLZ_B
-%token <tok> T_OP_SHL_B
-%token <tok> T_OP_SHR_B
-%token <tok> T_OP_ASHR_B
-%token <tok> T_OP_BARY_F
-%token <tok> T_OP_MGEN_B
-%token <tok> T_OP_GETBIT_B
-%token <tok> T_OP_SETRM
-%token <tok> T_OP_CBITS_B
-%token <tok> T_OP_SHB
-%token <tok> T_OP_MSAD
-
-/* category 3: */
-%token <tok> T_OP_MAD_U16
-%token <tok> T_OP_MADSH_U16
-%token <tok> T_OP_MAD_S16
-%token <tok> T_OP_MADSH_M16
-%token <tok> T_OP_MAD_U24
-%token <tok> T_OP_MAD_S24
-%token <tok> T_OP_MAD_F16
-%token <tok> T_OP_MAD_F32
-%token <tok> T_OP_SEL_B16
-%token <tok> T_OP_SEL_B32
-%token <tok> T_OP_SEL_S16
-%token <tok> T_OP_SEL_S32
-%token <tok> T_OP_SEL_F16
-%token <tok> T_OP_SEL_F32
-%token <tok> T_OP_SAD_S16
-%token <tok> T_OP_SAD_S32
-
-/* category 4: */
-%token <tok> T_OP_RCP
-%token <tok> T_OP_RSQ
-%token <tok> T_OP_LOG2
-%token <tok> T_OP_EXP2
-%token <tok> T_OP_SIN
-%token <tok> T_OP_COS
-%token <tok> T_OP_SQRT
-%token <tok> T_OP_HRSQ
-%token <tok> T_OP_HLOG2
-%token <tok> T_OP_HEXP2
-
-/* category 5: */
-%token <tok> T_OP_ISAM
-%token <tok> T_OP_ISAML
-%token <tok> T_OP_ISAMM
-%token <tok> T_OP_SAM
-%token <tok> T_OP_SAMB
-%token <tok> T_OP_SAML
-%token <tok> T_OP_SAMGQ
-%token <tok> T_OP_GETLOD
-%token <tok> T_OP_CONV
-%token <tok> T_OP_CONVM
-%token <tok> T_OP_GETSIZE
-%token <tok> T_OP_GETBUF
-%token <tok> T_OP_GETPOS
-%token <tok> T_OP_GETINFO
-%token <tok> T_OP_DSX
-%token <tok> T_OP_DSY
-%token <tok> T_OP_GATHER4R
-%token <tok> T_OP_GATHER4G
-%token <tok> T_OP_GATHER4B
-%token <tok> T_OP_GATHER4A
-%token <tok> T_OP_SAMGP0
-%token <tok> T_OP_SAMGP1
-%token <tok> T_OP_SAMGP2
-%token <tok> T_OP_SAMGP3
-%token <tok> T_OP_DSXPP_1
-%token <tok> T_OP_DSYPP_1
-%token <tok> T_OP_RGETPOS
-%token <tok> T_OP_RGETINFO
-
-/* category 6: */
-%token <tok> T_OP_LDG
-%token <tok> T_OP_LDL
-%token <tok> T_OP_LDP
-%token <tok> T_OP_STG
-%token <tok> T_OP_STL
-%token <tok> T_OP_STP
-%token <tok> T_OP_LDIB
-%token <tok> T_OP_G2L
-%token <tok> T_OP_L2G
-%token <tok> T_OP_PREFETCH
-%token <tok> T_OP_LDLW
-%token <tok> T_OP_STLW
-%token <tok> T_OP_RESFMT
-%token <tok> T_OP_RESINF
-%token <tok> T_OP_ATOMIC_ADD
-%token <tok> T_OP_ATOMIC_SUB
-%token <tok> T_OP_ATOMIC_XCHG
-%token <tok> T_OP_ATOMIC_INC
-%token <tok> T_OP_ATOMIC_DEC
-%token <tok> T_OP_ATOMIC_CMPXCHG
-%token <tok> T_OP_ATOMIC_MIN
-%token <tok> T_OP_ATOMIC_MAX
-%token <tok> T_OP_ATOMIC_AND
-%token <tok> T_OP_ATOMIC_OR
-%token <tok> T_OP_ATOMIC_XOR
-%token <tok> T_OP_LDGB
-%token <tok> T_OP_STGB
-%token <tok> T_OP_STIB
-%token <tok> T_OP_LDC
-%token <tok> T_OP_LDLV
-
-/* type qualifiers: */
-%token <tok> T_TYPE_F16
-%token <tok> T_TYPE_F32
-%token <tok> T_TYPE_U16
-%token <tok> T_TYPE_U32
-%token <tok> T_TYPE_S16
-%token <tok> T_TYPE_S32
-%token <tok> T_TYPE_U8
-%token <tok> T_TYPE_S8
-
-%token <tok> T_UNTYPED
-%token <tok> T_TYPED
-
-%token <tok> T_1D
-%token <tok> T_2D
-%token <tok> T_3D
-%token <tok> T_4D
-
-/* condition qualifiers: */
-%token <tok> T_LT
-%token <tok> T_LE
-%token <tok> T_GT
-%token <tok> T_GE
-%token <tok> T_EQ
-%token <tok> T_NE
-
-%token <tok> T_S2EN
-%token <tok> T_SAMP
-%token <tok> T_TEX
-%token <tok> T_BASE
-
-%token <tok> T_NAN
-%token <tok> T_INF
-%token <num> T_A0
-%token <num> T_P0
-%token <str> T_CAT1_TYPE_TYPE
-
-%type <num> integer offset
-%type <flt> float
-%type <reg> reg const
-%type <tok> cat1_opc
-%type <tok> cat2_opc_1src cat2_opc_2src_cnd cat2_opc_2src
-%type <tok> cat3_opc
-%type <tok> cat4_opc
-%type <tok> cat5_opc cat5_samp cat5_tex cat5_type
-%type <type> type
-%type <unum> const_val
-
-%error-verbose
-
-%start shader
-
-%%
-
-shader: { new_shader(); } headers instrs
-
-headers:
-| header headers
-
-header: localsize_header
-| const_header
-| buf_header
-| invocationid_header
-| wgid_header
-| numwg_header
-| in_header
-| out_header
-| tex_header
-
-const_val: T_FLOAT { $$ = fui($1); }
-| T_INT { $$ = $1; }
-| '-' T_INT { $$ = -$2; }
-| T_HEX { $$ = $1; }
-
-localsize_header: T_A_LOCALSIZE const_val ',' const_val ',' const_val {
- info->local_size[0] = $2;
- info->local_size[1] = $4;
- info->local_size[2] = $6;
-}
-
-const_header: T_A_CONST '(' T_CONSTANT ')' const_val ',' const_val ',' const_val ',' const_val {
- add_const($3, $5, $7, $9, $11);
-}
-
-buf_header: T_A_BUF const_val {
- int idx = info->num_bufs++;
- assert(idx < MAX_BUFS);
- info->buf_sizes[idx] = $2;
-}
-
-invocationid_header: T_A_INVOCATIONID '(' T_REGISTER ')' {
- assert(($3 & 0x1) == 0); /* half-reg not allowed */
- unsigned reg = $3 >> 1;
- add_sysval(reg, 0x7, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
-}
-
-wgid_header: T_A_WGID '(' T_REGISTER ')' {
- assert(($3 & 0x1) == 0); /* half-reg not allowed */
- unsigned reg = $3 >> 1;
- assert(reg >= regid(48, 0)); /* must be a high reg */
- add_sysval(reg, 0x7, SYSTEM_VALUE_WORK_GROUP_ID);
-}
-
-numwg_header: T_A_NUMWG '(' T_CONSTANT ')' {
- assert(($3 & 0x1) == 0); /* half-reg not allowed */
- unsigned reg = $3 >> 1;
- info->numwg = reg;
- /* reserve space in immediates for the actual value to be plugged in later: */
- add_const($3, 0, 0, 0, 0);
-}
-
-/* Stubs for now */
-in_header: T_A_IN '(' T_REGISTER ')' T_IDENTIFIER '(' T_IDENTIFIER '=' integer ')' { }
-
-out_header: T_A_OUT '(' T_REGISTER ')' T_IDENTIFIER '(' T_IDENTIFIER '=' integer ')' { }
-
-tex_header: T_A_TEX '(' T_REGISTER ')'
- T_IDENTIFIER '=' integer ',' /* src */
- T_IDENTIFIER '=' integer ',' /* samp */
- T_IDENTIFIER '=' integer ',' /* tex */
- T_IDENTIFIER '=' integer ',' /* wrmask */
- T_IDENTIFIER '=' integer /* cmd */ { }
-
-iflag: T_SY { iflags.flags |= IR3_INSTR_SY; }
-| T_SS { iflags.flags |= IR3_INSTR_SS; }
-| T_JP { iflags.flags |= IR3_INSTR_JP; }
-| T_RPT { iflags.repeat = $1; }
-| T_UL { iflags.flags |= IR3_INSTR_UL; }
-| T_NOP { iflags.nop = $1; }
-
-iflags:
-| iflag iflags
-
-instrs: instr instrs
-| instr
-
-instr: iflags cat0_instr
-| iflags cat1_instr
-| iflags cat2_instr
-| iflags cat3_instr
-| iflags cat4_instr
-| iflags cat5_instr
-| iflags cat6_instr
-
-cat0_src: '!' T_P0 { instr->cat0.inv = true; instr->cat0.comp = $2 >> 1; }
-| T_P0 { instr->cat0.comp = $1 >> 1; }
-
-cat0_immed: '#' integer { instr->cat0.immed = $2; }
-
-cat0_instr: T_OP_NOP { new_instr(OPC_NOP); }
-| T_OP_BR { new_instr(OPC_BR); } cat0_src ',' cat0_immed
-| T_OP_JUMP { new_instr(OPC_JUMP); } cat0_immed
-| T_OP_CALL { new_instr(OPC_CALL); } cat0_immed
-| T_OP_RET { new_instr(OPC_RET); }
-| T_OP_KILL { new_instr(OPC_KILL); } cat0_src
-| T_OP_END { new_instr(OPC_END); }
-| T_OP_EMIT { new_instr(OPC_EMIT); }
-| T_OP_CUT { new_instr(OPC_CUT); }
-| T_OP_CHMASK { new_instr(OPC_CHMASK); }
-| T_OP_CHSH { new_instr(OPC_CHSH); }
-| T_OP_FLOW_REV { new_instr(OPC_FLOW_REV); }
-
-cat1_opc: T_OP_MOVA {
- new_instr(OPC_MOV);
- instr->cat1.src_type = TYPE_S16;
- instr->cat1.dst_type = TYPE_S16;
-}
-| T_OP_MOV '.' T_CAT1_TYPE_TYPE {
- parse_type_type(new_instr(OPC_MOV), $3);
-}
-| T_OP_COV '.' T_CAT1_TYPE_TYPE {
- parse_type_type(new_instr(OPC_MOV), $3);
-}
-
-cat1_instr: cat1_opc dst_reg ',' src_reg_or_const_or_rel_or_imm
-
-cat2_opc_1src: T_OP_ABSNEG_F { new_instr(OPC_ABSNEG_F); }
-| T_OP_ABSNEG_S { new_instr(OPC_ABSNEG_S); }
-| T_OP_CLZ_B { new_instr(OPC_CLZ_B); }
-| T_OP_CLZ_S { new_instr(OPC_CLZ_S); }
-| T_OP_SIGN_F { new_instr(OPC_SIGN_F); }
-| T_OP_FLOOR_F { new_instr(OPC_FLOOR_F); }
-| T_OP_CEIL_F { new_instr(OPC_CEIL_F); }
-| T_OP_RNDNE_F { new_instr(OPC_RNDNE_F); }
-| T_OP_RNDAZ_F { new_instr(OPC_RNDAZ_F); }
-| T_OP_TRUNC_F { new_instr(OPC_TRUNC_F); }
-| T_OP_NOT_B { new_instr(OPC_NOT_B); }
-| T_OP_BFREV_B { new_instr(OPC_BFREV_B); }
-| T_OP_SETRM { new_instr(OPC_SETRM); }
-| T_OP_CBITS_B { new_instr(OPC_CBITS_B); }
-
-cat2_opc_2src_cnd: T_OP_CMPS_F { new_instr(OPC_CMPS_F); }
-| T_OP_CMPS_U { new_instr(OPC_CMPS_U); }
-| T_OP_CMPS_S { new_instr(OPC_CMPS_S); }
-| T_OP_CMPV_F { new_instr(OPC_CMPV_F); }
-| T_OP_CMPV_U { new_instr(OPC_CMPV_U); }
-| T_OP_CMPV_S { new_instr(OPC_CMPV_S); }
-
-cat2_opc_2src: T_OP_ADD_F { new_instr(OPC_ADD_F); }
-| T_OP_MIN_F { new_instr(OPC_MIN_F); }
-| T_OP_MAX_F { new_instr(OPC_MAX_F); }
-| T_OP_MUL_F { new_instr(OPC_MUL_F); }
-| T_OP_ADD_U { new_instr(OPC_ADD_U); }
-| T_OP_ADD_S { new_instr(OPC_ADD_S); }
-| T_OP_SUB_U { new_instr(OPC_SUB_U); }
-| T_OP_SUB_S { new_instr(OPC_SUB_S); }
-| T_OP_MIN_U { new_instr(OPC_MIN_U); }
-| T_OP_MIN_S { new_instr(OPC_MIN_S); }
-| T_OP_MAX_U { new_instr(OPC_MAX_U); }
-| T_OP_MAX_S { new_instr(OPC_MAX_S); }
-| T_OP_AND_B { new_instr(OPC_AND_B); }
-| T_OP_OR_B { new_instr(OPC_OR_B); }
-| T_OP_XOR_B { new_instr(OPC_XOR_B); }
-| T_OP_MUL_U24 { new_instr(OPC_MUL_U24); }
-| T_OP_MUL_S24 { new_instr(OPC_MUL_S24); }
-| T_OP_MULL_U { new_instr(OPC_MULL_U); }
-| T_OP_SHL_B { new_instr(OPC_SHL_B); }
-| T_OP_SHR_B { new_instr(OPC_SHR_B); }
-| T_OP_ASHR_B { new_instr(OPC_ASHR_B); }
-| T_OP_BARY_F { new_instr(OPC_BARY_F); }
-| T_OP_MGEN_B { new_instr(OPC_MGEN_B); }
-| T_OP_GETBIT_B { new_instr(OPC_GETBIT_B); }
-| T_OP_SHB { new_instr(OPC_SHB); }
-| T_OP_MSAD { new_instr(OPC_MSAD); }
-
-cond: T_LT { instr->cat2.condition = IR3_COND_LT; }
-| T_LE { instr->cat2.condition = IR3_COND_LE; }
-| T_GT { instr->cat2.condition = IR3_COND_GT; }
-| T_GE { instr->cat2.condition = IR3_COND_GE; }
-| T_EQ { instr->cat2.condition = IR3_COND_EQ; }
-| T_NE { instr->cat2.condition = IR3_COND_NE; }
-
-cat2_instr: cat2_opc_1src dst_reg ',' src_reg_or_const_or_rel_or_imm
-| cat2_opc_2src_cnd '.' cond dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
-| cat2_opc_2src dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
-
-cat3_opc: T_OP_MAD_U16 { new_instr(OPC_MAD_U16); }
-| T_OP_MADSH_U16 { new_instr(OPC_MADSH_U16); }
-| T_OP_MAD_S16 { new_instr(OPC_MAD_S16); }
-| T_OP_MADSH_M16 { new_instr(OPC_MADSH_M16); }
-| T_OP_MAD_U24 { new_instr(OPC_MAD_U24); }
-| T_OP_MAD_S24 { new_instr(OPC_MAD_S24); }
-| T_OP_MAD_F16 { new_instr(OPC_MAD_F16); }
-| T_OP_MAD_F32 { new_instr(OPC_MAD_F32); }
-| T_OP_SEL_B16 { new_instr(OPC_SEL_B16); }
-| T_OP_SEL_B32 { new_instr(OPC_SEL_B32); }
-| T_OP_SEL_S16 { new_instr(OPC_SEL_S16); }
-| T_OP_SEL_S32 { new_instr(OPC_SEL_S32); }
-| T_OP_SEL_F16 { new_instr(OPC_SEL_F16); }
-| T_OP_SEL_F32 { new_instr(OPC_SEL_F32); }
-| T_OP_SAD_S16 { new_instr(OPC_SAD_S16); }
-| T_OP_SAD_S32 { new_instr(OPC_SAD_S32); }
-
-cat3_instr: cat3_opc dst_reg ',' src_reg_or_const_or_rel ',' src_reg_or_const ',' src_reg_or_const_or_rel
-
-cat4_opc: T_OP_RCP { new_instr(OPC_RCP); }
-| T_OP_RSQ { new_instr(OPC_RSQ); }
-| T_OP_LOG2 { new_instr(OPC_LOG2); }
-| T_OP_EXP2 { new_instr(OPC_EXP2); }
-| T_OP_SIN { new_instr(OPC_SIN); }
-| T_OP_COS { new_instr(OPC_COS); }
-| T_OP_SQRT { new_instr(OPC_SQRT); }
-| T_OP_HRSQ { new_instr(OPC_HRSQ); }
-| T_OP_HLOG2 { new_instr(OPC_HLOG2); }
-| T_OP_HEXP2 { new_instr(OPC_HEXP2); }
-
-cat4_instr: cat4_opc dst_reg ',' src_reg_or_const_or_rel_or_imm
-
-cat5_opc_dsxypp: T_OP_DSXPP_1 { new_instr(OPC_DSXPP_1); }
-| T_OP_DSYPP_1 { new_instr(OPC_DSYPP_1); }
-
-cat5_opc: T_OP_ISAM { new_instr(OPC_ISAM); }
-| T_OP_ISAML { new_instr(OPC_ISAML); }
-| T_OP_ISAMM { new_instr(OPC_ISAMM); }
-| T_OP_SAM { new_instr(OPC_SAM); }
-| T_OP_SAMB { new_instr(OPC_SAMB); }
-| T_OP_SAML { new_instr(OPC_SAML); }
-| T_OP_SAMGQ { new_instr(OPC_SAMGQ); }
-| T_OP_GETLOD { new_instr(OPC_GETLOD); }
-| T_OP_CONV { new_instr(OPC_CONV); }
-| T_OP_CONVM { new_instr(OPC_CONVM); }
-| T_OP_GETSIZE { new_instr(OPC_GETSIZE); }
-| T_OP_GETBUF { new_instr(OPC_GETBUF); }
-| T_OP_GETPOS { new_instr(OPC_GETPOS); }
-| T_OP_GETINFO { new_instr(OPC_GETINFO); }
-| T_OP_DSX { new_instr(OPC_DSX); }
-| T_OP_DSY { new_instr(OPC_DSY); }
-| T_OP_GATHER4R { new_instr(OPC_GATHER4R); }
-| T_OP_GATHER4G { new_instr(OPC_GATHER4G); }
-| T_OP_GATHER4B { new_instr(OPC_GATHER4B); }
-| T_OP_GATHER4A { new_instr(OPC_GATHER4A); }
-| T_OP_SAMGP0 { new_instr(OPC_SAMGP0); }
-| T_OP_SAMGP1 { new_instr(OPC_SAMGP1); }
-| T_OP_SAMGP2 { new_instr(OPC_SAMGP2); }
-| T_OP_SAMGP3 { new_instr(OPC_SAMGP3); }
-| T_OP_RGETPOS { new_instr(OPC_RGETPOS); }
-| T_OP_RGETINFO { new_instr(OPC_RGETINFO); }
-
-cat5_flag: '.' T_3D { instr->flags |= IR3_INSTR_3D; }
-| '.' 'a' { instr->flags |= IR3_INSTR_A; }
-| '.' 'o' { instr->flags |= IR3_INSTR_O; }
-| '.' 'p' { instr->flags |= IR3_INSTR_P; }
-| '.' 's' { instr->flags |= IR3_INSTR_S; }
-| '.' T_S2EN { instr->flags |= IR3_INSTR_S2EN; }
-| '.' T_BASE { instr->flags |= IR3_INSTR_B; instr->cat5.tex_base = $2; }
-cat5_flags:
-| cat5_flag cat5_flags
-
-cat5_samp: T_SAMP { instr->cat5.samp = $1; }
-cat5_tex: T_TEX { if (instr->flags & IR3_INSTR_B) instr->cat5.samp |= ($1 << 4); else instr->cat5.tex = $1; }
-cat5_type: '(' type ')' { instr->cat5.type = $2; }
-
-cat5_instr: cat5_opc_dsxypp cat5_flags dst_reg ',' src_reg
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' src_reg ',' cat5_samp ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' src_reg ',' cat5_samp
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' src_reg ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' src_reg
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' cat5_samp ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' cat5_samp
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg ',' src_reg
-| cat5_opc cat5_flags cat5_type dst_reg ',' cat5_samp ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg ',' cat5_samp
-| cat5_opc cat5_flags cat5_type dst_reg ',' cat5_tex
-| cat5_opc cat5_flags cat5_type dst_reg
-
-cat6_typed: '.' T_UNTYPED { instr->cat6.typed = 0; }
-| '.' T_TYPED { instr->cat6.typed = 1; }
-
-cat6_dim: '.' T_1D { instr->cat6.d = 1; }
-| '.' T_2D { instr->cat6.d = 2; }
-| '.' T_3D { instr->cat6.d = 3; }
-| '.' T_4D { instr->cat6.d = 4; }
-
-cat6_type: '.' type { instr->cat6.type = $2; }
-cat6_offset: offset { instr->cat6.src_offset = $1; }
-cat6_immed: integer { instr->cat6.iim_val = $1; }
-
-cat6_load: T_OP_LDG { new_instr(OPC_LDG); } cat6_type dst_reg ',' 'g' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_LDP { new_instr(OPC_LDP); } cat6_type dst_reg ',' 'p' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_LDL { new_instr(OPC_LDL); } cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_LDLW { new_instr(OPC_LDLW); } cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_LDLV { new_instr(OPC_LDLV); } cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-
-// TODO some of the cat6 instructions have different syntax for a6xx..
-//| T_OP_LDIB { new_instr(OPC_LDIB); } cat6_type dst_reg cat6_offset ',' reg ',' cat6_immed
-
-cat6_store: T_OP_STG { new_instr(OPC_STG); } cat6_type 'g' '[' dst_reg cat6_offset ']' ',' reg ',' cat6_immed
-| T_OP_STP { new_instr(OPC_STP); } cat6_type 'p' '[' dst_reg cat6_offset ']' ',' reg ',' cat6_immed
-| T_OP_STL { new_instr(OPC_STL); } cat6_type 'l' '[' dst_reg cat6_offset ']' ',' reg ',' cat6_immed
-| T_OP_STLW { new_instr(OPC_STLW); } cat6_type 'l' '[' dst_reg cat6_offset ']' ',' reg ',' cat6_immed
-
-cat6_storeib: T_OP_STIB { new_instr(OPC_STIB); dummy_dst(); } cat6_typed cat6_dim cat6_type '.' cat6_immed'g' '[' immediate ']' '+' reg ',' reg
-
-cat6_prefetch: T_OP_PREFETCH { new_instr(OPC_PREFETCH); new_reg(0,0); /* dummy dst */ } 'g' '[' reg cat6_offset ']' ',' cat6_immed
-
-cat6_atomic_l_g: '.' 'g' { instr->flags |= IR3_INSTR_G; }
-| '.' 'l' { }
-
-cat6_atomic: T_OP_ATOMIC_ADD { new_instr(OPC_ATOMIC_ADD); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_SUB { new_instr(OPC_ATOMIC_SUB); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_XCHG { new_instr(OPC_ATOMIC_XCHG); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_INC { new_instr(OPC_ATOMIC_INC); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_DEC { new_instr(OPC_ATOMIC_DEC); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_CMPXCHG { new_instr(OPC_ATOMIC_CMPXCHG); }cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_MIN { new_instr(OPC_ATOMIC_MIN); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_MAX { new_instr(OPC_ATOMIC_MAX); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_AND { new_instr(OPC_ATOMIC_AND); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_OR { new_instr(OPC_ATOMIC_OR); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-| T_OP_ATOMIC_XOR { new_instr(OPC_ATOMIC_XOR); } cat6_atomic_l_g cat6_type dst_reg ',' 'l' '[' reg cat6_offset ']' ',' cat6_immed
-
-cat6_todo: T_OP_G2L { new_instr(OPC_G2L); }
-| T_OP_L2G { new_instr(OPC_L2G); }
-| T_OP_RESFMT { new_instr(OPC_RESFMT); }
-| T_OP_RESINF { new_instr(OPC_RESINFO); }
-| T_OP_LDGB { new_instr(OPC_LDGB); }
-| T_OP_STGB { new_instr(OPC_STGB); }
-| T_OP_LDC { new_instr(OPC_LDC); }
-
-cat6_instr: cat6_load
-| cat6_store
-| cat6_storeib
-| cat6_prefetch
-| cat6_atomic
-| cat6_todo
-
-reg: T_REGISTER { $$ = new_reg($1, 0); }
-| T_A0 { $$ = new_reg((61 << 3) + $1, IR3_REG_HALF); }
-| T_P0 { $$ = new_reg((62 << 3) + $1, 0); }
-
-const: T_CONSTANT { $$ = new_reg($1, IR3_REG_CONST); }
-
-dst_reg_flag: T_EVEN { rflags.flags |= IR3_REG_EVEN; }
-| T_POS_INFINITY { rflags.flags |= IR3_REG_POS_INF; }
-| T_EI { rflags.flags |= IR3_REG_EI; }
-| T_WRMASK { rflags.wrmask = $1; }
-
-dst_reg_flags: dst_reg_flag
-| dst_reg_flag dst_reg_flags
-
- /* note: destination registers are always incremented in repeat */
-dst_reg: reg { $1->flags |= IR3_REG_R; }
-| dst_reg_flags reg { $2->flags |= IR3_REG_R; }
-
-src_reg_flag: T_ABSNEG { rflags.flags |= IR3_REG_ABS|IR3_REG_NEGATE; }
-| T_NEG { rflags.flags |= IR3_REG_NEGATE; }
-| T_ABS { rflags.flags |= IR3_REG_ABS; }
-| T_R { rflags.flags |= IR3_REG_R; }
-
-src_reg_flags: src_reg_flag
-| src_reg_flag src_reg_flags
-
-src_reg: reg
-| src_reg_flags reg
-
-src_const: const
-| src_reg_flags const
-
-src_reg_or_const: src_reg
-| src_const
-
-src_reg_or_const_or_rel: src_reg_or_const
-| relative
-
-src_reg_or_const_or_rel_or_imm: src_reg_or_const_or_rel
-| src_reg_flags immediate
-| immediate
-
-offset: { $$ = 0; }
-| '+' integer { $$ = $2; }
-| '-' integer { $$ = -$2; }
-
-relative: 'r' '<' T_A0 offset '>' { new_reg(0, IR3_REG_RELATIV)->array.offset = $4; }
-| 'c' '<' T_A0 offset '>' { new_reg(0, IR3_REG_RELATIV | IR3_REG_CONST)->array.offset = $4; }
-
-immediate: integer { new_reg(0, IR3_REG_IMMED)->iim_val = $1; }
-| '(' integer ')' { new_reg(0, IR3_REG_IMMED)->fim_val = $2; }
-| '(' float ')' { new_reg(0, IR3_REG_IMMED)->fim_val = $2; }
-| '(' T_NAN ')' { new_reg(0, IR3_REG_IMMED)->fim_val = NAN; }
-| '(' T_INF ')' { new_reg(0, IR3_REG_IMMED)->fim_val = INFINITY; }
-
-integer: T_INT { $$ = $1; }
-| '-' T_INT { $$ = -$2; }
-| T_HEX { $$ = $1; }
-| '-' T_HEX { $$ = -$2; }
-
-float: T_FLOAT { $$ = $1; }
-| '-' T_FLOAT { $$ = -$2; }
-
-type: T_TYPE_F16 { $$ = TYPE_F16; }
-| T_TYPE_F32 { $$ = TYPE_F32; }
-| T_TYPE_U16 { $$ = TYPE_U16; }
-| T_TYPE_U32 { $$ = TYPE_U32; }
-| T_TYPE_S16 { $$ = TYPE_S16; }
-| T_TYPE_S32 { $$ = TYPE_S32; }
-| T_TYPE_U8 { $$ = TYPE_U8; }
-| T_TYPE_S8 { $$ = TYPE_S8; }
diff --git a/lib/mesa/src/freedreno/ir3/ir3_postsched.c b/lib/mesa/src/freedreno/ir3/ir3_postsched.c
deleted file mode 100644
index 521078a04..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_postsched.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- * Copyright (C) 2019 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-
-#include "util/dag.h"
-#include "util/u_math.h"
-
-#include "ir3.h"
-#include "ir3_compiler.h"
-#include "ir3_context.h"
-
-#ifdef DEBUG
-#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
-#else
-#define SCHED_DEBUG 0
-#endif
-#define d(fmt, ...) do { if (SCHED_DEBUG) { \
- printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
-} } while (0)
-
-#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
- printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
- ir3_print_instr(instr); \
-} } while (0)
-
-/*
- * Post RA Instruction Scheduling
- */
-
-struct ir3_postsched_ctx {
- struct ir3_context *ctx;
-
- void *mem_ctx;
- struct ir3_block *block; /* the current block */
- struct dag *dag;
-
- struct list_head unscheduled_list; /* unscheduled instructions */
-
- bool error;
-
- int sfu_delay;
-};
-
-struct ir3_postsched_node {
- struct dag_node dag; /* must be first for util_dynarray_foreach */
- struct ir3_instruction *instr;
- bool partially_evaluated_path;
-
- unsigned delay;
- unsigned max_delay;
-};
-
-#define foreach_sched_node(__n, __list) \
- list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
-
-#define foreach_bit(b, mask) \
- for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
-
-static void
-schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
-{
- debug_assert(ctx->block == instr->block);
-
- /* remove from unscheduled_list:
- */
- list_delinit(&instr->node);
-
- di(instr, "schedule");
-
- list_addtail(&instr->node, &instr->block->instr_list);
-
- if (is_sfu(instr)) {
- ctx->sfu_delay = 8;
- } else if (ctx->sfu_delay > 0) {
- ctx->sfu_delay--;
- }
-
- struct ir3_postsched_node *n = instr->data;
- dag_prune_head(ctx->dag, &n->dag);
-}
-
-static void
-dump_state(struct ir3_postsched_ctx *ctx)
-{
- if (!SCHED_DEBUG)
- return;
-
- foreach_sched_node (n, &ctx->dag->heads) {
- di(n->instr, "maxdel=%3d ", n->max_delay);
-
- util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
- struct ir3_postsched_node *child =
- (struct ir3_postsched_node *)edge->child;
-
- di(child->instr, " -> (%d parents) ", child->dag.parent_count);
- }
- }
-}
-
-/* Determine if this is an instruction that we'd prefer not to schedule
- * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
- * counter, ie. the more cycles it has been since the last SFU, the less
- * costly a sync would be.
- */
-static bool
-would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
-{
- if (ctx->sfu_delay) {
- struct ir3_register *reg;
- foreach_src (reg, instr)
- if (reg->instr && is_sfu(reg->instr))
- return true;
- }
-
- return false;
-}
-
-/* find instruction to schedule: */
-static struct ir3_instruction *
-choose_instr(struct ir3_postsched_ctx *ctx)
-{
- struct ir3_postsched_node *chosen = NULL;
-
- dump_state(ctx);
-
- foreach_sched_node (n, &ctx->dag->heads) {
- if (!is_meta(n->instr))
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "prio: chose (meta)");
- return chosen->instr;
- }
-
- /* Try to schedule inputs with a higher priority, if possible, as
- * the last bary.f unlocks varying storage to unblock more VS
- * warps.
- */
- foreach_sched_node (n, &ctx->dag->heads) {
- if (!is_input(n->instr))
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "prio: chose (input)");
- return chosen->instr;
- }
-
- /* Next prioritize discards: */
- foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
-
- if (d > 0)
- continue;
-
- if (!is_kill(n->instr))
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (kill, hard ready)");
- return chosen->instr;
- }
-
- /* Next prioritize expensive instructions: */
- foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
-
- if (d > 0)
- continue;
-
- if (!(is_sfu(n->instr) || is_tex(n->instr)))
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
- return chosen->instr;
- }
-
- /*
- * Sometimes be better to take a nop, rather than scheduling an
- * instruction that would require an (ss) shortly after another
- * SFU.. ie. if last SFU was just one or two instr ago, and we
- * could choose between taking a nop and then scheduling
- * something else, vs scheduling the immed avail instruction that
- * would require (ss), we are better with the nop.
- */
- for (unsigned delay = 0; delay < 4; delay++) {
- foreach_sched_node (n, &ctx->dag->heads) {
- if (would_sync(ctx, n->instr))
- continue;
-
- unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
-
- if (d > delay)
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
- return chosen->instr;
- }
- }
-
- /* Next try to find a ready leader w/ soft delay (ie. including extra
- * delay for things like tex fetch which can be synchronized w/ sync
- * bit (but we probably do want to schedule some other instructions
- * while we wait)
- */
- foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
-
- if (d > 0)
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (soft ready)");
- return chosen->instr;
- }
-
- /* Next try to find a ready leader that can be scheduled without nop's,
- * which in the case of things that need (sy)/(ss) could result in
- * stalls.. but we've already decided there is not a better option.
- */
- foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
-
- if (d > 0)
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (hard ready)");
- return chosen->instr;
- }
-
- /* Otherwise choose leader with maximum cost:
- *
- * TODO should we try to balance cost and delays? I guess it is
- * a balance between now-nop's and future-nop's?
- */
- foreach_sched_node (n, &ctx->dag->heads) {
- if (!chosen || chosen->max_delay < n->max_delay)
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (leader)");
- return chosen->instr;
- }
-
- return NULL;
-}
-
-struct ir3_postsched_deps_state {
- struct ir3_context *ctx;
-
- enum { F, R } direction;
-
- bool merged;
-
- /* Track the mapping between sched node (instruction) that last
- * wrote a given register (in whichever direction we are iterating
- * the block)
- *
- * Note, this table is twice as big as the # of regs, to deal with
- * half-precision regs. The approach differs depending on whether
- * the half and full precision register files are "merged" (conflict,
- * ie. a6xx+) in which case we consider each full precision dep
- * as two half-precision dependencies, vs older separate (non-
- * conflicting) in which case the first half of the table is used
- * for full precision and 2nd half for half-precision.
- */
- struct ir3_postsched_node *regs[2 * 256];
-};
-
-/* bounds checking read/write accessors, since OoB access to stuff on
- * the stack is gonna cause a bad day.
- */
-#define dep_reg(state, idx) *({ \
- assert((idx) < ARRAY_SIZE((state)->regs)); \
- &(state)->regs[(idx)]; \
- })
-
-static void
-add_dep(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *before,
- struct ir3_postsched_node *after)
-{
- if (!before || !after)
- return;
-
- assert(before != after);
-
- if (state->direction == F) {
- dag_add_edge(&before->dag, &after->dag, NULL);
- } else {
- dag_add_edge(&after->dag, &before->dag, NULL);
- }
-}
-
-static void
-add_single_reg_dep(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *node, unsigned num, bool write)
-{
- add_dep(state, dep_reg(state, num), node);
- if (write) {
- dep_reg(state, num) = node;
- }
-}
-
-/* This is where we handled full vs half-precision, and potential conflicts
- * between half and full precision that result in additional dependencies.
- * The 'reg' arg is really just to know half vs full precision.
- */
-static void
-add_reg_dep(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *node, const struct ir3_register *reg,
- unsigned num, bool write)
-{
- if (state->merged) {
- if (reg->flags & IR3_REG_HALF) {
- /* single conflict in half-reg space: */
- add_single_reg_dep(state, node, num, write);
- } else {
- /* two conflicts in half-reg space: */
- add_single_reg_dep(state, node, 2 * num + 0, write);
- add_single_reg_dep(state, node, 2 * num + 1, write);
- }
- } else {
- if (reg->flags & IR3_REG_HALF)
- num += ARRAY_SIZE(state->regs) / 2;
- add_single_reg_dep(state, node, num, write);
- }
-}
-
-static void
-calculate_deps(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *node)
-{
- struct ir3_register *reg;
- int b;
-
- /* Add dependencies on instructions that previously (or next,
- * in the reverse direction) wrote any of our src registers:
- */
- foreach_src_n (reg, i, node->instr) {
- if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
- continue;
-
- if (reg->flags & IR3_REG_RELATIV) {
- /* mark entire array as read: */
- struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
- for (unsigned i = 0; i < arr->length; i++) {
- add_reg_dep(state, node, reg, arr->reg + i, false);
- }
- } else {
- foreach_bit (b, reg->wrmask) {
- add_reg_dep(state, node, reg, reg->num + b, false);
-
- struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
- if (dep && (state->direction == F)) {
- unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
- node->delay = MAX2(node->delay, d);
- }
- }
- }
- }
-
- if (node->instr->address) {
- add_reg_dep(state, node, node->instr->address->regs[0],
- node->instr->address->regs[0]->num,
- false);
- }
-
- if (dest_regs(node->instr) == 0)
- return;
-
- /* And then after we update the state for what this instruction
- * wrote:
- */
- reg = node->instr->regs[0];
- if (reg->flags & IR3_REG_RELATIV) {
- /* mark the entire array as written: */
- struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
- for (unsigned i = 0; i < arr->length; i++) {
- add_reg_dep(state, node, reg, arr->reg + i, true);
- }
- } else {
- foreach_bit (b, reg->wrmask) {
- add_reg_dep(state, node, reg, reg->num + b, true);
- }
- }
-}
-
-static void
-calculate_forward_deps(struct ir3_postsched_ctx *ctx)
-{
- struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
- .direction = F,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
- };
-
- foreach_instr (instr, &ctx->unscheduled_list) {
- calculate_deps(&state, instr->data);
- }
-}
-
-static void
-calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
-{
- struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
- .direction = R,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
- };
-
- foreach_instr_rev (instr, &ctx->unscheduled_list) {
- calculate_deps(&state, instr->data);
- }
-}
-
-static void
-sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
-{
- struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
-
- dag_init_node(ctx->dag, &n->dag);
-
- n->instr = instr;
- instr->data = n;
-}
-
-static void
-sched_dag_max_delay_cb(struct dag_node *node, void *state)
-{
- struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
- uint32_t max_delay = 0;
-
- util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
- struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
- max_delay = MAX2(child->max_delay, max_delay);
- }
-
- n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
-}
-
-static void
-sched_dag_init(struct ir3_postsched_ctx *ctx)
-{
- ctx->mem_ctx = ralloc_context(NULL);
-
- ctx->dag = dag_create(ctx->mem_ctx);
-
- foreach_instr (instr, &ctx->unscheduled_list)
- sched_node_init(ctx, instr);
-
- calculate_forward_deps(ctx);
- calculate_reverse_deps(ctx);
-
- /*
- * To avoid expensive texture fetches, etc, from being moved ahead
- * of kills, track the kills we've seen so far, so we can add an
- * extra dependency on them for tex/mem instructions
- */
- struct util_dynarray kills;
- util_dynarray_init(&kills, ctx->mem_ctx);
-
- /*
- * Normal srcs won't be in SSA at this point, those are dealt with in
- * calculate_forward_deps() and calculate_reverse_deps(). But we still
- * have the false-dep information in SSA form, so go ahead and add
- * dependencies for that here:
- */
- foreach_instr (instr, &ctx->unscheduled_list) {
- struct ir3_postsched_node *n = instr->data;
- struct ir3_instruction *src;
-
- foreach_ssa_src_n (src, i, instr) {
- if (src->block != instr->block)
- continue;
-
- /* we can end up with unused false-deps.. just skip them: */
- if (src->flags & IR3_INSTR_UNUSED)
- continue;
-
- struct ir3_postsched_node *sn = src->data;
-
- /* don't consider dependencies in other blocks: */
- if (src->block != instr->block)
- continue;
-
- dag_add_edge(&sn->dag, &n->dag, NULL);
- }
-
- if (is_kill(instr)) {
- util_dynarray_append(&kills, struct ir3_instruction *, instr);
- } else if (is_tex(instr) || is_mem(instr)) {
- util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
- struct ir3_instruction *kill = *instrp;
- struct ir3_postsched_node *kn = kill->data;
- dag_add_edge(&kn->dag, &n->dag, NULL);
- }
- }
- }
-
- // TODO do we want to do this after reverse-dependencies?
- dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
-}
-
-static void
-sched_dag_destroy(struct ir3_postsched_ctx *ctx)
-{
- ralloc_free(ctx->mem_ctx);
- ctx->mem_ctx = NULL;
- ctx->dag = NULL;
-}
-
-static void
-sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
-{
- ctx->block = block;
-
- /* move all instructions to the unscheduled list, and
- * empty the block's instruction list (to which we will
- * be inserting).
- */
- list_replace(&block->instr_list, &ctx->unscheduled_list);
- list_inithead(&block->instr_list);
-
- // TODO once we are using post-sched for everything we can
- // just not stick in NOP's prior to post-sched, and drop this.
- // for now keep this, since it makes post-sched optional:
- foreach_instr_safe (instr, &ctx->unscheduled_list) {
- switch (instr->opc) {
- case OPC_NOP:
- case OPC_BR:
- case OPC_JUMP:
- list_delinit(&instr->node);
- break;
- default:
- break;
- }
- }
-
- sched_dag_init(ctx);
-
- /* First schedule all meta:input instructions, followed by
- * tex-prefetch. We want all of the instructions that load
- * values into registers before the shader starts to go
- * before any other instructions. But in particular we
- * want inputs to come before prefetches. This is because
- * a FS's bary_ij input may not actually be live in the
- * shader, but it should not be scheduled on top of any
- * other input (but can be overwritten by a tex prefetch)
- */
- foreach_instr_safe (instr, &ctx->unscheduled_list)
- if (instr->opc == OPC_META_INPUT)
- schedule(ctx, instr);
-
- foreach_instr_safe (instr, &ctx->unscheduled_list)
- if (instr->opc == OPC_META_TEX_PREFETCH)
- schedule(ctx, instr);
-
- while (!list_is_empty(&ctx->unscheduled_list)) {
- struct ir3_instruction *instr;
-
- instr = choose_instr(ctx);
-
- /* this shouldn't happen: */
- if (!instr) {
- ctx->error = true;
- break;
- }
-
- unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
- d("delay=%u", delay);
-
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
- */
- debug_assert(delay <= 6);
- while (delay > 0) {
- ir3_NOP(block);
- delay--;
- }
-
- schedule(ctx, instr);
- }
-
- sched_dag_destroy(ctx);
-}
-
-
-static bool
-is_self_mov(struct ir3_instruction *instr)
-{
- if (!is_same_type_mov(instr))
- return false;
-
- if (instr->regs[0]->num != instr->regs[1]->num)
- return false;
-
- if (instr->regs[0]->flags & IR3_REG_RELATIV)
- return false;
-
- if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
- IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
- IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
- IR3_REG_EVEN | IR3_REG_POS_INF))
- return false;
-
- return true;
-}
-
-/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
- * as a result of places were before RA we are not sure that it is
- * safe to eliminate. We could eliminate these earlier, but sometimes
- * they are tangled up in false-dep's, etc, so it is easier just to
- * let them exist until after RA
- */
-static void
-cleanup_self_movs(struct ir3 *ir)
-{
- foreach_block (block, &ir->block_list) {
- foreach_instr_safe (instr, &block->instr_list) {
- struct ir3_register *reg;
-
- foreach_src (reg, instr) {
- if (!reg->instr)
- continue;
-
- if (is_self_mov(reg->instr)) {
- list_delinit(&reg->instr->node);
- reg->instr = reg->instr->regs[1]->instr;
- }
- }
-
- for (unsigned i = 0; i < instr->deps_count; i++) {
- if (instr->deps[i] && is_self_mov(instr->deps[i])) {
- list_delinit(&instr->deps[i]->node);
- instr->deps[i] = instr->deps[i]->regs[1]->instr;
- }
- }
- }
- }
-}
-
-int
-ir3_postsched(struct ir3_context *cctx)
-{
- struct ir3_postsched_ctx ctx = {
- .ctx = cctx,
- };
-
- ir3_remove_nops(cctx->ir);
- cleanup_self_movs(cctx->ir);
-
- foreach_block (block, &cctx->ir->block_list) {
- sched_block(&ctx, block);
- }
-
- if (ctx.error)
- return -1;
-
- return 0;
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_ra.h b/lib/mesa/src/freedreno/ir3/ir3_ra.h
deleted file mode 100644
index 35fb618c4..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_ra.h
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef IR3_RA_H_
-#define IR3_RA_H_
-
-#include "util/bitset.h"
-
-
-static const unsigned class_sizes[] = {
- 1, 2, 3, 4,
- 4 + 4, /* txd + 1d/2d */
- 4 + 6, /* txd + 3d */
-};
-#define class_count ARRAY_SIZE(class_sizes)
-
-static const unsigned half_class_sizes[] = {
- 1, 2, 3, 4,
-};
-#define half_class_count ARRAY_SIZE(half_class_sizes)
-
-/* seems to just be used for compute shaders? Seems like vec1 and vec3
- * are sufficient (for now?)
- */
-static const unsigned high_class_sizes[] = {
- 1, 3,
-};
-#define high_class_count ARRAY_SIZE(high_class_sizes)
-
-#define total_class_count (class_count + half_class_count + high_class_count)
-
-/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS (4 * 48) /* r0 to r47 */
-#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */
-#define FIRST_HIGH_REG (4 * 48)
-/* Number of virtual regs in a given class: */
-
-static inline unsigned CLASS_REGS(unsigned i)
-{
- assert(i < class_count);
-
- return (NUM_REGS - (class_sizes[i] - 1));
-}
-
-static inline unsigned HALF_CLASS_REGS(unsigned i)
-{
- assert(i < half_class_count);
-
- return (NUM_REGS - (half_class_sizes[i] - 1));
-}
-
-static inline unsigned HIGH_CLASS_REGS(unsigned i)
-{
- assert(i < high_class_count);
-
- return (NUM_HIGH_REGS - (high_class_sizes[i] - 1));
-}
-
-#define HALF_OFFSET (class_count)
-#define HIGH_OFFSET (class_count + half_class_count)
-
-/* register-set, created one time, used for all shaders: */
-struct ir3_ra_reg_set {
- struct ra_regs *regs;
- unsigned int classes[class_count];
- unsigned int half_classes[half_class_count];
- unsigned int high_classes[high_class_count];
-
- /* The virtual register space flattens out all the classes,
- * starting with full, followed by half and then high, ie:
- *
- * scalar full (starting at zero)
- * vec2 full
- * vec3 full
- * ...
- * vecN full
- * scalar half (starting at first_half_reg)
- * vec2 half
- * ...
- * vecN half
- * scalar high (starting at first_high_reg)
- * ...
- * vecN high
- *
- */
- unsigned first_half_reg, first_high_reg;
-
- /* maps flat virtual register space to base gpr: */
- uint16_t *ra_reg_to_gpr;
- /* maps cls,gpr to flat virtual register space: */
- uint16_t **gpr_to_ra_reg;
-};
-
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
- BITSET_WORD *def; /* variables defined before used in block */
- BITSET_WORD *use; /* variables used before defined in block */
- BITSET_WORD *livein; /* which defs reach entry point of block */
- BITSET_WORD *liveout; /* which defs reach exit point of block */
-};
-
-/* additional instruction-data (per-instruction) */
-struct ir3_ra_instr_data {
- /* cached instruction 'definer' info: */
- struct ir3_instruction *defn;
- int off, sz, cls;
-};
-
-/* register-assign context, per-shader */
-struct ir3_ra_ctx {
- struct ir3_shader_variant *v;
- struct ir3 *ir;
-
- struct ir3_ra_reg_set *set;
- struct ra_graph *g;
-
- /* Are we in the scalar assignment pass? In this pass, all larger-
- * than-vec1 vales have already been assigned and pre-colored, so
- * we only consider scalar values.
- */
- bool scalar_pass;
-
- unsigned alloc_count;
- unsigned r0_xyz_nodes; /* ra node numbers for r0.[xyz] precolors */
- unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors pre-a6xx */
- /* one per class, plus one slot for arrays: */
- unsigned class_alloc_count[total_class_count + 1];
- unsigned class_base[total_class_count + 1];
- unsigned instr_cnt;
- unsigned *def, *use; /* def/use table */
- struct ir3_ra_instr_data *instrd;
-
- /* Mapping vreg name back to instruction, used select reg callback: */
- struct hash_table *name_to_instr;
-
- /* Tracking for select_reg callback */
- unsigned start_search_reg;
- unsigned max_target;
-
- /* Temporary buffer for def/use iterators
- *
- * The worst case should probably be an array w/ relative access (ie.
- * all elements are def'd or use'd), and that can't be larger than
- * the number of registers.
- *
- * NOTE we could declare this on the stack if needed, but I don't
- * think there is a need for nested iterators.
- */
- unsigned namebuf[NUM_REGS];
- unsigned namecnt, nameidx;
-};
-
-static inline int
-ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
-{
- unsigned name;
- debug_assert(id->cls >= 0);
- debug_assert(id->cls < total_class_count); /* we shouldn't get arrays here.. */
- name = ctx->class_base[id->cls] + id->defn->name;
- debug_assert(name < ctx->alloc_count);
- return name;
-}
-
-/* Get the scalar name of the n'th component of an instruction dst: */
-static inline int
-scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n)
-{
- if (ctx->scalar_pass) {
- if (instr->opc == OPC_META_SPLIT) {
- debug_assert(n == 0); /* split results in a scalar */
- struct ir3_instruction *src = instr->regs[1]->instr;
- return scalar_name(ctx, src, instr->split.off);
- } else if (instr->opc == OPC_META_COLLECT) {
- debug_assert(n < (instr->regs_count + 1));
- struct ir3_instruction *src = instr->regs[n + 1]->instr;
- return scalar_name(ctx, src, 0);
- }
- } else {
- debug_assert(n == 0);
- }
-
- return ra_name(ctx, &ctx->instrd[instr->ip]) + n;
-}
-
-static inline bool
-writes_gpr(struct ir3_instruction *instr)
-{
- if (dest_regs(instr) == 0)
- return false;
- /* is dest a normal temp register: */
- struct ir3_register *reg = instr->regs[0];
- debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)));
- if ((reg_num(reg) == REG_A0) ||
- (reg->num == regid(REG_P0, 0)))
- return false;
- return true;
-}
-
-#define NO_NAME ~0
-
-/*
- * Iterators to iterate the vreg names of an instructions def's and use's
- */
-
-static inline unsigned
-__ra_name_cnt(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
-{
- if (!instr)
- return 0;
-
- /* Filter special cases, ie. writes to a0.x or p0.x, or non-ssa: */
- if (!writes_gpr(instr) || (instr->regs[0]->flags & IR3_REG_ARRAY))
- return 0;
-
- /* in scalar pass, we aren't considering virtual register classes, ie.
- * if an instruction writes a vec2, then it defines two different scalar
- * register names.
- */
- if (ctx->scalar_pass)
- return dest_regs(instr);
-
- return 1;
-}
-
-#define foreach_name_n(__name, __n, __ctx, __instr) \
- for (unsigned __cnt = __ra_name_cnt(__ctx, __instr), __n = 0, __name; \
- (__n < __cnt) && ({__name = scalar_name(__ctx, __instr, __n); 1;}); __n++)
-
-#define foreach_name(__name, __ctx, __instr) \
- foreach_name_n(__name, __n, __ctx, __instr)
-
-static inline unsigned
-__ra_itr_pop(struct ir3_ra_ctx *ctx)
-{
- if (ctx->nameidx < ctx->namecnt)
- return ctx->namebuf[ctx->nameidx++];
- return NO_NAME;
-}
-
-static inline void
-__ra_itr_push(struct ir3_ra_ctx *ctx, unsigned name)
-{
- assert(ctx->namecnt < ARRAY_SIZE(ctx->namebuf));
- ctx->namebuf[ctx->namecnt++] = name;
-}
-
-static inline unsigned
-__ra_init_def_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
-{
- /* nested use is not supported: */
- assert(ctx->namecnt == ctx->nameidx);
-
- ctx->namecnt = ctx->nameidx = 0;
-
- if (!writes_gpr(instr))
- return NO_NAME;
-
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
- struct ir3_register *dst = instr->regs[0];
-
- if (dst->flags & IR3_REG_ARRAY) {
- struct ir3_array *arr = ir3_lookup_array(ctx->ir, dst->array.id);
-
- /* indirect write is treated like a write to all array
- * elements, since we don't know which one is actually
- * written:
- */
- if (dst->flags & IR3_REG_RELATIV) {
- for (unsigned i = 0; i < arr->length; i++) {
- __ra_itr_push(ctx, arr->base + i);
- }
- } else {
- __ra_itr_push(ctx, arr->base + dst->array.offset);
- debug_assert(dst->array.offset < arr->length);
- }
- } else if (id->defn == instr) {
- foreach_name_n (name, i, ctx, instr) {
- /* tex instructions actually have a wrmask, and
- * don't touch masked out components. We can't do
- * anything useful about that in the first pass,
- * but in the scalar pass we can realize these
- * registers are available:
- */
- if (ctx->scalar_pass && is_tex_or_prefetch(instr) &&
- !(instr->regs[0]->wrmask & (1 << i)))
- continue;
- __ra_itr_push(ctx, name);
- }
- }
-
- return __ra_itr_pop(ctx);
-}
-
-static inline unsigned
-__ra_init_use_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
-{
- /* nested use is not supported: */
- assert(ctx->namecnt == ctx->nameidx);
-
- ctx->namecnt = ctx->nameidx = 0;
-
- struct ir3_register *reg;
- foreach_src (reg, instr) {
- if (reg->flags & IR3_REG_ARRAY) {
- struct ir3_array *arr =
- ir3_lookup_array(ctx->ir, reg->array.id);
-
- /* indirect read is treated like a read from all array
- * elements, since we don't know which one is actually
- * read:
- */
- if (reg->flags & IR3_REG_RELATIV) {
- for (unsigned i = 0; i < arr->length; i++) {
- __ra_itr_push(ctx, arr->base + i);
- }
- } else {
- __ra_itr_push(ctx, arr->base + reg->array.offset);
- debug_assert(reg->array.offset < arr->length);
- }
- } else {
- foreach_name_n (name, i, ctx, reg->instr) {
- /* split takes a src w/ wrmask potentially greater
- * than 0x1, but it really only cares about a single
- * component. This shows up in splits coming out of
- * a tex instruction w/ wrmask=.z, for example.
- */
- if (ctx->scalar_pass && (instr->opc == OPC_META_SPLIT) &&
- !(i == instr->split.off))
- continue;
- __ra_itr_push(ctx, name);
- }
- }
- }
-
- return __ra_itr_pop(ctx);
-}
-
-#define foreach_def(__name, __ctx, __instr) \
- for (unsigned __name = __ra_init_def_itr(__ctx, __instr); \
- __name != NO_NAME; __name = __ra_itr_pop(__ctx))
-
-#define foreach_use(__name, __ctx, __instr) \
- for (unsigned __name = __ra_init_use_itr(__ctx, __instr); \
- __name != NO_NAME; __name = __ra_itr_pop(__ctx))
-
-int ra_size_to_class(unsigned sz, bool half, bool high);
-int ra_class_to_size(unsigned class, bool *half, bool *high);
-
-#endif /* IR3_RA_H_ */
diff --git a/lib/mesa/src/freedreno/ir3/ir3_ra_regset.c b/lib/mesa/src/freedreno/ir3/ir3_ra_regset.c
deleted file mode 100644
index c0abdf4ff..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_ra_regset.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/u_math.h"
-#include "util/register_allocate.h"
-#include "util/ralloc.h"
-#include "util/bitset.h"
-
-#include "ir3.h"
-#include "ir3_compiler.h"
-#include "ir3_ra.h"
-
-static void
-build_q_values(unsigned int **q_values, unsigned off,
- const unsigned *sizes, unsigned count)
-{
- for (unsigned i = 0; i < count; i++) {
- q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
-
- /* From register_allocate.c:
- *
- * q(B,C) (indexed by C, B is this register class) in
- * Runeson/Nyström paper. This is "how many registers of B could
- * the worst choice register from C conflict with".
- *
- * If we just let the register allocation algorithm compute these
- * values, is extremely expensive. However, since all of our
- * registers are laid out, we can very easily compute them
- * ourselves. View the register from C as fixed starting at GRF n
- * somewhere in the middle, and the register from B as sliding back
- * and forth. Then the first register to conflict from B is the
- * one starting at n - class_size[B] + 1 and the last register to
- * conflict will start at n + class_size[B] - 1. Therefore, the
- * number of conflicts from B is class_size[B] + class_size[C] - 1.
- *
- * +-+-+-+-+-+-+ +-+-+-+-+-+-+
- * B | | | | | |n| --> | | | | | | |
- * +-+-+-+-+-+-+ +-+-+-+-+-+-+
- * +-+-+-+-+-+
- * C |n| | | | |
- * +-+-+-+-+-+
- *
- * (Idea copied from brw_fs_reg_allocate.cpp)
- */
- for (unsigned j = 0; j < count; j++)
- q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
- }
-}
-
-static void
-setup_conflicts(struct ir3_ra_reg_set *set)
-{
- unsigned reg;
-
- reg = 0;
- for (unsigned i = 0; i < class_count; i++) {
- for (unsigned j = 0; j < CLASS_REGS(i); j++) {
- for (unsigned br = j; br < j + class_sizes[i]; br++) {
- ra_add_transitive_reg_conflict(set->regs, br, reg);
- }
-
- reg++;
- }
- }
-
- for (unsigned i = 0; i < half_class_count; i++) {
- for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
- for (unsigned br = j; br < j + half_class_sizes[i]; br++) {
- ra_add_transitive_reg_conflict(set->regs,
- br + set->first_half_reg, reg);
- }
-
- reg++;
- }
- }
-
- for (unsigned i = 0; i < high_class_count; i++) {
- for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
- for (unsigned br = j; br < j + high_class_sizes[i]; br++) {
- ra_add_transitive_reg_conflict(set->regs,
- br + set->first_high_reg, reg);
- }
-
- reg++;
- }
- }
-}
-
-/* One-time setup of RA register-set, which describes all the possible
- * "virtual" registers and their interferences. Ie. double register
- * occupies (and conflicts with) two single registers, and so forth.
- * Since registers do not need to be aligned to their class size, they
- * can conflict with other registers in the same class too. Ie:
- *
- * Single (base) | Double
- * --------------+---------------
- * R0 | D0
- * R1 | D0 D1
- * R2 | D1 D2
- * R3 | D2
- * .. and so on..
- *
- * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
- * really just four scalar registers. Don't let that confuse you.)
- */
-struct ir3_ra_reg_set *
-ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
-{
- struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
- unsigned ra_reg_count, reg, base;
-
- /* calculate # of regs across all classes: */
- ra_reg_count = 0;
- for (unsigned i = 0; i < class_count; i++)
- ra_reg_count += CLASS_REGS(i);
- for (unsigned i = 0; i < half_class_count; i++)
- ra_reg_count += HALF_CLASS_REGS(i);
- for (unsigned i = 0; i < high_class_count; i++)
- ra_reg_count += HIGH_CLASS_REGS(i);
-
- /* allocate the reg-set.. */
- set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
- set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
- set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
-
- /* .. and classes */
- reg = 0;
- for (unsigned i = 0; i < class_count; i++) {
- set->classes[i] = ra_alloc_reg_class(set->regs);
-
- set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
-
- for (unsigned j = 0; j < CLASS_REGS(i); j++) {
- ra_class_add_reg(set->regs, set->classes[i], reg);
-
- set->ra_reg_to_gpr[reg] = j;
- set->gpr_to_ra_reg[i][j] = reg;
-
- reg++;
- }
- }
-
- set->first_half_reg = reg;
- base = HALF_OFFSET;
-
- for (unsigned i = 0; i < half_class_count; i++) {
- set->half_classes[i] = ra_alloc_reg_class(set->regs);
-
- set->gpr_to_ra_reg[base + i] =
- ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
-
- for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
- ra_class_add_reg(set->regs, set->half_classes[i], reg);
-
- set->ra_reg_to_gpr[reg] = j;
- set->gpr_to_ra_reg[base + i][j] = reg;
-
- reg++;
- }
- }
-
- set->first_high_reg = reg;
- base = HIGH_OFFSET;
-
- for (unsigned i = 0; i < high_class_count; i++) {
- set->high_classes[i] = ra_alloc_reg_class(set->regs);
-
- set->gpr_to_ra_reg[base + i] =
- ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
-
- for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
- ra_class_add_reg(set->regs, set->high_classes[i], reg);
-
- set->ra_reg_to_gpr[reg] = j;
- set->gpr_to_ra_reg[base + i][j] = reg;
-
- reg++;
- }
- }
-
- /* starting a6xx, half precision regs conflict w/ full precision regs: */
- if (compiler->gpu_id >= 600) {
- for (unsigned i = 0; i < CLASS_REGS(0) / 2; i++) {
- unsigned freg = set->gpr_to_ra_reg[0][i];
- unsigned hreg0 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 0];
- unsigned hreg1 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 1];
-
- ra_add_transitive_reg_pair_conflict(set->regs, freg, hreg0, hreg1);
- }
-
- setup_conflicts(set);
-
- // TODO also need to update q_values, but for now:
- ra_set_finalize(set->regs, NULL);
- } else {
- setup_conflicts(set);
-
- /* allocate and populate q_values: */
- unsigned int **q_values = ralloc_array(set, unsigned *, total_class_count);
-
- build_q_values(q_values, 0, class_sizes, class_count);
- build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
- build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
-
- ra_set_finalize(set->regs, q_values);
-
- ralloc_free(q_values);
- }
-
- return set;
-}
-
-int
-ra_size_to_class(unsigned sz, bool half, bool high)
-{
- if (high) {
- for (unsigned i = 0; i < high_class_count; i++)
- if (high_class_sizes[i] >= sz)
- return i + HIGH_OFFSET;
- } else if (half) {
- for (unsigned i = 0; i < half_class_count; i++)
- if (half_class_sizes[i] >= sz)
- return i + HALF_OFFSET;
- } else {
- for (unsigned i = 0; i < class_count; i++)
- if (class_sizes[i] >= sz)
- return i;
- }
- debug_assert(0);
- return -1;
-}
-
-int
-ra_class_to_size(unsigned class, bool *half, bool *high)
-{
- *half = *high = false;
-
- if (class >= HIGH_OFFSET) {
- *high = true;
- return high_class_sizes[class - HIGH_OFFSET];
- } else if (class >= HALF_OFFSET) {
- *half = true;
- return half_class_sizes[class - HALF_OFFSET];
- } else {
- return class_sizes[class];
- }
-}
diff --git a/lib/mesa/src/freedreno/ir3/tests/disasm.c b/lib/mesa/src/freedreno/ir3/tests/disasm.c
deleted file mode 100644
index ee9457839..000000000
--- a/lib/mesa/src/freedreno/ir3/tests/disasm.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright © 2016 Broadcom
- * Copyright © 2020 Google LLC
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "util/macros.h"
-#include "instr-a3xx.h"
-
-#define INSTR_6XX(i, d) { .gpu_id = 630, .instr = #i, .expected = d }
-
-static const struct test {
- int gpu_id;
- const char *instr;
- const char *expected;
-} tests[] = {
- /* cat0 */
- INSTR_6XX(00000000_00000000, "nop"),
- INSTR_6XX(00000200_00000000, "(rpt2)nop"),
- INSTR_6XX(03000000_00000000, "end"),
- INSTR_6XX(00800000_00000004, "br p0.x, #4"),
- INSTR_6XX(00900000_00000003, "br !p0.x, #3"),
-
- /* cat1 */
- INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"),
- INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"),
- INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"),
- INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"),
- INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"),
- INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"),
-
- /* cat2 */
- INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"),
- INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"),
- INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"),
- INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"),
-
- /* cat3 */
- INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"),
- INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"),
- INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"),
- INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"),
- INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"),
- INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"),
-
- /* cat4 */
- INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"),
-
- /* cat5 */
- INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"),
-
- /* cat6 */
- INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x"),
- INSTR_6XX(c1100000_c1000000, "stl.f16 l[0], hr0.x, hr48.y"),
-
- /* discard stuff */
- INSTR_6XX(42b400f8_20010004, "cmps.s.eq p0.x, r1.x, 1"),
- INSTR_6XX(02800000_00000000, "kill p0.x"),
-
- /* Immediates */
- INSTR_6XX(40100007_68000008, "add.f r1.w, r2.x, (neg)(0.0)"),
- INSTR_6XX(40100007_68010008, "add.f r1.w, r2.x, (neg)(0.5)"),
- INSTR_6XX(40100007_68020008, "add.f r1.w, r2.x, (neg)(1.0)"),
- INSTR_6XX(40100007_68030008, "add.f r1.w, r2.x, (neg)(2.0)"),
- INSTR_6XX(40100007_68040008, "add.f r1.w, r2.x, (neg)(e)"),
- INSTR_6XX(40100007_68050008, "add.f r1.w, r2.x, (neg)(pi)"),
- INSTR_6XX(40100007_68060008, "add.f r1.w, r2.x, (neg)(1/pi)"),
- INSTR_6XX(40100007_68070008, "add.f r1.w, r2.x, (neg)(1/log2(e))"),
- INSTR_6XX(40100007_68080008, "add.f r1.w, r2.x, (neg)(log2(e))"),
- INSTR_6XX(40100007_68090008, "add.f r1.w, r2.x, (neg)(1/log2(10))"),
- INSTR_6XX(40100007_680a0008, "add.f r1.w, r2.x, (neg)(log2(10))"),
- INSTR_6XX(40100007_680b0008, "add.f r1.w, r2.x, (neg)(4.0)"),
-};
-
-static void
-trim(char *string)
-{
- for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
- string[len - 1] = 0;
-}
-
-int
-main(int argc, char **argv)
-{
- int retval = 0;
- const int output_size = 4096;
- char *disasm_output = malloc(output_size);
- FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
- if (!fdisasm) {
- fprintf(stderr, "failed to fmemopen\n");
- return 1;
- }
-
- for (int i = 0; i < ARRAY_SIZE(tests); i++) {
- const struct test *test = &tests[i];
- printf("Testing a%d %s: \"%s\"...\n",
- test->gpu_id, test->instr, test->expected);
-
- rewind(fdisasm);
- memset(disasm_output, 0, output_size);
-
- uint32_t code[2] = {
- strtoll(&test->instr[9], NULL, 16),
- strtoll(&test->instr[0], NULL, 16),
- };
- disasm_a3xx(code, ARRAY_SIZE(code), 0, fdisasm, test->gpu_id);
- fflush(fdisasm);
-
- trim(disasm_output);
-
- if (strcmp(disasm_output, test->expected) != 0) {
- printf("FAIL\n");
- printf(" Expected: \"%s\"\n", test->expected);
- printf(" Got: \"%s\"\n", disasm_output);
- retval = 1;
- continue;
- }
- }
-
- fclose(fdisasm);
- free(disasm_output);
-
- return retval;
-}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c b/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c
deleted file mode 100644
index 550c66174..000000000
--- a/lib/mesa/src/freedreno/vulkan/tu_clear_blit.c
+++ /dev/null
@@ -1,2330 +0,0 @@
-/*
- * Copyright 2019-2020 Valve Corporation
- * SPDX-License-Identifier: MIT
- *
- * Authors:
- * Jonathan Marek <jonathan@marek.ca>
- */
-
-#include "tu_private.h"
-
-#include "tu_cs.h"
-#include "vk_format.h"
-
-#include "util/format_r11g11b10f.h"
-#include "util/format_rgb9e5.h"
-#include "util/format_srgb.h"
-#include "util/u_half.h"
-
-/* helper functions previously in tu_formats.c */
-
-static uint32_t
-tu_pack_mask(int bits)
-{
- assert(bits <= 32);
- return (1ull << bits) - 1;
-}
-
-static uint32_t
-tu_pack_float32_for_unorm(float val, int bits)
-{
- const uint32_t max = tu_pack_mask(bits);
- if (val < 0.0f)
- return 0;
- else if (val > 1.0f)
- return max;
- else
- return _mesa_lroundevenf(val * (float) max);
-}
-
-static uint32_t
-tu_pack_float32_for_snorm(float val, int bits)
-{
- const int32_t max = tu_pack_mask(bits - 1);
- int32_t tmp;
- if (val < -1.0f)
- tmp = -max;
- else if (val > 1.0f)
- tmp = max;
- else
- tmp = _mesa_lroundevenf(val * (float) max);
-
- return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_uscaled(float val, int bits)
-{
- const uint32_t max = tu_pack_mask(bits);
- if (val < 0.0f)
- return 0;
- else if (val > (float) max)
- return max;
- else
- return (uint32_t) val;
-}
-
-static uint32_t
-tu_pack_float32_for_sscaled(float val, int bits)
-{
- const int32_t max = tu_pack_mask(bits - 1);
- const int32_t min = -max - 1;
- int32_t tmp;
- if (val < (float) min)
- tmp = min;
- else if (val > (float) max)
- tmp = max;
- else
- tmp = (int32_t) val;
-
- return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_uint32_for_uint(uint32_t val, int bits)
-{
- return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_int32_for_sint(int32_t val, int bits)
-{
- return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_sfloat(float val, int bits)
-{
- assert(bits == 16 || bits == 32);
- return bits == 16 ? util_float_to_half(val) : fui(val);
-}
-
-union tu_clear_component_value {
- float float32;
- int32_t int32;
- uint32_t uint32;
-};
-
-static uint32_t
-tu_pack_clear_component_value(union tu_clear_component_value val,
- const struct util_format_channel_description *ch)
-{
- uint32_t packed;
-
- switch (ch->type) {
- case UTIL_FORMAT_TYPE_UNSIGNED:
- /* normalized, scaled, or pure integer */
- if (ch->normalized)
- packed = tu_pack_float32_for_unorm(val.float32, ch->size);
- else if (ch->pure_integer)
- packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
- else
- packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
- break;
- case UTIL_FORMAT_TYPE_SIGNED:
- /* normalized, scaled, or pure integer */
- if (ch->normalized)
- packed = tu_pack_float32_for_snorm(val.float32, ch->size);
- else if (ch->pure_integer)
- packed = tu_pack_int32_for_sint(val.int32, ch->size);
- else
- packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
- break;
- case UTIL_FORMAT_TYPE_FLOAT:
- packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
- break;
- default:
- unreachable("unexpected channel type");
- packed = 0;
- break;
- }
-
- assert((packed & tu_pack_mask(ch->size)) == packed);
- return packed;
-}
-
-static const struct util_format_channel_description *
-tu_get_format_channel_description(const struct util_format_description *desc,
- int comp)
-{
- switch (desc->swizzle[comp]) {
- case PIPE_SWIZZLE_X:
- return &desc->channel[0];
- case PIPE_SWIZZLE_Y:
- return &desc->channel[1];
- case PIPE_SWIZZLE_Z:
- return &desc->channel[2];
- case PIPE_SWIZZLE_W:
- return &desc->channel[3];
- default:
- return NULL;
- }
-}
-
-static union tu_clear_component_value
-tu_get_clear_component_value(const VkClearValue *val, int comp,
- enum util_format_colorspace colorspace)
-{
- assert(comp < 4);
-
- union tu_clear_component_value tmp;
- switch (colorspace) {
- case UTIL_FORMAT_COLORSPACE_ZS:
- assert(comp < 2);
- if (comp == 0)
- tmp.float32 = val->depthStencil.depth;
- else
- tmp.uint32 = val->depthStencil.stencil;
- break;
- case UTIL_FORMAT_COLORSPACE_SRGB:
- if (comp < 3) {
- tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
- break;
- }
- default:
- assert(comp < 4);
- tmp.uint32 = val->color.uint32[comp];
- break;
- }
-
- return tmp;
-}
-
-/* r2d_ = BLIT_OP_SCALE operations */
-
-static enum a6xx_2d_ifmt
-format_to_ifmt(enum a6xx_format fmt)
-{
- switch (fmt) {
- case FMT6_A8_UNORM:
- case FMT6_8_UNORM:
- case FMT6_8_SNORM:
- case FMT6_8_8_UNORM:
- case FMT6_8_8_SNORM:
- case FMT6_8_8_8_8_UNORM:
- case FMT6_8_8_8_X8_UNORM:
- case FMT6_8_8_8_8_SNORM:
- case FMT6_4_4_4_4_UNORM:
- case FMT6_5_5_5_1_UNORM:
- case FMT6_5_6_5_UNORM:
- case FMT6_Z24_UNORM_S8_UINT:
- case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
- return R2D_UNORM8;
-
- case FMT6_32_UINT:
- case FMT6_32_SINT:
- case FMT6_32_32_UINT:
- case FMT6_32_32_SINT:
- case FMT6_32_32_32_32_UINT:
- case FMT6_32_32_32_32_SINT:
- return R2D_INT32;
-
- case FMT6_16_UINT:
- case FMT6_16_SINT:
- case FMT6_16_16_UINT:
- case FMT6_16_16_SINT:
- case FMT6_16_16_16_16_UINT:
- case FMT6_16_16_16_16_SINT:
- case FMT6_10_10_10_2_UINT:
- return R2D_INT16;
-
- case FMT6_8_UINT:
- case FMT6_8_SINT:
- case FMT6_8_8_UINT:
- case FMT6_8_8_SINT:
- case FMT6_8_8_8_8_UINT:
- case FMT6_8_8_8_8_SINT:
- return R2D_INT8;
-
- case FMT6_16_UNORM:
- case FMT6_16_SNORM:
- case FMT6_16_16_UNORM:
- case FMT6_16_16_SNORM:
- case FMT6_16_16_16_16_UNORM:
- case FMT6_16_16_16_16_SNORM:
- case FMT6_32_FLOAT:
- case FMT6_32_32_FLOAT:
- case FMT6_32_32_32_32_FLOAT:
- return R2D_FLOAT32;
-
- case FMT6_16_FLOAT:
- case FMT6_16_16_FLOAT:
- case FMT6_16_16_16_16_FLOAT:
- case FMT6_11_11_10_FLOAT:
- case FMT6_10_10_10_2_UNORM:
- case FMT6_10_10_10_2_UNORM_DEST:
- return R2D_FLOAT16;
-
- default:
- unreachable("bad format");
- return 0;
- }
-}
-
-static void
-r2d_coords(struct tu_cs *cs,
- const VkOffset2D *dst,
- const VkOffset2D *src,
- const VkExtent2D *extent)
-{
- tu_cs_emit_regs(cs,
- A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
- A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
-
- if (!src)
- return;
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
- A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
- A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
- A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
-}
-
-static void
-r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
-{
- uint32_t clear_value[4] = {};
-
- switch (format) {
- case VK_FORMAT_X8_D24_UNORM_PACK32:
- case VK_FORMAT_D24_UNORM_S8_UINT:
- /* cleared as r8g8b8a8_unorm using special format */
- clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
- clear_value[1] = clear_value[0] >> 8;
- clear_value[2] = clear_value[0] >> 16;
- clear_value[3] = val->depthStencil.stencil;
- break;
- case VK_FORMAT_D16_UNORM:
- case VK_FORMAT_D32_SFLOAT:
- /* R2D_FLOAT32 */
- clear_value[0] = fui(val->depthStencil.depth);
- break;
- case VK_FORMAT_S8_UINT:
- clear_value[0] = val->depthStencil.stencil;
- break;
- case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
- /* cleared as UINT32 */
- clear_value[0] = float3_to_rgb9e5(val->color.float32);
- break;
- default:
- assert(!vk_format_is_depth_or_stencil(format));
- const struct util_format_description *desc = vk_format_description(format);
- enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
-
- assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
- format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
-
- for (unsigned i = 0; i < desc->nr_channels; i++) {
- const struct util_format_channel_description *ch = &desc->channel[i];
- if (ifmt == R2D_UNORM8) {
- float linear = val->color.float32[i];
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
- linear = util_format_linear_to_srgb_float(val->color.float32[i]);
-
- if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
- clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
- else
- clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
- } else if (ifmt == R2D_FLOAT16) {
- clear_value[i] = util_float_to_half(val->color.float32[i]);
- } else {
- assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
- ifmt == R2D_INT16 || ifmt == R2D_INT8);
- clear_value[i] = val->color.uint32[i];
- }
- }
- break;
- }
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
- tu_cs_emit_array(cs, clear_value, 4);
-}
-
-static void
-r2d_src(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *iview,
- uint32_t layer,
- bool linear_filter)
-{
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
- tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
- COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
- tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
- tu_cs_image_ref_2d(cs, iview, layer, true);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
- tu_cs_image_flag_ref(cs, iview, layer);
-}
-
-static void
-r2d_src_buffer(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- uint64_t va, uint32_t pitch,
- uint32_t width, uint32_t height)
-{
- struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
-
- tu_cs_emit_regs(cs,
- A6XX_SP_PS_2D_SRC_INFO(
- .color_format = format.fmt,
- .color_swap = format.swap,
- .srgb = vk_format_is_srgb(vk_format),
- .unk20 = 1,
- .unk22 = 1),
- A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
- A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
- A6XX_SP_PS_2D_SRC_HI(va >> 32),
- A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
-}
-
-static void
-r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
-{
- assert(iview->image->samples == 1);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
- tu_cs_emit(cs, iview->RB_2D_DST_INFO);
- tu_cs_image_ref_2d(cs, iview, layer, false);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
- tu_cs_image_flag_ref(cs, iview, layer);
-}
-
-static void
-r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
-{
- struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_2D_DST_INFO(
- .color_format = format.fmt,
- .color_swap = format.swap,
- .srgb = vk_format_is_srgb(vk_format)),
- A6XX_RB_2D_DST_LO((uint32_t) va),
- A6XX_RB_2D_DST_HI(va >> 32),
- A6XX_RB_2D_DST_SIZE(.pitch = pitch));
-}
-
-static void
-r2d_setup_common(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- enum a6xx_rotation rotation,
- bool clear,
- uint8_t mask,
- bool scissor)
-{
- enum a6xx_format format = tu6_base_format(vk_format);
- enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
- uint32_t unknown_8c01 = 0;
-
- if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
- /* preserve depth channels */
- if (mask == 0x8)
- unknown_8c01 = 0x00084001;
- /* preserve stencil channel */
- if (mask == 0x7)
- unknown_8c01 = 0x08000041;
- }
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
- tu_cs_emit(cs, unknown_8c01);
-
- uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
- .scissor = scissor,
- .rotate = rotation,
- .solid_color = clear,
- .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
- .color_format = format,
- .mask = 0xf,
- .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
- ).value;
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
- tu_cs_emit(cs, blit_cntl);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
- tu_cs_emit(cs, blit_cntl);
-
- if (format == FMT6_10_10_10_2_UNORM_DEST)
- format = FMT6_16_16_16_16_FLOAT;
-
- tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
- .sint = vk_format_is_sint(vk_format),
- .uint = vk_format_is_uint(vk_format),
- .color_format = format,
- .srgb = vk_format_is_srgb(vk_format),
- .mask = 0xf));
-}
-
-static void
-r2d_setup(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- enum a6xx_rotation rotation,
- bool clear,
- uint8_t mask)
-{
- const struct tu_physical_device *phys_dev = cmd->device->physical_device;
-
- /* TODO: flushing with barriers instead of blindly always flushing */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
-
- tu_cs_emit_wfi(cs);
- tu_cs_emit_regs(cs,
- A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
-
- r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
-}
-
-static void
-r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- tu_cs_emit_pkt7(cs, CP_BLIT, 1);
- tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
-
- /* TODO: flushing with barriers instead of blindly always flushing */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
-}
-
-/* r3d_ = shader path operations */
-
-static void
-r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
-{
- static const instr_t vs_code[] = {
- /* r0.xyz = r0.w ? c1.xyz : c0.xyz
- * r1.xy = r0.w ? c1.zw : c0.zw
- * r0.w = 1.0f
- */
- { .cat3 = {
- .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
- .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
- .src2 = 3,
- .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
- } },
- { .cat3 = {
- .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
- .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
- .src2 = 3,
- .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
- } },
- { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
- .src_im = 1, .fim_val = 1.0f } },
- { .cat0 = { .opc = OPC_END } },
- };
-#define FS_OFFSET (16 * sizeof(instr_t))
- STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
-
- /* vs inputs: only vtx id in r0.w */
- tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 7);
- tu_cs_emit(cs, 0x00000000);
- tu_cs_emit(cs, 0xfcfcfc00 | A6XX_VFD_CONTROL_1_REGID4VTX(3));
- tu_cs_emit(cs, 0x0000fcfc);
- tu_cs_emit(cs, 0xfcfcfcfc);
- tu_cs_emit(cs, 0x000000fc);
- tu_cs_emit(cs, 0x0000fcfc);
- tu_cs_emit(cs, 0x00000000);
-
- /* vs outputs: position in r0.xyzw, blit coords in r1.xy */
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
- tu_cs_emit(cs, blit ? 0xffffffcf : 0xffffffff);
- tu_cs_emit(cs, 0xffffffff);
- tu_cs_emit(cs, 0xffffffff);
- tu_cs_emit(cs, 0xffffffff);
-
- tu_cs_emit_regs(cs, A6XX_SP_VS_OUT_REG(0,
- .a_regid = 0, .a_compmask = 0xf,
- .b_regid = 4, .b_compmask = 0x3));
- tu_cs_emit_regs(cs, A6XX_SP_VS_VPC_DST_REG(0, .outloc0 = 0, .outloc1 = 4));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
- tu_cs_emit(cs, 0xff00ff00 |
- COND(blit, A6XX_VPC_CNTL_0_VARYING) |
- A6XX_VPC_CNTL_0_NUMNONPOSVAR(blit ? 8 : 0));
-
- tu_cs_emit_regs(cs, A6XX_VPC_PACK(
- .positionloc = 0,
- .psizeloc = 0xff,
- .stride_in_vpc = blit ? 6 : 4));
- tu_cs_emit_regs(cs, A6XX_SP_PRIMITIVE_CNTL(.vsout = blit ? 2 : 1));
- tu_cs_emit_regs(cs,
- A6XX_PC_PRIMITIVE_CNTL_0(),
- A6XX_PC_PRIMITIVE_CNTL_1(.stride_in_vpc = blit ? 6 : 4));
-
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
- tu_cs_emit(cs, blit ? 0xe000 : 0); // I think this can just be 0
- for (uint32_t i = 1; i < 8; i++)
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
- for (uint32_t i = 0; i < 8; i++)
- tu_cs_emit(cs, 0x99999999);
-
- /* fs inputs: none, prefetch in blit case */
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + blit);
- tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(blit) |
- A6XX_SP_FS_PREFETCH_CNTL_UNK4(0xfc) |
- 0x7000);
- if (blit) {
- tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(4) |
- A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(0) |
- A6XX_SP_FS_PREFETCH_CMD_TEX_ID(0) |
- A6XX_SP_FS_PREFETCH_CMD_DST(0) |
- A6XX_SP_FS_PREFETCH_CMD_WRMASK(0xf) |
- A6XX_SP_FS_PREFETCH_CMD_CMD(0x4));
- }
-
- tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
- tu_cs_emit(cs, 0x3); // XXX blob uses 3 in blit path
- tu_cs_emit(cs, 0xfcfcfcfc);
- tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(blit ? 0 : 0xfc) |
- A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(0xfc) |
- 0xfc00fc00);
- tu_cs_emit(cs, 0xfcfcfcfc);
- tu_cs_emit(cs, 0xfcfc);
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_UNKNOWN_B980(blit ? 3 : 1));
- tu_cs_emit_regs(cs, A6XX_GRAS_CNTL(.varying = blit));
- tu_cs_emit_regs(cs,
- A6XX_RB_RENDER_CONTROL0(.varying = blit, .unk10 = blit),
- A6XX_RB_RENDER_CONTROL1());
-
- tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_CNTL());
- tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8101());
- tu_cs_emit_regs(cs, A6XX_GRAS_SAMPLE_CNTL());
-
- /* shaders */
- struct ts_cs_memory shaders = { };
- VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders);
- assert(result == VK_SUCCESS);
-
- memcpy(shaders.map, vs_code, sizeof(vs_code));
-
- instr_t *fs = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
- for (uint32_t i = 0; i < num_rts; i++) {
- /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
- fs[i] = (instr_t) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
- .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 } };
- }
- fs[num_rts] = (instr_t) { .cat0 = { .opc = OPC_END } };
- /* note: assumed <= 16 instructions (MAX_RTS is 8) */
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
- tu_cs_emit_regs(cs,
- A6XX_HLSQ_VS_CNTL(.constlen = 8, .enabled = true),
- A6XX_HLSQ_HS_CNTL(),
- A6XX_HLSQ_DS_CNTL(),
- A6XX_HLSQ_GS_CNTL());
- tu_cs_emit_regs(cs, A6XX_HLSQ_FS_CNTL(.constlen = 4 * num_rts, .enabled = true));
-
- tu_cs_emit_regs(cs,
- A6XX_SP_VS_CONFIG(.enabled = true),
- A6XX_SP_VS_INSTRLEN(1));
- tu_cs_emit_regs(cs, A6XX_SP_HS_CONFIG());
- tu_cs_emit_regs(cs, A6XX_SP_DS_CONFIG());
- tu_cs_emit_regs(cs, A6XX_SP_GS_CONFIG());
- tu_cs_emit_regs(cs,
- A6XX_SP_FS_CONFIG(.enabled = true, .ntex = blit, .nsamp = blit),
- A6XX_SP_FS_INSTRLEN(1));
-
- tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
- .threadsize = FOUR_QUADS,
- .fullregfootprint = 2,
- .mergedregs = true));
- tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
- .varying = blit,
- .threadsize = FOUR_QUADS,
- /* could this be 0 in !blit && !num_rts case ? */
- .fullregfootprint = MAX2(1, num_rts),
- .mergedregs = true)); /* note: tu_pipeline also sets 0x1000000 bit */
-
- tu_cs_emit_regs(cs, A6XX_SP_IBO_COUNT(0));
-
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, shaders.iova);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OBJ_START_LO, 2);
- tu_cs_emit_qw(cs, shaders.iova);
-
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OBJ_START_LO, 2);
- tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_CL_CNTL(
- .persp_division_disable = 1,
- .vp_xform_disable = 1,
- .vp_clip_code_ignore = 1,
- .clip_disable = 1),
- A6XX_GRAS_UNKNOWN_8001(0));
- tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
- A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
- tu_cs_emit_regs(cs,
- A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
- A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
-}
-
-static void
-r3d_coords_raw(struct tu_cs *cs, const float *coords)
-{
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(2));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
-}
-
-static void
-r3d_coords(struct tu_cs *cs,
- const VkOffset2D *dst,
- const VkOffset2D *src,
- const VkExtent2D *extent)
-{
- int32_t src_x1 = src ? src->x : 0;
- int32_t src_y1 = src ? src->y : 0;
- r3d_coords_raw(cs, (float[]) {
- dst->x, dst->y,
- src_x1, src_y1,
- dst->x + extent->width, dst->y + extent->height,
- src_x1 + extent->width, src_y1 + extent->height,
- });
-}
-
-static void
-r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
-{
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- switch (format) {
- case VK_FORMAT_X8_D24_UNORM_PACK32:
- case VK_FORMAT_D24_UNORM_S8_UINT: {
- /* cleared as r8g8b8a8_unorm using special format */
- uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
- tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
- tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
- tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
- tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
- } break;
- case VK_FORMAT_D16_UNORM:
- case VK_FORMAT_D32_SFLOAT:
- tu_cs_emit(cs, fui(val->depthStencil.depth));
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- break;
- case VK_FORMAT_S8_UINT:
- tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- break;
- default:
- /* as color formats use clear value as-is */
- assert(!vk_format_is_depth_or_stencil(format));
- tu_cs_emit_array(cs, val->color.uint32, 4);
- break;
- }
-}
-
-static void
-r3d_src_common(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const uint32_t *tex_const,
- uint32_t offset_base,
- uint32_t offset_ubwc,
- bool linear_filter)
-{
- struct ts_cs_memory texture = { };
- VkResult result = tu_cs_alloc(&cmd->sub_cs,
- 2, /* allocate space for a sampler too */
- A6XX_TEX_CONST_DWORDS, &texture);
- assert(result == VK_SUCCESS);
-
- memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
-
- /* patch addresses for layer offset */
- *(uint64_t*) (texture.map + 4) += offset_base;
- uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
- texture.map[7] = ubwc_addr;
- texture.map[8] = ubwc_addr >> 32;
-
- texture.map[A6XX_TEX_CONST_DWORDS + 0] =
- A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
- A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
- A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
- A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
- A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
- 0x60000; /* XXX used by blob, doesn't seem necessary */
- texture.map[A6XX_TEX_CONST_DWORDS + 1] =
- 0x1 | /* XXX used by blob, doesn't seem necessary */
- A6XX_TEX_SAMP_1_UNNORM_COORDS |
- A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
- texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
- texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
-
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
- tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
-
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, texture.iova);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
- tu_cs_emit_qw(cs, texture.iova);
-
- tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
-}
-
-static void
-r3d_src(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *iview,
- uint32_t layer,
- bool linear_filter)
-{
- r3d_src_common(cmd, cs, iview->descriptor,
- iview->layer_size * layer,
- iview->ubwc_layer_size * layer,
- linear_filter);
-}
-
-static void
-r3d_src_buffer(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- uint64_t va, uint32_t pitch,
- uint32_t width, uint32_t height)
-{
- uint32_t desc[A6XX_TEX_CONST_DWORDS];
-
- struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
-
- desc[0] =
- COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
- A6XX_TEX_CONST_0_FMT(format.fmt) |
- A6XX_TEX_CONST_0_SWAP(format.swap) |
- A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
- // XXX to swizzle into .w for stencil buffer_to_image
- A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
- A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
- A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
- desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
- desc[2] =
- A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
- A6XX_TEX_CONST_2_PITCH(pitch) |
- A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
- desc[3] = 0;
- desc[4] = va;
- desc[5] = va >> 32;
- for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
- desc[i] = 0;
-
- r3d_src_common(cmd, cs, desc, 0, 0, false);
-}
-
-static void
-r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
-{
- tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
- tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
- tu_cs_image_ref(cs, iview, layer);
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
- tu_cs_image_flag_ref(cs, iview, layer);
-
- tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
-}
-
-static void
-r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
-{
- struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
-
- tu6_emit_msaa(cs, 1); /* TODO: move to setup */
-
- tu_cs_emit_regs(cs,
- A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
- A6XX_RB_MRT_PITCH(0, pitch),
- A6XX_RB_MRT_ARRAY_PITCH(0, 0),
- A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
- A6XX_RB_MRT_BASE_HI(0, va >> 32),
- A6XX_RB_MRT_BASE_GMEM(0, 0));
-
- tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
-}
-
-static void
-r3d_setup(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- enum a6xx_rotation rotation,
- bool clear,
- uint8_t mask)
-{
- const struct tu_physical_device *phys_dev = cmd->device->physical_device;
-
- if (!cmd->state.pass) {
- /* TODO: flushing with barriers instead of blindly always flushing */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
-
- tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
- }
- tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
- tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
-
- r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
- A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
- 0xfc000000);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
-
- tu_cs_emit_regs(cs,
- A6XX_RB_FS_OUTPUT_CNTL0(),
- A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
-
- tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
- tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
-
- tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
- tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
- tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
- tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
- tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
-
- tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
- tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
-
- tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
- .color_format = tu6_base_format(vk_format),
- .color_sint = vk_format_is_sint(vk_format),
- .color_uint = vk_format_is_uint(vk_format)));
-
- tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
- tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
- tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
-}
-
-static void
-r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
- tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
- CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
- CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
- tu_cs_emit(cs, 1); /* instance count */
- tu_cs_emit(cs, 2); /* vertex count */
-
- if (!cmd->state.pass) {
- /* TODO: flushing with barriers instead of blindly always flushing */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
- }
-}
-
-/* blit ops - common interface for 2d/shader paths */
-
-struct blit_ops {
- void (*coords)(struct tu_cs *cs,
- const VkOffset2D *dst,
- const VkOffset2D *src,
- const VkExtent2D *extent);
- void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
- void (*src)(
- struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *iview,
- uint32_t layer,
- bool linear_filter);
- void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- VkFormat vk_format,
- uint64_t va, uint32_t pitch,
- uint32_t width, uint32_t height);
- void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
- void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
- void (*setup)(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- VkFormat vk_format,
- enum a6xx_rotation rotation,
- bool clear,
- uint8_t mask);
- void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
-};
-
-static const struct blit_ops r2d_ops = {
- .coords = r2d_coords,
- .clear_value = r2d_clear_value,
- .src = r2d_src,
- .src_buffer = r2d_src_buffer,
- .dst = r2d_dst,
- .dst_buffer = r2d_dst_buffer,
- .setup = r2d_setup,
- .run = r2d_run,
-};
-
-static const struct blit_ops r3d_ops = {
- .coords = r3d_coords,
- .clear_value = r3d_clear_value,
- .src = r3d_src,
- .src_buffer = r3d_src_buffer,
- .dst = r3d_dst,
- .dst_buffer = r3d_dst_buffer,
- .setup = r3d_setup,
- .run = r3d_run,
-};
-
-/* passthrough set coords from 3D extents */
-static void
-coords(const struct blit_ops *ops,
- struct tu_cs *cs,
- const VkOffset3D *dst,
- const VkOffset3D *src,
- const VkExtent3D *extent)
-{
- ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
-}
-
-static void
-tu_image_view_blit2(struct tu_image_view *iview,
- struct tu_image *image,
- VkFormat format,
- const VkImageSubresourceLayers *subres,
- uint32_t layer,
- bool stencil_read)
-{
- VkImageAspectFlags aspect_mask = subres->aspectMask;
-
- /* always use the AS_R8G8B8A8 format for these */
- if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
- format == VK_FORMAT_X8_D24_UNORM_PACK32) {
- aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
- }
-
- tu_image_view_init(iview, &(VkImageViewCreateInfo) {
- .image = tu_image_to_handle(image),
- .viewType = VK_IMAGE_VIEW_TYPE_2D,
- .format = format,
- /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
- .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
- .subresourceRange = {
- .aspectMask = aspect_mask,
- .baseMipLevel = subres->mipLevel,
- .levelCount = 1,
- .baseArrayLayer = subres->baseArrayLayer + layer,
- .layerCount = 1,
- },
- });
-}
-
-static void
-tu_image_view_blit(struct tu_image_view *iview,
- struct tu_image *image,
- const VkImageSubresourceLayers *subres,
- uint32_t layer)
-{
- tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
-}
-
-static void
-tu6_blit_image(struct tu_cmd_buffer *cmd,
- struct tu_image *src_image,
- struct tu_image *dst_image,
- const VkImageBlit *info,
- VkFilter filter)
-{
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->cs;
- uint32_t layers;
-
- /* 2D blit can't do rotation mirroring from just coordinates */
- static const enum a6xx_rotation rotate[2][2] = {
- {ROTATE_0, ROTATE_HFLIP},
- {ROTATE_VFLIP, ROTATE_180},
- };
-
- bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
- (info->dstOffsets[1].x < info->dstOffsets[0].x);
- bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
- (info->dstOffsets[1].y < info->dstOffsets[0].y);
- bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
- (info->dstOffsets[1].z < info->dstOffsets[0].z);
-
- if (mirror_z) {
- tu_finishme("blit z mirror\n");
- return;
- }
-
- if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
- info->dstOffsets[1].z - info->dstOffsets[0].z) {
- tu_finishme("blit z filter\n");
- return;
- }
-
- layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
- if (info->dstSubresource.layerCount > 1) {
- assert(layers <= 1);
- layers = info->dstSubresource.layerCount;
- }
-
- uint8_t mask = 0xf;
- if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
- assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
- if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
- mask = 0x7;
- if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
- mask = 0x8;
- }
-
- if (dst_image->samples > 1)
- ops = &r3d_ops;
-
- /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
- * figure out why (should be able to pass all tests with only shader path)
- */
-
- ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
-
- if (ops == &r3d_ops) {
- r3d_coords_raw(cs, (float[]) {
- info->dstOffsets[0].x, info->dstOffsets[0].y,
- info->srcOffsets[0].x, info->srcOffsets[0].y,
- info->dstOffsets[1].x, info->dstOffsets[1].y,
- info->srcOffsets[1].x, info->srcOffsets[1].y
- });
- } else {
- tu_cs_emit_regs(cs,
- A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
- .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
- A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
- .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
- tu_cs_emit_regs(cs,
- A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
- A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
- A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
- A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
- }
-
- struct tu_image_view dst, src;
- tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
- tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
-
- for (uint32_t i = 0; i < layers; i++) {
- ops->dst(cs, &dst, i);
- ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
- ops->run(cmd, cs);
- }
-}
-
-void
-tu_CmdBlitImage(VkCommandBuffer commandBuffer,
- VkImage srcImage,
- VkImageLayout srcImageLayout,
- VkImage dstImage,
- VkImageLayout dstImageLayout,
- uint32_t regionCount,
- const VkImageBlit *pRegions,
- VkFilter filter)
-
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, src_image, srcImage);
- TU_FROM_HANDLE(tu_image, dst_image, dstImage);
-
- tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
-
- for (uint32_t i = 0; i < regionCount; ++i)
- tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
-}
-
-static VkFormat
-copy_format(VkFormat format)
-{
- switch (vk_format_get_blocksizebits(format)) {
- case 8: return VK_FORMAT_R8_UINT;
- case 16: return VK_FORMAT_R16_UINT;
- case 32: return VK_FORMAT_R32_UINT;
- case 64: return VK_FORMAT_R32G32_UINT;
- case 96: return VK_FORMAT_R32G32B32_UINT;
- case 128:return VK_FORMAT_R32G32B32A32_UINT;
- default:
- unreachable("unhandled format size");
- }
-}
-
-static void
-copy_compressed(VkFormat format,
- VkOffset3D *offset,
- VkExtent3D *extent,
- uint32_t *pitch,
- uint32_t *layer_size)
-{
- if (!vk_format_is_compressed(format))
- return;
-
- uint32_t block_width = vk_format_get_blockwidth(format);
- uint32_t block_height = vk_format_get_blockheight(format);
-
- offset->x /= block_width;
- offset->y /= block_height;
-
- if (extent) {
- extent->width = DIV_ROUND_UP(extent->width, block_width);
- extent->height = DIV_ROUND_UP(extent->height, block_height);
- }
- if (pitch)
- *pitch /= block_width;
- if (layer_size)
- *layer_size /= (block_width * block_height);
-}
-
-static void
-tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
- struct tu_buffer *src_buffer,
- struct tu_image *dst_image,
- const VkBufferImageCopy *info)
-{
- struct tu_cs *cs = &cmd->cs;
- uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
- VkFormat dst_format = dst_image->vk_format;
- VkFormat src_format = dst_image->vk_format;
- const struct blit_ops *ops = &r2d_ops;
-
- uint8_t mask = 0xf;
-
- if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
- switch (info->imageSubresource.aspectMask) {
- case VK_IMAGE_ASPECT_STENCIL_BIT:
- src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
- mask = 0x8;
- ops = &r3d_ops;
- break;
- case VK_IMAGE_ASPECT_DEPTH_BIT:
- mask = 0x7;
- break;
- }
- }
-
- VkOffset3D offset = info->imageOffset;
- VkExtent3D extent = info->imageExtent;
- uint32_t pitch =
- (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(src_format);
- uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch;
-
- if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
- assert(src_format == dst_format);
- copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size);
- src_format = dst_format = copy_format(dst_format);
- }
-
- /* note: the src_va/pitch alignment of 64 is for 2D engine,
- * it is also valid for 1cpp format with shader path (stencil aspect path)
- */
-
- ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
-
- struct tu_image_view dst;
- tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
-
- for (uint32_t i = 0; i < layers; i++) {
- ops->dst(cs, &dst, i);
-
- uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
- if ((src_va & 63) || (pitch & 63)) {
- for (uint32_t y = 0; y < extent.height; y++) {
- uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
- ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
- x + extent.width, 1);
- ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
- &(VkExtent2D) {extent.width, 1});
- ops->run(cmd, cs);
- src_va += pitch;
- }
- } else {
- ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
- coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
- ops->run(cmd, cs);
- }
- }
-}
-
-void
-tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
- VkBuffer srcBuffer,
- VkImage dstImage,
- VkImageLayout dstImageLayout,
- uint32_t regionCount,
- const VkBufferImageCopy *pRegions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, dst_image, dstImage);
- TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
-
- tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
-
- for (unsigned i = 0; i < regionCount; ++i)
- tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
-}
-
-static void
-tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
- struct tu_image *src_image,
- struct tu_buffer *dst_buffer,
- const VkBufferImageCopy *info)
-{
- struct tu_cs *cs = &cmd->cs;
- uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
- VkFormat src_format = src_image->vk_format;
- VkFormat dst_format = src_image->vk_format;
- bool stencil_read = false;
-
- if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
- info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
- dst_format = VK_FORMAT_R8_UNORM;
- stencil_read = true;
- }
-
- const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
- VkOffset3D offset = info->imageOffset;
- VkExtent3D extent = info->imageExtent;
- uint32_t pitch = (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(dst_format);
- uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch;
-
- if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
- assert(src_format == dst_format);
- copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size);
- src_format = dst_format = copy_format(dst_format);
- }
-
- /* note: the dst_va/pitch alignment of 64 is for 2D engine,
- * it is also valid for 1cpp format with shader path (stencil aspect)
- */
-
- ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
-
- struct tu_image_view src;
- tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
-
- for (uint32_t i = 0; i < layers; i++) {
- ops->src(cmd, cs, &src, i, false);
-
- uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
- if ((dst_va & 63) || (pitch & 63)) {
- for (uint32_t y = 0; y < extent.height; y++) {
- uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
- ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
- ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
- &(VkExtent2D) {extent.width, 1});
- ops->run(cmd, cs);
- dst_va += pitch;
- }
- } else {
- ops->dst_buffer(cs, dst_format, dst_va, pitch);
- coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
- ops->run(cmd, cs);
- }
- }
-}
-
-void
-tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
- VkImage srcImage,
- VkImageLayout srcImageLayout,
- VkBuffer dstBuffer,
- uint32_t regionCount,
- const VkBufferImageCopy *pRegions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, src_image, srcImage);
- TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
-
- tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
-
- for (unsigned i = 0; i < regionCount; ++i)
- tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
-}
-
-static void
-tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
- struct tu_image *src_image,
- struct tu_image *dst_image,
- const VkImageCopy *info)
-{
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->cs;
-
- uint8_t mask = 0xf;
- if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
- if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
- mask = 0x7;
- if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
- mask = 0x8;
- }
-
- if (dst_image->samples > 1)
- ops = &r3d_ops;
-
- assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
-
- VkFormat format = VK_FORMAT_UNDEFINED;
- VkOffset3D src_offset = info->srcOffset;
- VkOffset3D dst_offset = info->dstOffset;
- VkExtent3D extent = info->extent;
-
- /* TODO: should check (ubwc || (tile_mode && swap)) instead */
- if (src_image->layout.tile_mode && src_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
- format = src_image->vk_format;
-
- if (dst_image->layout.tile_mode && dst_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
- if (format != VK_FORMAT_UNDEFINED && format != dst_image->vk_format) {
- /* can be clever in some cases but in some cases we need and intermediate
- * linear buffer
- */
- tu_finishme("image copy between two tiled/ubwc images\n");
- return;
- }
- format = dst_image->vk_format;
- }
-
- if (format == VK_FORMAT_UNDEFINED)
- format = copy_format(src_image->vk_format);
-
- copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
- copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
-
- ops->setup(cmd, cs, format, ROTATE_0, false, mask);
- coords(ops, cs, &dst_offset, &src_offset, &extent);
-
- struct tu_image_view dst, src;
- tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
- tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
-
- for (uint32_t i = 0; i < info->extent.depth; i++) {
- ops->src(cmd, cs, &src, i, false);
- ops->dst(cs, &dst, i);
- ops->run(cmd, cs);
- }
-}
-
-void
-tu_CmdCopyImage(VkCommandBuffer commandBuffer,
- VkImage srcImage,
- VkImageLayout srcImageLayout,
- VkImage destImage,
- VkImageLayout destImageLayout,
- uint32_t regionCount,
- const VkImageCopy *pRegions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, src_image, srcImage);
- TU_FROM_HANDLE(tu_image, dst_image, destImage);
-
- tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
-
- for (uint32_t i = 0; i < regionCount; ++i)
- tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
-}
-
-static void
-copy_buffer(struct tu_cmd_buffer *cmd,
- uint64_t dst_va,
- uint64_t src_va,
- uint64_t size,
- uint32_t block_size)
-{
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->cs;
- VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
- uint64_t blocks = size / block_size;
-
- ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
-
- while (blocks) {
- uint32_t src_x = (src_va & 63) / block_size;
- uint32_t dst_x = (dst_va & 63) / block_size;
- uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
-
- ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
- ops->dst_buffer( cs, format, dst_va & ~63, 0);
- ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
- ops->run(cmd, cs);
-
- src_va += width * block_size;
- dst_va += width * block_size;
- blocks -= width;
- }
-}
-
-void
-tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
- VkBuffer srcBuffer,
- VkBuffer dstBuffer,
- uint32_t regionCount,
- const VkBufferCopy *pRegions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
- TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
-
- tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
-
- for (unsigned i = 0; i < regionCount; ++i) {
- copy_buffer(cmd,
- tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
- tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
- pRegions[i].size, 1);
- }
-}
-
-void
-tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
- VkBuffer dstBuffer,
- VkDeviceSize dstOffset,
- VkDeviceSize dataSize,
- const void *pData)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
-
- tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
-
- struct ts_cs_memory tmp;
- VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
- if (result != VK_SUCCESS) {
- cmd->record_result = result;
- return;
- }
-
- memcpy(tmp.map, pData, dataSize);
- copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
-}
-
-void
-tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
- VkBuffer dstBuffer,
- VkDeviceSize dstOffset,
- VkDeviceSize fillSize,
- uint32_t data)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->cs;
-
- tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
-
- if (fillSize == VK_WHOLE_SIZE)
- fillSize = buffer->size - dstOffset;
-
- uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
- uint32_t blocks = fillSize / 4;
-
- ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
- ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
-
- while (blocks) {
- uint32_t dst_x = (dst_va & 63) / 4;
- uint32_t width = MIN2(blocks, 0x4000 - dst_x);
-
- ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
- ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
- ops->run(cmd, cs);
-
- dst_va += width * 4;
- blocks -= width;
- }
-}
-
-void
-tu_CmdResolveImage(VkCommandBuffer commandBuffer,
- VkImage srcImage,
- VkImageLayout srcImageLayout,
- VkImage dstImage,
- VkImageLayout dstImageLayout,
- uint32_t regionCount,
- const VkImageResolve *pRegions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, src_image, srcImage);
- TU_FROM_HANDLE(tu_image, dst_image, dstImage);
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->cs;
-
- tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
-
- ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
-
- for (uint32_t i = 0; i < regionCount; ++i) {
- const VkImageResolve *info = &pRegions[i];
- uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
-
- assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
- /* TODO: aspect masks possible ? */
-
- coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
-
- struct tu_image_view dst, src;
- tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
- tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
-
- for (uint32_t i = 0; i < layers; i++) {
- ops->src(cmd, cs, &src, i, false);
- ops->dst(cs, &dst, i);
- ops->run(cmd, cs);
- }
- }
-}
-
-void
-tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- struct tu_image_view *src,
- struct tu_image_view *dst,
- uint32_t layers,
- const VkRect2D *rect)
-{
- const struct blit_ops *ops = &r2d_ops;
-
- tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
- tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
-
- assert(src->image->vk_format == dst->image->vk_format);
-
- ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
- ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
-
- for (uint32_t i = 0; i < layers; i++) {
- ops->src(cmd, cs, src, i, false);
- ops->dst(cs, dst, i);
- ops->run(cmd, cs);
- }
-}
-
-static void
-clear_image(struct tu_cmd_buffer *cmd,
- struct tu_image *image,
- const VkClearValue *clear_value,
- const VkImageSubresourceRange *range)
-{
- uint32_t level_count = tu_get_levelCount(image, range);
- uint32_t layer_count = tu_get_layerCount(image, range);
- struct tu_cs *cs = &cmd->cs;
- VkFormat format = image->vk_format;
- if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
- format = VK_FORMAT_R32_UINT;
-
- if (image->type == VK_IMAGE_TYPE_3D) {
- assert(layer_count == 1);
- assert(range->baseArrayLayer == 0);
- }
-
- uint8_t mask = 0xf;
- if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
- mask = 0;
- if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
- mask |= 0x7;
- if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
- mask |= 0x8;
- }
-
- const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
-
- ops->setup(cmd, cs, format, ROTATE_0, true, mask);
- ops->clear_value(cs, image->vk_format, clear_value);
-
- for (unsigned j = 0; j < level_count; j++) {
- if (image->type == VK_IMAGE_TYPE_3D)
- layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
-
- ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
- u_minify(image->extent.width, range->baseMipLevel + j),
- u_minify(image->extent.height, range->baseMipLevel + j)
- });
-
- struct tu_image_view dst;
- tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
- .aspectMask = range->aspectMask,
- .mipLevel = range->baseMipLevel + j,
- .baseArrayLayer = range->baseArrayLayer,
- .layerCount = 1,
- }, 0, false);
-
- for (uint32_t i = 0; i < layer_count; i++) {
- ops->dst(cs, &dst, i);
- ops->run(cmd, cs);
- }
- }
-}
-
-void
-tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
- VkImage image_h,
- VkImageLayout imageLayout,
- const VkClearColorValue *pColor,
- uint32_t rangeCount,
- const VkImageSubresourceRange *pRanges)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, image, image_h);
-
- tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
-
- for (unsigned i = 0; i < rangeCount; i++)
- clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
-}
-
-void
-tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
- VkImage image_h,
- VkImageLayout imageLayout,
- const VkClearDepthStencilValue *pDepthStencil,
- uint32_t rangeCount,
- const VkImageSubresourceRange *pRanges)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_image, image, image_h);
-
- tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
-
- for (unsigned i = 0; i < rangeCount; i++)
- clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
-}
-
-static void
-tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
- uint32_t attachment_count,
- const VkClearAttachment *attachments,
- uint32_t rect_count,
- const VkClearRect *rects)
-{
- const struct tu_subpass *subpass = cmd->state.subpass;
- /* note: cannot use shader path here.. there is a special shader path
- * in tu_clear_sysmem_attachments()
- */
- const struct blit_ops *ops = &r2d_ops;
- struct tu_cs *cs = &cmd->draw_cs;
-
- for (uint32_t j = 0; j < attachment_count; j++) {
- uint32_t a;
- if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
- a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
- } else {
- a = subpass->depth_stencil_attachment.attachment;
-
- /* sync depth into color */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- /* also flush color to avoid losing contents from invalidate */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
- }
-
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- uint8_t mask = 0xf;
- if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
- if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
- mask &= ~0x7;
- if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
- mask &= ~0x8;
- }
-
- const struct tu_image_view *iview =
- cmd->state.framebuffer->attachments[a].attachment;
-
- ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
- ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
-
- for (uint32_t i = 0; i < rect_count; i++) {
- ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
- for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
- ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
- ops->run(cmd, cs);
- }
- }
-
- if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
- /* does not use CCU - flush
- * note: cache invalidate might be needed to, and just not covered by test cases
- */
- if (attachments[j].colorAttachment > 0)
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- } else {
- /* sync color into depth */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
- }
- }
-}
-
-static void
-tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
- uint32_t attachment_count,
- const VkClearAttachment *attachments,
- uint32_t rect_count,
- const VkClearRect *rects)
-{
- /* the shader path here is special, it avoids changing MRT/etc state */
- const struct tu_render_pass *pass = cmd->state.pass;
- const struct tu_subpass *subpass = cmd->state.subpass;
- const uint32_t mrt_count = subpass->color_count;
- struct tu_cs *cs = &cmd->draw_cs;
- uint32_t clear_value[MAX_RTS][4];
- float z_clear_val = 0.0f;
- uint8_t s_clear_val = 0;
- uint32_t clear_rts = 0, num_rts = 0, b;
- bool z_clear = false;
- bool s_clear = false;
- uint32_t max_samples = 1;
-
- for (uint32_t i = 0; i < attachment_count; i++) {
- uint32_t a;
- if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
- uint32_t c = attachments[i].colorAttachment;
- a = subpass->color_attachments[c].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- clear_rts |= 1 << c;
- memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
- } else {
- a = subpass->depth_stencil_attachment.attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
- z_clear = true;
- z_clear_val = attachments[i].clearValue.depthStencil.depth;
- }
-
- if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
- s_clear = true;
- s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
- }
- }
-
- max_samples = MAX2(max_samples, pass->attachments[a].samples);
- }
-
- /* prefer to use 2D path for clears
- * 2D can't clear separate depth/stencil and msaa, needs known framebuffer
- */
- if (max_samples == 1 && cmd->state.framebuffer) {
- tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
- return;
- }
-
- /* TODO: this path doesn't take into account multilayer rendering */
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
- A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
- 0xfc000000);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
- for (uint32_t i = 0; i < mrt_count; i++) {
- if (clear_rts & (1 << i))
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
- else
- tu_cs_emit(cs, 0);
- }
-
- r3d_pipeline(cmd, cs, false, num_rts);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_FS_OUTPUT_CNTL0(),
- A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
-
- tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
- tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
- for (uint32_t i = 0; i < mrt_count; i++) {
- tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
- .component_enable = COND(clear_rts & (1 << i), 0xf)));
- }
-
- tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
- .z_enable = z_clear,
- .z_write_enable = z_clear,
- .zfunc = FUNC_ALWAYS));
- tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
- tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
- .stencil_enable = s_clear,
- .func = FUNC_ALWAYS,
- .zpass = VK_STENCIL_OP_REPLACE));
- tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
- tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
- tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
-
- tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- for_each_bit(b, clear_rts)
- tu_cs_emit_array(cs, clear_value[b], 4);
-
- for (uint32_t i = 0; i < rect_count; i++) {
- r3d_coords_raw(cs, (float[]) {
- rects[i].rect.offset.x, rects[i].rect.offset.y,
- z_clear_val, 1.0f,
- rects[i].rect.offset.x + rects[i].rect.extent.width,
- rects[i].rect.offset.y + rects[i].rect.extent.height,
- z_clear_val, 1.0f
- });
- r3d_run(cmd, cs);
- }
-
- cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
- TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
- TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
- TU_CMD_DIRTY_DYNAMIC_SCISSOR;
-}
-
-/**
- * Pack a VkClearValue into a 128-bit buffer. format is respected except
- * for the component order. The components are always packed in WZYX order,
- * because gmem is tiled and tiled formats always have WZYX swap
- */
-static void
-pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
-{
- const struct util_format_description *desc = vk_format_description(format);
-
- switch (format) {
- case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
- buf[0] = float3_to_r11g11b10f(val->color.float32);
- return;
- case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
- buf[0] = float3_to_rgb9e5(val->color.float32);
- return;
- default:
- break;
- }
-
- assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-
- /* S8_UINT is special and has no depth */
- const int max_components =
- format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
-
- int buf_offset = 0;
- int bit_shift = 0;
- for (int comp = 0; comp < max_components; comp++) {
- const struct util_format_channel_description *ch =
- tu_get_format_channel_description(desc, comp);
- if (!ch) {
- assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
- (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
- continue;
- }
-
- union tu_clear_component_value v = tu_get_clear_component_value(
- val, comp, desc->colorspace);
-
- /* move to the next uint32_t when there is not enough space */
- assert(ch->size <= 32);
- if (bit_shift + ch->size > 32) {
- buf_offset++;
- bit_shift = 0;
- }
-
- if (bit_shift == 0)
- buf[buf_offset] = 0;
-
- buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
- bit_shift += ch->size;
- }
-}
-
-static void
-tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t attachment,
- uint8_t component_mask,
- const VkClearValue *value)
-{
- VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
- /* note: component_mask is 0x7 for depth and 0x8 for stencil
- * because D24S8 is cleared with AS_R8G8B8A8 format
- */
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
- tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
- tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
- tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
- tu_cs_emit(cs, 0);
-
- uint32_t clear_vals[4] = {};
- pack_gmem_clear_value(value, vk_format, clear_vals);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
- tu_cs_emit_array(cs, clear_vals, 4);
-
- tu6_emit_event_write(cmd, cs, BLIT, false);
-}
-
-static void
-tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
- uint32_t attachment_count,
- const VkClearAttachment *attachments,
- uint32_t rect_count,
- const VkClearRect *rects)
-{
- const struct tu_subpass *subpass = cmd->state.subpass;
- struct tu_cs *cs = &cmd->draw_cs;
-
- /* TODO: swap the loops for smaller cmdstream */
- for (unsigned i = 0; i < rect_count; i++) {
- unsigned x1 = rects[i].rect.offset.x;
- unsigned y1 = rects[i].rect.offset.y;
- unsigned x2 = x1 + rects[i].rect.extent.width - 1;
- unsigned y2 = y1 + rects[i].rect.extent.height - 1;
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
- tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
- tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
-
- for (unsigned j = 0; j < attachment_count; j++) {
- uint32_t a;
- if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
- a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
- else
- a = subpass->depth_stencil_attachment.attachment;
-
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- unsigned clear_mask = 0xf;
- if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
- if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
- clear_mask &= ~0x7;
- if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
- clear_mask &= ~0x8;
- }
-
- tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
- &attachments[j].clearValue);
- }
- }
-}
-
-void
-tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
- uint32_t attachmentCount,
- const VkClearAttachment *pAttachments,
- uint32_t rectCount,
- const VkClearRect *pRects)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
- tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
- tu_cond_exec_end(cs);
-
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
- tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
- tu_cond_exec_end(cs);
-}
-
-void
-tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info)
-{
- const struct tu_framebuffer *fb = cmd->state.framebuffer;
- const struct tu_image_view *iview = fb->attachments[a].attachment;
- const struct tu_render_pass_attachment *attachment =
- &cmd->state.pass->attachments[a];
- uint8_t mask = 0;
-
- if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
- mask = 0xf;
- if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
- mask |= 0x7;
- if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
- mask |= 0x8;
-
- if (!mask)
- return;
-
- const struct blit_ops *ops = &r2d_ops;
- if (attachment->samples > 1)
- ops = &r3d_ops;
-
- ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
- ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
- ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
-
- for (uint32_t i = 0; i < fb->layers; i++) {
- ops->dst(cs, iview, i);
- ops->run(cmd, cs);
- }
-}
-
-void
-tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- const VkRenderPassBeginInfo *info)
-{
- const struct tu_render_pass_attachment *attachment =
- &cmd->state.pass->attachments[a];
- unsigned clear_mask = 0;
-
- if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
- clear_mask = 0xf;
- if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
- clear_mask |= 0x7;
- if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
- clear_mask |= 0x8;
-
- if (!clear_mask)
- return;
-
- tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
-
- tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
- &info->pClearValues[a]);
-}
-
-static void
-tu_emit_blit(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_image_view *iview,
- const struct tu_render_pass_attachment *attachment,
- bool resolve)
-{
- tu_cs_emit_regs(cs,
- A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
-
- tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
- .unk0 = !resolve,
- .gmem = !resolve,
- /* "integer" bit disables msaa resolve averaging */
- .integer = vk_format_is_int(attachment->format)));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
- tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
- tu_cs_image_ref_2d(cs, iview, 0, false);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
- tu_cs_image_flag_ref(cs, iview, 0);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
-
- tu6_emit_event_write(cmd, cs, BLIT, false);
-}
-
-static bool
-blit_can_resolve(VkFormat format)
-{
- const struct util_format_description *desc = vk_format_description(format);
-
- /* blit event can only do resolve for simple cases:
- * averaging samples as unsigned integers or choosing only one sample
- */
- if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
- return false;
-
- /* can't do formats with larger channel sizes
- * note: this includes all float formats
- * note2: single channel integer formats seem OK
- */
- if (desc->channel[0].size > 10)
- return false;
-
- switch (format) {
- /* for unknown reasons blit event can't msaa resolve these formats when tiled
- * likely related to these formats having different layout from other cpp=2 formats
- */
- case VK_FORMAT_R8G8_UNORM:
- case VK_FORMAT_R8G8_UINT:
- case VK_FORMAT_R8G8_SINT:
- /* TODO: this one should be able to work? */
- case VK_FORMAT_D24_UNORM_S8_UINT:
- return false;
- default:
- break;
- }
-
- return true;
-}
-
-void
-tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- bool force_load)
-{
- const struct tu_image_view *iview =
- cmd->state.framebuffer->attachments[a].attachment;
- const struct tu_render_pass_attachment *attachment =
- &cmd->state.pass->attachments[a];
-
- if (attachment->load || force_load)
- tu_emit_blit(cmd, cs, iview, attachment, false);
-}
-
-void
-tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t a,
- uint32_t gmem_a)
-{
- const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
- const VkRect2D *render_area = &tiling->render_area;
- struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
- struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
- struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
-
- if (!dst->store)
- return;
-
- uint32_t x1 = render_area->offset.x;
- uint32_t y1 = render_area->offset.y;
- uint32_t x2 = x1 + render_area->extent.width;
- uint32_t y2 = y1 + render_area->extent.height;
- /* x2/y2 can be unaligned if equal to the size of the image,
- * since it will write into padding space
- * the one exception is linear levels which don't have the
- * required y padding in the layout (except for the last level)
- */
- bool need_y2_align =
- y2 != iview->extent.height || iview->need_y2_align;
-
- bool unaligned =
- x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
- y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
-
- /* use fast path when render area is aligned, except for unsupported resolve cases */
- if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
- tu_emit_blit(cmd, cs, iview, src, true);
- return;
- }
-
- if (dst->samples > 1) {
- /* I guess we need to use shader path in this case?
- * need a testcase which fails because of this
- */
- tu_finishme("unaligned store of msaa attachment\n");
- return;
- }
-
- r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
- r2d_dst(cs, iview, 0);
- r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
-
- tu_cs_emit_regs(cs,
- A6XX_SP_PS_2D_SRC_INFO(
- .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
- .tile_mode = TILE6_2,
- .srgb = vk_format_is_srgb(src->format),
- .samples = tu_msaa_samples(src->samples),
- .samples_average = !vk_format_is_int(src->format),
- .unk20 = 1,
- .unk22 = 1),
- /* note: src size does not matter when not scaling */
- A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
- A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
- A6XX_SP_PS_2D_SRC_HI(),
- A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
-
- /* sync GMEM writes with CACHE */
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
-
- tu_cs_emit_pkt7(cs, CP_BLIT, 1);
- tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
-
- /* TODO: flushing with barriers instead of blindly always flushing */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
-}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_meta_blit.c b/lib/mesa/src/freedreno/vulkan/tu_meta_blit.c
new file mode 100644
index 000000000..ec45e011a
--- /dev/null
+++ b/lib/mesa/src/freedreno/vulkan/tu_meta_blit.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "tu_private.h"
+
+#include "tu_blit.h"
+
+static void
+tu_blit_image(struct tu_cmd_buffer *cmdbuf,
+ struct tu_image *src_image,
+ struct tu_image *dst_image,
+ const VkImageBlit *info,
+ VkFilter filter)
+{
+ static const enum a6xx_rotation rotate[2][2] = {
+ {ROTATE_0, ROTATE_HFLIP},
+ {ROTATE_VFLIP, ROTATE_180},
+ };
+ bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
+ (info->dstOffsets[1].x < info->dstOffsets[0].x);
+ bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
+ (info->dstOffsets[1].y < info->dstOffsets[0].y);
+ bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
+ (info->dstOffsets[1].z < info->dstOffsets[0].z);
+
+ if (mirror_z) {
+ tu_finishme("blit z mirror\n");
+ return;
+ }
+
+ if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
+ info->dstOffsets[1].z - info->dstOffsets[0].z) {
+ tu_finishme("blit z filter\n");
+ return;
+ }
+ assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount);
+
+ struct tu_blit blt = {
+ .dst = tu_blit_surf(dst_image, info->dstSubresource, info->dstOffsets),
+ .src = tu_blit_surf(src_image, info->srcSubresource, info->srcOffsets),
+ .layers = MAX2(info->srcOffsets[1].z - info->srcOffsets[0].z,
+ info->dstSubresource.layerCount),
+ .filter = filter == VK_FILTER_LINEAR,
+ .rotation = rotate[mirror_y][mirror_x],
+ };
+
+ tu_blit(cmdbuf, &blt);
+}
+
+void
+tu_CmdBlitImage(VkCommandBuffer commandBuffer,
+ VkImage srcImage,
+ VkImageLayout srcImageLayout,
+ VkImage destImage,
+ VkImageLayout destImageLayout,
+ uint32_t regionCount,
+ const VkImageBlit *pRegions,
+ VkFilter filter)
+
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, src_image, srcImage);
+ TU_FROM_HANDLE(tu_image, dst_image, destImage);
+
+ tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (uint32_t i = 0; i < regionCount; ++i) {
+ tu_blit_image(cmdbuf, src_image, dst_image, pRegions + i, filter);
+ }
+}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_meta_buffer.c b/lib/mesa/src/freedreno/vulkan/tu_meta_buffer.c
new file mode 100644
index 000000000..ea764c579
--- /dev/null
+++ b/lib/mesa/src/freedreno/vulkan/tu_meta_buffer.c
@@ -0,0 +1,75 @@
+#include "tu_private.h"
+#include "tu_blit.h"
+#include "tu_cs.h"
+
+void
+tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ VkDeviceSize fillSize,
+ uint32_t data)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
+
+ if (fillSize == VK_WHOLE_SIZE)
+ fillSize = buffer->size - dstOffset;
+
+ tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
+
+ tu_blit(cmd, &(struct tu_blit) {
+ .dst = {
+ .fmt = VK_FORMAT_R32_UINT,
+ .va = tu_buffer_iova(buffer) + dstOffset,
+ .width = fillSize / 4,
+ .height = 1,
+ .samples = 1,
+ },
+ .layers = 1,
+ .clear_value[0] = data,
+ .type = TU_BLIT_CLEAR,
+ .buffer = true,
+ });
+}
+
+void
+tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ VkDeviceSize dataSize,
+ const void *pData)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
+
+ tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
+
+ struct ts_cs_memory tmp;
+ VkResult result = tu_cs_alloc(cmd->device, &cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+
+ memcpy(tmp.map, pData, dataSize);
+
+ tu_blit(cmd, &(struct tu_blit) {
+ .dst = {
+ .fmt = VK_FORMAT_R32_UINT,
+ .va = tu_buffer_iova(buffer) + dstOffset,
+ .width = dataSize / 4,
+ .height = 1,
+ .samples = 1,
+ },
+ .src = {
+ .fmt = VK_FORMAT_R32_UINT,
+ .va = tmp.iova,
+ .width = dataSize / 4,
+ .height = 1,
+ .samples = 1,
+ },
+ .layers = 1,
+ .type = TU_BLIT_COPY,
+ .buffer = true,
+ });
+}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_meta_clear.c b/lib/mesa/src/freedreno/vulkan/tu_meta_clear.c
new file mode 100644
index 000000000..4b7e11694
--- /dev/null
+++ b/lib/mesa/src/freedreno/vulkan/tu_meta_clear.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "tu_private.h"
+#include "tu_blit.h"
+#include "tu_cs.h"
+
+static void
+clear_image(struct tu_cmd_buffer *cmdbuf,
+ struct tu_image *image,
+ uint32_t clear_value[4],
+ const VkImageSubresourceRange *range)
+{
+ uint32_t level_count = tu_get_levelCount(image, range);
+ uint32_t layer_count = tu_get_layerCount(image, range);
+
+ if (image->type == VK_IMAGE_TYPE_3D) {
+ assert(layer_count == 1);
+ assert(range->baseArrayLayer == 0);
+ }
+
+ for (unsigned j = 0; j < level_count; j++) {
+ if (image->type == VK_IMAGE_TYPE_3D)
+ layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
+
+ tu_blit(cmdbuf, &(struct tu_blit) {
+ .dst = tu_blit_surf_whole(image, range->baseMipLevel + j, range->baseArrayLayer),
+ .layers = layer_count,
+ .clear_value = {clear_value[0], clear_value[1], clear_value[2], clear_value[3]},
+ .type = TU_BLIT_CLEAR,
+ });
+ }
+}
+
+void
+tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
+ VkImage image_h,
+ VkImageLayout imageLayout,
+ const VkClearColorValue *pColor,
+ uint32_t rangeCount,
+ const VkImageSubresourceRange *pRanges)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, image, image_h);
+ uint32_t clear_value[4] = {};
+
+ tu_2d_clear_color(pColor, image->vk_format, clear_value);
+
+ tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (unsigned i = 0; i < rangeCount; i++)
+ clear_image(cmdbuf, image, clear_value, pRanges + i);
+}
+
+void
+tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
+ VkImage image_h,
+ VkImageLayout imageLayout,
+ const VkClearDepthStencilValue *pDepthStencil,
+ uint32_t rangeCount,
+ const VkImageSubresourceRange *pRanges)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, image, image_h);
+ uint32_t clear_value[4] = {};
+
+ tu_2d_clear_zs(pDepthStencil, image->vk_format, clear_value);
+
+ tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (unsigned i = 0; i < rangeCount; i++)
+ clear_image(cmdbuf, image, clear_value, pRanges + i);
+}
+
+void
+tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
+ uint32_t attachmentCount,
+ const VkClearAttachment *pAttachments,
+ uint32_t rectCount,
+ const VkClearRect *pRects)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ struct tu_cs *cs = &cmd->draw_cs;
+
+ VkResult result = tu_cs_reserve_space(cmd->device, cs,
+ rectCount * (3 + 15 * attachmentCount));
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+
+ /* TODO: deal with layered rendering (when layered rendering is implemented)
+ * TODO: disable bypass rendering for subpass (when bypass is implemented)
+ */
+
+ for (unsigned i = 0; i < rectCount; i++) {
+ unsigned x1 = pRects[i].rect.offset.x;
+ unsigned y1 = pRects[i].rect.offset.y;
+ unsigned x2 = x1 + pRects[i].rect.extent.width - 1;
+ unsigned y2 = y1 + pRects[i].rect.extent.height - 1;
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
+ tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
+ tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
+
+ for (unsigned j = 0; j < attachmentCount; j++) {
+ uint32_t a;
+ unsigned clear_mask = 0;
+ if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+ clear_mask = 0xf;
+ a = subpass->color_attachments[pAttachments[j].colorAttachment].attachment;
+ } else {
+ a = subpass->depth_stencil_attachment.attachment;
+ if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
+ clear_mask |= 1;
+ if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
+ clear_mask |= 2;
+ }
+
+ if (a == VK_ATTACHMENT_UNUSED)
+ continue;
+
+ VkFormat fmt = cmd->state.pass->attachments[a].format;
+ const struct tu_native_format *format = tu6_get_native_format(fmt);
+ assert(format && format->rb >= 0);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
+ tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
+ tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(clear_mask));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
+ tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
+ tu_cs_emit(cs, 0);
+
+ uint32_t clear_vals[4] = { 0 };
+ tu_pack_clear_value(&pAttachments[j].clearValue, fmt, clear_vals);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
+ tu_cs_emit(cs, clear_vals[0]);
+ tu_cs_emit(cs, clear_vals[1]);
+ tu_cs_emit(cs, clear_vals[2]);
+ tu_cs_emit(cs, clear_vals[3]);
+
+ tu6_emit_event_write(cmd, cs, BLIT, false);
+ }
+ }
+}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_meta_copy.c b/lib/mesa/src/freedreno/vulkan/tu_meta_copy.c
new file mode 100644
index 000000000..ecded029a
--- /dev/null
+++ b/lib/mesa/src/freedreno/vulkan/tu_meta_copy.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "tu_private.h"
+
+#include "a6xx.xml.h"
+#include "adreno_common.xml.h"
+#include "adreno_pm4.xml.h"
+
+#include "vk_format.h"
+
+#include "tu_cs.h"
+#include "tu_blit.h"
+
+/* Record a buffer-to-buffer copy by expressing the byte range as a
+ * 1-texel-high R8 blit whose width is the byte count. Both BOs are
+ * registered on the command buffer's submit list first (src read-only,
+ * dst writable).
+ *
+ * NOTE(review): region->size is a 64-bit VkDeviceSize but is assigned to
+ * the 32-bit blit width -- presumably callers never exceed the hardware's
+ * maximum blit width; confirm, or split oversized copies.
+ */
+static void
+tu_copy_buffer(struct tu_cmd_buffer *cmd,
+ struct tu_buffer *src,
+ struct tu_buffer *dst,
+ const VkBufferCopy *region)
+{
+ tu_bo_list_add(&cmd->bo_list, src->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmd->bo_list, dst->bo, MSM_SUBMIT_BO_WRITE);
+
+ /* R8_UNORM makes one texel == one byte, so a raw memcpy-style blit. */
+ tu_blit(cmd, &(struct tu_blit) {
+ .dst = {
+ .fmt = VK_FORMAT_R8_UNORM,
+ .va = tu_buffer_iova(dst) + region->dstOffset,
+ .width = region->size,
+ .height = 1,
+ .samples = 1,
+ },
+ .src = {
+ .fmt = VK_FORMAT_R8_UNORM,
+ .va = tu_buffer_iova(src) + region->srcOffset,
+ .width = region->size,
+ .height = 1,
+ .samples = 1,
+ },
+ .layers = 1,
+ .type = TU_BLIT_COPY,
+ .buffer = true,
+ });
+}
+
+/* Build a tu_blit_surf that describes buffer memory laid out as a linear
+ * image, for buffer<->image copies. Follows the Vulkan convention that a
+ * bufferRowLength/bufferImageHeight of 0 means "tightly packed" (fall back
+ * to the image extent); the GNU `?:` elvis operator implements that.
+ * A stencil-only aspect is accessed as R8 (one byte per stencil texel).
+ */
+static struct tu_blit_surf
+tu_blit_buffer(struct tu_buffer *buffer,
+ VkFormat format,
+ const VkBufferImageCopy *info)
+{
+ if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
+ format = VK_FORMAT_R8_UNORM;
+
+ /* pitch is in bytes: texels (or blocks) per row times block size. */
+ unsigned pitch = (info->bufferRowLength ?: info->imageExtent.width) *
+ vk_format_get_blocksize(format);
+
+ return (struct tu_blit_surf) {
+ .fmt = format,
+ .tile_mode = TILE6_LINEAR,
+ .va = tu_buffer_iova(buffer) + info->bufferOffset,
+ .pitch = pitch,
+ /* rows * pitch, scaled down for compressed formats where a block
+ * covers blockwidth x blockheight texels. */
+ .layer_size = (info->bufferImageHeight ?: info->imageExtent.height) * pitch / vk_format_get_blockwidth(format) / vk_format_get_blockheight(format),
+ .width = info->imageExtent.width,
+ .height = info->imageExtent.height,
+ .samples = 1,
+ };
+}
+
+/* Record one VkBufferImageCopy region from a buffer into an image via the
+ * blitter. The stencil aspect of a packed 32-bit depth/stencil format is
+ * not implemented yet (tu_finishme) and is silently dropped.
+ */
+static void
+tu_copy_buffer_to_image(struct tu_cmd_buffer *cmdbuf,
+ struct tu_buffer *src_buffer,
+ struct tu_image *dst_image,
+ const VkBufferImageCopy *info)
+{
+ if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT &&
+ vk_format_get_blocksize(dst_image->vk_format) == 4) {
+ tu_finishme("aspect mask\n");
+ return;
+ }
+
+ tu_blit(cmdbuf, &(struct tu_blit) {
+ .dst = tu_blit_surf_ext(dst_image, info->imageSubresource, info->imageOffset, info->imageExtent),
+ .src = tu_blit_buffer(src_buffer, dst_image->vk_format, info),
+ /* 3D copies iterate depth slices; array copies iterate layers --
+ * only one of the two can exceed 1, so MAX2 picks the loop count. */
+ .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount),
+ .type = TU_BLIT_COPY,
+ });
+}
+
+/* Record one VkBufferImageCopy region from an image into a buffer: same
+ * blit as tu_copy_buffer_to_image with src/dst surfaces swapped.
+ * NOTE(review): unlike the buffer->image path, there is no early-out for
+ * the stencil aspect of packed depth/stencil formats here -- confirm that
+ * case is either unreachable or handled by tu_blit.
+ */
+static void
+tu_copy_image_to_buffer(struct tu_cmd_buffer *cmdbuf,
+ struct tu_image *src_image,
+ struct tu_buffer *dst_buffer,
+ const VkBufferImageCopy *info)
+{
+ tu_blit(cmdbuf, &(struct tu_blit) {
+ .dst = tu_blit_buffer(dst_buffer, src_image->vk_format, info),
+ .src = tu_blit_surf_ext(src_image, info->imageSubresource, info->imageOffset, info->imageExtent),
+ .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount),
+ .type = TU_BLIT_COPY,
+ });
+}
+
+/* Record one VkImageCopy region between two images. As in the
+ * buffer->image path, a stencil-only copy involving a packed 32-bit
+ * depth/stencil format (on either side) is unimplemented and dropped
+ * with tu_finishme.
+ */
+static void
+tu_copy_image_to_image(struct tu_cmd_buffer *cmdbuf,
+ struct tu_image *src_image,
+ struct tu_image *dst_image,
+ const VkImageCopy *info)
+{
+ if ((info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT &&
+ vk_format_get_blocksize(dst_image->vk_format) == 4) ||
+ (info->srcSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT &&
+ vk_format_get_blocksize(src_image->vk_format) == 4)) {
+ tu_finishme("aspect mask\n");
+ return;
+ }
+
+ tu_blit(cmdbuf, &(struct tu_blit) {
+ .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent),
+ .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent),
+ .layers = info->extent.depth,
+ .type = TU_BLIT_COPY,
+ });
+}
+
+/* vkCmdCopyBuffer entry point: unwrap the handles and emit one blit per
+ * region. BO tracking is done inside tu_copy_buffer.
+ */
+void
+tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
+ VkBuffer srcBuffer,
+ VkBuffer destBuffer,
+ uint32_t regionCount,
+ const VkBufferCopy *pRegions)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
+ TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer);
+
+ for (unsigned i = 0; i < regionCount; ++i)
+ tu_copy_buffer(cmdbuf, src_buffer, dst_buffer, &pRegions[i]);
+}
+
+/* vkCmdCopyBufferToImage entry point: register both BOs on the submit
+ * list (src buffer read, dst image write), then emit one copy per region.
+ * The image layout parameter is unused -- GMEM/blit copies here do not
+ * depend on it.
+ */
+void
+tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
+ VkBuffer srcBuffer,
+ VkImage destImage,
+ VkImageLayout destImageLayout,
+ uint32_t regionCount,
+ const VkBufferImageCopy *pRegions)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, dst_image, destImage);
+ TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
+
+ tu_bo_list_add(&cmdbuf->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (unsigned i = 0; i < regionCount; ++i)
+ tu_copy_buffer_to_image(cmdbuf, src_buffer, dst_image, pRegions + i);
+}
+
+/* vkCmdCopyImageToBuffer entry point: mirror of tu_CmdCopyBufferToImage
+ * with the read/write roles swapped. The source image layout is ignored.
+ */
+void
+tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
+ VkImage srcImage,
+ VkImageLayout srcImageLayout,
+ VkBuffer destBuffer,
+ uint32_t regionCount,
+ const VkBufferImageCopy *pRegions)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, src_image, srcImage);
+ TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer);
+
+ tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmdbuf->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (unsigned i = 0; i < regionCount; ++i)
+ tu_copy_image_to_buffer(cmdbuf, src_image, dst_buffer, pRegions + i);
+}
+
+/* vkCmdCopyImage entry point: register both image BOs, then emit one
+ * image-to-image blit per region. Both layout parameters are unused.
+ */
+void
+tu_CmdCopyImage(VkCommandBuffer commandBuffer,
+ VkImage srcImage,
+ VkImageLayout srcImageLayout,
+ VkImage destImage,
+ VkImageLayout destImageLayout,
+ uint32_t regionCount,
+ const VkImageCopy *pRegions)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
+ TU_FROM_HANDLE(tu_image, src_image, srcImage);
+ TU_FROM_HANDLE(tu_image, dst_image, destImage);
+
+ tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (uint32_t i = 0; i < regionCount; ++i)
+ tu_copy_image_to_image(cmdbuf, src_image, dst_image, pRegions + i);
+}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_meta_resolve.c b/lib/mesa/src/freedreno/vulkan/tu_meta_resolve.c
new file mode 100644
index 000000000..b879f84bd
--- /dev/null
+++ b/lib/mesa/src/freedreno/vulkan/tu_meta_resolve.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "tu_private.h"
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "nir/nir_builder.h"
+#include "vk_format.h"
+
+#include "tu_blit.h"
+
+/* Record one VkImageResolve region as a blit from the multisampled source
+ * to the single-sampled destination.
+ * NOTE(review): unlike the copy paths, no .type is set here, so the
+ * zero-initialized default blit type is used -- presumably that default
+ * performs the MSAA resolve; confirm against tu_blit.
+ */
+static void
+tu_resolve_image(struct tu_cmd_buffer *cmdbuf,
+ struct tu_image *src_image,
+ struct tu_image *dst_image,
+ const VkImageResolve *info)
+{
+ /* The Vulkan spec requires matching layer counts for resolves. */
+ assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount);
+
+ tu_blit(cmdbuf, &(struct tu_blit) {
+ .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent),
+ .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent),
+ .layers = MAX2(info->extent.depth, info->dstSubresource.layerCount)
+ });
+}
+
+/* vkCmdResolveImage entry point: register both image BOs on the submit
+ * list, then emit one resolve blit per region. The layout parameters are
+ * unused by this implementation.
+ */
+void
+tu_CmdResolveImage(VkCommandBuffer cmd_buffer_h,
+ VkImage src_image_h,
+ VkImageLayout src_image_layout,
+ VkImage dest_image_h,
+ VkImageLayout dest_image_layout,
+ uint32_t region_count,
+ const VkImageResolve *regions)
+{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, cmd_buffer_h);
+ TU_FROM_HANDLE(tu_image, src_image, src_image_h);
+ TU_FROM_HANDLE(tu_image, dst_image, dest_image_h);
+
+ tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
+ tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
+
+ for (uint32_t i = 0; i < region_count; ++i)
+ tu_resolve_image(cmdbuf, src_image, dst_image, regions + i);
+}