author | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
---|---|---
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
commit | 4c58069f5013f0a621503525f7d5193bfe9976b3 (patch) |
tree | bd8f8a08b889e9a8b99c9de01ae12459d527ea6d /lib/mesa/src/gallium/drivers/vc4 |
parent | 5caa025e6b62d0456faad86c89f239a14d1eaadb (diff) |
Import Mesa 17.1.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
49 files changed, 2271 insertions, 796 deletions
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.am b/lib/mesa/src/gallium/drivers/vc4/Makefile.am index 19fc38759..b361a0c58 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.am +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.am @@ -40,3 +40,11 @@ noinst_LTLIBRARIES = libvc4.la libvc4_la_SOURCES = $(C_SOURCES) libvc4_la_LIBADD = $(SIM_LIB) $(VC4_LIBS) libvc4_la_LDFLAGS = $(SIM_LDFLAGS) + +noinst_LTLIBRARIES += libvc4_neon.la +libvc4_la_LIBADD += libvc4_neon.la + +libvc4_neon_la_SOURCES = vc4_tiling_lt.c +libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -DVC4_BUILD_NEON + +EXTRA_DIST = kernel/README diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources index e1496d101..10de34361 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources @@ -28,6 +28,7 @@ C_SOURCES := \ vc4_opt_peephole_sf.c \ vc4_opt_small_immediates.c \ vc4_opt_vpm.c \ + vc4_opt_coalesce_ff_writes.c \ vc4_program.c \ vc4_qir.c \ vc4_qir_emit_uniform_stream_resets.c \ @@ -54,6 +55,7 @@ C_SOURCES := \ vc4_simulator_validate.h \ vc4_state.c \ vc4_tiling.c \ + vc4_tiling_lt.c \ vc4_tiling.h \ vc4_uniforms.c \ $() diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/README b/lib/mesa/src/gallium/drivers/vc4/kernel/README new file mode 100644 index 000000000..89e4442b4 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/README @@ -0,0 +1,6 @@ +This is a mirror of the kernel validation code into the userspace GL library. +It is only built when USE_VC4_SIMULATOR is defined, for compiling the driver +on an x86 system with the simpenrose simulator. It allows testing of changes +across the kernel and userspace with exposure to most of the software stack, +on a higher-performance and more-debuggable environment than the native +hardware. diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h index 90f45397d..8f5ed00d9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -150,6 +150,8 @@ struct vc4_validated_shader_info uint32_t num_uniform_addr_offsets; uint32_t *uniform_addr_offsets; + + bool is_threaded; }; /* vc4_validate.c */ diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c index 4ef01108b..bd193b993 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -640,6 +640,13 @@ reloc_tex(struct vc4_exec_info *exec, cpp = 1; break; case VC4_TEXTURE_TYPE_ETC1: + /* ETC1 is arranged as 64-bit blocks, where each block is 4x4 + * pixels. 
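The ETC1 handling in this reloc_tex() hunk treats the texture as an array of 64-bit blocks, each covering 4x4 pixels, so the code that follows switches to cpp = 8 and rounds both dimensions up to whole blocks. A minimal sketch of the resulting level-size computation (the helper name is illustrative, not from the diff):

```c
#include <stdint.h>

/* Bytes in one ETC1 mip level: 8 bytes (64 bits) per 4x4-pixel block,
 * with the dimensions rounded up to whole blocks. */
static uint32_t
etc1_level_size(uint32_t width, uint32_t height)
{
        uint32_t blocks_w = (width + 3) >> 2;
        uint32_t blocks_h = (height + 3) >> 2;

        return blocks_w * blocks_h * 8;
}
```

A 13x7 level, for example, rounds up to 4x2 blocks and occupies 64 bytes.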
+ */ + cpp = 8; + width = (width + 3) >> 2; + height = (height + 3) >> 2; + break; case VC4_TEXTURE_TYPE_BW1: case VC4_TEXTURE_TYPE_A4: case VC4_TEXTURE_TYPE_A1: @@ -773,11 +780,6 @@ validate_gl_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { - DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); - return -EINVAL; - } - for (i = 0; i < shader_reloc_count; i++) { if (src_handles[i] > exec->bo_count) { DRM_ERROR("Shader handle %d too big\n", src_handles[i]); @@ -794,6 +796,18 @@ validate_gl_shader_rec(struct drm_device *dev, return -EINVAL; } + if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) != + to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) { + DRM_ERROR("Thread mode of CL and FS do not match\n"); + return -EINVAL; + } + + if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded || + to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) { + DRM_ERROR("cs and vs cannot be threaded\n"); + return -EINVAL; + } + for (i = 0; i < shader_reloc_count; i++) { struct vc4_validated_shader_info *validated_shader; uint32_t o = shader_reloc_offsets[i]; diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c index 82717ca55..d93f5239d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c @@ -84,6 +84,14 @@ struct vc4_shader_validation_state { * basic blocks. */ bool needs_uniform_address_for_loop; + + /* Set when we find an instruction which violates the criterion for a + * threaded shader. These are: + * - only write the lower half of the register space + * - last thread switch signaled at the end + * So track the usage of the thread switches and the register usage. + */ + bool all_registers_used; }; static uint32_t @@ -119,6 +127,12 @@ raddr_add_a_to_live_reg_index(uint64_t inst) return ~0; } +static bool live_reg_is_upper_half(uint32_t lri) +{ + return (lri >=16 && lri < 32) || + (lri >=32 + 16 && lri < 32 + 32); +} + static bool is_tmu_submit(uint32_t waddr) { @@ -385,6 +399,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader, } else { validation_state->live_immediates[lri] = ~0; } + + if (live_reg_is_upper_half(lri)) + validation_state->all_registers_used = true; } switch (waddr) { @@ -593,6 +610,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader, } } + if ((raddr_a >= 16 && raddr_a < 32) || + (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) { + validation_state->all_registers_used = true; + } + return true; } @@ -603,9 +625,7 @@ static bool vc4_validate_branches(struct vc4_shader_validation_state *validation_state) { uint32_t max_branch_target = 0; - bool found_shader_end = false; int ip; - int shader_end_ip = 0; int last_branch = -2; for (ip = 0; ip < validation_state->max_ip; ip++) { @@ -616,8 +636,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) uint32_t branch_target_ip; if (sig == QPU_SIG_PROG_END) { - shader_end_ip = ip; - found_shader_end = true; + /* There are two delay slots after program end is + * signaled that are still executed, then we're + * finished. validation_state->max_ip is the + * instruction after the last valid instruction in the + * program. 
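The delay-slot bookkeeping here replaces the old found_shader_end/shader_end_ip pair: once max_ip is pinned to three past QPU_SIG_PROG_END, the branch-target bound falls out arithmetically. A hedged restatement:

```c
#include <stdbool.h>
#include <stdint.h>

/* Two delay-slot instructions still execute after QPU_SIG_PROG_END, so
 * max_ip (one past the last instruction that runs) is prog_end_ip + 3,
 * and a branch target is legal only if it does not land after
 * prog_end_ip itself. */
static bool
branch_target_ok(uint32_t max_branch_target, uint32_t max_ip)
{
        uint32_t prog_end_ip = max_ip - 3;

        return max_branch_target <= prog_end_ip;
}
```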
+ */ + validation_state->max_ip = ip + 3; continue; } @@ -671,15 +696,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) } set_bit(after_delay_ip, validation_state->branch_targets); max_branch_target = max(max_branch_target, after_delay_ip); - - /* There are two delay slots after program end is signaled - * that are still executed, then we're finished. - */ - if (found_shader_end && ip == shader_end_ip + 2) - break; } - if (max_branch_target > shader_end_ip) { + if (max_branch_target > validation_state->max_ip - 3) { DRM_ERROR("Branch landed after QPU_SIG_PROG_END"); return false; } @@ -751,6 +770,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) { bool found_shader_end = false; int shader_end_ip = 0; + uint32_t last_thread_switch_ip = -3; uint32_t ip; struct vc4_validated_shader_info *validated_shader = NULL; struct vc4_shader_validation_state validation_state; @@ -783,6 +803,16 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!vc4_handle_branch_target(&validation_state)) goto fail; + if (ip == last_thread_switch_ip + 3) { + /* Reset r0-r3 live clamp data */ + int i; + for (i = 64; i < LIVE_REG_COUNT; i++) { + validation_state.live_min_clamp_offsets[i] = ~0; + validation_state.live_max_clamp_regs[i] = false; + validation_state.live_immediates[i] = ~0; + } + } + switch (sig) { case QPU_SIG_NONE: case QPU_SIG_WAIT_FOR_SCOREBOARD: @@ -792,6 +822,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) case QPU_SIG_LOAD_TMU1: case QPU_SIG_PROG_END: case QPU_SIG_SMALL_IMM: + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: if (!check_instruction_writes(validated_shader, &validation_state)) { DRM_ERROR("Bad write at ip %d\n", ip); @@ -807,6 +839,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) shader_end_ip = ip; } + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + validated_shader->is_threaded = true; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Thread switch too soon after " + "last switch at ip %d\n", ip); + goto fail; + } + last_thread_switch_ip = ip; + } + break; case QPU_SIG_LOAD_IMM: @@ -821,6 +865,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!check_branch(inst, validated_shader, &validation_state, ip)) goto fail; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Branch in thread switch at ip %d", + ip); + goto fail; + } + break; default: DRM_ERROR("Unsupported QPU signal %d at " @@ -842,6 +893,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) goto fail; } + /* Might corrupt other thread */ + if (validated_shader->is_threaded && + validation_state.all_registers_used) { + DRM_ERROR("Shader uses threading, but uses the upper " + "half of the registers, too\n"); + goto fail; + } + /* If we did a backwards branch and we haven't emitted a uniforms * reset since then, we still need the uniforms stream to have the * uniforms address available so that the backwards branch can do its diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c index 1e056568a..0e4ab5bfa 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c @@ -212,14 +212,16 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) if (vc4_tile_blit(pctx, blit_info)) return; - if (util_try_blit_via_copy_region(pctx, &info)) { - return; /* done */ - } - if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping\n"); + 
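The threaded-FS rules the validator now tracks boil down to two predicates over live-register indices and instruction pointers (file A occupies live-reg indices 0-31 and file B 32-63, matching live_reg_is_upper_half() above; helpers below are a standalone restatement):

```c
#include <stdbool.h>
#include <stdint.h>

/* Threaded fragment shaders share the physical register files with the
 * other thread, so only ra0-15 and rb0-15 (the lower half of each
 * file) may be touched. */
static bool
reg_is_upper_half(uint32_t live_reg_index)
{
        return (live_reg_index >= 16 && live_reg_index < 32) ||
               (live_reg_index >= 32 + 16 && live_reg_index < 32 + 32);
}

/* A thread switch has two delay slots, so another switch or a branch
 * within three instructions of the previous switch would land inside
 * them. */
static bool
thrsw_spacing_ok(uint32_t ip, uint32_t last_thread_switch_ip)
{
        return ip >= last_thread_switch_ip + 3;
}
```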
if (util_try_blit_via_copy_region(pctx, &info)) + return; + info.mask &= ~PIPE_MASK_S; + fprintf(stderr, "cannot blit stencil, skipping\n"); } - vc4_render_blit(pctx, &info); + if (vc4_render_blit(pctx, &info)) + return; + + fprintf(stderr, "Unsupported blit\n"); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c index cf6a5114b..12af7f8a9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -97,7 +97,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) return NULL; struct vc4_bo *bo = NULL; - pipe_mutex_lock(cache->lock); + mtx_lock(&cache->lock); if (!list_empty(&cache->size_list[page_index])) { bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next, size_list); @@ -107,7 +107,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * user will proceed to CPU map it and fill it with stuff. */ if (!vc4_bo_wait(bo, 0, NULL)) { - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); return NULL; } @@ -116,7 +116,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; } - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); return bo; } @@ -148,28 +148,17 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; bo->private = true; + retry: + ; + bool cleared_and_retried = false; -retry: - if (!using_vc4_simulator) { - struct drm_vc4_create_bo create; - memset(&create, 0, sizeof(create)); - - create.size = size; - - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create); - bo->handle = create.handle; - } else { - struct drm_mode_create_dumb create; - memset(&create, 0, sizeof(create)); - - create.width = 128; - create.bpp = 8; - create.height = (size + 127) / 128; - - ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); - bo->handle = create.handle; - assert(create.size >= size); - } + struct drm_vc4_create_bo create = { + .size = size + }; + + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create); + bo->handle = create.handle; + if (ret != 0) { if (!list_empty(&screen->bo_cache.time_list) && !cleared_and_retried) { @@ -199,9 +188,9 @@ vc4_bo_last_unreference(struct vc4_bo *bo) struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); - pipe_mutex_lock(screen->bo_cache.lock); + mtx_lock(&screen->bo_cache.lock); vc4_bo_last_unreference_locked_timed(bo, time.tv_sec); - pipe_mutex_unlock(screen->bo_cache.lock); + mtx_unlock(&screen->bo_cache.lock); } static void @@ -210,20 +199,19 @@ vc4_bo_free(struct vc4_bo *bo) struct vc4_screen *screen = bo->screen; if (bo->map) { -#ifdef USE_VC4_SIMULATOR - if (bo->simulator_winsys_map) { + if (using_vc4_simulator && bo->name && + strcmp(bo->name, "winsys") == 0) { free(bo->map); - bo->map = bo->simulator_winsys_map; + } else { + munmap(bo->map, bo->size); + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); } -#endif - munmap(bo->map, bo->size); - VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); } struct drm_gem_close c; memset(&c, 0, sizeof(c)); c.handle = bo->handle; - int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); if (ret != 0) fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); @@ -273,13 +261,13 @@ free_stale_bos(struct vc4_screen *screen, time_t time) static void vc4_bo_cache_free_all(struct vc4_bo_cache *cache) { - pipe_mutex_lock(cache->lock); + mtx_lock(&cache->lock); 
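With the dumb-buffer fallback gone, both the hardware and simulator paths above funnel through one CREATE_BO ioctl; the retry logic surrounding it (only partially visible in this hunk) evicts the userspace BO cache once before giving up. A sketch, assuming the libdrm drmIoctl() entry point and the vc4_drm.h UAPI:

```c
#include <stdbool.h>
#include <stdint.h>
#include <xf86drm.h>
#include "vc4_drm.h"

/* Try DRM_IOCTL_VC4_CREATE_BO; on failure, give the caller one chance
 * to evict cached BOs and retry before reporting the error. */
static int
vc4_create_bo_with_retry(int fd, uint32_t size, uint32_t *handle,
                         void (*evict_cache)(void))
{
        bool retried = false;

        for (;;) {
                struct drm_vc4_create_bo create = {
                        .size = size,
                };
                int ret = drmIoctl(fd, DRM_IOCTL_VC4_CREATE_BO, &create);

                if (ret == 0) {
                        *handle = create.handle;
                        return 0;
                }
                if (retried)
                        return ret;
                retried = true;
                evict_cache();
        }
}
```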
list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, time_list) { vc4_bo_remove_from_cache(cache, bo); vc4_bo_free(bo); } - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); } void @@ -301,17 +289,8 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) /* Move old list contents over (since the array has moved, and * therefore the pointers to the list heads have to change). */ - for (int i = 0; i < cache->size_list_size; i++) { - struct list_head *old_head = &cache->size_list[i]; - if (list_empty(old_head)) - list_inithead(&new_list[i]); - else { - new_list[i].next = old_head->next; - new_list[i].prev = old_head->prev; - new_list[i].next->prev = &new_list[i]; - new_list[i].prev->next = &new_list[i]; - } - } + for (int i = 0; i < cache->size_list_size; i++) + list_replace(&cache->size_list[i], &new_list[i]); for (int i = cache->size_list_size; i < page_index + 1; i++) list_inithead(&new_list[i]); @@ -343,7 +322,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, assert(size); - pipe_mutex_lock(screen->bo_handles_mutex); + mtx_lock(&screen->bo_handles_mutex); bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle); if (bo) { @@ -360,16 +339,15 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo->private = false; #ifdef USE_VC4_SIMULATOR - vc4_bo_map(bo); - bo->simulator_winsys_map = bo->map; - bo->simulator_winsys_stride = winsys_stride; + vc4_simulator_open_from_handle(screen->fd, winsys_stride, + bo->handle, bo->size); bo->map = malloc(bo->size); #endif util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo); done: - pipe_mutex_unlock(screen->bo_handles_mutex); + mtx_unlock(&screen->bo_handles_mutex); return bo; } @@ -380,7 +358,7 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, struct drm_gem_open o = { .name = name }; - int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o); if (ret) { fprintf(stderr, "Failed to open bo %d: %s\n", name, strerror(errno)); @@ -423,10 +401,10 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) return -1; } - pipe_mutex_lock(bo->screen->bo_handles_mutex); + mtx_lock(&bo->screen->bo_handles_mutex); bo->private = false; util_hash_table_set(bo->screen->bo_handles, (void *)(uintptr_t)bo->handle, bo); - pipe_mutex_unlock(bo->screen->bo_handles_mutex); + mtx_unlock(&bo->screen->bo_handles_mutex); return fd; } @@ -447,30 +425,15 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) bo->name = "code"; bo->private = false; /* Make sure it doesn't go back to the cache. 
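list_replace() above is the util/list.h helper that the removed open-coded pointer surgery reimplemented; a minimal standalone equivalent of what the loop body now does per size bucket:

```c
struct list_head {
        struct list_head *prev;
        struct list_head *next;
};

/* Splice `to` into the list position held by `from`, covering the
 * empty-list case the old code tested with list_empty(). */
static void
list_replace(struct list_head *from, struct list_head *to)
{
        if (from->next == from) {
                /* Empty list: make `to` an empty head of its own. */
                to->prev = to;
                to->next = to;
        } else {
                to->prev = from->prev;
                to->next = from->next;
                to->next->prev = to;
                to->prev->next = to;
        }
}
```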
*/ - if (!using_vc4_simulator) { - struct drm_vc4_create_shader_bo create = { - .size = size, - .data = (uintptr_t)data, - }; - - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, - &create); - bo->handle = create.handle; - } else { - struct drm_mode_create_dumb create; - memset(&create, 0, sizeof(create)); - - create.width = 128; - create.bpp = 8; - create.height = (size + 127) / 128; - - ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); - bo->handle = create.handle; - assert(create.size >= size); - - vc4_bo_map(bo); - memcpy(bo->map, data, size); - } + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + if (ret != 0) { fprintf(stderr, "create shader ioctl failure\n"); abort(); @@ -492,7 +455,7 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) struct drm_gem_flink flink = { .handle = bo->handle, }; - int ret = drmIoctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink); + int ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink); if (ret) { fprintf(stderr, "Failed to flink bo %d: %s\n", bo->handle, strerror(errno)); @@ -508,14 +471,11 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) { - if (using_vc4_simulator) - return 0; - struct drm_vc4_wait_seqno wait = { .seqno = seqno, .timeout_ns = timeout_ns, }; - int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); if (ret == -1) return -errno; else @@ -553,14 +513,11 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) { - if (using_vc4_simulator) - return 0; - struct drm_vc4_wait_bo wait = { .handle = handle, .timeout_ns = timeout_ns, }; - int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); if (ret == -1) return -errno; else @@ -602,19 +559,11 @@ vc4_bo_map_unsynchronized(struct vc4_bo *bo) if (bo->map) return bo->map; - if (!using_vc4_simulator) { - struct drm_vc4_mmap_bo map; - memset(&map, 0, sizeof(map)); - map.handle = bo->handle; - ret = drmIoctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map); - offset = map.offset; - } else { - struct drm_mode_map_dumb map; - memset(&map, 0, sizeof(map)); - map.handle = bo->handle; - ret = drmIoctl(bo->screen->fd, DRM_IOCTL_MODE_MAP_DUMB, &map); - offset = map.offset; - } + struct drm_vc4_mmap_bo map; + memset(&map, 0, sizeof(map)); + map.handle = bo->handle; + ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map); + offset = map.offset; if (ret != 0) { fprintf(stderr, "map ioctl failure\n"); abort(); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h index 71a442648..838314f43 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -39,11 +39,6 @@ struct vc4_bo { uint32_t handle; uint32_t size; -#ifdef USE_VC4_SIMULATOR - void *simulator_winsys_map; - uint32_t simulator_winsys_stride; -#endif - /** Entry in the linked list of buffers freed, by age. */ struct list_head time_list; /** Entry in the per-page-count linked list of buffers freed (by age). 
*/ @@ -98,7 +93,7 @@ vc4_bo_unreference(struct vc4_bo **bo) vc4_bo_last_unreference(*bo); } else { screen = (*bo)->screen; - pipe_mutex_lock(screen->bo_handles_mutex); + mtx_lock(&screen->bo_handles_mutex); if (pipe_reference(&(*bo)->reference, NULL)) { util_hash_table_remove(screen->bo_handles, @@ -106,7 +101,7 @@ vc4_bo_unreference(struct vc4_bo **bo) vc4_bo_last_unreference(*bo); } - pipe_mutex_unlock(screen->bo_handles_mutex); + mtx_unlock(&screen->bo_handles_mutex); } *bo = NULL; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c index afb9987f4..35578370e 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c @@ -28,7 +28,7 @@ void vc4_init_cl(void *mem_ctx, struct vc4_cl *cl) { - cl->base = ralloc_size(mem_ctx, 1); + cl->base = rzalloc_size(mem_ctx, 1); /* TODO: don't use rzalloc */ cl->next = cl->base; cl->size = 0; } @@ -76,5 +76,7 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) cl_ptr(&out, vc4_bo_reference(bo)); cl_end(&job->bo_pointers, out); + job->bo_space += bo->size; + return hindex; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c index 974df8a1d..401c160fc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c @@ -144,7 +144,12 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) vc4->fd = screen->fd; slab_create_child(&vc4->transfer_pool, &screen->transfer_pool); - vc4->blitter = util_blitter_create(pctx); + + vc4->uploader = u_upload_create_default(&vc4->base); + vc4->base.stream_uploader = vc4->uploader; + vc4->base.const_uploader = vc4->uploader; + + vc4->blitter = util_blitter_create(pctx); if (!vc4->blitter) goto fail; @@ -153,10 +158,6 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!vc4->primconvert) goto fail; - vc4->uploader = u_upload_create(pctx, 16 * 1024, - PIPE_BIND_INDEX_BUFFER, - PIPE_USAGE_STREAM); - vc4_debug |= saved_shaderdb_flag; vc4->sample_mask = (1 << VC4_MAX_SAMPLES) - 1; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h index c164eba80..6bd2424ec 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h @@ -30,6 +30,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/slab.h" +#include "xf86drm.h" #define __user #include "vc4_drm.h" @@ -38,6 +39,13 @@ #include "vc4_cl.h" #include "vc4_qir.h" +#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1 +#define DRM_VC4_PARAM_SUPPORTS_ETC1 4 +#endif +#ifndef DRM_VC4_PARAM_SUPPORTS_THREADED_FS +#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 +#endif + #ifdef USE_VC4_SIMULATOR #define using_vc4_simulator true #else @@ -162,6 +170,8 @@ struct vc4_compiled_shader { */ bool failed; + bool fs_threaded; + uint8_t num_inputs; /* Byte offsets for the start of the vertex attributes 0-7, and the @@ -218,6 +228,13 @@ struct vc4_job { struct vc4_cl bo_handles; struct vc4_cl bo_pointers; uint32_t shader_rec_count; + /** + * Amount of memory used by the BOs in bo_pointers. + * + * Used for checking when we should flush the job early so we don't + * OOM. + */ + uint32_t bo_space; /** @{ Surfaces to submit rendering for. 
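The bo_space counter documented above is incremented in vc4_gem_hindex() as each BO is attached to the job, and it drives the early flush added to vc4_draw_vbo() further down. Condensed (threshold from that hunk, the helper itself hypothetical):

```c
#include <stdbool.h>
#include <stdint.h>

/* Flush the job once it references more than half of the presumed
 * 256MB CMA pool, so the kernel can still pin every BO at submit. */
static bool
job_should_flush_early(uint32_t bo_space)
{
        return bo_space > 128u * 1024 * 1024;
}
```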
*/ struct pipe_surface *color_read; @@ -317,11 +334,12 @@ struct vc4_context { uint64_t next_compiled_program_id; struct ra_regs *regs; - unsigned int reg_class_any; - unsigned int reg_class_a_or_b_or_acc; + unsigned int reg_class_any[2]; + unsigned int reg_class_a_or_b[2]; + unsigned int reg_class_a_or_b_or_acc[2]; unsigned int reg_class_r0_r3; - unsigned int reg_class_r4_or_a; - unsigned int reg_class_a; + unsigned int reg_class_r4_or_a[2]; + unsigned int reg_class_a[2]; uint8_t prim_mode; @@ -433,6 +451,18 @@ void vc4_simulator_destroy(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, struct vc4_job *job); +int vc4_simulator_ioctl(int fd, unsigned long request, void *arg); +void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, + int handle, uint32_t size); + +static inline int +vc4_ioctl(int fd, unsigned long request, void *arg) +{ + if (using_vc4_simulator) + return vc4_simulator_ioctl(fd, request, arg); + else + return drmIoctl(fd, request, arg); +} void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c index c5afc0cda..ebd080298 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c @@ -155,7 +155,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | - VC4_SHADER_FLAG_FS_SINGLE_THREAD | + (vc4->prog.fs->fs_threaded ? + 0 : VC4_SHADER_FLAG_FS_SINGLE_THREAD) | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); @@ -465,6 +466,13 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) job->resolve |= PIPE_CLEAR_COLOR0; + /* If we've used half of the presumably 256MB CMA area, flush the job + * so that we don't accumulate a job that will end up not being + * executable. + */ + if (job->bo_space > 128 * 1024 * 1024) + vc4_flush(pctx); + if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH) vc4_flush(pctx); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c index 9258ceebe..b48d89a06 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c @@ -76,6 +76,7 @@ vc4_emit_state(struct pipe_context *pctx) VC4_DIRTY_ZSA | VC4_DIRTY_COMPILED_FS)) { uint8_t ez_enable_mask_out = ~0; + uint8_t rasosm_mask_out = ~0; /* HW-2905: If the RCL ends up doing a full-res load when * multisampling, then early Z tracking may end up with values @@ -89,10 +90,20 @@ vc4_emit_state(struct pipe_context *pctx) if (job->msaa || vc4->prog.fs->disable_early_z) ez_enable_mask_out &= ~VC4_CONFIG_BITS_EARLY_Z; + /* Don't set the rasterizer to oversample if we're doing our + * binning and load/stores in single-sample mode. This is for + * the samples == 1 case, where vc4 doesn't do any + * multisampling behavior. 
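The oversample masking described here can be read as one pure function over the packed config bytes (the VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X bit comes from the driver's packet definitions; the helper itself is illustrative):

```c
#include <stdbool.h>
#include <stdint.h>

/* When the job is not MSAA, strip the 4x oversample bit from the
 * combined rasterizer/ZSA byte before emitting CONFIGURATION_BITS. */
static uint8_t
config_bits_byte0(uint8_t rast_bits0, uint8_t zsa_bits0, bool msaa)
{
        uint8_t mask_out = 0xff;

        if (!msaa)
                mask_out &= (uint8_t)~VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;

        return (rast_bits0 | zsa_bits0) & mask_out;
}
```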
+ */ + if (!job->msaa) { + rasosm_mask_out &= + ~VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X; + } + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); cl_u8(&bcl, - vc4->rasterizer->config_bits[0] | - vc4->zsa->config_bits[0]); + (vc4->rasterizer->config_bits[0] | + vc4->zsa->config_bits[0]) & rasosm_mask_out); cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c index dd700cdec..42cdad115 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c @@ -83,6 +83,8 @@ static const struct vc4_format vc4_format_table[] = { FORMAT(B5G6R5_UNORM, RGB565, RGB565, SWIZ(X, Y, Z, 1)), + FORMAT(ETC1_RGB8, NO, ETC1, SWIZ(X, Y, Z, 1)), + /* Depth sampling will be handled by doing nearest filtering and not * unpacking the RGBA value. */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index b7e31b80c..2ed89ead5 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -494,7 +494,7 @@ vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, discard->num_components = 1; discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); nir_builder_instr_insert(b, &discard->instr); - c->s->info.fs.uses_discard = true; + c->s->info->fs.uses_discard = true; } static nir_ssa_def * @@ -630,25 +630,14 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, { nir_ssa_def *frag_color = intr->src[0].ssa; - if (c->fs_key->sample_coverage) { - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(b->shader, - nir_intrinsic_load_sample_mask_in); - load->num_components = 1; - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &load->instr); - - nir_ssa_def *bitmask = &load->dest.ssa; - - vc4_nir_store_sample_mask(c, b, bitmask); - } else if (c->fs_key->sample_alpha_to_coverage) { + if (c->fs_key->sample_alpha_to_coverage) { nir_ssa_def *a = nir_channel(b, frag_color, 3); /* XXX: We should do a nice dither based on the fragment * coordinate, instead. */ nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES); - nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples)); + nir_ssa_def *num_bits = nir_f2i32(b, nir_fmul(b, a, num_samples)); nir_ssa_def *bitmask = nir_isub(b, nir_ishl(b, nir_imm_int(b, 1), @@ -730,4 +719,16 @@ vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c) nir_metadata_dominance); } } + + /* If we didn't do alpha-to-coverage on the output color, we still + * need to pass glSampleMask() through. 
+ */ + if (c->fs_key->sample_coverage && !c->fs_key->sample_alpha_to_coverage) { + nir_function_impl *impl = nir_shader_get_entrypoint(s); + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_after_block(nir_impl_last_block(impl)); + + vc4_nir_store_sample_mask(c, &b, nir_load_sample_mask_in(&b)); + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c index 4a795f8da..b7969a562 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -106,11 +106,11 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) { if (chan->normalized) { return nir_fmul(b, - nir_i2f(b, vpm_reads[swiz]), + nir_i2f32(b, vpm_reads[swiz]), nir_imm_float(b, 1.0 / 0x7fffffff)); } else { - return nir_i2f(b, vpm_reads[swiz]); + return nir_i2f32(b, vpm_reads[swiz]); } } else if (chan->size == 8 && (chan->type == UTIL_FORMAT_TYPE_UNSIGNED || @@ -125,16 +125,16 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, nir_imm_float(b, 1.0)); } else { return nir_fadd(b, - nir_i2f(b, - vc4_nir_unpack_8i(b, temp, - swiz)), + nir_i2f32(b, + vc4_nir_unpack_8i(b, temp, + swiz)), nir_imm_float(b, -128.0)); } } else { if (chan->normalized) { return vc4_nir_unpack_8f(b, vpm, swiz); } else { - return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz)); + return nir_i2f32(b, vc4_nir_unpack_8i(b, vpm, swiz)); } } } else if (chan->size == 16 && @@ -146,7 +146,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, * UNPACK_16_I for all of these. */ if (chan->type == UTIL_FORMAT_TYPE_SIGNED) { - temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1)); + temp = nir_i2f32(b, vc4_nir_unpack_16i(b, vpm, swiz & 1)); if (chan->normalized) { return nir_fmul(b, temp, nir_imm_float(b, 1/32768.0f)); @@ -154,7 +154,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, return temp; } } else { - temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1)); + temp = nir_i2f32(b, vc4_nir_unpack_16u(b, vpm, swiz & 1)); if (chan->normalized) { return nir_fmul(b, temp, nir_imm_float(b, 1 / 65535.0)); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 01ad05d27..5e7d26923 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -94,14 +94,17 @@ static void replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg) { dump_from(c, inst); + + inst->src[0] = arg; + if (qir_has_implicit_tex_uniform(inst)) + inst->src[1] = inst->src[qir_get_tex_uniform_src(inst)]; + if (qir_is_mul(inst)) inst->op = QOP_MMOV; else if (qir_is_float_input(inst)) inst->op = QOP_FMOV; else inst->op = QOP_MOV; - inst->src[0] = arg; - inst->src[1] = c->undef; dump_to(c, inst); } @@ -172,8 +175,12 @@ qir_opt_algebraic(struct vc4_compile *c) break; case QOP_ADD: - if (replace_x_0_with_x(c, inst, 0) || - replace_x_0_with_x(c, inst, 1)) { + /* Kernel validation requires that we use an actual + * add instruction. 
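The guard that follows exists because the kernel validates direct TMU writes by expecting a real ADD of offset and base uniform at the QFILE_TEX_S_DIRECT write. Restated as a predicate (hypothetical helper, types from vc4_qir.h):

```c
#include <stdbool.h>
#include "vc4_qir.h"

/* x + 0 must not be folded to a MOV when the destination is the direct
 * TMU address write: kernel validation requires an actual ADD there. */
static bool
may_fold_add_identity(const struct qinst *inst)
{
        return inst->dst.file != QFILE_TEX_S_DIRECT;
}
```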
+ */ + if (inst->dst.file != QFILE_TEX_S_DIRECT && + (replace_x_0_with_x(c, inst, 0) || + replace_x_0_with_x(c, inst, 1))) { progress = true; break; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c new file mode 100644 index 000000000..e4f8e57fc --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c @@ -0,0 +1,111 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_opt_coalesce_ff_writes.c + * + * This modifies instructions that generate the value consumed by a VPM or TMU + * coordinate write to write directly into the VPM or TMU. + */ + +#include "vc4_qir.h" + +bool +qir_opt_coalesce_ff_writes(struct vc4_compile *c) +{ + /* For now, only do this pass when we don't have control flow. */ + struct qblock *block = qir_entry_block(c); + if (block != qir_exit_block(c)) + return false; + + bool progress = false; + uint32_t use_count[c->num_temps]; + memset(&use_count, 0, sizeof(use_count)); + + qir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_TEMP) { + uint32_t temp = inst->src[i].index; + use_count[temp]++; + } + } + } + + qir_for_each_inst_inorder(mov_inst, c) { + if (!qir_is_raw_mov(mov_inst) || mov_inst->sf) + continue; + if (mov_inst->src[0].file != QFILE_TEMP) + continue; + + if (!(mov_inst->dst.file == QFILE_VPM || + mov_inst->dst.file == QFILE_TLB_COLOR_WRITE || + mov_inst->dst.file == QFILE_TLB_COLOR_WRITE_MS || + qir_is_tex(mov_inst))) + continue; + + uint32_t temp = mov_inst->src[0].index; + if (use_count[temp] != 1) + continue; + + struct qinst *inst = c->defs[temp]; + if (!inst) + continue; + + /* Don't bother trying to fold in an ALU op using a uniform to + * a texture op, as we'll just have to lower the uniform back + * out. + */ + if (qir_is_tex(mov_inst) && qir_has_uniform_read(inst)) + continue; + + if (qir_depends_on_flags(inst) || inst->sf) + continue; + + if (qir_has_side_effects(c, inst) || + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ || + inst->op == QOP_VARY_ADD_C) { + continue; + } + + /* Move the generating instruction into the position of the FF + * write. 
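On a toy QIR sequence, the rewrite this pass performs looks like the following (register names illustrative):

```c
/* Before:                         After:
 *
 *   t3  = FADD t1, t2               (t3 eliminated)
 *   vpm = MOV t3                    vpm = FADD t1, t2
 *
 * The defining instruction is relocated into the MOV's position so
 * that VPM/TMU write ordering is preserved, and any implicit texture
 * uniform is carried over from the MOV. */
```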
+ */ + c->defs[inst->dst.index] = NULL; + inst->dst.file = mov_inst->dst.file; + inst->dst.index = mov_inst->dst.index; + if (qir_has_implicit_tex_uniform(mov_inst)) { + inst->src[qir_get_tex_uniform_src(inst)] = + mov_inst->src[qir_get_tex_uniform_src(mov_inst)]; + } + + list_del(&inst->link); + list_addtail(&inst->link, &mov_inst->link); + + qir_remove_instruction(c, mov_inst); + + progress = true; + } + + return progress; +} diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c index 7ff916155..de642d465 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c @@ -58,7 +58,7 @@ dump_to(struct vc4_compile *c, struct qinst *inst) static bool constant_fold(struct vc4_compile *c, struct qinst *inst) { - int nsrc = qir_get_op_nsrc(inst->op); + int nsrc = qir_get_nsrc(inst); uint32_t ui[nsrc]; for (int i = 0; i < nsrc; i++) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d20ee5e22..9a6320a9a 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,7 +67,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs) bool debug = false; bool progress = false; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP) continue; @@ -113,7 +113,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs) * this instruction doesn't already use it. */ bool already_has_unpack = false; - for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + for (int j = 0; j < qir_get_nsrc(inst); j++) { if (inst->src[j].pack) already_has_unpack = true; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c index 1838c394f..f04d0ff97 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c @@ -54,7 +54,7 @@ dce(struct vc4_compile *c, struct qinst *inst) static bool has_nonremovable_reads(struct vc4_compile *c, struct qinst *inst) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VPM) { uint32_t attr = inst->src[i].index / 4; uint32_t offset = (inst->src[i].index % 4) * 4; @@ -88,7 +88,7 @@ qir_opt_dead_code(struct vc4_compile *c) bool *used = calloc(c->num_temps, sizeof(bool)); qir_for_each_inst_inorder(inst, c) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) used[inst->src[i].index] = true; } @@ -129,7 +129,7 @@ qir_opt_dead_code(struct vc4_compile *c) continue; } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_VPM) continue; uint32_t attr = inst->src[i].index / 4; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c index f4856673b..577290b1f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c @@ -62,7 +62,7 @@ inst_srcs_updated(struct qinst *inst, struct qinst *writer) */ switch (writer->dst.file) { case 
QFILE_TEMP: - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP && inst->src[i].index == writer->dst.index) { return true; @@ -95,7 +95,7 @@ inst_result_equals(struct qinst *a, struct qinst *b) return false; } - for (int i = 0; i < qir_get_op_nsrc(a->op); i++) { + for (int i = 0; i < qir_get_nsrc(a); i++) { if (!qir_reg_equals(a->src[i], b->src[i]) || src_file_varies_on_reread(a->src[i]) || src_file_varies_on_reread(b->src[i])) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c index e97cb63ae..07eca71f2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c @@ -45,7 +45,7 @@ qir_opt_small_immediates(struct vc4_compile *c) * elsewhere). */ bool uses_small_imm = false; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_SMALL_IMM) uses_small_imm = true; } @@ -63,7 +63,7 @@ qir_opt_small_immediates(struct vc4_compile *c) if (inst->op == QOP_MIN_NOIMM) continue; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { struct qreg src = qir_follow_movs(c, inst->src[i]); if (src.file != QFILE_UNIF || @@ -73,11 +73,8 @@ qir_opt_small_immediates(struct vc4_compile *c) continue; } - if (i == 1 && - (inst->op == QOP_TEX_S || - inst->op == QOP_TEX_T || - inst->op == QOP_TEX_R || - inst->op == QOP_TEX_B)) { + if (qir_is_tex(inst) && + i == qir_get_tex_uniform_src(inst)) { /* No turning the implicit uniform read into * an immediate. */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c index 83ba11b81..6f196e7d1 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -24,10 +24,8 @@ /** * @file vc4_opt_vpm.c * - * This modifies instructions that: - * 1. exclusively consume a value read from the VPM to directly read the VPM if - * other operands allow it. - * 2. generate the value consumed by a VPM write to write directly into the VPM. + * This modifies instructions that exclusively consume a value read from the + * VPM to directly read the VPM if other operands allow it. 
*/ #include "vc4_qir.h" @@ -44,21 +42,11 @@ qir_opt_vpm(struct vc4_compile *c) return false; bool progress = false; - struct qinst *vpm_writes[64] = { 0 }; uint32_t use_count[c->num_temps]; - uint32_t vpm_write_count = 0; memset(&use_count, 0, sizeof(use_count)); qir_for_each_inst_inorder(inst, c) { - switch (inst->dst.file) { - case QFILE_VPM: - vpm_writes[vpm_write_count++] = inst; - break; - default: - break; - } - - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) { uint32_t temp = inst->src[i].index; use_count[temp]++; @@ -81,7 +69,7 @@ qir_opt_vpm(struct vc4_compile *c) qir_is_tex(inst)) continue; - for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + for (int j = 0; j < qir_get_nsrc(inst); j++) { if (inst->src[j].file != QFILE_TEMP || inst->src[j].pack) continue; @@ -106,7 +94,7 @@ qir_opt_vpm(struct vc4_compile *c) } uint32_t temps = 0; - for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) { + for (int k = 0; k < qir_get_nsrc(inst); k++) { if (inst->src[k].file == QFILE_TEMP) temps++; } @@ -127,42 +115,5 @@ qir_opt_vpm(struct vc4_compile *c) } } - for (int i = 0; i < vpm_write_count; i++) { - if (!qir_is_raw_mov(vpm_writes[i]) || - vpm_writes[i]->src[0].file != QFILE_TEMP) { - continue; - } - - uint32_t temp = vpm_writes[i]->src[0].index; - if (use_count[temp] != 1) - continue; - - struct qinst *inst = c->defs[temp]; - if (!inst) - continue; - - if (qir_depends_on_flags(inst) || inst->sf) - continue; - - if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { - continue; - } - - /* Move the generating instruction to the end of the program - * to maintain the order of the VPM writes. - */ - assert(!vpm_writes[i]->sf); - list_del(&inst->link); - list_addtail(&inst->link, &vpm_writes[i]->link); - qir_remove_instruction(c, vpm_writes[i]); - - c->defs[inst->dst.index] = NULL; - inst->dst.file = QFILE_VPM; - inst->dst.index = 0; - - progress = true; - } - return progress; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c index 00e16e3db..59368734d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c @@ -24,7 +24,7 @@ #include <inttypes.h> #include "util/u_format.h" -#include "util/u_hash.h" +#include "util/crc32.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -38,9 +38,6 @@ #include "vc4_qpu.h" #include "vc4_qir.h" #include "mesa/state_tracker/st_glsl_types.h" -#ifdef USE_VC4_SIMULATOR -#include "simpenrose/simpenrose.h" -#endif static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i); @@ -68,6 +65,23 @@ resize_qreg_array(struct vc4_compile *c, (*regs)[i] = c->undef; } +static void +ntq_emit_thrsw(struct vc4_compile *c) +{ + if (!c->fs_threaded) + return; + + /* Always thread switch after each texture operation for now. + * + * We could do better by batching a bunch of texture fetches up and + * then doing one thread switch and collecting all their results + * afterward. 
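With threading enabled, every texture fetch vc4_program.c emits now follows a request/switch/collect shape around the helper above. A hedged wrapper showing the pattern (QIR helper names are from this file; the wrapper itself is hypothetical):

```c
/* Request the texel, switch to the other thread while the TMU works,
 * then collect the result.  ntq_emit_thrsw() is a no-op when the
 * shader is not compiled threaded. */
static struct qreg
fetch_texel_threaded(struct vc4_compile *c, struct qreg s,
                     struct qreg tex_uniform)
{
        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), s, tex_uniform);
        ntq_emit_thrsw(c);
        return qir_TEX_RESULT(c);
}
```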
+ */ + qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef, + c->undef, c->undef)); + c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); +} + static struct qreg indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { @@ -106,8 +120,14 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) qir_uniform_ui(c, (range->dst_offset + range->size - 4))); - qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + indirect_offset, + qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); + c->num_texture_samples++; + + ntq_emit_thrsw(c); + return qir_TEX_RESULT(c); } @@ -140,10 +160,33 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) return qregs; } +/** + * This function is responsible for getting QIR results into the associated + * storage for a NIR instruction. + * + * If it's a NIR SSA def, then we just set the associated hash table entry to + * the new result. + * + * If it's a NIR reg, then we need to update the existing qreg assigned to the + * NIR destination with the incoming value. To do that without introducing + * new MOVs, we require that the incoming qreg either be a uniform, or be + * SSA-defined by the previous QIR instruction in the block and rewritable by + * this function. That lets us sneak ahead and insert the SF flag beforehand + * (knowing that the previous instruction doesn't depend on flags) and rewrite + * its destination to be the NIR reg's destination + */ static void ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, struct qreg result) { + struct qinst *last_inst = NULL; + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + assert(result.file == QFILE_UNIF || + (result.file == QFILE_TEMP && + last_inst && last_inst == c->defs[result.index])); + if (dest->is_ssa) { assert(chan < dest->ssa.num_components); @@ -165,17 +208,34 @@ ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Conditionally move the result to the destination if the - * channel is active. + /* Insert a MOV if the source wasn't an SSA def in the + * previous instruction. + */ + if (result.file == QFILE_UNIF) { + result = qir_MOV(c, result); + last_inst = c->defs[result.index]; + } + + /* We know they're both temps, so just rewrite index. */ + c->defs[last_inst->dst.index] = NULL; + last_inst->dst.index = qregs[chan].index; + + /* If we're in control flow, then make this update of the reg + * conditional on the execution mask. */ if (c->execute.file != QFILE_NULL) { - struct qinst *mov; + last_inst->dst.index = qregs[chan].index; + /* Set the flags to the current exec mask. To insert + * the SF, we temporarily remove our SSA instruction. 
+ */ + list_del(&last_inst->link); qir_SF(c, c->execute); - mov = qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result); - mov->cond_is_exec_mask = true; - } else { - qir_MOV_dest(c, qregs[chan], result); + list_addtail(&last_inst->link, + &c->cur_block->instructions); + + last_inst->cond = QPU_COND_ZS; + last_inst->cond_is_exec_mask = true; } } } @@ -324,24 +384,24 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) addr = qir_MAX(c, addr, qir_uniform_ui(c, 0)); addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4)); - qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); + + ntq_emit_thrsw(c); struct qreg tex = qir_TEX_RESULT(c); c->num_texture_samples++; - struct qreg dest[4]; enum pipe_format format = c->key->tex[unit].format; if (util_format_is_depth_or_stencil(format)) { struct qreg scaled = ntq_scale_depth_texture(c, tex); for (int i = 0; i < 4; i++) - dest[i] = scaled; + ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled)); } else { for (int i = 0; i < 4; i++) - dest[i] = qir_UNPACK_8_F(c, tex, i); + ntq_store_dest(c, &instr->dest, i, + qir_UNPACK_8_F(c, tex, i)); } - - for (int i = 0; i < 4; i++) - ntq_store_dest(c, &instr->dest, i, dest[i]); } static void @@ -375,7 +435,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) lod = ntq_get_src(c, instr->src[i].src, 0); is_txl = true; break; - case nir_tex_src_comparitor: + case nir_tex_src_comparator: compare = ntq_get_src(c, instr->src[i].src, 0); break; default: @@ -383,6 +443,16 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) } } + if (c->stage != QSTAGE_FRAG && !is_txl) { + /* From the GLSL 1.20 spec: + * + * "If it is mip-mapped and running on the vertex shader, + * then the base texture is used." 
+ */ + is_txl = true; + lod = qir_uniform_ui(c, 0); + } + if (c->key->tex[unit].force_first_level) { lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit); is_txl = true; @@ -413,14 +483,20 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unit | (is_txl << 16)); } + struct qinst *tmu; if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - qir_TEX_R(c, r, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER || c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP || c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER || c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { - qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit), - texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), + qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, + unit)); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; } if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) { @@ -431,14 +507,23 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) t = qir_SAT(c, t); } - qir_TEX_T(c, t, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; - if (is_txl || is_txb) - qir_TEX_B(c, lod, texture_u[next_texture_u++]); + if (is_txl || is_txb) { + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; + } - qir_TEX_S(c, s, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s); + tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++]; c->num_texture_samples++; + + ntq_emit_thrsw(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; @@ -514,8 +599,11 @@ ntq_ffract(struct vc4_compile *c, struct qreg src) struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); struct qreg diff = qir_FSUB(c, src, trunc); qir_SF(c, diff); - return qir_SEL(c, QPU_COND_NS, - qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff); + + qir_FADD_dest(c, diff, + diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; + + return qir_MOV(c, diff); } /** @@ -525,15 +613,18 @@ ntq_ffract(struct vc4_compile *c, struct qreg src) static struct qreg ntq_ffloor(struct vc4_compile *c, struct qreg src) { - struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); + struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); /* This will be < 0 if we truncated and the truncation was of a value * that was < 0 in the first place. */ - qir_SF(c, qir_FSUB(c, src, trunc)); + qir_SF(c, qir_FSUB(c, src, result)); + + struct qinst *sub = qir_FSUB_dest(c, result, + result, qir_uniform_f(c, 1.0)); + sub->cond = QPU_COND_NS; - return qir_SEL(c, QPU_COND_NS, - qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc); + return qir_MOV(c, result); } /** @@ -543,15 +634,17 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src) static struct qreg ntq_fceil(struct vc4_compile *c, struct qreg src) { - struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); + struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); /* This will be < 0 if we truncated and the truncation was of a value * that was > 0 in the first place. 
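The fract/floor/ceil lowerings in these hunks all share one trick: truncate with FTOI/ITOF, then conditionally step by 1.0 on the sign flag of the truncation error, now written straight into the result register instead of through a SEL. A plain-C model of the arithmetic:

```c
#include <math.h>

static float
model_ffloor(float x)
{
        float t = truncf(x);

        /* x - t < 0 only if x was negative and truncation rounded up. */
        return (x - t) < 0.0f ? t - 1.0f : t;
}

static float
model_fceil(float x)
{
        float t = truncf(x);

        /* t - x < 0 only if x was positive and truncation rounded down. */
        return (t - x) < 0.0f ? t + 1.0f : t;
}
```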
*/ - qir_SF(c, qir_FSUB(c, trunc, src)); + qir_SF(c, qir_FSUB(c, result, src)); + + qir_FADD_dest(c, result, + result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; - return qir_SEL(c, QPU_COND_NS, - qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc); + return qir_MOV(c, result); } static struct qreg @@ -632,7 +725,7 @@ ntq_fsign(struct vc4_compile *c, struct qreg src) qir_MOV_dest(c, t, qir_uniform_f(c, 0.0)); qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC; qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS; - return t; + return qir_MOV(c, t); } static void @@ -811,7 +904,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr) qir_PACK_8_F(c, result, src, i); } - ntq_store_dest(c, &instr->dest.dest, 0, result); + ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result)); } /** Handles sign-extended bitfield extracts for 16 bits. */ @@ -917,6 +1010,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, break; } + /* Make the temporary for nir_store_dest(). */ + *dest = qir_MOV(c, *dest); + return true; } @@ -943,7 +1039,7 @@ static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr, out: qir_SF(c, src[0]); - return qir_SEL(c, QPU_COND_NS, src[1], src[2]); + return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2])); } static struct qreg @@ -962,9 +1058,9 @@ ntq_fddx(struct vc4_compile *c, struct qreg src) qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0), qir_uniform_ui(c, 1))); - return qir_SEL(c, QPU_COND_ZS, - qir_FSUB(c, from_right, src), - qir_FSUB(c, src, from_left)); + return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, + qir_FSUB(c, from_right, src), + qir_FSUB(c, src, from_left))); } static struct qreg @@ -981,9 +1077,9 @@ ntq_fddy(struct vc4_compile *c, struct qreg src) qir_reg(QFILE_QPU_ELEMENT, 0), qir_uniform_ui(c, 2))); - return qir_SEL(c, QPU_COND_ZS, - qir_FSUB(c, from_top, src), - qir_FSUB(c, src, from_bottom)); + return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, + qir_FSUB(c, from_top, src), + qir_FSUB(c, src, from_bottom))); } static void @@ -1004,7 +1100,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, srcs[i]); + ntq_store_dest(c, &instr->dest.dest, i, + qir_MOV(c, srcs[i])); return; } @@ -1053,12 +1150,12 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) result = qir_FMAX(c, src[0], src[1]); break; - case nir_op_f2i: - case nir_op_f2u: + case nir_op_f2i32: + case nir_op_f2u32: result = qir_FTOI(c, src[0]); break; - case nir_op_i2f: - case nir_op_u2f: + case nir_op_i2f32: + case nir_op_u2f32: result = qir_ITOF(c, src[0]); break; case nir_op_b2f: @@ -1070,9 +1167,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) case nir_op_i2b: case nir_op_f2b: qir_SF(c, src[0]); - result = qir_SEL(c, QPU_COND_ZC, - qir_uniform_ui(c, ~0), - qir_uniform_ui(c, 0)); + result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, + qir_uniform_ui(c, ~0), + qir_uniform_ui(c, 0))); break; case nir_op_iadd: @@ -1136,7 +1233,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) break; case nir_op_fcsel: qir_SF(c, src[0]); - result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]); + result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2])); break; case nir_op_frcp: @@ -1250,7 +1347,7 @@ emit_frag_end(struct vc4_compile *c) } uint32_t discard_cond = QPU_COND_ALWAYS; - if (c->s->info.fs.uses_discard) { + if (c->s->info->fs.uses_discard) { 
qir_SF(c, c->discard); discard_cond = QPU_COND_ZS; } @@ -1414,7 +1511,7 @@ emit_vert_end(struct vc4_compile *c, static void emit_coord_end(struct vc4_compile *c) { - struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]); + struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); emit_stub_vpm_read(c); @@ -1448,6 +1545,10 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); + NIR_PASS(progress, s, nir_opt_loop_unroll, + nir_var_shader_in | + nir_var_shader_out | + nir_var_local); } while (progress); } @@ -1605,6 +1706,47 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) } static void +ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr) +{ + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset->u32[0] == 0); + + /* Reads of the per-sample color need to be done in + * order. + */ + int sample_index = (nir_intrinsic_base(instr) - + VC4_NIR_TLB_COLOR_READ_INPUT); + for (int i = 0; i <= sample_index; i++) { + if (c->color_reads[i].file == QFILE_NULL) { + c->color_reads[i] = + qir_TLB_COLOR_READ(c); + } + } + ntq_store_dest(c, &instr->dest, 0, + qir_MOV(c, c->color_reads[sample_index])); +} + +static void +ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) +{ + assert(instr->num_components == 1); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "vc4 doesn't support indirect inputs"); + + if (c->stage == QSTAGE_FRAG && + nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { + ntq_emit_color_read(c, instr); + return; + } + + uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + int comp = nir_intrinsic_component(instr); + ntq_store_dest(c, &instr->dest, 0, + qir_MOV(c, c->inputs[offset * 4 + comp])); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { nir_const_value *const_offset; @@ -1681,31 +1823,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_input: - assert(instr->num_components == 1); - const_offset = nir_src_as_const_value(instr->src[0]); - assert(const_offset && "vc4 doesn't support indirect inputs"); - if (c->stage == QSTAGE_FRAG && - nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { - assert(const_offset->u32[0] == 0); - /* Reads of the per-sample color need to be done in - * order. 
- */ - int sample_index = (nir_intrinsic_base(instr) - - VC4_NIR_TLB_COLOR_READ_INPUT); - for (int i = 0; i <= sample_index; i++) { - if (c->color_reads[i].file == QFILE_NULL) { - c->color_reads[i] = - qir_TLB_COLOR_READ(c); - } - } - ntq_store_dest(c, &instr->dest, 0, - c->color_reads[sample_index]); - } else { - offset = nir_intrinsic_base(instr) + const_offset->u32[0]; - int comp = nir_intrinsic_component(instr); - ntq_store_dest(c, &instr->dest, 0, - c->inputs[offset * 4 + comp]); - } + ntq_emit_load_input(c, instr); break; case nir_intrinsic_store_output: @@ -1855,11 +1973,12 @@ ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) qir_link_blocks(c->cur_block, after_block); qir_set_emit_block(c, after_block); - if (was_top_level) + if (was_top_level) { c->execute = c->undef; - else + c->last_top_block = c->cur_block; + } else { ntq_activate_execute_for_block(c); - + } } static void @@ -1983,10 +2102,12 @@ ntq_emit_loop(struct vc4_compile *c, nir_loop *loop) qir_link_blocks(c->cur_block, c->loop_break_block); qir_set_emit_block(c, c->loop_break_block); - if (was_top_level) + if (was_top_level) { c->execute = c->undef; - else + c->last_top_block = c->cur_block; + } else { ntq_activate_execute_for_block(c); + } c->loop_break_block = save_loop_break_block; c->loop_cont_block = save_loop_cont_block; @@ -2037,7 +2158,7 @@ ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl) static void nir_to_qir(struct vc4_compile *c) { - if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard) + if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard) c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); ntq_setup_inputs(c); @@ -2063,11 +2184,13 @@ static const nir_shader_compiler_options nir_options = { .lower_fsqrt = true, .lower_negate = true, .native_integers = true, + .max_unroll_iterations = 32, }; const void * vc4_screen_get_compiler_options(struct pipe_screen *pscreen, - enum pipe_shader_ir ir, unsigned shader) + enum pipe_shader_ir ir, + enum pipe_shader_type shader) { return &nir_options; } @@ -2089,7 +2212,7 @@ count_nir_instrs(nir_shader *nir) static struct vc4_compile * vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, - struct vc4_key *key) + struct vc4_key *key, bool fs_threaded) { struct vc4_compile *c = qir_compile_init(); @@ -2099,6 +2222,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->program_id = key->shader_state->program_id; c->variant_id = p_atomic_inc_return(&key->shader_state->compiled_variant_count); + c->fs_threaded = fs_threaded; c->key = key; switch (stage) { @@ -2216,6 +2340,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, switch (stage) { case QSTAGE_FRAG: + /* FS threading requires that the thread execute + * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating + * (with no other THRSW afterwards, obviously). If we didn't + * fetch a texture at a top level block, this wouldn't be + * true. 
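
[Editor's note: a hypothetical summary, with invented struct/function names.] The comment this hunk adds states one half of the threaded-FS contract; the kernel validator hunks earlier in this diff enforce the other half. Taken together, the conditions reduce to a predicate like this:

#include <stdbool.h>

/* Hypothetical summary of the threaded-FS contract: the last
 * thread switch must be signaled exactly once, unconditionally
 * (i.e. from a top-level block), and the shader must stay in the
 * bottom half of the physical register file.
 */
struct fs_thread_state {
        bool last_thrsw_at_top_level;
        bool uses_upper_reg_half;
};

static bool fs_may_stay_threaded(const struct fs_thread_state *s)
{
        return s->last_thrsw_at_top_level && !s->uses_upper_reg_half;
}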
+ */ + if (c->fs_threaded && !c->last_thrsw_at_top_level) { + c->failed = true; + return c; + } + emit_frag_end(c); break; case QSTAGE_VERT: @@ -2300,7 +2435,7 @@ vc4_shader_state_create(struct pipe_context *pctx, } NIR_PASS_V(s, nir_opt_global_to_local); - NIR_PASS_V(s, nir_convert_to_ssa); + NIR_PASS_V(s, nir_lower_regs_to_ssa); NIR_PASS_V(s, nir_normalize_cubemap_coords); NIR_PASS_V(s, nir_lower_load_const_to_scalar); @@ -2360,7 +2495,7 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, memset(input_live, 0, sizeof(input_live)); qir_for_each_inst_inorder(inst, c) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VARY) input_live[inst->src[i].index] = true; } @@ -2416,12 +2551,16 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, { struct hash_table *ht; uint32_t key_size; + bool try_threading; + if (stage == QSTAGE_FRAG) { ht = vc4->fs_cache; key_size = sizeof(struct vc4_fs_key); + try_threading = vc4->screen->has_threaded_fs; } else { ht = vc4->vs_cache; key_size = sizeof(struct vc4_vs_key); + try_threading = false; } struct vc4_compiled_shader *shader; @@ -2429,7 +2568,13 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, if (entry) return entry->data; - struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key); + struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading); + /* If the FS failed to compile threaded, fall back to single threaded. */ + if (try_threading && c->failed) { + qir_compile_destroy(c); + c = vc4_shader_ntq(vc4, stage, key, false); + } + shader = rzalloc(NULL, struct vc4_compiled_shader); shader->program_id = vc4->next_compiled_program_id++; @@ -2438,7 +2583,7 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, /* Note: the temporary clone in c->s has been freed. */ nir_shader *orig_shader = key->shader_state->base.ir.nir; - if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) + if (orig_shader->info->outputs_written & (1 << FRAG_RESULT_DEPTH)) shader->disable_early_z = true; } else { shader->num_inputs = c->num_inputs; @@ -2463,6 +2608,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, sizeof(uint64_t)); } + shader->fs_threaded = c->fs_threaded; + /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. 
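
[Editor's note: speculative sketch of code the diff elides; all names below are invented.] The comment closing this hunk describes copying the compiler's UBO range state while dropping arrays no indirect load ever touched. The elided body is not shown, but the compaction it describes amounts to a filter of roughly this shape:

#include <stdbool.h>
#include <stdint.h>

struct ubo_range {
        uint32_t dst_offset;
        uint32_t src_offset;
        uint32_t size;
        bool used;        /* referenced by an indirect load? */
};

/* Hypothetical compaction: keep only the ranges an indirect load
 * referenced, packing them densely into the compiled shader's copy.
 */
static uint32_t
compact_ubo_ranges(struct ubo_range *dst, const struct ubo_range *src,
                   uint32_t count)
{
        uint32_t n = 0;
        for (uint32_t i = 0; i < count; i++) {
                if (src[i].used)
                        dst[n++] = src[i];
        }
        return n;
}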
* @@ -2496,10 +2643,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } + if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + 1 + shader->fs_threaded); + } + qir_compile_destroy(c); struct vc4_key *dup_key; - dup_key = ralloc_size(shader, key_size); + dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */ memcpy(dup_key, key, key_size); _mesa_hash_table_insert(ht, dup_key, shader); @@ -2573,8 +2727,7 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) } if (job->msaa) { key->msaa = vc4->rasterizer->base.multisample; - key->sample_coverage = (vc4->rasterizer->base.multisample && - vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); + key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; key->sample_alpha_to_one = vc4->blend->alpha_to_one; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c index 4b94fcfb9..c829e7f93 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c @@ -76,13 +76,10 @@ static const struct qir_op_info qir_op_info[] = { [QOP_FRAG_Z] = { "frag_z", 1, 0 }, [QOP_FRAG_W] = { "frag_w", 1, 0 }, - [QOP_TEX_S] = { "tex_s", 0, 2, true }, - [QOP_TEX_T] = { "tex_t", 0, 2, true }, - [QOP_TEX_R] = { "tex_r", 0, 2, true }, - [QOP_TEX_B] = { "tex_b", 0, 2, true }, - [QOP_TEX_DIRECT] = { "tex_direct", 0, 2, true }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, + [QOP_THRSW] = { "thrsw", 0, 0, true }, + [QOP_LOAD_IMM] = { "load_imm", 0, 1 }, [QOP_LOAD_IMM_U2] = { "load_imm_u2", 0, 1 }, [QOP_LOAD_IMM_I2] = { "load_imm_i2", 0, 1 }, @@ -103,12 +100,35 @@ qir_get_op_name(enum qop qop) } int -qir_get_op_nsrc(enum qop qop) +qir_get_non_sideband_nsrc(struct qinst *inst) { - if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name) - return qir_op_info[qop].nsrc; - else - abort(); + assert(qir_op_info[inst->op].name); + return qir_op_info[inst->op].nsrc; +} + +int +qir_get_nsrc(struct qinst *inst) +{ + assert(qir_op_info[inst->op].name); + + int nsrc = qir_get_non_sideband_nsrc(inst); + + /* Normal (non-direct) texture coordinate writes also implicitly load + * a uniform for the texture parameters. + */ + if (qir_is_tex(inst) && inst->dst.file != QFILE_TEX_S_DIRECT) + nsrc++; + + return nsrc; +} + +/* The sideband uniform for textures gets stored after the normal ALU + * arguments. + */ +int +qir_get_tex_uniform_src(struct qinst *inst) +{ + return qir_get_nsrc(inst) - 1; } /** @@ -123,6 +143,11 @@ qir_has_side_effects(struct vc4_compile *c, struct qinst *inst) case QFILE_TLB_COLOR_WRITE: case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: return true; default: break; @@ -139,7 +164,7 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst) * point/line coordinates reads, because they're generated by * fixed-function hardware. 
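
[Editor's note: a usage sketch that presumes the driver's own declarations from this diff, plus <assert.h>.] The new qir_get_nsrc()/qir_get_tex_uniform_src() pair introduced above encodes a convention several later hunks rely on: a non-direct texture setup write carries its texture-parameter uniform in one extra, trailing source slot. The assert mirrors what the vc4_qir_validate.c hunk further down checks:

/* The implicit sideband uniform of a tex setup write is always the
 * last source, after the ALU operands.
 */
if (qir_is_tex(inst) && qir_has_implicit_tex_uniform(inst)) {
        struct qreg unif = inst->src[qir_get_tex_uniform_src(inst)];
        assert(unif.file == QFILE_UNIF);
}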
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VARY && c->input_slots[inst->src[i].index].slot == 0xff) { return true; @@ -156,6 +181,17 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst) } bool +qir_has_uniform_read(struct qinst *inst) +{ + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_UNIF) + return true; + } + + return false; +} + +bool qir_is_mul(struct qinst *inst) { switch (inst->op) { @@ -207,7 +243,30 @@ qir_is_raw_mov(struct qinst *inst) bool qir_is_tex(struct qinst *inst) { - return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT; + switch (inst->dst.file) { + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + return true; + default: + return false; + } +} + +bool +qir_has_implicit_tex_uniform(struct qinst *inst) +{ + switch (inst->dst.file) { + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + return true; + default: + return false; + } } bool @@ -299,6 +358,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) [QFILE_FRAG_Y] = "frag_y", [QFILE_FRAG_REV_FLAG] = "frag_rev_flag", [QFILE_QPU_ELEMENT] = "elem", + [QFILE_TEX_S_DIRECT] = "tex_s_direct", + [QFILE_TEX_S] = "tex_s", + [QFILE_TEX_T] = "tex_t", + [QFILE_TEX_R] = "tex_r", + [QFILE_TEX_B] = "tex_b", }; switch (reg.file) { @@ -331,6 +395,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: fprintf(stderr, "%s", files[reg.file]); break; @@ -371,7 +440,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst) } } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { fprintf(stderr, ", "); qir_print_reg(c, inst->src[i], false); vc4_qpu_disasm_unpack(stderr, inst->src[i].pack); @@ -382,6 +451,7 @@ void qir_dump(struct vc4_compile *c) { int ip = 0; + int pressure = 0; qir_for_each_block(block, c) { fprintf(stderr, "BLOCK %d:\n", block->index); @@ -389,6 +459,8 @@ qir_dump(struct vc4_compile *c) if (c->temp_start) { bool first = true; + fprintf(stderr, "%3d ", pressure); + for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] != ip) continue; @@ -399,6 +471,7 @@ qir_dump(struct vc4_compile *c) fprintf(stderr, ", "); } fprintf(stderr, "S%4d", i); + pressure++; } if (first) @@ -420,6 +493,7 @@ qir_dump(struct vc4_compile *c) fprintf(stderr, ", "); } fprintf(stderr, "E%4d", i); + pressure--; } if (first) @@ -471,7 +545,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) inst->op = op; inst->dst = dst; - inst->src = calloc(2, sizeof(inst->src[0])); inst->src[0] = src0; inst->src[1] = src1; inst->cond = QPU_COND_ALWAYS; @@ -479,26 +552,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) return inst; } -struct qinst * -qir_inst4(enum qop op, struct qreg dst, - struct qreg a, - struct qreg b, - struct qreg c, - struct qreg d) -{ - struct qinst *inst = CALLOC_STRUCT(qinst); - - inst->op = op; - inst->dst = dst; - inst->src = calloc(4, sizeof(*inst->src)); - inst->src[0] = a; - inst->src[1] = b; - inst->src[2] = c; - inst->src[3] = d; - - return inst; -} - static void qir_emit(struct vc4_compile *c, struct qinst *inst) { @@ -593,6 +646,7 @@ qir_compile_init(void) 
list_inithead(&c->blocks); qir_set_emit_block(c, qir_new_block(c)); + c->last_top_block = c->cur_block; c->output_position_index = -1; c->output_color_index = -1; @@ -612,7 +666,6 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst) c->defs[qinst->dst.index] = NULL; list_del(&qinst->link); - free(qinst->src); free(qinst); } @@ -744,6 +797,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_dead_code); OPTPASS(qir_opt_small_immediates); OPTPASS(qir_opt_vpm); + OPTPASS(qir_opt_coalesce_ff_writes); if (!progress) break; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h index b3cac6bf2..6469e51b0 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h @@ -55,6 +55,18 @@ enum qfile { QFILE_TLB_Z_WRITE, QFILE_TLB_STENCIL_SETUP, + /* If tex_s is written on its own without preceding t/r/b setup, it's + * a direct memory access using the input value, without the sideband + * uniform load. We represent these in QIR as a separate write + * destination so we can tell if the sideband uniform is present. + */ + QFILE_TEX_S_DIRECT, + + QFILE_TEX_S, + QFILE_TEX_T, + QFILE_TEX_R, + QFILE_TEX_B, + /* Payload registers that aren't in the physical register file, so we * can just use the corresponding qpu_reg at qpu_emit time. */ @@ -133,30 +145,22 @@ enum qop { QOP_FRAG_Z, QOP_FRAG_W, - /** Texture x coordinate parameter write */ - QOP_TEX_S, - /** Texture y coordinate parameter write */ - QOP_TEX_T, - /** Texture border color parameter or cube map z coordinate write */ - QOP_TEX_R, - /** Texture LOD bias parameter write */ - QOP_TEX_B, - - /** - * Texture-unit 4-byte read with address provided direct in S - * cooordinate. - * - * The first operand is the offset from the start of the UBO, and the - * second is the uniform that has the UBO's base pointer. - */ - QOP_TEX_DIRECT, - /** * Signal of texture read being necessary and then reading r4 into * the destination */ QOP_TEX_RESULT, + /** + * Insert the signal for switching threads in a threaded fragment + * shader. No value can be live in an accumulator across a thrsw. + * + * At the QPU level, this will have several delay slots before the + * switch happens. Those slots are the responsibility of the + * scheduler. + */ + QOP_THRSW, + /* 32-bit immediate loaded to each SIMD channel */ QOP_LOAD_IMM, @@ -194,7 +198,7 @@ struct qinst { enum qop op; struct qreg dst; - struct qreg *src; + struct qreg src[3]; bool sf; bool cond_is_exec_mask; uint8_t cond; @@ -502,9 +506,13 @@ struct vc4_compile { struct qblock *cur_block; struct qblock *loop_cont_block; struct qblock *loop_break_block; + struct qblock *last_top_block; struct list_head qpu_inst_list; + /* Pre-QPU-scheduled instruction containing the last THRSW */ + uint64_t *last_thrsw; + uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; @@ -524,6 +532,15 @@ struct vc4_compile { uint32_t program_id; uint32_t variant_id; + + /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH + * is used to hide texturing latency at the cost of limiting ourselves + * to the bottom half of physical reg space. 
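
[Editor's note: forward illustration, quoting the vc4_register_allocate.c hunk that appears later in this diff.] The QOP_THRSW comment above says no value can be live in an accumulator across a thread switch; concretely, that rule lands on the register allocator, which strips the accumulator classes from any temp whose live range crosses the switch:

/* Temps live across a THRSW lose the r0-r3 and r4 classes and must
 * sit in the physical A/B files, since accumulators and flags do
 * not survive the switch.
 */
for (int i = 0; i < c->num_temps; i++) {
        if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                class_bits[i] &= ~(CLASS_BIT_R0_R3 | CLASS_BIT_R4);
}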
+ */ + bool fs_threaded; + + bool last_thrsw_at_top_level; + bool failed; }; @@ -543,11 +560,6 @@ struct qblock *qir_entry_block(struct vc4_compile *c); struct qblock *qir_exit_block(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1); -struct qinst *qir_inst4(enum qop op, struct qreg dst, - struct qreg a, - struct qreg b, - struct qreg c, - struct qreg d); void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst); struct qreg qir_uniform(struct vc4_compile *c, enum quniform_contents contents, @@ -561,13 +573,17 @@ struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst); struct qreg qir_get_temp(struct vc4_compile *c); void qir_calculate_live_intervals(struct vc4_compile *c); -int qir_get_op_nsrc(enum qop qop); +int qir_get_nsrc(struct qinst *inst); +int qir_get_non_sideband_nsrc(struct qinst *inst); +int qir_get_tex_uniform_src(struct qinst *inst); bool qir_reg_equals(struct qreg a, struct qreg b); bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst); bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst); +bool qir_has_uniform_read(struct qinst *inst); bool qir_is_mul(struct qinst *inst); bool qir_is_raw_mov(struct qinst *inst); bool qir_is_tex(struct qinst *inst); +bool qir_has_implicit_tex_uniform(struct qinst *inst); bool qir_is_float_input(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); @@ -582,6 +598,7 @@ void qir_validate(struct vc4_compile *c); void qir_optimize(struct vc4_compile *c); bool qir_opt_algebraic(struct vc4_compile *c); +bool qir_opt_coalesce_ff_writes(struct vc4_compile *c); bool qir_opt_constant_folding(struct vc4_compile *c); bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); @@ -722,11 +739,6 @@ QIR_ALU1(RSQ) QIR_ALU1(EXP2) QIR_ALU1(LOG2) QIR_ALU1(VARY_ADD_C) -QIR_NODST_2(TEX_S) -QIR_NODST_2(TEX_T) -QIR_NODST_2(TEX_R) -QIR_NODST_2(TEX_B) -QIR_NODST_2(TEX_DIRECT) QIR_PAYLOAD(FRAG_Z) QIR_PAYLOAD(FRAG_W) QIR_ALU0(TEX_RESULT) @@ -737,10 +749,8 @@ static inline struct qreg qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1) { struct qreg t = qir_get_temp(c); - struct qinst *a = qir_MOV_dest(c, t, src0); - struct qinst *b = qir_MOV_dest(c, t, src1); - a->cond = cond; - b->cond = qpu_cond_complement(cond); + qir_MOV_dest(c, t, src1); + qir_MOV_dest(c, t, src0)->cond = cond; return t; } @@ -881,6 +891,6 @@ qir_BRANCH(struct vc4_compile *c, uint8_t cond) #define qir_for_each_inst_inorder(inst, c) \ qir_for_each_block(_block, c) \ - qir_for_each_inst(inst, _block) + qir_for_each_inst_safe(inst, _block) #endif /* VC4_QIR_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c index 3fd6358e3..443682a46 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c @@ -36,24 +36,10 @@ #include "util/u_math.h" static bool -inst_reads_a_uniform(struct qinst *inst) -{ - if (qir_is_tex(inst)) - return true; - - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_UNIF) - return true; - } - - return false; -} - -static bool block_reads_any_uniform(struct qblock *block) { qir_for_each_inst(inst, block) { - if (inst_reads_a_uniform(inst)) + if (qir_has_uniform_read(inst)) return true; } @@ 
-94,7 +80,7 @@ qir_emit_uniform_stream_resets(struct vc4_compile *c) } qir_for_each_inst(inst, block) { - if (inst_reads_a_uniform(inst)) + if (qir_has_uniform_read(inst)) uniform_count++; } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c index beefb0d7f..7108b3ee9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c @@ -205,7 +205,7 @@ qir_setup_def_use(struct vc4_compile *c) _mesa_hash_table_clear(partial_update_ht, NULL); qir_for_each_inst(inst, block) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) + for (int i = 0; i < qir_get_nsrc(inst); i++) qir_setup_use(c, block, ip, inst->src[i]); qir_setup_def(c, block, ip, partial_update_ht, inst); @@ -301,8 +301,13 @@ qir_calculate_live_intervals(struct vc4_compile *c) { int bitset_words = BITSET_WORDS(c->num_temps); - c->temp_start = reralloc(c, c->temp_start, int, c->num_temps); - c->temp_end = reralloc(c, c->temp_end, int, c->num_temps); + /* If we called this function more than once, then we should be + * freeing the previous arrays. + */ + assert(!c->temp_start); + + c->temp_start = rzalloc_array(c, int, c->num_temps); + c->temp_end = rzalloc_array(c, int, c->num_temps); for (int i = 0; i < c->num_temps; i++) { c->temp_start[i] = MAX_INSTRUCTION; @@ -310,10 +315,10 @@ qir_calculate_live_intervals(struct vc4_compile *c) } qir_for_each_block(block, c) { - block->def = reralloc(c, block->def, BITSET_WORD, bitset_words); - block->use = reralloc(c, block->use, BITSET_WORD, bitset_words); - block->live_in = reralloc(c, block->live_in, BITSET_WORD, bitset_words); - block->live_out = reralloc(c, block->live_out, BITSET_WORD, bitset_words); + block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->use = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); } qir_setup_def_use(c); @@ -322,4 +327,27 @@ qir_calculate_live_intervals(struct vc4_compile *c) ; qir_compute_start_end(c, c->num_temps); + + if (vc4_debug & VC4_DEBUG_SHADERDB) { + int last_ip = 0; + for (int i = 0; i < c->num_temps; i++) + last_ip = MAX2(last_ip, c->temp_end[i]); + + int reg_pressure = 0; + int max_reg_pressure = 0; + for (int i = 0; i < last_ip; i++) { + for (int j = 0; j < c->num_temps; j++) { + if (c->temp_start[j] == i) + reg_pressure++; + if (c->temp_end[j] == i) + reg_pressure--; + } + max_reg_pressure = MAX2(max_reg_pressure, reg_pressure); + } + + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d max temps\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + max_reg_pressure); + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 8ec6c7973..9ecfe6521 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -77,7 +77,7 @@ is_lowerable_uniform(struct qinst *inst, int i) if (inst->src[i].file != QFILE_UNIF) return false; if (qir_is_tex(inst)) - return i != 1; + return i != qir_get_tex_uniform_src(inst); return true; } @@ -89,7 +89,7 @@ qir_get_instruction_uniform_count(struct qinst *inst) { uint32_t count = 0; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_UNIF) continue; @@ -119,7 +119,7 @@ 
qir_lower_uniforms(struct vc4_compile *c) * ht. */ qir_for_each_inst_inorder(inst, c) { - uint32_t nsrc = qir_get_op_nsrc(inst->op); + uint32_t nsrc = qir_get_nsrc(inst); if (qir_get_instruction_uniform_count(inst) <= 1) continue; @@ -155,7 +155,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct qinst *mov = NULL; qir_for_each_inst(inst, block) { - uint32_t nsrc = qir_get_op_nsrc(inst->op); + uint32_t nsrc = qir_get_nsrc(inst); uint32_t count = qir_get_instruction_uniform_count(inst); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c index 69bd0dd62..5118caf31 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -187,7 +187,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) * ignore uniforms accesses, because qir_reorder_uniforms() happens * after this. */ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { switch (inst->src[i].file) { case QFILE_TEMP: add_dep(dir, @@ -212,23 +212,35 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) add_dep(dir, state->last_vary_read, n); break; - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - case QOP_TEX_DIRECT: - /* Texturing setup gets scheduled in order, because - * the uniforms referenced by them have to land in a - * specific order. - */ - add_write_dep(dir, &state->last_tex_coord, n); - break; - case QOP_TEX_RESULT: /* Results have to be fetched in order. */ add_write_dep(dir, &state->last_tex_result, n); break; + case QOP_THRSW: + /* After a new THRSW, one must collect all texture samples + * queued since the previous THRSW/program start. For now, we + * have one THRSW in between each texture setup and its + * results collection as our input, and we just make sure that + * that ordering is maintained. + */ + add_write_dep(dir, &state->last_tex_coord, n); + add_write_dep(dir, &state->last_tex_result, n); + + /* accumulators and flags are lost across thread switches. */ + add_write_dep(dir, &state->last_sf, n); + + /* Setup, like the varyings, will need to be drained before we + * thread switch. + */ + add_write_dep(dir, &state->last_vary_read, n); + + /* The TLB-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(dir, &state->last_tlb, n); + break; + case QOP_TLB_COLOR_READ: case QOP_MS_MASK: add_write_dep(dir, &state->last_tlb, n); @@ -254,6 +266,18 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) add_write_dep(dir, &state->last_tlb, n); break; + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + /* Texturing setup gets scheduled in order, because + * the uniforms referenced by them have to land in a + * specific order. 
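
[Editor's note: a toy model with invented names.] The in-order constraint in the comment above comes from how uniforms are delivered: each uniform-reading instruction consumes the next word from a single stream, so instruction order *is* parameter order. Reorder two texture setups and each silently reads the other's parameters:

#include <stdint.h>

/* Toy model: uniforms are a cursor into a flat stream, advanced
 * once per uniform-reading instruction.
 */
struct uniform_stream {
        const uint32_t *data;
        uint32_t next;
};

static uint32_t consume_uniform(struct uniform_stream *s)
{
        return s->data[s->next++];   /* order == emission order */
}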
+ */ + add_write_dep(dir, &state->last_tex_coord, n); + break; + default: break; } @@ -281,7 +305,7 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, calculate_deps(&state, n); - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { switch (inst->src[i].file) { case QFILE_UNIF: add_dep(state.dir, state.last_uniforms_reset, n); @@ -291,26 +315,59 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, } } - switch (inst->op) { - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - case QOP_TEX_DIRECT: - /* If the texture coordinate fifo is full, - * block this on the last QOP_TEX_RESULT. + switch (inst->dst.file) { + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + /* From the VC4 spec: + * + * "The TFREQ input FIFO holds two full lots of s, + * t, r, b data, plus associated setup data, per + * QPU, that is, there are eight data slots. For + * each texture request, slots are only consumed + * for the components of s, t, r, and b actually + * written. Thus the FIFO can hold four requests + * of just (s, t) data, or eight requests of just + * s data (for direct addressed data lookups). + * + * Note that there is one FIFO per QPU, and the + * FIFO has no concept of threads - that is, + * multi-threaded shaders must be careful to use + * only 1/2 the FIFO depth before reading + * back. Multi-threaded programs must also + * therefore always thread switch on texture + * fetch as the other thread may have data + * waiting in the FIFO." + * + * If the texture coordinate fifo is full, block this + * on the last QOP_TEX_RESULT. */ - if (state.tfreq_count == 8) { + if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) { block_until_tex_result(&state, n); } - /* If the texture result fifo is full, block - * adding any more to it until the last - * QOP_TEX_RESULT. + /* From the VC4 spec: + * + * "Since the maximum number of texture requests + * in the input (TFREQ) FIFO is four lots of (s, + * t) data, the output (TFRCV) FIFO is sized to + * holds four lots of max-size color data per + * QPU. For non-float color, reads are packed + * RGBA8888 data (one read per pixel). For 16-bit + * float color, two reads are necessary per + * pixel, with reads packed as RG1616 then + * BA1616. So per QPU there are eight color slots + * in the TFRCV FIFO." + * + * If the texture result fifo is full, block adding + * any more to it until the last QOP_TEX_RESULT. */ - if (inst->op == QOP_TEX_S || - inst->op == QOP_TEX_DIRECT) { - if (state.tfrcv_count == 4) + if (inst->dst.file == QFILE_TEX_S || + inst->dst.file == QFILE_TEX_S_DIRECT) { + if (state.tfrcv_count == + (c->fs_threaded ? 2 : 4)) block_until_tex_result(&state, n); state.tfrcv_count++; } @@ -319,6 +376,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, state.tfreq_count++; break; + default: + break; + } + + switch (inst->op) { case QOP_TEX_RESULT: /* Results have to be fetched after the * coordinate setup. 
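
[Editor's note: a sketch restating constants that appear in the checks above.] The two spec quotations boil down to a budget rule: a threaded fragment shader owns only half of each per-QPU texture FIFO, because the other thread uses the other half:

#include <stdbool.h>

/* Requests the TFREQ FIFO may hold before a result must be
 * collected, and max-size results the TFRCV FIFO may hold.
 */
static int tfreq_budget(bool fs_threaded)
{
        return fs_threaded ? 4 : 8;
}

static int tfrcv_budget(bool fs_threaded)
{
        return fs_threaded ? 2 : 4;
}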
Note that we're assuming @@ -341,7 +403,6 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, break; default: - assert(!qir_is_tex(inst)); break; } } @@ -372,11 +433,21 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) state->temp_writes[inst->dst.index] == 1) cost--; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP && - !BITSET_TEST(state->temp_live, inst->src[i].index)) { - cost++; + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP || + BITSET_TEST(state->temp_live, inst->src[i].index)) { + continue; } + + bool already_counted = false; + for (int j = 0; j < i; j++) { + if (inst->src[i].file == inst->src[j].file && + inst->src[i].index == inst->src[j].index) { + already_counted = true; + } + } + if (!already_counted) + cost++; } return cost; @@ -503,11 +574,33 @@ dump_state(struct vc4_compile *c, struct schedule_state *state) static uint32_t latency_between(struct schedule_node *before, struct schedule_node *after) { - if ((before->inst->op == QOP_TEX_S || - before->inst->op == QOP_TEX_DIRECT) && + if ((before->inst->dst.file == QFILE_TEX_S || + before->inst->dst.file == QFILE_TEX_S_DIRECT) && after->inst->op == QOP_TEX_RESULT) return 100; + switch (before->inst->op) { + case QOP_RCP: + case QOP_RSQ: + case QOP_EXP2: + case QOP_LOG2: + for (int i = 0; i < qir_get_nsrc(after->inst); i++) { + if (after->inst->src[i].file == + before->inst->dst.file && + after->inst->src[i].index == + before->inst->dst.index) { + /* There are two QPU delay slots before we can + * read a math result, which could be up to 4 + * QIR instructions if they packed well. + */ + return 4; + } + } + break; + default: + break; + } + return 1; } @@ -532,7 +625,7 @@ compute_delay(struct schedule_node *n) compute_delay(n->children[i]); n->delay = MAX2(n->delay, n->children[i]->delay + - latency_between(n, n->children[i])); + latency_between(n->children[i], n)); } } } @@ -583,15 +676,15 @@ schedule_instructions(struct vc4_compile *c, child->unblocked_time = MAX2(child->unblocked_time, state->time + - latency_between(chosen, - child)); + latency_between(child, + chosen)); child->parent_count--; if (child->parent_count == 0) list_add(&child->link, &state->worklist); } /* Update our tracking of register pressure. 
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) BITSET_SET(state->temp_live, inst->src[i].index); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c index e7cfe5ad2..302eb4826 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c @@ -84,9 +84,28 @@ void qir_validate(struct vc4_compile *c) case QFILE_LOAD_IMM: fail_instr(c, inst, "Bad dest file"); break; + + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + if (inst->src[qir_get_tex_uniform_src(inst)].file != + QFILE_UNIF) { + fail_instr(c, inst, + "tex op missing implicit uniform"); + } + break; + + case QFILE_TEX_S_DIRECT: + if (inst->op != QOP_ADD) { + fail_instr(c, inst, + "kernel validation requires that " + "direct texture lookups use an ADD"); + } + break; } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { struct qreg src = inst->src[i]; switch (src.file) { @@ -119,6 +138,11 @@ void qir_validate(struct vc4_compile *c) case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: fail_instr(c, inst, "Bad src file"); break; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c index 67850a811..380b9f43c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c @@ -323,6 +323,7 @@ qpu_waddr_ignores_ws(uint32_t waddr) case QPU_W_ACC1: case QPU_W_ACC2: case QPU_W_ACC3: + case QPU_W_NOP: case QPU_W_TLB_Z: case QPU_W_TLB_COLOR_MS: case QPU_W_TLB_COLOR_ALL: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 529472272..9ea26455b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -86,11 +86,11 @@ static const char *qpu_sig[] = { static const char *qpu_pack_mul[] = { [QPU_PACK_MUL_NOP] = "", - [QPU_PACK_MUL_8888] = "8888", - [QPU_PACK_MUL_8A] = "8a", - [QPU_PACK_MUL_8B] = "8b", - [QPU_PACK_MUL_8C] = "8c", - [QPU_PACK_MUL_8D] = "8d", + [QPU_PACK_MUL_8888] = ".8888", + [QPU_PACK_MUL_8A] = ".8a", + [QPU_PACK_MUL_8B] = ".8b", + [QPU_PACK_MUL_8C] = ".8c", + [QPU_PACK_MUL_8D] = ".8d", }; /* The QPU unpack for A and R4 files can be described the same, it's just that @@ -264,7 +264,7 @@ get_special_write_desc(int reg, bool is_a) void vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack) { - fprintf(out, ".%s", DESC(qpu_pack_mul, pack)); + fprintf(out, "%s", DESC(qpu_pack_mul, pack)); } void diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c index 2ee52a497..aaa3a0412 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -157,7 +157,7 @@ setup_for_vpm_read(struct vc4_compile *c, struct qblock *block) * address. * * In that case, we need to move one to a temporary that can be used in the - * instruction, instead. We reserve ra31/rb31 for this purpose. + * instruction, instead. We reserve ra14/rb14 for this purpose. */ static void fixup_raddr_conflict(struct qblock *block, @@ -183,9 +183,9 @@ fixup_raddr_conflict(struct qblock *block, * in case of unpacks. 
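
[Editor's note: a hedged scalar sketch, not driver code.] The conflict fixup that follows copies a float source with FMAX rather than a raw MOV so that any regfile-A unpack still applies on the way through the reserved ra14/rb14 register: FMAX(x, x) is an identity on the value while running down the add pipe where the unpack hardware lives. In scalar C terms:

#include <math.h>

/* fmaxf(x, x) == x, so the copy preserves the data while taking
 * the pipe that can apply the unpack.
 */
static float copy_preserving_unpack(float x)
{
        return fmaxf(x, x);
}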
*/ if (qir_is_float_input(inst)) - queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0)); + queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0)); else - queue(block, qpu_a_MOV(qpu_rb(31), *src0)); + queue(block, qpu_a_MOV(qpu_rb(14), *src0)); /* If we had an unpack on this A-file source, we need to put * it into this MOV, not into the later move from regfile B. @@ -194,10 +194,10 @@ fixup_raddr_conflict(struct qblock *block, *last_inst(block) |= *unpack; *unpack = 0; } - *src0 = qpu_rb(31); + *src0 = qpu_rb(14); } else { - queue(block, qpu_a_MOV(qpu_ra(31), *src0)); - *src0 = qpu_ra(31); + queue(block, qpu_a_MOV(qpu_ra(14), *src0)); + *src0 = qpu_ra(14); } } @@ -226,10 +226,14 @@ static void handle_r4_qpu_write(struct qblock *block, struct qinst *qinst, struct qpu_reg dst) { - if (dst.mux != QPU_MUX_R4) + if (dst.mux != QPU_MUX_R4) { queue(block, qpu_a_MOV(dst, qpu_r4())); - else if (qinst->sf) - queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4())); + set_last_cond_add(block, qinst->cond); + } else { + assert(qinst->cond == QPU_COND_ALWAYS); + if (qinst->sf) + queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4())); + } } static void @@ -290,8 +294,8 @@ vc4_generate_code_block(struct vc4_compile *c, }; uint64_t unpack = 0; - struct qpu_reg src[4]; - for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { + struct qpu_reg src[ARRAY_SIZE(qinst->src)]; + for (int i = 0; i < qir_get_nsrc(qinst); i++) { int index = qinst->src[i].index; switch (qinst->src[i].file) { case QFILE_NULL: @@ -349,6 +353,11 @@ vc4_generate_code_block(struct vc4_compile *c, case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: unreachable("bad qir src file"); } } @@ -381,6 +390,23 @@ vc4_generate_code_block(struct vc4_compile *c, dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP); break; + case QFILE_TEX_S: + case QFILE_TEX_S_DIRECT: + dst = qpu_rb(QPU_W_TMU0_S); + break; + + case QFILE_TEX_T: + dst = qpu_rb(QPU_W_TMU0_T); + break; + + case QFILE_TEX_R: + dst = qpu_rb(QPU_W_TMU0_R); + break; + + case QFILE_TEX_B: + dst = qpu_rb(QPU_W_TMU0_B); + break; + case QFILE_VARY: case QFILE_UNIF: case QFILE_SMALL_IMM: @@ -422,6 +448,7 @@ vc4_generate_code_block(struct vc4_compile *c, } handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; break; @@ -473,33 +500,27 @@ vc4_generate_code_block(struct vc4_compile *c, *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_COLOR_LOAD); handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; break; case QOP_VARY_ADD_C: queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack); break; - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S + - (qinst->op - QOP_TEX_S)), - src[0]) | unpack); - break; - - case QOP_TEX_DIRECT: - fixup_raddr_conflict(block, dst, &src[0], &src[1], - qinst, &unpack); - queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), - src[0], src[1]) | unpack); - break; case QOP_TEX_RESULT: queue(block, qpu_NOP()); *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_LOAD_TMU0); handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; + break; + + case QOP_THRSW: + queue(block, qpu_NOP()); + *last_inst(block) = qpu_set_sig(*last_inst(block), + QPU_SIG_THREAD_SWITCH); + c->last_thrsw = last_inst(block); break; case QOP_BRANCH: @@ -533,7 +554,7 @@ vc4_generate_code_block(struct vc4_compile *c, * argument slot as well so that we don't take up * another raddr 
just to get unused data. */ - if (qir_get_op_nsrc(qinst->op) == 1) + if (qir_get_non_sideband_nsrc(qinst) == 1) src[1] = src[0]; fixup_raddr_conflict(block, dst, &src[0], &src[1], @@ -587,6 +608,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) qir_for_each_block(block, c) vc4_generate_code_block(c, block, temp_registers); + /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW. + * + * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi) + * that ensures that a later thread doesn't try to lock the scoreboard + * and terminate before an earlier-spawned thread on the same QPU, by + * delaying switching back to the later shader until earlier has + * finished. Otherwise, if the earlier thread was hitting the same + * quad, the scoreboard would deadlock. + */ + if (c->last_thrsw) { + assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) == + QPU_SIG_THREAD_SWITCH); + *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) | + QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH, + QPU_SIG)); + } + uint32_t cycles = qpu_schedule_instructions(c); uint32_t inst_count_at_schedule_time = c->qpu_inst_count; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 25adbe671..9141396c8 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -385,12 +385,27 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) switch (sig) { case QPU_SIG_SW_BREAKPOINT: case QPU_SIG_NONE: - case QPU_SIG_THREAD_SWITCH: - case QPU_SIG_LAST_THREAD_SWITCH: case QPU_SIG_SMALL_IMM: case QPU_SIG_LOAD_IMM: break; + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: + /* All accumulator contents and flags are undefined after the + * switch. + */ + for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) + add_write_dep(state, &state->last_r[i], n); + add_write_dep(state, &state->last_sf, n); + + /* Scoreboard-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(state, &state->last_tlb, n); + + add_write_dep(state, &state->last_tmu_write, n); + break; + case QPU_SIG_LOAD_TMU0: case QPU_SIG_LOAD_TMU1: /* TMU loads are coming from a FIFO, so ordering is important. @@ -453,6 +468,7 @@ struct choose_scoreboard { int last_sfu_write_tick; int last_uniforms_reset_tick; uint32_t last_waddr_a, last_waddr_b; + bool tlb_locked; }; static bool @@ -461,6 +477,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* Full immediate loads don't read any registers. */ + if (sig == QPU_SIG_LOAD_IMM) + return false; + uint32_t src_muxes[] = { QPU_GET_FIELD(inst, QPU_ADD_A), QPU_GET_FIELD(inst, QPU_ADD_B), @@ -554,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, struct schedule_node *chosen = NULL; int chosen_prio = 0; + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. 
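
[Editor's note: a generic model; the field position below is illustrative, not the authoritative QPU encoding.] The LAST_THRSW patch-up in vc4_generate_code() above is a plain field rewrite on the packed 64-bit instruction: mask out the 4-bit signal field, then OR in QPU_SIG_LAST_THREAD_SWITCH. QPU_UPDATE_FIELD()/QPU_SET_FIELD() reduce to:

#include <stdint.h>

#define SIG_SHIFT 60                          /* illustrative */
#define SIG_MASK  ((uint64_t)0xf << SIG_SHIFT)

/* Clear the old signal, install the new one. */
static uint64_t set_sig(uint64_t inst, uint64_t sig)
{
        return (inst & ~SIG_MASK) | ((sig << SIG_SHIFT) & SIG_MASK);
}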
+ */ + if (prev_inst) { + uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst, + QPU_SIG); + if (prev_sig == QPU_SIG_THREAD_SWITCH || + prev_sig == QPU_SIG_LAST_THREAD_SWITCH) { + return NULL; + } + } + list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); /* Don't choose the branch instruction until it's the last one * left. XXX: We could potentially choose it before it's the * last one, if the remaining instructions fit in the delay * slots. */ - if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + if (sig == QPU_SIG_BRANCH && !list_is_singular(schedule_list)) { continue; } @@ -586,9 +620,25 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, * that they're compatible. */ if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. + */ + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + continue; + } + if (prev_inst->uniform != -1 && n->uniform != -1) continue; + /* Don't merge in something that will lock the TLB. + * Hopwefully what we have in inst will release some + * other instructions, allowing us to delay the + * TLB-locking instruction until later. + */ + if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + continue; + inst = qpu_merge_inst(prev_inst->inst->inst, inst); if (!inst) continue; @@ -647,6 +697,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, waddr_mul == QPU_W_UNIFORMS_ADDRESS) { scoreboard->last_uniforms_reset_tick = scoreboard->tick; } + + if (qpu_inst_is_tlb(inst)) + scoreboard->tlb_locked = true; } static void @@ -678,6 +731,26 @@ static uint32_t waddr_latency(uint32_t waddr, uint64_t after) /* Apply some huge latency between texture fetch requests and getting * their results back. + * + * FIXME: This is actually pretty bogus. If we do: + * + * mov tmu0_s, a + * <a bit of math> + * mov tmu0_s, b + * load_tmu0 + * <more math> + * load_tmu0 + * + * we count that as worse than + * + * mov tmu0_s, a + * mov tmu0_s, b + * <lots of math> + * load_tmu0 + * <more math> + * load_tmu0 + * + * because we associate the first load_tmu0 with the *second* tmu0_s. */ if (waddr == QPU_W_TMU0_S) { if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0) @@ -768,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list, } } +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP); + assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. 
*/ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); + } +} + static uint32_t schedule_instructions(struct vc4_compile *c, struct choose_scoreboard *scoreboard, @@ -860,10 +978,6 @@ schedule_instructions(struct vc4_compile *c, fprintf(stderr, "\n"); } - qpu_serialize_one_inst(c, inst); - - update_scoreboard_for_chosen(scoreboard, inst); - /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to * be scheduled. Update the children's unblocked time for this @@ -872,6 +986,14 @@ schedule_instructions(struct vc4_compile *c, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || + QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { + emit_thrsw(c, scoreboard, inst); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + scoreboard->tick++; time++; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c index 02fadaf61..08dd6e5df 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -58,6 +58,10 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b) if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) return false; + /* Load immediates don't read any registers. */ + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM) + return false; + for (int i = 0; i < ARRAY_SIZE(src_regs); i++) { if (!ignore_a && src_regs[i].mux == QPU_MUX_A && @@ -109,6 +113,7 @@ void vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) { bool scoreboard_locked = false; + bool threaded = false; /* We don't want to do validation in release builds, but we want to * keep compiling the validation code to make sure it doesn't get @@ -120,11 +125,17 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) for (int i = 0; i < num_inst; i++) { uint64_t inst = insts[i]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); - if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_PROG_END) { + if (sig != QPU_SIG_PROG_END) { if (qpu_inst_is_tlb(inst)) scoreboard_locked = true; + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + threaded = true; + } + continue; } @@ -359,4 +370,98 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) waddr_mul == QPU_W_UNIFORMS_ADDRESS) last_unif_pointer_update = i; } + + if (threaded) { + bool last_thrsw_found = false; + bool scoreboard_locked = false; + int tex_samples_outstanding = 0; + int last_tex_samples_outstanding = 0; + int thrsw_ip = -1; + + for (int i = 0; i < num_inst; i++) { + uint64_t inst = insts[i]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + if (i == thrsw_ip) { + /* In order to get texture results back in the + * correct order, before a new thrsw we have + * to read all the texture results from before + * the previous thrsw. + * + * FIXME: Is collecting the remaining results + * during the delay slots OK, or should we do + * this at THRSW signal time? 
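
[Editor's note: a compressed standalone model of the validator loop that follows; the function names are invented.] The accounting behind this FIXME: texture requests issued since the last switch become "outstanding" at the switch point, each TMU load drains one, and a new switch is only legal once the previous batch is fully drained:

#include <stdbool.h>

static int pending;       /* TMU writes since the last thrsw */
static int outstanding;   /* results owed from before that thrsw */

static bool on_thrsw(void)
{
        if (outstanding != 0)
                return false;   /* earlier results still in the FIFO */
        outstanding = pending;
        pending = 0;
        return true;
}

static bool on_tmu_load(void)
{
        if (outstanding == 0)
                return false;   /* nothing queued to collect */
        outstanding--;
        return true;
}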
+ */ + if (last_tex_samples_outstanding != 0) { + fail_instr(inst, "THRSW with texture " + "results from the previous " + "THRSW still in the FIFO."); + } + + last_tex_samples_outstanding = + tex_samples_outstanding; + tex_samples_outstanding = 0; + } + + if (qpu_inst_is_tlb(inst)) + scoreboard_locked = true; + + switch (sig) { + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: + /* No thread switching with the scoreboard + * locked. Doing so means we may deadlock + * when the other thread tries to lock + * scoreboard. + */ + if (scoreboard_locked) { + fail_instr(inst, "THRSW with the " + "scoreboard locked."); + } + + /* No thread switching after lthrsw, since + * lthrsw means that we get delayed until the + * other shader is ready for us to terminate. + */ + if (last_thrsw_found) { + fail_instr(inst, "THRSW after a " + "previous LTHRSW"); + } + + if (sig == QPU_SIG_LAST_THREAD_SWITCH) + last_thrsw_found = true; + + /* No THRSW while we already have a THRSW + * queued. + */ + if (i < thrsw_ip) { + fail_instr(inst, + "THRSW with a THRSW queued."); + } + + thrsw_ip = i + 3; + break; + + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + if (last_tex_samples_outstanding == 0) { + fail_instr(inst, "TMU load with nothing " + "in the results fifo from " + "the previous THRSW."); + } + + last_tex_samples_outstanding--; + break; + } + + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + if (waddr_add == QPU_W_TMU0_S || + waddr_add == QPU_W_TMU1_S || + waddr_mul == QPU_W_TMU0_S || + waddr_mul == QPU_W_TMU1_S) { + tex_samples_outstanding++; + } + } + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c index ab343ee31..506fdb593 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -115,37 +115,67 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); - vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); + /* The physical regfiles split us into two classes, with [0] being the + * whole space and [1] being the bottom half (for threaded fragment + * shaders). + */ + for (int i = 0; i < 2; i++) { + vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs); + } vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { - /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in + + /* r0-r3 */ + for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); + } + + /* R4 gets a special class because it can't be written as a general + * purpose register. (it's TMU_NOSWAP as a write address). 
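
[Editor's note: a predicate summarizing the loops that follow, not a replacement for them.] The class construction below reduces to one membership rule: index [0] classes take every A/B register, index [1] (threaded fragment shaders) only addresses below 16, and ra14/rb14 stay reserved in both for the raddr-conflict spill:

#include <stdbool.h>
#include <stdint.h>

static bool ab_reg_in_class(uint32_t addr, bool threaded)
{
        if (addr == 14)
                return false;   /* reserved: fixup_raddr_conflict() */
        return !threaded || addr < 16;
}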
+ */ + for (int i = 0; i < 2; i++) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i], + ACC_INDEX + 4); + ra_class_add_reg(vc4->regs, vc4->reg_class_any[i], + ACC_INDEX + 4); + } + + /* A/B */ + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { + /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c */ - if (vc4_regs[i].addr == 31) + if (vc4_regs[i].addr == 14) continue; - /* R4 can't be written as a general purpose register. (it's - * TMU_NOSWAP as a write address). - */ - if (vc4_regs[i].mux == QPU_MUX_R4) { - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); - continue; + ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); } - if (vc4_regs[i].mux <= QPU_MUX_R3) - ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i); - } + /* A only */ + if (((i - AB_INDEX) & 1) == 0) { + ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { - ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, + vc4->reg_class_a[1], i); + ra_class_add_reg(vc4->regs, + vc4->reg_class_r4_or_a[1], i); + } + } } ra_set_finalize(vc4->regs, NULL); @@ -166,7 +196,7 @@ node_to_temp_priority(const void *in_a, const void *in_b) } #define CLASS_BIT_A (1 << 0) -#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_B (1 << 1) #define CLASS_BIT_R4 (1 << 2) #define CLASS_BIT_R0_R3 (1 << 4) @@ -212,7 +242,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) * incrementally remove bits that the temp definitely can't be in. */ memset(class_bits, - CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3, sizeof(class_bits)); int ip = 0; @@ -226,6 +256,14 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) if (c->temp_start[i] < ip && c->temp_end[i] > ip) class_bits[i] &= ~CLASS_BIT_R4; } + + /* If we're doing a conditional write of something + * writing R4 (math, tex results), then make sure that + * we store in a temp so that we actually + * conditionally move the result. + */ + if (inst->cond != QPU_COND_ALWAYS) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; } else { /* R4 can't be written as a general purpose * register. (it's TMU_NOSWAP as a write address). @@ -250,6 +288,17 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; break; + case QOP_THRSW: + /* All accumulators are invalidated across a thread + * switch. + */ + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) + class_bits[i] &= ~(CLASS_BIT_R0_R3 | + CLASS_BIT_R4); + } + break; + default: break; } @@ -265,7 +314,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) * can only be done from regfile A, while float unpacks can be * either A or R4. 
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP && inst->src[i].pack) { if (qir_is_float_input(inst)) { @@ -285,22 +334,40 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) int node = temp_to_node[i]; switch (class_bits[i]) { - case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: - ra_set_node_class(g, node, vc4->reg_class_any); + case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: + ra_set_node_class(g, node, + vc4->reg_class_any[c->fs_threaded]); break; - case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: - ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc); + case CLASS_BIT_A | CLASS_BIT_B: + ra_set_node_class(g, node, + vc4->reg_class_a_or_b[c->fs_threaded]); + break; + case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: + ra_set_node_class(g, node, + vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); break; case CLASS_BIT_A | CLASS_BIT_R4: - ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + ra_set_node_class(g, node, + vc4->reg_class_r4_or_a[c->fs_threaded]); break; case CLASS_BIT_A: - ra_set_node_class(g, node, vc4->reg_class_a); + ra_set_node_class(g, node, + vc4->reg_class_a[c->fs_threaded]); break; case CLASS_BIT_R0_R3: ra_set_node_class(g, node, vc4->reg_class_r0_r3); break; + default: + /* DDX/DDY used across thread switched might get us + * here. + */ + if (c->fs_threaded) { + c->failed = true; + free(temp_registers); + return NULL; + } + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", i, class_bits[i]); abort(); @@ -321,9 +388,13 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) bool ok = ra_allocate(g); if (!ok) { - fprintf(stderr, "Failed to register allocate:\n"); - qir_dump(c); + if (!c->fs_threaded) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + } + c->failed = true; + free(temp_registers); return NULL; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c index 7d5076f42..37acefdc0 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c @@ -46,7 +46,7 @@ qir_reorder_uniforms(struct vc4_compile *c) qir_for_each_inst_inorder(inst, c) { uint32_t new = ~0; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_UNIF) continue; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c index 704cd71ea..596f73dfb 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c @@ -165,7 +165,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->width0 == box->width && prsc->height0 == box->height && prsc->depth0 == box->depth && - prsc->array_size == 1) { + prsc->array_size == 1 && + rsc->bo->private) { usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; } @@ -283,6 +284,20 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_MAP_DIRECTLY) return NULL; + if (format == PIPE_FORMAT_ETC1_RGB8) { + /* ETC1 is arranged as 64-bit blocks, where each block + * is 4x4 pixels. Texture tiling operates on the + * 64-bit block the way it would an uncompressed + * pixels. 
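
[Editor's note: a small helper capturing math the driver does inline.] Both ETC1 hunks — this one and vc4_setup_slices() just below — apply the same change of units: each 64-bit block covers 4x4 pixels, so all tiling math runs on a block grid whose "pixel" is 8 bytes:

#include <stdint.h>

/* ETC1: 4x4 pixels per 64-bit block, rounded up on each axis. */
static void
etc1_block_dims(uint32_t width, uint32_t height,
                uint32_t *bw, uint32_t *bh, uint32_t *cpp)
{
        *bw = (width + 3) >> 2;
        *bh = (height + 3) >> 2;
        *cpp = 8;
}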
+ */ + assert(!(ptrans->box.x & 3)); + assert(!(ptrans->box.y & 3)); + ptrans->box.x >>= 2; + ptrans->box.y >>= 2; + ptrans->box.width = (ptrans->box.width + 3) >> 2; + ptrans->box.height = (ptrans->box.height + 3) >> 2; + } + /* We need to align the box to utile boundaries, since that's * what load/store operates on. This may cause us to need to * read out the original contents in that border area. Right @@ -387,6 +402,11 @@ vc4_setup_slices(struct vc4_resource *rsc) struct pipe_resource *prsc = &rsc->base.b; uint32_t width = prsc->width0; uint32_t height = prsc->height0; + if (prsc->format == PIPE_FORMAT_ETC1_RGB8) { + width = (width + 3) >> 2; + height = (height + 3) >> 2; + } + uint32_t pot_width = util_next_power_of_two(width); uint32_t pot_height = util_next_power_of_two(height); uint32_t offset = 0; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c index 72fd09aee..27d23dc96 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c @@ -123,9 +123,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_SHADOW_MAP: case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_TWO_SIDED_STENCIL: - case PIPE_CAP_USER_INDEX_BUFFERS: case PIPE_CAP_TEXTURE_MULTISAMPLE: case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: return 1; /* lying for GL 2.0 */ @@ -225,8 +225,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_STRING_MARKER: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: @@ -239,11 +239,25 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: + case PIPE_CAP_NATIVE_FENCE_FD: + case PIPE_CAP_TGSI_FS_FBFETCH: + case PIPE_CAP_TGSI_MUL_ZERO_WINS: + case PIPE_CAP_DOUBLES: + case PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: + case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: return 0; /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: return 0; @@ -336,8 +350,9 @@ vc4_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) } static int -vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, - enum pipe_shader_cap param) +vc4_screen_get_shader_param(struct pipe_screen *pscreen, + enum pipe_shader_type shader, + enum pipe_shader_cap param) { if (shader != PIPE_SHADER_VERTEX && shader != PIPE_SHADER_FRAGMENT) { @@ -356,10 +371,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return vc4_screen(pscreen)->has_control_flow; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_FRAGMENT) - return 8; - else - return 16; + return 8; case PIPE_SHADER_CAP_MAX_OUTPUTS: return shader == PIPE_SHADER_FRAGMENT ? 
1 : 8; case PIPE_SHADER_CAP_MAX_TEMPS: @@ -368,8 +380,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 16 * 1024 * sizeof(float); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: return 1; - case PIPE_SHADER_CAP_MAX_PREDS: - return 0; /* nothing uses this */ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 0; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: @@ -384,7 +394,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 0; case PIPE_SHADER_CAP_INTEGERS: return 1; - case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: @@ -401,6 +410,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: return 0; default: fprintf(stderr, "unknown shader param %d\n", param); @@ -416,6 +426,7 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, unsigned sample_count, unsigned usage) { + struct vc4_screen *screen = vc4_screen(pscreen); unsigned retval = 0; if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES) @@ -485,7 +496,8 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, } if ((usage & PIPE_BIND_SAMPLER_VIEW) && - vc4_tex_format_supported(format)) { + vc4_tex_format_supported(format) && + (format != PIPE_FORMAT_ETC1_RGB8 || screen->has_etc1)) { retval |= PIPE_BIND_SAMPLER_VIEW; } @@ -526,16 +538,12 @@ static int handle_compare(void *key1, void *key2) } static bool -vc4_supports_branches(struct vc4_screen *screen) +vc4_has_feature(struct vc4_screen *screen, uint32_t feature) { -#if USE_VC4_SIMULATOR - return true; -#endif - struct drm_vc4_get_param p = { - .param = DRM_VC4_PARAM_SUPPORTS_BRANCHES, + .param = feature, }; - int ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p); if (ret != 0) return false; @@ -546,11 +554,6 @@ vc4_supports_branches(struct vc4_screen *screen) static bool vc4_get_chip_info(struct vc4_screen *screen) { -#if USE_VC4_SIMULATOR - screen->v3d_ver = 21; - return true; -#endif - struct drm_vc4_get_param ident0 = { .param = DRM_VC4_PARAM_V3D_IDENT0, }; @@ -559,7 +562,7 @@ vc4_get_chip_info(struct vc4_screen *screen) }; int ret; - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0); + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0); if (ret != 0) { if (errno == EINVAL) { /* Backwards compatibility with 2835 kernels which @@ -573,7 +576,7 @@ vc4_get_chip_info(struct vc4_screen *screen) return false; } } - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1); + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1); if (ret != 0) { fprintf(stderr, "Couldn't get V3D IDENT1: %s\n", strerror(errno)); @@ -612,11 +615,15 @@ vc4_screen_create(int fd) screen->fd = fd; list_inithead(&screen->bo_cache.time_list); - pipe_mutex_init(screen->bo_handles_mutex); + (void) mtx_init(&screen->bo_handles_mutex, mtx_plain); screen->bo_handles = util_hash_table_create(handle_hash, handle_compare); - if (vc4_supports_branches(screen)) - screen->has_control_flow = true; + screen->has_control_flow = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_BRANCHES); + screen->has_etc1 = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_ETC1); + screen->has_threaded_fs = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); if (!vc4_get_chip_info(screen)) 
goto fail;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
index 16003cfcc..34d15381a 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
@@ -30,6 +30,10 @@
 #include "util/list.h"
 #include "util/slab.h"
 
+#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1
+#define DRM_VC4_PARAM_SUPPORTS_ETC1 4
+#endif
+
 struct vc4_bo;
 
 #define VC4_DEBUG_CL 0x0001
@@ -47,6 +51,8 @@ struct vc4_bo;
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
 
+struct vc4_simulator_file;
+
 struct vc4_screen {
         struct pipe_screen base;
         int fd;
@@ -55,9 +61,6 @@ struct vc4_screen {
 
         const char *name;
 
-        void *simulator_mem_base;
-        uint32_t simulator_mem_size;
-
         /** The last seqno we've completed a wait for.
          *
          * This lets us slightly optimize our waits by skipping wait syscalls
@@ -74,18 +77,22 @@ struct vc4_screen {
                 struct list_head *size_list;
                 uint32_t size_list_size;
 
-                pipe_mutex lock;
+                mtx_t lock;
                 uint32_t bo_size;
                 uint32_t bo_count;
         } bo_cache;
 
         struct util_hash_table *bo_handles;
-        pipe_mutex bo_handles_mutex;
+        mtx_t bo_handles_mutex;
         uint32_t bo_size;
         uint32_t bo_count;
 
         bool has_control_flow;
+        bool has_etc1;
+        bool has_threaded_fs;
+
+        struct vc4_simulator_file *sim_file;
 };
 
 static inline struct vc4_screen *
@@ -105,7 +112,8 @@ vc4_screen_bo_from_handle(struct pipe_screen *pscreen,
 
 const void *
 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
-                                enum pipe_shader_ir ir, unsigned shader);
+                                enum pipe_shader_ir ir,
+                                enum pipe_shader_type shader);
 
 extern uint32_t vc4_debug;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
index 0291a4e14..9565c49ef 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
@@ -21,9 +21,37 @@
  * IN THE SOFTWARE.
  */
 
+/**
+ * @file vc4_simulator.c
+ *
+ * Implements VC4 simulation on top of a non-VC4 GEM fd.
+ *
+ * This file's goal is to emulate the VC4 ioctls' behavior in the kernel on
+ * top of the simpenrose software simulator.  Generally, VC4 driver BOs have a
+ * GEM-side copy of their contents and a simulator-side memory area that the
+ * GEM contents get copied into during simulation.  Once simulation is done,
+ * the simulator's data is copied back out to the GEM BOs, so that rendering
+ * appears on the screen as if actual hardware rendering had been done.
+ *
+ * One of the limitations of this code is that we shouldn't really need a
+ * GEM-side BO for non-window-system BOs.  However, we do need unique BO
+ * handles for each of our GEM bos so that this file can look up its state
+ * from the handle passed in at submit ioctl time (also, a couple of places
+ * outside of this file still call ioctls directly on the fd).
+ *
+ * Another limitation is that BO import doesn't work unless the underlying
+ * window system's BO size matches what VC4 is going to use, which of course
+ * doesn't work out in practice.  This means that for now, only DRI3 (VC4
+ * makes the winsys BOs) is supported, not DRI2 (window system makes the
+ * winsys BOs).
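+ *
+ * (With DRI3 the client allocates the window-system BOs, so VC4 controls
+ * their size; with DRI2 the X server allocates them, and the sizes generally
+ * won't match.)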
+ */ + #ifdef USE_VC4_SIMULATOR +#include <sys/mman.h> +#include "xf86drm.h" #include "util/u_memory.h" +#include "util/u_mm.h" #include "util/ralloc.h" #include "vc4_screen.h" @@ -32,53 +60,160 @@ #include "vc4_simulator_validate.h" #include "simpenrose/simpenrose.h" -static mtx_t exec_mutex = _MTX_INITIALIZER_NP; +/** Global (across GEM fds) state for the simulator */ +static struct vc4_simulator_state { + mtx_t mutex; + + void *mem; + ssize_t mem_size; + struct mem_block *heap; + struct mem_block *overflow; + + /** Mapping from GEM handle to struct vc4_simulator_bo * */ + struct hash_table *fd_map; + + int refcount; +} sim_state = { + .mutex = _MTX_INITIALIZER_NP, +}; + +/** Per-GEM-fd state for the simulator. */ +struct vc4_simulator_file { + int fd; + + /* This is weird -- we make a "vc4_device" per file, even though on + * the kernel side this is a global. We do this so that kernel code + * calling us for BO allocation can get to our screen. + */ + struct drm_device dev; + + /** Mapping from GEM handle to struct vc4_simulator_bo * */ + struct hash_table *bo_map; +}; + +/** Wrapper for drm_vc4_bo tracking the simulator-specific state. */ +struct vc4_simulator_bo { + struct drm_vc4_bo base; + struct vc4_simulator_file *file; + + /** Area for this BO within sim_state->mem */ + struct mem_block *block; + void *winsys_map; + uint32_t winsys_stride; + + int handle; +}; + +static void * +int_to_key(int key) +{ + return (void *)(uintptr_t)key; +} + +static struct vc4_simulator_file * +vc4_get_simulator_file_for_fd(int fd) +{ + struct hash_entry *entry = _mesa_hash_table_search(sim_state.fd_map, + int_to_key(fd + 1)); + return entry ? entry->data : NULL; +} /* A marker placed just after each BO, then checked after rendering to make * sure it's still there. */ #define BO_SENTINEL 0xfedcba98 -#define OVERFLOW_SIZE (32 * 1024 * 1024) +#define PAGE_ALIGN2 12 -static struct drm_gem_cma_object * -vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo) +/** + * Allocates space in simulator memory and returns a tracking struct for it + * that also contains the drm_gem_cma_object struct. 
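+ *
+ * The space comes from the global sim_state heap via u_mmAllocMem(), and a
+ * BO_SENTINEL word is written just past the end of the allocation so the
+ * post-rendering checks can catch out-of-bounds writes.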
+ */ +static struct vc4_simulator_bo * +vc4_create_simulator_bo(int fd, int handle, unsigned size) { - struct vc4_context *vc4 = dev->vc4; - struct vc4_screen *screen = vc4->screen; - struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo); - struct drm_gem_cma_object *obj = &drm_bo->base; - uint32_t size = align(bo->size, 4096); + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); + struct vc4_simulator_bo *sim_bo = rzalloc(file, + struct vc4_simulator_bo); + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + size = align(size, 4096); + + sim_bo->file = file; + sim_bo->handle = handle; + + mtx_lock(&sim_state.mutex); + sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, PAGE_ALIGN2, 0); + mtx_unlock(&sim_state.mutex); + assert(sim_bo->block); - drm_bo->bo = bo; obj->base.size = size; - obj->base.dev = dev; - obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next; + obj->base.dev = &file->dev; + obj->vaddr = sim_state.mem + sim_bo->block->ofs; obj->paddr = simpenrose_hw_addr(obj->vaddr); - dev->simulator_mem_next += size + sizeof(uint32_t); - dev->simulator_mem_next = align(dev->simulator_mem_next, 4096); - assert(dev->simulator_mem_next <= screen->simulator_mem_size); + *(uint32_t *)(obj->vaddr + size) = BO_SENTINEL; - *(uint32_t *)(obj->vaddr + bo->size) = BO_SENTINEL; + /* A handle of 0 is used for vc4_gem.c internal allocations that + * don't need to go in the lookup table. + */ + if (handle != 0) { + mtx_lock(&sim_state.mutex); + _mesa_hash_table_insert(file->bo_map, int_to_key(handle), bo); + mtx_unlock(&sim_state.mutex); + } + + return sim_bo; +} - return obj; +static void +vc4_free_simulator_bo(struct vc4_simulator_bo *sim_bo) +{ + struct vc4_simulator_file *sim_file = sim_bo->file; + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + + if (sim_bo->winsys_map) + munmap(sim_bo->winsys_map, obj->base.size); + + mtx_lock(&sim_state.mutex); + u_mmFreeMem(sim_bo->block); + if (sim_bo->handle) { + struct hash_entry *entry = + _mesa_hash_table_search(sim_file->bo_map, + int_to_key(sim_bo->handle)); + _mesa_hash_table_remove(sim_file->bo_map, entry); + } + mtx_unlock(&sim_state.mutex); + ralloc_free(sim_bo); +} + +static struct vc4_simulator_bo * +vc4_get_simulator_bo(struct vc4_simulator_file *file, int gem_handle) +{ + mtx_lock(&sim_state.mutex); + struct hash_entry *entry = + _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle)); + mtx_unlock(&sim_state.mutex); + + return entry ? 
entry->data : NULL; } struct drm_gem_cma_object * drm_gem_cma_create(struct drm_device *dev, size_t size) { - struct vc4_context *vc4 = dev->vc4; - struct vc4_screen *screen = vc4->screen; - - struct vc4_bo *bo = vc4_bo_alloc(screen, size, "simulator validate"); - return vc4_wrap_bo_with_cma(dev, bo); + struct vc4_screen *screen = dev->screen; + struct vc4_simulator_bo *sim_bo = vc4_create_simulator_bo(screen->fd, + 0, size); + return &sim_bo->base.base; } static int vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job, struct vc4_exec_info *exec) { + int fd = dev->screen->fd; + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); struct drm_vc4_submit_cl *args = exec->args; struct vc4_bo **bos = job->bo_pointers.base; @@ -86,9 +221,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job, exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; - struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct vc4_simulator_bo *sim_bo = + vc4_get_simulator_bo(file, bo->handle); + struct drm_vc4_bo *drm_bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &drm_bo->base; - struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); + drm_bo->bo = bo; #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -118,14 +256,14 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec) struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); struct vc4_bo *bo = drm_bo->bo; - assert(*(uint32_t *)(obj->vaddr + bo->size) == BO_SENTINEL); + assert(*(uint32_t *)(obj->vaddr + + obj->base.size) == BO_SENTINEL); memcpy(bo->map, obj->vaddr, bo->size); if (drm_bo->validated_shader) { free(drm_bo->validated_shader->texture_samples); free(drm_bo->validated_shader); } - free(obj); } free(exec->bo); @@ -194,8 +332,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec) /* Add the static overflow memory area. */ bo_state[i].handle = exec->bo_count; - bo_state[i].paddr = 0; - bo_state[i].size = OVERFLOW_SIZE; + bo_state[i].paddr = sim_state.overflow->ofs; + bo_state[i].size = sim_state.overflow->size; i++; fwrite(bo_state, sizeof(*bo_state), state->bo_count, f); @@ -211,8 +349,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec) fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f); } - void *overflow = calloc(1, OVERFLOW_SIZE); - fwrite(overflow, 1, OVERFLOW_SIZE, f); + void *overflow = calloc(1, sim_state.overflow->size); + fwrite(overflow, 1, sim_state.overflow->size, f); free(overflow); free(state); @@ -225,23 +363,22 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, struct vc4_job *job) { struct vc4_screen *screen = vc4->screen; + int fd = screen->fd; + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]); struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL; - uint32_t winsys_stride = ctex ? ctex->bo->simulator_winsys_stride : 0; + struct vc4_simulator_bo *csim_bo = ctex ? vc4_get_simulator_bo(file, ctex->bo->handle) : NULL; + uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0; uint32_t sim_stride = ctex ? 
ctex->slices[0].stride : 0; uint32_t row_len = MIN2(sim_stride, winsys_stride); struct vc4_exec_info exec; - struct drm_device local_dev = { - .vc4 = vc4, - .simulator_mem_next = OVERFLOW_SIZE, - }; - struct drm_device *dev = &local_dev; + struct drm_device *dev = &file->dev; int ret; memset(&exec, 0, sizeof(exec)); list_inithead(&exec.unref_list); - if (ctex && ctex->bo->simulator_winsys_map) { + if (ctex && csim_bo->winsys_map) { #if 0 fprintf(stderr, "%dx%d %d %d %d\n", ctex->base.b.width0, ctex->base.b.height0, @@ -252,7 +389,7 @@ vc4_simulator_flush(struct vc4_context *vc4, for (int y = 0; y < ctex->base.b.height0; y++) { memcpy(ctex->bo->map + y * sim_stride, - ctex->bo->simulator_winsys_map + y * winsys_stride, + csim_bo->winsys_map + y * winsys_stride, row_len); } } @@ -269,7 +406,7 @@ vc4_simulator_flush(struct vc4_context *vc4, if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "RCL:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca, + vc4_dump_cl(sim_state.mem + exec.ct1ca, exec.ct1ea - exec.ct1ca, true); } @@ -281,7 +418,7 @@ vc4_simulator_flush(struct vc4_context *vc4, fprintf(stderr, "Binning returned %d flushes, should be 1.\n", bfc); fprintf(stderr, "Relocated binning command list:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct0ca, + vc4_dump_cl(sim_state.mem + exec.ct0ca, exec.ct0ea - exec.ct0ca, false); abort(); } @@ -291,7 +428,7 @@ vc4_simulator_flush(struct vc4_context *vc4, fprintf(stderr, "Rendering returned %d frames, should be 1.\n", rfc); fprintf(stderr, "Relocated render command list:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca, + vc4_dump_cl(sim_state.mem + exec.ct1ca, exec.ct1ea - exec.ct1ca, true); abort(); } @@ -302,16 +439,17 @@ vc4_simulator_flush(struct vc4_context *vc4, list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list, unref_head) { + struct vc4_simulator_bo *sim_bo = (struct vc4_simulator_bo *)bo; + struct drm_gem_cma_object *obj = &sim_bo->base.base; list_del(&bo->unref_head); - assert(*(uint32_t *)(bo->base.vaddr + bo->bo->size) == + assert(*(uint32_t *)(obj->vaddr + obj->base.size) == BO_SENTINEL); - vc4_bo_unreference(&bo->bo); - free(bo); + vc4_free_simulator_bo(sim_bo); } - if (ctex && ctex->bo->simulator_winsys_map) { + if (ctex && csim_bo->winsys_map) { for (int y = 0; y < ctex->base.b.height0; y++) { - memcpy(ctex->bo->simulator_winsys_map + y * winsys_stride, + memcpy(csim_bo->winsys_map + y * winsys_stride, ctex->bo->map + y * sim_stride, row_len); } @@ -320,33 +458,234 @@ vc4_simulator_flush(struct vc4_context *vc4, return 0; } -static void *sim_mem_base = NULL; -static int sim_mem_refcount = 0; -static ssize_t sim_mem_size = 256 * 1024 * 1024; +/** + * Map the underlying GEM object from the real hardware GEM handle. 
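+ *
+ * The fd is a plain GEM fd rather than a VC4 one, so the mapping goes
+ * through DRM_IOCTL_MODE_MAP_DUMB and an mmap() of the fake offset it
+ * returns.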
+ */ +static void * +vc4_simulator_map_winsys_bo(int fd, struct vc4_simulator_bo *sim_bo) +{ + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + int ret; + void *map; -void -vc4_simulator_init(struct vc4_screen *screen) + struct drm_mode_map_dumb map_dumb = { + .handle = sim_bo->handle, + }; + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); + if (ret != 0) { + fprintf(stderr, "map ioctl failure\n"); + abort(); + } + + map = mmap(NULL, obj->base.size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, map_dumb.offset); + if (map == MAP_FAILED) { + fprintf(stderr, + "mmap of bo %d (offset 0x%016llx, size %d) failed\n", + sim_bo->handle, (long long)map_dumb.offset, + (int)obj->base.size); + abort(); + } + + return map; +} + +/** + * Do fixups after a BO has been opened from a handle. + * + * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE + * time, but we're still using drmPrimeFDToHandle() so we have this helper to + * be called afterward instead. + */ +void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, + int handle, uint32_t size) { - mtx_lock(&exec_mutex); - if (sim_mem_refcount++) { - screen->simulator_mem_size = sim_mem_size; - screen->simulator_mem_base = sim_mem_base; - mtx_unlock(&exec_mutex); + struct vc4_simulator_bo *sim_bo = + vc4_create_simulator_bo(fd, handle, size); + + sim_bo->winsys_stride = winsys_stride; + sim_bo->winsys_map = vc4_simulator_map_winsys_bo(fd, sim_bo); +} + +/** + * Simulated ioctl(fd, DRM_VC4_CREATE_BO) implementation. + * + * Making a VC4 BO is just a matter of making a corresponding BO on the host. + */ +static int +vc4_simulator_create_bo_ioctl(int fd, struct drm_vc4_create_bo *args) +{ + int ret; + struct drm_mode_create_dumb create = { + .width = 128, + .bpp = 8, + .height = (args->size + 127) / 128, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + assert(create.size >= args->size); + + args->handle = create.handle; + + vc4_create_simulator_bo(fd, create.handle, args->size); + + return ret; +} + +/** + * Simulated ioctl(fd, DRM_VC4_CREATE_SHADER_BO) implementation. + * + * In simulation we defer shader validation until exec time. Just make a host + * BO and memcpy the contents in. + */ +static int +vc4_simulator_create_shader_bo_ioctl(int fd, + struct drm_vc4_create_shader_bo *args) +{ + int ret; + struct drm_mode_create_dumb create = { + .width = 128, + .bpp = 8, + .height = (args->size + 127) / 128, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + if (ret) + return ret; + assert(create.size >= args->size); + + args->handle = create.handle; + + vc4_create_simulator_bo(fd, create.handle, args->size); + + struct drm_mode_map_dumb map = { + .handle = create.handle + }; + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); + if (ret) + return ret; + + void *shader = mmap(NULL, args->size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, map.offset); + memcpy(shader, (void *)(uintptr_t)args->data, args->size); + munmap(shader, args->size); + + return 0; +} + +/** + * Simulated ioctl(fd, DRM_VC4_MMAP_BO) implementation. + * + * We just pass this straight through to dumb mmap. 
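+ * The handle is passed to DRM_IOCTL_MODE_MAP_DUMB, and the fake offset that
+ * comes back is returned for the caller to mmap(), just as for a real VC4
+ * BO.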
+ */ +static int +vc4_simulator_mmap_bo_ioctl(int fd, struct drm_vc4_mmap_bo *args) +{ + int ret; + struct drm_mode_map_dumb map = { + .handle = args->handle, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); + args->offset = map.offset; + + return ret; +} + +static int +vc4_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) +{ + /* Free the simulator's internal tracking. */ + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); + struct vc4_simulator_bo *sim_bo = vc4_get_simulator_bo(file, + args->handle); + + vc4_free_simulator_bo(sim_bo); + + /* Pass the call on down. */ + return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); +} + +static int +vc4_simulator_get_param_ioctl(int fd, struct drm_vc4_get_param *args) +{ + switch (args->param) { + case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + case DRM_VC4_PARAM_SUPPORTS_ETC1: + case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: + args->value = true; + return 0; + + case DRM_VC4_PARAM_V3D_IDENT0: + args->value = 0x02000000; + return 0; + + case DRM_VC4_PARAM_V3D_IDENT1: + args->value = 0x00000001; + return 0; + + default: + fprintf(stderr, "Unknown DRM_IOCTL_VC4_GET_PARAM(%lld)\n", + (long long)args->param); + abort(); + }; +} + +int +vc4_simulator_ioctl(int fd, unsigned long request, void *args) +{ + switch (request) { + case DRM_IOCTL_VC4_CREATE_BO: + return vc4_simulator_create_bo_ioctl(fd, args); + case DRM_IOCTL_VC4_CREATE_SHADER_BO: + return vc4_simulator_create_shader_bo_ioctl(fd, args); + case DRM_IOCTL_VC4_MMAP_BO: + return vc4_simulator_mmap_bo_ioctl(fd, args); + + case DRM_IOCTL_VC4_WAIT_BO: + case DRM_IOCTL_VC4_WAIT_SEQNO: + /* We do all of the vc4 rendering synchronously, so we just + * return immediately on the wait ioctls. This ignores any + * native rendering to the host BO, so it does mean we race on + * front buffer rendering. + */ + return 0; + + case DRM_IOCTL_VC4_GET_PARAM: + return vc4_simulator_get_param_ioctl(fd, args); + + case DRM_IOCTL_GEM_CLOSE: + return vc4_simulator_gem_close_ioctl(fd, args); + + case DRM_IOCTL_GEM_OPEN: + case DRM_IOCTL_GEM_FLINK: + return drmIoctl(fd, request, args); + default: + fprintf(stderr, "Unknown ioctl 0x%08x\n", (int)request); + abort(); + } +} + +static void +vc4_simulator_init_global(void) +{ + mtx_lock(&sim_state.mutex); + if (sim_state.refcount++) { + mtx_unlock(&sim_state.mutex); return; } - sim_mem_base = calloc(sim_mem_size, 1); - if (!sim_mem_base) + sim_state.mem_size = 256 * 1024 * 1024; + sim_state.mem = calloc(sim_state.mem_size, 1); + if (!sim_state.mem) abort(); - - screen->simulator_mem_size = sim_mem_size; - screen->simulator_mem_base = sim_mem_base; + sim_state.heap = u_mmInit(0, sim_state.mem_size); /* We supply our own memory so that we can have more aperture * available (256MB instead of simpenrose's default 64MB). */ - simpenrose_init_hardware_supply_mem(screen->simulator_mem_base, - screen->simulator_mem_size); + simpenrose_init_hardware_supply_mem(sim_state.mem, sim_state.mem_size); /* Carve out low memory for tile allocation overflow. The kernel * should be automatically handling overflow memory setup on real @@ -355,20 +694,50 @@ vc4_simulator_init(struct vc4_screen *screen) * up over the whole lifetime of simpenrose (not reused on each * flush), so it had better be big. 
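          *
          * The 32MB requested below matches the static OVERFLOW_SIZE area
          * this file used before its allocations moved onto the u_mmAllocMem
          * heap.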
*/ - simpenrose_supply_overflow_mem(0, OVERFLOW_SIZE); + sim_state.overflow = u_mmAllocMem(sim_state.heap, 32 * 1024 * 1024, + PAGE_ALIGN2, 0); + simpenrose_supply_overflow_mem(sim_state.overflow->ofs, + sim_state.overflow->size); + + mtx_unlock(&sim_state.mutex); + + sim_state.fd_map = + _mesa_hash_table_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); +} + +void +vc4_simulator_init(struct vc4_screen *screen) +{ + vc4_simulator_init_global(); + + screen->sim_file = rzalloc(screen, struct vc4_simulator_file); + + screen->sim_file->bo_map = + _mesa_hash_table_create(screen->sim_file, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + mtx_lock(&sim_state.mutex); + _mesa_hash_table_insert(sim_state.fd_map, int_to_key(screen->fd + 1), + screen->sim_file); + mtx_unlock(&sim_state.mutex); - mtx_unlock(&exec_mutex); + screen->sim_file->dev.screen = screen; } void vc4_simulator_destroy(struct vc4_screen *screen) { - mtx_lock(&exec_mutex); - if (!--sim_mem_refcount) { - free(sim_mem_base); - sim_mem_base = NULL; + mtx_lock(&sim_state.mutex); + if (!--sim_state.refcount) { + _mesa_hash_table_destroy(sim_state.fd_map, NULL); + u_mmDestroy(sim_state.heap); + free(sim_state.mem); + /* No memsetting it, because it contains the mutex. */ } - mtx_unlock(&exec_mutex); + mtx_unlock(&sim_state.mutex); } #endif /* USE_VC4_SIMULATOR */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h index 1352c9baf..d507b5fb6 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,8 +78,7 @@ typedef uint16_t u16; typedef uint32_t u32; struct drm_device { - struct vc4_context *vc4; - uint32_t simulator_mem_next; + struct vc4_screen *screen; }; struct drm_gem_object { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c index 124715895..2e00104e4 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c @@ -374,7 +374,8 @@ vc4_vertex_state_bind(struct pipe_context *pctx, void *hwcso) } static void -vc4_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index, +vc4_set_constant_buffer(struct pipe_context *pctx, + enum pipe_shader_type shader, uint index, const struct pipe_constant_buffer *cb) { struct vc4_context *vc4 = vc4_context(pctx); @@ -615,6 +616,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + if (prsc->format == PIPE_FORMAT_ETC1_RGB8) + so->texture_p1 |= VC4_TEX_P1_ETCFLIP_MASK; + return &so->base; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c index 4bcb85b16..07e1c9c5f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c @@ -52,41 +52,6 @@ #include "vc4_context.h" #include "vc4_tiling.h" -/** Return the width in pixels of a 64-byte microtile. */ -uint32_t -vc4_utile_width(int cpp) -{ - switch (cpp) { - case 1: - case 2: - return 8; - case 4: - return 4; - case 8: - return 2; - default: - fprintf(stderr, "unknown cpp: %d\n", cpp); - abort(); - } -} - -/** Return the height in pixels of a 64-byte microtile. 
*/ -uint32_t -vc4_utile_height(int cpp) -{ - switch (cpp) { - case 1: - return 8; - case 2: - case 4: - case 8: - return 4; - default: - fprintf(stderr, "unknown cpp: %d\n", cpp); - abort(); - } -} - /** * The texture unit decides what tiling format a particular miplevel is using * this function, so we lay out our miptrees accordingly. @@ -98,32 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * vc4_utile_height(cpp)); } -void -vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp) -{ - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t row_size = 64 / utile_h; - - for (int y = 0; y < utile_h; y++) { - memcpy(dst, src, row_size); - dst += dst_stride; - src += row_size; - } -} - -void -vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) -{ - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t row_size = 64 / utile_h; - - for (int y = 0; y < utile_h; y++) { - memcpy(dst, src, row_size); - dst += row_size; - src += src_stride; - } -} - static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { @@ -133,48 +72,6 @@ check_box_utile_alignment(const struct pipe_box *box, int cpp) assert(!(box->height & (vc4_utile_height(cpp) - 1))); } -static void -vc4_load_lt_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) -{ - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_load_utile(dst + (dst_stride * y + - x * cpp), - src + ((ystart + y) * src_stride + - (xstart + x) * 64 / utile_w), - dst_stride, cpp); - } - } -} - -static void -vc4_store_lt_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) -{ - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_store_utile(dst + ((ystart + y) * dst_stride + - (xstart + x) * 64 / utile_w), - src + (src_stride * y + - x * cpp), - src_stride, cpp); - } - } -} - /** * Takes a utile x and y (and the number of utiles of width of the image) and * returns the offset to the utile within a VC4_TILING_FORMAT_TF image. @@ -209,7 +106,10 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y, odd_stile_map[stile_index] : even_stile_map[stile_index]); - uint32_t utile_offset = 64 * ((utile_y & 3) * 4 + (utile_x & 3)); + /* This function no longer handles the utile offset within a subtile. + * Walking subtiles is the job of the LT image handler. + */ + assert(!(utile_x & 3) && !(utile_y & 3)); #if 0 fprintf(stderr, "utile %d,%d -> %d + %d + %d (stride %d,%d) = %d\n", @@ -219,29 +119,70 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y, tile_offset + stile_offset + utile_offset); #endif - return tile_offset + stile_offset + utile_offset; + return tile_offset + stile_offset; } -static void -vc4_load_t_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) +/** + * Loads or stores a T texture image by breaking it down into subtiles + * (1024-byte, 4x4-utile) sub-images that we can use the LT tiling functions + * on. 
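+ *
+ * A subtile is 4x4 utiles in LT layout, so at cpp == 4, for example, it
+ * covers a 16x16-pixel, 1024-byte region: t_utile_address() locates the
+ * subtile and the LT helpers then walk the utiles inside it.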
+ */ +static inline void +vc4_t_image_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, + bool to_cpu) { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); - uint32_t utile_stride = src_stride / cpp / utile_w; - uint32_t xstart = box->x / utile_w; - uint32_t ystart = box->y / utile_h; + uint32_t utile_w_shift = ffs(utile_w) - 1; + uint32_t utile_h_shift = ffs(utile_h) - 1; + uint32_t stile_w = 4 * utile_w; + uint32_t stile_h = 4 * utile_h; + assert(stile_w * stile_h * cpp == 1024); + uint32_t utile_stride = gpu_stride / cpp / utile_w; + uint32_t x1 = box->x; + uint32_t y1 = box->y; + uint32_t x2 = box->x + box->width; + uint32_t y2 = box->y + box->height; + struct pipe_box partial_box; + uint32_t gpu_lt_stride = stile_w * cpp; + + for (uint32_t y = y1; y < y2; y = align(y + 1, stile_h)) { + partial_box.y = y & (stile_h - 1); + partial_box.height = MIN2(y2 - y, stile_h - partial_box.y); + + uint32_t cpu_offset = 0; + for (uint32_t x = x1; x < x2; x = align(x + 1, stile_w)) { + partial_box.x = x & (stile_w - 1); + partial_box.width = MIN2(x2 - x, + stile_w - partial_box.x); + + /* The dst offset we want is the start of this + * subtile + */ + uint32_t gpu_offset = + t_utile_address((x >> utile_w_shift) & ~0x3, + (y >> utile_h_shift) & ~0x3, + utile_stride); - for (uint32_t y = 0; y < box->height / utile_h; y++) { - for (int x = 0; x < box->width / utile_w; x++) { - vc4_load_utile(dst + (y * utile_h * dst_stride + - x * utile_w * cpp), - src + t_utile_address(xstart + x, - ystart + y, - utile_stride), - dst_stride, cpp); + if (to_cpu) { + vc4_load_lt_image(cpu + cpu_offset, + cpu_stride, + gpu + gpu_offset, + gpu_lt_stride, + cpp, &partial_box); + } else { + vc4_store_lt_image(gpu + gpu_offset, + gpu_lt_stride, + cpu + cpu_offset, + cpu_stride, + cpp, &partial_box); + } + + cpu_offset += partial_box.width * cpp; } + cpu += cpu_stride * partial_box.height; } } @@ -250,22 +191,19 @@ vc4_store_t_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t utile_stride = dst_stride / cpp / utile_w; - uint32_t xstart = box->x / utile_w; - uint32_t ystart = box->y / utile_h; + vc4_t_image_helper(dst, dst_stride, + src, src_stride, + cpp, box, false); +} - for (uint32_t y = 0; y < box->height / utile_h; y++) { - for (int x = 0; x < box->width / utile_w; x++) { - vc4_store_utile(dst + t_utile_address(xstart + x, - ystart + y, - utile_stride), - src + (y * utile_h * src_stride + - x * utile_w * cpp), - src_stride, cpp); - } - } +static void +vc4_load_t_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + vc4_t_image_helper(src, src_stride, + dst, dst_stride, + cpp, box, true); } /** diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h index b90bba702..ba1ad6fb3 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,11 +24,56 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; -uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +#include <stdbool.h> +#include <stdint.h> +#include "util/macros.h" + +/** Return the width in pixels of a 64-byte microtile. 
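+ *
+ * A utile always holds 64 bytes, so the width halves as cpp doubles:
+ * 8 pixels at cpp 1 or 2, 4 pixels at cpp 4, and 2 pixels at cpp 8.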
*/ +static inline uint32_t +vc4_utile_width(int cpp) +{ + switch (cpp) { + case 1: + case 2: + return 8; + case 4: + return 4; + case 8: + return 2; + default: + unreachable("unknown cpp"); + } +} + +/** Return the height in pixels of a 64-byte microtile. */ +static inline uint32_t +vc4_utile_height(int cpp) +{ + switch (cpp) { + case 1: + return 8; + case 2: + case 4: + case 8: + return 4; + default: + unreachable("unknown cpp"); + } +} + bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; -void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); -void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); +void vc4_load_lt_image_base(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_store_lt_image_base(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_load_lt_image_neon(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_store_lt_image_neon(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, uint8_t tiling_format, int cpp, @@ -38,4 +83,34 @@ void vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box); +/* If we're building for ARMv7 (Pi 2+), assume it has NEON. For Raspbian we + * should extend this to have some runtime detection of being built for ARMv6 + * on a Pi 2+. + */ +#if defined(__ARM_ARCH) && __ARM_ARCH == 7 +#define NEON_SUFFIX(x) x ## _neon +#else +#define NEON_SUFFIX(x) x ## _base +#endif + +static inline void +vc4_load_lt_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride, + cpp, box); +} + +static inline void +vc4_store_lt_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride, + cpp, box); +} + +#undef NEON_SUFFIX + #endif /* VC4_TILING_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c new file mode 100644 index 000000000..f37a92e93 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -0,0 +1,212 @@ +/* + * Copyright © 2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file vc4_tiling_lt.c
+ *
+ * Helper functions from vc4_tiling.c that are compiled either with or
+ * without NEON assembly.
+ *
+ * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
+ * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
+ * sim build working.
+ */
+
+#include <string.h>
+#include "pipe/p_state.h"
+#include "vc4_tiling.h"
+
+#ifdef VC4_BUILD_NEON
+#define NEON_TAG(x) x ## _neon
+#else
+#define NEON_TAG(x) x ## _base
+#endif
+
+/** Returns the stride in bytes of a 64-byte microtile. */
+static uint32_t
+vc4_utile_stride(int cpp)
+{
+        switch (cpp) {
+        case 1:
+                return 8;
+        case 2:
+        case 4:
+        case 8:
+                return 16;
+        default:
+                unreachable("bad cpp");
+        }
+}
+
+static void
+vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
+{
+        uint32_t gpu_stride = vc4_utile_stride(cpp);
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        /* Load from the GPU in one shot, no interleave, to
+                         * d0-d7.
+                         */
+                        "vldm %0, {q0, q1, q2, q3}\n"
+                        /* Store each 8-byte line to cpu-side destination,
+                         * incrementing it by the stride each time.
+                         */
+                        "vst1.8 d0, [%1], %2\n"
+                        "vst1.8 d1, [%1], %2\n"
+                        "vst1.8 d2, [%1], %2\n"
+                        "vst1.8 d3, [%1], %2\n"
+                        "vst1.8 d4, [%1], %2\n"
+                        "vst1.8 d5, [%1], %2\n"
+                        "vst1.8 d6, [%1], %2\n"
+                        "vst1.8 d7, [%1]\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        /* Load from the GPU in one shot, no interleave, to
+                         * d0-d7.
+                         */
+                        "vldm %0, {q0, q1, q2, q3}\n"
+                        /* Store each 16-byte line in 2 parts to the cpu-side
+                         * destination.  (vst1 can only store one d-register
+                         * at a time).
+                         */
+                        "vst1.8 d0, [%1], %3\n"
+                        "vst1.8 d1, [%2], %3\n"
+                        "vst1.8 d2, [%1], %3\n"
+                        "vst1.8 d3, [%2], %3\n"
+                        "vst1.8 d4, [%1], %3\n"
+                        "vst1.8 d5, [%2], %3\n"
+                        "vst1.8 d6, [%1]\n"
+                        "vst1.8 d7, [%2]\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        }
+#else
+        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
+                memcpy(cpu, gpu + gpu_offset, gpu_stride);
+                cpu += cpu_stride;
+        }
+#endif
+}
+
+static void
+vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
+{
+        uint32_t gpu_stride = vc4_utile_stride(cpp);
+
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        /* Load each 8-byte line from cpu-side source,
+                         * incrementing it by the stride each time.
+                         */
+                        "vld1.8 d0, [%1], %2\n"
+                        "vld1.8 d1, [%1], %2\n"
+                        "vld1.8 d2, [%1], %2\n"
+                        "vld1.8 d3, [%1], %2\n"
+                        "vld1.8 d4, [%1], %2\n"
+                        "vld1.8 d5, [%1], %2\n"
+                        "vld1.8 d6, [%1], %2\n"
+                        "vld1.8 d7, [%1]\n"
+                        /* Store to the GPU in one shot, no interleave. */
+                        "vstm %0, {q0, q1, q2, q3}\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        /* Load each 16-byte line in 2 parts from the
+                         * cpu-side source.  (vld1 can only load one
+                         * d-register at a time).
+ */ + "vld1.8 d0, [%1], %3\n" + "vld1.8 d1, [%2], %3\n" + "vld1.8 d2, [%1], %3\n" + "vld1.8 d3, [%2], %3\n" + "vld1.8 d4, [%1], %3\n" + "vld1.8 d5, [%2], %3\n" + "vld1.8 d6, [%1]\n" + "vld1.8 d7, [%2]\n" + /* Store to the GPU in one shot, no interleave. */ + "vstm %0, {q0, q1, q2, q3}\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } +#else + for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { + memcpy(gpu + gpu_offset, cpu, gpu_stride); + cpu += cpu_stride; + } +#endif + +} + +void +NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + uint32_t utile_w = vc4_utile_width(cpp); + uint32_t utile_h = vc4_utile_height(cpp); + uint32_t xstart = box->x; + uint32_t ystart = box->y; + + for (uint32_t y = 0; y < box->height; y += utile_h) { + for (int x = 0; x < box->width; x += utile_w) { + vc4_load_utile(dst + (dst_stride * y + + x * cpp), + src + ((ystart + y) * src_stride + + (xstart + x) * 64 / utile_w), + dst_stride, cpp); + } + } +} + +void +NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + uint32_t utile_w = vc4_utile_width(cpp); + uint32_t utile_h = vc4_utile_height(cpp); + uint32_t xstart = box->x; + uint32_t ystart = box->y; + + for (uint32_t y = 0; y < box->height; y += utile_h) { + for (int x = 0; x < box->width; x += utile_w) { + vc4_store_utile(dst + ((ystart + y) * dst_stride + + (xstart + x) * 64 / utile_w), + src + (src_stride * y + + x * cpp), + src_stride, cpp); + } + } +} |