summaryrefslogtreecommitdiff
path: root/lib/mesa/src/broadcom
diff options
context:
space:
mode:
authorJonathan Gray <jsg@cvs.openbsd.org>2020-01-22 02:13:18 +0000
committerJonathan Gray <jsg@cvs.openbsd.org>2020-01-22 02:13:18 +0000
commitfdcc03929065b5bf5dd93553db219ea3e05c8c34 (patch)
treeca90dc8d9e89febdcd4160956c1b8ec098a4efc9 /lib/mesa/src/broadcom
parent3c9de4a7e13712b5696750bbd59a18c848742022 (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/broadcom')
-rw-r--r--lib/mesa/src/broadcom/.editorconfig3
-rw-r--r--lib/mesa/src/broadcom/Android.cle.mk39
-rw-r--r--lib/mesa/src/broadcom/Android.genxml.mk83
-rw-r--r--lib/mesa/src/broadcom/Android.mk29
-rw-r--r--lib/mesa/src/broadcom/cle/meson.build2
-rw-r--r--lib/mesa/src/broadcom/cle/v3d_decoder.c3
-rw-r--r--lib/mesa/src/broadcom/cle/v3d_packet_v33.xml141
-rw-r--r--lib/mesa/src/broadcom/common/v3d_device_info.c79
-rw-r--r--lib/mesa/src/broadcom/common/v3d_limits.h5
-rw-r--r--lib/mesa/src/broadcom/compiler/meson.build4
-rw-r--r--lib/mesa/src/broadcom/compiler/nir_to_vir.c1471
-rw-r--r--lib/mesa/src/broadcom/compiler/qpu_schedule.c312
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d33_tex.c22
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d40_tex.c104
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_compiler.h169
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c287
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c411
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c153
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c42
-rw-r--r--lib/mesa/src/broadcom/compiler/vir.c302
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_dump.c87
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_live_variables.c60
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c33
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c36
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c143
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c20
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_register_allocate.c228
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_to_qpu.c86
-rw-r--r--lib/mesa/src/broadcom/drm-shim/README.md17
-rw-r--r--lib/mesa/src/broadcom/drm-shim/meson.build62
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d.c98
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d.h70
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d_noop.c158
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3dx.c370
-rw-r--r--lib/mesa/src/broadcom/meson.build6
-rw-r--r--lib/mesa/src/broadcom/qpu/meson.build3
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_disasm.c4
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_instr.c120
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_instr.h4
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_pack.c34
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_validate.c0
-rw-r--r--lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c3
42 files changed, 3915 insertions, 1388 deletions
diff --git a/lib/mesa/src/broadcom/.editorconfig b/lib/mesa/src/broadcom/.editorconfig
new file mode 100644
index 000000000..f3d8c4791
--- /dev/null
+++ b/lib/mesa/src/broadcom/.editorconfig
@@ -0,0 +1,3 @@
+[*.{c,h}]
+indent_style = space
+indent_size = 8
diff --git a/lib/mesa/src/broadcom/Android.cle.mk b/lib/mesa/src/broadcom/Android.cle.mk
new file mode 100644
index 000000000..5634a8d4a
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.cle.mk
@@ -0,0 +1,39 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_broadcom_cle
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+LOCAL_SRC_FILES := $(BROADCOM_DECODER_FILES)
+
+LOCAL_STATIC_LIBRARIES := libmesa_broadcom_genxml
+
+LOCAL_C_INCLUDES += $(MESA_TOP)/src/gallium/include
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)
+
+LOCAL_SHARED_LIBRARIES := libexpat libz
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/broadcom/Android.genxml.mk b/lib/mesa/src/broadcom/Android.genxml.mk
new file mode 100644
index 000000000..91e0de05d
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.genxml.mk
@@ -0,0 +1,83 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_broadcom_genxml
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+intermediates := $(call local-generated-sources-dir)
+
+# dummy.c source file is generated to meet the build system's rules.
+LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c
+
+$(intermediates)/dummy.c:
+ @mkdir -p $(dir $@)
+ @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) touch $@
+
+# This is the list of auto-generated files headers
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/broadcom/, $(BROADCOM_GENXML_GENERATED_FILES))
+
+define pack-header-gen
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) $(PRIVATE_VER) > $@
+endef
+
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v21.xml
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_VER := 21
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v21.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_VER := 33
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_VER := 41
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_VER := 42
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_xml.h: $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) > $@ || (rm -f $@; false)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+ $(MESA_TOP)/src/broadcom/cle \
+ $(intermediates)/broadcom/cle \
+ $(intermediates)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/broadcom/Android.mk b/lib/mesa/src/broadcom/Android.mk
new file mode 100644
index 000000000..b3bf40510
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.mk
@@ -0,0 +1,29 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+LOCAL_PATH := $(call my-dir)
+
+# Import variables
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(LOCAL_PATH)/Android.genxml.mk
+include $(LOCAL_PATH)/Android.cle.mk
diff --git a/lib/mesa/src/broadcom/cle/meson.build b/lib/mesa/src/broadcom/cle/meson.build
index afaf5a1b4..a2f47625a 100644
--- a/lib/mesa/src/broadcom/cle/meson.build
+++ b/lib/mesa/src/broadcom/cle/meson.build
@@ -58,6 +58,6 @@ libbroadcom_cle = static_library(
'v3d_decoder.c',
include_directories : [inc_common, inc_broadcom],
c_args : [c_vis_args, no_override_init_args],
- dependencies : [dep_libdrm, dep_valgrind],
+ dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib],
build_by_default : false,
)
diff --git a/lib/mesa/src/broadcom/cle/v3d_decoder.c b/lib/mesa/src/broadcom/cle/v3d_decoder.c
index 373a1d996..23ee59fd0 100644
--- a/lib/mesa/src/broadcom/cle/v3d_decoder.c
+++ b/lib/mesa/src/broadcom/cle/v3d_decoder.c
@@ -651,7 +651,8 @@ v3d_spec_load(const struct v3d_device_info *devinfo)
struct parser_context ctx;
void *buf;
uint8_t *text_data = NULL;
- uint32_t text_offset = 0, text_length = 0, total_length;
+ uint32_t text_offset = 0, text_length = 0;
+ ASSERTED uint32_t total_length;
for (int i = 0; i < ARRAY_SIZE(genxml_files_table); i++) {
if (i != 0) {
diff --git a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
index 06e8ddad7..f40796612 100644
--- a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
+++ b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
@@ -250,6 +250,28 @@
<value name="RGBA" value="3"/>
</enum>
+ <enum name="Pack Mode" prefix="V3D_PACK_MODE">
+ <value name="16-way" value="0"/>
+ <value name="8-way" value="1"/>
+ <value name="4-way" value="2"/>
+ </enum>
+
+ <enum name="TCS flush mode" prefix="V3D_TCS_FLUSH_MODE">
+ <value name="fully packed" value="0"/>
+ <value name="single patch" value="1"/>
+ <value name="packed complete patches" value="2"/>
+ </enum>
+
+ <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS">
+ <value name="tf_words_buffer0" value="0"/>
+ <value name="tf_words_buffer1" value="1"/>
+ <value name="tf_words_buffer2" value="2"/>
+ <value name="tf_words_buffer3" value="3"/>
+ <value name="written" value="4"/>
+ <value name="tf_written" value="5"/>
+ <value name="tf_overflow" value="6"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -552,6 +574,14 @@
<field name="mode" size="8" start="0" type="Primitive"/>
</packet>
+ <packet code="39" name="Vertex Array Single Instance Prims" cl="B">
+ <field name="Index of First Vertex" size="32" start="72" type="uint"/>
+ <field name="Instance ID" size="32" start="40" type="uint"/>
+ <field name="Instance Length" size="32" start="8" type="uint"/>
+
+ <field name="mode" size="8" start="0" type="Primitive"/>
+ </packet>
+
<packet code="43" name="Base Vertex Base Instance" cl="B">
<field name="Base Instance" size="32" start="32" type="uint"/>
@@ -563,6 +593,14 @@
<field name="Size" size="32" start="32" type="uint"/>
</packet>
+ <packet code="54" name="Set InstanceID" cl="B" min_ver="41">
+ <field name="Instance ID" size="32" start="32" type="uint"/>
+ </packet>
+
+ <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41">
+ <field name="Primitive ID" size="32" start="32" type="uint"/>
+ </packet>
+
<packet code="56" name="Prim List Format">
<field name="tri strip or fan" size="1" start="7" type="bool"/>
<field name="primitive type" size="6" start="0" type="uint">
@@ -572,16 +610,64 @@
</field>
</packet>
+ <packet code="57" name="Serial Number List Start">
+ <field name="address" size="26" start="6" type="address"/>
+ <field name="block size" size="2" start="0" type="uint">
+ <value name="block size 64b" value="0"/>
+ <value name="block size 128b" value="1"/>
+ <value name="block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="64" shortname="gl_shader" name="GL Shader State">
<field name="address" size="27" start="5" type="address"/>
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
+ <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
+ <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
+ <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
<packet code="71" name="VCM Cache Size" min_ver="41">
<field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
<field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
</packet>
+ <packet code="72" shortname="prim_counts_feedback" name="Primitive Counts Feedback">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="read/write 64byte" size="1" start="4" type="bool"/>
+ <field name="op" size="4" start="0" type="uint">
+ <!--
+ dword 0-3 are words written to TFB 0-3. 4 is prims generated, 5 is prims written, 6 is
+ prims overflowed
+ -->
+ <value name="store primitive counts" value="0"/>
+ <value name="store primitive counts and zero" value="1"/>
+ <!--
+ write 4 pairs of TFB state: remaining TFB space in buffer n, current address in buffer n
+ -->
+ <value name="store buffer state" value="2"/>
+ <value name="store buffer state CL" value="3"/>
+ <!--
+ Waits for buffer state stores to complete, then loads from
+ the given buffer state. This op can be offset by n to skip
+ waiting for the last n.
+ -->
+ <value name="load buffer state" value="8"/>
+ </field>
+ </packet>
+
<packet code="73" name="VCM Cache Size" max_ver="33">
<field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
<field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
@@ -1200,6 +1286,61 @@
<field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
</struct>
+ <struct name="Tessellation/Geometry Common Params" min_ver="41">
+ <field name="Tessellation Type" size="2" start="1" type="uint">
+ <value name="Tessellation Type Triangle" value="0"/>
+ <value name="Tessellation Type Quads" value="1"/>
+ <value name="Tessellation Type Isolines" value="2"/>
+ </field>
+
+ <field name="Tessellation point mode" size="1" start="3" type="bool"/>
+
+ <field name="Tessellation Edge Spacing" size="2" start="4" type="uint">
+ <value name="Tessellation Edge Spacing Even" value="0"/>
+ <value name="Tessellation Edge Spacing Fractional Even" value="1"/>
+ <value name="Tessellation Edge Spacing Fractional Odd" value="2"/>
+ </field>
+
+ <field name="Tessellation clockwise" size="1" start="6" type="bool"/>
+
+ <field name="Tessellation Invocations" size="5" start="12" type="uint"/> <!-- 0 == 32 -->
+
+ <field name="Geometry Shader output format" size="2" start="17" type="uint">
+ <value name="Geometry Shader Points" value="0"/>
+ <value name="Geometry Shader Line Strip" value="1"/>
+ <value name="Geometry Shader Tri Strip" value="2"/>
+ </field>
+
+ <field name="Geometry Shader Instances" size="5" start="19" type="uint"/> <!-- 0 == 32 -->
+
+ <!-- followed by "Tessellation/Geometry Shader Params" for bin, then render -->
+ </struct>
+
+ <struct name="Tessellation/Geometry Shader Params">
+ <field name="TCS Batch Flush Mode" size="2" start="0" type="TCS flush mode"/>
+ <field name="Per-patch data column depth" size="4" start="2" type="uint"/> <!-- 8-dword units, 0==16 -->
+
+ <field name="TCS output segment size in sectors" size="6" start="8" type="uint"/>
+ <field name="TCS output segment pack mode" size="2" start="14" type="Pack Mode"/>
+
+ <field name="TES output segment size in sectors" size="6" start="16" type="uint"/>
+ <field name="TES output segment pack mode" size="2" start="22" type="Pack Mode"/>
+
+ <field name="GS output segment size in sectors" size="6" start="24" type="uint"/>
+ <field name="GS output segment pack mode" size="2" start="30" type="Pack Mode"/>
+
+ <field name="TBG max patches per TCS batch" size="4" start="32" type="uint" minus_one="true"/>
+ <field name="TBG max extra vertex segs for patches after first" size="2" start="36" type="uint"/>
+ <field name="TBG min TCS output segments required in play" size="2" start="38" type="uint" minus_one="true"/>
+ <field name="TBG min per-patch data segments required in play" size="3" start="40" type="uint" minus_one="true"/>
+ <field name="TPG max patches per TES batch" size="4" start="45" type="uint" minus_one="true"/>
+ <field name="TPG max vertex segments per TES batch" size="2" start="49" type="uint"/>
+ <field name="TPG max TCS output segments per TES batch" size="3" start="51" type="uint" minus_one="true"/>
+ <field name="TPG min TES output segments required in play" size="3" start="54" type="uint" minus_one="true"/>
+ <field name="GBG max TES output/vertex segments per GS batch" size="2" start="57" type="uint"/>
+ <field name="GBG max TES output/vertex segments required in play" size="3" start="59" type="uint" minus_one="true"/>
+ </struct>
+
<struct name="GL Shader State Attribute Record" max_ver="33">
<field name="Address" size="32" start="0" type="address"/>
diff --git a/lib/mesa/src/broadcom/common/v3d_device_info.c b/lib/mesa/src/broadcom/common/v3d_device_info.c
new file mode 100644
index 000000000..272190eb2
--- /dev/null
+++ b/lib/mesa/src/broadcom/common/v3d_device_info.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common/v3d_device_info.h"
+#include "drm-uapi/v3d_drm.h"
+
+bool
+v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_ioctl) {
+ struct drm_v3d_get_param ident0 = {
+ .param = DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+ };
+ struct drm_v3d_get_param ident1 = {
+ .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+ };
+ int ret;
+
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core IDENT0: %s\n",
+ strerror(errno));
+ return false;
+ }
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident1);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core IDENT1: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ uint32_t major = (ident0.value >> 24) & 0xff;
+ uint32_t minor = (ident1.value >> 0) & 0xf;
+
+ devinfo->ver = major * 10 + minor;
+
+ devinfo->vpm_size = (ident1.value >> 28 & 0xf) * 8192;
+
+ int nslc = (ident1.value >> 4) & 0xf;
+ int qups = (ident1.value >> 8) & 0xf;
+ devinfo->qpu_count = nslc * qups;
+
+ switch (devinfo->ver) {
+ case 33:
+ case 41:
+ case 42:
+ break;
+ default:
+ fprintf(stderr,
+ "V3D %d.%d not supported by this version of Mesa.\n",
+ devinfo->ver / 10,
+ devinfo->ver % 10);
+ return false;
+ }
+
+ return true;
+}
diff --git a/lib/mesa/src/broadcom/common/v3d_limits.h b/lib/mesa/src/broadcom/common/v3d_limits.h
index e21ee246e..776847622 100644
--- a/lib/mesa/src/broadcom/common/v3d_limits.h
+++ b/lib/mesa/src/broadcom/common/v3d_limits.h
@@ -24,6 +24,11 @@
#ifndef V3D_LIMITS_H
#define V3D_LIMITS_H
+/* Number of channels a QPU thread executes in parallel. Also known as
+ * gl_SubGroupSizeARB.
+ */
+#define V3D_CHANNELS 16
+
#define V3D_MAX_FS_INPUTS 64
#define V3D_MAX_VS_INPUTS 64
diff --git a/lib/mesa/src/broadcom/compiler/meson.build b/lib/mesa/src/broadcom/compiler/meson.build
index c80918db3..d7af999c3 100644
--- a/lib/mesa/src/broadcom/compiler/meson.build
+++ b/lib/mesa/src/broadcom/compiler/meson.build
@@ -23,9 +23,9 @@ libbroadcom_compiler_files = files(
'vir.c',
'vir_dump.c',
'vir_live_variables.c',
- 'vir_lower_uniforms.c',
'vir_opt_copy_propagate.c',
'vir_opt_dead_code.c',
+ 'vir_opt_redundant_flags.c',
'vir_opt_small_immediates.c',
'vir_register_allocate.c',
'vir_to_qpu.c',
@@ -37,6 +37,8 @@ libbroadcom_compiler_files = files(
'v3d_compiler.h',
'v3d_nir_lower_io.c',
'v3d_nir_lower_image_load_store.c',
+ 'v3d_nir_lower_logic_ops.c',
+ 'v3d_nir_lower_scratch.c',
'v3d_nir_lower_txf_ms.c',
)
diff --git a/lib/mesa/src/broadcom/compiler/nir_to_vir.c b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
index bd19bb9b0..01468fa87 100644
--- a/lib/mesa/src/broadcom/compiler/nir_to_vir.c
+++ b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
@@ -32,18 +32,15 @@
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
+/* We don't do any address packing. */
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#include "cle/v3d_packet_v41_pack.h"
+
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
-#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3)
-#define GENERAL_TMU_READ_OP_READ (15 << 3)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
@@ -53,19 +50,6 @@
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG (3 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN (4 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX (5 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN (6 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX (7 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_AND (8 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_OR (9 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3)
-#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3)
-
#define V3D_TSY_SET_QUORUM 0
#define V3D_TSY_INC_WAITERS 1
#define V3D_TSY_DEC_WAITERS 2
@@ -122,6 +106,27 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
+
+        /* We need to lock the scoreboard before any TLB access happens. If this
+ * thread switch comes after we have emitted a tlb load, then it means
+ * that we can't lock on the last thread switch any more.
+ */
+ if (c->emitted_tlb_load)
+ c->lock_scoreboard_on_first_thrsw = true;
+}
+
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
+{
+ if (nir_src_is_const(instr->src[src])) {
+ int64_t add_val = nir_src_as_int(instr->src[src]);
+ if (add_val == 1)
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
+ else if (add_val == -1)
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ }
+
+ return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}
static uint32_t
@@ -132,40 +137,42 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
- return GENERAL_TMU_READ_OP_READ;
+ case nir_intrinsic_load_scratch:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
- return GENERAL_TMU_WRITE_OP_WRITE;
+ case nir_intrinsic_store_scratch:
+ return V3D_TMU_OP_REGULAR;
case nir_intrinsic_ssbo_atomic_add:
+ return v3d_get_op_for_atomic_add(instr, 2);
case nir_intrinsic_shared_atomic_add:
- return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+ return v3d_get_op_for_atomic_add(instr, 1);
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_shared_atomic_imin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+ return V3D_TMU_OP_WRITE_SMIN;
case nir_intrinsic_ssbo_atomic_umin:
case nir_intrinsic_shared_atomic_umin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+ return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_shared_atomic_imax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+ return V3D_TMU_OP_WRITE_SMAX;
case nir_intrinsic_ssbo_atomic_umax:
case nir_intrinsic_shared_atomic_umax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+ return V3D_TMU_OP_WRITE_UMAX;
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_shared_atomic_and:
- return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
case nir_intrinsic_ssbo_atomic_or:
case nir_intrinsic_shared_atomic_or:
- return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+ return V3D_TMU_OP_WRITE_XOR_READ_NOT;
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_shared_atomic_exchange:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+ return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_shared_atomic_comp_swap:
- return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+ return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
default:
unreachable("unknown intrinsic op");
}
@@ -177,147 +184,217 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared)
+ bool is_shared_or_scratch)
{
- /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
- * wants to have support for inc/dec?
+ uint32_t tmu_op = v3d_general_tmu_op(instr);
+
+ /* If we were able to replace atomic_add for an inc/dec, then we
+         * need/can do things slightly differently, like not loading the
+ * amount to add/sub, as that is implicit.
*/
+ bool atomic_add_replaced =
+ ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
+ instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
- uint32_t tmu_op = v3d_general_tmu_op(instr);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_scratch ||
instr->intrinsic == nir_intrinsic_store_shared);
- bool has_index = !is_shared;
+
+ bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
+ instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared);
+
+ bool has_index = !is_shared_or_scratch;
int offset_src;
- int tmu_writes = 1; /* address */
if (instr->intrinsic == nir_intrinsic_load_uniform) {
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_ubo ||
- instr->intrinsic == nir_intrinsic_load_shared) {
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
offset_src = 1 + has_index;
- for (int i = 0; i < instr->num_components; i++) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[0], i));
- tmu_writes++;
- }
} else {
offset_src = 0 + has_index;
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[1 + has_index], 0));
- tmu_writes++;
- if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[2 + has_index],
- 0));
- tmu_writes++;
- }
}
- /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
- * storing at the same time.
- */
- while (tmu_writes > 16 / c->threads)
- c->threads /= 2;
+ bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
+ uint32_t const_offset = 0;
+ if (!dynamic_src)
+ const_offset = nir_src_as_uint(instr->src[offset_src]);
- struct qreg offset;
+ struct qreg base_offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
-
- /* Find what variable in the default uniform block this
- * uniform load is coming from.
- */
- uint32_t base = nir_intrinsic_base(instr);
- int i;
- struct v3d_ubo_range *range = NULL;
- for (i = 0; i < c->num_ubo_ranges; i++) {
- range = &c->ubo_ranges[i];
- if (base >= range->src_offset &&
- base < range->src_offset + range->size) {
- break;
- }
- }
- /* The driver-location-based offset always has to be within a
- * declared uniform range.
- */
- assert(i != c->num_ubo_ranges);
- if (!c->ubo_range_used[i]) {
- c->ubo_range_used[i] = true;
- range->dst_offset = c->next_ubo_dst_offset;
- c->next_ubo_dst_offset += range->size;
- }
-
- base = base - range->src_offset + range->dst_offset;
-
- if (base != 0)
- offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
+ const_offset += nir_intrinsic_base(instr);
+ base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(0, const_offset));
+ const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
- nir_src_as_uint(instr->src[0]) + 1);
- } else if (is_shared) {
- /* Shared variables have no buffer index, and all start from a
- * common base that we set up at the start of dispatch
+ base_offset =
+ vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(index, const_offset));
+ const_offset = 0;
+ } else if (is_shared_or_scratch) {
+ /* Shared and scratch variables have no buffer index, and all
+ * start from a common base that we set up at the start of
+ * dispatch.
*/
- offset = c->cs_shared_offset;
+ if (instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_store_scratch) {
+ base_offset = c->spill_base;
+ } else {
+ base_offset = c->cs_shared_offset;
+ const_offset += nir_intrinsic_base(instr);
+ }
} else {
- offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0]));
+ base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+ nir_src_as_uint(instr->src[is_store ?
+ 1 : 0]));
}
- uint32_t config = (0xffffff00 |
- tmu_op |
- GENERAL_TMU_LOOKUP_PER_PIXEL);
- if (instr->num_components == 1) {
- config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
- } else {
- config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
- instr->num_components - 2);
- }
+ struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+ unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+ uint32_t base_const_offset = const_offset;
+ int first_component = -1;
+ int last_component = -1;
+ do {
+ int tmu_writes = 1; /* address */
- if (c->execute.file != QFILE_NULL)
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ if (is_store) {
+ /* Find the first set of consecutive components that
+ * are enabled in the writemask and emit the TMUD
+ * instructions for them.
+ */
+ first_component = ffs(writemask) - 1;
+ last_component = first_component;
+ while (writemask & BITFIELD_BIT(last_component + 1))
+ last_component++;
+
+ assert(first_component >= 0 &&
+ first_component <= last_component &&
+ last_component < instr->num_components);
+
+ struct qreg tmud = vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUD);
+ for (int i = first_component; i <= last_component; i++) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[0], i);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
- struct qreg dest;
- if (config == ~0)
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
- else
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ /* Update the offset for the TMU write based on the
+ * the first component we are writing.
+ */
+ const_offset = base_const_offset + first_component * 4;
+
+ /* Clear these components from the writemask */
+ uint32_t written_mask =
+ BITFIELD_RANGE(first_component, tmu_writes - 1);
+ writemask &= ~written_mask;
+ } else if (!is_load && !atomic_add_replaced) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[1 + has_index], 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+ data = ntq_get_src(c, instr->src[2 + has_index],
+ 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
+ }
- struct qinst *tmu;
- if (nir_src_is_const(instr->src[offset_src]) &&
- nir_src_as_uint(instr->src[offset_src]) == 0) {
- tmu = vir_MOV_dest(c, dest, offset);
- } else {
- tmu = vir_ADD_dest(c, dest,
- offset,
- ntq_get_src(c, instr->src[offset_src], 0));
- }
+ /* Make sure we won't exceed the 16-entry TMU fifo if each
+ * thread is storing at the same time.
+ */
+ while (tmu_writes > 16 / c->threads)
+ c->threads /= 2;
- if (config != ~0) {
- tmu->src[vir_get_implicit_uniform_src(tmu)] =
- vir_uniform_ui(c, config);
- }
+ /* The spec says that for atomics, the TYPE field is ignored,
+ * but that doesn't seem to be the case for CMPXCHG. Just use
+ * the number of tmud writes we did to decide the type (or
+ * choose "32bit" for atomic reads, which has been fine).
+ */
+ uint32_t num_components;
+ if (is_load || atomic_add_replaced) {
+ num_components = instr->num_components;
+ } else {
+ assert(tmu_writes > 1);
+ num_components = tmu_writes - 1;
+ }
+
+ uint32_t config = (0xffffff00 |
+ tmu_op << 3|
+ GENERAL_TMU_LOOKUP_PER_PIXEL);
+ if (num_components == 1) {
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ } else {
+ config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+ num_components - 2;
+ }
+
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
- if (c->execute.file != QFILE_NULL)
- vir_set_cond(tmu, V3D_QPU_COND_IFA);
+ struct qreg tmua;
+ if (config == ~0)
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+ else
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+ struct qinst *tmu;
+ if (dynamic_src) {
+ struct qreg offset = base_offset;
+ if (const_offset != 0) {
+ offset = vir_ADD(c, offset,
+ vir_uniform_ui(c, const_offset));
+ }
+ struct qreg data =
+ ntq_get_src(c, instr->src[offset_src], 0);
+ tmu = vir_ADD_dest(c, tmua, offset, data);
+ } else {
+ if (const_offset != 0) {
+ tmu = vir_ADD_dest(c, tmua, base_offset,
+ vir_uniform_ui(c, const_offset));
+ } else {
+ tmu = vir_MOV_dest(c, tmua, base_offset);
+ }
+ }
- vir_emit_thrsw(c);
+ if (config != ~0) {
+ tmu->uniform =
+ vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ config);
+ }
+
+ if (vir_in_nonuniform_control_flow(c))
+ vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ vir_emit_thrsw(c);
- /* Read the result, or wait for the TMU op to complete. */
- for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
- ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+ /* Read the result, or wait for the TMU op to complete. */
+ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, vir_LDTMU(c)));
+ }
- if (nir_intrinsic_dest_components(instr) == 0)
- vir_TMUWT(c);
+ if (nir_intrinsic_dest_components(instr) == 0)
+ vir_TMUWT(c);
+ } while (is_store && writemask != 0);
}
static struct qreg *
@@ -329,6 +406,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
return qregs;
}
+static bool
+is_ld_signal(const struct v3d_qpu_sig *sig)
+{
+ return (sig->ldunif ||
+ sig->ldunifa ||
+ sig->ldunifrf ||
+ sig->ldunifarf ||
+ sig->ldtmu ||
+ sig->ldvary ||
+ sig->ldvpm ||
+ sig->ldtlb ||
+ sig->ldtlbu);
+}
+
/**
* This function is responsible for getting VIR results into the associated
* storage for a NIR instruction.
@@ -352,8 +443,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
if (!list_empty(&c->cur_block->instructions))
last_inst = (struct qinst *)c->cur_block->instructions.prev;
- assert(result.file == QFILE_UNIF ||
- (result.file == QFILE_TEMP &&
+ assert((result.file == QFILE_TEMP &&
last_inst && last_inst == c->defs[result.index]));
if (dest->is_ssa) {
@@ -377,10 +467,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
- /* Insert a MOV if the source wasn't an SSA def in the
- * previous instruction.
+ /* If the previous instruction can't be predicated for
+ * the store into the nir_register, then emit a MOV
+ * that can be.
*/
- if (result.file == QFILE_UNIF) {
+ if (vir_in_nonuniform_control_flow(c) &&
+ is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) {
result = vir_MOV(c, result);
last_inst = c->defs[result.index];
}
@@ -392,17 +484,17 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
/* If we're in control flow, then make this update of the reg
* conditional on the execution mask.
*/
- if (c->execute.file != QFILE_NULL) {
+ if (vir_in_nonuniform_control_flow(c)) {
last_inst->dst.index = qregs[chan].index;
/* Set the flags to the current exec mask.
*/
c->cursor = vir_before_inst(last_inst);
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
c->cursor = vir_after_inst(last_inst);
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
- last_inst->cond_is_exec_mask = true;
}
}
}
@@ -540,26 +632,13 @@ ntq_fsign(struct v3d_compile *c, struct qreg src)
struct qreg t = vir_get_temp(c);
vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
return vir_MOV(c, t);
}
-static struct qreg
-ntq_isign(struct v3d_compile *c, struct qreg src)
-{
- struct qreg t = vir_get_temp(c);
-
- vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
- vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
- vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
- return vir_MOV(c, t);
-}
-
static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
@@ -675,27 +754,6 @@ add_output(struct v3d_compile *c,
v3d_slot_from_slot_and_component(slot, swizzle);
}
-static void
-declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
-{
- unsigned array_id = c->num_ubo_ranges++;
- if (array_id >= c->ubo_ranges_array_size) {
- c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
- array_id + 1);
- c->ubo_ranges = reralloc(c, c->ubo_ranges,
- struct v3d_ubo_range,
- c->ubo_ranges_array_size);
- c->ubo_range_used = reralloc(c, c->ubo_range_used,
- bool,
- c->ubo_ranges_array_size);
- }
-
- c->ubo_ranges[array_id].dst_offset = 0;
- c->ubo_ranges[array_id].src_offset = start;
- c->ubo_ranges[array_id].size = size;
- c->ubo_range_used[array_id] = false;
-}
-
/**
* If compare_instr is a valid comparison instruction, emits the
* compare_instr's comparison and returns the sel_instr's return value based
@@ -711,7 +769,7 @@ ntq_emit_comparison(struct v3d_compile *c,
if (nir_op_infos[compare_instr->op].num_inputs > 1)
src1 = ntq_get_alu_src(c, compare_instr, 1);
bool cond_invert = false;
- struct qreg nop = vir_reg(QFILE_NULL, 0);
+ struct qreg nop = vir_nop_reg();
switch (compare_instr->op) {
case nir_op_feq32:
@@ -756,6 +814,16 @@ ntq_emit_comparison(struct v3d_compile *c,
vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
break;
+ case nir_op_i2b32:
+ vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
+ case nir_op_f2b32:
+ vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
default:
return false;
}
@@ -789,28 +857,24 @@ ntq_get_alu_parent(nir_src src)
return instr;
}
-/**
- * Attempts to fold a comparison generating a boolean result into the
- * condition code for selecting between two values, instead of comparing the
- * boolean result against 0 to generate the condition code.
- */
-static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
- struct qreg *src)
+/* Turns a NIR bool into a condition code to predicate on. */
+static enum v3d_qpu_cond
+ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
- nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
+ nir_alu_instr *compare = ntq_get_alu_parent(src);
if (!compare)
goto out;
enum v3d_qpu_cond cond;
if (ntq_emit_comparison(c, compare, &cond))
- return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
+ return cond;
out:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
+ V3D_QPU_PF_PUSHZ);
+ return V3D_QPU_COND_IFNA;
}
-
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
@@ -843,8 +907,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
struct qreg result;
switch (instr->op) {
- case nir_op_fmov:
- case nir_op_imov:
+ case nir_op_mov:
result = vir_MOV(c, src[0]);
break;
@@ -871,9 +934,16 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMAX(c, src[0], src[1]);
break;
- case nir_op_f2i32:
- result = vir_FTOIZ(c, src[0]);
+ case nir_op_f2i32: {
+ nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
+ if (src0_alu && src0_alu->op == nir_op_fround_even) {
+ result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
+ } else {
+ result = vir_FTOIZ(c, src[0]);
+ }
break;
+ }
+
case nir_op_f2u32:
result = vir_FTOUZ(c, src[0]);
break;
@@ -889,13 +959,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_b2i32:
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
break;
- case nir_op_i2b32:
- case nir_op_f2b32:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
- vir_uniform_ui(c, ~0),
- vir_uniform_ui(c, 0)));
- break;
case nir_op_iadd:
result = vir_ADD(c, src[0], src[1]);
@@ -950,7 +1013,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_sge:
case nir_op_slt: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_f(c, 1.0),
@@ -958,6 +1021,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
}
+ case nir_op_i2b32:
+ case nir_op_f2b32:
case nir_op_feq32:
case nir_op_fne32:
case nir_op_fge32:
@@ -969,7 +1034,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_ilt32:
case nir_op_ult32: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_ui(c, ~0),
@@ -978,10 +1043,15 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
}
case nir_op_b32csel:
- result = ntq_emit_bcsel(c, instr, src);
+ result = vir_MOV(c,
+ vir_SEL(c,
+ ntq_emit_bool_to_cond(c, instr->src[0].src),
+ src[1], src[2]));
break;
+
case nir_op_fcsel:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]),
+ V3D_QPU_PF_PUSHZ);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
src[1], src[2]));
break;
@@ -1011,9 +1081,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_ftrunc:
result = vir_FTRUNC(c, src[0]);
break;
- case nir_op_ffract:
- result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
- break;
case nir_op_fsin:
result = ntq_fsincos(c, src[0], false);
@@ -1025,9 +1092,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_fsign:
result = ntq_fsign(c, src[0]);
break;
- case nir_op_isign:
- result = ntq_isign(c, src[0]);
- break;
case nir_op_fabs: {
result = vir_FMOV(c, src[0]);
@@ -1036,8 +1100,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
}
case nir_op_iabs:
- result = vir_MAX(c, src[0],
- vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
+ result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
break;
case nir_op_fddx:
@@ -1053,7 +1116,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
case nir_op_uadd_carry:
- vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
+ vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
+ V3D_QPU_PF_PUSHC);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
vir_uniform_ui(c, 0)));
@@ -1064,9 +1128,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
case nir_op_unpack_half_2x16_split_x:
- /* XXX perf: It would be good to be able to merge this unpack
- * with whatever uses our result.
- */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
@@ -1120,6 +1181,107 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
static void
+vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
+{
+ if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
+ return;
+
+ struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
+
+ nir_variable *var = c->output_color_var[rt];
+ int num_components = glsl_get_vector_elements(var->type);
+ uint32_t conf = 0xffffff00;
+ struct qinst *inst;
+
+ conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (c->fs_key->swap_color_rb & (1 << rt))
+ num_components = MAX2(num_components, 3);
+ assert(num_components != 0);
+
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+ bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ if (is_int_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ if (c->devinfo->ver < 42)
+ conf |= TLB_TYPE_I32_COLOR;
+ else
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ if (c->fs_key->f32_color_rb & (1 << rt)) {
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
+ }
+ }
+
+ int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg *color = c->msaa_per_sample_output ?
+ &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
+ &c->outputs[var->data.driver_location * 4];
+
+ struct qreg r = color[0];
+ struct qreg g = color[1];
+ struct qreg b = color[2];
+ struct qreg a = color[3];
+
+ if (c->fs_key->swap_color_rb & (1 << rt)) {
+ r = color[2];
+ b = color[0];
+ }
+
+ if (c->fs_key->sample_alpha_to_one)
+ a = vir_uniform_f(c, 1.0);
+
+ if (is_32b_tlb_format) {
+ if (i == 0) {
+ inst = vir_MOV_dest(c, tlbu_reg, r);
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ } else {
+ inst = vir_MOV_dest(c, tlb_reg, r);
+ }
+
+ if (num_components >= 2)
+ vir_MOV_dest(c, tlb_reg, g);
+ if (num_components >= 3)
+ vir_MOV_dest(c, tlb_reg, b);
+ if (num_components >= 4)
+ vir_MOV_dest(c, tlb_reg, a);
+ } else {
+ inst = vir_VFPACK_dest(c, tlb_reg, r, g);
+ if (conf != ~0 && i == 0) {
+ inst->dst = tlbu_reg;
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ }
+
+ if (num_components >= 3)
+ inst = vir_VFPACK_dest(c, tlb_reg, b, a);
+ }
+ }
+}
+
+static void
emit_frag_end(struct v3d_compile *c)
{
/* XXX
@@ -1129,8 +1291,8 @@ emit_frag_end(struct v3d_compile *c)
*/
bool has_any_tlb_color_write = false;
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (c->output_color_var[rt])
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
+ if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
has_any_tlb_color_write = true;
}
@@ -1138,15 +1300,15 @@ emit_frag_end(struct v3d_compile *c)
struct nir_variable *var = c->output_color_var[0];
struct qreg *color = &c->outputs[var->data.driver_location * 4];
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_AND(c,
vir_MSF(c),
vir_FTOC(c, color[3])));
}
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
if (c->output_position_index != -1) {
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
c->outputs[c->output_position_index]);
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
@@ -1156,8 +1318,9 @@ emit_frag_end(struct v3d_compile *c)
} else
tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ tlb_specifier |
+ 0xffffff00);
c->writes_z = true;
} else if (c->s->info.fs.uses_discard ||
!c->s->info.fs.early_fragment_tests ||
@@ -1173,9 +1336,8 @@ emit_frag_end(struct v3d_compile *c)
*/
c->s->info.fs.uses_discard = true;
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
- vir_reg(QFILE_NULL, 0));
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
+ vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
if (c->devinfo->ver >= 42) {
@@ -1188,254 +1350,34 @@ emit_frag_end(struct v3d_compile *c)
tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
}
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ inst->uniform = vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ tlb_specifier |
+ 0xffffff00);
c->writes_z = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
* uniform setup
*/
-
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (!c->output_color_var[rt])
- continue;
-
- nir_variable *var = c->output_color_var[rt];
- struct qreg *color = &c->outputs[var->data.driver_location * 4];
- int num_components = glsl_get_vector_elements(var->type);
- uint32_t conf = 0xffffff00;
- struct qinst *inst;
-
- conf |= TLB_SAMPLE_MODE_PER_PIXEL;
- conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
-
- if (c->fs_key->swap_color_rb & (1 << rt))
- num_components = MAX2(num_components, 3);
-
- assert(num_components != 0);
- switch (glsl_get_base_type(var->type)) {
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_INT:
- /* The F32 vs I32 distinction was dropped in 4.2. */
- if (c->devinfo->ver < 42)
- conf |= TLB_TYPE_I32_COLOR;
- else
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
-
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
-
- for (int i = 1; i < num_components; i++) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
- color[i]);
- }
- break;
-
- default: {
- struct qreg r = color[0];
- struct qreg g = color[1];
- struct qreg b = color[2];
- struct qreg a = color[3];
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
- } else {
- conf |= TLB_TYPE_F16_COLOR;
- conf |= TLB_F16_SWAP_HI_LO;
- if (num_components >= 3)
- conf |= TLB_VEC_SIZE_4_F16;
- else
- conf |= TLB_VEC_SIZE_2_F16;
- }
-
- if (c->fs_key->swap_color_rb & (1 << rt)) {
- r = color[2];
- b = color[0];
- }
-
- if (c->fs_key->sample_alpha_to_one)
- a = vir_uniform_f(c, 1.0);
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r);
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
-
- if (num_components >= 2)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g);
- if (num_components >= 3)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b);
- if (num_components >= 4)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a);
- } else {
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
- if (conf != ~0) {
- inst->dst.file = QFILE_TLBU;
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
- }
-
- if (num_components >= 3)
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
- }
- break;
- }
- }
- }
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
+ vir_emit_tlb_color_write(c, rt);
}
static void
-vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
+vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
if (c->devinfo->ver >= 40) {
- vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
- *vpm_index = *vpm_index + 1;
+ vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
} else {
+ /* XXX: v3d33_vir_vpm_write_setup(c); */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
}
-
- c->num_vpm_writes++;
-}
-
-static void
-emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
- uint32_t *vpm_index)
-{
- for (int i = 0; i < 2; i++) {
- struct qreg coord = c->outputs[c->output_position_index + i];
- coord = vir_FMUL(c, coord,
- vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
- 0));
- coord = vir_FMUL(c, coord, rcp_w);
- vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
- }
-
-}
-
-static void
-emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
-{
- struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
- struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
-
- struct qreg z = c->outputs[c->output_position_index + 2];
- z = vir_FMUL(c, z, zscale);
- z = vir_FMUL(c, z, rcp_w);
- z = vir_FADD(c, z, zoffset);
- vir_VPM_WRITE(c, z, vpm_index);
-}
-
-static void
-emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
-{
- vir_VPM_WRITE(c, rcp_w, vpm_index);
-}
-
-static void
-emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
-{
- struct qreg point_size;
-
- if (c->output_point_size_index != -1)
- point_size = c->outputs[c->output_point_size_index];
- else
- point_size = vir_uniform_f(c, 1.0);
-
- /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
- * BCM21553).
- */
- point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
-
- vir_VPM_WRITE(c, point_size, vpm_index);
-}
-
-static void
-emit_vpm_write_setup(struct v3d_compile *c)
-{
- if (c->devinfo->ver >= 40)
- return;
-
- v3d33_vir_vpm_write_setup(c);
-}
-
-/**
- * Sets up c->outputs[c->output_position_index] for the vertex shader
- * epilogue, if an output vertex position wasn't specified in the user's
- * shader. This may be the case for transform feedback with rasterizer
- * discard enabled.
- */
-static void
-setup_default_position(struct v3d_compile *c)
-{
- if (c->output_position_index != -1)
- return;
-
- c->output_position_index = c->outputs_array_size;
- for (int i = 0; i < 4; i++) {
- add_output(c,
- c->output_position_index + i,
- VARYING_SLOT_POS, i);
- }
}
static void
emit_vert_end(struct v3d_compile *c)
{
- setup_default_position(c);
-
- uint32_t vpm_index = 0;
- struct qreg rcp_w = vir_RECIP(c,
- c->outputs[c->output_position_index + 3]);
-
- emit_vpm_write_setup(c);
-
- if (c->vs_key->is_coord) {
- for (int i = 0; i < 4; i++)
- vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
- &vpm_index);
- emit_scaled_viewport_write(c, rcp_w, &vpm_index);
- if (c->vs_key->per_vertex_point_size) {
- emit_point_size_write(c, &vpm_index);
- /* emit_rcp_wc_write(c, rcp_w); */
- }
- /* XXX: Z-only rendering */
- if (0)
- emit_zs_write(c, rcp_w, &vpm_index);
- } else {
- emit_scaled_viewport_write(c, rcp_w, &vpm_index);
- emit_zs_write(c, rcp_w, &vpm_index);
- emit_rcp_wc_write(c, rcp_w, &vpm_index);
- if (c->vs_key->per_vertex_point_size)
- emit_point_size_write(c, &vpm_index);
- }
-
- for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
- struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
- int j;
-
- for (j = 0; j < c->num_outputs; j++) {
- struct v3d_varying_slot output = c->output_slots[j];
-
- if (!memcmp(&input, &output, sizeof(input))) {
- vir_VPM_WRITE(c, c->outputs[j],
- &vpm_index);
- break;
- }
- }
- /* Emit padding if we didn't find a declared VS output for
- * this FS input.
- */
- if (j == c->num_outputs)
- vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
- &vpm_index);
- }
-
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
@@ -1446,25 +1388,48 @@ void
v3d_optimize_nir(struct nir_shader *s)
{
bool progress;
+ unsigned lower_flrp =
+ (s->options->lower_flrp16 ? 16 : 0) |
+ (s->options->lower_flrp32 ? 32 : 0) |
+ (s->options->lower_flrp64 ? 64 : 0);
do {
progress = false;
NIR_PASS_V(s, nir_lower_vars_to_ssa);
- NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+
+ if (lower_flrp != 0) {
+ bool lower_flrp_progress = false;
+
+ NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
+ lower_flrp,
+ false /* always_precise */,
+ s->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ lower_flrp = 0;
+ }
+
NIR_PASS(progress, s, nir_opt_undef);
} while (progress);
- NIR_PASS(progress, s, nir_opt_move_load_ubo);
+ NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
}
static int
@@ -1492,7 +1457,6 @@ ntq_emit_vpm_read(struct v3d_compile *c,
if (*num_components_queued != 0) {
(*num_components_queued)--;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
@@ -1502,7 +1466,6 @@ ntq_emit_vpm_read(struct v3d_compile *c,
*num_components_queued = num_components - 1;
*remaining -= num_components;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
@@ -1550,6 +1513,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
&num_components, ~0);
}
+ /* The actual loads will happen directly in nir_intrinsic_load_input
+ * on newer versions.
+ */
+ if (c->devinfo->ver >= 40)
+ return;
+
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + 1) * 4);
@@ -1572,6 +1541,26 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
}
}
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+ return (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->inputs) {
+ if (var_needs_point_coord(c, var))
+ return true;
+ }
+
+ return false;
+}
+
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
@@ -1605,11 +1594,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
if (var->data.location == VARYING_SLOT_POS) {
emit_fragcoord_input(c, loc);
- } else if (var->data.location == VARYING_SLOT_PNTC ||
- (var->data.location >= VARYING_SLOT_VAR0 &&
- (c->fs_key->point_sprite_mask &
- (1 << (var->data.location -
- VARYING_SLOT_VAR0))))) {
+ } else if (var_needs_point_coord(c, var)) {
c->inputs[loc * 4 + 0] = c->point_x;
c->inputs[loc * 4 + 1] = c->point_y;
} else {
@@ -1622,6 +1607,9 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
static void
ntq_setup_outputs(struct v3d_compile *c)
{
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+ return;
+
nir_foreach_variable(var, &c->s->outputs) {
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
unsigned loc = var->data.driver_location * 4;
@@ -1635,58 +1623,30 @@ ntq_setup_outputs(struct v3d_compile *c)
var->data.location_frac + i);
}
- if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
- switch (var->data.location) {
- case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
- break;
- case FRAG_RESULT_DATA0:
- case FRAG_RESULT_DATA1:
- case FRAG_RESULT_DATA2:
- case FRAG_RESULT_DATA3:
- c->output_color_var[var->data.location -
- FRAG_RESULT_DATA0] = var;
- break;
- case FRAG_RESULT_DEPTH:
- c->output_position_index = loc;
- break;
- case FRAG_RESULT_SAMPLE_MASK:
- c->output_sample_mask_index = loc;
- break;
- }
- } else {
- switch (var->data.location) {
- case VARYING_SLOT_POS:
- c->output_position_index = loc;
- break;
- case VARYING_SLOT_PSIZ:
- c->output_point_size_index = loc;
- break;
- }
+ switch (var->data.location) {
+ case FRAG_RESULT_COLOR:
+ c->output_color_var[0] = var;
+ c->output_color_var[1] = var;
+ c->output_color_var[2] = var;
+ c->output_color_var[3] = var;
+ break;
+ case FRAG_RESULT_DATA0:
+ case FRAG_RESULT_DATA1:
+ case FRAG_RESULT_DATA2:
+ case FRAG_RESULT_DATA3:
+ c->output_color_var[var->data.location -
+ FRAG_RESULT_DATA0] = var;
+ break;
+ case FRAG_RESULT_DEPTH:
+ c->output_position_index = loc;
+ break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ c->output_sample_mask_index = loc;
+ break;
}
}
}
-static void
-ntq_setup_uniforms(struct v3d_compile *c)
-{
- nir_foreach_variable(var, &c->s->uniforms) {
- uint32_t vec4_count = glsl_count_attribute_slots(var->type,
- false);
- unsigned vec4_size = 4 * sizeof(float);
-
- if (var->data.mode != nir_var_uniform)
- continue;
-
- declare_uniform_range(c, var->data.driver_location * vec4_size,
- vec4_count * vec4_size);
-
- }
-}
-
/**
* Sets up the mapping from nir_register to struct qreg *.
*
@@ -1717,7 +1677,7 @@ ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
*/
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
- qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
+ qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}
@@ -1761,26 +1721,239 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
static void
-ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- unsigned offset;
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
- switch (instr->intrinsic) {
- case nir_intrinsic_load_uniform:
- if (nir_src_is_const(instr->src[0])) {
- int offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- for (int i = 0; i < instr->num_components; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset + i));
- }
+ int rt = nir_src_as_uint(instr->src[0]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ int sample_index = nir_intrinsic_base(instr) ;
+ assert(sample_index < V3D_MAX_SAMPLES);
+
+ int component = nir_intrinsic_component(instr);
+ assert(component < 4);
+
+ /* We need to emit our TLB reads after we have acquired the scoreboard
+ * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+ * the last thread switch to improve parallelism, however, that is only
+ * guaranteed to happen before the tlb color writes.
+ *
+ * To fix that, we make sure we always emit a thread switch before the
+ * first tlb color read. If that happens to be the last thread switch
+ * we emit, then everything is fine, but otherwsie, if any code after
+ * this point needs to emit additional thread switches, then we will
+ * switch the strategy to locking the scoreboard on the first thread
+ * switch instead -- see vir_emit_thrsw().
+ */
+ if (!c->emitted_tlb_load) {
+ if (!c->last_thrsw_at_top_level) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ c->emitted_tlb_load = true;
+ }
+
+ struct qreg *color_reads_for_sample =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
+
+ if (color_reads_for_sample[component].file == QFILE_NULL) {
+ enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
+ int num_components =
+ util_format_get_nr_components(rt_format);
+
+ const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
+ if (swap_rb)
+ num_components = MAX2(num_components, 3);
+
+ nir_variable *var = c->output_color_var[rt];
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+
+ bool is_int_format = type == GLSL_TYPE_INT ||
+ type == GLSL_TYPE_UINT;
+
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
+
+ uint32_t conf = 0xffffff00;
+ conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (is_32b_tlb_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ conf |= (c->devinfo->ver < 42 && is_int_format) ?
+ TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
+
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
} else {
- ntq_emit_tmu_general(c, instr, false);
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
}
+
+
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg r, g, b, a;
+ if (is_32b_tlb_format) {
+ r = conf != 0xffffffff && i == 0?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ if (num_components >= 2)
+ g = vir_TLB_COLOR_READ(c);
+ if (num_components >= 3)
+ b = vir_TLB_COLOR_READ(c);
+ if (num_components >= 4)
+ a = vir_TLB_COLOR_READ(c);
+ } else {
+ struct qreg rg = conf != 0xffffffff && i == 0 ?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ r = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[r.index], 0,
+ V3D_QPU_UNPACK_L);
+ g = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[g.index], 0,
+ V3D_QPU_UNPACK_H);
+
+ if (num_components > 2) {
+ struct qreg ba = vir_TLB_COLOR_READ(c);
+ b = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[b.index], 0,
+ V3D_QPU_UNPACK_L);
+ a = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[a.index], 0,
+ V3D_QPU_UNPACK_H);
+ }
+ }
+
+ struct qreg *color_reads =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
+
+ color_reads[0] = swap_rb ? b : r;
+ if (num_components >= 2)
+ color_reads[1] = g;
+ if (num_components >= 3)
+ color_reads[2] = swap_rb ? r : b;
+ if (num_components >= 4)
+ color_reads[3] = a;
+ }
+ }
+
+ assert(color_reads_for_sample[component].file != QFILE_NULL);
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static void
+ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ if (nir_src_is_const(instr->src[0])) {
+ int offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ assert(offset % 4 == 0);
+ /* We need dwords */
+ offset = offset / 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_UNIFORM,
+ offset + i));
+ }
+ } else {
+ ntq_emit_tmu_general(c, instr, false);
+ }
+}
+
+static void
+ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
+ * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
+ */
+ unsigned offset =
+ nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
+
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU.
+ */
+ int index = 0;
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset = vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 + comp]));
+ }
+ }
+}
+
+static void
+ntq_emit_per_sample_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
+
+ unsigned rt = nir_src_as_uint(instr->src[1]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ unsigned sample_idx = nir_intrinsic_base(instr);
+ assert(sample_idx < V3D_MAX_SAMPLES);
+
+ unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ c->sample_colors[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ unsigned offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[1])) * 4 +
+ nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ c->outputs[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ ntq_emit_load_uniform(c, instr);
break;
case nir_intrinsic_load_ubo:
@@ -1814,6 +1987,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_store_scratch:
ntq_emit_tmu_general(c, instr, true);
break;
@@ -1845,6 +2020,26 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
break;
+ case nir_intrinsic_load_viewport_x_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_y_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_z_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_z_offset:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
+ break;
+
case nir_intrinsic_load_alpha_ref_float:
ntq_store_dest(c, &instr->dest, 0,
vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
@@ -1855,7 +2050,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_helper_invocation:
- vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
ntq_store_dest(c, &instr->dest, 0,
vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
@@ -1880,27 +2075,32 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
break;
+ case nir_intrinsic_load_tlb_color_v3d:
+ vir_emit_tlb_color_read(c, instr);
+ break;
+
case nir_intrinsic_load_input:
- for (int i = 0; i < instr->num_components; i++) {
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
- }
+ ntq_emit_load_input(c, instr);
break;
- case nir_intrinsic_store_output:
- offset = ((nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[1])) * 4 +
- nir_intrinsic_component(instr));
+ case nir_intrinsic_store_tlb_sample_color_v3d:
+ ntq_emit_per_sample_color_write(c, instr);
+ break;
- for (int i = 0; i < instr->num_components; i++) {
- c->outputs[offset + i] =
- vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ case nir_intrinsic_store_output:
+ /* XXX perf: Use stvpmv with uniform non-constant offsets and
+ * stvpmd with non-uniform offsets and enable
+ * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ ntq_emit_color_write(c, instr);
+ } else {
+ assert(instr->num_components == 1);
+
+ vir_VPM_WRITE(c,
+ ntq_get_src(c, instr->src[0], 0),
+ nir_intrinsic_base(instr));
}
- c->num_outputs = MAX2(c->num_outputs,
- offset + instr->num_components);
break;
case nir_intrinsic_image_deref_size:
@@ -1908,38 +2108,35 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_discard:
- if (c->execute.file != QFILE_NULL) {
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0)),
V3D_QPU_COND_IFA);
} else {
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0));
}
break;
case nir_intrinsic_discard_if: {
- /* true (~0) if we're discarding */
- struct qreg cond = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
- if (c->execute.file != QFILE_NULL) {
- /* execute == 0 means the channel is active. Invert
- * the condition so that we can use zero as "executing
- * and discarding."
- */
- vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
- V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFA);
- } else {
- vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFNA);
+ if (vir_in_nonuniform_control_flow(c)) {
+ struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
+ c->execute);
+ if (cond == V3D_QPU_COND_IFA) {
+ vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ);
+ } else {
+ vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ);
+ cond = V3D_QPU_COND_IFA;
+ }
}
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
+ vir_uniform_ui(c, 0)), cond);
+
break;
}
@@ -1948,6 +2145,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_group_memory_barrier:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
@@ -1970,10 +2168,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_BARRIERID_dest(c,
vir_reg(QFILE_MAGIC,
V3D_QPU_WADDR_SYNCU));
- sync->src[vir_get_implicit_uniform_src(sync)] =
- vir_uniform_ui(c,
- 0xffffff00 |
- V3D_TSY_WAIT_INC_CHECK);
+ sync->uniform =
+ vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ 0xffffff00 |
+ V3D_TSY_WAIT_INC_CHECK);
}
@@ -2010,6 +2208,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_uniform_ui(c, 0xffff)));
break;
+ case nir_intrinsic_load_subgroup_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
@@ -2030,7 +2232,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
{
- vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
c->execute, vir_uniform_ui(c, c->cur_block->index)),
V3D_QPU_PF_PUSHZ);
@@ -2054,14 +2256,7 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
else_block = vir_new_block(c);
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Jump to ELSE. */
vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
@@ -2081,7 +2276,6 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
/* Emit the else block. */
vir_set_emit_block(c, else_block);
- ntq_activate_execute_for_block(c);
ntq_emit_cf_list(c, &if_stmt->else_list);
}
@@ -2107,20 +2301,13 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
else_block = vir_new_block(c);
bool was_uniform_control_flow = false;
- if (c->execute.file == QFILE_NULL) {
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
was_uniform_control_flow = true;
}
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
* was previously active (execute Z) for updating the exec flags.
@@ -2128,8 +2315,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
if (was_uniform_control_flow) {
cond = v3d_qpu_cond_invert(cond);
} else {
- struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
- c->execute);
+ struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
if (cond == V3D_QPU_COND_IFA) {
vir_set_uf(inst, V3D_QPU_UF_NORNZ);
} else {
@@ -2145,7 +2331,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
/* Jump to ELSE if nothing is active for THEN, otherwise fall
* through.
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
vir_link_blocks(c->cur_block, else_block);
vir_link_blocks(c->cur_block, then_block);
@@ -2159,14 +2345,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
* active channels update their execute flags to point to
* ENDIF
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
/* If everything points at ENDIF, then jump there immediately. */
- vir_PF(c, vir_XOR(c, c->execute,
- vir_uniform_ui(c, after_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
vir_link_blocks(c->cur_block, after_block);
vir_link_blocks(c->cur_block, else_block);
@@ -2190,7 +2378,7 @@ ntq_emit_if(struct v3d_compile *c, nir_if *nif)
{
bool was_in_control_flow = c->in_control_flow;
c->in_control_flow = true;
- if (c->execute.file == QFILE_NULL &&
+ if (!vir_in_nonuniform_control_flow(c) &&
nir_src_is_dynamically_uniform(nif->condition)) {
ntq_emit_uniform_if(c, nif);
} else {
@@ -2204,13 +2392,15 @@ ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
{
switch (jump->type) {
case nir_jump_break:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_break_block->index));
break;
case nir_jump_continue:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_cont_block->index));
break;
@@ -2277,7 +2467,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
c->in_control_flow = true;
bool was_uniform_control_flow = false;
- if (c->execute.file == QFILE_NULL) {
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
was_uniform_control_flow = true;
}
@@ -2299,13 +2489,14 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
*
* XXX: Use the .ORZ flags update, instead.
*/
- vir_PF(c, vir_XOR(c,
- c->execute,
- vir_uniform_ui(c, c->loop_cont_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c,
+ vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, c->loop_cont_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
/* Pixels that were not dispatched or have been discarded should not
@@ -2380,15 +2571,17 @@ nir_to_vir(struct v3d_compile *c)
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
- /* XXX perf: We could set the "disable implicit point/line
- * varyings" field in the shader record and not emit these, if
- * they're not going to be used.
+ /* V3D 4.x can disable implicit point coordinate varyings if
+ * they are not used.
*/
- if (c->fs_key->is_points) {
+ if (c->fs_key->is_points &&
+ (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
c->point_x = emit_fragment_varying(c, NULL, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, 0, 0);
- } else if (c->fs_key->is_lines) {
+ c->uses_implicit_point_line_varyings = true;
+ } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+ c->uses_implicit_point_line_varyings = true;
}
break;
case MESA_SHADER_COMPUTE:
@@ -2398,16 +2591,8 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- if (c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) |
- (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) {
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- }
- if ((c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) ||
- c->s->info.cs.shared_size) {
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
- }
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -2444,14 +2629,17 @@ nir_to_vir(struct v3d_compile *c)
break;
}
+ if (c->s->scratch_size) {
+ v3d_setup_spill_base(c);
+ c->spill_size += V3D_CHANNELS * c->s->scratch_size;
+ }
+
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
ntq_setup_fs_inputs(c);
else
ntq_setup_vpm_inputs(c);
ntq_setup_outputs(c);
- ntq_setup_uniforms(c);
- ntq_setup_registers(c, &c->s->registers);
/* Find the main function and emit the body. */
nir_foreach_function(function, c->s) {
@@ -2465,12 +2653,13 @@ const nir_shader_compiler_options v3d_nir_options = {
.lower_all_io_to_temps = true,
.lower_extract_byte = true,
.lower_extract_word = true,
- .lower_bfm = true,
.lower_bitfield_insert_to_shifts = true,
.lower_bitfield_extract_to_shifts = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
.lower_cs_local_id_from_index = true,
+ .lower_ffract = true,
+ .lower_fmod = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
@@ -2487,10 +2676,11 @@ const nir_shader_compiler_options v3d_nir_options = {
.lower_fsat = true,
.lower_fsqrt = true,
.lower_ifind_msb = true,
+ .lower_isign = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
- .native_integers = true,
+ .lower_rotate = true,
};
/**
@@ -2595,6 +2785,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
case MESA_SHADER_VERTEX:
emit_vert_end(c);
break;
+ case MESA_SHADER_COMPUTE:
+ break;
default:
unreachable("bad stage");
}
@@ -2609,7 +2801,6 @@ v3d_nir_to_vir(struct v3d_compile *c)
}
vir_optimize(c);
- vir_lower_uniforms(c);
vir_check_payload_w(c);
@@ -2659,5 +2850,15 @@ v3d_nir_to_vir(struct v3d_compile *c)
vir_remove_thrsw(c);
}
+ if (c->spills &&
+ (V3D_DEBUG & (V3D_DEBUG_VIR |
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
+ fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ vir_dump(c);
+ fprintf(stderr, "\n");
+ }
+
v3d_vir_to_qpu(c, temp_registers);
}
diff --git a/lib/mesa/src/broadcom/compiler/qpu_schedule.c b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
index 0f8001ff5..c15218e26 100644
--- a/lib/mesa/src/broadcom/compiler/qpu_schedule.c
+++ b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
@@ -37,18 +37,16 @@
#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
+#include "util/dag.h"
static bool debug;
struct schedule_node_child;
struct schedule_node {
+ struct dag_node dag;
struct list_head link;
struct qinst *inst;
- struct schedule_node_child *children;
- uint32_t child_count;
- uint32_t child_array_size;
- uint32_t parent_count;
/* Longest cycles + instruction_latency() of any parent of this node. */
uint32_t unblocked_time;
@@ -67,11 +65,6 @@ struct schedule_node {
uint32_t latency;
};
-struct schedule_node_child {
- struct schedule_node *node;
- bool write_after_read;
-};
-
/* When walking the instructions in reverse, we need to swap before/after in
* add_dep().
*/
@@ -79,6 +72,7 @@ enum direction { F, R };
struct schedule_state {
const struct v3d_device_info *devinfo;
+ struct dag *dag;
struct schedule_node *last_r[6];
struct schedule_node *last_rf[64];
struct schedule_node *last_sf;
@@ -101,37 +95,17 @@ add_dep(struct schedule_state *state,
bool write)
{
bool write_after_read = !write && state->dir == R;
+ void *edge_data = (void *)(uintptr_t)write_after_read;
if (!before || !after)
return;
assert(before != after);
- if (state->dir == R) {
- struct schedule_node *t = before;
- before = after;
- after = t;
- }
-
- for (int i = 0; i < before->child_count; i++) {
- if (before->children[i].node == after &&
- (before->children[i].write_after_read == write_after_read)) {
- return;
- }
- }
-
- if (before->child_array_size <= before->child_count) {
- before->child_array_size = MAX2(before->child_array_size * 2, 16);
- before->children = reralloc(before, before->children,
- struct schedule_node_child,
- before->child_array_size);
- }
-
- before->children[before->child_count].node = after;
- before->children[before->child_count].write_after_read =
- write_after_read;
- before->child_count++;
- after->parent_count++;
+ if (state->dir == F)
+ dag_add_edge(&before->dag, &after->dag, edge_data);
+ else
+ dag_add_edge(&after->dag, &before->dag, edge_data);
}
static void
@@ -154,6 +128,9 @@ add_write_dep(struct schedule_state *state,
static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
+ if (inst->sig.ldtlb || inst->sig.ldtlbu)
+ return true;
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
@@ -179,7 +156,10 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
+ if (!n->inst->qpu.sig.small_imm) {
+ add_read_dep(state,
+ state->last_rf[n->inst->qpu.raddr_b], n);
+ }
break;
default:
add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
@@ -402,7 +382,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_tmu_config, n);
if (inst->sig.ldtlb | inst->sig.ldtlbu)
- add_read_dep(state, state->last_tlb, n);
+ add_write_dep(state, &state->last_tlb, n);
if (inst->sig.ldvpm) {
add_write_dep(state, &state->last_vpm_read, n);
@@ -415,7 +395,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
}
/* inst->sig.ldunif or sideband uniform read */
- if (qinst->uniform != ~0)
+ if (vir_has_uniform(qinst))
add_write_dep(state, &state->last_unif, n);
if (v3d_qpu_reads_flags(inst))
@@ -425,11 +405,13 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
}
static void
-calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
+calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
+ struct list_head *schedule_list)
{
struct schedule_state state;
memset(&state, 0, sizeof(state));
+ state.dag = dag;
state.devinfo = c->devinfo;
state.dir = F;
@@ -438,23 +420,28 @@ calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
}
static void
-calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
+calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
+ struct list_head *schedule_list)
{
- struct list_head *node;
struct schedule_state state;
memset(&state, 0, sizeof(state));
+ state.dag = dag;
state.devinfo = c->devinfo;
state.dir = R;
- for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
+ list_for_each_entry_rev(struct schedule_node, node, schedule_list,
+ link) {
calculate_deps(&state, (struct schedule_node *)node);
}
}
struct choose_scoreboard {
+ struct dag *dag;
int tick;
int last_magic_sfu_write_tick;
+ int last_stallable_sfu_reg;
+ int last_stallable_sfu_tick;
int last_ldvary_tick;
int last_uniforms_reset_tick;
int last_thrsw_tick;
@@ -546,6 +533,38 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}
+static bool
+qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+ uint32_t waddr) {
+
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
+
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm && (inst->raddr_b == waddr))
+ return true;
+
+ return false;
+}
+
+static bool
+mux_read_stalls(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
+{
+ return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
+ qpu_instruction_uses_rf(inst,
+ scoreboard->last_stallable_sfu_reg);
+}
+
+/* We define a max schedule priority to allow negative priorities as result of
+ * substracting this max when an instruction stalls. So instructions that
+ * stall have lower priority than regular instructions. */
+#define MAX_SCHEDULE_PRIORITY 16
+
static int
get_instruction_priority(const struct v3d_qpu_instr *inst)
{
@@ -564,10 +583,6 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
- /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
- * instructions after the producer if possible, not just 1.
- */
-
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
@@ -577,6 +592,9 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
+ /* We should increase the maximum if we assert here */
+ assert(next_score < MAX_SCHEDULE_PRIORITY);
+
return baseline_score;
}
@@ -623,6 +641,37 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
}
static bool
+qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *a,
+ const struct v3d_qpu_instr *b)
+{
+ const bool a_uses_peripheral = qpu_accesses_peripheral(a);
+ const bool b_uses_peripheral = qpu_accesses_peripheral(b);
+
+ /* We can always do one peripheral access per instruction. */
+ if (!a_uses_peripheral || !b_uses_peripheral)
+ return true;
+
+ if (devinfo->ver < 41)
+ return false;
+
+ /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
+ * WRTMUC with a TMU magic register write (other than tmuc).
+ */
+ if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) ||
+ (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) {
+ return true;
+ }
+
+ if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) ||
+ (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) {
+ return true;
+ }
+
+ return false;
+}
+
+static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *a,
@@ -633,12 +682,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
return false;
}
- /* Can't do more than one peripheral access in an instruction.
- *
- * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
- * WRTMUC with a TMU magic register write (other than tmuc).
- */
- if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
+ if (!qpu_compatible_peripheral_access(devinfo, a, b))
return false;
struct v3d_qpu_instr merge = *a;
@@ -714,7 +758,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
struct choose_scoreboard *scoreboard,
- struct list_head *schedule_list,
struct schedule_node *prev_inst)
{
struct schedule_node *chosen = NULL;
@@ -728,7 +771,8 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
return NULL;
}
- list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+ list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
+ dag.link) {
const struct v3d_qpu_instr *inst = &n->inst->qpu;
/* Don't choose the branch instruction until it's the last one
@@ -736,7 +780,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
* choose it.
*/
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
- !list_is_singular(schedule_list)) {
+ !list_is_singular(&scoreboard->dag->heads)) {
continue;
}
@@ -805,6 +849,18 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
int prio = get_instruction_priority(inst);
+ if (mux_read_stalls(scoreboard, inst)) {
+ /* Don't merge an instruction that stalls */
+ if (prev_inst)
+ continue;
+ else {
+ /* Any instruction that don't stall will have
+ * higher scheduling priority */
+ prio -= MAX_SCHEDULE_PRIORITY;
+ assert(prio < 0);
+ }
+ }
+
/* Found a valid instruction. If nothing better comes along,
* this one works.
*/
@@ -841,6 +897,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
}
static void
+update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
+{
+ if (v3d_qpu_instr_is_sfu(inst)) {
+ scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
+ scoreboard->last_stallable_sfu_tick = scoreboard->tick;
+ }
+}
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst)
{
@@ -853,6 +919,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->alu.add.magic_write) {
update_scoreboard_for_magic_waddr(scoreboard,
inst->alu.add.waddr);
+ } else {
+ update_scoreboard_for_sfu_stall_waddr(scoreboard,
+ inst);
}
}
@@ -871,24 +940,24 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
static void
-dump_state(const struct v3d_device_info *devinfo,
- struct list_head *schedule_list)
+dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
- list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+ list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
fprintf(stderr, " t=%4d: ", n->unblocked_time);
v3d_qpu_dump(devinfo, &n->inst->qpu);
fprintf(stderr, "\n");
- for (int i = 0; i < n->child_count; i++) {
- struct schedule_node *child = n->children[i].node;
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
if (!child)
continue;
fprintf(stderr, " - ");
v3d_qpu_dump(devinfo, &child->inst->qpu);
fprintf(stderr, " (%d parents, %c)\n",
- child->parent_count,
- n->children[i].write_after_read ? 'w' : 'r');
+ child->dag.parent_count,
+ edge->data ? 'w' : 'r');
}
}
}
@@ -952,64 +1021,64 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after)
after_inst));
}
+ if (v3d_qpu_instr_is_sfu(before_inst))
+ return 2;
+
return latency;
}
/** Recursive computation of the delay member of a node. */
static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
{
- if (!n->child_count) {
- n->delay = 1;
- } else {
- for (int i = 0; i < n->child_count; i++) {
- if (!n->children[i].node->delay)
- compute_delay(n->children[i].node);
- n->delay = MAX2(n->delay,
- n->children[i].node->delay +
- instruction_latency(n, n->children[i].node));
- }
+ struct schedule_node *n = (struct schedule_node *)node;
+
+ n->delay = 1;
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
+
+ n->delay = MAX2(n->delay, (child->delay +
+ instruction_latency(n, child)));
}
}
+/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
+ * should be called on it later to finish pruning the other edges).
+ */
static void
-mark_instruction_scheduled(struct list_head *schedule_list,
+pre_remove_head(struct dag *dag, struct schedule_node *n)
+{
+ list_delinit(&n->dag.link);
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ if (edge->data)
+ dag_remove_edge(dag, edge);
+ }
+}
+
+static void
+mark_instruction_scheduled(struct dag *dag,
uint32_t time,
- struct schedule_node *node,
- bool war_only)
+ struct schedule_node *node)
{
if (!node)
return;
- for (int i = node->child_count - 1; i >= 0; i--) {
+ util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
struct schedule_node *child =
- node->children[i].node;
+ (struct schedule_node *)edge->child;
if (!child)
continue;
- if (war_only && !node->children[i].write_after_read)
- continue;
-
- /* If the requirement is only that the node not appear before
- * the last read of its destination, then it can be scheduled
- * immediately after (or paired with!) the thing reading the
- * destination.
- */
- uint32_t latency = 0;
- if (!war_only) {
- latency = instruction_latency(node,
- node->children[i].node);
- }
+ uint32_t latency = instruction_latency(node, child);
child->unblocked_time = MAX2(child->unblocked_time,
time + latency);
- child->parent_count--;
- if (child->parent_count == 0)
- list_add(&child->link, schedule_list);
-
- node->children[i].node = NULL;
}
+ dag_prune_head(dag, &node->dag);
}
static void
@@ -1028,7 +1097,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
static struct qinst *
vir_nop()
{
- struct qreg undef = { QFILE_NULL, 0 };
+ struct qreg undef = vir_nop_reg();
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
return qinst;
@@ -1223,7 +1292,6 @@ static uint32_t
schedule_instructions(struct v3d_compile *c,
struct choose_scoreboard *scoreboard,
struct qblock *block,
- struct list_head *schedule_list,
enum quniform_contents *orig_uniform_contents,
uint32_t *orig_uniform_data,
uint32_t *next_uniform)
@@ -1231,23 +1299,10 @@ schedule_instructions(struct v3d_compile *c,
const struct v3d_device_info *devinfo = c->devinfo;
uint32_t time = 0;
- if (debug) {
- fprintf(stderr, "initial deps:\n");
- dump_state(devinfo, schedule_list);
- fprintf(stderr, "\n");
- }
-
- /* Remove non-DAG heads from the list. */
- list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
- if (n->parent_count != 0)
- list_del(&n->link);
- }
-
- while (!list_empty(schedule_list)) {
+ while (!list_empty(&scoreboard->dag->heads)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(devinfo,
scoreboard,
- schedule_list,
NULL);
struct schedule_node *merge = NULL;
@@ -1260,7 +1315,7 @@ schedule_instructions(struct v3d_compile *c,
if (debug) {
fprintf(stderr, "t=%4d: current list:\n",
time);
- dump_state(devinfo, schedule_list);
+ dump_state(devinfo, scoreboard->dag);
fprintf(stderr, "t=%4d: chose: ", time);
v3d_qpu_dump(devinfo, inst);
fprintf(stderr, "\n");
@@ -1278,17 +1333,14 @@ schedule_instructions(struct v3d_compile *c,
*/
if (chosen) {
time = MAX2(chosen->unblocked_time, time);
- list_del(&chosen->link);
- mark_instruction_scheduled(schedule_list, time,
- chosen, true);
+ pre_remove_head(scoreboard->dag, chosen);
while ((merge =
choose_instruction_to_schedule(devinfo,
scoreboard,
- schedule_list,
chosen))) {
time = MAX2(merge->unblocked_time, time);
- list_del(&merge->link);
+ pre_remove_head(scoreboard->dag, chosen);
list_addtail(&merge->link, &merged_list);
(void)qpu_merge_inst(devinfo, inst,
inst, &merge->inst->qpu);
@@ -1307,6 +1359,8 @@ schedule_instructions(struct v3d_compile *c,
fprintf(stderr, "\n");
}
}
+ if (mux_read_stalls(scoreboard, inst))
+ c->qpu_inst_stalled_count++;
}
/* Update the uniform index for the rewritten location --
@@ -1334,11 +1388,10 @@ schedule_instructions(struct v3d_compile *c,
* be scheduled. Update the children's unblocked time for this
* DAG edge as we do so.
*/
- mark_instruction_scheduled(schedule_list, time, chosen, false);
+ mark_instruction_scheduled(scoreboard->dag, time, chosen);
list_for_each_entry(struct schedule_node, merge, &merged_list,
link) {
- mark_instruction_scheduled(schedule_list, time, merge,
- false);
+ mark_instruction_scheduled(scoreboard->dag, time, merge);
/* The merged VIR instruction doesn't get re-added to the
* block, so free it now.
@@ -1380,9 +1433,10 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
uint32_t *next_uniform)
{
void *mem_ctx = ralloc_context(NULL);
- struct list_head schedule_list;
+ scoreboard->dag = dag_create(mem_ctx);
+ struct list_head setup_list;
- list_inithead(&schedule_list);
+ list_inithead(&setup_list);
/* Wrap each instruction in a scheduler structure. */
while (!list_empty(&block->instructions)) {
@@ -1390,26 +1444,25 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
struct schedule_node *n =
rzalloc(mem_ctx, struct schedule_node);
+ dag_init_node(scoreboard->dag, &n->dag);
n->inst = qinst;
list_del(&qinst->link);
- list_addtail(&n->link, &schedule_list);
+ list_addtail(&n->link, &setup_list);
}
- calculate_forward_deps(c, &schedule_list);
- calculate_reverse_deps(c, &schedule_list);
+ calculate_forward_deps(c, scoreboard->dag, &setup_list);
+ calculate_reverse_deps(c, scoreboard->dag, &setup_list);
- list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
- compute_delay(n);
- }
+ dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
uint32_t cycles = schedule_instructions(c, scoreboard, block,
- &schedule_list,
orig_uniform_contents,
orig_uniform_data,
next_uniform);
ralloc_free(mem_ctx);
+ scoreboard->dag = NULL;
return cycles;
}
@@ -1491,6 +1544,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_magic_sfu_write_tick = -10;
scoreboard.last_uniforms_reset_tick = -10;
scoreboard.last_thrsw_tick = -10;
+ scoreboard.last_stallable_sfu_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/lib/mesa/src/broadcom/compiler/v3d33_tex.c b/lib/mesa/src/broadcom/compiler/v3d33_tex.c
index 7e9cd27d3..488021bfc 100644
--- a/lib/mesa/src/broadcom/compiler/v3d33_tex.c
+++ b/lib/mesa/src/broadcom/compiler/v3d33_tex.c
@@ -106,18 +106,16 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
break;
case nir_tex_src_offset: {
- nir_const_value *offset =
- nir_src_as_const_value(instr->src[i].src);
p0_unpacked.texel_offset_for_s_coordinate =
- offset->i32[0];
+ nir_src_comp_as_int(instr->src[i].src, 0);
if (instr->coord_components >= 2)
p0_unpacked.texel_offset_for_t_coordinate =
- offset->i32[1];
+ nir_src_comp_as_int(instr->src[i].src, 1);
if (instr->coord_components >= 3)
p0_unpacked.texel_offset_for_r_coordinate =
- offset->i32[2];
+ nir_src_comp_as_int(instr->src[i].src, 2);
break;
}
@@ -161,11 +159,10 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
unit));
}
- struct qreg texture_u[] = {
- vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
- vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
+ int texture_u[] = {
+ vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
+ vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
};
- uint32_t next_texture_u = 0;
for (int i = 0; i < next_coord; i++) {
struct qreg dst;
@@ -177,11 +174,8 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
- if (i < 2) {
- tmu->has_implicit_uniform = true;
- tmu->src[vir_get_implicit_uniform_src(tmu)] =
- texture_u[next_texture_u++];
- }
+ if (i < 2)
+ tmu->uniform = texture_u[i];
}
vir_emit_thrsw(c);
diff --git a/lib/mesa/src/broadcom/compiler/v3d40_tex.c b/lib/mesa/src/broadcom/compiler/v3d40_tex.c
index 9f5c56079..1c39289b6 100644
--- a/lib/mesa/src/broadcom/compiler/v3d40_tex.c
+++ b/lib/mesa/src/broadcom/compiler/v3d40_tex.c
@@ -48,8 +48,7 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data
{
struct qinst *inst = vir_NOP(c);
inst->qpu.sig.wrtmuc = true;
- inst->has_implicit_uniform = true;
- inst->src[0] = vir_uniform(c, contents, data);
+ inst->uniform = vir_get_uniform_index(c, contents, data);
}
static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
@@ -139,14 +138,13 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
case nir_tex_src_offset: {
if (nir_src_is_const(instr->src[i].src)) {
- nir_const_value *offset =
- nir_src_as_const_value(instr->src[i].src);
-
- p2_unpacked.offset_s = offset->i32[0];
+ p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0);
if (instr->coord_components >= 2)
- p2_unpacked.offset_t = offset->i32[1];
- if (instr->coord_components >= 3)
- p2_unpacked.offset_r = offset->i32[2];
+ p2_unpacked.offset_t =
+ nir_src_comp_as_int(instr->src[i].src, 1);
+ if (non_array_components >= 3)
+ p2_unpacked.offset_r =
+ nir_src_comp_as_int(instr->src[i].src, 2);
} else {
struct qreg mask = vir_uniform_ui(c, 0xf);
struct qreg x, y, offset;
@@ -185,6 +183,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
assert(p1_unpacked.output_type_32_bit ||
p0_unpacked.return_words_of_texture_data < (1 << 2));
+ assert(p0_unpacked.return_words_of_texture_data != 0);
+
uint32_t p0_packed;
V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
@@ -243,6 +243,34 @@ type_size_align_1(const struct glsl_type *type, unsigned *size, unsigned *align)
*align = 1;
}
+static uint32_t
+v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_store:
+ return V3D_TMU_OP_REGULAR;
+ case nir_intrinsic_image_deref_atomic_add:
+ return v3d_get_op_for_atomic_add(instr, 3);
+ case nir_intrinsic_image_deref_atomic_min:
+ return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_intrinsic_image_deref_atomic_max:
+ return V3D_TMU_OP_WRITE_UMAX;
+ case nir_intrinsic_image_deref_atomic_and:
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_intrinsic_image_deref_atomic_or:
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_intrinsic_image_deref_atomic_xor:
+ return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_intrinsic_image_deref_atomic_exchange:
+ return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default:
+ unreachable("unknown image intrinsic");
+ };
+}
+
void
v3d40_vir_emit_image_load_store(struct v3d_compile *c,
nir_intrinsic_instr *instr)
@@ -264,42 +292,15 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
- /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
- * wants to have support for inc/dec?
- */
- switch (instr->intrinsic) {
- case nir_intrinsic_image_deref_load:
- case nir_intrinsic_image_deref_store:
- p2_unpacked.op = V3D_TMU_OP_REGULAR;
- break;
- case nir_intrinsic_image_deref_atomic_add:
- p2_unpacked.op = V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
- break;
- case nir_intrinsic_image_deref_atomic_min:
- p2_unpacked.op = V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- break;
+ p2_unpacked.op = v3d40_image_load_store_tmu_op(instr);
- case nir_intrinsic_image_deref_atomic_max:
- p2_unpacked.op = V3D_TMU_OP_WRITE_UMAX;
- break;
- case nir_intrinsic_image_deref_atomic_and:
- p2_unpacked.op = V3D_TMU_OP_WRITE_AND_READ_INC;
- break;
- case nir_intrinsic_image_deref_atomic_or:
- p2_unpacked.op = V3D_TMU_OP_WRITE_OR_READ_DEC;
- break;
- case nir_intrinsic_image_deref_atomic_xor:
- p2_unpacked.op = V3D_TMU_OP_WRITE_XOR_READ_NOT;
- break;
- case nir_intrinsic_image_deref_atomic_exchange:
- p2_unpacked.op = V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- break;
- case nir_intrinsic_image_deref_atomic_comp_swap:
- p2_unpacked.op = V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
- break;
- default:
- unreachable("unknown image intrinsic");
- };
+ /* If we were able to replace atomic_add for an inc/dec, then we
+ * need/can to do things slightly different, like not loading the
+ * amount to add/sub, as that is implicit.
+ */
+ bool atomic_add_replaced = (instr->intrinsic == nir_intrinsic_image_deref_atomic_add &&
+ (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC));
bool is_1d = false;
switch (glsl_get_sampler_dim(sampler_type)) {
@@ -368,7 +369,8 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit the data writes for atomics or image store. */
- if (instr->intrinsic != nir_intrinsic_image_deref_load) {
+ if (instr->intrinsic != nir_intrinsic_image_deref_load &&
+ !atomic_add_replaced) {
/* Vector for stores, or first atomic argument */
struct qreg src[4];
for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) {
@@ -386,9 +388,21 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
}
}
+ if (vir_in_nonuniform_control_flow(c) &&
+ instr->intrinsic != nir_intrinsic_image_deref_load) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, ntq_get_src(c, instr->src[1], 0),
&tmu_writes);
+ if (vir_in_nonuniform_control_flow(c) &&
+ instr->intrinsic != nir_intrinsic_image_deref_load) {
+ struct qinst *last_inst= (struct qinst *)c->cur_block->instructions.prev;
+ vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+ }
+
vir_emit_thrsw(c);
/* The input FIFO has 16 slots across all threads, so make sure we
diff --git a/lib/mesa/src/broadcom/compiler/v3d_compiler.h b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
index 671aba3c5..b61119f56 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_compiler.h
+++ b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
@@ -69,9 +69,6 @@ enum qfile {
* or physical registers later.
*/
QFILE_TEMP,
- QFILE_UNIF,
- QFILE_TLB,
- QFILE_TLBU,
/**
* VPM reads use this with an index value to say what part of the VPM
@@ -105,6 +102,16 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index)
return (struct qreg){file, index};
}
+static inline struct qreg vir_magic_reg(uint32_t index)
+{
+ return (struct qreg){QFILE_MAGIC, index};
+}
+
+static inline struct qreg vir_nop_reg(void)
+{
+ return (struct qreg){QFILE_NULL, 0};
+}
+
/**
* A reference to an actual register at the QPU level, for register
* allocation.
@@ -129,12 +136,11 @@ struct qinst {
/* Pre-register-allocation references to src/dst registers */
struct qreg dst;
struct qreg src[3];
- bool cond_is_exec_mask;
- bool has_implicit_uniform;
bool is_last_thrsw;
- /* After vir_to_qpu.c: If instr reads a uniform, which uniform from
- * the uncompiled stream it is.
+ /* If the instruction reads a uniform (other than through src[i].file
+ * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0
+ * otherwise.
*/
int uniform;
};
@@ -275,17 +281,18 @@ enum quniform_contents {
QUNIFORM_SHARED_OFFSET,
};
-static inline uint32_t v3d_tmu_config_data_create(uint32_t unit, uint32_t value)
+static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
{
+ assert(value < (1 << 24));
return unit << 24 | value;
}
-static inline uint32_t v3d_tmu_config_data_get_unit(uint32_t data)
+static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
{
return data >> 24;
}
-static inline uint32_t v3d_tmu_config_data_get_value(uint32_t data)
+static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
{
return data & 0xffffff;
}
@@ -311,25 +318,6 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
return slot.slot_and_component & 3;
}
-struct v3d_ubo_range {
- /**
- * offset in bytes from the start of the ubo where this range is
- * uploaded.
- *
- * Only set once used is set.
- */
- uint32_t dst_offset;
-
- /**
- * offset in bytes from the start of the gallium uniforms where the
- * data comes from.
- */
- uint32_t src_offset;
-
- /** size in bytes of this ubo range */
- uint32_t size;
-};
-
struct v3d_key {
void *shader_state;
struct {
@@ -357,7 +345,8 @@ struct v3d_fs_key {
bool sample_alpha_to_one;
bool clamp_color;
bool shade_model_flat;
- uint8_t nr_cbufs;
+ /* Mask of which color render targets are present. */
+ uint8_t cbufs;
uint8_t swap_color_rb;
/* Mask of which render targets need to be written as 32-bit floats */
uint8_t f32_color_rb;
@@ -366,6 +355,15 @@ struct v3d_fs_key {
*/
uint8_t int_color_rb;
uint8_t uint_color_rb;
+
+ /* Color format information per render target. Only set when logic
+ * operations are enabled.
+ */
+ struct {
+ enum pipe_format format;
+ const uint8_t *swizzle;
+ } color_fmt[V3D_MAX_DRAW_BUFFERS];
+
uint8_t alpha_test_func;
uint8_t logicop_func;
uint32_t point_sprite_mask;
@@ -413,6 +411,8 @@ struct qblock {
/** @{ used by v3d_vir_live_variables.c */
BITSET_WORD *def;
+ BITSET_WORD *defin;
+ BITSET_WORD *defout;
BITSET_WORD *use;
BITSET_WORD *live_in;
BITSET_WORD *live_out;
@@ -469,6 +469,8 @@ vir_after_block(struct qblock *block)
struct v3d_compiler {
const struct v3d_device_info *devinfo;
struct ra_regs *regs;
+ unsigned int reg_class_any[3];
+ unsigned int reg_class_r5[3];
unsigned int reg_class_phys[3];
unsigned int reg_class_phys_or_acc[3];
};
@@ -502,8 +504,8 @@ struct v3d_compile {
struct qreg *inputs;
struct qreg *outputs;
bool msaa_per_sample_output;
- struct qreg color_reads[V3D_MAX_SAMPLES];
- struct qreg sample_colors[V3D_MAX_SAMPLES];
+ struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
+ struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
uint32_t inputs_array_size;
uint32_t outputs_array_size;
uint32_t uniforms_array_size;
@@ -520,13 +522,7 @@ struct v3d_compile {
bool uses_center_w;
bool writes_z;
-
- struct v3d_ubo_range *ubo_ranges;
- bool *ubo_range_used;
- uint32_t ubo_ranges_array_size;
- /** Number of uniform areas tracked in ubo_ranges. */
- uint32_t num_ubo_ranges;
- uint32_t next_ubo_dst_offset;
+ bool uses_implicit_point_line_varyings;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
@@ -556,7 +552,7 @@ struct v3d_compile {
int local_invocation_index_bits;
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
- uint32_t num_vpm_writes;
+ uint32_t vpm_output_size;
/* Size in bytes of registers that have been spilled. This is how much
* space needs to be available in the spill BO per thread per QPU.
@@ -600,10 +596,8 @@ struct v3d_compile {
enum quniform_contents *uniform_contents;
uint32_t uniform_array_size;
uint32_t num_uniforms;
- uint32_t num_outputs;
uint32_t output_position_index;
nir_variable *output_color_var[4];
- uint32_t output_point_size_index;
uint32_t output_sample_mask_index;
struct qreg undef;
@@ -619,24 +613,13 @@ struct v3d_compile {
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
+ uint32_t qpu_inst_stalled_count;
/* For the FS, the number of varying inputs not counting the
* point/line varyings payload
*/
uint32_t num_inputs;
- /**
- * Number of inputs from num_inputs remaining to be queued to the read
- * FIFO in the VS/CS.
- */
- uint32_t num_inputs_remaining;
-
- /* Number of inputs currently in the read FIFO for the VS/CS */
- uint32_t num_inputs_in_fifo;
-
- /** Next offset in the VPM to read from in the VS/CS */
- uint32_t vpm_read_offset;
-
uint32_t program_id;
uint32_t variant_id;
@@ -652,6 +635,9 @@ struct v3d_compile {
struct qinst *last_thrsw;
bool last_thrsw_at_top_level;
+ bool emitted_tlb_load;
+ bool lock_scoreboard_on_first_thrsw;
+
bool failed;
};
@@ -664,12 +650,8 @@ struct v3d_uniform_list {
struct v3d_prog_data {
struct v3d_uniform_list uniforms;
- struct v3d_ubo_range *ubo_ranges;
- uint32_t num_ubo_ranges;
- uint32_t ubo_size;
uint32_t spill_size;
- uint8_t num_inputs;
uint8_t threads;
/* For threads > 1, whether the program should be dispatched in the
@@ -717,17 +699,25 @@ struct v3d_fs_prog_data {
uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+ uint8_t num_inputs;
bool writes_z;
bool disable_ez;
bool uses_center_w;
+ bool uses_implicit_point_line_varyings;
+ bool lock_scoreboard_on_first_thrsw;
};
-/* Special nir_load_input intrinsic index for loading the current TLB
- * destination color.
- */
-#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000
+struct v3d_compute_prog_data {
+ struct v3d_prog_data base;
+ /* Size in bytes of the workgroup's shared space. */
+ uint32_t shared_size;
+};
-#define V3D_NIR_MS_MASK_OUTPUT 2000000000
+static inline bool
+vir_has_uniform(struct qinst *inst)
+{
+ return inst->uniform != ~0;
+}
extern const nir_shader_compiler_options v3d_nir_options;
@@ -758,12 +748,17 @@ struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
-struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0);
+struct qinst *vir_branch_inst(struct v3d_compile *c,
+ enum v3d_qpu_branch_cond cond);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
+uint32_t vir_get_uniform_index(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data);
struct qreg vir_uniform(struct v3d_compile *c,
enum quniform_contents contents,
uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
+void v3d_setup_spill_base(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);
struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
@@ -777,9 +772,6 @@ void vir_set_unpack(struct qinst *inst, int src,
struct qreg vir_get_temp(struct v3d_compile *c);
void vir_emit_last_thrsw(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
-bool vir_has_implicit_uniform(struct qinst *inst);
-int vir_get_implicit_uniform_src(struct qinst *inst);
-int vir_get_non_sideband_nsrc(struct qinst *inst);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
@@ -788,7 +780,6 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_is_float_input(struct qinst *inst);
bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
@@ -810,10 +801,13 @@ bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
+bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_scratch(nir_shader *s);
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_image_load_store(nir_shader *s);
void vir_lower_uniforms(struct v3d_compile *c);
@@ -833,7 +827,8 @@ bool vir_init_reg_sets(struct v3d_compiler *compiler);
bool v3d_gl_format_is_return_32(GLenum format);
-void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
@@ -843,6 +838,12 @@ quniform_contents_is_texture_p0(enum quniform_contents contents)
V3D_MAX_TEXTURE_SAMPLERS));
}
+static inline bool
+vir_in_nonuniform_control_flow(struct v3d_compile *c)
+{
+ return c->execute.file != QFILE_NULL;
+}
+
static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
@@ -1086,6 +1087,30 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
return vir_UMUL24(c, src0, src1);
}
+static inline struct qreg
+vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
+{
+ assert(c->devinfo->ver >= 41); /* XXX */
+ assert((config & 0xffffff00) == 0xffffff00);
+
+ struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtlb->qpu.sig.ldtlbu = true;
+ ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
+ return vir_emit_def(c, ldtlb);
+}
+
+static inline struct qreg
+vir_TLB_COLOR_READ(struct v3d_compile *c)
+{
+ assert(c->devinfo->ver >= 41); /* XXX */
+
+ struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtlb->qpu.sig.ldtlb = true;
+ return vir_emit_def(c, ldtlb);
+}
+
/*
static inline struct qreg
vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
@@ -1114,7 +1139,7 @@ static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
/* The actual uniform_data value will be set at scheduling time */
- return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0)));
+ return vir_emit_nondef(c, vir_branch_inst(c, cond));
}
#define vir_for_each_block(block, c) \
@@ -1143,4 +1168,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
vir_for_each_block(_block, c) \
vir_for_each_inst(inst, _block)
+#define vir_for_each_inst_inorder_safe(inst, c) \
+ vir_for_each_block(_block, c) \
+ vir_for_each_inst_safe(inst, _block)
+
#endif /* V3D_COMPILER_H */
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
index b65a82b7f..2a68efb7b 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -28,11 +28,47 @@
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
* intrinsics into something amenable to the V3D architecture.
*
- * After moving more and more logic to NIR, all that's left here is fixing up
- * addressing on uniform loads. FS input and VS output scalarization is
- * handled by nir_lower_io_to_scalar().
+ * Most of the work is turning the VS's store_output intrinsics from working
+ * on a base representing the gallium-level vec4 driver_location to an offset
+ * within the VPM, and emitting the header that's read by the fixed function
+ * hardware between the VS and FS.
+ *
+ * We also adjust the offsets on uniform loads to be in bytes, since that's
+ * what we need for indirect addressing with general TMU access.
*/
+struct v3d_nir_lower_io_state {
+ int pos_vpm_offset;
+ int vp_vpm_offset;
+ int zs_vpm_offset;
+ int rcp_wc_vpm_offset;
+ int psiz_vpm_offset;
+ int varyings_vpm_offset;
+
+ BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+ nir_ssa_def *pos[4];
+};
+
+static void
+v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
+{
+ nir_intrinsic_instr *intr =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+ nir_ssa_dest_init(&intr->instr, &intr->dest,
+ 1, intr->dest.ssa.bit_size, NULL);
+ intr->num_components = 1;
+
+ intr->src[0] = nir_src_for_ssa(chan);
+ intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+
+ nir_intrinsic_set_base(intr, base);
+ nir_intrinsic_set_write_mask(intr, 0x1);
+ nir_intrinsic_set_component(intr, 0);
+
+ nir_builder_instr_insert(b, &intr->instr);
+}
+
/* Convert the uniform offset to bytes. If it happens to be a constant,
* constant-folding will clean up the shift for us.
*/
@@ -50,9 +86,90 @@ v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
nir_imm_int(b, 4))));
}
+static int
+v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
+{
+ int component = var->data.location_frac + chan;
+
+ for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+ struct v3d_varying_slot slot = c->vs_key->fs_inputs[i];
+
+ if (v3d_slot_get_slot(slot) == var->data.location &&
+ v3d_slot_get_component(slot) == component) {
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+/* Lowers a store_output(gallium driver location) to a series of store_outputs
+ * with a driver_location equal to the offset in the VPM.
+ */
+static void
+v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
+ nir_intrinsic_instr *intr,
+ struct v3d_nir_lower_io_state *state)
+{
+ b->cursor = nir_before_instr(&intr->instr);
+
+ int start_comp = nir_intrinsic_component(intr);
+ nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
+ intr->num_components);
+
+ nir_variable *var = NULL;
+ nir_foreach_variable(scan_var, &c->s->outputs) {
+ if (scan_var->data.driver_location != nir_intrinsic_base(intr) ||
+ start_comp < scan_var->data.location_frac ||
+ start_comp >= scan_var->data.location_frac +
+ glsl_get_components(scan_var->type)) {
+ continue;
+ }
+ var = scan_var;
+ }
+
+ /* Save off the components of the position for the setup of VPM inputs
+ * read by fixed function HW.
+ */
+ if (var->data.location == VARYING_SLOT_POS) {
+ for (int i = 0; i < intr->num_components; i++) {
+ state->pos[start_comp + i] = nir_channel(b, src, i);
+ }
+ }
+
+ /* Just write psiz to the position in the FF header right now. */
+ if (var->data.location == VARYING_SLOT_PSIZ &&
+ state->psiz_vpm_offset != -1) {
+ v3d_nir_store_output(b, state->psiz_vpm_offset, src);
+ }
+
+ /* Scalarize outputs if it hasn't happened already, since we want to
+ * schedule each VPM write individually. We can skip any output
+ * components not read by the FS.
+ */
+ for (int i = 0; i < intr->num_components; i++) {
+ int vpm_offset =
+ v3d_varying_slot_vpm_offset(c, var,
+ i +
+ start_comp -
+ var->data.location_frac);
+
+ if (vpm_offset == -1)
+ continue;
+
+ BITSET_SET(state->varyings_stored, vpm_offset);
+
+ v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
+ nir_channel(b, src, i));
+ }
+
+ nir_instr_remove(&intr->instr);
+}
+
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
- struct nir_instr *instr)
+ struct nir_instr *instr,
+ struct v3d_nir_lower_io_state *state)
{
if (instr->type != nir_instr_type_intrinsic)
return;
@@ -63,33 +180,171 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
v3d_nir_lower_uniform(c, b, intr);
break;
+ case nir_intrinsic_store_output:
+ if (c->s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_lower_vpm_output(c, b, intr, state);
+ break;
+
default:
break;
}
}
-static bool
-v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl)
+/* Remap the output var's .driver_location. This is purely for
+ * nir_print_shader() so that store_output can map back to a variable name.
+ */
+static void
+v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
+ struct v3d_nir_lower_io_state *state)
+{
+ nir_foreach_variable_safe(var, &c->s->outputs) {
+ if (var->data.location == VARYING_SLOT_POS &&
+ state->pos_vpm_offset != -1) {
+ var->data.driver_location = state->pos_vpm_offset;
+ continue;
+ }
+
+ if (var->data.location == VARYING_SLOT_PSIZ &&
+ state->psiz_vpm_offset != -1) {
+ var->data.driver_location = state->psiz_vpm_offset;
+ continue;
+ }
+
+ int vpm_offset = v3d_varying_slot_vpm_offset(c, var, 0);
+ if (vpm_offset != -1) {
+ var->data.driver_location =
+ state->varyings_vpm_offset + vpm_offset;
+ } else {
+ /* If we couldn't find a mapping for the var, delete
+ * it so that its old .driver_location doesn't confuse
+ * nir_print_shader().
+ */
+ exec_node_remove(&var->node);
+ }
+ }
+}
+
+static void
+v3d_nir_setup_vpm_layout(struct v3d_compile *c,
+ struct v3d_nir_lower_io_state *state)
+{
+ uint32_t vpm_offset = 0;
+
+ if (c->vs_key->is_coord) {
+ state->pos_vpm_offset = vpm_offset;
+ vpm_offset += 4;
+ } else {
+ state->pos_vpm_offset = -1;
+ }
+
+ state->vp_vpm_offset = vpm_offset;
+ vpm_offset += 2;
+
+ if (!c->vs_key->is_coord) {
+ state->zs_vpm_offset = vpm_offset++;
+ state->rcp_wc_vpm_offset = vpm_offset++;
+ } else {
+ state->zs_vpm_offset = -1;
+ state->rcp_wc_vpm_offset = -1;
+ }
+
+ if (c->vs_key->per_vertex_point_size)
+ state->psiz_vpm_offset = vpm_offset++;
+ else
+ state->psiz_vpm_offset = -1;
+
+ state->varyings_vpm_offset = vpm_offset;
+
+ c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs;
+}
+
+static void
+v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+ struct v3d_nir_lower_io_state *state)
{
- nir_builder b;
- nir_builder_init(&b, impl);
+ for (int i = 0; i < 4; i++) {
+ if (!state->pos[i])
+ state->pos[i] = nir_ssa_undef(b, 1, 32);
+ }
+
+ nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
+
+ if (state->pos_vpm_offset != -1) {
+ for (int i = 0; i < 4; i++) {
+ v3d_nir_store_output(b, state->pos_vpm_offset + i,
+ state->pos[i]);
+ }
+ }
- nir_foreach_block(block, impl) {
- nir_foreach_instr_safe(instr, block)
- v3d_nir_lower_io_instr(c, &b, instr);
+ for (int i = 0; i < 2; i++) {
+ nir_ssa_def *pos;
+ nir_ssa_def *scale;
+ pos = state->pos[i];
+ if (i == 0)
+ scale = nir_load_viewport_x_scale(b);
+ else
+ scale = nir_load_viewport_y_scale(b);
+ pos = nir_fmul(b, pos, scale);
+ pos = nir_fmul(b, pos, rcp_wc);
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ pos);
}
- nir_metadata_preserve(impl, nir_metadata_block_index |
- nir_metadata_dominance);
+ if (state->zs_vpm_offset != -1) {
+ nir_ssa_def *z = state->pos[2];
+ z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
+ z = nir_fmul(b, z, rcp_wc);
+ z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
+ v3d_nir_store_output(b, state->zs_vpm_offset, z);
+ }
+
+ if (state->rcp_wc_vpm_offset != -1)
+ v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
- return true;
+ /* Store 0 to varyings requested by the FS but not stored in the VS.
+ * This should be undefined behavior, but glsl-routing seems to rely
+ * on it.
+ */
+ for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+ if (!BITSET_TEST(state->varyings_stored, i)) {
+ v3d_nir_store_output(b, state->varyings_vpm_offset + i,
+ nir_imm_int(b, 0));
+ }
+ }
}
void
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
+ struct v3d_nir_lower_io_state state = { 0 };
+
+ /* Set up the layout of the VPM outputs. */
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_setup_vpm_layout(c, &state);
+
nir_foreach_function(function, s) {
- if (function->impl)
- v3d_nir_lower_io_impl(c, function->impl);
+ if (function->impl) {
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block)
+ v3d_nir_lower_io_instr(c, &b, instr,
+ &state);
+ }
+
+ nir_block *last = nir_impl_last_block(function->impl);
+ b.cursor = nir_after_block(last);
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
}
+
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_lower_io_update_output_var_base(c, &state);
}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
new file mode 100644
index 000000000..5c3a7c58a
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright © 2019 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements lowering for logical operations.
+ *
+ * V3D doesn't have any hardware support for logic ops. Instead, you read the
+ * current contents of the destination from the tile buffer, then do math using
+ * your output color and that destination value, and update the output color
+ * appropriately.
+ */
+
+#include "util/u_format.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+#include "v3d_compiler.h"
+
+
+typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c);
+typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c);
+
+static bool
+logicop_depends_on_dst_color(int logicop_func)
+{
+ switch (logicop_func) {
+ case PIPE_LOGICOP_SET:
+ case PIPE_LOGICOP_CLEAR:
+ case PIPE_LOGICOP_COPY:
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static nir_ssa_def *
+v3d_logicop(nir_builder *b, int logicop_func,
+ nir_ssa_def *src, nir_ssa_def *dst)
+{
+ switch (logicop_func) {
+ case PIPE_LOGICOP_CLEAR:
+ return nir_imm_int(b, 0);
+ case PIPE_LOGICOP_NOR:
+ return nir_inot(b, nir_ior(b, src, dst));
+ case PIPE_LOGICOP_AND_INVERTED:
+ return nir_iand(b, nir_inot(b, src), dst);
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return nir_inot(b, src);
+ case PIPE_LOGICOP_AND_REVERSE:
+ return nir_iand(b, src, nir_inot(b, dst));
+ case PIPE_LOGICOP_INVERT:
+ return nir_inot(b, dst);
+ case PIPE_LOGICOP_XOR:
+ return nir_ixor(b, src, dst);
+ case PIPE_LOGICOP_NAND:
+ return nir_inot(b, nir_iand(b, src, dst));
+ case PIPE_LOGICOP_AND:
+ return nir_iand(b, src, dst);
+ case PIPE_LOGICOP_EQUIV:
+ return nir_inot(b, nir_ixor(b, src, dst));
+ case PIPE_LOGICOP_NOOP:
+ return dst;
+ case PIPE_LOGICOP_OR_INVERTED:
+ return nir_ior(b, nir_inot(b, src), dst);
+ case PIPE_LOGICOP_OR_REVERSE:
+ return nir_ior(b, src, nir_inot(b, dst));
+ case PIPE_LOGICOP_OR:
+ return nir_ior(b, src, dst);
+ case PIPE_LOGICOP_SET:
+ return nir_imm_int(b, ~0);
+ default:
+ fprintf(stderr, "Unknown logic op %d\n", logicop_func);
+ /* FALLTHROUGH */
+ case PIPE_LOGICOP_COPY:
+ return src;
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+ switch (swiz) {
+ default:
+ case PIPE_SWIZZLE_NONE:
+ fprintf(stderr, "warning: unknown swizzle\n");
+ /* FALLTHROUGH */
+ case PIPE_SWIZZLE_0:
+ return nir_imm_float(b, 0.0);
+ case PIPE_SWIZZLE_1:
+ return nir_imm_float(b, 1.0);
+ case PIPE_SWIZZLE_X:
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_W:
+ return srcs[swiz];
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans,
+ const uint8_t *swiz, nir_pack_func pack_func)
+{
+ nir_ssa_def *c[4];
+ for (int i = 0; i < 4; i++)
+ c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]);
+
+ return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3]));
+}
+
+static nir_ssa_def *
+v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed,
+ const uint8_t *swiz, nir_unpack_func unpack_func)
+{
+ nir_ssa_def *unpacked = unpack_func(b, packed);
+
+ nir_ssa_def *unpacked_chans[4];
+ for (int i = 0; i < 4; i++)
+ unpacked_chans[i] = nir_channel(b, unpacked, i);
+
+ nir_ssa_def *c[4];
+ for (int i = 0; i < 4; i++)
+ c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]);
+
+ return nir_vec4(b, c[0], c[1], c[2], c[3]);
+}
+
+static nir_ssa_def *
+pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+{
+ const unsigned bits[4] = { 10, 10, 10, 2 };
+ nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits);
+
+ nir_ssa_def *chans[4];
+ for (int i = 0; i < 4; i++)
+ chans[i] = nir_channel(b, unorm, i);
+
+ nir_ssa_def *result = nir_mov(b, chans[0]);
+ int offset = bits[0];
+ for (int i = 1; i < 4; i++) {
+ nir_ssa_def *shifted_chan =
+ nir_ishl(b, chans[i], nir_imm_int(b, offset));
+ result = nir_ior(b, result, shifted_chan);
+ offset += bits[i];
+ }
+ return result;
+}
+
+static nir_ssa_def *
+unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+{
+ const unsigned bits[4] = { 10, 10, 10, 2 };
+ const unsigned masks[4] = { BITFIELD_MASK(bits[0]),
+ BITFIELD_MASK(bits[1]),
+ BITFIELD_MASK(bits[2]),
+ BITFIELD_MASK(bits[3]) };
+
+ nir_ssa_def *chans[4];
+ for (int i = 0; i < 4; i++) {
+ nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i]));
+ chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]);
+ c = nir_ushr(b, c, nir_imm_int(b, bits[i]));
+ }
+
+ return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
+}
+
+static const uint8_t *
+v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt)
+{
+ static const uint8_t ident[4] = { 0, 1, 2, 3 };
+
+ /* We will automatically swap R and B channels for BGRA formats
+ * on tile loads and stores (see 'swap_rb' field in v3d_resource) so
+ * we want to treat these surfaces as if they were regular RGBA formats.
+ */
+ if (c->fs_key->color_fmt[rt].swizzle[0] == 2 &&
+ c->fs_key->color_fmt[rt].format != PIPE_FORMAT_B5G6R5_UNORM) {
+ return ident;
+ } else {
+ return c->fs_key->color_fmt[rt].swizzle;
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_get_tlb_color(nir_builder *b, int rt, int sample)
+{
+ nir_ssa_def *color[4];
+ for (int i = 0; i < 4; i++) {
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_load_tlb_color_v3d);
+ load->num_components = 1;
+ nir_intrinsic_set_base(load, sample);
+ nir_intrinsic_set_component(load, i);
+ load->src[0] = nir_src_for_ssa(nir_imm_int(b, rt));
+ nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+ nir_builder_instr_insert(b, &load->instr);
+ color[i] = &load->dest.ssa;
+ }
+
+ return nir_vec4(b, color[0], color[1], color[2], color[3]);
+}
+
+static nir_ssa_def *
+v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ int rt, int sample)
+{
+ const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
+
+ nir_ssa_def *op_res[4];
+ for (int i = 0; i < 4; i++) {
+ nir_ssa_def *src = src_chans[i];
+ nir_ssa_def *dst =
+ v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]);
+ op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst);
+ }
+
+ nir_ssa_def *r[4];
+ for (int i = 0; i < 4; i++)
+ r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]);
+
+ return nir_vec4(b, r[0], r[1], r[2], r[3]);
+}
+
+static nir_ssa_def *
+v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ int rt, int sample,
+ nir_pack_func pack_func, nir_unpack_func unpack_func)
+{
+ const uint8_t src_swz[4] = { 0, 1, 2, 3 };
+ nir_ssa_def *packed_src =
+ v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func);
+
+ const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
+ nir_ssa_def *packed_dst =
+ v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func);
+
+ nir_ssa_def *packed_result =
+ v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst);
+
+ return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func);
+}
+
+static nir_ssa_def *
+v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def *src, int rt, int sample)
+{
+ nir_ssa_def *dst = v3d_nir_get_tlb_color(b, rt, sample);
+
+ nir_ssa_def *src_chans[4], *dst_chans[4];
+ for (unsigned i = 0; i < 4; i++) {
+ src_chans[i] = nir_channel(b, src, i);
+ dst_chans[i] = nir_channel(b, dst, i);
+ }
+
+ if (c->fs_key->color_fmt[rt].format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ return v3d_emit_logic_op_unorm(
+ c, b, src_chans, dst_chans, rt, 0,
+ pack_unorm_rgb10a2, unpack_unorm_rgb10a2);
+ }
+
+ if (util_format_is_unorm(c->fs_key->color_fmt[rt].format)) {
+ return v3d_emit_logic_op_unorm(
+ c, b, src_chans, dst_chans, rt, 0,
+ nir_pack_unorm_4x8, nir_unpack_unorm_4x8);
+ }
+
+ return v3d_emit_logic_op_raw(c, b, src_chans, dst_chans, rt, 0);
+}
+
+static void
+v3d_emit_ms_output(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def *color, nir_src *offset,
+ nir_alu_type type, int rt, int sample)
+{
+
+ nir_intrinsic_instr *store =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_store_tlb_sample_color_v3d);
+ store->num_components = 4;
+ nir_intrinsic_set_base(store, sample);
+ nir_intrinsic_set_component(store, 0);
+ nir_intrinsic_set_type(store, type);
+ store->src[0] = nir_src_for_ssa(color);
+ store->src[1] = nir_src_for_ssa(nir_imm_int(b, rt));
+ nir_builder_instr_insert(b, &store->instr);
+}
+
+static void
+v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
+ nir_builder *b,
+ nir_intrinsic_instr *intr,
+ int rt)
+{
+ nir_ssa_def *frag_color = intr->src[0].ssa;
+
+
+ const int logic_op = c->fs_key->logicop_func;
+ if (c->fs_key->msaa && logicop_depends_on_dst_color(logic_op)) {
+ c->msaa_per_sample_output = true;
+
+ nir_src *offset = &intr->src[1];
+ nir_alu_type type = nir_intrinsic_type(intr);
+ for (int i = 0; i < V3D_MAX_SAMPLES; i++) {
+ nir_ssa_def *sample =
+ v3d_nir_emit_logic_op(c, b, frag_color, rt, i);
+
+ v3d_emit_ms_output(c, b, sample, offset, type, rt, i);
+ }
+
+ nir_instr_remove(&intr->instr);
+ } else {
+ nir_ssa_def *result =
+ v3d_nir_emit_logic_op(c, b, frag_color, rt, 0);
+
+ nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+ nir_src_for_ssa(result));
+ intr->num_components = result->num_components;
+ }
+}
+
+static bool
+v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
+{
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (intr->intrinsic != nir_intrinsic_store_output)
+ continue;
+
+ nir_foreach_variable(var, &c->s->outputs) {
+ const int driver_loc = var->data.driver_location;
+ if (driver_loc != nir_intrinsic_base(intr))
+ continue;
+
+ const int loc = var->data.location;
+ if (loc != FRAG_RESULT_COLOR &&
+ (loc < FRAG_RESULT_DATA0 ||
+ loc >= FRAG_RESULT_DATA0 + V3D_MAX_DRAW_BUFFERS)) {
+ continue;
+ }
+
+ /* Logic operations do not apply on floating point or
+ * sRGB enabled render targets.
+ */
+ const int rt = driver_loc;
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ const enum pipe_format format =
+ c->fs_key->color_fmt[rt].format;
+ if (util_format_is_float(format) ||
+ util_format_is_srgb(format)) {
+ continue;
+ }
+
+ nir_function_impl *impl =
+ nir_cf_node_get_function(&block->cf_node);
+ nir_builder b;
+ nir_builder_init(&b, impl);
+ b.cursor = nir_before_instr(&intr->instr);
+ v3d_nir_lower_logic_op_instr(c, &b, intr, rt);
+ }
+ }
+
+ return true;
+}
+
+void
+v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c)
+{
+ /* Nothing to do if logic op is 'copy src to dst' or if logic ops are
+ * disabled (we set the logic op to copy in that case).
+ */
+ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY)
+ return;
+
+ nir_foreach_function(function, s) {
+ if (function->impl) {
+ nir_foreach_block(block, function->impl)
+ v3d_nir_lower_logic_ops_block(block, c);
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+ }
+}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c
new file mode 100644
index 000000000..d23b8be83
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+/** @file v3d_nir_lower_scratch.c
+ *
+ * Swizzles around the addresses of
+ * nir_intrinsic_load_scratch/nir_intrinsic_store_scratch so that a QPU stores
+ * a cacheline at a time per dword of scratch access, scalarizing and removing
+ * writemasks in the process.
+ */
+
+static nir_ssa_def *
+v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ bool is_store = instr->intrinsic == nir_intrinsic_store_scratch;
+ nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1);
+
+ assert(nir_intrinsic_align_mul(instr) >= 4);
+ assert(nir_intrinsic_align_offset(instr) == 0);
+
+ /* The spill_offset register will already have the subgroup ID (EIDX)
+ * shifted and ORed in at bit 2, so all we need to do is to move the
+ * dword index up above V3D_CHANNELS.
+ */
+ return nir_imul_imm(b, offset, V3D_CHANNELS);
+}
+
+static void
+v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr);
+
+ nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS];
+ for (int i = 0; i < instr->num_components; i++) {
+ nir_ssa_def *chan_offset =
+ nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
+
+ nir_intrinsic_instr *chan_instr =
+ nir_intrinsic_instr_create(b->shader, instr->intrinsic);
+ chan_instr->num_components = 1;
+ nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1,
+ instr->dest.ssa.bit_size, NULL);
+
+ chan_instr->src[0] = nir_src_for_ssa(chan_offset);
+
+ nir_intrinsic_set_align(chan_instr, 4, 0);
+
+ nir_builder_instr_insert(b, &chan_instr->instr);
+
+ chans[i] = &chan_instr->dest.ssa;
+ }
+
+ nir_ssa_def *result = nir_vec(b, chans, instr->num_components);
+ nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(result));
+ nir_instr_remove(&instr->instr);
+}
+
+static void
+v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr);
+ nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0],
+ instr->num_components);
+
+ for (int i = 0; i < instr->num_components; i++) {
+ if (!(nir_intrinsic_write_mask(instr) & (1 << i)))
+ continue;
+
+ nir_ssa_def *chan_offset =
+ nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
+
+ nir_intrinsic_instr *chan_instr =
+ nir_intrinsic_instr_create(b->shader, instr->intrinsic);
+ chan_instr->num_components = 1;
+
+ chan_instr->src[0] = nir_src_for_ssa(nir_channel(b,
+ value,
+ i));
+ chan_instr->src[1] = nir_src_for_ssa(chan_offset);
+ nir_intrinsic_set_write_mask(chan_instr, 0x1);
+ nir_intrinsic_set_align(chan_instr, 4, 0);
+
+ nir_builder_instr_insert(b, &chan_instr->instr);
+ }
+
+ nir_instr_remove(&instr->instr);
+}
+
+void
+v3d_nir_lower_scratch(nir_shader *s)
+{
+ nir_foreach_function(function, s) {
+ if (!function->impl)
+ continue;
+
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr =
+ nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_scratch:
+ v3d_nir_lower_load_scratch(&b, intr);
+ break;
+ case nir_intrinsic_store_scratch:
+ v3d_nir_lower_store_scratch(&b, intr);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
index 68591529d..d79969374 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
@@ -34,12 +34,10 @@
#define V3D_MAX_SAMPLES 4
-static void
-vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b,
- nir_tex_instr *instr)
+static nir_ssa_def *
+v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
{
- if (instr->op != nir_texop_txf_ms)
- return;
+ nir_tex_instr *instr = nir_instr_as_tex(in_instr);
b->cursor = nir_before_instr(&instr->instr);
@@ -66,30 +64,22 @@ vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b,
nir_tex_instr_remove_src(instr, sample_index);
instr->op = nir_texop_txf;
instr->sampler_dim = GLSL_SAMPLER_DIM_2D;
+
+ return NIR_LOWER_INSTR_PROGRESS;
+}
+
+static bool
+v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data)
+{
+ return (instr->type == nir_instr_type_tex &&
+ nir_instr_as_tex(instr)->op == nir_texop_txf_ms);
}
void
v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- vc4_nir_lower_txf_ms_instr(c, &b,
- nir_instr_as_tex(instr));
- }
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
+ nir_shader_lower_instructions(s,
+ v3d_nir_lower_txf_ms_filter,
+ v3d_nir_lower_txf_ms_instr,
+ NULL);
}
diff --git a/lib/mesa/src/broadcom/compiler/vir.c b/lib/mesa/src/broadcom/compiler/vir.c
index 20f700414..78362a294 100644
--- a/lib/mesa/src/broadcom/compiler/vir.c
+++ b/lib/mesa/src/broadcom/compiler/vir.c
@@ -25,7 +25,7 @@
#include "v3d_compiler.h"
int
-vir_get_non_sideband_nsrc(struct qinst *inst)
+vir_get_nsrc(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
@@ -40,55 +40,6 @@ vir_get_non_sideband_nsrc(struct qinst *inst)
return 0;
}
-int
-vir_get_nsrc(struct qinst *inst)
-{
- int nsrc = vir_get_non_sideband_nsrc(inst);
-
- if (vir_has_implicit_uniform(inst))
- nsrc++;
-
- return nsrc;
-}
-
-bool
-vir_has_implicit_uniform(struct qinst *inst)
-{
- switch (inst->qpu.type) {
- case V3D_QPU_INSTR_TYPE_BRANCH:
- return true;
- case V3D_QPU_INSTR_TYPE_ALU:
- switch (inst->dst.file) {
- case QFILE_TLBU:
- return true;
- case QFILE_MAGIC:
- switch (inst->dst.index) {
- case V3D_QPU_WADDR_TLBU:
- case V3D_QPU_WADDR_TMUAU:
- case V3D_QPU_WADDR_SYNCU:
- return true;
- default:
- break;
- }
- break;
- default:
- return inst->has_implicit_uniform;
- }
- }
- return false;
-}
-
-/* The sideband uniform for textures gets stored after the normal ALU
- * arguments.
- */
-int
-vir_get_implicit_uniform_src(struct qinst *inst)
-{
- if (!vir_has_implicit_uniform(inst))
- return -1;
- return vir_get_nsrc(inst) - 1;
-}
-
/**
* Returns whether the instruction has any side effects that must be
* preserved.
@@ -124,6 +75,8 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
if (inst->qpu.sig.ldtmu ||
inst->qpu.sig.ldvary ||
+ inst->qpu.sig.ldtlbu ||
+ inst->qpu.sig.ldtlb ||
inst->qpu.sig.wrtmuc ||
inst->qpu.sig.thrsw) {
return true;
@@ -133,38 +86,6 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
}
bool
-vir_is_float_input(struct qinst *inst)
-{
- /* XXX: More instrs */
- switch (inst->qpu.type) {
- case V3D_QPU_INSTR_TYPE_BRANCH:
- return false;
- case V3D_QPU_INSTR_TYPE_ALU:
- switch (inst->qpu.alu.add.op) {
- case V3D_QPU_A_FADD:
- case V3D_QPU_A_FSUB:
- case V3D_QPU_A_FMIN:
- case V3D_QPU_A_FMAX:
- case V3D_QPU_A_FTOIN:
- return true;
- default:
- break;
- }
-
- switch (inst->qpu.alu.mul.op) {
- case V3D_QPU_M_FMOV:
- case V3D_QPU_M_VFMUL:
- case V3D_QPU_M_FMUL:
- return true;
- default:
- break;
- }
- }
-
- return false;
-}
-
-bool
vir_is_raw_mov(struct qinst *inst)
{
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
@@ -178,6 +99,13 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
+ if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE)
return false;
@@ -421,7 +349,7 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q
}
struct qinst *
-vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
+vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
struct qinst *inst = calloc(1, sizeof(*inst));
@@ -433,9 +361,8 @@ vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
inst->qpu.branch.ub = true;
inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
- inst->dst = vir_reg(QFILE_NULL, 0);
- inst->src[0] = src;
- inst->uniform = ~0;
+ inst->dst = vir_nop_reg();
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
return inst;
}
@@ -591,7 +518,6 @@ vir_compile_init(const struct v3d_compiler *compiler,
vir_set_emit_block(c, vir_new_block(c));
c->output_position_index = -1;
- c->output_point_size_index = -1;
c->output_sample_mask_index = -1;
c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
@@ -601,7 +527,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
}
static int
-type_size_vec4(const struct glsl_type *type)
+type_size_vec4(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
@@ -638,8 +564,29 @@ v3d_lower_nir(struct v3d_compile *c)
}
}
+ /* CS textures may not have return_size reflecting the shadow state. */
+ nir_foreach_variable(var, &c->s->uniforms) {
+ const struct glsl_type *type = glsl_without_array(var->type);
+ unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+
+ if (!glsl_type_is_sampler(type) ||
+ !glsl_sampler_type_is_shadow(type))
+ continue;
+
+ for (int i = 0; i < array_len; i++) {
+ tex_options.lower_tex_packing[var->data.binding + i] =
+ nir_lower_tex_packing_16;
+ }
+ }
+
NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
NIR_PASS_V(c->s, nir_lower_system_values);
+
+ NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
+ nir_var_function_temp,
+ 0,
+ glsl_get_natural_size_align_bytes);
+ NIR_PASS_V(c->s, v3d_nir_lower_scratch);
}
static void
@@ -658,47 +605,10 @@ v3d_set_prog_data_uniforms(struct v3d_compile *c,
count * sizeof(*ulist->contents));
}
-/* Copy the compiler UBO range state to the compiled shader, dropping out
- * arrays that were never referenced by an indirect load.
- *
- * (Note that QIR dead code elimination of an array access still leaves that
- * array alive, though)
- */
-static void
-v3d_set_prog_data_ubo(struct v3d_compile *c,
- struct v3d_prog_data *prog_data)
-{
- if (!c->num_ubo_ranges)
- return;
-
- prog_data->num_ubo_ranges = 0;
- prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
- c->num_ubo_ranges);
- for (int i = 0; i < c->num_ubo_ranges; i++) {
- if (!c->ubo_range_used[i])
- continue;
-
- struct v3d_ubo_range *range = &c->ubo_ranges[i];
- prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
- prog_data->ubo_size += range->size;
- }
-
- if (prog_data->ubo_size) {
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- prog_data->ubo_size / 4);
- }
- }
-}
-
static void
v3d_vs_set_prog_data(struct v3d_compile *c,
struct v3d_vs_prog_data *prog_data)
{
- prog_data->base.num_inputs = c->num_inputs;
-
/* The vertex data gets format converted by the VPM so that
* each attribute channel takes up a VPM column. Precompute
* the sizes for the shader record.
@@ -722,7 +632,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
* channel).
*/
prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
- prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;
+ prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
@@ -741,7 +651,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
* batches.
*/
assert(c->devinfo->vpm_size);
- int sector_size = 16 * sizeof(uint32_t) * 8;
+ int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
int half_vpm = vpm_size_in_sectors / 2;
int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
@@ -754,7 +664,7 @@ static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
- prog_data->base.num_inputs = c->num_inputs;
+ prog_data->num_inputs = c->num_inputs;
memcpy(prog_data->input_slots, c->input_slots,
c->num_inputs * sizeof(*c->input_slots));
@@ -780,6 +690,17 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
prog_data->writes_z = c->writes_z;
prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
+ prog_data->uses_implicit_point_line_varyings =
+ c->uses_implicit_point_line_varyings;
+ prog_data->lock_scoreboard_on_first_thrsw =
+ c->lock_scoreboard_on_first_thrsw;
+}
+
+static void
+v3d_cs_set_prog_data(struct v3d_compile *c,
+ struct v3d_compute_prog_data *prog_data)
+{
+ prog_data->shared_size = c->s->info.cs.shared_size;
}
static void
@@ -791,9 +712,10 @@ v3d_set_prog_data(struct v3d_compile *c,
prog_data->spill_size = c->spill_size;
v3d_set_prog_data_uniforms(c, prog_data);
- v3d_set_prog_data_ubo(c, prog_data);
- if (c->s->info.stage == MESA_SHADER_VERTEX) {
+ if (c->s->info.stage == MESA_SHADER_COMPUTE) {
+ v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
+ } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
} else {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
@@ -836,9 +758,16 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
+
+ /* This must go before nir_lower_io */
+ if (c->vs_key->per_vertex_point_size)
+ NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+
NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
type_size_vec4,
(nir_lower_io_options)0);
+ /* clean up nir_lower_io's deref_var remains */
+ NIR_PASS_V(c->s, nir_opt_dce);
}
static void
@@ -877,6 +806,8 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
+ NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
+
/* If the shader has no non-TLB side effects, we can promote it to
* enabling early_fragment_tests even if the user didn't.
*/
@@ -928,6 +859,33 @@ v3d_nir_lower_fs_late(struct v3d_compile *c)
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}
+static uint32_t
+vir_get_max_temps(struct v3d_compile *c)
+{
+ int max_ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ max_ip++;
+
+ uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
+
+ for (int t = 0; t < c->num_temps; t++) {
+ for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
+ i < max_ip); i++) {
+ if (i > max_ip)
+ break;
+ pressure[i]++;
+ }
+ }
+
+ uint32_t max_temps = 0;
+ for (int i = 0; i < max_ip; i++)
+ max_temps = MAX2(max_temps, pressure[i]);
+
+ ralloc_free(pressure);
+
+ return max_temps;
+}
+
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@@ -952,13 +910,17 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
c->fs_key = (struct v3d_fs_key *)key;
prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
break;
+ case MESA_SHADER_COMPUTE:
+ prog_data = rzalloc_size(NULL,
+ sizeof(struct v3d_compute_prog_data));
+ break;
default:
unreachable("unsupported shader stage");
}
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_early(c);
- } else {
+ } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_early(c);
}
@@ -967,7 +929,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_late(c);
- } else {
+ } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_late(c);
}
@@ -990,15 +952,22 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
char *shaderdb;
int ret = asprintf(&shaderdb,
"%s shader: %d inst, %d threads, %d loops, "
- "%d uniforms, %d:%d spills:fills",
+ "%d uniforms, %d max-temps, %d:%d spills:fills, "
+ "%d sfu-stalls, %d inst-and-stalls",
vir_get_stage_name(c),
c->qpu_inst_count,
c->threads,
c->loops,
c->num_uniforms,
+ vir_get_max_temps(c),
c->spills,
- c->fills);
+ c->fills,
+ c->qpu_inst_stalled_count,
+ c->qpu_inst_count + c->qpu_inst_stalled_count);
if (ret >= 0) {
+ if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
+ fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
+
c->debug_output(shaderdb, c->debug_output_data);
free(shaderdb);
}
@@ -1059,15 +1028,15 @@ vir_compile_destroy(struct v3d_compile *c)
ralloc_free(c);
}
-struct qreg
-vir_uniform(struct v3d_compile *c,
- enum quniform_contents contents,
- uint32_t data)
+uint32_t
+vir_get_uniform_index(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data)
{
for (int i = 0; i < c->num_uniforms; i++) {
if (c->uniform_contents[i] == contents &&
c->uniform_data[i] == data) {
- return vir_reg(QFILE_UNIF, i);
+ return i;
}
}
@@ -1088,52 +1057,20 @@ vir_uniform(struct v3d_compile *c,
c->uniform_contents[uniform] = contents;
c->uniform_data[uniform] = data;
- return vir_reg(QFILE_UNIF, uniform);
-}
-
-static bool
-vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
-{
- if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
- v3d_qpu_uses_sfu(&inst->qpu))) {
- return false;
- }
-
- if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- (inst->qpu.alu.add.op == V3D_QPU_A_NOP &&
- inst->qpu.alu.mul.op == V3D_QPU_M_NOP)) {
- return false;
- }
-
- return true;
+ return uniform;
}
-void
-vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
+struct qreg
+vir_uniform(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data)
{
- struct qinst *last_inst = NULL;
-
- if (!list_empty(&c->cur_block->instructions)) {
- last_inst = (struct qinst *)c->cur_block->instructions.prev;
-
- /* Can't stuff the PF into the last last inst if our cursor
- * isn't pointing after it.
- */
- struct vir_cursor after_inst = vir_after_inst(last_inst);
- if (c->cursor.mode != after_inst.mode ||
- c->cursor.link != after_inst.link)
- last_inst = NULL;
- }
-
- if (src.file != QFILE_TEMP ||
- !c->defs[src.index] ||
- last_inst != c->defs[src.index] ||
- !vir_can_set_flags(c, last_inst)) {
- /* XXX: Make the MOV be the appropriate type */
- last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
- }
-
- vir_set_pf(last_inst, pf);
+ struct qinst *inst = vir_NOP(c);
+ inst->qpu.sig.ldunif = true;
+ inst->uniform = vir_get_uniform_index(c, contents, data);
+ inst->dst = vir_get_temp(c);
+ c->defs[inst->dst.index] = inst;
+ return inst->dst;
}
#define OPTPASS(func) \
@@ -1160,6 +1097,7 @@ vir_optimize(struct v3d_compile *c)
bool progress = false;
OPTPASS(vir_opt_copy_propagate);
+ OPTPASS(vir_opt_redundant_flags);
OPTPASS(vir_opt_dead_code);
OPTPASS(vir_opt_small_immediates);
diff --git a/lib/mesa/src/broadcom/compiler/vir_dump.c b/lib/mesa/src/broadcom/compiler/vir_dump.c
index ecf6f3e1f..9e1ef1e9d 100644
--- a/lib/mesa/src/broadcom/compiler/vir_dump.c
+++ b/lib/mesa/src/broadcom/compiler/vir_dump.c
@@ -30,6 +30,7 @@ vir_dump_uniform(enum quniform_contents contents,
uint32_t data)
{
static const char *quniform_names[] = {
+ [QUNIFORM_ALPHA_REF] = "alpha_ref",
[QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale",
[QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale",
[QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset",
@@ -52,20 +53,20 @@ vir_dump_uniform(enum quniform_contents contents,
case QUNIFORM_TMU_CONFIG_P0:
fprintf(stderr, "tex[%d].p0 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_TMU_CONFIG_P1:
fprintf(stderr, "tex[%d].p1 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_IMAGE_TMU_CONFIG_P0:
fprintf(stderr, "img[%d].p0 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_TEXTURE_WIDTH:
@@ -97,8 +98,18 @@ vir_dump_uniform(enum quniform_contents contents,
fprintf(stderr, "img[%d].array_size", data);
break;
+ case QUNIFORM_SPILL_OFFSET:
+ fprintf(stderr, "spill_offset");
+ break;
+
+ case QUNIFORM_SPILL_SIZE_PER_THREAD:
+ fprintf(stderr, "spill_size_per_thread");
+ break;
+
case QUNIFORM_UBO_ADDR:
- fprintf(stderr, "ubo[%d]", data);
+ fprintf(stderr, "ubo[%d]+0x%x",
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_SSBO_OFFSET:
@@ -118,7 +129,8 @@ vir_dump_uniform(enum quniform_contents contents,
fprintf(stderr, "tex[%d].p0: 0x%08x",
contents - QUNIFORM_TEXTURE_CONFIG_P0_0,
data);
- } else if (contents < ARRAY_SIZE(quniform_names)) {
+ } else if (contents < ARRAY_SIZE(quniform_names) &&
+ quniform_names[contents]) {
fprintf(stderr, "%s",
quniform_names[contents]);
} else {
@@ -131,13 +143,6 @@ static void
vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
struct qreg reg)
{
- static const char *files[] = {
- [QFILE_TEMP] = "t",
- [QFILE_UNIF] = "u",
- [QFILE_TLB] = "tlb",
- [QFILE_TLBU] = "tlbu",
- };
-
switch (reg.file) {
case QFILE_NULL:
@@ -176,21 +181,8 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
reg.index / 4, reg.index % 4);
break;
- case QFILE_TLB:
- case QFILE_TLBU:
- fprintf(stderr, "%s", files[reg.file]);
- break;
-
- case QFILE_UNIF:
- fprintf(stderr, "%s%d", files[reg.file], reg.index);
- fprintf(stderr, " (");
- vir_dump_uniform(c->uniform_contents[reg.index],
- c->uniform_data[reg.index]);
- fprintf(stderr, ")");
- break;
-
- default:
- fprintf(stderr, "%s%d", files[reg.file], reg.index);
+ case QFILE_TEMP:
+ fprintf(stderr, "t%d", reg.index);
break;
}
}
@@ -258,8 +250,7 @@ static void
vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
{
struct v3d_qpu_instr *instr = &inst->qpu;
- int nsrc = vir_get_non_sideband_nsrc(inst);
- int sideband_nsrc = vir_get_nsrc(inst);
+ int nsrc = vir_get_nsrc(inst);
enum v3d_qpu_input_unpack unpack[2];
if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
@@ -288,11 +279,10 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
unpack[1] = instr->alu.mul.b_unpack;
}
- for (int i = 0; i < sideband_nsrc; i++) {
+ for (int i = 0; i < nsrc; i++) {
fprintf(stderr, ", ");
vir_print_reg(c, inst, inst->src[i]);
- if (i < nsrc)
- fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
+ fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
}
vir_dump_sig(c, inst);
@@ -353,25 +343,34 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
break;
}
}
-
- if (vir_has_implicit_uniform(inst)) {
- fprintf(stderr, " ");
- vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]);
- }
-
break;
}
+
+ if (vir_has_uniform(inst)) {
+ fprintf(stderr, " (");
+ vir_dump_uniform(c->uniform_contents[inst->uniform],
+ c->uniform_data[inst->uniform]);
+ fprintf(stderr, ")");
+ }
}
void
vir_dump(struct v3d_compile *c)
{
int ip = 0;
+ int pressure = 0;
vir_for_each_block(block, c) {
fprintf(stderr, "BLOCK %d:\n", block->index);
vir_for_each_inst(inst, block) {
if (c->live_intervals_valid) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] == ip)
+ pressure++;
+ }
+
+ fprintf(stderr, "P%4d ", pressure);
+
bool first = true;
for (int i = 0; i < c->num_temps; i++) {
@@ -383,7 +382,10 @@ vir_dump(struct v3d_compile *c)
} else {
fprintf(stderr, ", ");
}
- fprintf(stderr, "S%4d", i);
+ if (BITSET_TEST(c->spillable, i))
+ fprintf(stderr, "S%4d", i);
+ else
+ fprintf(stderr, "U%4d", i);
}
if (first)
@@ -405,6 +407,7 @@ vir_dump(struct v3d_compile *c)
fprintf(stderr, ", ");
}
fprintf(stderr, "E%4d", i);
+ pressure--;
}
if (first)
diff --git a/lib/mesa/src/broadcom/compiler/vir_live_variables.c b/lib/mesa/src/broadcom/compiler/vir_live_variables.c
index 2879e23b4..d3ca02f18 100644
--- a/lib/mesa/src/broadcom/compiler/vir_live_variables.c
+++ b/lib/mesa/src/broadcom/compiler/vir_live_variables.c
@@ -109,24 +109,18 @@ vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip,
c->temp_start[var] = MIN2(c->temp_start[var], ip);
c->temp_end[var] = MAX2(c->temp_end[var], ip);
- /* If we've already tracked this as a def, or already used it within
- * the block, there's nothing to do.
+ /* Mark the block as having a (partial) def of the var. */
+ BITSET_SET(block->defout, var);
+
+ /* If we've already tracked this as a def that screens off previous
+ * uses, or already used it within the block, there's nothing to do.
*/
if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
return;
- /* Easy, common case: unconditional full register update.
- *
- * We treat conditioning on the exec mask as the same as not being
- * conditional. This makes sure that if the register gets set on
- * either side of an if, it is treated as being screened off before
- * the if. Otherwise, if there was no intervening def, its live
- * interval doesn't extend back to the start of he program, and if too
- * many registers did that we'd fail to register allocate.
- */
- if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
- inst->qpu.flags.mc == V3D_QPU_COND_NONE) ||
- inst->cond_is_exec_mask) &&
+ /* Easy, common case: unconditional full register update.*/
+ if ((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.mc == V3D_QPU_COND_NONE) &&
inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE &&
inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) {
BITSET_SET(block->def, var);
@@ -278,6 +272,33 @@ vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words)
return cont;
}
+static bool
+vir_live_variables_defin_defout_dataflow(struct v3d_compile *c, int bitset_words)
+{
+ bool cont = false;
+
+ vir_for_each_block_rev(block, c) {
+ /* Propagate defin/defout down the successors to produce the
+ * union of blocks with a reachable (partial) definition of
+ * the var.
+ *
+ * This keeps a conditional first write to a reg from
+ * extending its lifetime back to the start of the program.
+ */
+ vir_for_each_successor(succ, block) {
+ for (int i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_def = (block->defout[i] &
+ ~succ->defin[i]);
+ succ->defin[i] |= new_def;
+ succ->defout[i] |= new_def;
+ cont |= new_def;
+ }
+ }
+ }
+
+ return cont;
+}
+
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
@@ -287,14 +308,16 @@ vir_compute_start_end(struct v3d_compile *c, int num_vars)
{
vir_for_each_block(block, c) {
for (int i = 0; i < num_vars; i++) {
- if (BITSET_TEST(block->live_in, i)) {
+ if (BITSET_TEST(block->live_in, i) &&
+ BITSET_TEST(block->defin, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->start_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
block->start_ip);
}
- if (BITSET_TEST(block->live_out, i)) {
+ if (BITSET_TEST(block->live_out, i) &&
+ BITSET_TEST(block->defout, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->end_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
@@ -334,6 +357,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
vir_for_each_block(block, c) {
block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->defin = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->defout = rzalloc_array(c, BITSET_WORD, bitset_words);
block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
@@ -344,6 +369,9 @@ vir_calculate_live_intervals(struct v3d_compile *c)
while (vir_live_variables_dataflow(c, bitset_words))
;
+ while (vir_live_variables_defin_defout_dataflow(c, bitset_words))
+ ;
+
vir_compute_start_end(c, c->num_temps);
c->live_intervals_valid = true;
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
index 2a22a1b55..c5bb61121 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -49,10 +49,8 @@ is_copy_mov(struct qinst *inst)
if (inst->dst.file != QFILE_TEMP)
return false;
- if (inst->src[0].file != QFILE_TEMP &&
- inst->src[0].file != QFILE_UNIF) {
+ if (inst->src[0].file != QFILE_TEMP)
return false;
- }
if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
@@ -151,13 +149,36 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
* would be the same between the two
* instructions.
*/
- if (vir_is_float_input(inst) !=
- vir_is_float_input(mov)) {
+ if (v3d_qpu_unpacks_f32(&inst->qpu) !=
+ v3d_qpu_unpacks_f32(&mov->qpu) ||
+ v3d_qpu_unpacks_f16(&inst->qpu) !=
+ v3d_qpu_unpacks_f16(&mov->qpu)) {
continue;
}
+
/* No composing the unpacks. */
if (vir_has_unpack(inst, i))
- continue;
+ continue;
+
+ /* these ops can't represent abs. */
+ if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_VFPACK:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ continue;
+ default:
+ break;
+ }
+ }
}
if (debug) {
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
index a486708bf..6048ccfcc 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
@@ -55,28 +55,8 @@ static bool
has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
{
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_VPM) {
- /* Instance ID, Vertex ID: Should have been removed at
- * the NIR level
- */
- if (inst->src[i].index == ~0)
- return true;
-
- uint32_t attr = inst->src[i].index / 4;
- uint32_t offset = inst->src[i].index % 4;
-
- if (c->vattr_sizes[attr] != offset)
- return true;
-
- /* Can't get rid of the last VPM read, or the
- * simulator (at least) throws an error.
- */
- uint32_t total_size = 0;
- for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
- total_size += c->vattr_sizes[i];
- if (total_size == 1)
- return true;
- }
+ if (inst->src[i].file == QFILE_VPM)
+ return true;
}
return false;
@@ -187,18 +167,6 @@ vir_opt_dead_code(struct v3d_compile *c)
continue;
}
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file != QFILE_VPM)
- continue;
- uint32_t attr = inst->src[i].index / 4;
- uint32_t offset = (inst->src[i].index % 4);
-
- if (c->vattr_sizes[attr] == offset) {
- c->num_inputs--;
- c->vattr_sizes[attr]--;
- }
- }
-
assert(inst != last_flags_write);
dce(c, inst);
progress = true;
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c
new file mode 100644
index 000000000..8749f3cd6
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2019 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_opt_redundant_flags.c
+ *
+ * This eliminates the APF/MPF flags for redundant flags updates. These are
+ * often produced by our channel masking in nonuniform control flow.
+ */
+
+#include "v3d_compiler.h"
+
+static bool debug;
+
+static void
+vir_dce_pf(struct v3d_compile *c, struct qinst *inst)
+{
+ if (debug) {
+ fprintf(stderr,
+ "Removing flags write from: ");
+ vir_dump_inst(c, inst);
+ fprintf(stderr, "\n");
+ }
+
+ assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
+
+ inst->qpu.flags.apf = V3D_QPU_PF_NONE;
+ inst->qpu.flags.mpf = V3D_QPU_PF_NONE;
+}
+
+static bool
+vir_sources_modified(struct qinst *srcs, struct qinst *write)
+{
+ for (int i = 0; i < vir_get_nsrc(srcs); i++) {
+ if (write->dst.file == QFILE_TEMP &&
+ srcs->src[i].file == QFILE_TEMP &&
+ srcs->src[i].index == write->dst.index) {
+ return true;
+ }
+
+ /* assume magic regs may be modified by basically anything. */
+ if (srcs->src[i].file != QFILE_TEMP &&
+ srcs->src[i].file != QFILE_SMALL_IMM)
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
+{
+ for (int i = 0; i < vir_get_nsrc(a); i++) {
+ if (a->src[i].file != b->src[i].file ||
+ a->src[i].index != b->src[i].index) {
+ return false;
+ }
+ }
+
+ if (a->qpu.flags.apf != b->qpu.flags.apf ||
+ a->qpu.flags.mpf != b->qpu.flags.mpf ||
+ a->qpu.alu.add.op != b->qpu.alu.add.op ||
+ a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
+ a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
+ a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
+ a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
+ a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block)
+{
+ struct qinst *last_flags = NULL;
+ bool progress = false;
+
+ vir_for_each_inst(inst, block) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ inst->qpu.flags.auf != V3D_QPU_UF_NONE ||
+ inst->qpu.flags.muf != V3D_QPU_UF_NONE) {
+ last_flags = NULL;
+ continue;
+ }
+
+ /* Flags aren't preserved across a thrsw. */
+ if (inst->qpu.sig.thrsw)
+ last_flags = NULL;
+
+ if (inst->qpu.flags.apf != V3D_QPU_PF_NONE ||
+ inst->qpu.flags.mpf != V3D_QPU_PF_NONE) {
+ if (last_flags &&
+ vir_instr_flags_op_equal(inst, last_flags)) {
+ vir_dce_pf(c, inst);
+ progress = true;
+ } else {
+ last_flags = inst;
+ }
+ }
+
+ if (last_flags && vir_sources_modified(last_flags, inst)) {
+ last_flags = NULL;
+ }
+ }
+
+ return progress;
+}
+
+bool
+vir_opt_redundant_flags(struct v3d_compile *c)
+{
+ bool progress = false;
+
+ vir_for_each_block(block, c) {
+ progress = vir_opt_redundant_flags_block(c, block) || progress;
+ }
+
+ return progress;
+}
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
index 5491f9c24..47d772296 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -55,26 +55,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
continue;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- struct qreg src = vir_follow_movs(c, inst->src[i]);
+ if (inst->src[i].file != QFILE_TEMP)
+ continue;
- if (src.file != QFILE_UNIF ||
- c->uniform_contents[src.index] !=
- QUNIFORM_CONSTANT) {
+ /* See if it's a uniform load. */
+ struct qinst *src_def = c->defs[inst->src[i].index];
+ if (!src_def || !src_def->qpu.sig.ldunif)
continue;
- }
+ int uniform = src_def->uniform;
- if (vir_has_implicit_uniform(inst) &&
- i == vir_get_implicit_uniform_src(inst)) {
- /* No turning the implicit uniform read into
- * an immediate.
- */
+ if (c->uniform_contents[uniform] != QUNIFORM_CONSTANT)
continue;
- }
/* Check if the uniform is suitable as a small
* immediate.
*/
- uint32_t imm = c->uniform_data[src.index];
+ uint32_t imm = c->uniform_data[uniform];
uint32_t packed;
if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed))
continue;
diff --git a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
index 79ab5acd7..7583acf15 100644
--- a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
+++ b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
@@ -29,28 +29,44 @@
#define QPU_R(i) { .magic = false, .index = i }
#define ACC_INDEX 0
-#define ACC_COUNT 5
+#define ACC_COUNT 6
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64
+static inline bool
+qinst_writes_tmu(struct qinst *inst)
+{
+ return (inst->dst.file == QFILE_MAGIC &&
+ v3d_qpu_magic_waddr_is_tmu(inst->dst.index));
+}
+
static bool
is_last_ldtmu(struct qinst *inst, struct qblock *block)
{
- list_for_each_entry_from(struct qinst, scan_inst, inst,
+ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
&block->instructions, link) {
- if (inst->qpu.sig.ldtmu)
+ if (scan_inst->qpu.sig.ldtmu)
return false;
- if (v3d_qpu_writes_tmu(&inst->qpu))
+ if (qinst_writes_tmu(scan_inst))
return true;
}
return true;
}
+static bool
+vir_is_mov_uniform(struct v3d_compile *c, int temp)
+{
+ struct qinst *def = c->defs[temp];
+
+ return def && def->qpu.sig.ldunif;
+}
+
static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
uint32_t *temp_to_node)
{
+ const float tmu_scale = 5;
float block_scale = 1.0;
float spill_costs[c->num_temps];
bool in_tmu_operation = false;
@@ -75,22 +91,28 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
continue;
int temp = inst->src[i].index;
- if (no_spilling) {
- BITSET_CLEAR(c->spillable,
- temp);
- } else {
+ if (vir_is_mov_uniform(c, temp)) {
spill_costs[temp] += block_scale;
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
+ } else {
+ BITSET_CLEAR(c->spillable, temp);
}
}
if (inst->dst.file == QFILE_TEMP) {
int temp = inst->dst.index;
- if (no_spilling) {
- BITSET_CLEAR(c->spillable,
- temp);
+ if (vir_is_mov_uniform(c, temp)) {
+ /* We just rematerialize the unform
+ * later.
+ */
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
} else {
- spill_costs[temp] += block_scale;
+ BITSET_CLEAR(c->spillable, temp);
}
}
@@ -123,7 +145,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
in_tmu_operation = false;
- if (v3d_qpu_writes_tmu(&inst->qpu))
+ if (qinst_writes_tmu(inst))
in_tmu_operation = true;
}
}
@@ -141,7 +163,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
/* The spill offset for this thread takes a bit of setup, so do it once at
* program start.
*/
-static void
+void
v3d_setup_spill_base(struct v3d_compile *c)
{
c->cursor = vir_before_block(vir_entry_block(c));
@@ -170,6 +192,8 @@ v3d_setup_spill_base(struct v3d_compile *c)
/* Make sure that we don't spill the spilling setup instructions. */
for (int i = start_num_temps; i < c->num_temps; i++)
BITSET_CLEAR(c->spillable, i);
+
+ c->cursor = vir_after_block(c->cur_block);
}
static void
@@ -184,18 +208,30 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
- uint32_t spill_offset = c->spill_size;
- c->spill_size += 16 * sizeof(uint32_t);
+ bool is_uniform = vir_is_mov_uniform(c, spill_temp);
+
+ uint32_t spill_offset = 0;
- if (spill_offset == 0)
- v3d_setup_spill_base(c);
+ if (!is_uniform) {
+ uint32_t spill_offset = c->spill_size;
+ c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
+
+ if (spill_offset == 0)
+ v3d_setup_spill_base(c);
+ }
struct qinst *last_thrsw = c->last_thrsw;
assert(!last_thrsw || last_thrsw->is_last_thrsw);
int start_num_temps = c->num_temps;
- vir_for_each_inst_inorder(inst, c) {
+ int uniform_index = ~0;
+ if (is_uniform) {
+ struct qinst *orig_unif = c->defs[spill_temp];
+ uniform_index = orig_unif->uniform;
+ }
+
+ vir_for_each_inst_inorder_safe(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP ||
inst->src[i].index != spill_temp) {
@@ -204,23 +240,37 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
c->cursor = vir_before_inst(inst);
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (is_uniform) {
+ struct qreg unif =
+ vir_uniform(c,
+ c->uniform_contents[uniform_index],
+ c->uniform_data[uniform_index]);
+ inst->src[i] = unif;
+ } else {
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ inst->src[i] = vir_LDTMU(c);
+ c->fills++;
+ }
}
if (inst->dst.file == QFILE_TEMP &&
inst->dst.index == spill_temp) {
- c->cursor = vir_after_inst(inst);
-
- inst->dst.index = c->num_temps++;
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- inst->dst);
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- vir_TMUWT(c);
- c->spills++;
+ if (is_uniform) {
+ c->cursor.link = NULL;
+ vir_remove_instruction(c, inst);
+ } else {
+ c->cursor = vir_after_inst(inst);
+
+ inst->dst.index = c->num_temps++;
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUD),
+ inst->dst);
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ vir_TMUWT(c);
+ c->spills++;
+ }
}
/* If we didn't have a last-thrsw inserted by nir_to_vir and
@@ -228,7 +278,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
* right before we start the vpm/tlb sequence for the last
* thread segment.
*/
- if (!last_thrsw && c->last_thrsw &&
+ if (!is_uniform && !last_thrsw && c->last_thrsw &&
(v3d_qpu_writes_vpm(&inst->qpu) ||
v3d_qpu_uses_tlb(&inst->qpu))) {
c->cursor = vir_before_inst(inst);
@@ -261,6 +311,14 @@ static unsigned int
v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
struct v3d_ra_select_callback_data *v3d_ra = data;
+ int r5 = ACC_INDEX + 5;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ if (BITSET_TEST(regs, r5))
+ return r5;
/* Choose an accumulator if possible (I think it's lower power than
* phys regs), but round-robin through them to give post-RA
@@ -303,6 +361,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
return false;
for (int threads = 0; threads < max_thread_index; threads++) {
+ compiler->reg_class_any[threads] =
+ ra_alloc_reg_class(compiler->regs);
+ compiler->reg_class_r5[threads] =
+ ra_alloc_reg_class(compiler->regs);
compiler->reg_class_phys_or_acc[threads] =
ra_alloc_reg_class(compiler->regs);
compiler->reg_class_phys[threads] =
@@ -314,12 +376,25 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->regs,
compiler->reg_class_phys[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
ra_class_add_reg(compiler->regs,
compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads], i);
}
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -342,9 +417,11 @@ node_to_temp_priority(const void *in_a, const void *in_b)
}
#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_R0_R2 (1 << 1)
-#define CLASS_BIT_R3 (1 << 2)
-#define CLASS_BIT_R4 (1 << 3)
+#define CLASS_BIT_ACC (1 << 1)
+#define CLASS_BIT_R5 (1 << 4)
+#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
+ CLASS_BIT_ACC | \
+ CLASS_BIT_R5)
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
@@ -357,8 +434,6 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint8_t class_bits[c->num_temps];
- struct qpu_reg *temp_registers = calloc(c->num_temps,
- sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
struct v3d_ra_select_callback_data callback_data = {
.next_acc = 0,
@@ -412,9 +487,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* start with any temp being able to be in any file, then instructions
* incrementally remove bits that the temp definitely can't be in.
*/
- memset(class_bits,
- CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
- sizeof(class_bits));
+ memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
int ip = 0;
vir_for_each_inst_inorder(inst, c) {
@@ -497,6 +570,24 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
}
}
+ if (inst->dst.file == QFILE_TEMP) {
+ /* Only a ldunif gets to write to R5, which only has a
+ * single 32-bit channel of storage.
+ */
+ if (!inst->qpu.sig.ldunif) {
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ class_bits[inst->dst.index] &=
+ CLASS_BIT_R5;
+ }
+ }
+ }
+
if (inst->qpu.sig.thrsw) {
/* All accumulators are invalidated across a thread
* switch.
@@ -514,13 +605,16 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
if (class_bits[i] == CLASS_BIT_PHYS) {
ra_set_node_class(g, temp_to_node[i],
c->compiler->reg_class_phys[thread_index]);
- } else {
- assert(class_bits[i] == (CLASS_BIT_PHYS |
- CLASS_BIT_R0_R2 |
- CLASS_BIT_R3 |
- CLASS_BIT_R4));
+ } else if (class_bits[i] == (CLASS_BIT_R5)) {
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_r5[thread_index]);
+ } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
ra_set_node_class(g, temp_to_node[i],
c->compiler->reg_class_phys_or_acc[thread_index]);
+ } else {
+ assert(class_bits[i] == CLASS_BITS_ANY);
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_any[thread_index]);
}
}
@@ -539,7 +633,8 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* conformance tests to make sure that spilling works.
*/
int force_register_spills = 0;
- if (c->spill_size < 16 * sizeof(uint32_t) * force_register_spills) {
+ if (c->spill_size <
+ V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c, g, temp_to_node);
if (node != -1) {
v3d_spill_reg(c, map[node].temp);
@@ -551,24 +646,27 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
bool ok = ra_allocate(g);
if (!ok) {
- /* Try to spill, if we can't reduce threading first. */
- if (thread_index == 0) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
+ int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node != -1) {
- v3d_spill_reg(c, map[node].temp);
- ralloc_free(g);
+ /* Don't emit spills using the TMU until we've dropped thread
+ * count first.
+ */
+ if (node != -1 &&
+ (vir_is_mov_uniform(c, map[node].temp) ||
+ thread_index == 0)) {
+ v3d_spill_reg(c, map[node].temp);
- /* Ask the outer loop to call back in. */
- *spilled = true;
- return NULL;
- }
+ /* Ask the outer loop to call back in. */
+ *spilled = true;
}
- free(temp_registers);
+ ralloc_free(g);
return NULL;
}
+ struct qpu_reg *temp_registers = calloc(c->num_temps,
+ sizeof(*temp_registers));
+
for (uint32_t i = 0; i < c->num_temps; i++) {
int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
if (ra_reg < PHYS_INDEX) {
@@ -591,17 +689,5 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
ralloc_free(g);
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->spills);
-
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->fills);
- }
-
return temp_registers;
}
diff --git a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
index c66bb84b3..e6461ff94 100644
--- a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
+++ b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
@@ -76,7 +76,7 @@ v3d_qpu_nop(void)
static struct qinst *
vir_nop(void)
{
- struct qreg undef = { QFILE_NULL, 0 };
+ struct qreg undef = vir_nop_reg();
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
return qinst;
@@ -92,16 +92,6 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
-static void
-new_ldunif_instr(struct qinst *inst, int i)
-{
- struct qinst *ldunif = new_qpu_nop_before(inst);
-
- ldunif->qpu.sig.ldunif = true;
- assert(inst->src[i].file == QFILE_UNIF);
- ldunif->uniform = inst->src[i].index;
-}
-
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
@@ -214,16 +204,11 @@ v3d_generate_code_block(struct v3d_compile *c,
struct qinst *temp;
- if (vir_has_implicit_uniform(qinst)) {
- int src = vir_get_implicit_uniform_src(qinst);
- assert(qinst->src[src].file == QFILE_UNIF);
- qinst->uniform = qinst->src[src].index;
+ if (vir_has_uniform(qinst))
c->num_uniforms++;
- }
- int nsrc = vir_get_non_sideband_nsrc(qinst);
+ int nsrc = vir_get_nsrc(qinst);
struct qpu_reg src[ARRAY_SIZE(qinst->src)];
- bool emitted_ldunif = false;
for (int i = 0; i < nsrc; i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
@@ -240,19 +225,6 @@ v3d_generate_code_block(struct v3d_compile *c,
case QFILE_TEMP:
src[i] = temp_registers[index];
break;
- case QFILE_UNIF:
- /* XXX perf: If the last ldunif we emitted was
- * the same uniform value, skip it. Common
- * for multop/umul24 sequences.
- */
- if (!emitted_ldunif) {
- new_ldunif_instr(qinst, i);
- c->num_uniforms++;
- emitted_ldunif = true;
- }
-
- src[i] = qpu_acc(5);
- break;
case QFILE_SMALL_IMM:
src[i].smimm = true;
break;
@@ -268,10 +240,6 @@ v3d_generate_code_block(struct v3d_compile *c,
src[i] = qpu_acc(3);
break;
-
- case QFILE_TLB:
- case QFILE_TLBU:
- unreachable("bad vir src file");
}
}
@@ -297,15 +265,6 @@ v3d_generate_code_block(struct v3d_compile *c,
dst = qpu_magic(V3D_QPU_WADDR_VPM);
break;
- case QFILE_TLB:
- dst = qpu_magic(V3D_QPU_WADDR_TLB);
- break;
-
- case QFILE_TLBU:
- dst = qpu_magic(V3D_QPU_WADDR_TLBU);
- break;
-
- case QFILE_UNIF:
case QFILE_SMALL_IMM:
case QFILE_LOAD_IMM:
assert(!"not reached");
@@ -313,7 +272,20 @@ v3d_generate_code_block(struct v3d_compile *c,
}
if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
- if (v3d_qpu_sig_writes_address(c->devinfo,
+ if (qinst->qpu.sig.ldunif) {
+ assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
+ assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
+ if (!dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5) {
+ assert(c->devinfo->ver >= 40);
+
+ qinst->qpu.sig.ldunif = false;
+ qinst->qpu.sig.ldunifrf = true;
+ qinst->qpu.sig_addr = dst.index;
+ qinst->qpu.sig_magic = dst.magic;
+ }
+ } else if (v3d_qpu_sig_writes_address(c->devinfo,
&qinst->qpu.sig)) {
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
@@ -361,11 +333,12 @@ static bool
reads_uniform(const struct v3d_device_info *devinfo, uint64_t instruction)
{
struct v3d_qpu_instr qpu;
- MAYBE_UNUSED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
+ ASSERTED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
assert(ok);
if (qpu.sig.ldunif ||
- qpu.sig.ldunifarf ||
+ qpu.sig.ldunifrf ||
+ qpu.sig.ldtlbu ||
qpu.sig.wrtmuc) {
return true;
}
@@ -433,7 +406,7 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
vir_for_each_block(block, c)
v3d_generate_code_block(c, block, temp_registers);
- uint32_t cycles = v3d_qpu_schedule_instructions(c);
+ v3d_qpu_schedule_instructions(c);
c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);
int i = 0;
@@ -450,23 +423,6 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
}
assert(i == c->qpu_inst_count);
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->qpu_inst_count);
- }
-
- /* The QPU cycle estimates are pretty broken (see waddr_latency()), so
- * don't report them for now.
- */
- if (false) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- cycles);
- }
-
if (V3D_DEBUG & (V3D_DEBUG_QPU |
v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
v3d_dump_qpu(c);
diff --git a/lib/mesa/src/broadcom/drm-shim/README.md b/lib/mesa/src/broadcom/drm-shim/README.md
new file mode 100644
index 000000000..dde21c1b8
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/README.md
@@ -0,0 +1,17 @@
+### v3d backend
+
+This implements some of v3d using the closed source v3dv3 tree's
+C/C++-based simulator. All execution is synchronous.
+
+Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d
+LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed
+will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported.
+
+### v3d_noop backend
+
+This implements the minimum of v3d in order to make shader-db work.
+The submit ioctl is stubbed out to not execute anything.
+
+Export `MESA_LOADER_DRIVER_OVERRIDE=v3d
+LD_PRELOAD=$prefix/lib/libv3d_noop_drm_shim.so`. This will be a V3D
+4.2 device.
diff --git a/lib/mesa/src/broadcom/drm-shim/meson.build b/lib/mesa/src/broadcom/drm-shim/meson.build
new file mode 100644
index 000000000..4fcc594ad
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/meson.build
@@ -0,0 +1,62 @@
+# Copyright © 2019 Broadcom
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libv3d_noop_drm_shim = shared_library(
+ ['v3d_noop_drm_shim'],
+ 'v3d_noop.c',
+ include_directories: inc_common,
+ dependencies: dep_drm_shim,
+ c_args : c_vis_args,
+ install : true,
+)
+
+dep_v3dv3 = dependency('v3dv3', required: false)
+if dep_v3dv3.found()
+ v3dv3_c_args = '-DUSE_V3D_SIMULATOR'
+
+ inc_gallium_v3d = include_directories('../../gallium/drivers/v3d')
+
+ per_version_libs = []
+ foreach ver : v3d_versions
+ per_version_libs += static_library(
+ 'libv3d_drm_shim-v' + ver,
+ [
+ 'v3dx.c',
+ v3d_xml_pack
+ ],
+ include_directories : [inc_common, inc_broadcom, inc_src, inc_gallium_v3d],
+ c_args : [c_vis_args, no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args],
+ dependencies: [dep_valgrind, dep_thread, dep_v3dv3],
+ )
+ endforeach
+
+ libv3d_drm_shim = shared_library(
+ ['v3d_drm_shim'],
+ [
+ 'v3d.c',
+ '../../gallium/drivers/v3d/v3d_simulator_wrapper.cpp',
+ ],
+ dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3],
+ link_with: per_version_libs,
+ include_directories : [inc_common, inc_broadcom, inc_gallium_v3d],
+ c_args : [c_vis_args, no_override_init_args, '-std=gnu99', v3dv3_c_args],
+ cpp_args : [v3dv3_c_args]
+ )
+endif
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.c b/lib/mesa/src/broadcom/drm-shim/v3d.c
new file mode 100644
index 000000000..e75657f59
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include "drm-uapi/v3d_drm.h"
+#include "drm-shim/drm_shim.h"
+#include "v3d.h"
+#include "v3d_simulator_wrapper.h"
+
+static struct v3d_device_info devinfo;
+struct v3d_shim_device v3d = {
+ .devinfo = &devinfo
+};
+
+struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle)
+{
+ return v3d_bo(drm_shim_bo_lookup(shim_fd, handle));
+}
+
+int
+v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg)
+{
+ /* No need to wait on anything yet, given that we submit
+ * synchronously.
+ */
+ return 0;
+}
+
+int
+v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_mmap_bo *map = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);
+
+ map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+int
+v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_get_bo_offset *get = arg;
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle);
+
+ get->offset = bo->offset;
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+void
+drm_shim_driver_init(void)
+{
+ shim_device.driver_name = "v3d";
+
+ drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n"
+ "OF_COMPATIBLE_N=1\n"
+ "OF_COMPATIBLE_0=brcm,7278-v3d\n",
+ "/sys/dev/char/%d:%d/device/uevent",
+ DRM_MAJOR, render_node_minor);
+
+ v3d.hw = v3d_hw_auto_new(NULL);
+ v3d.devinfo->ver = v3d_hw_get_version(v3d.hw);
+
+ if (v3d.devinfo->ver >= 42)
+ v3d42_drm_shim_driver_init();
+ else if (v3d.devinfo->ver >= 41)
+ v3d41_drm_shim_driver_init();
+ else
+ v3d33_drm_shim_driver_init();
+}
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.h b/lib/mesa/src/broadcom/drm-shim/v3d.h
new file mode 100644
index 000000000..0712b8b3f
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef DRM_SHIM_V3D_H
+#define DRM_SHIM_V3D_H
+
+#include "broadcom/common/v3d_device_info.h"
+#include "util/vma.h"
+
+struct drm_shim_fd;
+
+struct v3d_shim_device {
+ struct v3d_hw *hw;
+ struct v3d_device_info *devinfo;
+
+ /* Base virtual address of the heap. */
+ void *mem;
+ /* Base hardware address of the heap. */
+ uint32_t mem_base;
+ /* Size of the heap. */
+ size_t mem_size;
+
+ /* Allocator for the GPU virtual addresses. */
+ struct util_vma_heap heap;
+};
+extern struct v3d_shim_device v3d;
+
+struct v3d_bo {
+ struct shim_bo base;
+ uint64_t offset;
+ void *sim_vaddr;
+ void *gem_vaddr;
+};
+
+static inline struct v3d_bo *
+v3d_bo(struct shim_bo *bo)
+{
+ return (struct v3d_bo *)bo;
+}
+
+struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle);
+int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg);
+int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg);
+int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg);
+
+void v3d33_drm_shim_driver_init(void);
+void v3d41_drm_shim_driver_init(void);
+void v3d42_drm_shim_driver_init(void);
+
+#endif /* DRM_SHIM_V3D_H */
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d_noop.c b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c
new file mode 100644
index 000000000..7c7d75128
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include "drm-uapi/v3d_drm.h"
+#include "drm-shim/drm_shim.h"
+
+struct v3d_bo {
+ struct shim_bo base;
+ uint32_t offset;
+};
+
+static struct v3d_bo *
+v3d_bo(struct shim_bo *bo)
+{
+ return (struct v3d_bo *)bo;
+}
+
+struct v3d_device {
+ uint32_t next_offset;
+};
+
+static struct v3d_device v3d = {
+ .next_offset = 0x1000,
+};
+
+static int
+v3d_ioctl_noop(int fd, unsigned long request, void *arg)
+{
+ return 0;
+}
+
+static int
+v3d_ioctl_create_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_create_bo *create = arg;
+ struct v3d_bo *bo = calloc(1, sizeof(*bo));
+
+ drm_shim_bo_init(&bo->base, create->size);
+
+ assert(UINT_MAX - v3d.next_offset > create->size);
+ bo->offset = v3d.next_offset;
+ v3d.next_offset += create->size;
+
+ create->offset = bo->offset;
+ create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_get_bo_offset *args = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, args->handle);
+
+ args->offset = v3d_bo(bo)->offset;
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_mmap_bo *map = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);
+
+ map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_get_param(int fd, unsigned long request, void *arg)
+{
+ struct drm_v3d_get_param *gp = arg;
+ static const uint32_t v3d42_reg_map[] = {
+ [DRM_V3D_PARAM_V3D_UIFCFG] = 0x00000045,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT1] = 0x000e1124,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT2] = 0x00000100,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT3] = 0x00000e00,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = 0x04443356,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = 0x81001422,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = 0x40078121,
+ };
+
+ switch (gp->param) {
+ case DRM_V3D_PARAM_SUPPORTS_TFU:
+ gp->value = 1;
+ return 0;
+ default:
+ break;
+ }
+
+ if (gp->param < ARRAY_SIZE(v3d42_reg_map) && v3d42_reg_map[gp->param]) {
+ gp->value = v3d42_reg_map[gp->param];
+ return 0;
+ }
+
+ fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param);
+ return -1;
+}
+
+static ioctl_fn_t driver_ioctls[] = {
+ [DRM_V3D_SUBMIT_CL] = v3d_ioctl_noop,
+ [DRM_V3D_SUBMIT_TFU] = v3d_ioctl_noop,
+ [DRM_V3D_WAIT_BO] = v3d_ioctl_noop,
+ [DRM_V3D_CREATE_BO] = v3d_ioctl_create_bo,
+ [DRM_V3D_GET_PARAM] = v3d_ioctl_get_param,
+ [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset,
+ [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo,
+};
+
+void
+drm_shim_driver_init(void)
+{
+ shim_device.driver_name = "v3d";
+ shim_device.driver_ioctls = driver_ioctls;
+ shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls);
+
+ drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n"
+ "OF_COMPATIBLE_N=1\n"
+ "OF_COMPATIBLE_0=brcm,7278-v3d\n",
+ "/sys/dev/char/%d:%d/device/uevent",
+ DRM_MAJOR, render_node_minor);
+}
diff --git a/lib/mesa/src/broadcom/drm-shim/v3dx.c b/lib/mesa/src/broadcom/drm-shim/v3dx.c
new file mode 100644
index 000000000..a22550a03
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3dx.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* @file
+ *
+ * v3d driver code interacting with the v3dv3 simulator/FPGA library.
+ *
+ * This is compiled per V3D version we support, since the register definitions
+ * conflict.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "util/macros.h"
+#include "util/u_mm.h"
+#include "broadcom/common/v3d_macros.h"
+#include "v3d_simulator_wrapper.h"
+#include "drm-shim/drm_shim.h"
+#include "drm-uapi/v3d_drm.h"
+#include "v3d.h"
+
+#define HW_REGISTER_RO(x) (x)
+#define HW_REGISTER_RW(x) (x)
+#if V3D_VERSION >= 41
+#include "libs/core/v3d/registers/4.1.34.0/v3d.h"
+#else
+#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+#endif
+
+#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val)
+#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg)
+
+static void
+v3d_flush_l3()
+{
+ if (!v3d_hw_has_gca(v3d.hw))
+ return;
+
+#if V3D_VERSION < 40
+ uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);
+
+ V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
+ V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
+#endif
+}
+
+/* Invalidates the L2 cache. This is a read-only cache. */
+static void
+v3d_flush_l2(void)
+{
+ V3D_WRITE(V3D_CTL_0_L2CACTL,
+ V3D_CTL_0_L2CACTL_L2CCLR_SET |
+ V3D_CTL_0_L2CACTL_L2CENA_SET);
+}
+
+/* Invalidates texture L2 cachelines */
+static void
+v3d_flush_l2t(void)
+{
+ V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
+ V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
+ V3D_WRITE(V3D_CTL_0_L2TCACTL,
+ V3D_CTL_0_L2TCACTL_L2TFLS_SET |
+ (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
+}
+
+/* Invalidates the slice caches. These are read-only caches. */
+static void
+v3d_flush_slices(void)
+{
+ V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
+}
+
+static void
+v3d_flush_caches(void)
+{
+ v3d_flush_l3();
+ v3d_flush_l2();
+ v3d_flush_l2t();
+ v3d_flush_slices();
+}
+
+static void
+v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle)
+{
+ if (!handle)
+ return;
+
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
+
+ memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size);
+}
+
+static void
+v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle)
+{
+ if (!handle)
+ return;
+
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
+
+ memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size);
+}
+
+static int
+v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_submit_cl *submit = arg;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles;
+
+ for (int i = 0; i < submit->bo_handle_count; i++)
+ v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]);
+
+ v3d_flush_caches();
+
+ if (submit->qma) {
+ V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
+ V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
+ }
+#if V3D_VERSION >= 41
+ if (submit->qts) {
+ V3D_WRITE(V3D_CLE_0_CT0QTS,
+ V3D_CLE_0_CT0QTS_CTQTSEN_SET |
+ submit->qts);
+ }
+#endif
+
+ fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end);
+
+ V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
+ V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);
+
+ /* Wait for bin to complete before firing render, as it seems the
+ * simulator doesn't implement the semaphores.
+ */
+ while (V3D_READ(V3D_CLE_0_CT0CA) !=
+ V3D_READ(V3D_CLE_0_CT0EA)) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end);
+
+ v3d_flush_caches();
+
+ V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
+ V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);
+
+ while (V3D_READ(V3D_CLE_0_CT1CA) !=
+ V3D_READ(V3D_CLE_0_CT1EA)) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ for (int i = 0; i < submit->bo_handle_count; i++)
+ v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_submit_tfu *submit = arg;
+
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]);
+
+ int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
+
+ V3D_WRITE(V3D_TFU_IIA, submit->iia);
+ V3D_WRITE(V3D_TFU_IIS, submit->iis);
+ V3D_WRITE(V3D_TFU_ICA, submit->ica);
+ V3D_WRITE(V3D_TFU_IUA, submit->iua);
+ V3D_WRITE(V3D_TFU_IOA, submit->ioa);
+ V3D_WRITE(V3D_TFU_IOS, submit->ios);
+ V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]);
+ V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]);
+ V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]);
+ V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]);
+
+ V3D_WRITE(V3D_TFU_ICFG, submit->icfg);
+
+ while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_create_bo *create = arg;
+ struct v3d_bo *bo = calloc(1, sizeof(*bo));
+
+ drm_shim_bo_init(&bo->base, create->size);
+ bo->offset = util_vma_heap_alloc(&v3d.heap, create->size, 4096);
+ if (bo->offset == 0)
+ return -ENOMEM;
+
+ bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base;
+#if 0
+ /* Place a mapping of the BO inside of the simulator's address space
+ * for V3D memory. This lets us avoid copy in/out for simpenrose, but
+ * I'm betting we'll need something else for FPGA.
+ */
+ void *sim_addr = v3d.mem + bo->block->ofs;
+ void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, bo->base.fd, 0);
+ assert(mmap_ret == sim_addr);
+#else
+ /* Make a simulator-private mapping of the shim GEM object. */
+ bo->gem_vaddr = mmap(NULL, bo->base.size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ bo->base.fd, 0);
+ if (bo->gem_vaddr == MAP_FAILED) {
+ fprintf(stderr, "v3d: mmap of shim bo failed\n");
+ abort();
+ }
+#endif
+
+ create->offset = bo->offset;
+ create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg)
+{
+ struct drm_v3d_get_param *gp = arg;
+ static const uint32_t reg_map[] = {
+ [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
+ };
+
+ switch (gp->param) {
+ case DRM_V3D_PARAM_SUPPORTS_TFU:
+ gp->value = 1;
+ return 0;
+ }
+
+ if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) {
+ gp->value = V3D_READ(reg_map[gp->param]);
+ return 0;
+ }
+
+ fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param);
+ return -1;
+}
+
+static ioctl_fn_t driver_ioctls[] = {
+ [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl),
+ [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu),
+ [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo,
+ [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo),
+ [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param),
+ [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo,
+ [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset,
+};
+
+static void
+v3d_isr(uint32_t hub_status)
+{
+ /* Check the per-core bits */
+ if (hub_status & (1 << 0)) {
+ uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS);
+
+ if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_0_VIO_ADDR));
+ abort();
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with core status 0x%08x\n",
+ core_status);
+ }
+ abort();
+ }
+
+ return;
+}
+
+static void
+v3dX(simulator_init_regs)(void)
+{
+#if V3D_VERSION == 33
+ /* Set OVRTMUOUT to match kernel behavior.
+ *
+ * This means that the texture sampler uniform configuration's tmu
+ * output type field is used, instead of using the hardware default
+ * behavior based on the texture type. If you want the default
+ * behavior, you can still put "2" in the indirect texture state's
+ * output_type field.
+ */
+ V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
+#endif
+
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET;
+ V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
+ V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
+
+ v3d_hw_set_isr(v3d.hw, v3d_isr);
+}
+
+static void
+v3d_bo_free(struct shim_bo *shim_bo)
+{
+ struct v3d_bo *bo = v3d_bo(shim_bo);
+
+ if (bo->gem_vaddr)
+ munmap(bo->gem_vaddr, shim_bo->size);
+
+ util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size);
+}
+
+void
+v3dX(drm_shim_driver_init)(void)
+{
+ shim_device.driver_ioctls = driver_ioctls;
+ shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls);
+
+ shim_device.driver_bo_free = v3d_bo_free;
+
+ /* Allocate a gig of memory to play in. */
+ v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024);
+ v3d.mem_base =
+ v3d_hw_get_mem(v3d.hw, &v3d.mem_size,
+ &v3d.mem);
+ util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096);
+
+ v3dX(simulator_init_regs)();
+}
diff --git a/lib/mesa/src/broadcom/meson.build b/lib/mesa/src/broadcom/meson.build
index d3ea362f2..57f0d889b 100644
--- a/lib/mesa/src/broadcom/meson.build
+++ b/lib/mesa/src/broadcom/meson.build
@@ -30,6 +30,10 @@ if with_gallium_v3d
subdir('qpu')
endif
+if with_tools.contains('drm-shim')
+ subdir('drm-shim')
+endif
+
per_version_libs = []
foreach ver : v3d_versions
per_version_libs += static_library(
@@ -47,7 +51,7 @@ endforeach
libbroadcom_v3d = static_library(
'libbroadcom_v3d',
[
- files('common/v3d_debug.c', 'clif/clif_dump.c'),
+ files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c'),
v3d_xml_pack,
],
include_directories : [inc_common, inc_broadcom, inc_src],
diff --git a/lib/mesa/src/broadcom/qpu/meson.build b/lib/mesa/src/broadcom/qpu/meson.build
index 279b09cb9..c9cf7b9e9 100644
--- a/lib/mesa/src/broadcom/qpu/meson.build
+++ b/lib/mesa/src/broadcom/qpu/meson.build
@@ -39,7 +39,8 @@ test(
'qpu_disasm',
executable(
'qpu_disasm', 'tests/qpu_disasm.c',
- link_with: [libbroadcom_qpu, libmesa_util],
+ link_with: libbroadcom_qpu,
+ dependencies : idep_mesautil,
include_directories: inc_common
),
suite : ['broadcom'],
diff --git a/lib/mesa/src/broadcom/qpu/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/qpu_disasm.c
index 32e7ba12a..9f59bcdf7 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_disasm.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_disasm.c
@@ -64,7 +64,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
} else if (mux == V3D_QPU_MUX_B) {
if (instr->sig.small_imm) {
uint32_t val;
- MAYBE_UNUSED bool ok =
+ ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
instr->raddr_b,
&val);
@@ -205,6 +205,8 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm,
!sig->ldvary &&
!sig->ldvpm &&
!sig->ldtmu &&
+ !sig->ldtlb &&
+ !sig->ldtlbu &&
!sig->ldunif &&
!sig->ldunifrf &&
!sig->ldunifa &&
diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.c b/lib/mesa/src/broadcom/qpu/qpu_instr.c
index add2d2a23..09d06b3fa 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_instr.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_instr.c
@@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
- if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- switch (inst->alu.add.op) {
- case V3D_QPU_A_RECIP:
- case V3D_QPU_A_RSQRT:
- case V3D_QPU_A_EXP:
- case V3D_QPU_A_LOG:
- case V3D_QPU_A_SIN:
- case V3D_QPU_A_RSQRT2:
- return true;
- default:
- break;
- }
+ if (v3d_qpu_instr_is_sfu(inst))
+ return true;
+ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
return true;
@@ -673,6 +664,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
}
bool
+v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_RECIP:
+ case V3D_QPU_A_RSQRT:
+ case V3D_QPU_A_EXP:
+ case V3D_QPU_A_LOG:
+ case V3D_QPU_A_SIN:
+ case V3D_QPU_A_RSQRT2:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+bool
v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
{
return (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
@@ -683,6 +693,16 @@ v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
}
bool
+v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tmu(inst) &&
+ (!inst->alu.add.magic_write ||
+ inst->alu.add.waddr != V3D_QPU_WADDR_TMUC) &&
+ (!inst->alu.mul.magic_write ||
+ inst->alu.mul.waddr != V3D_QPU_WADDR_TMUC);
+}
+
+bool
v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst)
{
if (inst->sig.ldvpm)
@@ -751,9 +771,6 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtmu)
- return true;
-
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_R4 ||
@@ -768,8 +785,10 @@ v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
}
}
- if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
- inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4) {
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
+ if (inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4)
+ return true;
+ } else if (inst->sig.ldtmu) {
return true;
}
@@ -867,3 +886,70 @@ v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst)
return false;
}
+
+bool
+v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ case V3D_QPU_A_VFPACK:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ switch (inst->alu.mul.op) {
+ case V3D_QPU_M_FMOV:
+ case V3D_QPU_M_FMUL:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+bool
+v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ switch (inst->alu.mul.op) {
+ case V3D_QPU_M_VFMUL:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.h b/lib/mesa/src/broadcom/qpu/qpu_instr.h
index 1e2dcb78a..ad2d37b60 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_instr.h
+++ b/lib/mesa/src/broadcom/qpu/qpu_instr.h
@@ -447,8 +447,10 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
@@ -464,5 +466,7 @@ bool v3d_qpu_reads_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
const struct v3d_qpu_sig *sig) ATTRIBUTE_CONST;
+bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
#endif
diff --git a/lib/mesa/src/broadcom/qpu/qpu_pack.c b/lib/mesa/src/broadcom/qpu/qpu_pack.c
index 70f31d734..516b0cf53 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_pack.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_pack.c
@@ -776,7 +776,11 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_FMIN:
case V3D_QPU_A_FMAX:
case V3D_QPU_A_FCMP:
- instr->alu.add.output_pack = (op >> 4) & 0x3;
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK)
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ else
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
&instr->alu.add.a_unpack)) {
@@ -1042,6 +1046,32 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
opcode |= a_unpack << 2;
opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+
break;
}
@@ -1065,7 +1095,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
if (packed == 0)
return false;
- opcode |= packed << 2;
+ opcode = (opcode & ~(1 << 2)) | packed << 2;
break;
}
diff --git a/lib/mesa/src/broadcom/qpu/qpu_validate.c b/lib/mesa/src/broadcom/qpu/qpu_validate.c
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/lib/mesa/src/broadcom/qpu/qpu_validate.c
diff --git a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
index 2e8d98058..1bc3c9ec6 100644
--- a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
@@ -48,6 +48,9 @@ static const struct {
{ 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" },
{ 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" },
+ { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" },
+ { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" },
+
/* small immediates */
{ 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" },
{ 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" },