16 files changed, 654 insertions, 2431 deletions
diff --git a/lib/mesa/src/panfrost/bifrost/ISA.xml b/lib/mesa/src/panfrost/bifrost/ISA.xml
index f1e908331..b5965fd3c 100644
--- a/lib/mesa/src/panfrost/bifrost/ISA.xml
+++ b/lib/mesa/src/panfrost/bifrost/ISA.xml
@@ -1986,7 +1986,7 @@
     <src start="0" mask="0xfb"/>
   </ins>
 
-  <ins name="*NOP" mask="0x7fffff" exact="0x701963" dests="0"/>
+  <ins name="*NOP.i32" mask="0x7fffff" exact="0x701963"/>
 
   <ins name="*POPCOUNT.i32" mask="0x7ffff8" exact="0x73c6d8">
     <src start="0" mask="0xfb"/>
@@ -2036,7 +2036,6 @@
       <opt>not</opt>
       <opt>none</opt>
     </mod>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
   </ins>
 
   <ins name="*RSHIFT_AND.v2i16">
@@ -2057,7 +2056,6 @@
       <opt>not</opt>
       <opt>none</opt>
     </mod>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <encoding mask="0x7f3800" exact="0x300800">
       <or>
         <eq left="lanes2" right="#b00"/>
@@ -2091,7 +2089,6 @@
     <src start="0" mask="0xfb"/>
     <src start="3" mask="0xfb"/>
     <src start="6"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <mod name="lanes2" size="3" default="b0123">
       <opt>b0123</opt>
       <opt>b0000</opt>
@@ -2147,7 +2144,6 @@
       <opt>not</opt>
       <opt>none</opt>
     </mod>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <mod name="not_result" start="15" size="1" opt="not"/>
   </ins>
 
@@ -2169,7 +2165,6 @@
       <opt>none</opt>
     </mod>
     <mod name="not_result" start="15" size="1" opt="not"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <encoding mask="0x7f3800" exact="0x302800">
       <or>
         <eq left="lanes2" right="#b00"/>
@@ -2215,7 +2210,6 @@
       <opt>none</opt>
     </mod>
     <mod name="not_result" start="15" size="1" opt="not"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <encoding mask="0x7f3800" exact="0x302000">
       <neq left="lanes2" right="#b0123"/>
       <derived start="9" size="2">
@@ -2241,7 +2235,6 @@
       <opt>b3</opt>
     </mod>
     <mod name="not_result" start="13" size="1" opt="not"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
   </ins>
 
   <ins name="*RSHIFT_XOR.v2i16">
@@ -2258,7 +2251,6 @@
       <opt>b02</opt>
     </mod>
     <mod name="not_result" start="13" size="1" opt="not"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <encoding mask="0x7fd800" exact="0x320800">
       <or>
         <eq left="lanes2" right="#b00"/>
@@ -2300,7 +2292,6 @@
       <opt>b3333</opt>
     </mod>
     <mod name="not_result" start="13" size="1" opt="not"/>
-    <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
     <encoding mask="0x7fd800" exact="0x320000">
       <neq left="lanes2" right="#b0123"/>
       <derived start="9" size="2">
@@ -2429,7 +2420,6 @@
       <opt>rtz</opt>
       <opt>rtna</opt>
     </mod>
-    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
     <derived start="6" size="1">
       <and>
         <eq left="abs0" right="#none"/>
@@ -2496,9 +2486,6 @@
   <ins name="+ATEST" staging="w=1" mask="0xfff00" exact="0xc8f00" message="atest" table="true">
     <src start="0" mask="0xf7"/>
     <src start="3" mask="0xf7"/>
-    <!-- ATEST parameter datum. Implicitly encoded into the tuple on Bifrost.
-         Real source on Valhall. -->
-    <src start="6" pseudo="true"/>
     <mod name="widen1" start="6" size="2">
       <reserved/>
       <opt>none</opt>
@@ -2533,22 +2520,8 @@
     <src start="0"/>
     <src start="3" mask="0xf7"/>
     <src start="6" mask="0xf7"/>
-    <!-- pseudo source for a dual source blend input -->
-    <src start="9" pseudo="true"/>
     <!-- not actually encoded, but used for IR -->
     <immediate name="sr_count" size="4" pseudo="true"/>
-    <immediate name="sr_count_2" size="4" pseudo="true"/>
-    <mod name="register_format" size="4" pseudo="true">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-      <opt>f64</opt>
-      <opt>i64</opt>
-      <opt>auto</opt>
-    </mod>
   </ins>
 
   <ins name="+BRANCH.f16" mask="0xf8000" exact="0x68000" last="true" dests="0">
@@ -3716,12 +3689,12 @@
     <src start="6" mask="0xf7"/>
   </ins>
 
-  <ins name="+CLPER_OLD.i32" mask="0xfffc0" exact="0x3f0c0">
+  <ins name="+CLPER_V6.i32" mask="0xfffc0" exact="0x3f0c0">
     <src start="0" mask="0x7"/>
     <src start="3"/>
   </ins>
 
-  <ins name="+CLPER.i32" mask="0xfc000" exact="0x7c000">
+  <ins name="+CLPER_V7.i32" mask="0xfc000" exact="0x7c000">
     <src start="0" mask="0x7"/>
     <src start="3"/>
     <mod name="lane_op" start="6" size="2">
@@ -3734,7 +3707,6 @@
       <opt>subgroup2</opt>
       <opt>subgroup4</opt>
       <opt>subgroup8</opt>
-      <opt pseudo="true">subgroup16</opt> <!-- Only on Valhall -->
     </mod>
     <mod name="inactive_result" start="10" size="4">
       <opt>zero</opt>
@@ -3874,7 +3846,6 @@
       <opt>h0</opt>
       <opt>h1</opt>
     </mod>
-    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
   </ins>
 
   <ins name="+F16_TO_S32">
@@ -6195,7 +6166,7 @@
     <src start="6" mask="0xf7"/>
   </ins>
 
-  <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" message="job" dests="0">
+  <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" unused="true" message="job_management">
     <src start="0"/>
   </ins>
 
@@ -6381,7 +6352,7 @@
     </mod>
   </ins>
 
-  <ins name="+LD_TILE" staging="w=format" mask="0xff800" exact="0xcb000" message="tile">
+  <ins name="+LD_TILE" staging="w=vecsize" mask="0xff800" exact="0xcb000" message="tile">
     <src start="0"/>
     <src start="3"/>
     <src start="6" mask="0xf7"/>
@@ -6391,15 +6362,9 @@
       <opt>v3</opt>
       <opt>v4</opt>
     </mod>
-    <mod name="register_format" size="3" pseudo="true">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>s32</opt>
-    </mod>
   </ins>
 
-  <ins name="+LD_VAR" staging="w=format" message="varying">
+  <ins name="+LD_VAR" staging="w=vecsize" message="varying">
     <src start="0"/>
     <src start="3"/>
     <mod name="vecsize" start="8" size="2">
@@ -7105,7 +7070,6 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i16" staging="w=1" message="load">
@@ -7121,7 +7085,7 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <mod name="lane_dest" size="2" default="h0">
+    <mod name="lane0" size="2" default="h0">
       <opt>h0</opt>
       <opt>h1</opt>
       <opt>w0</opt>
@@ -7136,19 +7100,19 @@
       <and>
         <eq left="extend" right="#none"/>
         <or>
-          <eq left="lane_dest" right="#h0"/>
-          <eq left="lane_dest" right="#h1"/>
+          <eq left="lane0" right="#h0"/>
+          <eq left="lane0" right="#h1"/>
         </or>
       </and>
       <derived start="9" size="1">
-        <eq left="lane_dest" right="#h0"/>
-        <eq left="lane_dest" right="#h1"/>
+        <eq left="lane0" right="#h0"/>
+        <eq left="lane0" right="#h1"/>
       </derived>
     </encoding>
     <encoding mask="0xffc00" exact="0x63000">
       <and>
         <neq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#w0"/>
+        <eq left="lane0" right="#w0"/>
       </and>
       <derived start="9" size="1">
         <eq left="extend" right="#sext"/>
@@ -7158,14 +7122,13 @@
     <encoding mask="0xffc00" exact="0x61800">
       <and>
         <neq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#d0"/>
+        <eq left="lane0" right="#d0"/>
       </and>
       <derived start="9" size="1">
         <eq left="extend" right="#sext"/>
         <eq left="extend" right="#zext"/>
       </derived>
     </encoding>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i24" staging="w=1" mask="0xffe00" exact="0x65000" message="load">
@@ -7181,7 +7144,6 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i32" staging="w=1" message="load">
@@ -7197,7 +7159,7 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <mod name="lane_dest" size="1" opt="d0"/>
+    <mod name="lane0" size="1" opt="d0"/>
     <mod name="extend" size="2">
       <opt>none</opt>
       <opt>sext</opt>
@@ -7206,20 +7168,19 @@
     <encoding mask="0xffe00" exact="0x60c00">
       <and>
         <eq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#none"/>
+        <eq left="lane0" right="#none"/>
       </and>
     </encoding>
     <encoding mask="0xffc00" exact="0x61c00">
       <and>
         <neq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#d0"/>
+        <eq left="lane0" right="#d0"/>
       </and>
       <derived start="9" size="1">
         <eq left="extend" right="#sext"/>
         <eq left="extend" right="#zext"/>
       </derived>
     </encoding>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i48" staging="w=2" mask="0xffe00" exact="0x65200" message="load">
@@ -7235,7 +7196,6 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i64" staging="w=2" mask="0xffe00" exact="0x60e00" message="load">
@@ -7251,7 +7211,6 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i8" staging="w=1" message="load">
@@ -7267,7 +7226,7 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <mod name="lane_dest" size="3" default="b0">
+    <mod name="lane0" size="3" default="b0">
       <opt>b0</opt>
       <opt>b1</opt>
       <opt>b2</opt>
@@ -7286,25 +7245,25 @@
       <and>
         <eq left="extend" right="#none"/>
         <or>
-          <eq left="lane_dest" right="#b0"/>
-          <eq left="lane_dest" right="#b1"/>
-          <eq left="lane_dest" right="#b2"/>
-          <eq left="lane_dest" right="#b3"/>
+          <eq left="lane0" right="#b0"/>
+          <eq left="lane0" right="#b1"/>
+          <eq left="lane0" right="#b2"/>
+          <eq left="lane0" right="#b3"/>
         </or>
       </and>
       <derived start="9" size="2">
-        <eq left="lane_dest" right="#b0"/>
-        <eq left="lane_dest" right="#b1"/>
-        <eq left="lane_dest" right="#b2"/>
-        <eq left="lane_dest" right="#b3"/>
+        <eq left="lane0" right="#b0"/>
+        <eq left="lane0" right="#b1"/>
+        <eq left="lane0" right="#b2"/>
+        <eq left="lane0" right="#b3"/>
       </derived>
     </encoding>
     <encoding mask="0xff800" exact="0x63800">
       <and>
         <neq left="extend" right="#none"/>
         <or>
-          <eq left="lane_dest" right="#h0"/>
-          <eq left="lane_dest" right="#h1"/>
+          <eq left="lane0" right="#h0"/>
+          <eq left="lane0" right="#h1"/>
         </or>
       </and>
       <derived start="9" size="1">
@@ -7312,14 +7271,14 @@
         <eq left="extend" right="#zext"/>
       </derived>
       <derived start="10" size="1">
-        <eq left="lane_dest" right="#h0"/>
-        <eq left="lane_dest" right="#h1"/>
+        <eq left="lane0" right="#h0"/>
+        <eq left="lane0" right="#h1"/>
       </derived>
     </encoding>
     <encoding mask="0xffc00" exact="0x63400">
       <and>
         <neq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#w0"/>
+        <eq left="lane0" right="#w0"/>
       </and>
       <derived start="9" size="1">
         <eq left="extend" right="#sext"/>
@@ -7329,14 +7288,13 @@
     <encoding mask="0xffc00" exact="0x61400">
       <and>
         <neq left="extend" right="#none"/>
-        <eq left="lane_dest" right="#d0"/>
+        <eq left="lane0" right="#d0"/>
       </and>
       <derived start="9" size="1">
         <eq left="extend" right="#sext"/>
         <eq left="extend" right="#zext"/>
       </derived>
     </encoding>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOAD.i96" staging="w=3" mask="0xffe00" exact="0x65400" message="load">
@@ -7352,7 +7310,6 @@
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+LOGB.f32" mask="0xfffe0" exact="0x3d9a0">
@@ -7438,7 +7395,7 @@
     </mod>
   </ins>
 
-  <ins name="+NOP" mask="0xfffff" exact="0x3d964" dests="0"/>
+  <ins name="+NOP.i32" mask="0xfffff" exact="0x3d964"/>
 
   <ins name="+QUIET.f32" mask="0xffff8" exact="0x3d970">
     <src start="0"/>
@@ -7562,12 +7519,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i16" staging="r=1" mask="0xffe00" exact="0x62800" message="store" dests="0">
@@ -7578,12 +7534,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i24" staging="r=1" mask="0xffe00" exact="0x65800" message="store" dests="0">
@@ -7594,12 +7549,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i32" staging="r=1" mask="0xffe00" exact="0x62c00" message="store" dests="0">
@@ -7610,12 +7564,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i48" staging="r=2" mask="0xffe00" exact="0x65a00" message="store" dests="0">
@@ -7626,12 +7579,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i64" staging="r=2" mask="0xffe00" exact="0x62e00" message="store" dests="0">
@@ -7642,12 +7594,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i8" staging="r=1" mask="0xffe00" exact="0x62000" message="store" dests="0">
@@ -7658,12 +7609,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+STORE.i96" staging="r=3" mask="0xffe00" exact="0x65c00" message="store" dests="0">
@@ -7674,12 +7624,11 @@
       <opt>none</opt>
       <opt>wls</opt>
       <opt>stream</opt>
-      <opt pseudo="true">pos</opt>
-      <opt pseudo="true">vary</opt>
+      <reserved/>
+      <reserved/>
       <reserved/>
       <opt>tl</opt>
     </mod>
-    <immediate name="byte_offset" size="16" pseudo="true"/>
   </ins>
 
   <ins name="+ST_CVT" staging="r=format" mask="0xff800" exact="0xc9800" message="store" dests="0">
@@ -7704,7 +7653,7 @@
     </mod>
   </ins>
 
-  <ins name="+ST_TILE" staging="r=format" mask="0xff800" exact="0xcb800" message="tile" dests="0">
+  <ins name="+ST_TILE" staging="r=vecsize" mask="0xff800" exact="0xcb800" message="tile" dests="0">
     <src start="0"/>
     <src start="3"/>
     <src start="6" mask="0xf7"/>
@@ -7714,12 +7663,6 @@
       <opt>v3</opt>
       <opt>v4</opt>
     </mod>
-    <mod name="register_format" size="3" pseudo="true">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>s32</opt>
-    </mod>
   </ins>
 
   <ins name="+SWZ.v2i16" mask="0xfffc8" exact="0x3d948">
@@ -7753,27 +7696,6 @@
     <mod name="skip" start="9" size="1" opt="skip"/>
     <!-- not actually encoded, but used for IR -->
     <immediate name="sr_count" size="4" pseudo="true"/>
-    <immediate name="sr_count_2" size="4" pseudo="true"/>
-    <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true">
-      <opt>computed_lod</opt>
-      <opt>zero_lod</opt>
-    </mod>
-  </ins>
-
-  <!-- Pseudo instruction representing dual texturing on Bifrost. Lowered to
-       TEXC after register allocation, when the second destination register can
-       be combined with the texture operation descriptor. -->
-  <ins name="+TEXC_DUAL" staging="rw=sr_count" pseudo="true" message="tex" dests="2">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6" mask="0xf7"/>
-    <mod name="skip" start="9" size="1" opt="skip"/>
-    <immediate name="sr_count" size="4" pseudo="true"/>
-    <immediate name="sr_count_2" size="4" pseudo="true"/>
-    <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true">
-      <opt>computed_lod</opt>
-      <opt>zero_lod</opt>
-    </mod>
   </ins>
 
   <ins name="+TEXS_2D.f16" staging="w=2" mask="0xfc000" exact="0xd8000" message="tex">
@@ -7959,7 +7881,6 @@
       <opt>rtz</opt>
       <opt>rtna</opt>
     </mod>
-    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
     <derived start="6" size="1">
       <and>
         <eq left="abs0" right="#none"/>
@@ -8261,11 +8182,11 @@
     <mod name="preserve_null" size="1" opt="preserve_null"/>
   </ins>
 
-  <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX. Real Valhall instructions. -->
-  <ins name="+ATOM_RETURN.i32" pseudo="true" staging="rw=sr_count" message="atomic">
+  <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX -->
+  <ins name="+PATOM_C.i32" pseudo="true" staging="rw=sr_count" message="atomic">
     <src start="0"/>
     <src start="3"/>
-    <mod name="atom_opc" start="9" size="5">
+    <mod name="atom_opc" start="9" size="4">
       <reserved/>
       <reserved/>
       <opt>aadd</opt>
@@ -8281,14 +8202,10 @@
       <opt>aand</opt>
       <opt>aor</opt>
       <opt>axor</opt>
-      <opt>axchg</opt> <!-- For Valhall -->
-      <opt>acmpxchg</opt> <!-- For Valhall -->
     </mod>
-    <!-- not actually encoded, but used for IR -->
-    <immediate name="sr_count" size="4" pseudo="true"/>
   </ins>
 
-  <ins name="+ATOM1_RETURN.i32" pseudo="true" staging="w=sr_count" message="atomic">
+  <ins name="+PATOM_C1.i32" pseudo="true" staging="w=sr_count" message="atomic">
     <src start="0"/>
     <src start="3"/>
     <mod name="atom_opc" start="6" size="3">
@@ -8298,32 +8215,6 @@
       <opt>asmax1</opt>
       <opt>aor1</opt>
     </mod>
-    <!-- not actually encoded, but used for IR -->
-    <immediate name="sr_count" size="4" pseudo="true"/>
-  </ins>
-
-  <ins name="+ATOM.i32" pseudo="true" staging="r=sr_count" message="atomic">
-    <src start="0"/>
-    <src start="3"/>
-    <mod name="atom_opc" start="9" size="4">
-      <reserved/>
-      <reserved/>
-      <opt>aadd</opt>
-      <reserved/>
-      <reserved/>
-      <reserved/>
-      <reserved/>
-      <reserved/>
-      <opt>asmin</opt>
-      <opt>asmax</opt>
-      <opt>aumin</opt>
-      <opt>aumax</opt>
-      <opt>aand</opt>
-      <opt>aor</opt>
-      <opt>axor</opt>
-    </mod>
-    <!-- not actually encoded, but used for IR -->
-    <immediate name="sr_count" size="4" pseudo="true"/>
   </ins>
 
   <!-- *CUBEFACE1/+CUBEFACE2 pair, two destinations, scheduler lowered -->
@@ -8336,982 +8227,4 @@
     <mod name="neg2" size="1" opt="neg"/>
   </ins>
 
-  <ins name="+IADD_IMM.i32" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" size="32"/>
-  </ins>
-
-  <ins name="+IADD_IMM.v2i16" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" size="32"/>
-  </ins>
-
-  <ins name="+IADD_IMM.v4i8" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" size="32"/>
-  </ins>
-
-  <ins name="+FADD_IMM.f32" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" size="32"/>
-  </ins>
-
-  <ins name="+FADD_IMM.v2f16" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" size="32"/>
-  </ins>
-
-  <ins name="*FABSNEG.f32" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="abs0" start="12" size="1" opt="abs"/>
-    <mod name="widen0" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FABSNEG.v2f16" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <mod name="abs0" size="1" opt="abs"/>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="swz0" start="9" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FCLAMP.f32" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <mod name="clamp" start="15" size="2">
-      <opt>none</opt>
-      <opt>clamp_0_inf</opt>
-      <opt>clamp_m1_1</opt>
-      <opt>clamp_0_1</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FCLAMP.v2f16" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <mod name="clamp" start="15" size="2">
-      <opt>none</opt>
-      <opt>clamp_0_inf</opt>
-      <opt>clamp_m1_1</opt>
-      <opt>clamp_0_1</opt>
-    </mod>
-  </ins>
-
-  <ins name="+DISCARD.b32" pseudo="true" dests="0">
-    <src start="0"/>
-    <mod name="widen0" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-  </ins>
-
-  <ins name="+TEX_SINGLE" staging="rw=sr_count" message="tex" pseudo="true">
-    <src start="0"/>
-    <src start="1"/>
-    <immediate name="sr_count" size="4" pseudo="true"/>
-    <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
-    <mod name="skip" start="9" size="1" opt="skip"/>
-    <mod name="shadow" start="9" size="1" opt="shadow"/>
-    <mod name="array_enable" start="9" size="1" opt="array_enable"/>
-    <mod name="dimension" start="9" size="2">
-      <opt>1d</opt>
-      <opt>2d</opt>
-      <opt>3d</opt>
-      <opt>cube</opt>
-    </mod>
-    <mod name="write_mask" start="9" size="4">
-      <opt>none</opt>
-      <opt>r</opt>
-      <opt>g</opt>
-      <opt>rg</opt>
-      <opt>b</opt>
-      <opt>rb</opt>
-      <opt>gb</opt>
-      <opt>rgb</opt>
-      <opt>a</opt>
-      <opt>ra</opt>
-      <opt>ga</opt>
-      <opt>rga</opt>
-      <opt>ba</opt>
-      <opt>rba</opt>
-      <opt>gba</opt>
-      <opt>rgba</opt>
-    </mod>
-    <mod name="va_lod_mode" start="13" size="3" default="zero_lod">
-      <opt>zero_lod</opt>
-      <opt>computed_lod</opt>
-      <opt>explicit</opt>
-      <opt>computed_bias</opt>
-      <opt>grdesc</opt>
-    </mod>
-    <mod name="register_format" size="4">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-    </mod>
-  </ins>
-
-  <ins name="+TEX_FETCH" staging="rw=sr_count" message="tex" pseudo="true">
-    <src start="0"/>
-    <src start="1"/>
-    <immediate name="sr_count" size="4" pseudo="true"/>
-    <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
-    <mod name="skip" start="9" size="1" opt="skip"/>
-    <mod name="array_enable" start="9" size="1" opt="array_enable"/>
-    <mod name="dimension" start="9" size="2">
-      <opt>1d</opt>
-      <opt>2d</opt>
-      <opt>3d</opt>
-      <opt>cube</opt>
-    </mod>
-    <mod name="write_mask" start="9" size="4">
-      <opt>none</opt>
-      <opt>r</opt>
-      <opt>g</opt>
-      <opt>rg</opt>
-      <opt>b</opt>
-      <opt>rb</opt>
-      <opt>gb</opt>
-      <opt>rgb</opt>
-      <opt>a</opt>
-      <opt>ra</opt>
-      <opt>ga</opt>
-      <opt>rga</opt>
-      <opt>ba</opt>
-      <opt>rba</opt>
-      <opt>gba</opt>
-      <opt>rgba</opt>
-    </mod>
-    <mod name="register_format" size="4">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-    </mod>
-  </ins>
-
-  <ins name="+TEX_GATHER" staging="rw=sr_count" message="tex" pseudo="true">
-    <src start="0"/>
-    <src start="1"/>
-    <immediate name="sr_count" size="4" pseudo="true"/>
-    <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
-    <mod name="skip" start="9" size="1" opt="skip"/>
-    <mod name="shadow" start="9" size="1" opt="shadow"/>
-    <mod name="array_enable" start="9" size="1" opt="array_enable"/>
-    <mod name="integer_coordinates" start="9" size="1" opt="integer_coordinates"/>
-    <mod name="fetch_component" start="9" size="2">
-      <opt>gather4_r</opt>
-      <opt>gather4_g</opt>
-      <opt>gather4_b</opt>
-      <opt>gather4_a</opt>
-    </mod>
-    <mod name="dimension" start="9" size="2">
-      <opt>1d</opt>
-      <opt>2d</opt>
-      <opt>3d</opt>
-      <opt>cube</opt>
-    </mod>
-    <mod name="write_mask" start="9" size="4">
-      <opt>none</opt>
-      <opt>r</opt>
-      <opt>g</opt>
-      <opt>rg</opt>
-      <opt>b</opt>
-      <opt>rb</opt>
-      <opt>gb</opt>
-      <opt>rgb</opt>
-      <opt>a</opt>
-      <opt>ra</opt>
-      <opt>ga</opt>
-      <opt>rga</opt>
-      <opt>ba</opt>
-      <opt>rba</opt>
-      <opt>gba</opt>
-      <opt>rgba</opt>
-    </mod>
-    <mod name="register_format" size="4">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-    </mod>
-  </ins>
-
-  <ins name="+CUBEFACE2_V9" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <src start="3" mask="0xfb"/>
-    <src start="6"/>
-    <mod name="neg0" size="1" opt="neg"/>
-    <mod name="neg1" size="1" opt="neg"/>
-    <mod name="neg2" size="1" opt="neg"/>
-  </ins>
-
-  <ins name="+LD_VAR_BUF_IMM.f32" staging="w=format" message="varying" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" start="3" size="5"/>
-    <mod name="vecsize" start="8" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-    <mod name="update" size="2">
-      <opt>store</opt>
-      <opt>retrieve</opt>
-      <opt>conditional</opt>
-      <opt>clobber</opt>
-    </mod>
-    <mod name="register_format" size="2">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>u16</opt>
-    </mod>
-    <mod name="source_format" size="2">
-      <opt>flat32</opt>
-      <opt>flat16</opt>
-      <opt>f32</opt>
-      <opt>f16</opt>
-    </mod>
-    <mod name="sample" size="3">
-      <opt>center</opt>
-      <opt>centroid</opt>
-      <opt>sample</opt>
-      <opt>explicit</opt>
-      <opt>none</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_VAR_BUF.f32" staging="w=format" message="varying" pseudo="true">
-    <src start="0"/>
-    <src start="1"/>
-    <mod name="vecsize" start="8" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-    <mod name="update" size="2">
-      <opt>store</opt>
-      <opt>retrieve</opt>
-      <opt>conditional</opt>
-      <opt>clobber</opt>
-    </mod>
-    <mod name="register_format" size="2">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>u16</opt>
-    </mod>
-    <mod name="source_format" size="2">
-      <opt>flat32</opt>
-      <opt>flat16</opt>
-      <opt>f32</opt>
-      <opt>f16</opt>
-    </mod>
-    <mod name="sample" size="3">
-      <opt>center</opt>
-      <opt>centroid</opt>
-      <opt>sample</opt>
-      <opt>explicit</opt>
-      <opt>none</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_VAR_BUF_IMM.f16" staging="w=format" message="varying" pseudo="true">
-    <src start="0"/>
-    <immediate name="index" start="3" size="5"/>
-    <mod name="vecsize" start="8" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-    <mod name="update" size="2">
-      <opt>store</opt>
-      <opt>retrieve</opt>
-      <opt>conditional</opt>
-      <opt>clobber</opt>
-    </mod>
-    <mod name="register_format" size="2">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>u16</opt>
-    </mod>
-    <mod name="source_format" size="2">
-      <opt>flat32</opt>
-      <opt>flat16</opt>
-      <opt>f32</opt>
-      <opt>f16</opt>
-    </mod>
-    <mod name="sample" size="3">
-      <opt>center</opt>
-      <opt>centroid</opt>
-      <opt>sample</opt>
-      <opt>explicit</opt>
-      <opt>none</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_VAR_BUF.f16" staging="w=format" message="varying" pseudo="true">
-    <src start="0"/>
-    <src start="1"/>
-    <mod name="vecsize" start="8" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-    <mod name="update" size="2">
-      <opt>store</opt>
-      <opt>retrieve</opt>
-      <opt>conditional</opt>
-      <opt>clobber</opt>
-    </mod>
-    <mod name="register_format" size="2">
-      <opt>f32</opt>
-      <opt>f16</opt>
-      <opt>u32</opt>
-      <opt>u16</opt>
-    </mod>
-    <mod name="source_format" size="2">
-      <opt>flat32</opt>
-      <opt>flat16</opt>
-      <opt>f32</opt>
-      <opt>f16</opt>
-    </mod>
-    <mod name="sample" size="3">
-      <opt>center</opt>
-      <opt>centroid</opt>
-      <opt>sample</opt>
-      <opt>explicit</opt>
-      <opt>none</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LEA_BUF_IMM" staging="w=2" message="attribute" pseudo="true">
-    <src start="0"/>
-  </ins>
-
-  <ins name="+LD_BUFFER.i128" staging="w=4" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-  </ins>
-
-  <ins name="+LD_BUFFER.i16" staging="w=1" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-    <mod name="lane_dest" size="2" default="h0">
-      <opt>h0</opt>
-      <opt>h1</opt>
-      <opt>w0</opt>
-      <opt>d0</opt>
-    </mod>
-    <mod name="extend" size="2">
-      <opt>none</opt>
-      <opt>sext</opt>
-      <opt>zext</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_BUFFER.i24" staging="w=1" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-  </ins>
-
-  <ins name="+LD_BUFFER.i32" staging="w=1" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-    <mod name="lane_dest" size="1" opt="d0"/>
-    <mod name="extend" size="2">
-      <opt>none</opt>
-      <opt>sext</opt>
-      <opt>zext</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_BUFFER.i48" staging="w=2" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-  </ins>
-
-  <ins name="+LD_BUFFER.i64" staging="w=2" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-  </ins>
-
-  <ins name="+LD_BUFFER.i8" staging="w=1" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-    <mod name="lane_dest" size="3" default="b0">
-      <opt>b0</opt>
-      <opt>b1</opt>
-      <opt>b2</opt>
-      <opt>b3</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-      <opt>w0</opt>
-      <opt>d0</opt>
-    </mod>
-    <mod name="extend" size="2">
-      <opt>none</opt>
-      <opt>sext</opt>
-      <opt>zext</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_BUFFER.i96" staging="w=3" pseudo="true" message="load">
-    <src start="0"/>
-    <src start="3"/>
-  </ins>
-
-  <ins name="+BRANCHZI" pseudo="true" last="true" dests="0">
-    <src start="0"/>
-    <src start="6" mask="0xf7"/>
-    <mod name="cmpf" size="1">
-      <opt>eq</opt>
-      <opt>ne</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_TEX" pseudo="true" staging="w=format" message="attribute">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="register_format" size="4">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-      <opt>f64</opt>
-      <opt>i64</opt>
-      <opt>auto</opt>
-    </mod>
-    <mod name="vecsize" start="11" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-  </ins>
-
-  <ins name="+LD_TEX_IMM" pseudo="true" staging="w=format" message="attribute">
-    <src start="0"/>
-    <src start="3"/>
-    <immediate name="texture_index" start="6" size="4"/>
-    <mod name="register_format" size="4">
-      <opt>f16</opt>
-      <opt>f32</opt>
-      <opt>s32</opt>
-      <opt>u32</opt>
-      <opt>s16</opt>
-      <opt>u16</opt>
-      <opt>f64</opt>
-      <opt>i64</opt>
-      <opt>auto</opt>
-    </mod>
-    <mod name="vecsize" start="11" size="2">
-      <opt>none</opt>
-      <opt>v2</opt>
-      <opt>v3</opt>
-      <opt>v4</opt>
-    </mod>
-  </ins>
-
-  <ins name="*MKVEC.v2i8" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="lane0" start="12" size="2" default="b0">
-      <opt>b0</opt>
-      <opt>b1</opt>
-      <opt>b2</opt>
-      <opt>b3</opt>
-    </mod>
-    <mod name="lane1" start="13" size="2" default="b0">
-      <opt>b0</opt>
-      <opt>b1</opt>
-      <opt>b2</opt>
-      <opt>b3</opt>
-    </mod>
-  </ins>
-
-  <ins name="+PHI" pseudo="true" variable_srcs="true"/>
-
-  <ins name="+COLLECT.i32" pseudo="true" variable_srcs="true"/>
-
-  <ins name="+SPLIT.i32" pseudo="true" variable_dests="true">
-    <src start="0"/>
-  </ins>
-
-  <ins name="*FCMP_OR.f32" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <src start="3" mask="0xfb"/>
-    <src start="6" mask="0xfb"/>
-    <mod name="widen0" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-    <mod name="widen1" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-    <mod name="abs1" start="6" size="1" opt="abs"/>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="neg1" start="8" size="1" opt="neg"/>
-    <mod name="abs0" start="12" size="1" opt="abs"/>
-    <mod name="cmpf" start="13" size="3">
-      <opt>eq</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>ne</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-      <opt>gtlt</opt>
-      <opt>total</opt>
-    </mod>
-    <mod name="result_type" start="16" size="2" default="i1">
-      <opt>i1</opt>
-      <opt>f1</opt>
-      <opt>m1</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FCMP_OR.v2f16" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <src start="3" mask="0xfb"/>
-    <src start="6" mask="0xfb"/>
-    <mod name="abs0" size="1" opt="abs"/>
-    <mod name="abs1" size="1" opt="abs"/>
-    <mod name="cmpf" size="3">
-      <opt>eq</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>ne</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-      <opt>gtlt</opt>
-      <opt>total</opt>
-    </mod>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="neg1" start="8" size="1" opt="neg"/>
-    <mod name="swz0" start="9" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="11" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="16" size="2" default="i1">
-      <opt>i1</opt>
-      <opt>f1</opt>
-      <opt>m1</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FCMP_AND.f32" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <src start="3" mask="0xfb"/>
-    <src start="6" mask="0xfb"/>
-    <mod name="widen0" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-    <mod name="widen1" size="2">
-      <opt>none</opt>
-      <opt>h0</opt>
-      <opt>h1</opt>
-    </mod>
-    <mod name="abs1" start="6" size="1" opt="abs"/>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="neg1" start="8" size="1" opt="neg"/>
-    <mod name="abs0" start="12" size="1" opt="abs"/>
-    <mod name="cmpf" start="13" size="3">
-      <opt>eq</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>ne</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-      <opt>gtlt</opt>
-      <opt>total</opt>
-    </mod>
-    <mod name="result_type" start="16" size="2" default="i1">
-      <opt>i1</opt>
-      <opt>f1</opt>
-      <opt>m1</opt>
-    </mod>
-  </ins>
-
-  <ins name="*FCMP_AND.v2f16" pseudo="true">
-    <src start="0" mask="0xfb"/>
-    <src start="3" mask="0xfb"/>
-    <src start="6" mask="0xfb"/>
-    <mod name="abs0" size="1" opt="abs"/>
-    <mod name="abs1" size="1" opt="abs"/>
-    <mod name="cmpf" size="3">
-      <opt>eq</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>ne</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-      <opt>gtlt</opt>
-      <opt>total</opt>
-    </mod>
-    <mod name="neg0" start="7" size="1" opt="neg"/>
-    <mod name="neg1" start="8" size="1" opt="neg"/>
-    <mod name="swz0" start="9" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="11" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="16" size="2" default="i1">
-      <opt>i1</opt>
-      <opt>f1</opt>
-      <opt>m1</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_MULTI.s32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_MULTI.u32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_OR.s32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_OR.u32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_OR.v2s16" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="swz0" start="6" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="8" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_OR.v2u16" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="swz0" start="6" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="8" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_OR.v4s8" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-    <derived start="6" size="1">
-      <eq left="cmpf" right="#gt"/>
-      <eq left="cmpf" right="#ge"/>
-    </derived>
-  </ins>
-
-  <ins name="+ICMP_OR.v4u8" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_AND.s32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_AND.u32" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_AND.v2s16" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="swz0" start="6" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="8" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_AND.v2u16" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="swz0" start="6" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="swz1" start="8" size="2" default="h01">
-      <opt>h00</opt>
-      <opt>h10</opt>
-      <opt>h01</opt>
-      <opt>h11</opt>
-    </mod>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
-  <ins name="+ICMP_AND.v4s8" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-    <derived start="6" size="1">
-      <eq left="cmpf" right="#gt"/>
-      <eq left="cmpf" right="#ge"/>
-    </derived>
-  </ins>
-
-  <ins name="+ICMP_AND.v4u8" pseudo="true">
-    <src start="0"/>
-    <src start="3"/>
-    <src start="6"/>
-    <mod name="result_type" start="10" size="1" default="i1">
-      <opt>i1</opt>
-      <opt>m1</opt>
-    </mod>
-    <mod name="cmpf" size="2">
-      <opt>eq</opt>
-      <opt>ne</opt>
-      <opt>gt</opt>
-      <opt>ge</opt>
-      <opt>lt</opt>
-      <opt>le</opt>
-    </mod>
-  </ins>
-
 </bifrost>
diff --git a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
index 4ce47fb05..903ef4e02 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
@@ -19,9 +19,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
-    "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
-    "not_result", "skip", "round", "ftz"])
+SKIP = set(["lane", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", "not_result", "skip"])
 
 TEMPLATE = """
 #ifndef _BI_BUILDER_H_
@@ -30,11 +28,6 @@ TEMPLATE = """
 #include "compiler.h"
 
 <%
-# For <32-bit loads/stores, the default extend `none` with a natural sized
-# input is not encodeable! To avoid a footgun, swap the default to `zext` which
-# will work as expected
-ZEXT_DEFAULT = set(["LOAD.i8", "LOAD.i16", "LOAD.i24", "STORE.i8", "STORE.i16", "STORE.i24"])
-
 def nirtypes(opcode):
     split = opcode.split('.', 1)
     if len(split) < 2:
@@ -60,6 +53,19 @@ def nirtypes(opcode):
     else:
         return None
 
+def typesize(opcode):  # bit size taken from the opcode's numeric suffix, else None
+    if opcode[-3:] == '128':
+        return 128
+    elif opcode[-2:] == '48':
+        return 48
+    elif opcode[-1] == '8':  # single-digit suffix such as ".i8"
+        return 8
+    else:
+        try:
+            return int(opcode[-2:])
+        except ValueError:  # last two characters are not a number
+            return None
+
 def condition(opcode, typecheck, sizecheck):
     cond = ''
     if typecheck == True:
@@ -92,51 +98,27 @@ def to_suffix(op):
 static inline
 bi_instr * bi_${opcode.replace('.', '_').lower()}${to_suffix(ops[opcode])}(${signature(ops[opcode], modifiers)})
 {
-<%
-    op = ops[opcode]
-    nr_dests = "nr_dests" if op["variable_dests"] else op["dests"]
-    nr_srcs = "nr_srcs" if op["variable_srcs"] else src_count(op)
-%>
-    size_t size = sizeof(bi_instr) + sizeof(bi_index) * (${nr_dests} + ${nr_srcs});
-    bi_instr *I = (bi_instr *) rzalloc_size(b->shader, size);
-
+    bi_instr *I = rzalloc(b->shader, bi_instr);
     I->op = BI_OPCODE_${opcode.replace('.', '_').upper()};
-    I->nr_dests = ${nr_dests};
-    I->nr_srcs = ${nr_srcs};
-    I->dest = (bi_index *) (&I[1]);
-    I->src = I->dest + ${nr_dests};
-
-% if not op["variable_dests"]:
-% for dest in range(op["dests"]):
+% for dest in range(ops[opcode]["dests"]):
     I->dest[${dest}] = dest${dest};
 % endfor
-%endif
-
-% if not op["variable_srcs"]:
-% for src in range(src_count(op)):
+% for src in range(src_count(ops[opcode])):
     I->src[${src}] = src${src};
 % endfor
-% endif
-
 % for mod in ops[opcode]["modifiers"]:
-% if not should_skip(mod, opcode):
+% if mod[0:-1] not in SKIP and mod not in SKIP:
     I->${mod} = ${mod};
 % endif
 % endfor
-% if ops[opcode]["rtz"]:
-    I->round = BI_ROUND_RTZ;
-% endif
 % for imm in ops[opcode]["immediates"]:
     I->${imm} = ${imm};
 % endfor
-% if opcode in ZEXT_DEFAULT:
-    I->extend = BI_EXTEND_ZEXT;
-% endif
     bi_builder_insert(&b->cursor, I);
     return I;
 }
 
-% if ops[opcode]["dests"] == 1 and not ops[opcode]["variable_dests"]:
+% if ops[opcode]["dests"] == 1:
 static inline
 bi_index bi_${opcode.replace('.', '_').lower()}(${signature(ops[opcode], modifiers, no_dests=True)})
 {
@@ -193,26 +175,19 @@ modifier_lists = order_modifiers(ir_instructions)
 
 # Generate type signature for a builder routine
 
-def should_skip(mod, op):
-    # FROUND and HADD only make sense in context of a round mode, so override
-    # the usual skip
-    if mod == "round" and ("FROUND" in op or "HADD" in op):
-        return False
-
+def should_skip(mod):
     return mod in SKIP or mod[0:-1] in SKIP
 
 def modifier_signature(op):
-    return sorted([m for m in op["modifiers"].keys() if not should_skip(m, op["key"])])
+    return sorted([m for m in op["modifiers"].keys() if not should_skip(m)])
 
 def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
     return ", ".join(
         ["bi_builder *b"] +
         (["nir_alu_type type"] if typeful == True else []) +
         (["unsigned bitsize"] if sized == True else []) +
-        (["unsigned nr_dests"] if op["variable_dests"] else
-            ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])]) +
-        (["unsigned nr_srcs"] if op["variable_srcs"] else
-            ["bi_index src{}".format(i) for i in range(src_count(op))]) +
+        ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])] +
+        ["bi_index src{}".format(i) for i in range(src_count(op))] +
         ["{} {}".format(
         "bool" if len(modifiers[T[0:-1]] if T[-1] in "0123" else modifiers[T]) == 2 else
         "enum bi_" + T[0:-1] if T[-1] in "0123" else
@@ -221,19 +196,11 @@ def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
         ["uint32_t {}".format(imm) for imm in op["immediates"]])
 
 def arguments(op, temp_dest = True):
-    dest_pattern = "bi_temp(b->shader)" if temp_dest else 'dest{}'
-    dests = [dest_pattern.format(i) for i in range(op["dests"])]
-    srcs = ["src{}".format(i) for i in range(src_count(op))]
-
-    # Variable source/destinations just pass in the count
-    if op["variable_dests"]:
-        dests = ["nr_dests"]
-
-    if op["variable_srcs"]:
-        srcs = ["nr_srcs"]
-
-    return ", ".join(["b"] + dests + srcs + modifier_signature(op) + op["immediates"])
+    return ", ".join(
+        ["b"] +
+        ["bi_temp(b->shader)" if temp_dest else 'dest{}'.format(i) for i in range(op["dests"])] +
+        ["src{}".format(i) for i in range(src_count(op))] +
+        modifier_signature(op) +
+        op["immediates"])
 
-print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers =
-    modifier_lists, signature = signature, arguments = arguments, src_count =
-    src_count, typesize = typesize, should_skip = should_skip))
+print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers = modifier_lists, signature = signature, arguments = arguments, src_count = src_count, SKIP = SKIP))
diff --git a/lib/mesa/src/panfrost/bifrost/bi_layout.c b/lib/mesa/src/panfrost/bifrost/bi_layout.c
index 7c034cb31..db66ed04f 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_layout.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_layout.c
@@ -32,6 +32,24 @@
  * manipulating clause layouts.
  */
 
+/* Helper to see if a tuple can be inserted. We must satisfy the invariant:
+ *
+ *      constant_count + tuple_count <= 13
+ *
+ * ...which is equivalent to the clause ending up with 8 or fewer quadwords.
+ * Inserting a tuple increases tuple_count by one, and if it reads a unique
+ * constant, it increases constant_count by one.
+ */
+
+bool
+bi_can_insert_tuple(bi_clause *clause, bool constant)
+{
+        unsigned constant_count = clause->constant_count + (constant ? 1 : 0);
+        unsigned tuple_count = clause->tuple_count + 1;
+
+        return (constant_count + tuple_count) <= 13;
+}
+
 /* Is embedded constant 0 packed for free in a clause with this many tuples? */
 
 bool
@@ -69,7 +87,7 @@ bi_ec0_packed(unsigned tuple_count)
  * constants are packed two-by-two as constant quadwords.
  */
 
-static unsigned
+unsigned
 bi_clause_quadwords(bi_clause *clause)
 {
         unsigned X = clause->tuple_count;
@@ -95,7 +113,7 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
 
         /* Determine if the block we're branching to is strictly greater in
          * source order */
-        bool forwards = target->index > start->block->index;
+        bool forwards = target->base.name > start->block->base.name;
 
         if (forwards) {
                 /* We have to jump through this block from the start of this
@@ -106,7 +124,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
 
                 /* We then need to jump through every clause of every following
                  * block until the target */
-                bi_foreach_block_from(ctx, start->block, blk) {
+                bi_foreach_block_from(ctx, start->block, _blk) {
+                        bi_block *blk = (bi_block *) _blk;
+
                         /* Don't double-count the first block */
                         if (blk == start->block)
                                 continue;
@@ -133,7 +153,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
                 /* And jump back every clause of preceding blocks up through
                  * and including the target to get to the beginning of the
                  * target */
-                bi_foreach_block_from_rev(ctx, start->block, blk) {
+                bi_foreach_block_from_rev(ctx, start->block, _blk) {
+                        bi_block *blk = (bi_block *) _blk;
+
                         if (blk == start->block)
                                 continue;
 
diff --git a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
index 883f53014..ed03d4c2c 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
@@ -30,51 +30,16 @@
  * recombine swizzles where we can as an optimization.
  */
 
-static bool
-bi_swizzle_replicates_8(enum bi_swizzle swz)
-{
-        switch (swz) {
-        case BI_SWIZZLE_B0000:
-        case BI_SWIZZLE_B1111:
-        case BI_SWIZZLE_B2222:
-        case BI_SWIZZLE_B3333:
-                return true;
-        default:
-                return false;
-        }
-}
-
 static void
-lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
+bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
 {
         /* TODO: Use the opcode table and be a lot more methodical about this... */
         switch (ins->op) {
-        /* Some instructions used with 16-bit data never have swizzles */
         case BI_OPCODE_CSEL_V2F16:
         case BI_OPCODE_CSEL_V2I16:
         case BI_OPCODE_CSEL_V2S16:
         case BI_OPCODE_CSEL_V2U16:
-
-        /* Despite ostensibly being 32-bit instructions, CLPER does not
-         * inherently interpret the data, so it can be used for v2f16
-         * derivatives, which might require swizzle lowering */
-        case BI_OPCODE_CLPER_I32:
-        case BI_OPCODE_CLPER_OLD_I32:
-
-        /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
-         * boolean is implemented as a 16-bit integer, the swizzle is needed
-         * for correct operation if the instruction producing the 16-bit
-         * boolean does not replicate to both halves of the containing 32-bit
-         * register. As such, we may need to lower a swizzle.
-         *
-         * This is a silly hack. Ideally, code gen would be smart enough to
-         * avoid this case (by replicating). In practice, silly hardware design
-         * decisions force our hand here.
-         */
-        case BI_OPCODE_MUX_I32:
-        case BI_OPCODE_CSEL_I32:
             break;
-
         case BI_OPCODE_IADD_V2S16:
         case BI_OPCODE_IADD_V2U16:
         case BI_OPCODE_ISUB_V2S16:
@@ -93,212 +58,28 @@ lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
                     return;
             else
                     break;
-
-        /* For some reason MUX.v2i16 allows swaps but not replication */
-        case BI_OPCODE_MUX_V2I16:
-                if (ins->src[src].swizzle == BI_SWIZZLE_H10)
-                        return;
-                else
-                        break;
-
-        /* No swizzles supported */
-        case BI_OPCODE_HADD_V4U8:
-        case BI_OPCODE_HADD_V4S8:
-        case BI_OPCODE_CLZ_V4U8:
-        case BI_OPCODE_IDP_V4I8:
-        case BI_OPCODE_IABS_V4S8:
-        case BI_OPCODE_ICMP_V4I8:
-        case BI_OPCODE_ICMP_V4U8:
-        case BI_OPCODE_MUX_V4I8:
-        case BI_OPCODE_IADD_IMM_V4I8:
-                break;
-
-        case BI_OPCODE_LSHIFT_AND_V4I8:
-        case BI_OPCODE_LSHIFT_OR_V4I8:
-        case BI_OPCODE_LSHIFT_XOR_V4I8:
-        case BI_OPCODE_RSHIFT_AND_V4I8:
-        case BI_OPCODE_RSHIFT_OR_V4I8:
-        case BI_OPCODE_RSHIFT_XOR_V4I8:
-                /* Last source allows identity or replication */
-                if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
-                        return;
-
-                /* Others do not allow swizzles */
-                break;
-
-        /* We don't want to deal with reswizzling logic in modifier prop. Move
-         * the swizzle outside, it's easier for clamp propagation. */
-        case BI_OPCODE_FCLAMP_V2F16:
-        {
-                bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
-                bi_index dest = ins->dest[0];
-                bi_index tmp = bi_temp(ctx);
-
-                bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
-                ins->src[0].swizzle = BI_SWIZZLE_H01;
-                ins->dest[0] = tmp;
-                bi_swz_v2i16_to(&b, dest, swizzled_src);
-                return;
-        }
-
         default:
             return;
         }
 
-        /* First, try to apply a given swizzle to a constant to clear the
-         * runtime swizzle. This is less heavy-handed than ignoring the
-         * swizzle for scalar destinations, since it maintains
-         * replication of the destination.
-         */
-        if (ins->src[src].type == BI_INDEX_CONSTANT) {
-                ins->src[src].value = bi_apply_swizzle(ins->src[src].value,
-                                                       ins->src[src].swizzle);
-                ins->src[src].swizzle = BI_SWIZZLE_H01;
+        /* Identity is ok (TODO: what about replicate only?) */
+        if (ins->src[src].swizzle == BI_SWIZZLE_H01)
                 return;
-        }
-
-        /* Even if the source does not replicate, if the consuming instruction
-         * produces a 16-bit scalar, we can ignore the other component.
-         */
-        if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
-                        ins->src[src].swizzle == BI_SWIZZLE_H00)
-        {
-                ins->src[src].swizzle = BI_SWIZZLE_H01;
-                return;
-        }
 
         /* Lower it away */
         bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
-
-        bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8);
-        bi_index orig = ins->src[src];
-        bi_index stripped = bi_replace_index(bi_null(), orig);
-        stripped.swizzle = ins->src[src].swizzle;
-
-        bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
-
-        bi_replace_src(ins, src, swz);
+        ins->src[src] = bi_replace_index(ins->src[src],
+                        bi_swz_v2i16(&b, ins->src[src]));
         ins->src[src].swizzle = BI_SWIZZLE_H01;
 }
 
-static bool
-bi_swizzle_replicates_16(enum bi_swizzle swz)
-{
-        switch (swz) {
-        case BI_SWIZZLE_H00:
-        case BI_SWIZZLE_H11:
-                return true;
-        default:
-                /* If a swizzle replicates every 8-bits, it also replicates
-                 * every 16-bits, so allow 8-bit replicating swizzles.
-                 */
-                return bi_swizzle_replicates_8(swz);
-        }
-}
-
-static bool
-bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
-{
-        switch (I->op) {
-
-        /* Instructions that construct vectors have replicated output if their
-         * sources are identical. Check this case first.
-         */
-        case BI_OPCODE_MKVEC_V2I16:
-        case BI_OPCODE_V2F16_TO_V2S16:
-        case BI_OPCODE_V2F16_TO_V2U16:
-        case BI_OPCODE_V2F32_TO_V2F16:
-        case BI_OPCODE_V2S16_TO_V2F16:
-        case BI_OPCODE_V2S8_TO_V2F16:
-        case BI_OPCODE_V2S8_TO_V2S16:
-        case BI_OPCODE_V2U16_TO_V2F16:
-        case BI_OPCODE_V2U8_TO_V2F16:
-        case BI_OPCODE_V2U8_TO_V2U16:
-                return bi_is_value_equiv(I->src[0], I->src[1]);
-
-        /* 16-bit transcendentals are defined to output zero in their
-         * upper half, so they do not replicate
-         */
-        case BI_OPCODE_FRCP_F16:
-        case BI_OPCODE_FRSQ_F16:
-                return false;
-
-        /* Not sure, be conservative, we don't use these.. */
-        case BI_OPCODE_VN_ASST1_F16:
-        case BI_OPCODE_FPCLASS_F16:
-        case BI_OPCODE_FPOW_SC_DET_F16:
-                return false;
-
-        default:
-                break;
-        }
-
-        /* Replication analysis only makes sense for ALU instructions */
-        if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
-                return false;
-
-        /* We only analyze 16-bit instructions for 16-bit replication. We could
-         * maybe do better.
-         */
-        if (bi_opcode_props[I->op].size != BI_SIZE_16)
-                return false;
-
-        bi_foreach_src(I, s) {
-                if (bi_is_null(I->src[s]))
-                        continue;
-
-                /* Replicated swizzles */
-                if (bi_swizzle_replicates_16(I->src[s].swizzle))
-                        continue;
-
-                /* Replicated values */
-                if (bi_is_ssa(I->src[s]) &&
-                    BITSET_TEST(replicates_16, I->src[s].value))
-                        continue;
-
-                /* Replicated constants */
-                if (I->src[s].type == BI_INDEX_CONSTANT &&
-                    (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
-                        continue;
-
-                return false;
-        }
-
-        return true;
-}
-
 void
 bi_lower_swizzle(bi_context *ctx)
 {
         bi_foreach_instr_global_safe(ctx, ins) {
                 bi_foreach_src(ins, s) {
-                        if (bi_is_null(ins->src[s])) continue;
-                        if (ins->src[s].swizzle == BI_SWIZZLE_H01) continue;
-
-                        lower_swizzle(ctx, ins, s);
+                        if (!bi_is_null(ins->src[s]))
+                                bi_lower_swizzle_16(ctx, ins, s);
                 }
         }
-
-        /* Now that we've lowered swizzles, clean up the mess */
-        BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
-
-        bi_foreach_instr_global(ctx, ins) {
-                if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
-                        BITSET_SET(replicates_16, ins->dest[0].value);
-
-                if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
-                    BITSET_TEST(replicates_16, ins->src[0].value)) {
-                        ins->op = BI_OPCODE_MOV_I32;
-                        ins->src[0].swizzle = BI_SWIZZLE_H01;
-                }
-
-                /* The above passes rely on replicating destinations.  For
-                 * Valhall, we will want to optimize this. For now, default
-                 * to Bifrost compatible behaviour.
-                 */
-                if (ins->nr_dests)
-                        ins->dest[0].swizzle = BI_SWIZZLE_H01;
-        }
-
-        free(replicates_16);
 }
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
index cbe0ae458..7ef88da8f 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
@@ -21,15 +21,11 @@
 # IN THE SOFTWARE.
 
 TEMPLATE = """#include "bi_opcodes.h"
-<%
-def hasmod(mods, name):
-        return 1 if name in mods else 0
-%>
+
 struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = {
 % for opcode in sorted(mnemonics):
     <%
         add = instructions["+" + opcode][0][1] if "+" + opcode in instructions else None
-        size = typesize(opcode)
         message = add["message"].upper() if add else "NONE"
         sr_count = add["staging_count"].upper() if add else "0"
         sr_read = int(add["staging"] in ["r", "rw"] if add else False)
@@ -39,18 +35,10 @@ struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = {
         branch = int(opcode.startswith('BRANCH'))
         has_fma = int("*" + opcode in instructions)
         has_add = int("+" + opcode in instructions)
-        mods = ops[opcode]['modifiers']
-        clamp = hasmod(mods, 'clamp')
-        not_result = hasmod(mods, 'not_result')
-        abs = hasmod(mods, 'abs0') | (hasmod(mods, 'abs1') << 1) | (hasmod(mods, 'abs2') << 2)
-        neg = hasmod(mods, 'neg0') | (hasmod(mods, 'neg1') << 1) | (hasmod(mods, 'neg2') << 2)
-        m_not = hasmod(mods, 'not1')
     %>
     [BI_OPCODE_${opcode.replace('.', '_').upper()}] = {
-        "${opcode}", BIFROST_MESSAGE_${message}, BI_SIZE_${size},
-        BI_SR_COUNT_${sr_count}, ${sr_read}, ${sr_write}, ${last}, ${branch},
-        ${table}, ${has_fma}, ${has_add}, ${clamp}, ${not_result}, ${abs},
-        ${neg}, ${m_not},
+        "${opcode}", BIFROST_MESSAGE_${message}, BI_SR_COUNT_${sr_count},
+        ${sr_read}, ${sr_write}, ${last}, ${branch}, ${table}, ${has_fma}, ${has_add},
     },
 % endfor
 };"""
@@ -63,4 +51,4 @@ instructions = parse_instructions(sys.argv[1], include_pseudo = True)
 ir_instructions = partition_mnemonics(instructions)
 mnemonics = set(x[1:] for x in instructions.keys())
 
-print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions, typesize = typesize))
+print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions))
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
index 3b8ff0b33..b807513e1 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
@@ -64,23 +64,11 @@ enum bi_sr_count {
     BI_SR_COUNT_SR_COUNT = 7
 };
 
-enum bi_size {
-   BI_SIZE_8 = 0,
-   BI_SIZE_16,
-   BI_SIZE_24,
-   BI_SIZE_32,
-   BI_SIZE_48,
-   BI_SIZE_64,
-   BI_SIZE_96,
-   BI_SIZE_128,
-};
-
 /* Description of an opcode in the IR */
 struct bi_op_props {
         const char *name;
 
         enum bifrost_message_type message : 4;
-        enum bi_size size : 3;
         enum bi_sr_count sr_count : 3;
         bool sr_read : 1;
         bool sr_write : 1;
@@ -89,13 +77,6 @@ struct bi_op_props {
         bool table : 1;
         bool fma : 1;
         bool add : 1;
-
-        /* Supported propagable modifiers */
-        bool clamp : 1;
-        bool not_result : 1;
-        unsigned abs : 3;
-        unsigned neg : 3;
-        bool not_mod : 1;
 };
 
 /* Generated in bi_opcodes.c.py */
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
index 13b9b0d2b..06b0e41e8 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
@@ -23,89 +23,54 @@
  */
 
 #include "compiler.h"
-#include "bi_builder.h"
 
-/* SSA copy propagation */
+/* A simple scalar-only SSA-based copy-propagation pass. TODO: vectors */
 
 static bool
-bi_reads_fau(bi_instr *ins)
+bi_is_copy(bi_instr *ins)
 {
-        bi_foreach_src(ins, s) {
-                if (ins->src[s].type == BI_INDEX_FAU)
-                        return true;
-        }
+        return (ins->op == BI_OPCODE_MOV_I32) && bi_is_ssa(ins->dest[0])
+                && (bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU);
+}
 
-        return false;
+static inline unsigned
+bi_word_node(bi_index idx)
+{
+        assert(idx.type == BI_INDEX_NORMAL && !idx.reg);
+        return (idx.value << 2) | idx.offset;
 }
 
 void
 bi_opt_copy_prop(bi_context *ctx)
 {
-        /* Chase SPLIT of COLLECT. Instruction selection usually avoids this
-         * pattern (due to the split cache), but it is inevitably generated by
-         * the UBO pushing pass.
-         */
-        bi_instr **collects = calloc(sizeof(bi_instr *), ctx->ssa_alloc);
-        bi_foreach_instr_global_safe(ctx, I) {
-                if (I->op == BI_OPCODE_COLLECT_I32) {
-                        /* Rewrite trivial collects while we're at it */
-                        if (I->nr_srcs == 1)
-                                I->op = BI_OPCODE_MOV_I32;
-
-                        collects[I->dest[0].value] = I;
-                } else if (I->op == BI_OPCODE_SPLIT_I32) {
-                        /* Rewrite trivial splits while we're at it */
-                        if (I->nr_dests == 1)
-                                I->op = BI_OPCODE_MOV_I32;
-
-                        bi_instr *collect = collects[I->src[0].value];
-                        if (!collect)
-                                continue;
-
-                        /* Lower the split to moves, copyprop cleans up */
-                        bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
-
-                        bi_foreach_dest(I, d)
-                                bi_mov_i32_to(&b, I->dest[d], collect->src[d]);
-
-                        bi_remove_instruction(I);
-                }
-        }
-
-        free(collects);
-
-        bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc);
+        bi_index *replacement = calloc(sizeof(bi_index), ((ctx->ssa_alloc + 1) << 2));
 
         bi_foreach_instr_global_safe(ctx, ins) {
-                if (ins->op == BI_OPCODE_MOV_I32 && ins->src[0].type != BI_INDEX_REGISTER) {
+                if (bi_is_copy(ins)) {
                         bi_index replace = ins->src[0];
 
                         /* Peek through one layer so copyprop converges in one
                          * iteration for chained moves */
                         if (bi_is_ssa(replace)) {
-                                bi_index chained = replacement[replace.value];
+                                bi_index chained = replacement[bi_word_node(replace)];
 
                                 if (!bi_is_null(chained))
                                         replace = chained;
                         }
 
-                        assert(ins->nr_dests == 1);
-                        replacement[ins->dest[0].value] = replace;
+                        replacement[bi_word_node(ins->dest[0])] = replace;
                 }
 
                 bi_foreach_src(ins, s) {
                         bi_index use = ins->src[s];
 
-                        if (use.type != BI_INDEX_NORMAL) continue;
-                        if (bi_is_staging_src(ins, s)) continue;
-
-                        bi_index repl = replacement[use.value];
+                        if (use.type != BI_INDEX_NORMAL || use.reg) continue;
+                        if (bi_count_read_registers(ins, s) != 1) continue;
 
-                        if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(ins))
-                                continue;
+                        bi_index repl = replacement[bi_word_node(use)];
 
                         if (!bi_is_null(repl))
-                                bi_replace_src(ins, s, repl);
+                                ins->src[s] = bi_replace_index(ins->src[s], repl);
                 }
         }
 
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
index 5a37bf3a9..8debdd486 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
@@ -30,16 +30,10 @@
  * structure returned back to the command stream. */
 
 static bool
-bi_is_ubo(bi_instr *ins)
-{
-        return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
-                (ins->seg == BI_SEG_UBO);
-}
-
-static bool
 bi_is_direct_aligned_ubo(bi_instr *ins)
 {
-        return bi_is_ubo(ins) &&
+        return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
+                (ins->seg == BI_SEG_UBO) &&
                 (ins->src[0].type == BI_INDEX_CONSTANT) &&
                 (ins->src[1].type == BI_INDEX_CONSTANT) &&
                 ((ins->src[0].value & 0x3) == 0);
@@ -79,12 +73,8 @@ bi_analyze_ranges(bi_context *ctx)
                 assert(ubo < res.nr_blocks);
                 assert(channels > 0 && channels <= 4);
 
-                if (word >= MAX_UBO_WORDS) continue;
-
-                /* Must use max if the same base is read with different channel
-                 * counts, which is possible with nir_opt_shrink_vectors */
-                uint8_t *range = res.blocks[ubo].range;
-                range[word] = MAX2(range[word], channels);
+                if (word < MAX_UBO_WORDS)
+                        res.blocks[ubo].range[word] = channels;
         }
 
         return res;
@@ -128,51 +118,42 @@ bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
 void
 bi_opt_push_ubo(bi_context *ctx)
 {
-        struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
-        bi_pick_ubo(ctx->info.push, &analysis);
+        if (ctx->inputs->no_ubo_to_push)
+                return;
 
-        ctx->ubo_mask = 0;
+        /* This pass only runs once */
+        assert(ctx->info->push.count == 0);
+
+        struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
+        bi_pick_ubo(&ctx->info->push, &analysis);
 
         bi_foreach_instr_global_safe(ctx, ins) {
-                if (!bi_is_ubo(ins)) continue;
+                if (!bi_is_direct_aligned_ubo(ins)) continue;
 
                 unsigned ubo = ins->src[1].value;
                 unsigned offset = ins->src[0].value;
 
-                if (!bi_is_direct_aligned_ubo(ins)) {
-                        /* The load can't be pushed, so this UBO needs to be
-                         * uploaded conventionally */
-                        if (ins->src[1].type == BI_INDEX_CONSTANT)
-                                ctx->ubo_mask |= BITSET_BIT(ubo);
-                        else
-                                ctx->ubo_mask = ~0;
-
-                        continue;
-                }
-
                 /* Check if we decided to push this */
                 assert(ubo < analysis.nr_blocks);
-                if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
-                        ctx->ubo_mask |= BITSET_BIT(ubo);
-                        continue;
-                }
+                if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) continue;
 
                 /* Replace the UBO load with moves from FAU */
                 bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
 
-                unsigned nr = bi_opcode_props[ins->op].sr_count;
-                bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr);
+                unsigned channels = bi_opcode_props[ins->op].sr_count;
 
-                bi_foreach_src(vec, w) {
+                for (unsigned w = 0; w < channels; ++w) {
                         /* FAU is grouped in pairs (2 x 4-byte) */
                         unsigned base =
-                                pan_lookup_pushed_ubo(ctx->info.push, ubo,
+                                pan_lookup_pushed_ubo(&ctx->info->push, ubo,
                                                       (offset + 4 * w));
 
                         unsigned fau_idx = (base >> 1);
                         unsigned fau_hi = (base & 1);
 
-                        vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
+                        bi_mov_i32_to(&b,
+                                bi_word(ins->dest[0], w),
+                                bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi));
                 }
 
                 bi_remove_instruction(ins);
@@ -180,169 +161,3 @@ bi_opt_push_ubo(bi_context *ctx)
 
         free(analysis.blocks);
 }
-
-typedef struct {
-        BITSET_DECLARE(row, PAN_MAX_PUSH);
-} adjacency_row;
-
-/* Find the connected component containing `node` with depth-first search */
-static void
-bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
-                  unsigned *component, unsigned *size, unsigned node)
-{
-        unsigned neighbour;
-
-        BITSET_SET(visited, node);
-        component[(*size)++] = node;
-
-        BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
-                if (!BITSET_TEST(visited, neighbour)) {
-                        bi_find_component(adjacency, visited, component, size,
-                                          neighbour);
-                }
-        }
-}
-
-static bool
-bi_is_uniform(bi_index idx)
-{
-        return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
-}
-
-/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
-static unsigned
-bi_uniform_word(bi_index idx)
-{
-        assert(bi_is_uniform(idx));
-        assert(idx.offset <= 1);
-
-        return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
-}
-
-/*
- * Create an undirected graph where nodes are 32-bit uniform indices and edges
- * represent that two nodes are used in the same instruction.
- *
- * The graph is constructed as an adjacency matrix stored in adjacency.
- */
-static void
-bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
-{
-        bi_foreach_instr_global(ctx, I) {
-                unsigned nodes[BI_MAX_SRCS] = {};
-                unsigned node_count = 0;
-
-                /* Set nodes[] to 32-bit uniforms accessed */
-                bi_foreach_src(I, s) {
-                        if (bi_is_uniform(I->src[s])) {
-                                unsigned word = bi_uniform_word(I->src[s]);
-
-                                if (word >= ctx->info.push_offset)
-                                        nodes[node_count++] = word;
-                        }
-                }
-
-                /* Create clique connecting nodes[] */
-                for (unsigned i = 0; i < node_count; ++i) {
-                        for (unsigned j = 0; j < node_count; ++j) {
-                                if (i == j)
-                                        continue;
-
-                                unsigned x = nodes[i], y = nodes[j];
-                                assert(MAX2(x, y) < ctx->info.push->count);
-
-                                /* Add undirected edge between the nodes */
-                                BITSET_SET(adjacency[x].row, y);
-                                BITSET_SET(adjacency[y].row, x);
-                        }
-                }
-        }
-}
-
-/*
- * Optimization pass to reorder uniforms. The goal is to reduce the number of
- * moves we emit when lowering FAU. The pass groups uniforms used by the same
- * instruction.
- *
- * The pass works by creating a graph of pushed uniforms, where edges denote the
- * "both 32-bit uniforms required by the same instruction" relationship. We
- * perform depth-first search on this graph to find the connected components,
- * where each connected component is a cluster of uniforms that are used
- * together. We then select pairs of uniforms from each connected component.
- * The remaining unpaired uniforms (from components of odd sizes) are paired
- * together arbitrarily.
- *
- * After a new ordering is selected, pushed uniforms in the program and the
- * panfrost_ubo_push data structure must be remapped to use the new ordering.
- */
-void
-bi_opt_reorder_push(bi_context *ctx)
-{
-        adjacency_row adjacency[PAN_MAX_PUSH] = { 0 };
-        BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 };
-
-        unsigned ordering[PAN_MAX_PUSH] = { 0 };
-        unsigned unpaired[PAN_MAX_PUSH] = { 0 };
-        unsigned pushed = 0, unpaired_count = 0;
-
-        struct panfrost_ubo_push *push = ctx->info.push;
-        unsigned push_offset = ctx->info.push_offset;
-
-        bi_create_fau_interference_graph(ctx, adjacency);
-
-        for (unsigned i = push_offset; i < push->count; ++i) {
-                if (BITSET_TEST(visited, i)) continue;
-
-                unsigned component[PAN_MAX_PUSH] = { 0 };
-                unsigned size = 0;
-                bi_find_component(adjacency, visited, component, &size, i);
-
-                /* If there is an odd number of uses, at least one use must be
-                 * unpaired. Arbitrarily take the last one.
-                 */
-                if (size % 2)
-                        unpaired[unpaired_count++] = component[--size];
-
-                /* The rest of uses are paired */
-                assert((size % 2) == 0);
-
-                /* Push the paired uses */
-                memcpy(ordering + pushed, component, sizeof(unsigned) * size);
-                pushed += size;
-        }
-
-        /* Push unpaired nodes at the end */
-        memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
-        pushed += unpaired_count;
-
-        /* Ordering is a permutation. Invert it for O(1) lookup. */
-        unsigned old_to_new[PAN_MAX_PUSH] = { 0 };
-
-        for (unsigned i = 0; i < push_offset; ++i) {
-                old_to_new[i] = i;
-        }
-
-        for (unsigned i = 0; i < pushed; ++i) {
-                assert(ordering[i] >= push_offset);
-                old_to_new[ordering[i]] = push_offset + i;
-        }
-
-        /* Use new ordering throughout the program */
-        bi_foreach_instr_global(ctx, I) {
-                bi_foreach_src(I, s) {
-                        if (bi_is_uniform(I->src[s])) {
-                                unsigned node = bi_uniform_word(I->src[s]);
-                                unsigned new_node = old_to_new[node];
-                                I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
-                                I->src[s].offset = new_node & 1;
-                        }
-                }
-        }
-
-        /* Use new ordering for push */
-        struct panfrost_ubo_push old = *push;
-        for (unsigned i = 0; i < pushed; ++i)
-                push->words[push_offset + i] = old.words[ordering[i]];
-
-        push->count = push_offset + pushed;
-}
diff --git a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
index 601750e2a..28669ebfa 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
@@ -24,14 +24,9 @@ import sys
 from bifrost_isa import *
 from mako.template import Template
 
-# Consider pseudo instructions when getting the modifier list
-instructions_with_pseudo = parse_instructions(sys.argv[1], include_pseudo = True)
-ir_instructions_with_pseudo = partition_mnemonics(instructions_with_pseudo)
-modifier_lists = order_modifiers(ir_instructions_with_pseudo)
-
-# ...but strip for packing
 instructions = parse_instructions(sys.argv[1])
 ir_instructions = partition_mnemonics(instructions)
+modifier_lists = order_modifiers(ir_instructions)
 
 # Packs sources into an argument. Offset argument to work around a quirk of our
 # compiler IR when dealing with staging registers (TODO: reorder in the IR to
@@ -112,9 +107,6 @@ def pack_modifier(mod, width, default, opts, body, pack_exprs):
         # Construct a list
         lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS]
         ir_value = "src[{}].swizzle".format(arg)
-    elif raw == "lane_dest":
-        lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS]
-        ir_value = "dest->swizzle"
     elif raw in ["abs", "sign"]:
         ir_value = "src[{}].abs".format(arg)
     elif raw in ["neg", "not"]:
@@ -315,7 +307,7 @@ bi_pack_${'fma' if unit == '*' else 'add'}(bi_instr *I,
     enum bifrost_packed_src src3)
 {
     if (!I)
-        return bi_pack_${opname_to_c(unit + 'NOP')}(I, src0, src1, src2, src3);
+        return bi_pack_${opname_to_c(unit + 'NOP.i32')}(I, src0, src1, src2, src3);
 
 % if unit == '*':
     assert((1 << src0) & 0xfb);
diff --git a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
index 04a9c0095..5692633b4 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
@@ -55,7 +55,6 @@ bir_fau_name(unsigned fau_idx)
             "blend_descriptor_2", "blend_descriptor_3",
             "blend_descriptor_4", "blend_descriptor_5",
             "blend_descriptor_6", "blend_descriptor_7",
-            "tls_ptr", "wls_ptr", "program_counter",
     };
 
     assert(fau_idx < ARRAY_SIZE(names));
@@ -76,9 +75,6 @@ bir_passthrough_name(unsigned idx)
 static void
 bi_print_index(FILE *fp, bi_index index)
 {
-    if (index.discard)
-        fputs("^", fp);
-
     if (bi_is_null(index))
         fprintf(fp, "_");
     else if (index.type == BI_INDEX_CONSTANT)
@@ -90,6 +86,8 @@ bi_print_index(FILE *fp, bi_index index)
     else if (index.type == BI_INDEX_PASS)
         fprintf(fp, "%s", bir_passthrough_name(index.value));
     else if (index.type == BI_INDEX_REGISTER)
+        fprintf(fp, "br%u", index.value);
+    else if (index.type == BI_INDEX_NORMAL && index.reg)
         fprintf(fp, "r%u", index.value);
     else if (index.type == BI_INDEX_NORMAL)
         fprintf(fp, "%u", index.value);
@@ -111,7 +109,7 @@ bi_print_index(FILE *fp, bi_index index)
 % for mod in sorted(modifiers):
 % if len(modifiers[mod]) > 2: # otherwise just boolean
 
-UNUSED static inline const char *
+static inline const char *
 bi_${mod}_as_str(enum bi_${mod} ${mod})
 {
     switch (${mod}) {
@@ -131,13 +129,11 @@ bi_${mod}_as_str(enum bi_${mod} ${mod})
 
 <%def name="print_modifiers(mods, table)">
     % for mod in mods:
-    % if mod not in ["lane_dest"]:
     % if len(table[mod]) > 2:
         fputs(bi_${mod}_as_str(I->${mod}), fp);
     % else:
         if (I->${mod}) fputs(".${mod}", fp);
     % endif
-    % endif
     % endfor
 </%def>
 
@@ -156,37 +152,19 @@ bi_${mod}_as_str(enum bi_${mod} ${mod})
 </%def>
 
 void
-bi_print_instr(const bi_instr *I, FILE *fp)
+bi_print_instr(bi_instr *I, FILE *fp)
 {
-    fputs("   ", fp);
-
     bi_foreach_dest(I, d) {
+        if (bi_is_null(I->dest[d])) break;
         if (d > 0) fprintf(fp, ", ");
 
         bi_print_index(fp, I->dest[d]);
     }
 
-    if (I->nr_dests > 0)
-        fputs(" = ", fp);
-
-    fprintf(fp, "%s", bi_opcode_props[I->op].name);
+    fprintf(fp, " = %s", bi_opcode_props[I->op].name);
 
     if (I->table)
-        fprintf(fp, ".table%u", I->table);
-
-    if (I->flow)
-        fprintf(fp, ".flow%u", I->flow);
-
-    if (I->op == BI_OPCODE_COLLECT_I32 || I->op == BI_OPCODE_PHI) {
-        for (unsigned i = 0; i < I->nr_srcs; ++i) {
-            if (i > 0)
-                fputs(", ", fp);
-            else
-                fputs(" ", fp);
-
-            bi_print_index(fp, I->src[i]);
-        }
-    }
+        fprintf(fp, ".%s", bi_table_as_str(I->table));
 
     switch (I->op) {
 % for opcode in ops:
@@ -214,7 +192,7 @@ bi_print_instr(const bi_instr *I, FILE *fp)
     }
 
     if (I->branch_target)
-            fprintf(fp, " -> block%u", I->branch_target->index);
+            fprintf(fp, " -> block%u", I->branch_target->base.name);
 
     fputs("\\n", fp);
 
diff --git a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
index 04aa07b0c..05b731a53 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
@@ -38,7 +38,7 @@
  * 3. The shader must wait on slot #6 before running BLEND, ATEST
  * 4. The shader must wait on slot #7 before running BLEND, ST_TILE
  * 5. ATEST, ZS_EMIT must be issued with slot #0
- * 6. BARRIER must be issued with slot #7 and wait on every active slot.
+ * 6. BARRIER must be issued with slot #7
  * 7. Only slots #0 through #5 may be used for clauses not otherwise specified.
  * 8. If a clause writes to a read staging register of an unresolved
  * dependency, it must set a staging barrier.
@@ -54,256 +54,57 @@
  */
 
 #define BI_NUM_GENERAL_SLOTS 6
-#define BI_NUM_SLOTS 8
-#define BI_NUM_REGISTERS 64
-#define BI_SLOT_SERIAL 0 /* arbitrary */
 
-/*
- * Due to the crude scoreboarding we do, we need to serialize varying loads and
- * memory access. Identify these instructions here.
- */
-static bool
-bi_should_serialize(bi_instr *I)
-{
-        /* For debug, serialize everything to disable scoreboard opts */
-        if (bifrost_debug & BIFROST_DBG_NOSB)
-                return true;
+/* A model for the state of the scoreboard */
 
-        /* Although nominally on the attribute unit, image loads have the same
-         * coherency requirements as general memory loads. Serialize them for
-         * now until we can do something more clever.
-         */
-        if (I->op == BI_OPCODE_LD_ATTR_TEX)
-                return true;
-
-        switch (bi_opcode_props[I->op].message) {
-        case BIFROST_MESSAGE_VARYING:
-        case BIFROST_MESSAGE_LOAD:
-        case BIFROST_MESSAGE_STORE:
-        case BIFROST_MESSAGE_ATOMIC:
-                return true;
-        default:
-                return false;
-        }
-}
+struct bi_scoreboard_state {
+        /* TODO: what do we track here for a heuristic? */
+};
 
 /* Given a scoreboard model, choose a slot for a clause wrapping a given
  * message passing instruction. No side effects. */
 
 static unsigned
-bi_choose_scoreboard_slot(bi_instr *message)
+bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message)
 {
+        /* A clause that does not produce a message must use slot #0 */
+        if (!message)
+                return 0;
+
+        switch (message->op) {
         /* ATEST, ZS_EMIT must be issued with slot #0 */
-        if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
+        case BI_OPCODE_ATEST:
+        case BI_OPCODE_ZS_EMIT:
                 return 0;
 
         /* BARRIER must be issued with slot #7 */
-        if (message->op == BI_OPCODE_BARRIER)
+        case BI_OPCODE_BARRIER:
                 return 7;
 
-        /* For now, make serialization is easy */
-        if (bi_should_serialize(message))
-                return BI_SLOT_SERIAL;
-
-        return 0;
-}
-
-static uint64_t
-bi_read_mask(bi_instr *I, bool staging_only)
-{
-        uint64_t mask = 0;
-
-        if (staging_only && !bi_opcode_props[I->op].sr_read)
-                return mask;
-
-        bi_foreach_src(I, s) {
-                if (I->src[s].type == BI_INDEX_REGISTER) {
-                        unsigned reg = I->src[s].value;
-                        unsigned count = bi_count_read_registers(I, s);
-
-                        mask |= (BITFIELD64_MASK(count) << reg);
-                }
-
-                if (staging_only)
-                        break;
-        }
-
-        return mask;
-}
-
-static uint64_t
-bi_write_mask(bi_instr *I)
-{
-        uint64_t mask = 0;
-
-        bi_foreach_dest(I, d) {
-                if (bi_is_null(I->dest[d])) continue;
-
-                assert(I->dest[d].type == BI_INDEX_REGISTER);
-
-                unsigned reg = I->dest[d].value;
-                unsigned count = bi_count_write_registers(I, d);
-
-                mask |= (BITFIELD64_MASK(count) << reg);
-        }
-
-        /* Instructions like AXCHG.i32 unconditionally both read and write
-         * staging registers. Even if we discard the result, the write still
-         * happens logically and needs to be included in our calculations.
-         * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
-         * certain circumstances; this does not require consideration.
-         */
-        if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs &&
-            bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) {
-
-                unsigned reg = I->src[0].value;
-                unsigned count = bi_count_write_registers(I, 0);
-
-                mask |= (BITFIELD64_MASK(count) << reg);
-        }
-
-        return mask;
-}
-
-/* Update the scoreboard model to assign an instruction to a given slot */
-
-static void
-bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
-{
-        bi_instr *I = clause->message;
-        unsigned slot = clause->scoreboard_id;
-
-        if (!I)
-                return;
-
-        st->read[slot] |= bi_read_mask(I, true);
-
-        if (bi_opcode_props[I->op].sr_write)
-                st->write[slot] |= bi_write_mask(I);
-}
-
-/* Adds a dependency on each slot writing any specified register */
-
-static void
-bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
-{
-        for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
-                if (!(st->write[slot] & regmask))
-                        continue;
-
-                st->write[slot] = 0;
-                st->read[slot] = 0;
-
-                clause->dependencies |= BITFIELD_BIT(slot);
-        }
-}
-
-static void
-bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
-{
-        for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
-                if (!(st->read[slot] & regmask))
-                        continue;
-
-                st->read[slot] = 0;
-                clause->staging_barrier = true;
-        }
-}
-
-/* Sets the dependencies for a given clause, updating the model */
-
-static void
-bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
-{
-        bi_foreach_instr_in_clause(block, clause, I) {
-                uint64_t read = bi_read_mask(I, false);
-                uint64_t written = bi_write_mask(I);
-
-                /* Read-after-write; write-after-write */
-                bi_depend_on_writers(clause, st, read | written);
-
-                /* Write-after-read */
-                bi_set_staging_barrier(clause, st, written);
-        }
-
-        /* LD_VAR instructions must be serialized per-quad. Just always depend
-         * on any LD_VAR instructions. This isn't optimal, but doing better
-         * requires divergence-aware data flow analysis.
-         *
-         * Similarly, memory loads/stores need to be synchronized. For now,
-         * force them to be serialized. This is not optimal.
-         */
-        if (clause->message && bi_should_serialize(clause->message))
-                clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
-
-        /* Barriers must wait on all slots to flush existing work. It might be
-         * possible to skip this with more information about the barrier. For
-         * now, be conservative.
-         */
-        if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
-                clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
-}
-
-static bool
-scoreboard_block_update(bi_block *blk)
-{
-        bool progress = false;
-
-        /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
-        bi_foreach_predecessor(blk, pred) {
-                for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
-                        blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
-                        blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
-                }
-        }
-
-        struct bi_scoreboard_state state = blk->scoreboard_in;
-
-        /* Assign locally */
-
-        bi_foreach_clause_in_block(blk, clause) {
-                bi_set_dependencies(blk, clause, &state);
-                bi_push_clause(&state, clause);
+        default:
+                break;
         }
 
-        /* To figure out progress, diff scoreboard_out */
-
-        for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
-                progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
-
-        blk->scoreboard_out = state;
-
-        return progress;
+        /* TODO: Use a heuristic */
+        return 0;
 }
 
 void
 bi_assign_scoreboard(bi_context *ctx)
 {
-        u_worklist worklist;
-        bi_worklist_init(ctx, &worklist);
-
-        /* First, assign slots. */
-        bi_foreach_block(ctx, block) {
-                bi_foreach_clause_in_block(block, clause) {
-                        if (clause->message) {
-                                unsigned slot = bi_choose_scoreboard_slot(clause->message);
-                                clause->scoreboard_id = slot;
-                        }
-                }
+        struct bi_scoreboard_state st = {};
 
-                bi_worklist_push_tail(&worklist, block);
-        }
+        /* Assign slots */
+        bi_foreach_block(ctx, _block) {
+                bi_block *block = (bi_block *) _block;
 
-        /* Next, perform forward data flow analysis to calculate dependencies */
-        while (!u_worklist_is_empty(&worklist)) {
-                /* Pop from the front for forward analysis */
-                bi_block *blk = bi_worklist_pop_head(&worklist);
+                bi_foreach_clause_in_block(block, clause) {
+                        unsigned slot = bi_choose_scoreboard_slot(&st, clause->message);
+                        clause->scoreboard_id = slot;
 
-                if (scoreboard_block_update(blk)) {
-                        bi_foreach_successor(blk, succ)
-                                bi_worklist_push_tail(&worklist, succ);
+                        bi_clause *next = bi_next_clause(ctx, _block, clause);
+                        if (next)
+                                next->dependencies |= (1 << slot);
                 }
         }
-
-        u_worklist_fini(&worklist);
 }
diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
index 7152509bc..ae97795f3 100644
--- a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
+++ b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
@@ -132,8 +132,6 @@ def parse_instruction(ins, include_pseudo):
             'staging': ins.attrib.get('staging', '').split('=')[0],
             'staging_count': ins.attrib.get('staging', '=0').split('=')[1],
             'dests': int(ins.attrib.get('dests', '1')),
-            'variable_dests': ins.attrib.get('variable_dests', False),
-            'variable_srcs': ins.attrib.get('variable_srcs', False),
             'unused': ins.attrib.get('unused', False),
             'pseudo': ins.attrib.get('pseudo', False),
             'message': ins.attrib.get('message', 'none'),
@@ -145,9 +143,6 @@ def parse_instruction(ins, include_pseudo):
         common['exact'] = parse_exact(ins)
 
     for src in ins.findall('src'):
-        if src.attrib.get('pseudo', False) and not include_pseudo:
-            continue
-
         mask = int(src.attrib['mask'], 0) if ('mask' in src.attrib) else 0xFF
         common['srcs'].append([int(src.attrib['start'], 0), mask])
 
@@ -245,28 +240,18 @@ def simplify_to_ir(ins):
             'staging': ins['staging'],
             'srcs': len(ins['srcs']),
             'dests': ins['dests'],
-            'variable_dests': ins['variable_dests'],
-            'variable_srcs': ins['variable_srcs'],
             'modifiers': [[m[0][0], m[2]] for m in ins['modifiers']],
             'immediates': [m[0] for m in ins['immediates']]
         }
 
-# Converstions to integers default to rounding-to-zero
-# All other opcodes default to rounding to nearest even
-def default_round_to_zero(name):
-    # 8-bit int to float is exact
-    subs = ['_TO_U', '_TO_S', '_TO_V2U', '_TO_V2S', '_TO_V4U', '_TO_V4S']
-    return any([x in name for x in subs])
 
-def combine_ir_variants(instructions, key):
-    seen = [op for op in instructions.keys() if op[1:] == key]
-    variant_objs = [[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in seen]
-    variants = sum(variant_objs, [])
+def combine_ir_variants(instructions, v):
+    variants = sum([[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in v], [])
 
     # Accumulate modifiers across variants
     modifiers = {}
 
-    for s in variants[0:]:
+    for s in variants:
         # Check consistency
         assert(s['srcs'] == variants[0]['srcs'])
         assert(s['dests'] == variants[0]['dests'])
@@ -282,27 +267,19 @@ def combine_ir_variants(instructions, key):
     # Great, we've checked srcs/immediates are consistent and we've summed over
     # modifiers
     return {
-            'key': key,
             'srcs': variants[0]['srcs'],
             'dests': variants[0]['dests'],
-            'variable_dests': variants[0]['variable_dests'],
-            'variable_srcs': variants[0]['variable_srcs'],
             'staging': variants[0]['staging'],
             'immediates': sorted(variants[0]['immediates']),
-            'modifiers': modifiers,
-            'v': len(variants),
-            'ir': variants,
-            'rtz': default_round_to_zero(key)
+            'modifiers': { k: modifiers[k] for k in modifiers }
         }
 
 # Partition instructions to mnemonics, considering units and variants
 # equivalent.
 
 def partition_mnemonics(instructions):
-    key_func = lambda x: x[1:]
-    sorted_instrs = sorted(instructions.keys(), key = key_func)
-    partitions = itertools.groupby(sorted_instrs, key_func)
-    return { k: combine_ir_variants(instructions, k) for k, v in partitions }
+    partitions = itertools.groupby(instructions, lambda x: x[1:])
+    return { k: combine_ir_variants(instructions, v) for (k, v) in partitions }
 
 # Generate modifier lists, by accumulating all the possible modifiers, and
 # deduplicating thus assigning canonical enum values. We don't try _too_ hard
@@ -351,17 +328,3 @@ def order_modifiers(ir_instructions):
 def src_count(op):
     staging = 1 if (op["staging"] in ["r", "rw"]) else 0
     return op["srcs"] + staging
-
-# Parses out the size part of an opocde name
-def typesize(opcode):
-    if opcode[-3:] == '128':
-        return 128
-    if opcode[-2:] == '48':
-        return 48
-    elif opcode[-1] == '8':
-        return 8
-    else:
-        try:
-            return int(opcode[-2:])
-        except:
-            return 32
diff --git a/lib/mesa/src/panfrost/bifrost/gen_disasm.py b/lib/mesa/src/panfrost/bifrost/gen_disasm.py
index 505c61cc0..11acf5ae9 100644
--- a/lib/mesa/src/panfrost/bifrost/gen_disasm.py
+++ b/lib/mesa/src/panfrost/bifrost/gen_disasm.py
@@ -238,7 +238,7 @@ def build_lut(mnemonic, desc, test):
     key_set = find_context_keys(desc, test)
     ordered = 'ordering' in key_set
     key_set.discard('ordering')
-    keys = sorted(list(key_set))
+    keys = list(key_set)
 
     # Evaluate the deriveds for every possible state, forming a (state -> deriveds) map
     testf = compile_derived(test, keys)
@@ -326,7 +326,7 @@ def disasm_op(name, op):
 
     for i, (pos, mask) in enumerate(srcs):
         body += '    fputs(", ", fp);\n'
-        body += '    dump_src(fp, _BITS(bits, {}, 3), *srcs, branch_offset, consts, {});\n'.format(pos, "true" if is_fma else "false")
+        body += '    dump_src(fp, _BITS(bits, {}, 3), *srcs, consts, {});\n'.format(pos, "true" if is_fma else "false")
 
         # Error check if needed
         if (mask != 0xFF):
diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
index 3fa1f5485..2886d3d91 100644
--- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
+++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
@@ -30,6 +30,7 @@
 #include "pan_indirect_draw.h"
 #include "pan_pool.h"
 #include "pan_util.h"
+#include "panfrost-quirks.h"
 #include "compiler/nir/nir_builder.h"
 #include "util/u_memory.h"
 #include "util/macros.h"
@@ -54,7 +55,6 @@ struct draw_data {
         nir_ssa_def *index_buf;
         nir_ssa_def *restart_index;
         nir_ssa_def *vertex_count;
-        nir_ssa_def *start_instance;
         nir_ssa_def *instance_count;
         nir_ssa_def *vertex_start;
         nir_ssa_def *index_bias;
@@ -72,9 +72,6 @@ struct jobs_data {
         nir_ssa_def *vertex_job;
         nir_ssa_def *tiler_job;
         nir_ssa_def *base_vertex_offset;
-        nir_ssa_def *first_vertex_sysval;
-        nir_ssa_def *base_vertex_sysval;
-        nir_ssa_def *base_instance_sysval;
         nir_ssa_def *offset_start;
         nir_ssa_def *invocation;
 };
@@ -111,13 +108,6 @@ struct indirect_draw_info {
         uint32_t count;
         uint32_t instance_count;
         uint32_t start;
-        uint32_t start_instance;
-};
-
-struct indirect_indexed_draw_info {
-        uint32_t count;
-        uint32_t instance_count;
-        uint32_t start;
         int32_t index_bias;
         uint32_t start_instance;
 };
@@ -142,7 +132,7 @@ struct indirect_draw_context {
         mali_ptr varying_mem;
 };
 
-/* Indirect draw shader inputs. Those are stored in FAU. */
+/* Indirect draw shader inputs. These are stored in a UBO. */
 
 struct indirect_draw_inputs {
         /* indirect_draw_context pointer */
@@ -160,11 +150,6 @@ struct indirect_draw_inputs {
         /* index buffer */
         mali_ptr index_buf;
 
-        /* {base,first}_{vertex,instance} sysvals */
-        mali_ptr first_vertex_sysval;
-        mali_ptr base_vertex_sysval;
-        mali_ptr base_instance_sysval;
-
         /* Pointers to various cmdstream structs that need to be patched */
         mali_ptr vertex_job;
         mali_ptr tiler_job;
@@ -175,13 +160,26 @@ struct indirect_draw_inputs {
         uint32_t draw_buf_stride;
         uint32_t restart_index;
         uint32_t attrib_count;
-} PACKED;
+};
+
+static nir_ssa_def *
+get_input_data(nir_builder *b, unsigned offset, unsigned size)
+{
+        assert(!(offset & 0x3));
+        assert(size && !(size & 0x3));
+
+        return nir_load_ubo(b, 1, size,
+                            nir_imm_int(b, 0),
+                            nir_imm_int(b, offset),
+                            .align_mul = 4,
+                            .align_offset = 0,
+                            .range_base = 0,
+                            .range = ~0);
+}
 
 #define get_input_field(b, name) \
-        nir_load_push_constant(b, \
-               1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
-               nir_imm_int(b, 0), \
-               .base = offsetof(struct indirect_draw_inputs, name))
+        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
+                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)
 
 static nir_ssa_def *
 get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
@@ -282,12 +280,6 @@ update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
                                     offsetof(struct indirect_draw_info, field)), \
                     1, sizeof(((struct indirect_draw_info *)0)->field) * 8)
 
-#define get_indexed_draw_field(b, draw_ptr, field) \
-        load_global(b, \
-                    get_address_imm(b, draw_ptr, \
-                                    offsetof(struct indirect_indexed_draw_info, field)), \
-                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)
-
 static void
 extract_inputs(struct indirect_draw_shader_builder *builder)
 {
@@ -309,9 +301,6 @@ extract_inputs(struct indirect_draw_shader_builder *builder)
         if (builder->index_min_max_search)
                 return;
 
-        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
-        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
-        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
         builder->jobs.vertex_job = get_input_field(b, vertex_job);
         builder->jobs.tiler_job = get_input_field(b, tiler_job);
         builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
@@ -342,49 +331,29 @@ init_shader_builder(struct indirect_draw_shader_builder *builder,
         if (index_min_max_search) {
                 builder->b =
                         nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
-                                                       GENX(pan_shader_get_compiler_options)(),
+                                                       pan_shader_get_compiler_options(dev),
                                                        "indirect_draw_min_max_index(index_size=%d)",
                                                        builder->index_size);
         } else {
                 builder->b =
                         nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
-                                                       GENX(pan_shader_get_compiler_options)(),
-                                                       "indirect_draw(index_size=%d%s%s%s%s)",
+                                                       pan_shader_get_compiler_options(dev),
+                                                       "indirect_draw(index_size=%d%s%s%s)",
                                                        builder->index_size,
                                                        flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                        ",psiz" : "",
                                                        flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                        ",primitive_restart" : "",
                                                        flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
-                                                       ",update_primitive_size" : "",
-                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
-                                                       ",idvs" : "");
+                                                       ",update_primitive_size" : "");
         }
 
-        extract_inputs(builder);
-}
-
-static void
-update_dcd(struct indirect_draw_shader_builder *builder,
-           nir_ssa_def *job_ptr,
-           unsigned draw_offset)
-{
         nir_builder *b = &builder->b;
-        nir_ssa_def *draw_w01 =
-                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
-        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);
+        nir_variable_create(b->shader, nir_var_mem_ubo,
+                            glsl_uint_type(), "inputs");
+        b->shader->info.num_ubos++;
 
-        /* Update DRAW.{instance_size,offset_start} */
-        nir_ssa_def *instance_size =
-                nir_bcsel(b,
-                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
-                          nir_imm_int(b, 0), builder->instance_size.packed);
-        draw_w01 = nir_vec2(b,
-                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
-                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
-                            builder->jobs.offset_start);
-        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
-                     draw_w01, 2);
+        extract_inputs(builder);
 }
 
 static void
@@ -402,9 +371,17 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type
         unsigned draw_offset =
                 type == MALI_JOB_TYPE_VERTEX ?
                 pan_section_offset(COMPUTE_JOB, DRAW) :
-                pan_section_offset(TILER_JOB, DRAW);
-        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
-        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
+                pan_is_bifrost(builder->dev) ?
+                pan_section_offset(BIFROST_TILER_JOB, DRAW) :
+                pan_section_offset(MIDGARD_TILER_JOB, DRAW);
+        unsigned prim_offset =
+                pan_is_bifrost(builder->dev) ?
+                pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE) :
+                pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE);
+        unsigned psiz_offset =
+                pan_is_bifrost(builder->dev) ?
+                pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE_SIZE) :
+                pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE_SIZE);
         unsigned index_size = builder->index_size;
 
         if (type == MALI_JOB_TYPE_TILER) {
@@ -440,14 +417,21 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type
                              builder->varyings.pos_ptr, 2);
         }
 
-        update_dcd(builder, job_ptr, draw_offset);
-
-        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
-                assert(type == MALI_JOB_TYPE_TILER);
+        nir_ssa_def *draw_w01 =
+                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
+        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);
 
-                update_dcd(builder, job_ptr,
-                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
-        }
+        /* Update DRAW.{instance_size,offset_start} */
+        nir_ssa_def *instance_size =
+                nir_bcsel(b,
+                          nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2)),
+                          nir_imm_int(b, 0), builder->instance_size.packed);
+        draw_w01 = nir_vec2(b,
+                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
+                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
+                            builder->jobs.offset_start);
+        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
+                     draw_w01, 2);
 }
 
 static void
@@ -463,7 +447,7 @@ split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
                                    half_div64);
         nir_ssa_def *fi = nir_idiv(b, f0, div64);
         nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
-        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
+        nir_ssa_def *e = nir_bcsel(b, nir_ilt(b, half_div64, ff),
                                    nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
         *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
         *r_e = nir_ior(b, r, e);
@@ -504,68 +488,33 @@ update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
 }
 
 static void
-zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
-                       nir_ssa_def *attrib_buf_ptr)
-{
-        /* Stride is an unadorned 32-bit uint at word 2 */
-        nir_builder *b = &builder->b;
-        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
-                        nir_imm_int(b, 0), 1);
-}
-
-static void
 adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
-                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
-                     nir_ssa_def *instance_div)
+                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr)
 {
         nir_builder *b = &builder->b;
         nir_ssa_def *zero = nir_imm_int(b, 0);
         nir_ssa_def *two = nir_imm_int(b, 2);
         nir_ssa_def *sub_cur_offset =
                 nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
-                         nir_uge(b, builder->draw.instance_count, two));
-
-        nir_ssa_def *add_base_inst_offset =
-                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
-                         nir_ine(b, instance_div, zero));
-
-        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
-                nir_ssa_def *offset =
-                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
-                nir_ssa_def *stride =
-                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);
+                         nir_ige(b, builder->draw.instance_count, two));
 
+        IF (sub_cur_offset) {
                 /* Per-instance data needs to be offset in response to a
                  * delayed start in an indexed draw.
                  */
+                nir_ssa_def *stride =
+                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);
+                nir_ssa_def *offset =
+                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
 
-                IF (add_base_inst_offset) {
-                        offset = nir_iadd(b, offset,
-                                          nir_idiv(b,
-                                                   nir_imul(b, stride,
-                                                            builder->draw.start_instance),
-                                                   instance_div));
-                } ENDIF
-
-                IF (sub_cur_offset) {
-                        offset = nir_isub(b, offset,
-                                          nir_imul(b, stride,
-                                                   builder->jobs.offset_start));
-                } ENDIF
-
+                offset = nir_isub(b, offset,
+                                  nir_imul(b, stride,
+                                  builder->jobs.offset_start));
                 store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                              offset, 1);
         } ENDIF
 }
 
-/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */
-
-static nir_ssa_def *
-nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
-{
-        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
-}
-
 /* Based on panfrost_emit_vertex_data() */
 
 static void
@@ -576,78 +525,78 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder)
                 nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "attrib_idx");
         nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);
-
-#if PAN_ARCH <= 5
         nir_ssa_def *single_instance =
-                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
-#endif
+                nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2));
 
         LOOP {
                 nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
-                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
+                IF (nir_ige(b, attrib_idx, builder->attribs.attrib_count))
                         BREAK;
                 ENDIF
 
                 nir_ssa_def *attrib_buf_ptr =
                          get_address(b, builder->attribs.attrib_bufs,
                                      nir_imul_imm(b, attrib_idx,
-                                                  2 * pan_size(ATTRIBUTE_BUFFER)));
+                                                  2 * MALI_ATTRIBUTE_BUFFER_LENGTH));
                 nir_ssa_def *attrib_ptr =
                          get_address(b, builder->attribs.attribs,
                                      nir_imul_imm(b, attrib_idx,
-                                                  pan_size(ATTRIBUTE)));
+                                                  MALI_ATTRIBUTE_LENGTH));
 
                 nir_ssa_def *r_e, *d;
 
-#if PAN_ARCH <= 5
-                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
-                        nir_ssa_def *r_p =
-                                nir_bcsel(b, single_instance,
-                                          nir_imm_int(b, 0x9f),
-                                          builder->instance_size.packed);
+                if (!pan_is_bifrost(builder->dev)) {
+                        IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
+                                nir_ssa_def *r_p =
+                                        nir_bcsel(b, single_instance,
+                                                  nir_imm_int(b, 0x9f),
+                                                  builder->instance_size.packed);
 
-                        store_global(b,
-                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
-                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);
+                                store_global(b,
+                                             get_address_imm(b, attrib_buf_ptr, WORD(4)),
+                                             nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);
 
-                        nir_store_var(b, attrib_idx_var,
-                                      nir_iadd_imm(b, attrib_idx, 1), 1);
-                        CONTINUE;
-                } ENDIF
+                                nir_store_var(b, attrib_idx_var,
+                                              nir_iadd_imm(b, attrib_idx, 1), 1);
+                                CONTINUE;
+                        } ENDIF
 
-                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
-                        split_div(b, builder->instance_size.padded,
-                                  &r_e, &d);
-                        nir_ssa_def *default_div =
-                                nir_ior(b, single_instance,
-                                        nir_ult(b,
-                                                builder->instance_size.padded,
-                                                nir_imm_int(b, 2)));
-                        r_e = nir_bcsel(b, default_div,
-                                        nir_imm_int(b, 0x3f), r_e);
-                        d = nir_bcsel(b, default_div,
-                                      nir_imm_int(b, (1u << 31) - 1), d);
-                        store_global(b,
-                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
-                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
-                                     2);
-                        nir_store_var(b, attrib_idx_var,
-                                      nir_iadd_imm(b, attrib_idx, 1), 1);
-                        CONTINUE;
-                } ENDIF
-#endif
+                        IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
+                                split_div(b, builder->instance_size.padded,
+                                          &r_e, &d);
+                                nir_ssa_def *default_div =
+                                        nir_ior(b, single_instance,
+                                                nir_ilt(b,
+                                                        builder->instance_size.padded,
+                                                        nir_imm_int(b, 2)));
+                                r_e = nir_bcsel(b, default_div,
+                                                nir_imm_int(b, 0x3f), r_e);
+                                d = nir_bcsel(b, default_div,
+                                              nir_imm_int(b, (1u << 31) - 1), d);
+                                store_global(b,
+                                             get_address_imm(b, attrib_buf_ptr, WORD(1)),
+                                             nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
+                                             2);
+                                nir_store_var(b, attrib_idx_var,
+                                              nir_iadd_imm(b, attrib_idx, 1), 1);
+                                CONTINUE;
+                        } ENDIF
+                }
 
-                nir_ssa_def *instance_div =
+                nir_ssa_def *div =
                         load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);
 
-                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);
+                div = nir_imul(b, div, builder->instance_size.padded);
 
                 nir_ssa_def *multi_instance =
-                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));
+                        nir_ige(b, builder->draw.instance_count, nir_imm_int(b, 2));
 
                 IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                         IF (multi_instance) {
-                                IF (nir_is_power_of_two_or_zero(b, div)) {
+                                nir_ssa_def *div_pow2 =
+                                        nir_ilt(b, nir_bit_count(b, div), nir_imm_int(b, 2));
+
+                                IF (div_pow2) {
                                         nir_ssa_def *exp =
                                                 nir_imax(b, nir_ufind_msb(b, div),
                                                          nir_imm_int(b, 0));
@@ -662,16 +611,26 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder)
                                 } ENDIF
                         } ELSE {
                                 /* Single instance with a non-0 divisor: all
-                                 * accesses should point to attribute 0 */
-                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
+                                 * accesses should point to attribute 0, pick
+                                 * the biggest pot divisor.
+                                 */
+                                update_vertex_attrib_buf(builder, attrib_buf_ptr,
+                                                         MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
+                                                         nir_imm_int(b, 31), NULL);
                         } ENDIF
 
-                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
-                } ELSE IF (multi_instance) {
-                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
-                                        MALI_ATTRIBUTE_TYPE_1D_MODULUS,
-                                        builder->instance_size.packed, NULL);
-                } ENDIF ENDIF
+                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr);
+                } ELSE {
+                        IF (multi_instance) {
+                                update_vertex_attrib_buf(builder, attrib_buf_ptr,
+                                                         MALI_ATTRIBUTE_TYPE_1D_MODULUS,
+                                                         builder->instance_size.packed, NULL);
+                        } ELSE {
+                                update_vertex_attrib_buf(builder, attrib_buf_ptr,
+                                                         MALI_ATTRIBUTE_TYPE_1D,
+                                                         nir_imm_int(b, 0), NULL);
+                        } ENDIF
+                } ENDIF
 
                 nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
         }
@@ -716,19 +675,19 @@ update_varyings(struct indirect_draw_shader_builder *builder)
         nir_ssa_def *buf_ptr =
                 get_address_imm(b, builder->varyings.varying_bufs,
                                 PAN_VARY_GENERAL *
-                                pan_size(ATTRIBUTE_BUFFER));
+                                MALI_ATTRIBUTE_BUFFER_LENGTH);
         update_varying_buf(builder, buf_ptr, vertex_count);
 
         buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                   PAN_VARY_POSITION *
-                                  pan_size(ATTRIBUTE_BUFFER));
+                                  MALI_ATTRIBUTE_BUFFER_LENGTH);
         builder->varyings.pos_ptr =
                 update_varying_buf(builder, buf_ptr, vertex_count);
 
         if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                 buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                           PAN_VARY_PSIZ *
-                                          pan_size(ATTRIBUTE_BUFFER));
+                                          MALI_ATTRIBUTE_BUFFER_LENGTH);
                 builder->varyings.psiz_ptr =
                         update_varying_buf(builder, buf_ptr, vertex_count);
         }
@@ -761,14 +720,6 @@ get_invocation(struct indirect_draw_shader_builder *builder)
                                  nir_imm_int(b, 2 << 28)));
 }
 
-static nir_ssa_def *
-nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
-{
-        assert(pot != 0 && util_is_power_of_two_or_zero(pot));
-
-        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
-}
-
 /* Based on panfrost_padded_vertex_count() */
 
 static nir_ssa_def *
@@ -789,7 +740,7 @@ get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
         nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
         exp = nir_iadd(b, exp, rshift);
         base = nir_ushr(b, base, rshift);
-        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
+        base = nir_iadd(b, base, nir_bcsel(b, nir_ige(b, base, eleven), one, zero));
         rshift = nir_imax(b, nir_find_lsb(b, base), zero);
         exp = nir_iadd(b, exp, rshift);
         base = nir_ushr(b, base, rshift);
@@ -803,28 +754,10 @@ static void
 update_jobs(struct indirect_draw_shader_builder *builder)
 {
         get_invocation(builder);
-
-        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
-                update_job(builder, MALI_JOB_TYPE_VERTEX);
-
+        update_job(builder, MALI_JOB_TYPE_VERTEX);
         update_job(builder, MALI_JOB_TYPE_TILER);
 }
 
-
-static void
-set_null_job(struct indirect_draw_shader_builder *builder,
-             nir_ssa_def *job_ptr)
-{
-        nir_builder *b = &builder->b;
-        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
-        nir_ssa_def *val = load_global(b, w4, 1, 32);
-
-        /* Set job type to NULL (AKA NOOP) */
-        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
-                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
-        store_global(b, w4, val, 1);
-}
-
 static void
 get_instance_size(struct indirect_draw_shader_builder *builder)
 {
@@ -877,8 +810,8 @@ get_instance_size(struct indirect_draw_shader_builder *builder)
                         for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                 nir_ssa_def *oob =
                                         nir_ior(b,
-                                                nir_ult(b, nir_imm_int(b, i), offset),
-                                                nir_uge(b, nir_imm_int(b, i), end));
+                                                nir_ilt(b, nir_imm_int(b, i), offset),
+                                                nir_ige(b, nir_imm_int(b, i), end));
                                 nir_ssa_def *data = nir_iand_imm(b, val, mask);
 
                                 min = nir_umin(b, min,
@@ -903,7 +836,7 @@ get_instance_size(struct indirect_draw_shader_builder *builder)
 
                         nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                         for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
-                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
+                                nir_ssa_def *oob = nir_ige(b, nir_imm_int(b, i), remaining);
                                 nir_ssa_def *data = nir_iand_imm(b, val, mask);
 
                                 min = nir_umin(b, min,
@@ -936,68 +869,25 @@ patch(struct indirect_draw_shader_builder *builder)
 
         nir_ssa_def *draw_ptr = builder->draw.draw_buf;
 
+        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
+        assert(builder->draw.vertex_count->num_components);
+        builder->draw.instance_count =
+                get_draw_field(b, draw_ptr, instance_count);
+        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
         if (index_size) {
-                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
-                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
-                builder->draw.instance_count =
-                        get_indexed_draw_field(b, draw_ptr, instance_count);
-                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
-                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
-        } else {
-                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
-                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
-                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
-                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
+                builder->draw.index_bias =
+                        get_draw_field(b, draw_ptr, index_bias);
         }
 
-        assert(builder->draw.vertex_count->num_components);
-
-        nir_ssa_def *num_vertices =
-                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);
+        get_instance_size(builder);
 
-        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
-                /* If there's nothing to draw, turn the vertex/tiler jobs into
-                 * null jobs.
-                 */
-                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
-                        set_null_job(builder, builder->jobs.vertex_job);
+        builder->instance_size.padded =
+                get_padded_count(b, builder->instance_size.raw,
+                                 &builder->instance_size.packed);
 
-                set_null_job(builder, builder->jobs.tiler_job);
-        } ELSE {
-                get_instance_size(builder);
-
-                nir_ssa_def *count = builder->instance_size.raw;
-
-                /* IDVS requires padding to a multiple of 4 */
-                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
-                        count = nir_align_pot(b, count, 4);
-
-                builder->instance_size.padded =
-                        get_padded_count(b, count,
-                                         &builder->instance_size.packed);
-
-                update_varyings(builder);
-                update_jobs(builder);
-                update_vertex_attribs(builder);
-
-                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
-                        store_global(b, builder->jobs.first_vertex_sysval,
-                                     builder->jobs.offset_start, 1);
-                } ENDIF
-
-                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
-                        store_global(b, builder->jobs.base_vertex_sysval,
-                                     index_size ?
-                                     builder->draw.index_bias :
-                                     nir_imm_int(b, 0),
-                                     1);
-                } ENDIF
-
-                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
-                        store_global(b, builder->jobs.base_instance_sysval,
-                                     builder->draw.start_instance, 1);
-                } ENDIF
-        } ENDIF
+        update_varyings(builder);
+        update_jobs(builder);
+        update_vertex_attribs(builder);
 }
 
 /* Search the min/max index in the range covered by the indirect draw call */
@@ -1046,7 +936,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder)
 
         LOOP {
                 nir_ssa_def *offset = nir_load_var(b, offset_var);
-                IF (nir_uge(b, offset, end))
+                IF (nir_ige(b, offset, end))
                         BREAK;
                 ENDIF
 
@@ -1076,7 +966,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder)
                               nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
         }
 
-        IF (nir_ult(b, start, end))
+        IF (nir_ilt(b, start, end))
                 update_min(builder, nir_load_var(b, min_var));
                 update_max(builder, nir_load_var(b, max_var));
         ENDIF
@@ -1093,9 +983,7 @@ get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
                 return flags;
         }
 
-        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
-                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
-                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
+        return PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX +
                util_logbase2(index_size);
 }
 
@@ -1115,46 +1003,42 @@ create_indirect_draw_shader(struct panfrost_device *dev,
         else
                 patch(&builder);
 
-        struct panfrost_compile_inputs inputs = {
-                .gpu_id = dev->gpu_id,
-                .fixed_sysval_ubo = -1,
-                .no_ubo_to_push = true,
-        };
+        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
         struct pan_shader_info shader_info;
         struct util_dynarray binary;
 
         util_dynarray_init(&binary, NULL);
-        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);
+        pan_shader_compile(dev, b->shader, &inputs, &binary, &shader_info);
 
         assert(!shader_info.tls_size);
         assert(!shader_info.wls_size);
         assert(!shader_info.sysvals.sysval_count);
 
-        shader_info.push.count =
-                DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);
-
         unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
         struct pan_indirect_draw_shader *draw_shader =
                 &dev->indirect_draw_shaders.shaders[shader_id];
         void *state = dev->indirect_draw_shaders.states->ptr.cpu +
-                      (shader_id * pan_size(RENDERER_STATE));
+                      (shader_id * MALI_RENDERER_STATE_LENGTH);
 
         pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
         if (!draw_shader->rsd) {
                 mali_ptr address =
-                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
-                                                binary.data, binary.size,
-                                                PAN_ARCH >= 6 ? 128 : 64);
+                        panfrost_pool_upload_aligned(&dev->indirect_draw_shaders.bin_pool,
+                                                     binary.data, binary.size,
+                                                     pan_is_bifrost(dev) ? 128 : 64);
+                if (!pan_is_bifrost(dev))
+                        address |= shader_info.midgard.first_tag;
 
                 util_dynarray_fini(&binary);
 
                 pan_pack(state, RENDERER_STATE, cfg) {
-                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
+                        pan_shader_prepare_rsd(dev, &shader_info, address, &cfg);
                 }
+                /* Stay locked: push/rsd are published below; one unlock follows. */
 
                 draw_shader->push = shader_info.push;
                 draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
-                                   (shader_id * pan_size(RENDERER_STATE));
+                                   (shader_id * MALI_RENDERER_STATE_LENGTH);
         }
         pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
 
@@ -1182,7 +1066,46 @@ static mali_ptr
 get_tls(const struct panfrost_device *dev)
 {
         return dev->indirect_draw_shaders.states->ptr.gpu +
-               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
+               (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
+}
+
+/* Copy *inputs (the struct, not the pointer) into pool memory and describe it
+ * with one UNIFORM_BUFFER descriptor; returns that descriptor's GPU address. */
+static mali_ptr
+get_ubos(struct pan_pool *pool, const struct indirect_draw_inputs *inputs)
+{
+        struct panfrost_ptr inputs_buf =
+                panfrost_pool_alloc_aligned(pool, sizeof(*inputs), 16);
+
+        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));
+
+        struct panfrost_ptr ubos_buf =
+                panfrost_pool_alloc_desc(pool, UNIFORM_BUFFER);
+
+        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
+                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
+                cfg.pointer = inputs_buf.gpu;
+        }
+        return ubos_buf.gpu;
+}
+
+static mali_ptr
+get_push_uniforms(struct pan_pool *pool,
+                  const struct pan_indirect_draw_shader *shader,
+                  const struct indirect_draw_inputs *inputs)
+{
+        if (!shader->push.count)
+                return 0;
+        /* Build the push buffer: one 32-bit word per push.words[] entry. */
+        struct panfrost_ptr push_consts_buf =
+                panfrost_pool_alloc_aligned(pool, shader->push.count * 4, 16);
+        uint32_t *out = push_consts_buf.cpu;
+        uint8_t *in = (uint8_t *)inputs;
+        /* words[i].offset is applied as a byte offset into *inputs. */
+        for (unsigned i = 0; i < shader->push.count; ++i)
+                memcpy(out + i, in + shader->push.words[i].offset, 4);
+
+        return push_consts_buf.gpu;
 }
 
 static void
@@ -1193,15 +1116,15 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
                 goto out;
 
         unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
-                                  pan_size(RENDERER_STATE)) +
-                                 pan_size(LOCAL_STORAGE);
+                                  MALI_RENDERER_STATE_LENGTH) +
+                                 MALI_LOCAL_STORAGE_LENGTH;
 
         dev->indirect_draw_shaders.states =
-                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");
+                panfrost_bo_create(dev, state_bo_size, 0);
 
         /* Prepare the thread storage descriptor now since it's invariant. */
         void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
-                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
+                    (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
         pan_pack(tsd, LOCAL_STORAGE, ls) {
                 ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
         };
@@ -1215,8 +1138,7 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
          */
         dev->indirect_draw_shaders.varying_heap =
                 panfrost_bo_create(dev, 512 * 1024 * 1024,
-                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
-                                   "Indirect draw varying heap");
+                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE);
 
 out:
         pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
@@ -1227,7 +1149,8 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                    struct pan_scoreboard *scoreboard,
                                    const struct pan_indirect_draw_info *draw_info,
                                    const struct indirect_draw_inputs *inputs,
-                                   struct indirect_draw_context *draw_ctx)
+                                   struct indirect_draw_context *draw_ctx,
+                                   mali_ptr ubos)
 {
         struct panfrost_device *dev = pool->dev;
         unsigned index_size = draw_info->index_size;
@@ -1238,34 +1161,42 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool,
         mali_ptr rsd =
                 get_renderer_state(dev, draw_info->flags,
                                    draw_info->index_size, true);
+        unsigned shader_id =
+                get_shader_id(draw_info->flags, draw_info->index_size, true);
+        const struct pan_indirect_draw_shader *shader =
+                &dev->indirect_draw_shaders.shaders[shader_id];
         struct panfrost_ptr job =
-                pan_pool_alloc_desc(pool, COMPUTE_JOB);
+                panfrost_pool_alloc_desc(pool, COMPUTE_JOB);
         void *invocation =
                 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
         panfrost_pack_work_groups_compute(invocation,
                                           1, 1, 1, MIN_MAX_JOBS, 1, 1,
-                                          false, false);
+                                          false);
 
         pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                 cfg.job_task_split = 7;
         }
 
         pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
+                cfg.draw_descriptor_is_64b = true;
+                cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
                 cfg.state = rsd;
                 cfg.thread_storage = get_tls(pool->dev);
-                cfg.push_uniforms =
-                        pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
+                cfg.uniform_buffers = ubos;
+                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
         }
 
+        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);
+
         return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                 false, false, 0, 0, &job, false);
 }
 
 unsigned
-GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
-                                  struct pan_scoreboard *scoreboard,
-                                  const struct pan_indirect_draw_info *draw_info,
-                                  struct panfrost_ptr *ctx)
+panfrost_emit_indirect_draw(struct pan_pool *pool,
+                            struct pan_scoreboard *scoreboard,
+                            const struct pan_indirect_draw_info *draw_info,
+                            struct panfrost_ptr *ctx)
 {
         struct panfrost_device *dev = pool->dev;
 
@@ -1277,7 +1208,7 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
         panfrost_indirect_draw_alloc_deps(dev);
 
         struct panfrost_ptr job =
-                pan_pool_alloc_desc(pool, COMPUTE_JOB);
+                panfrost_pool_alloc_desc(pool, COMPUTE_JOB);
         mali_ptr rsd =
                 get_renderer_state(dev, draw_info->flags,
                                    draw_info->index_size, false);
@@ -1288,18 +1219,15 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
 
         struct panfrost_ptr draw_ctx_ptr = *ctx;
         if (!draw_ctx_ptr.cpu) {
-                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
-                                                      sizeof(draw_ctx),
-                                                      sizeof(mali_ptr));
+                draw_ctx_ptr = panfrost_pool_alloc_aligned(pool,
+                                                           sizeof(draw_ctx),
+                                                           sizeof(mali_ptr));
         }
 
         struct indirect_draw_inputs inputs = {
                 .draw_ctx = draw_ctx_ptr.gpu,
                 .draw_buf = draw_info->draw_buf,
                 .index_buf = draw_info->index_buf,
-                .first_vertex_sysval = draw_info->first_vertex_sysval,
-                .base_vertex_sysval = draw_info->base_vertex_sysval,
-                .base_instance_sysval = draw_info->base_instance_sysval,
                 .vertex_job = draw_info->vertex_job,
                 .tiler_job = draw_info->tiler_job,
                 .attrib_bufs = draw_info->attrib_bufs,
@@ -1312,9 +1240,9 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                 inputs.restart_index = draw_info->restart_index;
 
                 struct panfrost_ptr min_max_ctx_ptr =
-                        pan_pool_alloc_aligned(pool,
-                                               sizeof(struct min_max_context),
-                                               4);
+                        panfrost_pool_alloc_aligned(pool,
+                                                    sizeof(struct min_max_context),
+                                                    4);
                 struct min_max_context *ctx = min_max_ctx_ptr.cpu;
 
                 ctx->min = UINT32_MAX;
@@ -1322,27 +1250,37 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                 inputs.min_max_ctx = min_max_ctx_ptr.gpu;
         }
 
+        unsigned shader_id =
+                get_shader_id(draw_info->flags, draw_info->index_size, false);
+        const struct pan_indirect_draw_shader *shader =
+                &dev->indirect_draw_shaders.shaders[shader_id];
+        mali_ptr ubos = get_ubos(pool, &inputs);
+
         void *invocation =
                 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
         panfrost_pack_work_groups_compute(invocation,
                                           1, 1, 1, 1, 1, 1,
-                                          false, false);
+                                          false);
 
         pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                 cfg.job_task_split = 2;
         }
 
         pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
+                cfg.draw_descriptor_is_64b = true;
+                cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
                 cfg.state = rsd;
                 cfg.thread_storage = get_tls(pool->dev);
-                cfg.push_uniforms =
-                        pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
+                cfg.uniform_buffers = ubos;
+                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
         }
 
+        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);
+
         unsigned global_dep = draw_info->last_indirect_draw;
         unsigned local_dep =
                 panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
-                                                   &inputs, &draw_ctx);
+                                                   &inputs, &draw_ctx, ubos);
 
         if (!ctx->cpu) {
                 *ctx = draw_ctx_ptr;
@@ -1355,19 +1293,20 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
 }
 
 void
-GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
-                                          struct pan_pool *bin_pool)
+panfrost_init_indirect_draw_shaders(struct panfrost_device *dev)
 {
         /* We allocate the states and varying_heap BO lazily to avoid
          * reserving memory when indirect draws are not used.
          */
         pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
-        dev->indirect_draw_shaders.bin_pool = bin_pool;
+        panfrost_pool_init(&dev->indirect_draw_shaders.bin_pool, NULL, dev,
+                           PAN_BO_EXECUTE, false);
 }
 
 void
-GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
+panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev)
 {
+        panfrost_pool_cleanup(&dev->indirect_draw_shaders.bin_pool);
         panfrost_bo_unreference(dev->indirect_draw_shaders.states);
         panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
         pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
index 6a7737441..28bcd535d 100644
--- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
+++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
@@ -24,8 +24,6 @@
 #ifndef __PAN_INDIRECT_DRAW_SHADERS_H__
 #define __PAN_INDIRECT_DRAW_SHADERS_H__
 
-#include "genxml/gen_macros.h"
-
 struct pan_device;
 struct pan_scoreboard;
 struct pan_pool;
@@ -33,9 +31,6 @@ struct pan_pool;
 struct pan_indirect_draw_info {
         mali_ptr draw_buf;
         mali_ptr index_buf;
-        mali_ptr first_vertex_sysval;
-        mali_ptr base_vertex_sysval;
-        mali_ptr base_instance_sysval;
         mali_ptr vertex_job;
         mali_ptr tiler_job;
         mali_ptr attrib_bufs;
@@ -49,16 +44,15 @@ struct pan_indirect_draw_info {
 };
 
 unsigned
-GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
-                                  struct pan_scoreboard *scoreboard,
-                                  const struct pan_indirect_draw_info *draw_info,
-                                  struct panfrost_ptr *ctx);
+panfrost_emit_indirect_draw(struct pan_pool *pool,
+                            struct pan_scoreboard *scoreboard,
+                            const struct pan_indirect_draw_info *draw_info,
+                            struct panfrost_ptr *ctx);
 
 void
-GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
-                                          struct pan_pool *bin_pool);
+panfrost_init_indirect_draw_shaders(struct panfrost_device *dev);
 
 void
-GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev);
+panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev);
 
 #endif
diff --git a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
index bef81028b..fa6f898e5 100644
--- a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
+++ b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
@@ -26,12 +26,9 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "vk_format.h"
-#include "vk_instance.h"
-#include "vk_physical_device.h"
 #include "vk_util.h"
-#include "wsi_common_entrypoints.h"
 #include "wsi_common_private.h"
+#include "wsi_common_win32.h"
 
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wint-to-pointer-cast"      // warning: cast to pointer from integer of different size
@@ -70,37 +67,30 @@ struct wsi_win32_swapchain {
    struct wsi_win32_image     images[0];
 };
 
-VKAPI_ATTR VkBool32 VKAPI_CALL
-wsi_GetPhysicalDeviceWin32PresentationSupportKHR(VkPhysicalDevice physicalDevice,
-                                                 uint32_t queueFamilyIndex)
+VkBool32
+wsi_win32_get_presentation_support(struct wsi_device *wsi_device)
 {
    return TRUE;
 }
 
-VKAPI_ATTR VkResult VKAPI_CALL
-wsi_CreateWin32SurfaceKHR(VkInstance _instance,
-                          const VkWin32SurfaceCreateInfoKHR *pCreateInfo,
-                          const VkAllocationCallbacks *pAllocator,
-                          VkSurfaceKHR *pSurface)
+VkResult
+wsi_create_win32_surface(VkInstance instance,
+                           const VkAllocationCallbacks *allocator,
+                           const VkWin32SurfaceCreateInfoKHR *create_info,
+                           VkSurfaceKHR *surface_khr)
 {
-   VK_FROM_HANDLE(vk_instance, instance, _instance);
-   VkIcdSurfaceWin32 *surface;
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR);
-
-   surface = vk_zalloc2(&instance->alloc, pAllocator, sizeof(*surface), 8,
-                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   VkIcdSurfaceWin32 *surface = vk_zalloc(allocator, sizeof *surface, 8,
+                                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
    if (surface == NULL)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
    surface->base.platform = VK_ICD_WSI_PLATFORM_WIN32;
 
-   surface->hinstance = pCreateInfo->hinstance;
-   surface->hwnd = pCreateInfo->hwnd;
-
-   *pSurface = VkIcdSurfaceBase_to_handle(&surface->base);
+   surface->hinstance = create_info->hinstance;
+   surface->hwnd = create_info->hwnd;
 
+   *surface_khr = VkIcdSurfaceBase_to_handle(&surface->base);
    return VK_SUCCESS;
 }
 
@@ -116,24 +106,15 @@ wsi_win32_surface_get_support(VkIcdSurfaceBase *surface,
 }
 
 static VkResult
-wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf,
+wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surface,
                                 struct wsi_device *wsi_device,
                                 VkSurfaceCapabilitiesKHR* caps)
 {
-   VkIcdSurfaceWin32 *surface = (VkIcdSurfaceWin32 *)surf;
-
-   RECT win_rect;
-   if (!GetClientRect(surface->hwnd, &win_rect))
-      return VK_ERROR_SURFACE_LOST_KHR;
-
    caps->minImageCount = 1;
    /* There is no real maximum */
    caps->maxImageCount = 0;
 
-   caps->currentExtent = (VkExtent2D) {
-      win_rect.right - win_rect.left,
-      win_rect.bottom - win_rect.top
-   };
+   caps->currentExtent = (VkExtent2D) { UINT32_MAX, UINT32_MAX };
    caps->minImageExtent = (VkExtent2D) { 1, 1 };
    caps->maxImageExtent = (VkExtent2D) {
       wsi_device->maxImageDimension2D,
@@ -153,8 +134,7 @@ wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf,
       VK_IMAGE_USAGE_SAMPLED_BIT |
       VK_IMAGE_USAGE_TRANSFER_DST_BIT |
       VK_IMAGE_USAGE_STORAGE_BIT |
-      VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
-      VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
+      VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
 
    return VK_SUCCESS;
 }
@@ -228,7 +208,7 @@ wsi_win32_surface_get_formats(VkIcdSurfaceBase *icd_surface,
    for (unsigned i = 0; i < ARRAY_SIZE(sorted_formats); i++) {
       vk_outarray_append_typed(VkSurfaceFormatKHR, &out, f) {
          f->format = sorted_formats[i];
-         f->colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
+         f->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
       }
    }
 
@@ -251,7 +231,7 @@ wsi_win32_surface_get_formats2(VkIcdSurfaceBase *icd_surface,
       vk_outarray_append_typed(VkSurfaceFormat2KHR, &out, f) {
          assert(f->sType == VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR);
          f->surfaceFormat.format = sorted_formats[i];
-         f->surfaceFormat.colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
+         f->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
       }
    }
 
@@ -301,16 +281,155 @@ wsi_win32_surface_get_present_rectangles(VkIcdSurfaceBase *surface,
    return vk_outarray_status(&out);
 }
 
+/* Return the first memory type allowed by type_bits whose propertyFlags
+ * include all of props; the mask uses 1u so bit 31 is a defined shift. */
+static uint32_t
+select_memory_type(const struct wsi_device *wsi,
+                   VkMemoryPropertyFlags props, uint32_t type_bits)
+{
+   for (uint32_t i = 0; i < wsi->memory_props.memoryTypeCount; i++) {
+       const VkMemoryType type = wsi->memory_props.memoryTypes[i];
+       if ((type_bits & (1u << i)) && (type.propertyFlags & props) == props)
+         return i;
+   }
+   unreachable("No memory type found");
+}
+
+VkResult
+wsi_create_native_image(const struct wsi_swapchain *chain,
+                        const VkSwapchainCreateInfoKHR *pCreateInfo,
+                        uint32_t num_modifier_lists,
+                        const uint32_t *num_modifiers,
+                        const uint64_t *const *modifiers,
+                        struct wsi_image *image)
+{
+   const struct wsi_device *wsi = chain->wsi;
+   VkResult result;
+   /* Creates a VkImage for the swapchain and binds device-local memory to it. */
+   memset(image, 0, sizeof(*image));
+   for (int i = 0; i < ARRAY_SIZE(image->fds); i++)
+      image->fds[i] = -1;
+   /* Note: the three modifier parameters are not referenced in this body. */
+   VkImageCreateInfo image_info = {
+      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+      .flags = 0,
+      .imageType = VK_IMAGE_TYPE_2D,
+      .format = pCreateInfo->imageFormat,
+      .extent = {
+         .width = pCreateInfo->imageExtent.width,
+         .height = pCreateInfo->imageExtent.height,
+         .depth = 1,
+      },
+      .mipLevels = 1,
+      .arrayLayers = 1,
+      .samples = VK_SAMPLE_COUNT_1_BIT,
+      .tiling = VK_IMAGE_TILING_OPTIMAL,
+      .usage = pCreateInfo->imageUsage,
+      .sharingMode = pCreateInfo->imageSharingMode,
+      .queueFamilyIndexCount = pCreateInfo->queueFamilyIndexCount,
+      .pQueueFamilyIndices = pCreateInfo->pQueueFamilyIndices,
+      .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+   };
+   /* Mutable-format swapchains also chain a copy of the view-format list. */
+   VkImageFormatListCreateInfoKHR image_format_list;
+   if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR) {
+      image_info.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+                          VK_IMAGE_CREATE_EXTENDED_USAGE_BIT_KHR;
+
+      const VkImageFormatListCreateInfoKHR *format_list =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              IMAGE_FORMAT_LIST_CREATE_INFO_KHR);
+      /* NOTE(review): format_list is only checked when NDEBUG is unset. */
+#ifndef NDEBUG
+      assume(format_list && format_list->viewFormatCount > 0);
+      bool format_found = false;
+      for (int i = 0; i < format_list->viewFormatCount; i++)
+         if (pCreateInfo->imageFormat == format_list->pViewFormats[i])
+            format_found = true;
+      assert(format_found);
+#endif
+
+      image_format_list = *format_list;
+      image_format_list.pNext = NULL;
+      __vk_append_struct(&image_info, &image_format_list);
+   }
+
+   /* Any failure from here on jumps to fail:, which destroys the image. */
+   result = wsi->CreateImage(chain->device, &image_info,
+                             &chain->alloc, &image->image);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   VkMemoryRequirements reqs;
+   wsi->GetImageMemoryRequirements(chain->device, image->image, &reqs);
+   /* pNext chain: allocate -> dedicated -> export -> WSI allocate info. */
+   const struct wsi_memory_allocate_info memory_wsi_info = {
+      .sType = VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA,
+      .pNext = NULL,
+      .implicit_sync = true,
+   };
+   const VkExportMemoryAllocateInfo memory_export_info = {
+      .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+      .pNext = &memory_wsi_info,
+      .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+   };
+   const VkMemoryDedicatedAllocateInfo memory_dedicated_info = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+      .pNext = &memory_export_info,
+      .image = image->image,
+      .buffer = VK_NULL_HANDLE,
+   };
+   const VkMemoryAllocateInfo memory_info = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+      .pNext = &memory_dedicated_info,
+      .allocationSize = reqs.size,
+      .memoryTypeIndex = select_memory_type(wsi, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                                            reqs.memoryTypeBits),
+   };
+   result = wsi->AllocateMemory(chain->device, &memory_info,
+                                &chain->alloc, &image->memory);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   result = wsi->BindImageMemory(chain->device, image->image,
+                                 image->memory, 0);
+   if (result != VK_SUCCESS)
+      goto fail;
+   /* Query mip 0 / layer 0 of the color aspect to obtain the row pitch. */
+   const VkImageSubresource image_subresource = {
+      .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+      .mipLevel = 0,
+      .arrayLayer = 0,
+   };
+   VkSubresourceLayout image_layout;
+   wsi->GetImageSubresourceLayout(chain->device, image->image,
+                                  &image_subresource, &image_layout);
+   /* Record a single plane covering the whole allocation at offset 0. */
+   image->num_planes = 1;
+   image->sizes[0] = reqs.size;
+   image->row_pitches[0] = image_layout.rowPitch;
+   image->offsets[0] = 0;
+
+   return VK_SUCCESS;
+
+fail:
+   wsi_destroy_image(chain, image);
+
+   return result;
+}
+
 static VkResult
 wsi_win32_image_init(VkDevice device_h,
-                     struct wsi_win32_swapchain *chain,
-                     const VkSwapchainCreateInfoKHR *create_info,
-                     const VkAllocationCallbacks *allocator,
-                     struct wsi_win32_image *image)
+                       struct wsi_swapchain *drv_chain,
+                       const VkSwapchainCreateInfoKHR *create_info,
+                       const VkAllocationCallbacks *allocator,
+                       struct wsi_win32_image *image)
 {
-   assert(chain->base.use_buffer_blit);
-   VkResult result = wsi_create_image(&chain->base, &chain->base.image_info,
-                                      &image->base);
+   struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain;
+
+   VkResult result = wsi_create_native_image(&chain->base, create_info,
+                                             0, NULL, NULL,
+                                             &image->base);
    if (result != VK_SUCCESS)
       return result;
 
@@ -345,10 +464,13 @@ wsi_win32_image_init(VkDevice device_h,
 }
 
 static void
-wsi_win32_image_finish(struct wsi_win32_swapchain *chain,
-                       const VkAllocationCallbacks *allocator,
-                       struct wsi_win32_image *image)
+wsi_win32_image_finish(struct wsi_swapchain *drv_chain,
+                         const VkAllocationCallbacks *allocator,
+                         struct wsi_win32_image *image)
 {
+   struct wsi_win32_swapchain *chain =
+      (struct wsi_win32_swapchain *) drv_chain;
+
    DeleteDC(image->dc);
    if(image->bmp)
       DeleteObject(image->bmp);
@@ -363,7 +485,7 @@ wsi_win32_swapchain_destroy(struct wsi_swapchain *drv_chain,
       (struct wsi_win32_swapchain *) drv_chain;
 
    for (uint32_t i = 0; i < chain->base.image_count; i++)
-      wsi_win32_image_finish(chain, allocator, &chain->images[i]);
+      wsi_win32_image_finish(drv_chain, allocator, &chain->images[i]);
 
    DeleteDC(chain->chain_dc);
 
@@ -406,19 +528,37 @@ wsi_win32_queue_present(struct wsi_swapchain *drv_chain,
    struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain;
    assert(image_index < chain->base.image_count);
    struct wsi_win32_image *image = &chain->images[image_index];
+   VkResult result;
 
-   assert(chain->base.use_buffer_blit);
-
-   char *ptr = image->base.cpu_map;
+   char *ptr;
    char *dptr = image->ppvBits;
+   result = chain->base.wsi->MapMemory(chain->base.device,
+                                       image->base.memory,
+                                       0, 0, 0, (void**)&ptr);
+   /* Without a successful mapping ptr is uninitialized, so the row copy
+    * below must not run; record the failure and bail out.
+    */
+   if (result != VK_SUCCESS) {
+      chain->status = result;
+      return result;
+   }
 
    for (unsigned h = 0; h < chain->extent.height; h++) {
       memcpy(dptr, ptr, chain->extent.width * 4);
       dptr += image->bmp_row_pitch;
       ptr += image->base.row_pitches[0];
    }
-   if (!StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY))
-      chain->status = VK_ERROR_MEMORY_MAP_FAILED;
+   if(StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY))
+      result = VK_SUCCESS;
+   else
+     result = VK_ERROR_MEMORY_MAP_FAILED;
+
+   chain->base.wsi->UnmapMemory(chain->base.device, image->base.memory);
+   if (result != VK_SUCCESS)
+      chain->status = result;
+
+   if (result != VK_SUCCESS)
+      return result;
 
    return chain->status;
 }
@@ -448,13 +581,8 @@ wsi_win32_surface_create_swapchain(
    if (chain == NULL)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
-   struct wsi_cpu_image_params image_params = {
-      .base.image_type = WSI_IMAGE_TYPE_CPU,
-   };
-
    VkResult result = wsi_swapchain_init(wsi_device, &chain->base, device,
-                                        create_info, &image_params.base,
-                                        allocator);
+                                        create_info, allocator);
    if (result != VK_SUCCESS) {
       vk_free(allocator, chain);
       return result;
@@ -473,20 +601,16 @@ wsi_win32_surface_create_swapchain(
 
    chain->surface = surface;
 
-   assert(wsi_device->sw);
-   chain->base.use_buffer_blit = true;
-
    for (uint32_t image = 0; image < chain->base.image_count; image++) {
-      result = wsi_win32_image_init(device, chain,
-                                    create_info, allocator,
-                                    &chain->images[image]);
+      result = wsi_win32_image_init(device, &chain->base,
+                                      create_info, allocator,
+                                      &chain->images[image]);
       if (result != VK_SUCCESS) {
          while (image > 0) {
             --image;
-            wsi_win32_image_finish(chain, allocator,
-                                   &chain->images[image]);
+            wsi_win32_image_finish(&chain->base, allocator,
+                                     &chain->images[image]);
          }
-         wsi_swapchain_finish(&chain->base);
          vk_free(allocator, chain);
          goto fail_init_images;
       }