diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/ISA.xml | 1193 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_builder.h.py | 93 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_layout.c | 30 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c | 233 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py | 20 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py | 19 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c | 71 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c | 223 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_packer.c.py | 12 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_printer.c.py | 38 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bi_scoreboard.c | 255 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/bifrost_isa.py | 49 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/bifrost/gen_disasm.py | 4 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/lib/pan_indirect_draw.c | 565 | ||||
-rw-r--r-- | lib/mesa/src/panfrost/lib/pan_indirect_draw.h | 18 | ||||
-rw-r--r-- | lib/mesa/src/vulkan/wsi/wsi_common_win32.c | 262 |
16 files changed, 654 insertions, 2431 deletions
diff --git a/lib/mesa/src/panfrost/bifrost/ISA.xml b/lib/mesa/src/panfrost/bifrost/ISA.xml index f1e908331..b5965fd3c 100644 --- a/lib/mesa/src/panfrost/bifrost/ISA.xml +++ b/lib/mesa/src/panfrost/bifrost/ISA.xml @@ -1986,7 +1986,7 @@ <src start="0" mask="0xfb"/> </ins> - <ins name="*NOP" mask="0x7fffff" exact="0x701963" dests="0"/> + <ins name="*NOP.i32" mask="0x7fffff" exact="0x701963"/> <ins name="*POPCOUNT.i32" mask="0x7ffff8" exact="0x73c6d8"> <src start="0" mask="0xfb"/> @@ -2036,7 +2036,6 @@ <opt>not</opt> <opt>none</opt> </mod> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> </ins> <ins name="*RSHIFT_AND.v2i16"> @@ -2057,7 +2056,6 @@ <opt>not</opt> <opt>none</opt> </mod> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <encoding mask="0x7f3800" exact="0x300800"> <or> <eq left="lanes2" right="#b00"/> @@ -2091,7 +2089,6 @@ <src start="0" mask="0xfb"/> <src start="3" mask="0xfb"/> <src start="6"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <mod name="lanes2" size="3" default="b0123"> <opt>b0123</opt> <opt>b0000</opt> @@ -2147,7 +2144,6 @@ <opt>not</opt> <opt>none</opt> </mod> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <mod name="not_result" start="15" size="1" opt="not"/> </ins> @@ -2169,7 +2165,6 @@ <opt>none</opt> </mod> <mod name="not_result" start="15" size="1" opt="not"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <encoding mask="0x7f3800" exact="0x302800"> <or> <eq left="lanes2" right="#b00"/> @@ -2215,7 +2210,6 @@ <opt>none</opt> </mod> <mod name="not_result" start="15" size="1" opt="not"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <encoding mask="0x7f3800" exact="0x302000"> <neq left="lanes2" right="#b0123"/> <derived start="9" size="2"> @@ -2241,7 +2235,6 @@ <opt>b3</opt> </mod> <mod name="not_result" start="13" size="1" opt="not"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> </ins> <ins name="*RSHIFT_XOR.v2i16"> @@ -2258,7 +2251,6 @@ <opt>b02</opt> </mod> <mod name="not_result" start="13" size="1" opt="not"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <encoding mask="0x7fd800" exact="0x320800"> <or> <eq left="lanes2" right="#b00"/> @@ -2300,7 +2292,6 @@ <opt>b3333</opt> </mod> <mod name="not_result" start="13" size="1" opt="not"/> - <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/> <encoding mask="0x7fd800" exact="0x320000"> <neq left="lanes2" right="#b0123"/> <derived start="9" size="2"> @@ -2429,7 +2420,6 @@ <opt>rtz</opt> <opt>rtna</opt> </mod> - <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/> <derived start="6" size="1"> <and> <eq left="abs0" right="#none"/> @@ -2496,9 +2486,6 @@ <ins name="+ATEST" staging="w=1" mask="0xfff00" exact="0xc8f00" message="atest" table="true"> <src start="0" mask="0xf7"/> <src start="3" mask="0xf7"/> - <!-- ATEST parameter datum. Implicitly encoded into the tuple on Bifrost. - Real source on Valhall. --> - <src start="6" pseudo="true"/> <mod name="widen1" start="6" size="2"> <reserved/> <opt>none</opt> @@ -2533,22 +2520,8 @@ <src start="0"/> <src start="3" mask="0xf7"/> <src start="6" mask="0xf7"/> - <!-- pseudo source for a dual source blend input --> - <src start="9" pseudo="true"/> <!-- not actually encoded, but used for IR --> <immediate name="sr_count" size="4" pseudo="true"/> - <immediate name="sr_count_2" size="4" pseudo="true"/> - <mod name="register_format" size="4" pseudo="true"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - <opt>f64</opt> - <opt>i64</opt> - <opt>auto</opt> - </mod> </ins> <ins name="+BRANCH.f16" mask="0xf8000" exact="0x68000" last="true" dests="0"> @@ -3716,12 +3689,12 @@ <src start="6" mask="0xf7"/> </ins> - <ins name="+CLPER_OLD.i32" mask="0xfffc0" exact="0x3f0c0"> + <ins name="+CLPER_V6.i32" mask="0xfffc0" exact="0x3f0c0"> <src start="0" mask="0x7"/> <src start="3"/> </ins> - <ins name="+CLPER.i32" mask="0xfc000" exact="0x7c000"> + <ins name="+CLPER_V7.i32" mask="0xfc000" exact="0x7c000"> <src start="0" mask="0x7"/> <src start="3"/> <mod name="lane_op" start="6" size="2"> @@ -3734,7 +3707,6 @@ <opt>subgroup2</opt> <opt>subgroup4</opt> <opt>subgroup8</opt> - <opt pseudo="true">subgroup16</opt> <!-- Only on Valhall --> </mod> <mod name="inactive_result" start="10" size="4"> <opt>zero</opt> @@ -3874,7 +3846,6 @@ <opt>h0</opt> <opt>h1</opt> </mod> - <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/> </ins> <ins name="+F16_TO_S32"> @@ -6195,7 +6166,7 @@ <src start="6" mask="0xf7"/> </ins> - <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" message="job" dests="0"> + <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" unused="true" message="job_management"> <src start="0"/> </ins> @@ -6381,7 +6352,7 @@ </mod> </ins> - <ins name="+LD_TILE" staging="w=format" mask="0xff800" exact="0xcb000" message="tile"> + <ins name="+LD_TILE" staging="w=vecsize" mask="0xff800" exact="0xcb000" message="tile"> <src start="0"/> <src start="3"/> <src start="6" mask="0xf7"/> @@ -6391,15 +6362,9 @@ <opt>v3</opt> <opt>v4</opt> </mod> - <mod name="register_format" size="3" pseudo="true"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>s32</opt> - </mod> </ins> - <ins name="+LD_VAR" staging="w=format" message="varying"> + <ins name="+LD_VAR" staging="w=vecsize" message="varying"> <src start="0"/> <src start="3"/> <mod name="vecsize" start="8" size="2"> @@ -7105,7 +7070,6 @@ <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i16" staging="w=1" message="load"> @@ -7121,7 +7085,7 @@ <reserved/> <opt>tl</opt> </mod> - <mod name="lane_dest" size="2" default="h0"> + <mod name="lane0" size="2" default="h0"> <opt>h0</opt> <opt>h1</opt> <opt>w0</opt> @@ -7136,19 +7100,19 @@ <and> <eq left="extend" right="#none"/> <or> - <eq left="lane_dest" right="#h0"/> - <eq left="lane_dest" right="#h1"/> + <eq left="lane0" right="#h0"/> + <eq left="lane0" right="#h1"/> </or> </and> <derived start="9" size="1"> - <eq left="lane_dest" right="#h0"/> - <eq left="lane_dest" right="#h1"/> + <eq left="lane0" right="#h0"/> + <eq left="lane0" right="#h1"/> </derived> </encoding> <encoding mask="0xffc00" exact="0x63000"> <and> <neq left="extend" right="#none"/> - <eq left="lane_dest" right="#w0"/> + <eq left="lane0" right="#w0"/> </and> <derived start="9" size="1"> <eq left="extend" right="#sext"/> @@ -7158,14 +7122,13 @@ <encoding mask="0xffc00" exact="0x61800"> <and> <neq left="extend" right="#none"/> - <eq left="lane_dest" right="#d0"/> + <eq left="lane0" right="#d0"/> </and> <derived start="9" size="1"> <eq left="extend" right="#sext"/> <eq left="extend" right="#zext"/> </derived> </encoding> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i24" staging="w=1" mask="0xffe00" exact="0x65000" message="load"> @@ -7181,7 +7144,6 @@ <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i32" staging="w=1" message="load"> @@ -7197,7 +7159,7 @@ <reserved/> <opt>tl</opt> </mod> - <mod name="lane_dest" size="1" opt="d0"/> + <mod name="lane0" size="1" opt="d0"/> <mod name="extend" size="2"> <opt>none</opt> <opt>sext</opt> @@ -7206,20 +7168,19 @@ <encoding mask="0xffe00" exact="0x60c00"> <and> <eq left="extend" right="#none"/> - <eq left="lane_dest" right="#none"/> + <eq left="lane0" right="#none"/> </and> </encoding> <encoding mask="0xffc00" exact="0x61c00"> <and> <neq left="extend" right="#none"/> - <eq left="lane_dest" right="#d0"/> + <eq left="lane0" right="#d0"/> </and> <derived start="9" size="1"> <eq left="extend" right="#sext"/> <eq left="extend" right="#zext"/> </derived> </encoding> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i48" staging="w=2" mask="0xffe00" exact="0x65200" message="load"> @@ -7235,7 +7196,6 @@ <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i64" staging="w=2" mask="0xffe00" exact="0x60e00" message="load"> @@ -7251,7 +7211,6 @@ <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i8" staging="w=1" message="load"> @@ -7267,7 +7226,7 @@ <reserved/> <opt>tl</opt> </mod> - <mod name="lane_dest" size="3" default="b0"> + <mod name="lane0" size="3" default="b0"> <opt>b0</opt> <opt>b1</opt> <opt>b2</opt> @@ -7286,25 +7245,25 @@ <and> <eq left="extend" right="#none"/> <or> - <eq left="lane_dest" right="#b0"/> - <eq left="lane_dest" right="#b1"/> - <eq left="lane_dest" right="#b2"/> - <eq left="lane_dest" right="#b3"/> + <eq left="lane0" right="#b0"/> + <eq left="lane0" right="#b1"/> + <eq left="lane0" right="#b2"/> + <eq left="lane0" right="#b3"/> </or> </and> <derived start="9" size="2"> - <eq left="lane_dest" right="#b0"/> - <eq left="lane_dest" right="#b1"/> - <eq left="lane_dest" right="#b2"/> - <eq left="lane_dest" right="#b3"/> + <eq left="lane0" right="#b0"/> + <eq left="lane0" right="#b1"/> + <eq left="lane0" right="#b2"/> + <eq left="lane0" right="#b3"/> </derived> </encoding> <encoding mask="0xff800" exact="0x63800"> <and> <neq left="extend" right="#none"/> <or> - <eq left="lane_dest" right="#h0"/> - <eq left="lane_dest" right="#h1"/> + <eq left="lane0" right="#h0"/> + <eq left="lane0" right="#h1"/> </or> </and> <derived start="9" size="1"> @@ -7312,14 +7271,14 @@ <eq left="extend" right="#zext"/> </derived> <derived start="10" size="1"> - <eq left="lane_dest" right="#h0"/> - <eq left="lane_dest" right="#h1"/> + <eq left="lane0" right="#h0"/> + <eq left="lane0" right="#h1"/> </derived> </encoding> <encoding mask="0xffc00" exact="0x63400"> <and> <neq left="extend" right="#none"/> - <eq left="lane_dest" right="#w0"/> + <eq left="lane0" right="#w0"/> </and> <derived start="9" size="1"> <eq left="extend" right="#sext"/> @@ -7329,14 +7288,13 @@ <encoding mask="0xffc00" exact="0x61400"> <and> <neq left="extend" right="#none"/> - <eq left="lane_dest" right="#d0"/> + <eq left="lane0" right="#d0"/> </and> <derived start="9" size="1"> <eq left="extend" right="#sext"/> <eq left="extend" right="#zext"/> </derived> </encoding> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOAD.i96" staging="w=3" mask="0xffe00" exact="0x65400" message="load"> @@ -7352,7 +7310,6 @@ <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+LOGB.f32" mask="0xfffe0" exact="0x3d9a0"> @@ -7438,7 +7395,7 @@ </mod> </ins> - <ins name="+NOP" mask="0xfffff" exact="0x3d964" dests="0"/> + <ins name="+NOP.i32" mask="0xfffff" exact="0x3d964"/> <ins name="+QUIET.f32" mask="0xffff8" exact="0x3d970"> <src start="0"/> @@ -7562,12 +7519,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i16" staging="r=1" mask="0xffe00" exact="0x62800" message="store" dests="0"> @@ -7578,12 +7534,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i24" staging="r=1" mask="0xffe00" exact="0x65800" message="store" dests="0"> @@ -7594,12 +7549,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i32" staging="r=1" mask="0xffe00" exact="0x62c00" message="store" dests="0"> @@ -7610,12 +7564,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i48" staging="r=2" mask="0xffe00" exact="0x65a00" message="store" dests="0"> @@ -7626,12 +7579,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i64" staging="r=2" mask="0xffe00" exact="0x62e00" message="store" dests="0"> @@ -7642,12 +7594,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i8" staging="r=1" mask="0xffe00" exact="0x62000" message="store" dests="0"> @@ -7658,12 +7609,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+STORE.i96" staging="r=3" mask="0xffe00" exact="0x65c00" message="store" dests="0"> @@ -7674,12 +7624,11 @@ <opt>none</opt> <opt>wls</opt> <opt>stream</opt> - <opt pseudo="true">pos</opt> - <opt pseudo="true">vary</opt> + <reserved/> + <reserved/> <reserved/> <opt>tl</opt> </mod> - <immediate name="byte_offset" size="16" pseudo="true"/> </ins> <ins name="+ST_CVT" staging="r=format" mask="0xff800" exact="0xc9800" message="store" dests="0"> @@ -7704,7 +7653,7 @@ </mod> </ins> - <ins name="+ST_TILE" staging="r=format" mask="0xff800" exact="0xcb800" message="tile" dests="0"> + <ins name="+ST_TILE" staging="r=vecsize" mask="0xff800" exact="0xcb800" message="tile" dests="0"> <src start="0"/> <src start="3"/> <src start="6" mask="0xf7"/> @@ -7714,12 +7663,6 @@ <opt>v3</opt> <opt>v4</opt> </mod> - <mod name="register_format" size="3" pseudo="true"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>s32</opt> - </mod> </ins> <ins name="+SWZ.v2i16" mask="0xfffc8" exact="0x3d948"> @@ -7753,27 +7696,6 @@ <mod name="skip" start="9" size="1" opt="skip"/> <!-- not actually encoded, but used for IR --> <immediate name="sr_count" size="4" pseudo="true"/> - <immediate name="sr_count_2" size="4" pseudo="true"/> - <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true"> - <opt>computed_lod</opt> - <opt>zero_lod</opt> - </mod> - </ins> - - <!-- Pseudo instruction representing dual texturing on Bifrost. Lowered to - TEXC after register allocation, when the second destination register can - be combined with the texture operation descriptor. --> - <ins name="+TEXC_DUAL" staging="rw=sr_count" pseudo="true" message="tex" dests="2"> - <src start="0"/> - <src start="3"/> - <src start="6" mask="0xf7"/> - <mod name="skip" start="9" size="1" opt="skip"/> - <immediate name="sr_count" size="4" pseudo="true"/> - <immediate name="sr_count_2" size="4" pseudo="true"/> - <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true"> - <opt>computed_lod</opt> - <opt>zero_lod</opt> - </mod> </ins> <ins name="+TEXS_2D.f16" staging="w=2" mask="0xfc000" exact="0xd8000" message="tex"> @@ -7959,7 +7881,6 @@ <opt>rtz</opt> <opt>rtna</opt> </mod> - <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/> <derived start="6" size="1"> <and> <eq left="abs0" right="#none"/> @@ -8261,11 +8182,11 @@ <mod name="preserve_null" size="1" opt="preserve_null"/> </ins> - <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX. Real Valhall instructions. --> - <ins name="+ATOM_RETURN.i32" pseudo="true" staging="rw=sr_count" message="atomic"> + <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX --> + <ins name="+PATOM_C.i32" pseudo="true" staging="rw=sr_count" message="atomic"> <src start="0"/> <src start="3"/> - <mod name="atom_opc" start="9" size="5"> + <mod name="atom_opc" start="9" size="4"> <reserved/> <reserved/> <opt>aadd</opt> @@ -8281,14 +8202,10 @@ <opt>aand</opt> <opt>aor</opt> <opt>axor</opt> - <opt>axchg</opt> <!-- For Valhall --> - <opt>acmpxchg</opt> <!-- For Valhall --> </mod> - <!-- not actually encoded, but used for IR --> - <immediate name="sr_count" size="4" pseudo="true"/> </ins> - <ins name="+ATOM1_RETURN.i32" pseudo="true" staging="w=sr_count" message="atomic"> + <ins name="+PATOM_C1.i32" pseudo="true" staging="w=sr_count" message="atomic"> <src start="0"/> <src start="3"/> <mod name="atom_opc" start="6" size="3"> @@ -8298,32 +8215,6 @@ <opt>asmax1</opt> <opt>aor1</opt> </mod> - <!-- not actually encoded, but used for IR --> - <immediate name="sr_count" size="4" pseudo="true"/> - </ins> - - <ins name="+ATOM.i32" pseudo="true" staging="r=sr_count" message="atomic"> - <src start="0"/> - <src start="3"/> - <mod name="atom_opc" start="9" size="4"> - <reserved/> - <reserved/> - <opt>aadd</opt> - <reserved/> - <reserved/> - <reserved/> - <reserved/> - <reserved/> - <opt>asmin</opt> - <opt>asmax</opt> - <opt>aumin</opt> - <opt>aumax</opt> - <opt>aand</opt> - <opt>aor</opt> - <opt>axor</opt> - </mod> - <!-- not actually encoded, but used for IR --> - <immediate name="sr_count" size="4" pseudo="true"/> </ins> <!-- *CUBEFACE1/+CUBEFACE2 pair, two destinations, scheduler lowered --> @@ -8336,982 +8227,4 @@ <mod name="neg2" size="1" opt="neg"/> </ins> - <ins name="+IADD_IMM.i32" pseudo="true"> - <src start="0"/> - <immediate name="index" size="32"/> - </ins> - - <ins name="+IADD_IMM.v2i16" pseudo="true"> - <src start="0"/> - <immediate name="index" size="32"/> - </ins> - - <ins name="+IADD_IMM.v4i8" pseudo="true"> - <src start="0"/> - <immediate name="index" size="32"/> - </ins> - - <ins name="+FADD_IMM.f32" pseudo="true"> - <src start="0"/> - <immediate name="index" size="32"/> - </ins> - - <ins name="+FADD_IMM.v2f16" pseudo="true"> - <src start="0"/> - <immediate name="index" size="32"/> - </ins> - - <ins name="*FABSNEG.f32" pseudo="true"> - <src start="0" mask="0xfb"/> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="abs0" start="12" size="1" opt="abs"/> - <mod name="widen0" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - </ins> - - <ins name="*FABSNEG.v2f16" pseudo="true"> - <src start="0" mask="0xfb"/> - <mod name="abs0" size="1" opt="abs"/> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="swz0" start="9" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - </ins> - - <ins name="*FCLAMP.f32" pseudo="true"> - <src start="0" mask="0xfb"/> - <mod name="clamp" start="15" size="2"> - <opt>none</opt> - <opt>clamp_0_inf</opt> - <opt>clamp_m1_1</opt> - <opt>clamp_0_1</opt> - </mod> - </ins> - - <ins name="*FCLAMP.v2f16" pseudo="true"> - <src start="0" mask="0xfb"/> - <mod name="clamp" start="15" size="2"> - <opt>none</opt> - <opt>clamp_0_inf</opt> - <opt>clamp_m1_1</opt> - <opt>clamp_0_1</opt> - </mod> - </ins> - - <ins name="+DISCARD.b32" pseudo="true" dests="0"> - <src start="0"/> - <mod name="widen0" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - </ins> - - <ins name="+TEX_SINGLE" staging="rw=sr_count" message="tex" pseudo="true"> - <src start="0"/> - <src start="1"/> - <immediate name="sr_count" size="4" pseudo="true"/> - <mod name="texel_offset" start="9" size="1" opt="texel_offset"/> - <mod name="skip" start="9" size="1" opt="skip"/> - <mod name="shadow" start="9" size="1" opt="shadow"/> - <mod name="array_enable" start="9" size="1" opt="array_enable"/> - <mod name="dimension" start="9" size="2"> - <opt>1d</opt> - <opt>2d</opt> - <opt>3d</opt> - <opt>cube</opt> - </mod> - <mod name="write_mask" start="9" size="4"> - <opt>none</opt> - <opt>r</opt> - <opt>g</opt> - <opt>rg</opt> - <opt>b</opt> - <opt>rb</opt> - <opt>gb</opt> - <opt>rgb</opt> - <opt>a</opt> - <opt>ra</opt> - <opt>ga</opt> - <opt>rga</opt> - <opt>ba</opt> - <opt>rba</opt> - <opt>gba</opt> - <opt>rgba</opt> - </mod> - <mod name="va_lod_mode" start="13" size="3" default="zero_lod"> - <opt>zero_lod</opt> - <opt>computed_lod</opt> - <opt>explicit</opt> - <opt>computed_bias</opt> - <opt>grdesc</opt> - </mod> - <mod name="register_format" size="4"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - </mod> - </ins> - - <ins name="+TEX_FETCH" staging="rw=sr_count" message="tex" pseudo="true"> - <src start="0"/> - <src start="1"/> - <immediate name="sr_count" size="4" pseudo="true"/> - <mod name="texel_offset" start="9" size="1" opt="texel_offset"/> - <mod name="skip" start="9" size="1" opt="skip"/> - <mod name="array_enable" start="9" size="1" opt="array_enable"/> - <mod name="dimension" start="9" size="2"> - <opt>1d</opt> - <opt>2d</opt> - <opt>3d</opt> - <opt>cube</opt> - </mod> - <mod name="write_mask" start="9" size="4"> - <opt>none</opt> - <opt>r</opt> - <opt>g</opt> - <opt>rg</opt> - <opt>b</opt> - <opt>rb</opt> - <opt>gb</opt> - <opt>rgb</opt> - <opt>a</opt> - <opt>ra</opt> - <opt>ga</opt> - <opt>rga</opt> - <opt>ba</opt> - <opt>rba</opt> - <opt>gba</opt> - <opt>rgba</opt> - </mod> - <mod name="register_format" size="4"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - </mod> - </ins> - - <ins name="+TEX_GATHER" staging="rw=sr_count" message="tex" pseudo="true"> - <src start="0"/> - <src start="1"/> - <immediate name="sr_count" size="4" pseudo="true"/> - <mod name="texel_offset" start="9" size="1" opt="texel_offset"/> - <mod name="skip" start="9" size="1" opt="skip"/> - <mod name="shadow" start="9" size="1" opt="shadow"/> - <mod name="array_enable" start="9" size="1" opt="array_enable"/> - <mod name="integer_coordinates" start="9" size="1" opt="integer_coordinates"/> - <mod name="fetch_component" start="9" size="2"> - <opt>gather4_r</opt> - <opt>gather4_g</opt> - <opt>gather4_b</opt> - <opt>gather4_a</opt> - </mod> - <mod name="dimension" start="9" size="2"> - <opt>1d</opt> - <opt>2d</opt> - <opt>3d</opt> - <opt>cube</opt> - </mod> - <mod name="write_mask" start="9" size="4"> - <opt>none</opt> - <opt>r</opt> - <opt>g</opt> - <opt>rg</opt> - <opt>b</opt> - <opt>rb</opt> - <opt>gb</opt> - <opt>rgb</opt> - <opt>a</opt> - <opt>ra</opt> - <opt>ga</opt> - <opt>rga</opt> - <opt>ba</opt> - <opt>rba</opt> - <opt>gba</opt> - <opt>rgba</opt> - </mod> - <mod name="register_format" size="4"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - </mod> - </ins> - - <ins name="+CUBEFACE2_V9" pseudo="true"> - <src start="0" mask="0xfb"/> - <src start="3" mask="0xfb"/> - <src start="6"/> - <mod name="neg0" size="1" opt="neg"/> - <mod name="neg1" size="1" opt="neg"/> - <mod name="neg2" size="1" opt="neg"/> - </ins> - - <ins name="+LD_VAR_BUF_IMM.f32" staging="w=format" message="varying" pseudo="true"> - <src start="0"/> - <immediate name="index" start="3" size="5"/> - <mod name="vecsize" start="8" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - <mod name="update" size="2"> - <opt>store</opt> - <opt>retrieve</opt> - <opt>conditional</opt> - <opt>clobber</opt> - </mod> - <mod name="register_format" size="2"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>u16</opt> - </mod> - <mod name="source_format" size="2"> - <opt>flat32</opt> - <opt>flat16</opt> - <opt>f32</opt> - <opt>f16</opt> - </mod> - <mod name="sample" size="3"> - <opt>center</opt> - <opt>centroid</opt> - <opt>sample</opt> - <opt>explicit</opt> - <opt>none</opt> - </mod> - </ins> - - <ins name="+LD_VAR_BUF.f32" staging="w=format" message="varying" pseudo="true"> - <src start="0"/> - <src start="1"/> - <mod name="vecsize" start="8" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - <mod name="update" size="2"> - <opt>store</opt> - <opt>retrieve</opt> - <opt>conditional</opt> - <opt>clobber</opt> - </mod> - <mod name="register_format" size="2"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>u16</opt> - </mod> - <mod name="source_format" size="2"> - <opt>flat32</opt> - <opt>flat16</opt> - <opt>f32</opt> - <opt>f16</opt> - </mod> - <mod name="sample" size="3"> - <opt>center</opt> - <opt>centroid</opt> - <opt>sample</opt> - <opt>explicit</opt> - <opt>none</opt> - </mod> - </ins> - - <ins name="+LD_VAR_BUF_IMM.f16" staging="w=format" message="varying" pseudo="true"> - <src start="0"/> - <immediate name="index" start="3" size="5"/> - <mod name="vecsize" start="8" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - <mod name="update" size="2"> - <opt>store</opt> - <opt>retrieve</opt> - <opt>conditional</opt> - <opt>clobber</opt> - </mod> - <mod name="register_format" size="2"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>u16</opt> - </mod> - <mod name="source_format" size="2"> - <opt>flat32</opt> - <opt>flat16</opt> - <opt>f32</opt> - <opt>f16</opt> - </mod> - <mod name="sample" size="3"> - <opt>center</opt> - <opt>centroid</opt> - <opt>sample</opt> - <opt>explicit</opt> - <opt>none</opt> - </mod> - </ins> - - <ins name="+LD_VAR_BUF.f16" staging="w=format" message="varying" pseudo="true"> - <src start="0"/> - <src start="1"/> - <mod name="vecsize" start="8" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - <mod name="update" size="2"> - <opt>store</opt> - <opt>retrieve</opt> - <opt>conditional</opt> - <opt>clobber</opt> - </mod> - <mod name="register_format" size="2"> - <opt>f32</opt> - <opt>f16</opt> - <opt>u32</opt> - <opt>u16</opt> - </mod> - <mod name="source_format" size="2"> - <opt>flat32</opt> - <opt>flat16</opt> - <opt>f32</opt> - <opt>f16</opt> - </mod> - <mod name="sample" size="3"> - <opt>center</opt> - <opt>centroid</opt> - <opt>sample</opt> - <opt>explicit</opt> - <opt>none</opt> - </mod> - </ins> - - <ins name="+LEA_BUF_IMM" staging="w=2" message="attribute" pseudo="true"> - <src start="0"/> - </ins> - - <ins name="+LD_BUFFER.i128" staging="w=4" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - </ins> - - <ins name="+LD_BUFFER.i16" staging="w=1" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - <mod name="lane_dest" size="2" default="h0"> - <opt>h0</opt> - <opt>h1</opt> - <opt>w0</opt> - <opt>d0</opt> - </mod> - <mod name="extend" size="2"> - <opt>none</opt> - <opt>sext</opt> - <opt>zext</opt> - </mod> - </ins> - - <ins name="+LD_BUFFER.i24" staging="w=1" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - </ins> - - <ins name="+LD_BUFFER.i32" staging="w=1" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - <mod name="lane_dest" size="1" opt="d0"/> - <mod name="extend" size="2"> - <opt>none</opt> - <opt>sext</opt> - <opt>zext</opt> - </mod> - </ins> - - <ins name="+LD_BUFFER.i48" staging="w=2" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - </ins> - - <ins name="+LD_BUFFER.i64" staging="w=2" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - </ins> - - <ins name="+LD_BUFFER.i8" staging="w=1" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - <mod name="lane_dest" size="3" default="b0"> - <opt>b0</opt> - <opt>b1</opt> - <opt>b2</opt> - <opt>b3</opt> - <opt>h0</opt> - <opt>h1</opt> - <opt>w0</opt> - <opt>d0</opt> - </mod> - <mod name="extend" size="2"> - <opt>none</opt> - <opt>sext</opt> - <opt>zext</opt> - </mod> - </ins> - - <ins name="+LD_BUFFER.i96" staging="w=3" pseudo="true" message="load"> - <src start="0"/> - <src start="3"/> - </ins> - - <ins name="+BRANCHZI" pseudo="true" last="true" dests="0"> - <src start="0"/> - <src start="6" mask="0xf7"/> - <mod name="cmpf" size="1"> - <opt>eq</opt> - <opt>ne</opt> - </mod> - </ins> - - <ins name="+LD_TEX" pseudo="true" staging="w=format" message="attribute"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="register_format" size="4"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - <opt>f64</opt> - <opt>i64</opt> - <opt>auto</opt> - </mod> - <mod name="vecsize" start="11" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - </ins> - - <ins name="+LD_TEX_IMM" pseudo="true" staging="w=format" message="attribute"> - <src start="0"/> - <src start="3"/> - <immediate name="texture_index" start="6" size="4"/> - <mod name="register_format" size="4"> - <opt>f16</opt> - <opt>f32</opt> - <opt>s32</opt> - <opt>u32</opt> - <opt>s16</opt> - <opt>u16</opt> - <opt>f64</opt> - <opt>i64</opt> - <opt>auto</opt> - </mod> - <mod name="vecsize" start="11" size="2"> - <opt>none</opt> - <opt>v2</opt> - <opt>v3</opt> - <opt>v4</opt> - </mod> - </ins> - - <ins name="*MKVEC.v2i8" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="lane0" start="12" size="2" default="b0"> - <opt>b0</opt> - <opt>b1</opt> - <opt>b2</opt> - <opt>b3</opt> - </mod> - <mod name="lane1" start="13" size="2" default="b0"> - <opt>b0</opt> - <opt>b1</opt> - <opt>b2</opt> - <opt>b3</opt> - </mod> - </ins> - - <ins name="+PHI" pseudo="true" variable_srcs="true"/> - - <ins name="+COLLECT.i32" pseudo="true" variable_srcs="true"/> - - <ins name="+SPLIT.i32" pseudo="true" variable_dests="true"> - <src start="0"/> - </ins> - - <ins name="*FCMP_OR.f32" pseudo="true"> - <src start="0" mask="0xfb"/> - <src start="3" mask="0xfb"/> - <src start="6" mask="0xfb"/> - <mod name="widen0" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - <mod name="widen1" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - <mod name="abs1" start="6" size="1" opt="abs"/> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="neg1" start="8" size="1" opt="neg"/> - <mod name="abs0" start="12" size="1" opt="abs"/> - <mod name="cmpf" start="13" size="3"> - <opt>eq</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>ne</opt> - <opt>lt</opt> - <opt>le</opt> - <opt>gtlt</opt> - <opt>total</opt> - </mod> - <mod name="result_type" start="16" size="2" default="i1"> - <opt>i1</opt> - <opt>f1</opt> - <opt>m1</opt> - </mod> - </ins> - - <ins name="*FCMP_OR.v2f16" pseudo="true"> - <src start="0" mask="0xfb"/> - <src start="3" mask="0xfb"/> - <src start="6" mask="0xfb"/> - <mod name="abs0" size="1" opt="abs"/> - <mod name="abs1" size="1" opt="abs"/> - <mod name="cmpf" size="3"> - <opt>eq</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>ne</opt> - <opt>lt</opt> - <opt>le</opt> - <opt>gtlt</opt> - <opt>total</opt> - </mod> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="neg1" start="8" size="1" opt="neg"/> - <mod name="swz0" start="9" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="11" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="16" size="2" default="i1"> - <opt>i1</opt> - <opt>f1</opt> - <opt>m1</opt> - </mod> - </ins> - - <ins name="*FCMP_AND.f32" pseudo="true"> - <src start="0" mask="0xfb"/> - <src start="3" mask="0xfb"/> - <src start="6" mask="0xfb"/> - <mod name="widen0" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - <mod name="widen1" size="2"> - <opt>none</opt> - <opt>h0</opt> - <opt>h1</opt> - </mod> - <mod name="abs1" start="6" size="1" opt="abs"/> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="neg1" start="8" size="1" opt="neg"/> - <mod name="abs0" start="12" size="1" opt="abs"/> - <mod name="cmpf" start="13" size="3"> - <opt>eq</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>ne</opt> - <opt>lt</opt> - <opt>le</opt> - <opt>gtlt</opt> - <opt>total</opt> - </mod> - <mod name="result_type" start="16" size="2" default="i1"> - <opt>i1</opt> - <opt>f1</opt> - <opt>m1</opt> - </mod> - </ins> - - <ins name="*FCMP_AND.v2f16" pseudo="true"> - <src start="0" mask="0xfb"/> - <src start="3" mask="0xfb"/> - <src start="6" mask="0xfb"/> - <mod name="abs0" size="1" opt="abs"/> - <mod name="abs1" size="1" opt="abs"/> - <mod name="cmpf" size="3"> - <opt>eq</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>ne</opt> - <opt>lt</opt> - <opt>le</opt> - <opt>gtlt</opt> - <opt>total</opt> - </mod> - <mod name="neg0" start="7" size="1" opt="neg"/> - <mod name="neg1" start="8" size="1" opt="neg"/> - <mod name="swz0" start="9" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="11" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="16" size="2" default="i1"> - <opt>i1</opt> - <opt>f1</opt> - <opt>m1</opt> - </mod> - </ins> - - <ins name="+ICMP_MULTI.s32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_MULTI.u32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_OR.s32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_OR.u32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_OR.v2s16" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="swz0" start="6" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="8" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_OR.v2u16" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="swz0" start="6" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="8" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_OR.v4s8" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - <derived start="6" size="1"> - <eq left="cmpf" right="#gt"/> - <eq left="cmpf" right="#ge"/> - </derived> - </ins> - - <ins name="+ICMP_OR.v4u8" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_AND.s32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_AND.u32" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_AND.v2s16" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="swz0" start="6" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="8" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_AND.v2u16" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="swz0" start="6" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="swz1" start="8" size="2" default="h01"> - <opt>h00</opt> - <opt>h10</opt> - <opt>h01</opt> - <opt>h11</opt> - </mod> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - - <ins name="+ICMP_AND.v4s8" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - <derived start="6" size="1"> - <eq left="cmpf" right="#gt"/> - <eq left="cmpf" right="#ge"/> - </derived> - </ins> - - <ins name="+ICMP_AND.v4u8" pseudo="true"> - <src start="0"/> - <src start="3"/> - <src start="6"/> - <mod name="result_type" start="10" size="1" default="i1"> - <opt>i1</opt> - <opt>m1</opt> - </mod> - <mod name="cmpf" size="2"> - <opt>eq</opt> - <opt>ne</opt> - <opt>gt</opt> - <opt>ge</opt> - <opt>lt</opt> - <opt>le</opt> - </mod> - </ins> - </bifrost> diff --git a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py index 4ce47fb05..903ef4e02 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py +++ b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py @@ -19,9 +19,7 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen", - "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", - "not_result", "skip", "round", "ftz"]) +SKIP = set(["lane", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", "not_result", "skip"]) TEMPLATE = """ #ifndef _BI_BUILDER_H_ @@ -30,11 +28,6 @@ TEMPLATE = """ #include "compiler.h" <% -# For <32-bit loads/stores, the default extend `none` with a natural sized -# input is not encodeable! To avoid a footgun, swap the default to `zext` which -# will work as expected -ZEXT_DEFAULT = set(["LOAD.i8", "LOAD.i16", "LOAD.i24", "STORE.i8", "STORE.i16", "STORE.i24"]) - def nirtypes(opcode): split = opcode.split('.', 1) if len(split) < 2: @@ -60,6 +53,19 @@ def nirtypes(opcode): else: return None +def typesize(opcode): + if opcode[-3:] == '128': + return 128 + if opcode[-2:] == '48': + return 48 + elif opcode[-1] == '8': + return 8 + else: + try: + return int(opcode[-2:]) + except: + return None + def condition(opcode, typecheck, sizecheck): cond = '' if typecheck == True: @@ -92,51 +98,27 @@ def to_suffix(op): static inline bi_instr * bi_${opcode.replace('.', '_').lower()}${to_suffix(ops[opcode])}(${signature(ops[opcode], modifiers)}) { -<% - op = ops[opcode] - nr_dests = "nr_dests" if op["variable_dests"] else op["dests"] - nr_srcs = "nr_srcs" if op["variable_srcs"] else src_count(op) -%> - size_t size = sizeof(bi_instr) + sizeof(bi_index) * (${nr_dests} + ${nr_srcs}); - bi_instr *I = (bi_instr *) rzalloc_size(b->shader, size); - + bi_instr *I = rzalloc(b->shader, bi_instr); I->op = BI_OPCODE_${opcode.replace('.', '_').upper()}; - I->nr_dests = ${nr_dests}; - I->nr_srcs = ${nr_srcs}; - I->dest = (bi_index *) (&I[1]); - I->src = I->dest + ${nr_dests}; - -% if not op["variable_dests"]: -% for dest in range(op["dests"]): +% for dest in range(ops[opcode]["dests"]): I->dest[${dest}] = dest${dest}; % endfor -%endif - -% if not op["variable_srcs"]: -% for src in range(src_count(op)): +% for src in range(src_count(ops[opcode])): I->src[${src}] = src${src}; % endfor -% endif - % for mod in ops[opcode]["modifiers"]: -% if not should_skip(mod, opcode): +% if mod[0:-1] not in SKIP and mod not in SKIP: I->${mod} = ${mod}; % endif % endfor -% if ops[opcode]["rtz"]: - I->round = BI_ROUND_RTZ; -% endif % for imm in ops[opcode]["immediates"]: I->${imm} = ${imm}; % endfor -% if opcode in ZEXT_DEFAULT: - I->extend = BI_EXTEND_ZEXT; -% endif bi_builder_insert(&b->cursor, I); return I; } -% if ops[opcode]["dests"] == 1 and not ops[opcode]["variable_dests"]: +% if ops[opcode]["dests"] == 1: static inline bi_index bi_${opcode.replace('.', '_').lower()}(${signature(ops[opcode], modifiers, no_dests=True)}) { @@ -193,26 +175,19 @@ modifier_lists = order_modifiers(ir_instructions) # Generate type signature for a builder routine -def should_skip(mod, op): - # FROUND and HADD only make sense in context of a round mode, so override - # the usual skip - if mod == "round" and ("FROUND" in op or "HADD" in op): - return False - +def should_skip(mod): return mod in SKIP or mod[0:-1] in SKIP def modifier_signature(op): - return sorted([m for m in op["modifiers"].keys() if not should_skip(m, op["key"])]) + return sorted([m for m in op["modifiers"].keys() if not should_skip(m)]) def signature(op, modifiers, typeful = False, sized = False, no_dests = False): return ", ".join( ["bi_builder *b"] + (["nir_alu_type type"] if typeful == True else []) + (["unsigned bitsize"] if sized == True else []) + - (["unsigned nr_dests"] if op["variable_dests"] else - ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])]) + - (["unsigned nr_srcs"] if op["variable_srcs"] else - ["bi_index src{}".format(i) for i in range(src_count(op))]) + + ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])] + + ["bi_index src{}".format(i) for i in range(src_count(op))] + ["{} {}".format( "bool" if len(modifiers[T[0:-1]] if T[-1] in "0123" else modifiers[T]) == 2 else "enum bi_" + T[0:-1] if T[-1] in "0123" else @@ -221,19 +196,11 @@ def signature(op, modifiers, typeful = False, sized = False, no_dests = False): ["uint32_t {}".format(imm) for imm in op["immediates"]]) def arguments(op, temp_dest = True): - dest_pattern = "bi_temp(b->shader)" if temp_dest else 'dest{}' - dests = [dest_pattern.format(i) for i in range(op["dests"])] - srcs = ["src{}".format(i) for i in range(src_count(op))] - - # Variable source/destinations just pass in the count - if op["variable_dests"]: - dests = ["nr_dests"] - - if op["variable_srcs"]: - srcs = ["nr_srcs"] - - return ", ".join(["b"] + dests + srcs + modifier_signature(op) + op["immediates"]) + return ", ".join( + ["b"] + + ["bi_temp(b->shader)" if temp_dest else 'dest{}'.format(i) for i in range(op["dests"])] + + ["src{}".format(i) for i in range(src_count(op))] + + modifier_signature(op) + + op["immediates"]) -print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers = - modifier_lists, signature = signature, arguments = arguments, src_count = - src_count, typesize = typesize, should_skip = should_skip)) +print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers = modifier_lists, signature = signature, arguments = arguments, src_count = src_count, SKIP = SKIP)) diff --git a/lib/mesa/src/panfrost/bifrost/bi_layout.c b/lib/mesa/src/panfrost/bifrost/bi_layout.c index 7c034cb31..db66ed04f 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_layout.c +++ b/lib/mesa/src/panfrost/bifrost/bi_layout.c @@ -32,6 +32,24 @@ * manipulating clause layouts. */ +/* Helper to see if a tuple can be inserted. We must satisfy the invariant: + * + * constant_count + tuple_count <= 13 + * + * ...which is equivalent to the clause ending up with 8 or fewer quardwords. + * Inserting a tuple increases tuple_count by one, and if it reads a unique + * constant, it increases constant_count by one. + */ + +bool +bi_can_insert_tuple(bi_clause *clause, bool constant) +{ + unsigned constant_count = clause->constant_count + (constant ? 1 : 0); + unsigned tuple_count = clause->tuple_count + 1; + + return (constant_count + tuple_count) <= 13; +} + /* Is embedded constant 0 packed for free in a clause with this many tuples? */ bool @@ -69,7 +87,7 @@ bi_ec0_packed(unsigned tuple_count) * constants are packed two-by-two as constant quadwords. */ -static unsigned +unsigned bi_clause_quadwords(bi_clause *clause) { unsigned X = clause->tuple_count; @@ -95,7 +113,7 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target) /* Determine if the block we're branching to is strictly greater in * source order */ - bool forwards = target->index > start->block->index; + bool forwards = target->base.name > start->block->base.name; if (forwards) { /* We have to jump through this block from the start of this @@ -106,7 +124,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target) /* We then need to jump through every clause of every following * block until the target */ - bi_foreach_block_from(ctx, start->block, blk) { + bi_foreach_block_from(ctx, start->block, _blk) { + bi_block *blk = (bi_block *) _blk; + /* Don't double-count the first block */ if (blk == start->block) continue; @@ -133,7 +153,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target) /* And jump back every clause of preceding blocks up through * and including the target to get to the beginning of the * target */ - bi_foreach_block_from_rev(ctx, start->block, blk) { + bi_foreach_block_from_rev(ctx, start->block, _blk) { + bi_block *blk = (bi_block *) _blk; + if (blk == start->block) continue; diff --git a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c index 883f53014..ed03d4c2c 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c +++ b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c @@ -30,51 +30,16 @@ * recombine swizzles where we can as an optimization. */ -static bool -bi_swizzle_replicates_8(enum bi_swizzle swz) -{ - switch (swz) { - case BI_SWIZZLE_B0000: - case BI_SWIZZLE_B1111: - case BI_SWIZZLE_B2222: - case BI_SWIZZLE_B3333: - return true; - default: - return false; - } -} - static void -lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src) +bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src) { /* TODO: Use the opcode table and be a lot more methodical about this... */ switch (ins->op) { - /* Some instructions used with 16-bit data never have swizzles */ case BI_OPCODE_CSEL_V2F16: case BI_OPCODE_CSEL_V2I16: case BI_OPCODE_CSEL_V2S16: case BI_OPCODE_CSEL_V2U16: - - /* Despite ostensibly being 32-bit instructions, CLPER does not - * inherently interpret the data, so it can be used for v2f16 - * derivatives, which might require swizzle lowering */ - case BI_OPCODE_CLPER_I32: - case BI_OPCODE_CLPER_OLD_I32: - - /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the - * boolean is implemented as a 16-bit integer, the swizzle is needed - * for correct operation if the instruction producing the 16-bit - * boolean does not replicate to both halves of the containing 32-bit - * register. As such, we may need to lower a swizzle. - * - * This is a silly hack. Ideally, code gen would be smart enough to - * avoid this case (by replicating). In practice, silly hardware design - * decisions force our hand here. - */ - case BI_OPCODE_MUX_I32: - case BI_OPCODE_CSEL_I32: break; - case BI_OPCODE_IADD_V2S16: case BI_OPCODE_IADD_V2U16: case BI_OPCODE_ISUB_V2S16: @@ -93,212 +58,28 @@ lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src) return; else break; - - /* For some reason MUX.v2i16 allows swaps but not replication */ - case BI_OPCODE_MUX_V2I16: - if (ins->src[src].swizzle == BI_SWIZZLE_H10) - return; - else - break; - - /* No swizzles supported */ - case BI_OPCODE_HADD_V4U8: - case BI_OPCODE_HADD_V4S8: - case BI_OPCODE_CLZ_V4U8: - case BI_OPCODE_IDP_V4I8: - case BI_OPCODE_IABS_V4S8: - case BI_OPCODE_ICMP_V4I8: - case BI_OPCODE_ICMP_V4U8: - case BI_OPCODE_MUX_V4I8: - case BI_OPCODE_IADD_IMM_V4I8: - break; - - case BI_OPCODE_LSHIFT_AND_V4I8: - case BI_OPCODE_LSHIFT_OR_V4I8: - case BI_OPCODE_LSHIFT_XOR_V4I8: - case BI_OPCODE_RSHIFT_AND_V4I8: - case BI_OPCODE_RSHIFT_OR_V4I8: - case BI_OPCODE_RSHIFT_XOR_V4I8: - /* Last source allows identity or replication */ - if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle)) - return; - - /* Others do not allow swizzles */ - break; - - /* We don't want to deal with reswizzling logic in modifier prop. Move - * the swizzle outside, it's easier for clamp propagation. */ - case BI_OPCODE_FCLAMP_V2F16: - { - bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - bi_index dest = ins->dest[0]; - bi_index tmp = bi_temp(ctx); - - bi_index swizzled_src = bi_replace_index(ins->src[0], tmp); - ins->src[0].swizzle = BI_SWIZZLE_H01; - ins->dest[0] = tmp; - bi_swz_v2i16_to(&b, dest, swizzled_src); - return; - } - default: return; } - /* First, try to apply a given swizzle to a constant to clear the - * runtime swizzle. This is less heavy-handed than ignoring the - * swizzle for scalar destinations, since it maintains - * replication of the destination. - */ - if (ins->src[src].type == BI_INDEX_CONSTANT) { - ins->src[src].value = bi_apply_swizzle(ins->src[src].value, - ins->src[src].swizzle); - ins->src[src].swizzle = BI_SWIZZLE_H01; + /* Identity is ok (TODO: what about replicate only?) */ + if (ins->src[src].swizzle == BI_SWIZZLE_H01) return; - } - - /* Even if the source does not replicate, if the consuming instruction - * produces a 16-bit scalar, we can ignore the other component. - */ - if (ins->dest[0].swizzle == BI_SWIZZLE_H00 && - ins->src[src].swizzle == BI_SWIZZLE_H00) - { - ins->src[src].swizzle = BI_SWIZZLE_H01; - return; - } /* Lower it away */ bi_builder b = bi_init_builder(ctx, bi_before_instr(ins)); - - bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8); - bi_index orig = ins->src[src]; - bi_index stripped = bi_replace_index(bi_null(), orig); - stripped.swizzle = ins->src[src].swizzle; - - bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped); - - bi_replace_src(ins, src, swz); + ins->src[src] = bi_replace_index(ins->src[src], + bi_swz_v2i16(&b, ins->src[src])); ins->src[src].swizzle = BI_SWIZZLE_H01; } -static bool -bi_swizzle_replicates_16(enum bi_swizzle swz) -{ - switch (swz) { - case BI_SWIZZLE_H00: - case BI_SWIZZLE_H11: - return true; - default: - /* If a swizzle replicates every 8-bits, it also replicates - * every 16-bits, so allow 8-bit replicating swizzles. - */ - return bi_swizzle_replicates_8(swz); - } -} - -static bool -bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16) -{ - switch (I->op) { - - /* Instructions that construct vectors have replicated output if their - * sources are identical. Check this case first. - */ - case BI_OPCODE_MKVEC_V2I16: - case BI_OPCODE_V2F16_TO_V2S16: - case BI_OPCODE_V2F16_TO_V2U16: - case BI_OPCODE_V2F32_TO_V2F16: - case BI_OPCODE_V2S16_TO_V2F16: - case BI_OPCODE_V2S8_TO_V2F16: - case BI_OPCODE_V2S8_TO_V2S16: - case BI_OPCODE_V2U16_TO_V2F16: - case BI_OPCODE_V2U8_TO_V2F16: - case BI_OPCODE_V2U8_TO_V2U16: - return bi_is_value_equiv(I->src[0], I->src[1]); - - /* 16-bit transcendentals are defined to output zero in their - * upper half, so they do not replicate - */ - case BI_OPCODE_FRCP_F16: - case BI_OPCODE_FRSQ_F16: - return false; - - /* Not sure, be conservative, we don't use these.. */ - case BI_OPCODE_VN_ASST1_F16: - case BI_OPCODE_FPCLASS_F16: - case BI_OPCODE_FPOW_SC_DET_F16: - return false; - - default: - break; - } - - /* Replication analysis only makes sense for ALU instructions */ - if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE) - return false; - - /* We only analyze 16-bit instructions for 16-bit replication. We could - * maybe do better. - */ - if (bi_opcode_props[I->op].size != BI_SIZE_16) - return false; - - bi_foreach_src(I, s) { - if (bi_is_null(I->src[s])) - continue; - - /* Replicated swizzles */ - if (bi_swizzle_replicates_16(I->src[s].swizzle)) - continue; - - /* Replicated values */ - if (bi_is_ssa(I->src[s]) && - BITSET_TEST(replicates_16, I->src[s].value)) - continue; - - /* Replicated constants */ - if (I->src[s].type == BI_INDEX_CONSTANT && - (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16)) - continue; - - return false; - } - - return true; -} - void bi_lower_swizzle(bi_context *ctx) { bi_foreach_instr_global_safe(ctx, ins) { bi_foreach_src(ins, s) { - if (bi_is_null(ins->src[s])) continue; - if (ins->src[s].swizzle == BI_SWIZZLE_H01) continue; - - lower_swizzle(ctx, ins, s); + if (!bi_is_null(ins->src[s])) + bi_lower_swizzle_16(ctx, ins, s); } } - - /* Now that we've lowered swizzles, clean up the mess */ - BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc); - - bi_foreach_instr_global(ctx, ins) { - if (ins->nr_dests && bi_instr_replicates(ins, replicates_16)) - BITSET_SET(replicates_16, ins->dest[0].value); - - if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) && - BITSET_TEST(replicates_16, ins->src[0].value)) { - ins->op = BI_OPCODE_MOV_I32; - ins->src[0].swizzle = BI_SWIZZLE_H01; - } - - /* The above passes rely on replicating destinations. For - * Valhall, we will want to optimize this. For now, default - * to Bifrost compatible behaviour. - */ - if (ins->nr_dests) - ins->dest[0].swizzle = BI_SWIZZLE_H01; - } - - free(replicates_16); } diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py index cbe0ae458..7ef88da8f 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py +++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py @@ -21,15 +21,11 @@ # IN THE SOFTWARE. TEMPLATE = """#include "bi_opcodes.h" -<% -def hasmod(mods, name): - return 1 if name in mods else 0 -%> + struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = { % for opcode in sorted(mnemonics): <% add = instructions["+" + opcode][0][1] if "+" + opcode in instructions else None - size = typesize(opcode) message = add["message"].upper() if add else "NONE" sr_count = add["staging_count"].upper() if add else "0" sr_read = int(add["staging"] in ["r", "rw"] if add else False) @@ -39,18 +35,10 @@ struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = { branch = int(opcode.startswith('BRANCH')) has_fma = int("*" + opcode in instructions) has_add = int("+" + opcode in instructions) - mods = ops[opcode]['modifiers'] - clamp = hasmod(mods, 'clamp') - not_result = hasmod(mods, 'not_result') - abs = hasmod(mods, 'abs0') | (hasmod(mods, 'abs1') << 1) | (hasmod(mods, 'abs2') << 2) - neg = hasmod(mods, 'neg0') | (hasmod(mods, 'neg1') << 1) | (hasmod(mods, 'neg2') << 2) - m_not = hasmod(mods, 'not1') %> [BI_OPCODE_${opcode.replace('.', '_').upper()}] = { - "${opcode}", BIFROST_MESSAGE_${message}, BI_SIZE_${size}, - BI_SR_COUNT_${sr_count}, ${sr_read}, ${sr_write}, ${last}, ${branch}, - ${table}, ${has_fma}, ${has_add}, ${clamp}, ${not_result}, ${abs}, - ${neg}, ${m_not}, + "${opcode}", BIFROST_MESSAGE_${message}, BI_SR_COUNT_${sr_count}, + ${sr_read}, ${sr_write}, ${last}, ${branch}, ${table}, ${has_fma}, ${has_add}, }, % endfor };""" @@ -63,4 +51,4 @@ instructions = parse_instructions(sys.argv[1], include_pseudo = True) ir_instructions = partition_mnemonics(instructions) mnemonics = set(x[1:] for x in instructions.keys()) -print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions, typesize = typesize)) +print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions)) diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py index 3b8ff0b33..b807513e1 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py +++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py @@ -64,23 +64,11 @@ enum bi_sr_count { BI_SR_COUNT_SR_COUNT = 7 }; -enum bi_size { - BI_SIZE_8 = 0, - BI_SIZE_16, - BI_SIZE_24, - BI_SIZE_32, - BI_SIZE_48, - BI_SIZE_64, - BI_SIZE_96, - BI_SIZE_128, -}; - /* Description of an opcode in the IR */ struct bi_op_props { const char *name; enum bifrost_message_type message : 4; - enum bi_size size : 3; enum bi_sr_count sr_count : 3; bool sr_read : 1; bool sr_write : 1; @@ -89,13 +77,6 @@ struct bi_op_props { bool table : 1; bool fma : 1; bool add : 1; - - /* Supported propagable modifiers */ - bool clamp : 1; - bool not_result : 1; - unsigned abs : 3; - unsigned neg : 3; - bool not_mod : 1; }; /* Generated in bi_opcodes.c.py */ diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c index 13b9b0d2b..06b0e41e8 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c +++ b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c @@ -23,89 +23,54 @@ */ #include "compiler.h" -#include "bi_builder.h" -/* SSA copy propagation */ +/* A simple scalar-only SSA-based copy-propagation pass. TODO: vectors */ static bool -bi_reads_fau(bi_instr *ins) +bi_is_copy(bi_instr *ins) { - bi_foreach_src(ins, s) { - if (ins->src[s].type == BI_INDEX_FAU) - return true; - } + return (ins->op == BI_OPCODE_MOV_I32) && bi_is_ssa(ins->dest[0]) + && (bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU); +} - return false; +static inline unsigned +bi_word_node(bi_index idx) +{ + assert(idx.type == BI_INDEX_NORMAL && !idx.reg); + return (idx.value << 2) | idx.offset; } void bi_opt_copy_prop(bi_context *ctx) { - /* Chase SPLIT of COLLECT. Instruction selection usually avoids this - * pattern (due to the split cache), but it is inevitably generated by - * the UBO pushing pass. - */ - bi_instr **collects = calloc(sizeof(bi_instr *), ctx->ssa_alloc); - bi_foreach_instr_global_safe(ctx, I) { - if (I->op == BI_OPCODE_COLLECT_I32) { - /* Rewrite trivial collects while we're at it */ - if (I->nr_srcs == 1) - I->op = BI_OPCODE_MOV_I32; - - collects[I->dest[0].value] = I; - } else if (I->op == BI_OPCODE_SPLIT_I32) { - /* Rewrite trivial splits while we're at it */ - if (I->nr_dests == 1) - I->op = BI_OPCODE_MOV_I32; - - bi_instr *collect = collects[I->src[0].value]; - if (!collect) - continue; - - /* Lower the split to moves, copyprop cleans up */ - bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - - bi_foreach_dest(I, d) - bi_mov_i32_to(&b, I->dest[d], collect->src[d]); - - bi_remove_instruction(I); - } - } - - free(collects); - - bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc); + bi_index *replacement = calloc(sizeof(bi_index), ((ctx->ssa_alloc + 1) << 2)); bi_foreach_instr_global_safe(ctx, ins) { - if (ins->op == BI_OPCODE_MOV_I32 && ins->src[0].type != BI_INDEX_REGISTER) { + if (bi_is_copy(ins)) { bi_index replace = ins->src[0]; /* Peek through one layer so copyprop converges in one * iteration for chained moves */ if (bi_is_ssa(replace)) { - bi_index chained = replacement[replace.value]; + bi_index chained = replacement[bi_word_node(replace)]; if (!bi_is_null(chained)) replace = chained; } - assert(ins->nr_dests == 1); - replacement[ins->dest[0].value] = replace; + replacement[bi_word_node(ins->dest[0])] = replace; } bi_foreach_src(ins, s) { bi_index use = ins->src[s]; - if (use.type != BI_INDEX_NORMAL) continue; - if (bi_is_staging_src(ins, s)) continue; - - bi_index repl = replacement[use.value]; + if (use.type != BI_INDEX_NORMAL || use.reg) continue; + if (bi_count_read_registers(ins, s) != 1) continue; - if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(ins)) - continue; + bi_index repl = replacement[bi_word_node(use)]; if (!bi_is_null(repl)) - bi_replace_src(ins, s, repl); + ins->src[s] = bi_replace_index(ins->src[s], repl); } } diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c index 5a37bf3a9..8debdd486 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -30,16 +30,10 @@ * structure returned back to the command stream. */ static bool -bi_is_ubo(bi_instr *ins) -{ - return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) && - (ins->seg == BI_SEG_UBO); -} - -static bool bi_is_direct_aligned_ubo(bi_instr *ins) { - return bi_is_ubo(ins) && + return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) && + (ins->seg == BI_SEG_UBO) && (ins->src[0].type == BI_INDEX_CONSTANT) && (ins->src[1].type == BI_INDEX_CONSTANT) && ((ins->src[0].value & 0x3) == 0); @@ -79,12 +73,8 @@ bi_analyze_ranges(bi_context *ctx) assert(ubo < res.nr_blocks); assert(channels > 0 && channels <= 4); - if (word >= MAX_UBO_WORDS) continue; - - /* Must use max if the same base is read with different channel - * counts, which is possible with nir_opt_shrink_vectors */ - uint8_t *range = res.blocks[ubo].range; - range[word] = MAX2(range[word], channels); + if (word < MAX_UBO_WORDS) + res.blocks[ubo].range[word] = channels; } return res; @@ -128,51 +118,42 @@ bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis) void bi_opt_push_ubo(bi_context *ctx) { - struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); - bi_pick_ubo(ctx->info.push, &analysis); + if (ctx->inputs->no_ubo_to_push) + return; - ctx->ubo_mask = 0; + /* This pass only runs once */ + assert(ctx->info->push.count == 0); + + struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); + bi_pick_ubo(&ctx->info->push, &analysis); bi_foreach_instr_global_safe(ctx, ins) { - if (!bi_is_ubo(ins)) continue; + if (!bi_is_direct_aligned_ubo(ins)) continue; unsigned ubo = ins->src[1].value; unsigned offset = ins->src[0].value; - if (!bi_is_direct_aligned_ubo(ins)) { - /* The load can't be pushed, so this UBO needs to be - * uploaded conventionally */ - if (ins->src[1].type == BI_INDEX_CONSTANT) - ctx->ubo_mask |= BITSET_BIT(ubo); - else - ctx->ubo_mask = ~0; - - continue; - } - /* Check if we decided to push this */ assert(ubo < analysis.nr_blocks); - if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) { - ctx->ubo_mask |= BITSET_BIT(ubo); - continue; - } + if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) continue; /* Replace the UBO load with moves from FAU */ bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - unsigned nr = bi_opcode_props[ins->op].sr_count; - bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr); + unsigned channels = bi_opcode_props[ins->op].sr_count; - bi_foreach_src(vec, w) { + for (unsigned w = 0; w < channels; ++w) { /* FAU is grouped in pairs (2 x 4-byte) */ unsigned base = - pan_lookup_pushed_ubo(ctx->info.push, ubo, + pan_lookup_pushed_ubo(&ctx->info->push, ubo, (offset + 4 * w)); unsigned fau_idx = (base >> 1); unsigned fau_hi = (base & 1); - vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi); + bi_mov_i32_to(&b, + bi_word(ins->dest[0], w), + bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi)); } bi_remove_instruction(ins); @@ -180,169 +161,3 @@ bi_opt_push_ubo(bi_context *ctx) free(analysis.blocks); } - -typedef struct { - BITSET_DECLARE(row, PAN_MAX_PUSH); -} adjacency_row; - -/* Find the connected component containing `node` with depth-first search */ -static void -bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited, - unsigned *component, unsigned *size, unsigned node) -{ - unsigned neighbour; - - BITSET_SET(visited, node); - component[(*size)++] = node; - - BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) { - if (!BITSET_TEST(visited, neighbour)) { - bi_find_component(adjacency, visited, component, size, - neighbour); - } - } -} - -static bool -bi_is_uniform(bi_index idx) -{ - return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM); -} - -/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */ -static unsigned -bi_uniform_word(bi_index idx) -{ - assert(bi_is_uniform(idx)); - assert(idx.offset <= 1); - - return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset; -} - -/* - * Create an undirected graph where nodes are 32-bit uniform indices and edges - * represent that two nodes are used in the same instruction. - * - * The graph is constructed as an adjacency matrix stored in adjacency. - */ -static void -bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency) -{ - bi_foreach_instr_global(ctx, I) { - unsigned nodes[BI_MAX_SRCS] = {}; - unsigned node_count = 0; - - /* Set nodes[] to 32-bit uniforms accessed */ - bi_foreach_src(I, s) { - if (bi_is_uniform(I->src[s])) { - unsigned word = bi_uniform_word(I->src[s]); - - if (word >= ctx->info.push_offset) - nodes[node_count++] = word; - } - } - - /* Create clique connecting nodes[] */ - for (unsigned i = 0; i < node_count; ++i) { - for (unsigned j = 0; j < node_count; ++j) { - if (i == j) - continue; - - unsigned x = nodes[i], y = nodes[j]; - assert(MAX2(x, y) < ctx->info.push->count); - - /* Add undirected edge between the nodes */ - BITSET_SET(adjacency[x].row, y); - BITSET_SET(adjacency[y].row, x); - } - } - } -} - -/* - * Optimization pass to reorder uniforms. The goal is to reduce the number of - * moves we emit when lowering FAU. The pass groups uniforms used by the same - * instruction. - * - * The pass works by creating a graph of pushed uniforms, where edges denote the - * "both 32-bit uniforms required by the same instruction" relationship. We - * perform depth-first search on this graph to find the connected components, - * where each connected component is a cluster of uniforms that are used - * together. We then select pairs of uniforms from each connected component. - * The remaining unpaired uniforms (from components of odd sizes) are paired - * together arbitrarily. - * - * After a new ordering is selected, pushed uniforms in the program and the - * panfrost_ubo_push data structure must be remapped to use the new ordering. - */ -void -bi_opt_reorder_push(bi_context *ctx) -{ - adjacency_row adjacency[PAN_MAX_PUSH] = { 0 }; - BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 }; - - unsigned ordering[PAN_MAX_PUSH] = { 0 }; - unsigned unpaired[PAN_MAX_PUSH] = { 0 }; - unsigned pushed = 0, unpaired_count = 0; - - struct panfrost_ubo_push *push = ctx->info.push; - unsigned push_offset = ctx->info.push_offset; - - bi_create_fau_interference_graph(ctx, adjacency); - - for (unsigned i = push_offset; i < push->count; ++i) { - if (BITSET_TEST(visited, i)) continue; - - unsigned component[PAN_MAX_PUSH] = { 0 }; - unsigned size = 0; - bi_find_component(adjacency, visited, component, &size, i); - - /* If there is an odd number of uses, at least one use must be - * unpaired. Arbitrarily take the last one. - */ - if (size % 2) - unpaired[unpaired_count++] = component[--size]; - - /* The rest of uses are paired */ - assert((size % 2) == 0); - - /* Push the paired uses */ - memcpy(ordering + pushed, component, sizeof(unsigned) * size); - pushed += size; - } - - /* Push unpaired nodes at the end */ - memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count); - pushed += unpaired_count; - - /* Ordering is a permutation. Invert it for O(1) lookup. */ - unsigned old_to_new[PAN_MAX_PUSH] = { 0 }; - - for (unsigned i = 0; i < push_offset; ++i) { - old_to_new[i] = i; - } - - for (unsigned i = 0; i < pushed; ++i) { - assert(ordering[i] >= push_offset); - old_to_new[ordering[i]] = push_offset + i; - } - - /* Use new ordering throughout the program */ - bi_foreach_instr_global(ctx, I) { - bi_foreach_src(I, s) { - if (bi_is_uniform(I->src[s])) { - unsigned node = bi_uniform_word(I->src[s]); - unsigned new_node = old_to_new[node]; - I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1); - I->src[s].offset = new_node & 1; - } - } - } - - /* Use new ordering for push */ - struct panfrost_ubo_push old = *push; - for (unsigned i = 0; i < pushed; ++i) - push->words[push_offset + i] = old.words[ordering[i]]; - - push->count = push_offset + pushed; -} diff --git a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py index 601750e2a..28669ebfa 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py +++ b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py @@ -24,14 +24,9 @@ import sys from bifrost_isa import * from mako.template import Template -# Consider pseudo instructions when getting the modifier list -instructions_with_pseudo = parse_instructions(sys.argv[1], include_pseudo = True) -ir_instructions_with_pseudo = partition_mnemonics(instructions_with_pseudo) -modifier_lists = order_modifiers(ir_instructions_with_pseudo) - -# ...but strip for packing instructions = parse_instructions(sys.argv[1]) ir_instructions = partition_mnemonics(instructions) +modifier_lists = order_modifiers(ir_instructions) # Packs sources into an argument. Offset argument to work around a quirk of our # compiler IR when dealing with staging registers (TODO: reorder in the IR to @@ -112,9 +107,6 @@ def pack_modifier(mod, width, default, opts, body, pack_exprs): # Construct a list lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS] ir_value = "src[{}].swizzle".format(arg) - elif raw == "lane_dest": - lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS] - ir_value = "dest->swizzle" elif raw in ["abs", "sign"]: ir_value = "src[{}].abs".format(arg) elif raw in ["neg", "not"]: @@ -315,7 +307,7 @@ bi_pack_${'fma' if unit == '*' else 'add'}(bi_instr *I, enum bifrost_packed_src src3) { if (!I) - return bi_pack_${opname_to_c(unit + 'NOP')}(I, src0, src1, src2, src3); + return bi_pack_${opname_to_c(unit + 'NOP.i32')}(I, src0, src1, src2, src3); % if unit == '*': assert((1 << src0) & 0xfb); diff --git a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py index 04a9c0095..5692633b4 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py +++ b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py @@ -55,7 +55,6 @@ bir_fau_name(unsigned fau_idx) "blend_descriptor_2", "blend_descriptor_3", "blend_descriptor_4", "blend_descriptor_5", "blend_descriptor_6", "blend_descriptor_7", - "tls_ptr", "wls_ptr", "program_counter", }; assert(fau_idx < ARRAY_SIZE(names)); @@ -76,9 +75,6 @@ bir_passthrough_name(unsigned idx) static void bi_print_index(FILE *fp, bi_index index) { - if (index.discard) - fputs("^", fp); - if (bi_is_null(index)) fprintf(fp, "_"); else if (index.type == BI_INDEX_CONSTANT) @@ -90,6 +86,8 @@ bi_print_index(FILE *fp, bi_index index) else if (index.type == BI_INDEX_PASS) fprintf(fp, "%s", bir_passthrough_name(index.value)); else if (index.type == BI_INDEX_REGISTER) + fprintf(fp, "br%u", index.value); + else if (index.type == BI_INDEX_NORMAL && index.reg) fprintf(fp, "r%u", index.value); else if (index.type == BI_INDEX_NORMAL) fprintf(fp, "%u", index.value); @@ -111,7 +109,7 @@ bi_print_index(FILE *fp, bi_index index) % for mod in sorted(modifiers): % if len(modifiers[mod]) > 2: # otherwise just boolean -UNUSED static inline const char * +static inline const char * bi_${mod}_as_str(enum bi_${mod} ${mod}) { switch (${mod}) { @@ -131,13 +129,11 @@ bi_${mod}_as_str(enum bi_${mod} ${mod}) <%def name="print_modifiers(mods, table)"> % for mod in mods: - % if mod not in ["lane_dest"]: % if len(table[mod]) > 2: fputs(bi_${mod}_as_str(I->${mod}), fp); % else: if (I->${mod}) fputs(".${mod}", fp); % endif - % endif % endfor </%def> @@ -156,37 +152,19 @@ bi_${mod}_as_str(enum bi_${mod} ${mod}) </%def> void -bi_print_instr(const bi_instr *I, FILE *fp) +bi_print_instr(bi_instr *I, FILE *fp) { - fputs(" ", fp); - bi_foreach_dest(I, d) { + if (bi_is_null(I->dest[d])) break; if (d > 0) fprintf(fp, ", "); bi_print_index(fp, I->dest[d]); } - if (I->nr_dests > 0) - fputs(" = ", fp); - - fprintf(fp, "%s", bi_opcode_props[I->op].name); + fprintf(fp, " = %s", bi_opcode_props[I->op].name); if (I->table) - fprintf(fp, ".table%u", I->table); - - if (I->flow) - fprintf(fp, ".flow%u", I->flow); - - if (I->op == BI_OPCODE_COLLECT_I32 || I->op == BI_OPCODE_PHI) { - for (unsigned i = 0; i < I->nr_srcs; ++i) { - if (i > 0) - fputs(", ", fp); - else - fputs(" ", fp); - - bi_print_index(fp, I->src[i]); - } - } + fprintf(fp, ".%s", bi_table_as_str(I->table)); switch (I->op) { % for opcode in ops: @@ -214,7 +192,7 @@ bi_print_instr(const bi_instr *I, FILE *fp) } if (I->branch_target) - fprintf(fp, " -> block%u", I->branch_target->index); + fprintf(fp, " -> block%u", I->branch_target->base.name); fputs("\\n", fp); diff --git a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c index 04aa07b0c..05b731a53 100644 --- a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c +++ b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c @@ -38,7 +38,7 @@ * 3. The shader must wait on slot #6 before running BLEND, ATEST * 4. The shader must wait on slot #7 before running BLEND, ST_TILE * 5. ATEST, ZS_EMIT must be issued with slot #0 - * 6. BARRIER must be issued with slot #7 and wait on every active slot. + * 6. BARRIER must be issued with slot #7 * 7. Only slots #0 through #5 may be used for clauses not otherwise specified. * 8. If a clause writes to a read staging register of an unresolved * dependency, it must set a staging barrier. @@ -54,256 +54,57 @@ */ #define BI_NUM_GENERAL_SLOTS 6 -#define BI_NUM_SLOTS 8 -#define BI_NUM_REGISTERS 64 -#define BI_SLOT_SERIAL 0 /* arbitrary */ -/* - * Due to the crude scoreboarding we do, we need to serialize varying loads and - * memory access. Identify these instructions here. - */ -static bool -bi_should_serialize(bi_instr *I) -{ - /* For debug, serialize everything to disable scoreboard opts */ - if (bifrost_debug & BIFROST_DBG_NOSB) - return true; +/* A model for the state of the scoreboard */ - /* Although nominally on the attribute unit, image loads have the same - * coherency requirements as general memory loads. Serialize them for - * now until we can do something more clever. - */ - if (I->op == BI_OPCODE_LD_ATTR_TEX) - return true; - - switch (bi_opcode_props[I->op].message) { - case BIFROST_MESSAGE_VARYING: - case BIFROST_MESSAGE_LOAD: - case BIFROST_MESSAGE_STORE: - case BIFROST_MESSAGE_ATOMIC: - return true; - default: - return false; - } -} +struct bi_scoreboard_state { + /* TODO: what do we track here for a heuristic? */ +}; /* Given a scoreboard model, choose a slot for a clause wrapping a given * message passing instruction. No side effects. */ static unsigned -bi_choose_scoreboard_slot(bi_instr *message) +bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message) { + /* A clause that does not produce a message must use slot #0 */ + if (!message) + return 0; + + switch (message->op) { /* ATEST, ZS_EMIT must be issued with slot #0 */ - if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT) + case BI_OPCODE_ATEST: + case BI_OPCODE_ZS_EMIT: return 0; /* BARRIER must be issued with slot #7 */ - if (message->op == BI_OPCODE_BARRIER) + case BI_OPCODE_BARRIER: return 7; - /* For now, make serialization is easy */ - if (bi_should_serialize(message)) - return BI_SLOT_SERIAL; - - return 0; -} - -static uint64_t -bi_read_mask(bi_instr *I, bool staging_only) -{ - uint64_t mask = 0; - - if (staging_only && !bi_opcode_props[I->op].sr_read) - return mask; - - bi_foreach_src(I, s) { - if (I->src[s].type == BI_INDEX_REGISTER) { - unsigned reg = I->src[s].value; - unsigned count = bi_count_read_registers(I, s); - - mask |= (BITFIELD64_MASK(count) << reg); - } - - if (staging_only) - break; - } - - return mask; -} - -static uint64_t -bi_write_mask(bi_instr *I) -{ - uint64_t mask = 0; - - bi_foreach_dest(I, d) { - if (bi_is_null(I->dest[d])) continue; - - assert(I->dest[d].type == BI_INDEX_REGISTER); - - unsigned reg = I->dest[d].value; - unsigned count = bi_count_write_registers(I, d); - - mask |= (BITFIELD64_MASK(count) << reg); - } - - /* Instructions like AXCHG.i32 unconditionally both read and write - * staging registers. Even if we discard the result, the write still - * happens logically and needs to be included in our calculations. - * Obscurely, ATOM_CX is sr_write but can ignore the staging register in - * certain circumstances; this does not require consideration. - */ - if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs && - bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) { - - unsigned reg = I->src[0].value; - unsigned count = bi_count_write_registers(I, 0); - - mask |= (BITFIELD64_MASK(count) << reg); - } - - return mask; -} - -/* Update the scoreboard model to assign an instruction to a given slot */ - -static void -bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause) -{ - bi_instr *I = clause->message; - unsigned slot = clause->scoreboard_id; - - if (!I) - return; - - st->read[slot] |= bi_read_mask(I, true); - - if (bi_opcode_props[I->op].sr_write) - st->write[slot] |= bi_write_mask(I); -} - -/* Adds a dependency on each slot writing any specified register */ - -static void -bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask) -{ - for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) { - if (!(st->write[slot] & regmask)) - continue; - - st->write[slot] = 0; - st->read[slot] = 0; - - clause->dependencies |= BITFIELD_BIT(slot); - } -} - -static void -bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask) -{ - for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) { - if (!(st->read[slot] & regmask)) - continue; - - st->read[slot] = 0; - clause->staging_barrier = true; - } -} - -/* Sets the dependencies for a given clause, updating the model */ - -static void -bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st) -{ - bi_foreach_instr_in_clause(block, clause, I) { - uint64_t read = bi_read_mask(I, false); - uint64_t written = bi_write_mask(I); - - /* Read-after-write; write-after-write */ - bi_depend_on_writers(clause, st, read | written); - - /* Write-after-read */ - bi_set_staging_barrier(clause, st, written); - } - - /* LD_VAR instructions must be serialized per-quad. Just always depend - * on any LD_VAR instructions. This isn't optimal, but doing better - * requires divergence-aware data flow analysis. - * - * Similarly, memory loads/stores need to be synchronized. For now, - * force them to be serialized. This is not optimal. - */ - if (clause->message && bi_should_serialize(clause->message)) - clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL); - - /* Barriers must wait on all slots to flush existing work. It might be - * possible to skip this with more information about the barrier. For - * now, be conservative. - */ - if (clause->message && clause->message->op == BI_OPCODE_BARRIER) - clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS); -} - -static bool -scoreboard_block_update(bi_block *blk) -{ - bool progress = false; - - /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */ - bi_foreach_predecessor(blk, pred) { - for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) { - blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i]; - blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i]; - } - } - - struct bi_scoreboard_state state = blk->scoreboard_in; - - /* Assign locally */ - - bi_foreach_clause_in_block(blk, clause) { - bi_set_dependencies(blk, clause, &state); - bi_push_clause(&state, clause); + default: + break; } - /* To figure out progress, diff scoreboard_out */ - - for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) - progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state)); - - blk->scoreboard_out = state; - - return progress; + /* TODO: Use a heuristic */ + return 0; } void bi_assign_scoreboard(bi_context *ctx) { - u_worklist worklist; - bi_worklist_init(ctx, &worklist); - - /* First, assign slots. */ - bi_foreach_block(ctx, block) { - bi_foreach_clause_in_block(block, clause) { - if (clause->message) { - unsigned slot = bi_choose_scoreboard_slot(clause->message); - clause->scoreboard_id = slot; - } - } + struct bi_scoreboard_state st = {}; - bi_worklist_push_tail(&worklist, block); - } + /* Assign slots */ + bi_foreach_block(ctx, _block) { + bi_block *block = (bi_block *) _block; - /* Next, perform forward data flow analysis to calculate dependencies */ - while (!u_worklist_is_empty(&worklist)) { - /* Pop from the front for forward analysis */ - bi_block *blk = bi_worklist_pop_head(&worklist); + bi_foreach_clause_in_block(block, clause) { + unsigned slot = bi_choose_scoreboard_slot(&st, clause->message); + clause->scoreboard_id = slot; - if (scoreboard_block_update(blk)) { - bi_foreach_successor(blk, succ) - bi_worklist_push_tail(&worklist, succ); + bi_clause *next = bi_next_clause(ctx, _block, clause); + if (next) + next->dependencies |= (1 << slot); } } - - u_worklist_fini(&worklist); } diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py index 7152509bc..ae97795f3 100644 --- a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py +++ b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py @@ -132,8 +132,6 @@ def parse_instruction(ins, include_pseudo): 'staging': ins.attrib.get('staging', '').split('=')[0], 'staging_count': ins.attrib.get('staging', '=0').split('=')[1], 'dests': int(ins.attrib.get('dests', '1')), - 'variable_dests': ins.attrib.get('variable_dests', False), - 'variable_srcs': ins.attrib.get('variable_srcs', False), 'unused': ins.attrib.get('unused', False), 'pseudo': ins.attrib.get('pseudo', False), 'message': ins.attrib.get('message', 'none'), @@ -145,9 +143,6 @@ def parse_instruction(ins, include_pseudo): common['exact'] = parse_exact(ins) for src in ins.findall('src'): - if src.attrib.get('pseudo', False) and not include_pseudo: - continue - mask = int(src.attrib['mask'], 0) if ('mask' in src.attrib) else 0xFF common['srcs'].append([int(src.attrib['start'], 0), mask]) @@ -245,28 +240,18 @@ def simplify_to_ir(ins): 'staging': ins['staging'], 'srcs': len(ins['srcs']), 'dests': ins['dests'], - 'variable_dests': ins['variable_dests'], - 'variable_srcs': ins['variable_srcs'], 'modifiers': [[m[0][0], m[2]] for m in ins['modifiers']], 'immediates': [m[0] for m in ins['immediates']] } -# Converstions to integers default to rounding-to-zero -# All other opcodes default to rounding to nearest even -def default_round_to_zero(name): - # 8-bit int to float is exact - subs = ['_TO_U', '_TO_S', '_TO_V2U', '_TO_V2S', '_TO_V4U', '_TO_V4S'] - return any([x in name for x in subs]) -def combine_ir_variants(instructions, key): - seen = [op for op in instructions.keys() if op[1:] == key] - variant_objs = [[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in seen] - variants = sum(variant_objs, []) +def combine_ir_variants(instructions, v): + variants = sum([[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in v], []) # Accumulate modifiers across variants modifiers = {} - for s in variants[0:]: + for s in variants: # Check consistency assert(s['srcs'] == variants[0]['srcs']) assert(s['dests'] == variants[0]['dests']) @@ -282,27 +267,19 @@ def combine_ir_variants(instructions, key): # Great, we've checked srcs/immediates are consistent and we've summed over # modifiers return { - 'key': key, 'srcs': variants[0]['srcs'], 'dests': variants[0]['dests'], - 'variable_dests': variants[0]['variable_dests'], - 'variable_srcs': variants[0]['variable_srcs'], 'staging': variants[0]['staging'], 'immediates': sorted(variants[0]['immediates']), - 'modifiers': modifiers, - 'v': len(variants), - 'ir': variants, - 'rtz': default_round_to_zero(key) + 'modifiers': { k: modifiers[k] for k in modifiers } } # Partition instructions to mnemonics, considering units and variants # equivalent. def partition_mnemonics(instructions): - key_func = lambda x: x[1:] - sorted_instrs = sorted(instructions.keys(), key = key_func) - partitions = itertools.groupby(sorted_instrs, key_func) - return { k: combine_ir_variants(instructions, k) for k, v in partitions } + partitions = itertools.groupby(instructions, lambda x: x[1:]) + return { k: combine_ir_variants(instructions, v) for (k, v) in partitions } # Generate modifier lists, by accumulating all the possible modifiers, and # deduplicating thus assigning canonical enum values. We don't try _too_ hard @@ -351,17 +328,3 @@ def order_modifiers(ir_instructions): def src_count(op): staging = 1 if (op["staging"] in ["r", "rw"]) else 0 return op["srcs"] + staging - -# Parses out the size part of an opocde name -def typesize(opcode): - if opcode[-3:] == '128': - return 128 - if opcode[-2:] == '48': - return 48 - elif opcode[-1] == '8': - return 8 - else: - try: - return int(opcode[-2:]) - except: - return 32 diff --git a/lib/mesa/src/panfrost/bifrost/gen_disasm.py b/lib/mesa/src/panfrost/bifrost/gen_disasm.py index 505c61cc0..11acf5ae9 100644 --- a/lib/mesa/src/panfrost/bifrost/gen_disasm.py +++ b/lib/mesa/src/panfrost/bifrost/gen_disasm.py @@ -238,7 +238,7 @@ def build_lut(mnemonic, desc, test): key_set = find_context_keys(desc, test) ordered = 'ordering' in key_set key_set.discard('ordering') - keys = sorted(list(key_set)) + keys = list(key_set) # Evaluate the deriveds for every possible state, forming a (state -> deriveds) map testf = compile_derived(test, keys) @@ -326,7 +326,7 @@ def disasm_op(name, op): for i, (pos, mask) in enumerate(srcs): body += ' fputs(", ", fp);\n' - body += ' dump_src(fp, _BITS(bits, {}, 3), *srcs, branch_offset, consts, {});\n'.format(pos, "true" if is_fma else "false") + body += ' dump_src(fp, _BITS(bits, {}, 3), *srcs, consts, {});\n'.format(pos, "true" if is_fma else "false") # Error check if needed if (mask != 0xFF): diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c index 3fa1f5485..2886d3d91 100644 --- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c +++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c @@ -30,6 +30,7 @@ #include "pan_indirect_draw.h" #include "pan_pool.h" #include "pan_util.h" +#include "panfrost-quirks.h" #include "compiler/nir/nir_builder.h" #include "util/u_memory.h" #include "util/macros.h" @@ -54,7 +55,6 @@ struct draw_data { nir_ssa_def *index_buf; nir_ssa_def *restart_index; nir_ssa_def *vertex_count; - nir_ssa_def *start_instance; nir_ssa_def *instance_count; nir_ssa_def *vertex_start; nir_ssa_def *index_bias; @@ -72,9 +72,6 @@ struct jobs_data { nir_ssa_def *vertex_job; nir_ssa_def *tiler_job; nir_ssa_def *base_vertex_offset; - nir_ssa_def *first_vertex_sysval; - nir_ssa_def *base_vertex_sysval; - nir_ssa_def *base_instance_sysval; nir_ssa_def *offset_start; nir_ssa_def *invocation; }; @@ -111,13 +108,6 @@ struct indirect_draw_info { uint32_t count; uint32_t instance_count; uint32_t start; - uint32_t start_instance; -}; - -struct indirect_indexed_draw_info { - uint32_t count; - uint32_t instance_count; - uint32_t start; int32_t index_bias; uint32_t start_instance; }; @@ -142,7 +132,7 @@ struct indirect_draw_context { mali_ptr varying_mem; }; -/* Indirect draw shader inputs. Those are stored in FAU. */ +/* Indirect draw shader inputs. Those are stored in a UBO. */ struct indirect_draw_inputs { /* indirect_draw_context pointer */ @@ -160,11 +150,6 @@ struct indirect_draw_inputs { /* index buffer */ mali_ptr index_buf; - /* {base,first}_{vertex,instance} sysvals */ - mali_ptr first_vertex_sysval; - mali_ptr base_vertex_sysval; - mali_ptr base_instance_sysval; - /* Pointers to various cmdstream structs that need to be patched */ mali_ptr vertex_job; mali_ptr tiler_job; @@ -175,13 +160,26 @@ struct indirect_draw_inputs { uint32_t draw_buf_stride; uint32_t restart_index; uint32_t attrib_count; -} PACKED; +}; + +static nir_ssa_def * +get_input_data(nir_builder *b, unsigned offset, unsigned size) +{ + assert(!(offset & 0x3)); + assert(size && !(size & 0x3)); + + return nir_load_ubo(b, 1, size, + nir_imm_int(b, 0), + nir_imm_int(b, offset), + .align_mul = 4, + .align_offset = 0, + .range_base = 0, + .range = ~0); +} #define get_input_field(b, name) \ - nir_load_push_constant(b, \ - 1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \ - nir_imm_int(b, 0), \ - .base = offsetof(struct indirect_draw_inputs, name)) + get_input_data(b, offsetof(struct indirect_draw_inputs, name), \ + sizeof(((struct indirect_draw_inputs *)0)->name) * 8) static nir_ssa_def * get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset) @@ -282,12 +280,6 @@ update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val) offsetof(struct indirect_draw_info, field)), \ 1, sizeof(((struct indirect_draw_info *)0)->field) * 8) -#define get_indexed_draw_field(b, draw_ptr, field) \ - load_global(b, \ - get_address_imm(b, draw_ptr, \ - offsetof(struct indirect_indexed_draw_info, field)), \ - 1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8) - static void extract_inputs(struct indirect_draw_shader_builder *builder) { @@ -309,9 +301,6 @@ extract_inputs(struct indirect_draw_shader_builder *builder) if (builder->index_min_max_search) return; - builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval); - builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval); - builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval); builder->jobs.vertex_job = get_input_field(b, vertex_job); builder->jobs.tiler_job = get_input_field(b, tiler_job); builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs); @@ -342,49 +331,29 @@ init_shader_builder(struct indirect_draw_shader_builder *builder, if (index_min_max_search) { builder->b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, - GENX(pan_shader_get_compiler_options)(), + pan_shader_get_compiler_options(dev), "indirect_draw_min_max_index(index_size=%d)", builder->index_size); } else { builder->b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, - GENX(pan_shader_get_compiler_options)(), - "indirect_draw(index_size=%d%s%s%s%s)", + pan_shader_get_compiler_options(dev), + "indirect_draw(index_size=%d%s%s%s)", builder->index_size, flags & PAN_INDIRECT_DRAW_HAS_PSIZ ? ",psiz" : "", flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ? ",primitive_restart" : "", flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ? - ",update_primitive_size" : "", - flags & PAN_INDIRECT_DRAW_IDVS ? - ",idvs" : ""); + ",update_primitive_size" : ""); } - extract_inputs(builder); -} - -static void -update_dcd(struct indirect_draw_shader_builder *builder, - nir_ssa_def *job_ptr, - unsigned draw_offset) -{ nir_builder *b = &builder->b; - nir_ssa_def *draw_w01 = - load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32); - nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0); + nir_variable_create(b->shader, nir_var_mem_ubo, + glsl_uint_type(), "inputs"); + b->shader->info.num_ubos++; - /* Update DRAW.{instance_size,offset_start} */ - nir_ssa_def *instance_size = - nir_bcsel(b, - nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)), - nir_imm_int(b, 0), builder->instance_size.packed); - draw_w01 = nir_vec2(b, - nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff), - nir_ishl(b, instance_size, nir_imm_int(b, 16))), - builder->jobs.offset_start); - store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), - draw_w01, 2); + extract_inputs(builder); } static void @@ -402,9 +371,17 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type unsigned draw_offset = type == MALI_JOB_TYPE_VERTEX ? pan_section_offset(COMPUTE_JOB, DRAW) : - pan_section_offset(TILER_JOB, DRAW); - unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE); - unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE); + pan_is_bifrost(builder->dev) ? + pan_section_offset(BIFROST_TILER_JOB, DRAW) : + pan_section_offset(MIDGARD_TILER_JOB, DRAW); + unsigned prim_offset = + pan_is_bifrost(builder->dev) ? + pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE) : + pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE); + unsigned psiz_offset = + pan_is_bifrost(builder->dev) ? + pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE_SIZE) : + pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE_SIZE); unsigned index_size = builder->index_size; if (type == MALI_JOB_TYPE_TILER) { @@ -440,14 +417,21 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type builder->varyings.pos_ptr, 2); } - update_dcd(builder, job_ptr, draw_offset); - - if (builder->flags & PAN_INDIRECT_DRAW_IDVS) { - assert(type == MALI_JOB_TYPE_TILER); + nir_ssa_def *draw_w01 = + load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32); + nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0); - update_dcd(builder, job_ptr, - pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW)); - } + /* Update DRAW.{instance_size,offset_start} */ + nir_ssa_def *instance_size = + nir_bcsel(b, + nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2)), + nir_imm_int(b, 0), builder->instance_size.packed); + draw_w01 = nir_vec2(b, + nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff), + nir_ishl(b, instance_size, nir_imm_int(b, 16))), + builder->jobs.offset_start); + store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), + draw_w01, 2); } static void @@ -463,7 +447,7 @@ split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d) half_div64); nir_ssa_def *fi = nir_idiv(b, f0, div64); nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64)); - nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff), + nir_ssa_def *e = nir_bcsel(b, nir_ilt(b, half_div64, ff), nir_imm_int(b, 1 << 5), nir_imm_int(b, 0)); *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31)); *r_e = nir_ior(b, r, e); @@ -504,68 +488,33 @@ update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder, } static void -zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder, - nir_ssa_def *attrib_buf_ptr) -{ - /* Stride is an unadorned 32-bit uint at word 2 */ - nir_builder *b = &builder->b; - store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), - nir_imm_int(b, 0), 1); -} - -static void adjust_attrib_offset(struct indirect_draw_shader_builder *builder, - nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr, - nir_ssa_def *instance_div) + nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr) { nir_builder *b = &builder->b; nir_ssa_def *zero = nir_imm_int(b, 0); nir_ssa_def *two = nir_imm_int(b, 2); nir_ssa_def *sub_cur_offset = nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero), - nir_uge(b, builder->draw.instance_count, two)); - - nir_ssa_def *add_base_inst_offset = - nir_iand(b, nir_ine(b, builder->draw.start_instance, zero), - nir_ine(b, instance_div, zero)); - - IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) { - nir_ssa_def *offset = - load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32); - nir_ssa_def *stride = - load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32); + nir_ige(b, builder->draw.instance_count, two)); + IF (sub_cur_offset) { /* Per-instance data needs to be offset in response to a * delayed start in an indexed draw. */ + nir_ssa_def *stride = + load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32); + nir_ssa_def *offset = + load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32); - IF (add_base_inst_offset) { - offset = nir_iadd(b, offset, - nir_idiv(b, - nir_imul(b, stride, - builder->draw.start_instance), - instance_div)); - } ENDIF - - IF (sub_cur_offset) { - offset = nir_isub(b, offset, - nir_imul(b, stride, - builder->jobs.offset_start)); - } ENDIF - + offset = nir_isub(b, offset, + nir_imul(b, stride, + builder->jobs.offset_start)); store_global(b, get_address_imm(b, attrib_ptr, WORD(1)), offset, 1); } ENDIF } -/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */ - -static nir_ssa_def * -nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x) -{ - return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2)); -} - /* Based on panfrost_emit_vertex_data() */ static void @@ -576,78 +525,78 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder) nir_local_variable_create(b->impl, glsl_uint_type(), "attrib_idx"); nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1); - -#if PAN_ARCH <= 5 nir_ssa_def *single_instance = - nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)); -#endif + nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2)); LOOP { nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var); - IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count)) + IF (nir_ige(b, attrib_idx, builder->attribs.attrib_count)) BREAK; ENDIF nir_ssa_def *attrib_buf_ptr = get_address(b, builder->attribs.attrib_bufs, nir_imul_imm(b, attrib_idx, - 2 * pan_size(ATTRIBUTE_BUFFER))); + 2 * MALI_ATTRIBUTE_BUFFER_LENGTH)); nir_ssa_def *attrib_ptr = get_address(b, builder->attribs.attribs, nir_imul_imm(b, attrib_idx, - pan_size(ATTRIBUTE))); + MALI_ATTRIBUTE_LENGTH)); nir_ssa_def *r_e, *d; -#if PAN_ARCH <= 5 - IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) { - nir_ssa_def *r_p = - nir_bcsel(b, single_instance, - nir_imm_int(b, 0x9f), - builder->instance_size.packed); + if (!pan_is_bifrost(builder->dev)) { + IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) { + nir_ssa_def *r_p = + nir_bcsel(b, single_instance, + nir_imm_int(b, 0x9f), + builder->instance_size.packed); - store_global(b, - get_address_imm(b, attrib_buf_ptr, WORD(4)), - nir_ishl(b, r_p, nir_imm_int(b, 24)), 1); + store_global(b, + get_address_imm(b, attrib_buf_ptr, WORD(4)), + nir_ishl(b, r_p, nir_imm_int(b, 24)), 1); - nir_store_var(b, attrib_idx_var, - nir_iadd_imm(b, attrib_idx, 1), 1); - CONTINUE; - } ENDIF + nir_store_var(b, attrib_idx_var, + nir_iadd_imm(b, attrib_idx, 1), 1); + CONTINUE; + } ENDIF - IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) { - split_div(b, builder->instance_size.padded, - &r_e, &d); - nir_ssa_def *default_div = - nir_ior(b, single_instance, - nir_ult(b, - builder->instance_size.padded, - nir_imm_int(b, 2))); - r_e = nir_bcsel(b, default_div, - nir_imm_int(b, 0x3f), r_e); - d = nir_bcsel(b, default_div, - nir_imm_int(b, (1u << 31) - 1), d); - store_global(b, - get_address_imm(b, attrib_buf_ptr, WORD(1)), - nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d), - 2); - nir_store_var(b, attrib_idx_var, - nir_iadd_imm(b, attrib_idx, 1), 1); - CONTINUE; - } ENDIF -#endif + IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) { + split_div(b, builder->instance_size.padded, + &r_e, &d); + nir_ssa_def *default_div = + nir_ior(b, single_instance, + nir_ilt(b, + builder->instance_size.padded, + nir_imm_int(b, 2))); + r_e = nir_bcsel(b, default_div, + nir_imm_int(b, 0x3f), r_e); + d = nir_bcsel(b, default_div, + nir_imm_int(b, (1u << 31) - 1), d); + store_global(b, + get_address_imm(b, attrib_buf_ptr, WORD(1)), + nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d), + 2); + nir_store_var(b, attrib_idx_var, + nir_iadd_imm(b, attrib_idx, 1), 1); + CONTINUE; + } ENDIF + } - nir_ssa_def *instance_div = + nir_ssa_def *div = load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32); - nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded); + div = nir_imul(b, div, builder->instance_size.padded); nir_ssa_def *multi_instance = - nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2)); + nir_ige(b, builder->draw.instance_count, nir_imm_int(b, 2)); IF (nir_ine(b, div, nir_imm_int(b, 0))) { IF (multi_instance) { - IF (nir_is_power_of_two_or_zero(b, div)) { + nir_ssa_def *div_pow2 = + nir_ilt(b, nir_bit_count(b, div), nir_imm_int(b, 2)); + + IF (div_pow2) { nir_ssa_def *exp = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0)); @@ -662,16 +611,26 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder) } ENDIF } ELSE { /* Single instance with a non-0 divisor: all - * accesses should point to attribute 0 */ - zero_attrib_buf_stride(builder, attrib_buf_ptr); + * accesses should point to attribute 0, pick + * the biggest pot divisor. + */ + update_vertex_attrib_buf(builder, attrib_buf_ptr, + MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR, + nir_imm_int(b, 31), NULL); } ENDIF - adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div); - } ELSE IF (multi_instance) { - update_vertex_attrib_buf(builder, attrib_buf_ptr, - MALI_ATTRIBUTE_TYPE_1D_MODULUS, - builder->instance_size.packed, NULL); - } ENDIF ENDIF + adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr); + } ELSE { + IF (multi_instance) { + update_vertex_attrib_buf(builder, attrib_buf_ptr, + MALI_ATTRIBUTE_TYPE_1D_MODULUS, + builder->instance_size.packed, NULL); + } ELSE { + update_vertex_attrib_buf(builder, attrib_buf_ptr, + MALI_ATTRIBUTE_TYPE_1D, + nir_imm_int(b, 0), NULL); + } ENDIF + } ENDIF nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1); } @@ -716,19 +675,19 @@ update_varyings(struct indirect_draw_shader_builder *builder) nir_ssa_def *buf_ptr = get_address_imm(b, builder->varyings.varying_bufs, PAN_VARY_GENERAL * - pan_size(ATTRIBUTE_BUFFER)); + MALI_ATTRIBUTE_BUFFER_LENGTH); update_varying_buf(builder, buf_ptr, vertex_count); buf_ptr = get_address_imm(b, builder->varyings.varying_bufs, PAN_VARY_POSITION * - pan_size(ATTRIBUTE_BUFFER)); + MALI_ATTRIBUTE_BUFFER_LENGTH); builder->varyings.pos_ptr = update_varying_buf(builder, buf_ptr, vertex_count); if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) { buf_ptr = get_address_imm(b, builder->varyings.varying_bufs, PAN_VARY_PSIZ * - pan_size(ATTRIBUTE_BUFFER)); + MALI_ATTRIBUTE_BUFFER_LENGTH); builder->varyings.psiz_ptr = update_varying_buf(builder, buf_ptr, vertex_count); } @@ -761,14 +720,6 @@ get_invocation(struct indirect_draw_shader_builder *builder) nir_imm_int(b, 2 << 28))); } -static nir_ssa_def * -nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot) -{ - assert(pot != 0 && util_is_power_of_two_or_zero(pot)); - - return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1)); -} - /* Based on panfrost_padded_vertex_count() */ static nir_ssa_def * @@ -789,7 +740,7 @@ get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed) nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero); exp = nir_iadd(b, exp, rshift); base = nir_ushr(b, base, rshift); - base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero)); + base = nir_iadd(b, base, nir_bcsel(b, nir_ige(b, base, eleven), one, zero)); rshift = nir_imax(b, nir_find_lsb(b, base), zero); exp = nir_iadd(b, exp, rshift); base = nir_ushr(b, base, rshift); @@ -803,28 +754,10 @@ static void update_jobs(struct indirect_draw_shader_builder *builder) { get_invocation(builder); - - if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS)) - update_job(builder, MALI_JOB_TYPE_VERTEX); - + update_job(builder, MALI_JOB_TYPE_VERTEX); update_job(builder, MALI_JOB_TYPE_TILER); } - -static void -set_null_job(struct indirect_draw_shader_builder *builder, - nir_ssa_def *job_ptr) -{ - nir_builder *b = &builder->b; - nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4)); - nir_ssa_def *val = load_global(b, w4, 1, 32); - - /* Set job type to NULL (AKA NOOP) */ - val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01), - nir_imm_int(b, MALI_JOB_TYPE_NULL << 1)); - store_global(b, w4, val, 1); -} - static void get_instance_size(struct indirect_draw_shader_builder *builder) { @@ -877,8 +810,8 @@ get_instance_size(struct indirect_draw_shader_builder *builder) for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) { nir_ssa_def *oob = nir_ior(b, - nir_ult(b, nir_imm_int(b, i), offset), - nir_uge(b, nir_imm_int(b, i), end)); + nir_ilt(b, nir_imm_int(b, i), offset), + nir_ige(b, nir_imm_int(b, i), end)); nir_ssa_def *data = nir_iand_imm(b, val, mask); min = nir_umin(b, min, @@ -903,7 +836,7 @@ get_instance_size(struct indirect_draw_shader_builder *builder) nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32); for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) { - nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining); + nir_ssa_def *oob = nir_ige(b, nir_imm_int(b, i), remaining); nir_ssa_def *data = nir_iand_imm(b, val, mask); min = nir_umin(b, min, @@ -936,68 +869,25 @@ patch(struct indirect_draw_shader_builder *builder) nir_ssa_def *draw_ptr = builder->draw.draw_buf; + builder->draw.vertex_count = get_draw_field(b, draw_ptr, count); + assert(builder->draw.vertex_count->num_components); + builder->draw.instance_count = + get_draw_field(b, draw_ptr, instance_count); + builder->draw.vertex_start = get_draw_field(b, draw_ptr, start); if (index_size) { - builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count); - builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance); - builder->draw.instance_count = - get_indexed_draw_field(b, draw_ptr, instance_count); - builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start); - builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias); - } else { - builder->draw.vertex_count = get_draw_field(b, draw_ptr, count); - builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance); - builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count); - builder->draw.vertex_start = get_draw_field(b, draw_ptr, start); + builder->draw.index_bias = + get_draw_field(b, draw_ptr, index_bias); } - assert(builder->draw.vertex_count->num_components); - - nir_ssa_def *num_vertices = - nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count); + get_instance_size(builder); - IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) { - /* If there's nothing to draw, turn the vertex/tiler jobs into - * null jobs. - */ - if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS)) - set_null_job(builder, builder->jobs.vertex_job); + builder->instance_size.padded = + get_padded_count(b, builder->instance_size.raw, + &builder->instance_size.packed); - set_null_job(builder, builder->jobs.tiler_job); - } ELSE { - get_instance_size(builder); - - nir_ssa_def *count = builder->instance_size.raw; - - /* IDVS requires padding to a multiple of 4 */ - if (builder->flags & PAN_INDIRECT_DRAW_IDVS) - count = nir_align_pot(b, count, 4); - - builder->instance_size.padded = - get_padded_count(b, count, - &builder->instance_size.packed); - - update_varyings(builder); - update_jobs(builder); - update_vertex_attribs(builder); - - IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) { - store_global(b, builder->jobs.first_vertex_sysval, - builder->jobs.offset_start, 1); - } ENDIF - - IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) { - store_global(b, builder->jobs.base_vertex_sysval, - index_size ? - builder->draw.index_bias : - nir_imm_int(b, 0), - 1); - } ENDIF - - IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) { - store_global(b, builder->jobs.base_instance_sysval, - builder->draw.start_instance, 1); - } ENDIF - } ENDIF + update_varyings(builder); + update_jobs(builder); + update_vertex_attribs(builder); } /* Search the min/max index in the range covered by the indirect draw call */ @@ -1046,7 +936,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder) LOOP { nir_ssa_def *offset = nir_load_var(b, offset_var); - IF (nir_uge(b, offset, end)) + IF (nir_ige(b, offset, end)) BREAK; ENDIF @@ -1076,7 +966,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder) nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1); } - IF (nir_ult(b, start, end)) + IF (nir_ilt(b, start, end)) update_min(builder, nir_load_var(b, min_var)); update_max(builder, nir_load_var(b, max_var)); ENDIF @@ -1093,9 +983,7 @@ get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search) return flags; } - return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ? - PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART : - PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) + + return PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX + util_logbase2(index_size); } @@ -1115,46 +1003,42 @@ create_indirect_draw_shader(struct panfrost_device *dev, else patch(&builder); - struct panfrost_compile_inputs inputs = { - .gpu_id = dev->gpu_id, - .fixed_sysval_ubo = -1, - .no_ubo_to_push = true, - }; + struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id }; struct pan_shader_info shader_info; struct util_dynarray binary; util_dynarray_init(&binary, NULL); - GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info); + pan_shader_compile(dev, b->shader, &inputs, &binary, &shader_info); assert(!shader_info.tls_size); assert(!shader_info.wls_size); assert(!shader_info.sysvals.sysval_count); - shader_info.push.count = - DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4); - unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search); struct pan_indirect_draw_shader *draw_shader = &dev->indirect_draw_shaders.shaders[shader_id]; void *state = dev->indirect_draw_shaders.states->ptr.cpu + - (shader_id * pan_size(RENDERER_STATE)); + (shader_id * MALI_RENDERER_STATE_LENGTH); pthread_mutex_lock(&dev->indirect_draw_shaders.lock); if (!draw_shader->rsd) { mali_ptr address = - pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool, - binary.data, binary.size, - PAN_ARCH >= 6 ? 128 : 64); + panfrost_pool_upload_aligned(&dev->indirect_draw_shaders.bin_pool, + binary.data, binary.size, + pan_is_bifrost(dev) ? 128 : 64); + if (!pan_is_bifrost(dev)) + address |= shader_info.midgard.first_tag; util_dynarray_fini(&binary); pan_pack(state, RENDERER_STATE, cfg) { - pan_shader_prepare_rsd(&shader_info, address, &cfg); + pan_shader_prepare_rsd(dev, &shader_info, address, &cfg); } + pthread_mutex_unlock(&dev->indirect_draw_shaders.lock); draw_shader->push = shader_info.push; draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu + - (shader_id * pan_size(RENDERER_STATE)); + (shader_id * MALI_RENDERER_STATE_LENGTH); } pthread_mutex_unlock(&dev->indirect_draw_shaders.lock); @@ -1182,7 +1066,46 @@ static mali_ptr get_tls(const struct panfrost_device *dev) { return dev->indirect_draw_shaders.states->ptr.gpu + - (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE)); + (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH); +} + +static mali_ptr +get_ubos(struct pan_pool *pool, + const struct indirect_draw_inputs *inputs) +{ + struct panfrost_ptr inputs_buf = + panfrost_pool_alloc_aligned(pool, sizeof(inputs), 16); + + memcpy(inputs_buf.cpu, &inputs, sizeof(inputs)); + + struct panfrost_ptr ubos_buf = + panfrost_pool_alloc_desc(pool, UNIFORM_BUFFER); + + pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) { + cfg.entries = DIV_ROUND_UP(sizeof(inputs), 16); + cfg.pointer = inputs_buf.gpu; + } + + return ubos_buf.gpu; +} + +static mali_ptr +get_push_uniforms(struct pan_pool *pool, + const struct pan_indirect_draw_shader *shader, + const struct indirect_draw_inputs *inputs) +{ + if (!shader->push.count) + return 0; + + struct panfrost_ptr push_consts_buf = + panfrost_pool_alloc_aligned(pool, shader->push.count * 4, 16); + uint32_t *out = push_consts_buf.cpu; + uint8_t *in = (uint8_t *)inputs; + + for (unsigned i = 0; i < shader->push.count; ++i) + memcpy(out + i, in + shader->push.words[i].offset, 4); + + return push_consts_buf.gpu; } static void @@ -1193,15 +1116,15 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev) goto out; unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS * - pan_size(RENDERER_STATE)) + - pan_size(LOCAL_STORAGE); + MALI_RENDERER_STATE_LENGTH) + + MALI_LOCAL_STORAGE_LENGTH; dev->indirect_draw_shaders.states = - panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states"); + panfrost_bo_create(dev, state_bo_size, 0); /* Prepare the thread storage descriptor now since it's invariant. */ void *tsd = dev->indirect_draw_shaders.states->ptr.cpu + - (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE)); + (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH); pan_pack(tsd, LOCAL_STORAGE, ls) { ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; }; @@ -1215,8 +1138,7 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev) */ dev->indirect_draw_shaders.varying_heap = panfrost_bo_create(dev, 512 * 1024 * 1024, - PAN_BO_INVISIBLE | PAN_BO_GROWABLE, - "Indirect draw varying heap"); + PAN_BO_INVISIBLE | PAN_BO_GROWABLE); out: pthread_mutex_unlock(&dev->indirect_draw_shaders.lock); @@ -1227,7 +1149,8 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool, struct pan_scoreboard *scoreboard, const struct pan_indirect_draw_info *draw_info, const struct indirect_draw_inputs *inputs, - struct indirect_draw_context *draw_ctx) + struct indirect_draw_context *draw_ctx, + mali_ptr ubos) { struct panfrost_device *dev = pool->dev; unsigned index_size = draw_info->index_size; @@ -1238,34 +1161,42 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool, mali_ptr rsd = get_renderer_state(dev, draw_info->flags, draw_info->index_size, true); + unsigned shader_id = + get_shader_id(draw_info->flags, draw_info->index_size, true); + const struct pan_indirect_draw_shader *shader = + &dev->indirect_draw_shaders.shaders[shader_id]; struct panfrost_ptr job = - pan_pool_alloc_desc(pool, COMPUTE_JOB); + panfrost_pool_alloc_desc(pool, COMPUTE_JOB); void *invocation = pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); panfrost_pack_work_groups_compute(invocation, 1, 1, 1, MIN_MAX_JOBS, 1, 1, - false, false); + false); pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { cfg.job_task_split = 7; } pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.draw_descriptor_is_64b = true; + cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev); cfg.state = rsd; cfg.thread_storage = get_tls(pool->dev); - cfg.push_uniforms = - pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16); + cfg.uniform_buffers = ubos; + cfg.push_uniforms = get_push_uniforms(pool, shader, inputs); } + pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg); + return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, &job, false); } unsigned -GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - const struct pan_indirect_draw_info *draw_info, - struct panfrost_ptr *ctx) +panfrost_emit_indirect_draw(struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + const struct pan_indirect_draw_info *draw_info, + struct panfrost_ptr *ctx) { struct panfrost_device *dev = pool->dev; @@ -1277,7 +1208,7 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, panfrost_indirect_draw_alloc_deps(dev); struct panfrost_ptr job = - pan_pool_alloc_desc(pool, COMPUTE_JOB); + panfrost_pool_alloc_desc(pool, COMPUTE_JOB); mali_ptr rsd = get_renderer_state(dev, draw_info->flags, draw_info->index_size, false); @@ -1288,18 +1219,15 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, struct panfrost_ptr draw_ctx_ptr = *ctx; if (!draw_ctx_ptr.cpu) { - draw_ctx_ptr = pan_pool_alloc_aligned(pool, - sizeof(draw_ctx), - sizeof(mali_ptr)); + draw_ctx_ptr = panfrost_pool_alloc_aligned(pool, + sizeof(draw_ctx), + sizeof(mali_ptr)); } struct indirect_draw_inputs inputs = { .draw_ctx = draw_ctx_ptr.gpu, .draw_buf = draw_info->draw_buf, .index_buf = draw_info->index_buf, - .first_vertex_sysval = draw_info->first_vertex_sysval, - .base_vertex_sysval = draw_info->base_vertex_sysval, - .base_instance_sysval = draw_info->base_instance_sysval, .vertex_job = draw_info->vertex_job, .tiler_job = draw_info->tiler_job, .attrib_bufs = draw_info->attrib_bufs, @@ -1312,9 +1240,9 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, inputs.restart_index = draw_info->restart_index; struct panfrost_ptr min_max_ctx_ptr = - pan_pool_alloc_aligned(pool, - sizeof(struct min_max_context), - 4); + panfrost_pool_alloc_aligned(pool, + sizeof(struct min_max_context), + 4); struct min_max_context *ctx = min_max_ctx_ptr.cpu; ctx->min = UINT32_MAX; @@ -1322,27 +1250,37 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, inputs.min_max_ctx = min_max_ctx_ptr.gpu; } + unsigned shader_id = + get_shader_id(draw_info->flags, draw_info->index_size, false); + const struct pan_indirect_draw_shader *shader = + &dev->indirect_draw_shaders.shaders[shader_id]; + mali_ptr ubos = get_ubos(pool, &inputs); + void *invocation = pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); panfrost_pack_work_groups_compute(invocation, 1, 1, 1, 1, 1, 1, - false, false); + false); pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { cfg.job_task_split = 2; } pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.draw_descriptor_is_64b = true; + cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev); cfg.state = rsd; cfg.thread_storage = get_tls(pool->dev); - cfg.push_uniforms = - pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16); + cfg.uniform_buffers = ubos; + cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs); } + pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg); + unsigned global_dep = draw_info->last_indirect_draw; unsigned local_dep = panfrost_emit_index_min_max_search(pool, scoreboard, draw_info, - &inputs, &draw_ctx); + &inputs, &draw_ctx, ubos); if (!ctx->cpu) { *ctx = draw_ctx_ptr; @@ -1355,19 +1293,20 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, } void -GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev, - struct pan_pool *bin_pool) +panfrost_init_indirect_draw_shaders(struct panfrost_device *dev) { /* We allocate the states and varying_heap BO lazily to avoid * reserving memory when indirect draws are not used. */ pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL); - dev->indirect_draw_shaders.bin_pool = bin_pool; + panfrost_pool_init(&dev->indirect_draw_shaders.bin_pool, NULL, dev, + PAN_BO_EXECUTE, false); } void -GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev) +panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev) { + panfrost_pool_cleanup(&dev->indirect_draw_shaders.bin_pool); panfrost_bo_unreference(dev->indirect_draw_shaders.states); panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap); pthread_mutex_destroy(&dev->indirect_draw_shaders.lock); diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h index 6a7737441..28bcd535d 100644 --- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h +++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h @@ -24,8 +24,6 @@ #ifndef __PAN_INDIRECT_DRAW_SHADERS_H__ #define __PAN_INDIRECT_DRAW_SHADERS_H__ -#include "genxml/gen_macros.h" - struct pan_device; struct pan_scoreboard; struct pan_pool; @@ -33,9 +31,6 @@ struct pan_pool; struct pan_indirect_draw_info { mali_ptr draw_buf; mali_ptr index_buf; - mali_ptr first_vertex_sysval; - mali_ptr base_vertex_sysval; - mali_ptr base_instance_sysval; mali_ptr vertex_job; mali_ptr tiler_job; mali_ptr attrib_bufs; @@ -49,16 +44,15 @@ struct pan_indirect_draw_info { }; unsigned -GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - const struct pan_indirect_draw_info *draw_info, - struct panfrost_ptr *ctx); +panfrost_emit_indirect_draw(struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + const struct pan_indirect_draw_info *draw_info, + struct panfrost_ptr *ctx); void -GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev, - struct pan_pool *bin_pool); +panfrost_init_indirect_draw_shaders(struct panfrost_device *dev); void -GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev); +panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev); #endif diff --git a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c index bef81028b..fa6f898e5 100644 --- a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c +++ b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c @@ -26,12 +26,9 @@ #include <stdio.h> #include <string.h> -#include "vk_format.h" -#include "vk_instance.h" -#include "vk_physical_device.h" #include "vk_util.h" -#include "wsi_common_entrypoints.h" #include "wsi_common_private.h" +#include "wsi_common_win32.h" #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wint-to-pointer-cast" // warning: cast to pointer from integer of different size @@ -70,37 +67,30 @@ struct wsi_win32_swapchain { struct wsi_win32_image images[0]; }; -VKAPI_ATTR VkBool32 VKAPI_CALL -wsi_GetPhysicalDeviceWin32PresentationSupportKHR(VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex) +VkBool32 +wsi_win32_get_presentation_support(struct wsi_device *wsi_device) { return TRUE; } -VKAPI_ATTR VkResult VKAPI_CALL -wsi_CreateWin32SurfaceKHR(VkInstance _instance, - const VkWin32SurfaceCreateInfoKHR *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSurfaceKHR *pSurface) +VkResult +wsi_create_win32_surface(VkInstance instance, + const VkAllocationCallbacks *allocator, + const VkWin32SurfaceCreateInfoKHR *create_info, + VkSurfaceKHR *surface_khr) { - VK_FROM_HANDLE(vk_instance, instance, _instance); - VkIcdSurfaceWin32 *surface; - - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR); - - surface = vk_zalloc2(&instance->alloc, pAllocator, sizeof(*surface), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + VkIcdSurfaceWin32 *surface = vk_zalloc(allocator, sizeof *surface, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (surface == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; surface->base.platform = VK_ICD_WSI_PLATFORM_WIN32; - surface->hinstance = pCreateInfo->hinstance; - surface->hwnd = pCreateInfo->hwnd; - - *pSurface = VkIcdSurfaceBase_to_handle(&surface->base); + surface->hinstance = create_info->hinstance; + surface->hwnd = create_info->hwnd; + *surface_khr = VkIcdSurfaceBase_to_handle(&surface->base); return VK_SUCCESS; } @@ -116,24 +106,15 @@ wsi_win32_surface_get_support(VkIcdSurfaceBase *surface, } static VkResult -wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf, +wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surface, struct wsi_device *wsi_device, VkSurfaceCapabilitiesKHR* caps) { - VkIcdSurfaceWin32 *surface = (VkIcdSurfaceWin32 *)surf; - - RECT win_rect; - if (!GetClientRect(surface->hwnd, &win_rect)) - return VK_ERROR_SURFACE_LOST_KHR; - caps->minImageCount = 1; /* There is no real maximum */ caps->maxImageCount = 0; - caps->currentExtent = (VkExtent2D) { - win_rect.right - win_rect.left, - win_rect.bottom - win_rect.top - }; + caps->currentExtent = (VkExtent2D) { UINT32_MAX, UINT32_MAX }; caps->minImageExtent = (VkExtent2D) { 1, 1 }; caps->maxImageExtent = (VkExtent2D) { wsi_device->maxImageDimension2D, @@ -153,8 +134,7 @@ wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT | - VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | - VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT; + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; return VK_SUCCESS; } @@ -228,7 +208,7 @@ wsi_win32_surface_get_formats(VkIcdSurfaceBase *icd_surface, for (unsigned i = 0; i < ARRAY_SIZE(sorted_formats); i++) { vk_outarray_append_typed(VkSurfaceFormatKHR, &out, f) { f->format = sorted_formats[i]; - f->colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR; + f->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR; } } @@ -251,7 +231,7 @@ wsi_win32_surface_get_formats2(VkIcdSurfaceBase *icd_surface, vk_outarray_append_typed(VkSurfaceFormat2KHR, &out, f) { assert(f->sType == VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR); f->surfaceFormat.format = sorted_formats[i]; - f->surfaceFormat.colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR; + f->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR; } } @@ -301,16 +281,155 @@ wsi_win32_surface_get_present_rectangles(VkIcdSurfaceBase *surface, return vk_outarray_status(&out); } +static uint32_t +select_memory_type(const struct wsi_device *wsi, + VkMemoryPropertyFlags props, + uint32_t type_bits) +{ + for (uint32_t i = 0; i < wsi->memory_props.memoryTypeCount; i++) { + const VkMemoryType type = wsi->memory_props.memoryTypes[i]; + if ((type_bits & (1 << i)) && (type.propertyFlags & props) == props) + return i; + } + + unreachable("No memory type found"); +} + +VkResult +wsi_create_native_image(const struct wsi_swapchain *chain, + const VkSwapchainCreateInfoKHR *pCreateInfo, + uint32_t num_modifier_lists, + const uint32_t *num_modifiers, + const uint64_t *const *modifiers, + struct wsi_image *image) +{ + const struct wsi_device *wsi = chain->wsi; + VkResult result; + + memset(image, 0, sizeof(*image)); + for (int i = 0; i < ARRAY_SIZE(image->fds); i++) + image->fds[i] = -1; + + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .flags = 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = pCreateInfo->imageFormat, + .extent = { + .width = pCreateInfo->imageExtent.width, + .height = pCreateInfo->imageExtent.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = pCreateInfo->imageUsage, + .sharingMode = pCreateInfo->imageSharingMode, + .queueFamilyIndexCount = pCreateInfo->queueFamilyIndexCount, + .pQueueFamilyIndices = pCreateInfo->pQueueFamilyIndices, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VkImageFormatListCreateInfoKHR image_format_list; + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR) { + image_info.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT | + VK_IMAGE_CREATE_EXTENDED_USAGE_BIT_KHR; + + const VkImageFormatListCreateInfoKHR *format_list = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_FORMAT_LIST_CREATE_INFO_KHR); + +#ifndef NDEBUG + assume(format_list && format_list->viewFormatCount > 0); + bool format_found = false; + for (int i = 0; i < format_list->viewFormatCount; i++) + if (pCreateInfo->imageFormat == format_list->pViewFormats[i]) + format_found = true; + assert(format_found); +#endif + + image_format_list = *format_list; + image_format_list.pNext = NULL; + __vk_append_struct(&image_info, &image_format_list); + } + + + result = wsi->CreateImage(chain->device, &image_info, + &chain->alloc, &image->image); + if (result != VK_SUCCESS) + goto fail; + + VkMemoryRequirements reqs; + wsi->GetImageMemoryRequirements(chain->device, image->image, &reqs); + + const struct wsi_memory_allocate_info memory_wsi_info = { + .sType = VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA, + .pNext = NULL, + .implicit_sync = true, + }; + const VkExportMemoryAllocateInfo memory_export_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .pNext = &memory_wsi_info, + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + }; + const VkMemoryDedicatedAllocateInfo memory_dedicated_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .pNext = &memory_export_info, + .image = image->image, + .buffer = VK_NULL_HANDLE, + }; + const VkMemoryAllocateInfo memory_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = &memory_dedicated_info, + .allocationSize = reqs.size, + .memoryTypeIndex = select_memory_type(wsi, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + reqs.memoryTypeBits), + }; + result = wsi->AllocateMemory(chain->device, &memory_info, + &chain->alloc, &image->memory); + if (result != VK_SUCCESS) + goto fail; + + result = wsi->BindImageMemory(chain->device, image->image, + image->memory, 0); + if (result != VK_SUCCESS) + goto fail; + + const VkImageSubresource image_subresource = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .arrayLayer = 0, + }; + VkSubresourceLayout image_layout; + wsi->GetImageSubresourceLayout(chain->device, image->image, + &image_subresource, &image_layout); + + image->num_planes = 1; + image->sizes[0] = reqs.size; + image->row_pitches[0] = image_layout.rowPitch; + image->offsets[0] = 0; + + return VK_SUCCESS; + +fail: + wsi_destroy_image(chain, image); + + return result; +} + static VkResult wsi_win32_image_init(VkDevice device_h, - struct wsi_win32_swapchain *chain, - const VkSwapchainCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - struct wsi_win32_image *image) + struct wsi_swapchain *drv_chain, + const VkSwapchainCreateInfoKHR *create_info, + const VkAllocationCallbacks *allocator, + struct wsi_win32_image *image) { - assert(chain->base.use_buffer_blit); - VkResult result = wsi_create_image(&chain->base, &chain->base.image_info, - &image->base); + struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain; + + VkResult result = wsi_create_native_image(&chain->base, create_info, + 0, NULL, NULL, + &image->base); if (result != VK_SUCCESS) return result; @@ -345,10 +464,13 @@ wsi_win32_image_init(VkDevice device_h, } static void -wsi_win32_image_finish(struct wsi_win32_swapchain *chain, - const VkAllocationCallbacks *allocator, - struct wsi_win32_image *image) +wsi_win32_image_finish(struct wsi_swapchain *drv_chain, + const VkAllocationCallbacks *allocator, + struct wsi_win32_image *image) { + struct wsi_win32_swapchain *chain = + (struct wsi_win32_swapchain *) drv_chain; + DeleteDC(image->dc); if(image->bmp) DeleteObject(image->bmp); @@ -363,7 +485,7 @@ wsi_win32_swapchain_destroy(struct wsi_swapchain *drv_chain, (struct wsi_win32_swapchain *) drv_chain; for (uint32_t i = 0; i < chain->base.image_count; i++) - wsi_win32_image_finish(chain, allocator, &chain->images[i]); + wsi_win32_image_finish(drv_chain, allocator, &chain->images[i]); DeleteDC(chain->chain_dc); @@ -406,19 +528,30 @@ wsi_win32_queue_present(struct wsi_swapchain *drv_chain, struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain; assert(image_index < chain->base.image_count); struct wsi_win32_image *image = &chain->images[image_index]; + VkResult result; - assert(chain->base.use_buffer_blit); - - char *ptr = image->base.cpu_map; + char *ptr; char *dptr = image->ppvBits; + result = chain->base.wsi->MapMemory(chain->base.device, + image->base.memory, + 0, 0, 0, (void**)&ptr); for (unsigned h = 0; h < chain->extent.height; h++) { memcpy(dptr, ptr, chain->extent.width * 4); dptr += image->bmp_row_pitch; ptr += image->base.row_pitches[0]; } - if (!StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY)) - chain->status = VK_ERROR_MEMORY_MAP_FAILED; + if(StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY)) + result = VK_SUCCESS; + else + result = VK_ERROR_MEMORY_MAP_FAILED; + + chain->base.wsi->UnmapMemory(chain->base.device, image->base.memory); + if (result != VK_SUCCESS) + chain->status = result; + + if (result != VK_SUCCESS) + return result; return chain->status; } @@ -448,13 +581,8 @@ wsi_win32_surface_create_swapchain( if (chain == NULL) return VK_ERROR_OUT_OF_HOST_MEMORY; - struct wsi_cpu_image_params image_params = { - .base.image_type = WSI_IMAGE_TYPE_CPU, - }; - VkResult result = wsi_swapchain_init(wsi_device, &chain->base, device, - create_info, &image_params.base, - allocator); + create_info, allocator); if (result != VK_SUCCESS) { vk_free(allocator, chain); return result; @@ -473,20 +601,16 @@ wsi_win32_surface_create_swapchain( chain->surface = surface; - assert(wsi_device->sw); - chain->base.use_buffer_blit = true; - for (uint32_t image = 0; image < chain->base.image_count; image++) { - result = wsi_win32_image_init(device, chain, - create_info, allocator, - &chain->images[image]); + result = wsi_win32_image_init(device, &chain->base, + create_info, allocator, + &chain->images[image]); if (result != VK_SUCCESS) { while (image > 0) { --image; - wsi_win32_image_finish(chain, allocator, - &chain->images[image]); + wsi_win32_image_finish(&chain->base, allocator, + &chain->images[image]); } - wsi_swapchain_finish(&chain->base); vk_free(allocator, chain); goto fail_init_images; } |