summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/mesa/src/panfrost/bifrost/ISA.xml1193
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_builder.h.py93
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_layout.c30
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c233
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py20
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py19
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c71
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c223
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_packer.c.py12
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_printer.c.py38
-rw-r--r--lib/mesa/src/panfrost/bifrost/bi_scoreboard.c255
-rw-r--r--lib/mesa/src/panfrost/bifrost/bifrost_isa.py49
-rw-r--r--lib/mesa/src/panfrost/bifrost/gen_disasm.py4
-rw-r--r--lib/mesa/src/panfrost/lib/pan_indirect_draw.c565
-rw-r--r--lib/mesa/src/panfrost/lib/pan_indirect_draw.h18
-rw-r--r--lib/mesa/src/vulkan/wsi/wsi_common_win32.c262
16 files changed, 654 insertions, 2431 deletions
diff --git a/lib/mesa/src/panfrost/bifrost/ISA.xml b/lib/mesa/src/panfrost/bifrost/ISA.xml
index f1e908331..b5965fd3c 100644
--- a/lib/mesa/src/panfrost/bifrost/ISA.xml
+++ b/lib/mesa/src/panfrost/bifrost/ISA.xml
@@ -1986,7 +1986,7 @@
<src start="0" mask="0xfb"/>
</ins>
- <ins name="*NOP" mask="0x7fffff" exact="0x701963" dests="0"/>
+ <ins name="*NOP.i32" mask="0x7fffff" exact="0x701963"/>
<ins name="*POPCOUNT.i32" mask="0x7ffff8" exact="0x73c6d8">
<src start="0" mask="0xfb"/>
@@ -2036,7 +2036,6 @@
<opt>not</opt>
<opt>none</opt>
</mod>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
</ins>
<ins name="*RSHIFT_AND.v2i16">
@@ -2057,7 +2056,6 @@
<opt>not</opt>
<opt>none</opt>
</mod>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<encoding mask="0x7f3800" exact="0x300800">
<or>
<eq left="lanes2" right="#b00"/>
@@ -2091,7 +2089,6 @@
<src start="0" mask="0xfb"/>
<src start="3" mask="0xfb"/>
<src start="6"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<mod name="lanes2" size="3" default="b0123">
<opt>b0123</opt>
<opt>b0000</opt>
@@ -2147,7 +2144,6 @@
<opt>not</opt>
<opt>none</opt>
</mod>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<mod name="not_result" start="15" size="1" opt="not"/>
</ins>
@@ -2169,7 +2165,6 @@
<opt>none</opt>
</mod>
<mod name="not_result" start="15" size="1" opt="not"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<encoding mask="0x7f3800" exact="0x302800">
<or>
<eq left="lanes2" right="#b00"/>
@@ -2215,7 +2210,6 @@
<opt>none</opt>
</mod>
<mod name="not_result" start="15" size="1" opt="not"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<encoding mask="0x7f3800" exact="0x302000">
<neq left="lanes2" right="#b0123"/>
<derived start="9" size="2">
@@ -2241,7 +2235,6 @@
<opt>b3</opt>
</mod>
<mod name="not_result" start="13" size="1" opt="not"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
</ins>
<ins name="*RSHIFT_XOR.v2i16">
@@ -2258,7 +2251,6 @@
<opt>b02</opt>
</mod>
<mod name="not_result" start="13" size="1" opt="not"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<encoding mask="0x7fd800" exact="0x320800">
<or>
<eq left="lanes2" right="#b00"/>
@@ -2300,7 +2292,6 @@
<opt>b3333</opt>
</mod>
<mod name="not_result" start="13" size="1" opt="not"/>
- <mod name="arithmetic" opt="arithmetic" size="1" start="1" pseudo="true"/>
<encoding mask="0x7fd800" exact="0x320000">
<neq left="lanes2" right="#b0123"/>
<derived start="9" size="2">
@@ -2429,7 +2420,6 @@
<opt>rtz</opt>
<opt>rtna</opt>
</mod>
- <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
<derived start="6" size="1">
<and>
<eq left="abs0" right="#none"/>
@@ -2496,9 +2486,6 @@
<ins name="+ATEST" staging="w=1" mask="0xfff00" exact="0xc8f00" message="atest" table="true">
<src start="0" mask="0xf7"/>
<src start="3" mask="0xf7"/>
- <!-- ATEST parameter datum. Implicitly encoded into the tuple on Bifrost.
- Real source on Valhall. -->
- <src start="6" pseudo="true"/>
<mod name="widen1" start="6" size="2">
<reserved/>
<opt>none</opt>
@@ -2533,22 +2520,8 @@
<src start="0"/>
<src start="3" mask="0xf7"/>
<src start="6" mask="0xf7"/>
- <!-- pseudo source for a dual source blend input -->
- <src start="9" pseudo="true"/>
<!-- not actually encoded, but used for IR -->
<immediate name="sr_count" size="4" pseudo="true"/>
- <immediate name="sr_count_2" size="4" pseudo="true"/>
- <mod name="register_format" size="4" pseudo="true">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- <opt>f64</opt>
- <opt>i64</opt>
- <opt>auto</opt>
- </mod>
</ins>
<ins name="+BRANCH.f16" mask="0xf8000" exact="0x68000" last="true" dests="0">
@@ -3716,12 +3689,12 @@
<src start="6" mask="0xf7"/>
</ins>
- <ins name="+CLPER_OLD.i32" mask="0xfffc0" exact="0x3f0c0">
+ <ins name="+CLPER_V6.i32" mask="0xfffc0" exact="0x3f0c0">
<src start="0" mask="0x7"/>
<src start="3"/>
</ins>
- <ins name="+CLPER.i32" mask="0xfc000" exact="0x7c000">
+ <ins name="+CLPER_V7.i32" mask="0xfc000" exact="0x7c000">
<src start="0" mask="0x7"/>
<src start="3"/>
<mod name="lane_op" start="6" size="2">
@@ -3734,7 +3707,6 @@
<opt>subgroup2</opt>
<opt>subgroup4</opt>
<opt>subgroup8</opt>
- <opt pseudo="true">subgroup16</opt> <!-- Only on Valhall -->
</mod>
<mod name="inactive_result" start="10" size="4">
<opt>zero</opt>
@@ -3874,7 +3846,6 @@
<opt>h0</opt>
<opt>h1</opt>
</mod>
- <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
</ins>
<ins name="+F16_TO_S32">
@@ -6195,7 +6166,7 @@
<src start="6" mask="0xf7"/>
</ins>
- <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" message="job" dests="0">
+ <ins name="+KABOOM" mask="0xffff8" exact="0xd7858" unused="true" message="job_management">
<src start="0"/>
</ins>
@@ -6381,7 +6352,7 @@
</mod>
</ins>
- <ins name="+LD_TILE" staging="w=format" mask="0xff800" exact="0xcb000" message="tile">
+ <ins name="+LD_TILE" staging="w=vecsize" mask="0xff800" exact="0xcb000" message="tile">
<src start="0"/>
<src start="3"/>
<src start="6" mask="0xf7"/>
@@ -6391,15 +6362,9 @@
<opt>v3</opt>
<opt>v4</opt>
</mod>
- <mod name="register_format" size="3" pseudo="true">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>s32</opt>
- </mod>
</ins>
- <ins name="+LD_VAR" staging="w=format" message="varying">
+ <ins name="+LD_VAR" staging="w=vecsize" message="varying">
<src start="0"/>
<src start="3"/>
<mod name="vecsize" start="8" size="2">
@@ -7105,7 +7070,6 @@
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i16" staging="w=1" message="load">
@@ -7121,7 +7085,7 @@
<reserved/>
<opt>tl</opt>
</mod>
- <mod name="lane_dest" size="2" default="h0">
+ <mod name="lane0" size="2" default="h0">
<opt>h0</opt>
<opt>h1</opt>
<opt>w0</opt>
@@ -7136,19 +7100,19 @@
<and>
<eq left="extend" right="#none"/>
<or>
- <eq left="lane_dest" right="#h0"/>
- <eq left="lane_dest" right="#h1"/>
+ <eq left="lane0" right="#h0"/>
+ <eq left="lane0" right="#h1"/>
</or>
</and>
<derived start="9" size="1">
- <eq left="lane_dest" right="#h0"/>
- <eq left="lane_dest" right="#h1"/>
+ <eq left="lane0" right="#h0"/>
+ <eq left="lane0" right="#h1"/>
</derived>
</encoding>
<encoding mask="0xffc00" exact="0x63000">
<and>
<neq left="extend" right="#none"/>
- <eq left="lane_dest" right="#w0"/>
+ <eq left="lane0" right="#w0"/>
</and>
<derived start="9" size="1">
<eq left="extend" right="#sext"/>
@@ -7158,14 +7122,13 @@
<encoding mask="0xffc00" exact="0x61800">
<and>
<neq left="extend" right="#none"/>
- <eq left="lane_dest" right="#d0"/>
+ <eq left="lane0" right="#d0"/>
</and>
<derived start="9" size="1">
<eq left="extend" right="#sext"/>
<eq left="extend" right="#zext"/>
</derived>
</encoding>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i24" staging="w=1" mask="0xffe00" exact="0x65000" message="load">
@@ -7181,7 +7144,6 @@
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i32" staging="w=1" message="load">
@@ -7197,7 +7159,7 @@
<reserved/>
<opt>tl</opt>
</mod>
- <mod name="lane_dest" size="1" opt="d0"/>
+ <mod name="lane0" size="1" opt="d0"/>
<mod name="extend" size="2">
<opt>none</opt>
<opt>sext</opt>
@@ -7206,20 +7168,19 @@
<encoding mask="0xffe00" exact="0x60c00">
<and>
<eq left="extend" right="#none"/>
- <eq left="lane_dest" right="#none"/>
+ <eq left="lane0" right="#none"/>
</and>
</encoding>
<encoding mask="0xffc00" exact="0x61c00">
<and>
<neq left="extend" right="#none"/>
- <eq left="lane_dest" right="#d0"/>
+ <eq left="lane0" right="#d0"/>
</and>
<derived start="9" size="1">
<eq left="extend" right="#sext"/>
<eq left="extend" right="#zext"/>
</derived>
</encoding>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i48" staging="w=2" mask="0xffe00" exact="0x65200" message="load">
@@ -7235,7 +7196,6 @@
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i64" staging="w=2" mask="0xffe00" exact="0x60e00" message="load">
@@ -7251,7 +7211,6 @@
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i8" staging="w=1" message="load">
@@ -7267,7 +7226,7 @@
<reserved/>
<opt>tl</opt>
</mod>
- <mod name="lane_dest" size="3" default="b0">
+ <mod name="lane0" size="3" default="b0">
<opt>b0</opt>
<opt>b1</opt>
<opt>b2</opt>
@@ -7286,25 +7245,25 @@
<and>
<eq left="extend" right="#none"/>
<or>
- <eq left="lane_dest" right="#b0"/>
- <eq left="lane_dest" right="#b1"/>
- <eq left="lane_dest" right="#b2"/>
- <eq left="lane_dest" right="#b3"/>
+ <eq left="lane0" right="#b0"/>
+ <eq left="lane0" right="#b1"/>
+ <eq left="lane0" right="#b2"/>
+ <eq left="lane0" right="#b3"/>
</or>
</and>
<derived start="9" size="2">
- <eq left="lane_dest" right="#b0"/>
- <eq left="lane_dest" right="#b1"/>
- <eq left="lane_dest" right="#b2"/>
- <eq left="lane_dest" right="#b3"/>
+ <eq left="lane0" right="#b0"/>
+ <eq left="lane0" right="#b1"/>
+ <eq left="lane0" right="#b2"/>
+ <eq left="lane0" right="#b3"/>
</derived>
</encoding>
<encoding mask="0xff800" exact="0x63800">
<and>
<neq left="extend" right="#none"/>
<or>
- <eq left="lane_dest" right="#h0"/>
- <eq left="lane_dest" right="#h1"/>
+ <eq left="lane0" right="#h0"/>
+ <eq left="lane0" right="#h1"/>
</or>
</and>
<derived start="9" size="1">
@@ -7312,14 +7271,14 @@
<eq left="extend" right="#zext"/>
</derived>
<derived start="10" size="1">
- <eq left="lane_dest" right="#h0"/>
- <eq left="lane_dest" right="#h1"/>
+ <eq left="lane0" right="#h0"/>
+ <eq left="lane0" right="#h1"/>
</derived>
</encoding>
<encoding mask="0xffc00" exact="0x63400">
<and>
<neq left="extend" right="#none"/>
- <eq left="lane_dest" right="#w0"/>
+ <eq left="lane0" right="#w0"/>
</and>
<derived start="9" size="1">
<eq left="extend" right="#sext"/>
@@ -7329,14 +7288,13 @@
<encoding mask="0xffc00" exact="0x61400">
<and>
<neq left="extend" right="#none"/>
- <eq left="lane_dest" right="#d0"/>
+ <eq left="lane0" right="#d0"/>
</and>
<derived start="9" size="1">
<eq left="extend" right="#sext"/>
<eq left="extend" right="#zext"/>
</derived>
</encoding>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOAD.i96" staging="w=3" mask="0xffe00" exact="0x65400" message="load">
@@ -7352,7 +7310,6 @@
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+LOGB.f32" mask="0xfffe0" exact="0x3d9a0">
@@ -7438,7 +7395,7 @@
</mod>
</ins>
- <ins name="+NOP" mask="0xfffff" exact="0x3d964" dests="0"/>
+ <ins name="+NOP.i32" mask="0xfffff" exact="0x3d964"/>
<ins name="+QUIET.f32" mask="0xffff8" exact="0x3d970">
<src start="0"/>
@@ -7562,12 +7519,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i16" staging="r=1" mask="0xffe00" exact="0x62800" message="store" dests="0">
@@ -7578,12 +7534,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i24" staging="r=1" mask="0xffe00" exact="0x65800" message="store" dests="0">
@@ -7594,12 +7549,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i32" staging="r=1" mask="0xffe00" exact="0x62c00" message="store" dests="0">
@@ -7610,12 +7564,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i48" staging="r=2" mask="0xffe00" exact="0x65a00" message="store" dests="0">
@@ -7626,12 +7579,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i64" staging="r=2" mask="0xffe00" exact="0x62e00" message="store" dests="0">
@@ -7642,12 +7594,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i8" staging="r=1" mask="0xffe00" exact="0x62000" message="store" dests="0">
@@ -7658,12 +7609,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+STORE.i96" staging="r=3" mask="0xffe00" exact="0x65c00" message="store" dests="0">
@@ -7674,12 +7624,11 @@
<opt>none</opt>
<opt>wls</opt>
<opt>stream</opt>
- <opt pseudo="true">pos</opt>
- <opt pseudo="true">vary</opt>
+ <reserved/>
+ <reserved/>
<reserved/>
<opt>tl</opt>
</mod>
- <immediate name="byte_offset" size="16" pseudo="true"/>
</ins>
<ins name="+ST_CVT" staging="r=format" mask="0xff800" exact="0xc9800" message="store" dests="0">
@@ -7704,7 +7653,7 @@
</mod>
</ins>
- <ins name="+ST_TILE" staging="r=format" mask="0xff800" exact="0xcb800" message="tile" dests="0">
+ <ins name="+ST_TILE" staging="r=vecsize" mask="0xff800" exact="0xcb800" message="tile" dests="0">
<src start="0"/>
<src start="3"/>
<src start="6" mask="0xf7"/>
@@ -7714,12 +7663,6 @@
<opt>v3</opt>
<opt>v4</opt>
</mod>
- <mod name="register_format" size="3" pseudo="true">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>s32</opt>
- </mod>
</ins>
<ins name="+SWZ.v2i16" mask="0xfffc8" exact="0x3d948">
@@ -7753,27 +7696,6 @@
<mod name="skip" start="9" size="1" opt="skip"/>
<!-- not actually encoded, but used for IR -->
<immediate name="sr_count" size="4" pseudo="true"/>
- <immediate name="sr_count_2" size="4" pseudo="true"/>
- <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true">
- <opt>computed_lod</opt>
- <opt>zero_lod</opt>
- </mod>
- </ins>
-
- <!-- Pseudo instruction representing dual texturing on Bifrost. Lowered to
- TEXC after register allocation, when the second destination register can
- be combined with the texture operation descriptor. -->
- <ins name="+TEXC_DUAL" staging="rw=sr_count" pseudo="true" message="tex" dests="2">
- <src start="0"/>
- <src start="3"/>
- <src start="6" mask="0xf7"/>
- <mod name="skip" start="9" size="1" opt="skip"/>
- <immediate name="sr_count" size="4" pseudo="true"/>
- <immediate name="sr_count_2" size="4" pseudo="true"/>
- <mod name="lod_mode" start="13" size="1" default="zero_lod" pseudo="true">
- <opt>computed_lod</opt>
- <opt>zero_lod</opt>
- </mod>
</ins>
<ins name="+TEXS_2D.f16" staging="w=2" mask="0xfc000" exact="0xd8000" message="tex">
@@ -7959,7 +7881,6 @@
<opt>rtz</opt>
<opt>rtna</opt>
</mod>
- <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
<derived start="6" size="1">
<and>
<eq left="abs0" right="#none"/>
@@ -8261,11 +8182,11 @@
<mod name="preserve_null" size="1" opt="preserve_null"/>
</ins>
- <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX. Real Valhall instructions. -->
- <ins name="+ATOM_RETURN.i32" pseudo="true" staging="rw=sr_count" message="atomic">
+ <!-- Scheduler lowered to *ATOM_C.i32/+ATOM_CX -->
+ <ins name="+PATOM_C.i32" pseudo="true" staging="rw=sr_count" message="atomic">
<src start="0"/>
<src start="3"/>
- <mod name="atom_opc" start="9" size="5">
+ <mod name="atom_opc" start="9" size="4">
<reserved/>
<reserved/>
<opt>aadd</opt>
@@ -8281,14 +8202,10 @@
<opt>aand</opt>
<opt>aor</opt>
<opt>axor</opt>
- <opt>axchg</opt> <!-- For Valhall -->
- <opt>acmpxchg</opt> <!-- For Valhall -->
</mod>
- <!-- not actually encoded, but used for IR -->
- <immediate name="sr_count" size="4" pseudo="true"/>
</ins>
- <ins name="+ATOM1_RETURN.i32" pseudo="true" staging="w=sr_count" message="atomic">
+ <ins name="+PATOM_C1.i32" pseudo="true" staging="w=sr_count" message="atomic">
<src start="0"/>
<src start="3"/>
<mod name="atom_opc" start="6" size="3">
@@ -8298,32 +8215,6 @@
<opt>asmax1</opt>
<opt>aor1</opt>
</mod>
- <!-- not actually encoded, but used for IR -->
- <immediate name="sr_count" size="4" pseudo="true"/>
- </ins>
-
- <ins name="+ATOM.i32" pseudo="true" staging="r=sr_count" message="atomic">
- <src start="0"/>
- <src start="3"/>
- <mod name="atom_opc" start="9" size="4">
- <reserved/>
- <reserved/>
- <opt>aadd</opt>
- <reserved/>
- <reserved/>
- <reserved/>
- <reserved/>
- <reserved/>
- <opt>asmin</opt>
- <opt>asmax</opt>
- <opt>aumin</opt>
- <opt>aumax</opt>
- <opt>aand</opt>
- <opt>aor</opt>
- <opt>axor</opt>
- </mod>
- <!-- not actually encoded, but used for IR -->
- <immediate name="sr_count" size="4" pseudo="true"/>
</ins>
<!-- *CUBEFACE1/+CUBEFACE2 pair, two destinations, scheduler lowered -->
@@ -8336,982 +8227,4 @@
<mod name="neg2" size="1" opt="neg"/>
</ins>
- <ins name="+IADD_IMM.i32" pseudo="true">
- <src start="0"/>
- <immediate name="index" size="32"/>
- </ins>
-
- <ins name="+IADD_IMM.v2i16" pseudo="true">
- <src start="0"/>
- <immediate name="index" size="32"/>
- </ins>
-
- <ins name="+IADD_IMM.v4i8" pseudo="true">
- <src start="0"/>
- <immediate name="index" size="32"/>
- </ins>
-
- <ins name="+FADD_IMM.f32" pseudo="true">
- <src start="0"/>
- <immediate name="index" size="32"/>
- </ins>
-
- <ins name="+FADD_IMM.v2f16" pseudo="true">
- <src start="0"/>
- <immediate name="index" size="32"/>
- </ins>
-
- <ins name="*FABSNEG.f32" pseudo="true">
- <src start="0" mask="0xfb"/>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="abs0" start="12" size="1" opt="abs"/>
- <mod name="widen0" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- </ins>
-
- <ins name="*FABSNEG.v2f16" pseudo="true">
- <src start="0" mask="0xfb"/>
- <mod name="abs0" size="1" opt="abs"/>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="swz0" start="9" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- </ins>
-
- <ins name="*FCLAMP.f32" pseudo="true">
- <src start="0" mask="0xfb"/>
- <mod name="clamp" start="15" size="2">
- <opt>none</opt>
- <opt>clamp_0_inf</opt>
- <opt>clamp_m1_1</opt>
- <opt>clamp_0_1</opt>
- </mod>
- </ins>
-
- <ins name="*FCLAMP.v2f16" pseudo="true">
- <src start="0" mask="0xfb"/>
- <mod name="clamp" start="15" size="2">
- <opt>none</opt>
- <opt>clamp_0_inf</opt>
- <opt>clamp_m1_1</opt>
- <opt>clamp_0_1</opt>
- </mod>
- </ins>
-
- <ins name="+DISCARD.b32" pseudo="true" dests="0">
- <src start="0"/>
- <mod name="widen0" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- </ins>
-
- <ins name="+TEX_SINGLE" staging="rw=sr_count" message="tex" pseudo="true">
- <src start="0"/>
- <src start="1"/>
- <immediate name="sr_count" size="4" pseudo="true"/>
- <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
- <mod name="skip" start="9" size="1" opt="skip"/>
- <mod name="shadow" start="9" size="1" opt="shadow"/>
- <mod name="array_enable" start="9" size="1" opt="array_enable"/>
- <mod name="dimension" start="9" size="2">
- <opt>1d</opt>
- <opt>2d</opt>
- <opt>3d</opt>
- <opt>cube</opt>
- </mod>
- <mod name="write_mask" start="9" size="4">
- <opt>none</opt>
- <opt>r</opt>
- <opt>g</opt>
- <opt>rg</opt>
- <opt>b</opt>
- <opt>rb</opt>
- <opt>gb</opt>
- <opt>rgb</opt>
- <opt>a</opt>
- <opt>ra</opt>
- <opt>ga</opt>
- <opt>rga</opt>
- <opt>ba</opt>
- <opt>rba</opt>
- <opt>gba</opt>
- <opt>rgba</opt>
- </mod>
- <mod name="va_lod_mode" start="13" size="3" default="zero_lod">
- <opt>zero_lod</opt>
- <opt>computed_lod</opt>
- <opt>explicit</opt>
- <opt>computed_bias</opt>
- <opt>grdesc</opt>
- </mod>
- <mod name="register_format" size="4">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- </mod>
- </ins>
-
- <ins name="+TEX_FETCH" staging="rw=sr_count" message="tex" pseudo="true">
- <src start="0"/>
- <src start="1"/>
- <immediate name="sr_count" size="4" pseudo="true"/>
- <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
- <mod name="skip" start="9" size="1" opt="skip"/>
- <mod name="array_enable" start="9" size="1" opt="array_enable"/>
- <mod name="dimension" start="9" size="2">
- <opt>1d</opt>
- <opt>2d</opt>
- <opt>3d</opt>
- <opt>cube</opt>
- </mod>
- <mod name="write_mask" start="9" size="4">
- <opt>none</opt>
- <opt>r</opt>
- <opt>g</opt>
- <opt>rg</opt>
- <opt>b</opt>
- <opt>rb</opt>
- <opt>gb</opt>
- <opt>rgb</opt>
- <opt>a</opt>
- <opt>ra</opt>
- <opt>ga</opt>
- <opt>rga</opt>
- <opt>ba</opt>
- <opt>rba</opt>
- <opt>gba</opt>
- <opt>rgba</opt>
- </mod>
- <mod name="register_format" size="4">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- </mod>
- </ins>
-
- <ins name="+TEX_GATHER" staging="rw=sr_count" message="tex" pseudo="true">
- <src start="0"/>
- <src start="1"/>
- <immediate name="sr_count" size="4" pseudo="true"/>
- <mod name="texel_offset" start="9" size="1" opt="texel_offset"/>
- <mod name="skip" start="9" size="1" opt="skip"/>
- <mod name="shadow" start="9" size="1" opt="shadow"/>
- <mod name="array_enable" start="9" size="1" opt="array_enable"/>
- <mod name="integer_coordinates" start="9" size="1" opt="integer_coordinates"/>
- <mod name="fetch_component" start="9" size="2">
- <opt>gather4_r</opt>
- <opt>gather4_g</opt>
- <opt>gather4_b</opt>
- <opt>gather4_a</opt>
- </mod>
- <mod name="dimension" start="9" size="2">
- <opt>1d</opt>
- <opt>2d</opt>
- <opt>3d</opt>
- <opt>cube</opt>
- </mod>
- <mod name="write_mask" start="9" size="4">
- <opt>none</opt>
- <opt>r</opt>
- <opt>g</opt>
- <opt>rg</opt>
- <opt>b</opt>
- <opt>rb</opt>
- <opt>gb</opt>
- <opt>rgb</opt>
- <opt>a</opt>
- <opt>ra</opt>
- <opt>ga</opt>
- <opt>rga</opt>
- <opt>ba</opt>
- <opt>rba</opt>
- <opt>gba</opt>
- <opt>rgba</opt>
- </mod>
- <mod name="register_format" size="4">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- </mod>
- </ins>
-
- <ins name="+CUBEFACE2_V9" pseudo="true">
- <src start="0" mask="0xfb"/>
- <src start="3" mask="0xfb"/>
- <src start="6"/>
- <mod name="neg0" size="1" opt="neg"/>
- <mod name="neg1" size="1" opt="neg"/>
- <mod name="neg2" size="1" opt="neg"/>
- </ins>
-
- <ins name="+LD_VAR_BUF_IMM.f32" staging="w=format" message="varying" pseudo="true">
- <src start="0"/>
- <immediate name="index" start="3" size="5"/>
- <mod name="vecsize" start="8" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- <mod name="update" size="2">
- <opt>store</opt>
- <opt>retrieve</opt>
- <opt>conditional</opt>
- <opt>clobber</opt>
- </mod>
- <mod name="register_format" size="2">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>u16</opt>
- </mod>
- <mod name="source_format" size="2">
- <opt>flat32</opt>
- <opt>flat16</opt>
- <opt>f32</opt>
- <opt>f16</opt>
- </mod>
- <mod name="sample" size="3">
- <opt>center</opt>
- <opt>centroid</opt>
- <opt>sample</opt>
- <opt>explicit</opt>
- <opt>none</opt>
- </mod>
- </ins>
-
- <ins name="+LD_VAR_BUF.f32" staging="w=format" message="varying" pseudo="true">
- <src start="0"/>
- <src start="1"/>
- <mod name="vecsize" start="8" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- <mod name="update" size="2">
- <opt>store</opt>
- <opt>retrieve</opt>
- <opt>conditional</opt>
- <opt>clobber</opt>
- </mod>
- <mod name="register_format" size="2">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>u16</opt>
- </mod>
- <mod name="source_format" size="2">
- <opt>flat32</opt>
- <opt>flat16</opt>
- <opt>f32</opt>
- <opt>f16</opt>
- </mod>
- <mod name="sample" size="3">
- <opt>center</opt>
- <opt>centroid</opt>
- <opt>sample</opt>
- <opt>explicit</opt>
- <opt>none</opt>
- </mod>
- </ins>
-
- <ins name="+LD_VAR_BUF_IMM.f16" staging="w=format" message="varying" pseudo="true">
- <src start="0"/>
- <immediate name="index" start="3" size="5"/>
- <mod name="vecsize" start="8" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- <mod name="update" size="2">
- <opt>store</opt>
- <opt>retrieve</opt>
- <opt>conditional</opt>
- <opt>clobber</opt>
- </mod>
- <mod name="register_format" size="2">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>u16</opt>
- </mod>
- <mod name="source_format" size="2">
- <opt>flat32</opt>
- <opt>flat16</opt>
- <opt>f32</opt>
- <opt>f16</opt>
- </mod>
- <mod name="sample" size="3">
- <opt>center</opt>
- <opt>centroid</opt>
- <opt>sample</opt>
- <opt>explicit</opt>
- <opt>none</opt>
- </mod>
- </ins>
-
- <ins name="+LD_VAR_BUF.f16" staging="w=format" message="varying" pseudo="true">
- <src start="0"/>
- <src start="1"/>
- <mod name="vecsize" start="8" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- <mod name="update" size="2">
- <opt>store</opt>
- <opt>retrieve</opt>
- <opt>conditional</opt>
- <opt>clobber</opt>
- </mod>
- <mod name="register_format" size="2">
- <opt>f32</opt>
- <opt>f16</opt>
- <opt>u32</opt>
- <opt>u16</opt>
- </mod>
- <mod name="source_format" size="2">
- <opt>flat32</opt>
- <opt>flat16</opt>
- <opt>f32</opt>
- <opt>f16</opt>
- </mod>
- <mod name="sample" size="3">
- <opt>center</opt>
- <opt>centroid</opt>
- <opt>sample</opt>
- <opt>explicit</opt>
- <opt>none</opt>
- </mod>
- </ins>
-
- <ins name="+LEA_BUF_IMM" staging="w=2" message="attribute" pseudo="true">
- <src start="0"/>
- </ins>
-
- <ins name="+LD_BUFFER.i128" staging="w=4" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- </ins>
-
- <ins name="+LD_BUFFER.i16" staging="w=1" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- <mod name="lane_dest" size="2" default="h0">
- <opt>h0</opt>
- <opt>h1</opt>
- <opt>w0</opt>
- <opt>d0</opt>
- </mod>
- <mod name="extend" size="2">
- <opt>none</opt>
- <opt>sext</opt>
- <opt>zext</opt>
- </mod>
- </ins>
-
- <ins name="+LD_BUFFER.i24" staging="w=1" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- </ins>
-
- <ins name="+LD_BUFFER.i32" staging="w=1" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- <mod name="lane_dest" size="1" opt="d0"/>
- <mod name="extend" size="2">
- <opt>none</opt>
- <opt>sext</opt>
- <opt>zext</opt>
- </mod>
- </ins>
-
- <ins name="+LD_BUFFER.i48" staging="w=2" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- </ins>
-
- <ins name="+LD_BUFFER.i64" staging="w=2" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- </ins>
-
- <ins name="+LD_BUFFER.i8" staging="w=1" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- <mod name="lane_dest" size="3" default="b0">
- <opt>b0</opt>
- <opt>b1</opt>
- <opt>b2</opt>
- <opt>b3</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- <opt>w0</opt>
- <opt>d0</opt>
- </mod>
- <mod name="extend" size="2">
- <opt>none</opt>
- <opt>sext</opt>
- <opt>zext</opt>
- </mod>
- </ins>
-
- <ins name="+LD_BUFFER.i96" staging="w=3" pseudo="true" message="load">
- <src start="0"/>
- <src start="3"/>
- </ins>
-
- <ins name="+BRANCHZI" pseudo="true" last="true" dests="0">
- <src start="0"/>
- <src start="6" mask="0xf7"/>
- <mod name="cmpf" size="1">
- <opt>eq</opt>
- <opt>ne</opt>
- </mod>
- </ins>
-
- <ins name="+LD_TEX" pseudo="true" staging="w=format" message="attribute">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="register_format" size="4">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- <opt>f64</opt>
- <opt>i64</opt>
- <opt>auto</opt>
- </mod>
- <mod name="vecsize" start="11" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- </ins>
-
- <ins name="+LD_TEX_IMM" pseudo="true" staging="w=format" message="attribute">
- <src start="0"/>
- <src start="3"/>
- <immediate name="texture_index" start="6" size="4"/>
- <mod name="register_format" size="4">
- <opt>f16</opt>
- <opt>f32</opt>
- <opt>s32</opt>
- <opt>u32</opt>
- <opt>s16</opt>
- <opt>u16</opt>
- <opt>f64</opt>
- <opt>i64</opt>
- <opt>auto</opt>
- </mod>
- <mod name="vecsize" start="11" size="2">
- <opt>none</opt>
- <opt>v2</opt>
- <opt>v3</opt>
- <opt>v4</opt>
- </mod>
- </ins>
-
- <ins name="*MKVEC.v2i8" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="lane0" start="12" size="2" default="b0">
- <opt>b0</opt>
- <opt>b1</opt>
- <opt>b2</opt>
- <opt>b3</opt>
- </mod>
- <mod name="lane1" start="13" size="2" default="b0">
- <opt>b0</opt>
- <opt>b1</opt>
- <opt>b2</opt>
- <opt>b3</opt>
- </mod>
- </ins>
-
- <ins name="+PHI" pseudo="true" variable_srcs="true"/>
-
- <ins name="+COLLECT.i32" pseudo="true" variable_srcs="true"/>
-
- <ins name="+SPLIT.i32" pseudo="true" variable_dests="true">
- <src start="0"/>
- </ins>
-
- <ins name="*FCMP_OR.f32" pseudo="true">
- <src start="0" mask="0xfb"/>
- <src start="3" mask="0xfb"/>
- <src start="6" mask="0xfb"/>
- <mod name="widen0" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- <mod name="widen1" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- <mod name="abs1" start="6" size="1" opt="abs"/>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="neg1" start="8" size="1" opt="neg"/>
- <mod name="abs0" start="12" size="1" opt="abs"/>
- <mod name="cmpf" start="13" size="3">
- <opt>eq</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>ne</opt>
- <opt>lt</opt>
- <opt>le</opt>
- <opt>gtlt</opt>
- <opt>total</opt>
- </mod>
- <mod name="result_type" start="16" size="2" default="i1">
- <opt>i1</opt>
- <opt>f1</opt>
- <opt>m1</opt>
- </mod>
- </ins>
-
- <ins name="*FCMP_OR.v2f16" pseudo="true">
- <src start="0" mask="0xfb"/>
- <src start="3" mask="0xfb"/>
- <src start="6" mask="0xfb"/>
- <mod name="abs0" size="1" opt="abs"/>
- <mod name="abs1" size="1" opt="abs"/>
- <mod name="cmpf" size="3">
- <opt>eq</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>ne</opt>
- <opt>lt</opt>
- <opt>le</opt>
- <opt>gtlt</opt>
- <opt>total</opt>
- </mod>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="neg1" start="8" size="1" opt="neg"/>
- <mod name="swz0" start="9" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="11" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="16" size="2" default="i1">
- <opt>i1</opt>
- <opt>f1</opt>
- <opt>m1</opt>
- </mod>
- </ins>
-
- <ins name="*FCMP_AND.f32" pseudo="true">
- <src start="0" mask="0xfb"/>
- <src start="3" mask="0xfb"/>
- <src start="6" mask="0xfb"/>
- <mod name="widen0" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- <mod name="widen1" size="2">
- <opt>none</opt>
- <opt>h0</opt>
- <opt>h1</opt>
- </mod>
- <mod name="abs1" start="6" size="1" opt="abs"/>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="neg1" start="8" size="1" opt="neg"/>
- <mod name="abs0" start="12" size="1" opt="abs"/>
- <mod name="cmpf" start="13" size="3">
- <opt>eq</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>ne</opt>
- <opt>lt</opt>
- <opt>le</opt>
- <opt>gtlt</opt>
- <opt>total</opt>
- </mod>
- <mod name="result_type" start="16" size="2" default="i1">
- <opt>i1</opt>
- <opt>f1</opt>
- <opt>m1</opt>
- </mod>
- </ins>
-
- <ins name="*FCMP_AND.v2f16" pseudo="true">
- <src start="0" mask="0xfb"/>
- <src start="3" mask="0xfb"/>
- <src start="6" mask="0xfb"/>
- <mod name="abs0" size="1" opt="abs"/>
- <mod name="abs1" size="1" opt="abs"/>
- <mod name="cmpf" size="3">
- <opt>eq</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>ne</opt>
- <opt>lt</opt>
- <opt>le</opt>
- <opt>gtlt</opt>
- <opt>total</opt>
- </mod>
- <mod name="neg0" start="7" size="1" opt="neg"/>
- <mod name="neg1" start="8" size="1" opt="neg"/>
- <mod name="swz0" start="9" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="11" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="16" size="2" default="i1">
- <opt>i1</opt>
- <opt>f1</opt>
- <opt>m1</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_MULTI.s32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_MULTI.u32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_OR.s32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_OR.u32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_OR.v2s16" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="swz0" start="6" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="8" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_OR.v2u16" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="swz0" start="6" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="8" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_OR.v4s8" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- <derived start="6" size="1">
- <eq left="cmpf" right="#gt"/>
- <eq left="cmpf" right="#ge"/>
- </derived>
- </ins>
-
- <ins name="+ICMP_OR.v4u8" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_AND.s32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_AND.u32" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_AND.v2s16" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="swz0" start="6" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="8" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_AND.v2u16" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="swz0" start="6" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="swz1" start="8" size="2" default="h01">
- <opt>h00</opt>
- <opt>h10</opt>
- <opt>h01</opt>
- <opt>h11</opt>
- </mod>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
- <ins name="+ICMP_AND.v4s8" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- <derived start="6" size="1">
- <eq left="cmpf" right="#gt"/>
- <eq left="cmpf" right="#ge"/>
- </derived>
- </ins>
-
- <ins name="+ICMP_AND.v4u8" pseudo="true">
- <src start="0"/>
- <src start="3"/>
- <src start="6"/>
- <mod name="result_type" start="10" size="1" default="i1">
- <opt>i1</opt>
- <opt>m1</opt>
- </mod>
- <mod name="cmpf" size="2">
- <opt>eq</opt>
- <opt>ne</opt>
- <opt>gt</opt>
- <opt>ge</opt>
- <opt>lt</opt>
- <opt>le</opt>
- </mod>
- </ins>
-
</bifrost>
diff --git a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
index 4ce47fb05..903ef4e02 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_builder.h.py
@@ -19,9 +19,7 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
-SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
- "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
- "not_result", "skip", "round", "ftz"])
+SKIP = set(["lane", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", "not_result", "skip"])
TEMPLATE = """
#ifndef _BI_BUILDER_H_
@@ -30,11 +28,6 @@ TEMPLATE = """
#include "compiler.h"
<%
-# For <32-bit loads/stores, the default extend `none` with a natural sized
-# input is not encodeable! To avoid a footgun, swap the default to `zext` which
-# will work as expected
-ZEXT_DEFAULT = set(["LOAD.i8", "LOAD.i16", "LOAD.i24", "STORE.i8", "STORE.i16", "STORE.i24"])
-
def nirtypes(opcode):
split = opcode.split('.', 1)
if len(split) < 2:
@@ -60,6 +53,19 @@ def nirtypes(opcode):
else:
return None
+def typesize(opcode):
+ if opcode[-3:] == '128':
+ return 128
+ if opcode[-2:] == '48':
+ return 48
+ elif opcode[-1] == '8':
+ return 8
+ else:
+ try:
+ return int(opcode[-2:])
+ except:
+ return None
+
def condition(opcode, typecheck, sizecheck):
cond = ''
if typecheck == True:
@@ -92,51 +98,27 @@ def to_suffix(op):
static inline
bi_instr * bi_${opcode.replace('.', '_').lower()}${to_suffix(ops[opcode])}(${signature(ops[opcode], modifiers)})
{
-<%
- op = ops[opcode]
- nr_dests = "nr_dests" if op["variable_dests"] else op["dests"]
- nr_srcs = "nr_srcs" if op["variable_srcs"] else src_count(op)
-%>
- size_t size = sizeof(bi_instr) + sizeof(bi_index) * (${nr_dests} + ${nr_srcs});
- bi_instr *I = (bi_instr *) rzalloc_size(b->shader, size);
-
+ bi_instr *I = rzalloc(b->shader, bi_instr);
I->op = BI_OPCODE_${opcode.replace('.', '_').upper()};
- I->nr_dests = ${nr_dests};
- I->nr_srcs = ${nr_srcs};
- I->dest = (bi_index *) (&I[1]);
- I->src = I->dest + ${nr_dests};
-
-% if not op["variable_dests"]:
-% for dest in range(op["dests"]):
+% for dest in range(ops[opcode]["dests"]):
I->dest[${dest}] = dest${dest};
% endfor
-%endif
-
-% if not op["variable_srcs"]:
-% for src in range(src_count(op)):
+% for src in range(src_count(ops[opcode])):
I->src[${src}] = src${src};
% endfor
-% endif
-
% for mod in ops[opcode]["modifiers"]:
-% if not should_skip(mod, opcode):
+% if mod[0:-1] not in SKIP and mod not in SKIP:
I->${mod} = ${mod};
% endif
% endfor
-% if ops[opcode]["rtz"]:
- I->round = BI_ROUND_RTZ;
-% endif
% for imm in ops[opcode]["immediates"]:
I->${imm} = ${imm};
% endfor
-% if opcode in ZEXT_DEFAULT:
- I->extend = BI_EXTEND_ZEXT;
-% endif
bi_builder_insert(&b->cursor, I);
return I;
}
-% if ops[opcode]["dests"] == 1 and not ops[opcode]["variable_dests"]:
+% if ops[opcode]["dests"] == 1:
static inline
bi_index bi_${opcode.replace('.', '_').lower()}(${signature(ops[opcode], modifiers, no_dests=True)})
{
@@ -193,26 +175,19 @@ modifier_lists = order_modifiers(ir_instructions)
# Generate type signature for a builder routine
-def should_skip(mod, op):
- # FROUND and HADD only make sense in context of a round mode, so override
- # the usual skip
- if mod == "round" and ("FROUND" in op or "HADD" in op):
- return False
-
+def should_skip(mod):
return mod in SKIP or mod[0:-1] in SKIP
def modifier_signature(op):
- return sorted([m for m in op["modifiers"].keys() if not should_skip(m, op["key"])])
+ return sorted([m for m in op["modifiers"].keys() if not should_skip(m)])
def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
return ", ".join(
["bi_builder *b"] +
(["nir_alu_type type"] if typeful == True else []) +
(["unsigned bitsize"] if sized == True else []) +
- (["unsigned nr_dests"] if op["variable_dests"] else
- ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])]) +
- (["unsigned nr_srcs"] if op["variable_srcs"] else
- ["bi_index src{}".format(i) for i in range(src_count(op))]) +
+ ["bi_index dest{}".format(i) for i in range(0 if no_dests else op["dests"])] +
+ ["bi_index src{}".format(i) for i in range(src_count(op))] +
["{} {}".format(
"bool" if len(modifiers[T[0:-1]] if T[-1] in "0123" else modifiers[T]) == 2 else
"enum bi_" + T[0:-1] if T[-1] in "0123" else
@@ -221,19 +196,11 @@ def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
["uint32_t {}".format(imm) for imm in op["immediates"]])
def arguments(op, temp_dest = True):
- dest_pattern = "bi_temp(b->shader)" if temp_dest else 'dest{}'
- dests = [dest_pattern.format(i) for i in range(op["dests"])]
- srcs = ["src{}".format(i) for i in range(src_count(op))]
-
- # Variable source/destinations just pass in the count
- if op["variable_dests"]:
- dests = ["nr_dests"]
-
- if op["variable_srcs"]:
- srcs = ["nr_srcs"]
-
- return ", ".join(["b"] + dests + srcs + modifier_signature(op) + op["immediates"])
+ return ", ".join(
+ ["b"] +
+ ["bi_temp(b->shader)" if temp_dest else 'dest{}'.format(i) for i in range(op["dests"])] +
+ ["src{}".format(i) for i in range(src_count(op))] +
+ modifier_signature(op) +
+ op["immediates"])
-print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers =
- modifier_lists, signature = signature, arguments = arguments, src_count =
- src_count, typesize = typesize, should_skip = should_skip))
+print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, modifiers = modifier_lists, signature = signature, arguments = arguments, src_count = src_count, SKIP = SKIP))
diff --git a/lib/mesa/src/panfrost/bifrost/bi_layout.c b/lib/mesa/src/panfrost/bifrost/bi_layout.c
index 7c034cb31..db66ed04f 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_layout.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_layout.c
@@ -32,6 +32,24 @@
* manipulating clause layouts.
*/
+/* Helper to see if a tuple can be inserted. We must satisfy the invariant:
+ *
+ * constant_count + tuple_count <= 13
+ *
+ * ...which is equivalent to the clause ending up with 8 or fewer quadwords.
+ * Inserting a tuple increases tuple_count by one, and if it reads a unique
+ * constant, it increases constant_count by one.
+ */
+
+bool
+bi_can_insert_tuple(bi_clause *clause, bool constant)
+{
+ unsigned constant_count = clause->constant_count + (constant ? 1 : 0);
+ unsigned tuple_count = clause->tuple_count + 1;
+
+ return (constant_count + tuple_count) <= 13;
+}
+
/* Is embedded constant 0 packed for free in a clause with this many tuples? */
bool
@@ -69,7 +87,7 @@ bi_ec0_packed(unsigned tuple_count)
* constants are packed two-by-two as constant quadwords.
*/
-static unsigned
+unsigned
bi_clause_quadwords(bi_clause *clause)
{
unsigned X = clause->tuple_count;
@@ -95,7 +113,7 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
/* Determine if the block we're branching to is strictly greater in
* source order */
- bool forwards = target->index > start->block->index;
+ bool forwards = target->base.name > start->block->base.name;
if (forwards) {
/* We have to jump through this block from the start of this
@@ -106,7 +124,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
/* We then need to jump through every clause of every following
* block until the target */
- bi_foreach_block_from(ctx, start->block, blk) {
+ bi_foreach_block_from(ctx, start->block, _blk) {
+ bi_block *blk = (bi_block *) _blk;
+
/* Don't double-count the first block */
if (blk == start->block)
continue;
@@ -133,7 +153,9 @@ bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target)
/* And jump back every clause of preceding blocks up through
* and including the target to get to the beginning of the
* target */
- bi_foreach_block_from_rev(ctx, start->block, blk) {
+ bi_foreach_block_from_rev(ctx, start->block, _blk) {
+ bi_block *blk = (bi_block *) _blk;
+
if (blk == start->block)
continue;
diff --git a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
index 883f53014..ed03d4c2c 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_lower_swizzle.c
@@ -30,51 +30,16 @@
* recombine swizzles where we can as an optimization.
*/
-static bool
-bi_swizzle_replicates_8(enum bi_swizzle swz)
-{
- switch (swz) {
- case BI_SWIZZLE_B0000:
- case BI_SWIZZLE_B1111:
- case BI_SWIZZLE_B2222:
- case BI_SWIZZLE_B3333:
- return true;
- default:
- return false;
- }
-}
-
static void
-lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
+bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
{
/* TODO: Use the opcode table and be a lot more methodical about this... */
switch (ins->op) {
- /* Some instructions used with 16-bit data never have swizzles */
case BI_OPCODE_CSEL_V2F16:
case BI_OPCODE_CSEL_V2I16:
case BI_OPCODE_CSEL_V2S16:
case BI_OPCODE_CSEL_V2U16:
-
- /* Despite ostensibly being 32-bit instructions, CLPER does not
- * inherently interpret the data, so it can be used for v2f16
- * derivatives, which might require swizzle lowering */
- case BI_OPCODE_CLPER_I32:
- case BI_OPCODE_CLPER_OLD_I32:
-
- /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
- * boolean is implemented as a 16-bit integer, the swizzle is needed
- * for correct operation if the instruction producing the 16-bit
- * boolean does not replicate to both halves of the containing 32-bit
- * register. As such, we may need to lower a swizzle.
- *
- * This is a silly hack. Ideally, code gen would be smart enough to
- * avoid this case (by replicating). In practice, silly hardware design
- * decisions force our hand here.
- */
- case BI_OPCODE_MUX_I32:
- case BI_OPCODE_CSEL_I32:
break;
-
case BI_OPCODE_IADD_V2S16:
case BI_OPCODE_IADD_V2U16:
case BI_OPCODE_ISUB_V2S16:
@@ -93,212 +58,28 @@ lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
return;
else
break;
-
- /* For some reason MUX.v2i16 allows swaps but not replication */
- case BI_OPCODE_MUX_V2I16:
- if (ins->src[src].swizzle == BI_SWIZZLE_H10)
- return;
- else
- break;
-
- /* No swizzles supported */
- case BI_OPCODE_HADD_V4U8:
- case BI_OPCODE_HADD_V4S8:
- case BI_OPCODE_CLZ_V4U8:
- case BI_OPCODE_IDP_V4I8:
- case BI_OPCODE_IABS_V4S8:
- case BI_OPCODE_ICMP_V4I8:
- case BI_OPCODE_ICMP_V4U8:
- case BI_OPCODE_MUX_V4I8:
- case BI_OPCODE_IADD_IMM_V4I8:
- break;
-
- case BI_OPCODE_LSHIFT_AND_V4I8:
- case BI_OPCODE_LSHIFT_OR_V4I8:
- case BI_OPCODE_LSHIFT_XOR_V4I8:
- case BI_OPCODE_RSHIFT_AND_V4I8:
- case BI_OPCODE_RSHIFT_OR_V4I8:
- case BI_OPCODE_RSHIFT_XOR_V4I8:
- /* Last source allows identity or replication */
- if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
- return;
-
- /* Others do not allow swizzles */
- break;
-
- /* We don't want to deal with reswizzling logic in modifier prop. Move
- * the swizzle outside, it's easier for clamp propagation. */
- case BI_OPCODE_FCLAMP_V2F16:
- {
- bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
- bi_index dest = ins->dest[0];
- bi_index tmp = bi_temp(ctx);
-
- bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
- ins->src[0].swizzle = BI_SWIZZLE_H01;
- ins->dest[0] = tmp;
- bi_swz_v2i16_to(&b, dest, swizzled_src);
- return;
- }
-
default:
return;
}
- /* First, try to apply a given swizzle to a constant to clear the
- * runtime swizzle. This is less heavy-handed than ignoring the
- * swizzle for scalar destinations, since it maintains
- * replication of the destination.
- */
- if (ins->src[src].type == BI_INDEX_CONSTANT) {
- ins->src[src].value = bi_apply_swizzle(ins->src[src].value,
- ins->src[src].swizzle);
- ins->src[src].swizzle = BI_SWIZZLE_H01;
+ /* Identity is ok (TODO: what about replicate only?) */
+ if (ins->src[src].swizzle == BI_SWIZZLE_H01)
return;
- }
-
- /* Even if the source does not replicate, if the consuming instruction
- * produces a 16-bit scalar, we can ignore the other component.
- */
- if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
- ins->src[src].swizzle == BI_SWIZZLE_H00)
- {
- ins->src[src].swizzle = BI_SWIZZLE_H01;
- return;
- }
/* Lower it away */
bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
-
- bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8);
- bi_index orig = ins->src[src];
- bi_index stripped = bi_replace_index(bi_null(), orig);
- stripped.swizzle = ins->src[src].swizzle;
-
- bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
-
- bi_replace_src(ins, src, swz);
+ ins->src[src] = bi_replace_index(ins->src[src],
+ bi_swz_v2i16(&b, ins->src[src]));
ins->src[src].swizzle = BI_SWIZZLE_H01;
}
-static bool
-bi_swizzle_replicates_16(enum bi_swizzle swz)
-{
- switch (swz) {
- case BI_SWIZZLE_H00:
- case BI_SWIZZLE_H11:
- return true;
- default:
- /* If a swizzle replicates every 8-bits, it also replicates
- * every 16-bits, so allow 8-bit replicating swizzles.
- */
- return bi_swizzle_replicates_8(swz);
- }
-}
-
-static bool
-bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
-{
- switch (I->op) {
-
- /* Instructions that construct vectors have replicated output if their
- * sources are identical. Check this case first.
- */
- case BI_OPCODE_MKVEC_V2I16:
- case BI_OPCODE_V2F16_TO_V2S16:
- case BI_OPCODE_V2F16_TO_V2U16:
- case BI_OPCODE_V2F32_TO_V2F16:
- case BI_OPCODE_V2S16_TO_V2F16:
- case BI_OPCODE_V2S8_TO_V2F16:
- case BI_OPCODE_V2S8_TO_V2S16:
- case BI_OPCODE_V2U16_TO_V2F16:
- case BI_OPCODE_V2U8_TO_V2F16:
- case BI_OPCODE_V2U8_TO_V2U16:
- return bi_is_value_equiv(I->src[0], I->src[1]);
-
- /* 16-bit transcendentals are defined to output zero in their
- * upper half, so they do not replicate
- */
- case BI_OPCODE_FRCP_F16:
- case BI_OPCODE_FRSQ_F16:
- return false;
-
- /* Not sure, be conservative, we don't use these.. */
- case BI_OPCODE_VN_ASST1_F16:
- case BI_OPCODE_FPCLASS_F16:
- case BI_OPCODE_FPOW_SC_DET_F16:
- return false;
-
- default:
- break;
- }
-
- /* Replication analysis only makes sense for ALU instructions */
- if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
- return false;
-
- /* We only analyze 16-bit instructions for 16-bit replication. We could
- * maybe do better.
- */
- if (bi_opcode_props[I->op].size != BI_SIZE_16)
- return false;
-
- bi_foreach_src(I, s) {
- if (bi_is_null(I->src[s]))
- continue;
-
- /* Replicated swizzles */
- if (bi_swizzle_replicates_16(I->src[s].swizzle))
- continue;
-
- /* Replicated values */
- if (bi_is_ssa(I->src[s]) &&
- BITSET_TEST(replicates_16, I->src[s].value))
- continue;
-
- /* Replicated constants */
- if (I->src[s].type == BI_INDEX_CONSTANT &&
- (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
- continue;
-
- return false;
- }
-
- return true;
-}
-
void
bi_lower_swizzle(bi_context *ctx)
{
bi_foreach_instr_global_safe(ctx, ins) {
bi_foreach_src(ins, s) {
- if (bi_is_null(ins->src[s])) continue;
- if (ins->src[s].swizzle == BI_SWIZZLE_H01) continue;
-
- lower_swizzle(ctx, ins, s);
+ if (!bi_is_null(ins->src[s]))
+ bi_lower_swizzle_16(ctx, ins, s);
}
}
-
- /* Now that we've lowered swizzles, clean up the mess */
- BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
-
- bi_foreach_instr_global(ctx, ins) {
- if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
- BITSET_SET(replicates_16, ins->dest[0].value);
-
- if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
- BITSET_TEST(replicates_16, ins->src[0].value)) {
- ins->op = BI_OPCODE_MOV_I32;
- ins->src[0].swizzle = BI_SWIZZLE_H01;
- }
-
- /* The above passes rely on replicating destinations. For
- * Valhall, we will want to optimize this. For now, default
- * to Bifrost compatible behaviour.
- */
- if (ins->nr_dests)
- ins->dest[0].swizzle = BI_SWIZZLE_H01;
- }
-
- free(replicates_16);
}
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
index cbe0ae458..7ef88da8f 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.c.py
@@ -21,15 +21,11 @@
# IN THE SOFTWARE.
TEMPLATE = """#include "bi_opcodes.h"
-<%
-def hasmod(mods, name):
- return 1 if name in mods else 0
-%>
+
struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = {
% for opcode in sorted(mnemonics):
<%
add = instructions["+" + opcode][0][1] if "+" + opcode in instructions else None
- size = typesize(opcode)
message = add["message"].upper() if add else "NONE"
sr_count = add["staging_count"].upper() if add else "0"
sr_read = int(add["staging"] in ["r", "rw"] if add else False)
@@ -39,18 +35,10 @@ struct bi_op_props bi_opcode_props[BI_NUM_OPCODES] = {
branch = int(opcode.startswith('BRANCH'))
has_fma = int("*" + opcode in instructions)
has_add = int("+" + opcode in instructions)
- mods = ops[opcode]['modifiers']
- clamp = hasmod(mods, 'clamp')
- not_result = hasmod(mods, 'not_result')
- abs = hasmod(mods, 'abs0') | (hasmod(mods, 'abs1') << 1) | (hasmod(mods, 'abs2') << 2)
- neg = hasmod(mods, 'neg0') | (hasmod(mods, 'neg1') << 1) | (hasmod(mods, 'neg2') << 2)
- m_not = hasmod(mods, 'not1')
%>
[BI_OPCODE_${opcode.replace('.', '_').upper()}] = {
- "${opcode}", BIFROST_MESSAGE_${message}, BI_SIZE_${size},
- BI_SR_COUNT_${sr_count}, ${sr_read}, ${sr_write}, ${last}, ${branch},
- ${table}, ${has_fma}, ${has_add}, ${clamp}, ${not_result}, ${abs},
- ${neg}, ${m_not},
+ "${opcode}", BIFROST_MESSAGE_${message}, BI_SR_COUNT_${sr_count},
+ ${sr_read}, ${sr_write}, ${last}, ${branch}, ${table}, ${has_fma}, ${has_add},
},
% endfor
};"""
@@ -63,4 +51,4 @@ instructions = parse_instructions(sys.argv[1], include_pseudo = True)
ir_instructions = partition_mnemonics(instructions)
mnemonics = set(x[1:] for x in instructions.keys())
-print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions, typesize = typesize))
+print(Template(COPYRIGHT + TEMPLATE).render(ops = ir_instructions, mnemonics = mnemonics, instructions = instructions))
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
index 3b8ff0b33..b807513e1 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_opcodes.h.py
@@ -64,23 +64,11 @@ enum bi_sr_count {
BI_SR_COUNT_SR_COUNT = 7
};
-enum bi_size {
- BI_SIZE_8 = 0,
- BI_SIZE_16,
- BI_SIZE_24,
- BI_SIZE_32,
- BI_SIZE_48,
- BI_SIZE_64,
- BI_SIZE_96,
- BI_SIZE_128,
-};
-
/* Description of an opcode in the IR */
struct bi_op_props {
const char *name;
enum bifrost_message_type message : 4;
- enum bi_size size : 3;
enum bi_sr_count sr_count : 3;
bool sr_read : 1;
bool sr_write : 1;
@@ -89,13 +77,6 @@ struct bi_op_props {
bool table : 1;
bool fma : 1;
bool add : 1;
-
- /* Supported propagable modifiers */
- bool clamp : 1;
- bool not_result : 1;
- unsigned abs : 3;
- unsigned neg : 3;
- bool not_mod : 1;
};
/* Generated in bi_opcodes.c.py */
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
index 13b9b0d2b..06b0e41e8 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_opt_copy_prop.c
@@ -23,89 +23,54 @@
*/
#include "compiler.h"
-#include "bi_builder.h"
-/* SSA copy propagation */
+/* A simple scalar-only SSA-based copy-propagation pass. TODO: vectors */
static bool
-bi_reads_fau(bi_instr *ins)
+bi_is_copy(bi_instr *ins)
{
- bi_foreach_src(ins, s) {
- if (ins->src[s].type == BI_INDEX_FAU)
- return true;
- }
+ return (ins->op == BI_OPCODE_MOV_I32) && bi_is_ssa(ins->dest[0])
+ && (bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU);
+}
- return false;
+static inline unsigned
+bi_word_node(bi_index idx)
+{
+ assert(idx.type == BI_INDEX_NORMAL && !idx.reg);
+ return (idx.value << 2) | idx.offset;
}
void
bi_opt_copy_prop(bi_context *ctx)
{
- /* Chase SPLIT of COLLECT. Instruction selection usually avoids this
- * pattern (due to the split cache), but it is inevitably generated by
- * the UBO pushing pass.
- */
- bi_instr **collects = calloc(sizeof(bi_instr *), ctx->ssa_alloc);
- bi_foreach_instr_global_safe(ctx, I) {
- if (I->op == BI_OPCODE_COLLECT_I32) {
- /* Rewrite trivial collects while we're at it */
- if (I->nr_srcs == 1)
- I->op = BI_OPCODE_MOV_I32;
-
- collects[I->dest[0].value] = I;
- } else if (I->op == BI_OPCODE_SPLIT_I32) {
- /* Rewrite trivial splits while we're at it */
- if (I->nr_dests == 1)
- I->op = BI_OPCODE_MOV_I32;
-
- bi_instr *collect = collects[I->src[0].value];
- if (!collect)
- continue;
-
- /* Lower the split to moves, copyprop cleans up */
- bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
-
- bi_foreach_dest(I, d)
- bi_mov_i32_to(&b, I->dest[d], collect->src[d]);
-
- bi_remove_instruction(I);
- }
- }
-
- free(collects);
-
- bi_index *replacement = calloc(sizeof(bi_index), ctx->ssa_alloc);
+ bi_index *replacement = calloc(sizeof(bi_index), ((ctx->ssa_alloc + 1) << 2));
bi_foreach_instr_global_safe(ctx, ins) {
- if (ins->op == BI_OPCODE_MOV_I32 && ins->src[0].type != BI_INDEX_REGISTER) {
+ if (bi_is_copy(ins)) {
bi_index replace = ins->src[0];
/* Peek through one layer so copyprop converges in one
* iteration for chained moves */
if (bi_is_ssa(replace)) {
- bi_index chained = replacement[replace.value];
+ bi_index chained = replacement[bi_word_node(replace)];
if (!bi_is_null(chained))
replace = chained;
}
- assert(ins->nr_dests == 1);
- replacement[ins->dest[0].value] = replace;
+ replacement[bi_word_node(ins->dest[0])] = replace;
}
bi_foreach_src(ins, s) {
bi_index use = ins->src[s];
- if (use.type != BI_INDEX_NORMAL) continue;
- if (bi_is_staging_src(ins, s)) continue;
-
- bi_index repl = replacement[use.value];
+ if (use.type != BI_INDEX_NORMAL || use.reg) continue;
+ if (bi_count_read_registers(ins, s) != 1) continue;
- if (repl.type == BI_INDEX_CONSTANT && bi_reads_fau(ins))
- continue;
+ bi_index repl = replacement[bi_word_node(use)];
if (!bi_is_null(repl))
- bi_replace_src(ins, s, repl);
+ ins->src[s] = bi_replace_index(ins->src[s], repl);
}
}
diff --git a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
index 5a37bf3a9..8debdd486 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_opt_push_ubo.c
@@ -30,16 +30,10 @@
* structure returned back to the command stream. */
static bool
-bi_is_ubo(bi_instr *ins)
-{
- return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
- (ins->seg == BI_SEG_UBO);
-}
-
-static bool
bi_is_direct_aligned_ubo(bi_instr *ins)
{
- return bi_is_ubo(ins) &&
+ return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
+ (ins->seg == BI_SEG_UBO) &&
(ins->src[0].type == BI_INDEX_CONSTANT) &&
(ins->src[1].type == BI_INDEX_CONSTANT) &&
((ins->src[0].value & 0x3) == 0);
@@ -79,12 +73,8 @@ bi_analyze_ranges(bi_context *ctx)
assert(ubo < res.nr_blocks);
assert(channels > 0 && channels <= 4);
- if (word >= MAX_UBO_WORDS) continue;
-
- /* Must use max if the same base is read with different channel
- * counts, which is possible with nir_opt_shrink_vectors */
- uint8_t *range = res.blocks[ubo].range;
- range[word] = MAX2(range[word], channels);
+ if (word < MAX_UBO_WORDS)
+ res.blocks[ubo].range[word] = channels;
}
return res;
@@ -128,51 +118,42 @@ bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
void
bi_opt_push_ubo(bi_context *ctx)
{
- struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
- bi_pick_ubo(ctx->info.push, &analysis);
+ if (ctx->inputs->no_ubo_to_push)
+ return;
- ctx->ubo_mask = 0;
+ /* This pass only runs once */
+ assert(ctx->info->push.count == 0);
+
+ struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
+ bi_pick_ubo(&ctx->info->push, &analysis);
bi_foreach_instr_global_safe(ctx, ins) {
- if (!bi_is_ubo(ins)) continue;
+ if (!bi_is_direct_aligned_ubo(ins)) continue;
unsigned ubo = ins->src[1].value;
unsigned offset = ins->src[0].value;
- if (!bi_is_direct_aligned_ubo(ins)) {
- /* The load can't be pushed, so this UBO needs to be
- * uploaded conventionally */
- if (ins->src[1].type == BI_INDEX_CONSTANT)
- ctx->ubo_mask |= BITSET_BIT(ubo);
- else
- ctx->ubo_mask = ~0;
-
- continue;
- }
-
/* Check if we decided to push this */
assert(ubo < analysis.nr_blocks);
- if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
- ctx->ubo_mask |= BITSET_BIT(ubo);
- continue;
- }
+ if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) continue;
/* Replace the UBO load with moves from FAU */
bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
- unsigned nr = bi_opcode_props[ins->op].sr_count;
- bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr);
+ unsigned channels = bi_opcode_props[ins->op].sr_count;
- bi_foreach_src(vec, w) {
+ for (unsigned w = 0; w < channels; ++w) {
/* FAU is grouped in pairs (2 x 4-byte) */
unsigned base =
- pan_lookup_pushed_ubo(ctx->info.push, ubo,
+ pan_lookup_pushed_ubo(&ctx->info->push, ubo,
(offset + 4 * w));
unsigned fau_idx = (base >> 1);
unsigned fau_hi = (base & 1);
- vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
+ bi_mov_i32_to(&b,
+ bi_word(ins->dest[0], w),
+ bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi));
}
bi_remove_instruction(ins);
@@ -180,169 +161,3 @@ bi_opt_push_ubo(bi_context *ctx)
free(analysis.blocks);
}
-
-typedef struct {
- BITSET_DECLARE(row, PAN_MAX_PUSH);
-} adjacency_row;
-
-/* Find the connected component containing `node` with depth-first search */
-static void
-bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
- unsigned *component, unsigned *size, unsigned node)
-{
- unsigned neighbour;
-
- BITSET_SET(visited, node);
- component[(*size)++] = node;
-
- BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
- if (!BITSET_TEST(visited, neighbour)) {
- bi_find_component(adjacency, visited, component, size,
- neighbour);
- }
- }
-}
-
-static bool
-bi_is_uniform(bi_index idx)
-{
- return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
-}
-
-/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
-static unsigned
-bi_uniform_word(bi_index idx)
-{
- assert(bi_is_uniform(idx));
- assert(idx.offset <= 1);
-
- return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
-}
-
-/*
- * Create an undirected graph where nodes are 32-bit uniform indices and edges
- * represent that two nodes are used in the same instruction.
- *
- * The graph is constructed as an adjacency matrix stored in adjacency.
- */
-static void
-bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
-{
- bi_foreach_instr_global(ctx, I) {
- unsigned nodes[BI_MAX_SRCS] = {};
- unsigned node_count = 0;
-
- /* Set nodes[] to 32-bit uniforms accessed */
- bi_foreach_src(I, s) {
- if (bi_is_uniform(I->src[s])) {
- unsigned word = bi_uniform_word(I->src[s]);
-
- if (word >= ctx->info.push_offset)
- nodes[node_count++] = word;
- }
- }
-
- /* Create clique connecting nodes[] */
- for (unsigned i = 0; i < node_count; ++i) {
- for (unsigned j = 0; j < node_count; ++j) {
- if (i == j)
- continue;
-
- unsigned x = nodes[i], y = nodes[j];
- assert(MAX2(x, y) < ctx->info.push->count);
-
- /* Add undirected edge between the nodes */
- BITSET_SET(adjacency[x].row, y);
- BITSET_SET(adjacency[y].row, x);
- }
- }
- }
-}
-
-/*
- * Optimization pass to reorder uniforms. The goal is to reduce the number of
- * moves we emit when lowering FAU. The pass groups uniforms used by the same
- * instruction.
- *
- * The pass works by creating a graph of pushed uniforms, where edges denote the
- * "both 32-bit uniforms required by the same instruction" relationship. We
- * perform depth-first search on this graph to find the connected components,
- * where each connected component is a cluster of uniforms that are used
- * together. We then select pairs of uniforms from each connected component.
- * The remaining unpaired uniforms (from components of odd sizes) are paired
- * together arbitrarily.
- *
- * After a new ordering is selected, pushed uniforms in the program and the
- * panfrost_ubo_push data structure must be remapped to use the new ordering.
- */
-void
-bi_opt_reorder_push(bi_context *ctx)
-{
- adjacency_row adjacency[PAN_MAX_PUSH] = { 0 };
- BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 };
-
- unsigned ordering[PAN_MAX_PUSH] = { 0 };
- unsigned unpaired[PAN_MAX_PUSH] = { 0 };
- unsigned pushed = 0, unpaired_count = 0;
-
- struct panfrost_ubo_push *push = ctx->info.push;
- unsigned push_offset = ctx->info.push_offset;
-
- bi_create_fau_interference_graph(ctx, adjacency);
-
- for (unsigned i = push_offset; i < push->count; ++i) {
- if (BITSET_TEST(visited, i)) continue;
-
- unsigned component[PAN_MAX_PUSH] = { 0 };
- unsigned size = 0;
- bi_find_component(adjacency, visited, component, &size, i);
-
- /* If there is an odd number of uses, at least one use must be
- * unpaired. Arbitrarily take the last one.
- */
- if (size % 2)
- unpaired[unpaired_count++] = component[--size];
-
- /* The rest of uses are paired */
- assert((size % 2) == 0);
-
- /* Push the paired uses */
- memcpy(ordering + pushed, component, sizeof(unsigned) * size);
- pushed += size;
- }
-
- /* Push unpaired nodes at the end */
- memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
- pushed += unpaired_count;
-
- /* Ordering is a permutation. Invert it for O(1) lookup. */
- unsigned old_to_new[PAN_MAX_PUSH] = { 0 };
-
- for (unsigned i = 0; i < push_offset; ++i) {
- old_to_new[i] = i;
- }
-
- for (unsigned i = 0; i < pushed; ++i) {
- assert(ordering[i] >= push_offset);
- old_to_new[ordering[i]] = push_offset + i;
- }
-
- /* Use new ordering throughout the program */
- bi_foreach_instr_global(ctx, I) {
- bi_foreach_src(I, s) {
- if (bi_is_uniform(I->src[s])) {
- unsigned node = bi_uniform_word(I->src[s]);
- unsigned new_node = old_to_new[node];
- I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
- I->src[s].offset = new_node & 1;
- }
- }
- }
-
- /* Use new ordering for push */
- struct panfrost_ubo_push old = *push;
- for (unsigned i = 0; i < pushed; ++i)
- push->words[push_offset + i] = old.words[ordering[i]];
-
- push->count = push_offset + pushed;
-}
diff --git a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
index 601750e2a..28669ebfa 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_packer.c.py
@@ -24,14 +24,9 @@ import sys
from bifrost_isa import *
from mako.template import Template
-# Consider pseudo instructions when getting the modifier list
-instructions_with_pseudo = parse_instructions(sys.argv[1], include_pseudo = True)
-ir_instructions_with_pseudo = partition_mnemonics(instructions_with_pseudo)
-modifier_lists = order_modifiers(ir_instructions_with_pseudo)
-
-# ...but strip for packing
instructions = parse_instructions(sys.argv[1])
ir_instructions = partition_mnemonics(instructions)
+modifier_lists = order_modifiers(ir_instructions)
# Packs sources into an argument. Offset argument to work around a quirk of our
# compiler IR when dealing with staging registers (TODO: reorder in the IR to
@@ -112,9 +107,6 @@ def pack_modifier(mod, width, default, opts, body, pack_exprs):
# Construct a list
lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS]
ir_value = "src[{}].swizzle".format(arg)
- elif raw == "lane_dest":
- lists = [pick_from_bucket(opts, bucket) for bucket in SWIZZLE_BUCKETS]
- ir_value = "dest->swizzle"
elif raw in ["abs", "sign"]:
ir_value = "src[{}].abs".format(arg)
elif raw in ["neg", "not"]:
@@ -315,7 +307,7 @@ bi_pack_${'fma' if unit == '*' else 'add'}(bi_instr *I,
enum bifrost_packed_src src3)
{
if (!I)
- return bi_pack_${opname_to_c(unit + 'NOP')}(I, src0, src1, src2, src3);
+ return bi_pack_${opname_to_c(unit + 'NOP.i32')}(I, src0, src1, src2, src3);
% if unit == '*':
assert((1 << src0) & 0xfb);
diff --git a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
index 04a9c0095..5692633b4 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
+++ b/lib/mesa/src/panfrost/bifrost/bi_printer.c.py
@@ -55,7 +55,6 @@ bir_fau_name(unsigned fau_idx)
"blend_descriptor_2", "blend_descriptor_3",
"blend_descriptor_4", "blend_descriptor_5",
"blend_descriptor_6", "blend_descriptor_7",
- "tls_ptr", "wls_ptr", "program_counter",
};
assert(fau_idx < ARRAY_SIZE(names));
@@ -76,9 +75,6 @@ bir_passthrough_name(unsigned idx)
static void
bi_print_index(FILE *fp, bi_index index)
{
- if (index.discard)
- fputs("^", fp);
-
if (bi_is_null(index))
fprintf(fp, "_");
else if (index.type == BI_INDEX_CONSTANT)
@@ -90,6 +86,8 @@ bi_print_index(FILE *fp, bi_index index)
else if (index.type == BI_INDEX_PASS)
fprintf(fp, "%s", bir_passthrough_name(index.value));
else if (index.type == BI_INDEX_REGISTER)
+ fprintf(fp, "br%u", index.value);
+ else if (index.type == BI_INDEX_NORMAL && index.reg)
fprintf(fp, "r%u", index.value);
else if (index.type == BI_INDEX_NORMAL)
fprintf(fp, "%u", index.value);
@@ -111,7 +109,7 @@ bi_print_index(FILE *fp, bi_index index)
% for mod in sorted(modifiers):
% if len(modifiers[mod]) > 2: # otherwise just boolean
-UNUSED static inline const char *
+static inline const char *
bi_${mod}_as_str(enum bi_${mod} ${mod})
{
switch (${mod}) {
@@ -131,13 +129,11 @@ bi_${mod}_as_str(enum bi_${mod} ${mod})
<%def name="print_modifiers(mods, table)">
% for mod in mods:
- % if mod not in ["lane_dest"]:
% if len(table[mod]) > 2:
fputs(bi_${mod}_as_str(I->${mod}), fp);
% else:
if (I->${mod}) fputs(".${mod}", fp);
% endif
- % endif
% endfor
</%def>
@@ -156,37 +152,19 @@ bi_${mod}_as_str(enum bi_${mod} ${mod})
</%def>
void
-bi_print_instr(const bi_instr *I, FILE *fp)
+bi_print_instr(bi_instr *I, FILE *fp)
{
- fputs(" ", fp);
-
bi_foreach_dest(I, d) {
+ if (bi_is_null(I->dest[d])) break;
if (d > 0) fprintf(fp, ", ");
bi_print_index(fp, I->dest[d]);
}
- if (I->nr_dests > 0)
- fputs(" = ", fp);
-
- fprintf(fp, "%s", bi_opcode_props[I->op].name);
+ fprintf(fp, " = %s", bi_opcode_props[I->op].name);
if (I->table)
- fprintf(fp, ".table%u", I->table);
-
- if (I->flow)
- fprintf(fp, ".flow%u", I->flow);
-
- if (I->op == BI_OPCODE_COLLECT_I32 || I->op == BI_OPCODE_PHI) {
- for (unsigned i = 0; i < I->nr_srcs; ++i) {
- if (i > 0)
- fputs(", ", fp);
- else
- fputs(" ", fp);
-
- bi_print_index(fp, I->src[i]);
- }
- }
+ fprintf(fp, ".%s", bi_table_as_str(I->table));
switch (I->op) {
% for opcode in ops:
@@ -214,7 +192,7 @@ bi_print_instr(const bi_instr *I, FILE *fp)
}
if (I->branch_target)
- fprintf(fp, " -> block%u", I->branch_target->index);
+ fprintf(fp, " -> block%u", I->branch_target->base.name);
fputs("\\n", fp);
diff --git a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
index 04aa07b0c..05b731a53 100644
--- a/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
+++ b/lib/mesa/src/panfrost/bifrost/bi_scoreboard.c
@@ -38,7 +38,7 @@
* 3. The shader must wait on slot #6 before running BLEND, ATEST
* 4. The shader must wait on slot #7 before running BLEND, ST_TILE
* 5. ATEST, ZS_EMIT must be issued with slot #0
- * 6. BARRIER must be issued with slot #7 and wait on every active slot.
+ * 6. BARRIER must be issued with slot #7
* 7. Only slots #0 through #5 may be used for clauses not otherwise specified.
* 8. If a clause writes to a read staging register of an unresolved
* dependency, it must set a staging barrier.
@@ -54,256 +54,57 @@
*/
#define BI_NUM_GENERAL_SLOTS 6
-#define BI_NUM_SLOTS 8
-#define BI_NUM_REGISTERS 64
-#define BI_SLOT_SERIAL 0 /* arbitrary */
-/*
- * Due to the crude scoreboarding we do, we need to serialize varying loads and
- * memory access. Identify these instructions here.
- */
-static bool
-bi_should_serialize(bi_instr *I)
-{
- /* For debug, serialize everything to disable scoreboard opts */
- if (bifrost_debug & BIFROST_DBG_NOSB)
- return true;
+/* A model for the state of the scoreboard */
- /* Although nominally on the attribute unit, image loads have the same
- * coherency requirements as general memory loads. Serialize them for
- * now until we can do something more clever.
- */
- if (I->op == BI_OPCODE_LD_ATTR_TEX)
- return true;
-
- switch (bi_opcode_props[I->op].message) {
- case BIFROST_MESSAGE_VARYING:
- case BIFROST_MESSAGE_LOAD:
- case BIFROST_MESSAGE_STORE:
- case BIFROST_MESSAGE_ATOMIC:
- return true;
- default:
- return false;
- }
-}
+struct bi_scoreboard_state {
+ /* TODO: what do we track here for a heuristic? */
+};
/* Given a scoreboard model, choose a slot for a clause wrapping a given
* message passing instruction. No side effects. */
static unsigned
-bi_choose_scoreboard_slot(bi_instr *message)
+bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message)
{
+ /* A clause that does not produce a message must use slot #0 */
+ if (!message)
+ return 0;
+
+ switch (message->op) {
/* ATEST, ZS_EMIT must be issued with slot #0 */
- if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
+ case BI_OPCODE_ATEST:
+ case BI_OPCODE_ZS_EMIT:
return 0;
/* BARRIER must be issued with slot #7 */
- if (message->op == BI_OPCODE_BARRIER)
+ case BI_OPCODE_BARRIER:
return 7;
- /* For now, make serialization is easy */
- if (bi_should_serialize(message))
- return BI_SLOT_SERIAL;
-
- return 0;
-}
-
-static uint64_t
-bi_read_mask(bi_instr *I, bool staging_only)
-{
- uint64_t mask = 0;
-
- if (staging_only && !bi_opcode_props[I->op].sr_read)
- return mask;
-
- bi_foreach_src(I, s) {
- if (I->src[s].type == BI_INDEX_REGISTER) {
- unsigned reg = I->src[s].value;
- unsigned count = bi_count_read_registers(I, s);
-
- mask |= (BITFIELD64_MASK(count) << reg);
- }
-
- if (staging_only)
- break;
- }
-
- return mask;
-}
-
-static uint64_t
-bi_write_mask(bi_instr *I)
-{
- uint64_t mask = 0;
-
- bi_foreach_dest(I, d) {
- if (bi_is_null(I->dest[d])) continue;
-
- assert(I->dest[d].type == BI_INDEX_REGISTER);
-
- unsigned reg = I->dest[d].value;
- unsigned count = bi_count_write_registers(I, d);
-
- mask |= (BITFIELD64_MASK(count) << reg);
- }
-
- /* Instructions like AXCHG.i32 unconditionally both read and write
- * staging registers. Even if we discard the result, the write still
- * happens logically and needs to be included in our calculations.
- * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
- * certain circumstances; this does not require consideration.
- */
- if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs &&
- bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) {
-
- unsigned reg = I->src[0].value;
- unsigned count = bi_count_write_registers(I, 0);
-
- mask |= (BITFIELD64_MASK(count) << reg);
- }
-
- return mask;
-}
-
-/* Update the scoreboard model to assign an instruction to a given slot */
-
-static void
-bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
-{
- bi_instr *I = clause->message;
- unsigned slot = clause->scoreboard_id;
-
- if (!I)
- return;
-
- st->read[slot] |= bi_read_mask(I, true);
-
- if (bi_opcode_props[I->op].sr_write)
- st->write[slot] |= bi_write_mask(I);
-}
-
-/* Adds a dependency on each slot writing any specified register */
-
-static void
-bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
-{
- for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
- if (!(st->write[slot] & regmask))
- continue;
-
- st->write[slot] = 0;
- st->read[slot] = 0;
-
- clause->dependencies |= BITFIELD_BIT(slot);
- }
-}
-
-static void
-bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
-{
- for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
- if (!(st->read[slot] & regmask))
- continue;
-
- st->read[slot] = 0;
- clause->staging_barrier = true;
- }
-}
-
-/* Sets the dependencies for a given clause, updating the model */
-
-static void
-bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
-{
- bi_foreach_instr_in_clause(block, clause, I) {
- uint64_t read = bi_read_mask(I, false);
- uint64_t written = bi_write_mask(I);
-
- /* Read-after-write; write-after-write */
- bi_depend_on_writers(clause, st, read | written);
-
- /* Write-after-read */
- bi_set_staging_barrier(clause, st, written);
- }
-
- /* LD_VAR instructions must be serialized per-quad. Just always depend
- * on any LD_VAR instructions. This isn't optimal, but doing better
- * requires divergence-aware data flow analysis.
- *
- * Similarly, memory loads/stores need to be synchronized. For now,
- * force them to be serialized. This is not optimal.
- */
- if (clause->message && bi_should_serialize(clause->message))
- clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
-
- /* Barriers must wait on all slots to flush existing work. It might be
- * possible to skip this with more information about the barrier. For
- * now, be conservative.
- */
- if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
- clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
-}
-
-static bool
-scoreboard_block_update(bi_block *blk)
-{
- bool progress = false;
-
- /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
- bi_foreach_predecessor(blk, pred) {
- for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
- blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
- blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
- }
- }
-
- struct bi_scoreboard_state state = blk->scoreboard_in;
-
- /* Assign locally */
-
- bi_foreach_clause_in_block(blk, clause) {
- bi_set_dependencies(blk, clause, &state);
- bi_push_clause(&state, clause);
+ default:
+ break;
}
- /* To figure out progress, diff scoreboard_out */
-
- for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
- progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
-
- blk->scoreboard_out = state;
-
- return progress;
+ /* TODO: Use a heuristic */
+ return 0;
}
void
bi_assign_scoreboard(bi_context *ctx)
{
- u_worklist worklist;
- bi_worklist_init(ctx, &worklist);
-
- /* First, assign slots. */
- bi_foreach_block(ctx, block) {
- bi_foreach_clause_in_block(block, clause) {
- if (clause->message) {
- unsigned slot = bi_choose_scoreboard_slot(clause->message);
- clause->scoreboard_id = slot;
- }
- }
+ struct bi_scoreboard_state st = {};
- bi_worklist_push_tail(&worklist, block);
- }
+ /* Assign slots */
+ bi_foreach_block(ctx, _block) {
+ bi_block *block = (bi_block *) _block;
- /* Next, perform forward data flow analysis to calculate dependencies */
- while (!u_worklist_is_empty(&worklist)) {
- /* Pop from the front for forward analysis */
- bi_block *blk = bi_worklist_pop_head(&worklist);
+ bi_foreach_clause_in_block(block, clause) {
+ unsigned slot = bi_choose_scoreboard_slot(&st, clause->message);
+ clause->scoreboard_id = slot;
- if (scoreboard_block_update(blk)) {
- bi_foreach_successor(blk, succ)
- bi_worklist_push_tail(&worklist, succ);
+ bi_clause *next = bi_next_clause(ctx, _block, clause);
+ if (next)
+ next->dependencies |= (1 << slot);
}
}
-
- u_worklist_fini(&worklist);
}
diff --git a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
index 7152509bc..ae97795f3 100644
--- a/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
+++ b/lib/mesa/src/panfrost/bifrost/bifrost_isa.py
@@ -132,8 +132,6 @@ def parse_instruction(ins, include_pseudo):
'staging': ins.attrib.get('staging', '').split('=')[0],
'staging_count': ins.attrib.get('staging', '=0').split('=')[1],
'dests': int(ins.attrib.get('dests', '1')),
- 'variable_dests': ins.attrib.get('variable_dests', False),
- 'variable_srcs': ins.attrib.get('variable_srcs', False),
'unused': ins.attrib.get('unused', False),
'pseudo': ins.attrib.get('pseudo', False),
'message': ins.attrib.get('message', 'none'),
@@ -145,9 +143,6 @@ def parse_instruction(ins, include_pseudo):
common['exact'] = parse_exact(ins)
for src in ins.findall('src'):
- if src.attrib.get('pseudo', False) and not include_pseudo:
- continue
-
mask = int(src.attrib['mask'], 0) if ('mask' in src.attrib) else 0xFF
common['srcs'].append([int(src.attrib['start'], 0), mask])
@@ -245,28 +240,18 @@ def simplify_to_ir(ins):
'staging': ins['staging'],
'srcs': len(ins['srcs']),
'dests': ins['dests'],
- 'variable_dests': ins['variable_dests'],
- 'variable_srcs': ins['variable_srcs'],
'modifiers': [[m[0][0], m[2]] for m in ins['modifiers']],
'immediates': [m[0] for m in ins['immediates']]
}
-# Converstions to integers default to rounding-to-zero
-# All other opcodes default to rounding to nearest even
-def default_round_to_zero(name):
- # 8-bit int to float is exact
- subs = ['_TO_U', '_TO_S', '_TO_V2U', '_TO_V2S', '_TO_V4U', '_TO_V4S']
- return any([x in name for x in subs])
-def combine_ir_variants(instructions, key):
- seen = [op for op in instructions.keys() if op[1:] == key]
- variant_objs = [[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in seen]
- variants = sum(variant_objs, [])
+def combine_ir_variants(instructions, v):
+ variants = sum([[simplify_to_ir(Q[1]) for Q in instructions[x]] for x in v], [])
# Accumulate modifiers across variants
modifiers = {}
- for s in variants[0:]:
+ for s in variants:
# Check consistency
assert(s['srcs'] == variants[0]['srcs'])
assert(s['dests'] == variants[0]['dests'])
@@ -282,27 +267,19 @@ def combine_ir_variants(instructions, key):
# Great, we've checked srcs/immediates are consistent and we've summed over
# modifiers
return {
- 'key': key,
'srcs': variants[0]['srcs'],
'dests': variants[0]['dests'],
- 'variable_dests': variants[0]['variable_dests'],
- 'variable_srcs': variants[0]['variable_srcs'],
'staging': variants[0]['staging'],
'immediates': sorted(variants[0]['immediates']),
- 'modifiers': modifiers,
- 'v': len(variants),
- 'ir': variants,
- 'rtz': default_round_to_zero(key)
+ 'modifiers': { k: modifiers[k] for k in modifiers }
}
# Partition instructions to mnemonics, considering units and variants
# equivalent.
def partition_mnemonics(instructions):
- key_func = lambda x: x[1:]
- sorted_instrs = sorted(instructions.keys(), key = key_func)
- partitions = itertools.groupby(sorted_instrs, key_func)
- return { k: combine_ir_variants(instructions, k) for k, v in partitions }
+ partitions = itertools.groupby(instructions, lambda x: x[1:])
+ return { k: combine_ir_variants(instructions, v) for (k, v) in partitions }
# Generate modifier lists, by accumulating all the possible modifiers, and
# deduplicating thus assigning canonical enum values. We don't try _too_ hard
@@ -351,17 +328,3 @@ def order_modifiers(ir_instructions):
def src_count(op):
staging = 1 if (op["staging"] in ["r", "rw"]) else 0
return op["srcs"] + staging
-
-# Parses out the size part of an opocde name
-def typesize(opcode):
- if opcode[-3:] == '128':
- return 128
- if opcode[-2:] == '48':
- return 48
- elif opcode[-1] == '8':
- return 8
- else:
- try:
- return int(opcode[-2:])
- except:
- return 32
diff --git a/lib/mesa/src/panfrost/bifrost/gen_disasm.py b/lib/mesa/src/panfrost/bifrost/gen_disasm.py
index 505c61cc0..11acf5ae9 100644
--- a/lib/mesa/src/panfrost/bifrost/gen_disasm.py
+++ b/lib/mesa/src/panfrost/bifrost/gen_disasm.py
@@ -238,7 +238,7 @@ def build_lut(mnemonic, desc, test):
key_set = find_context_keys(desc, test)
ordered = 'ordering' in key_set
key_set.discard('ordering')
- keys = sorted(list(key_set))
+ keys = list(key_set)
# Evaluate the deriveds for every possible state, forming a (state -> deriveds) map
testf = compile_derived(test, keys)
@@ -326,7 +326,7 @@ def disasm_op(name, op):
for i, (pos, mask) in enumerate(srcs):
body += ' fputs(", ", fp);\n'
- body += ' dump_src(fp, _BITS(bits, {}, 3), *srcs, branch_offset, consts, {});\n'.format(pos, "true" if is_fma else "false")
+ body += ' dump_src(fp, _BITS(bits, {}, 3), *srcs, consts, {});\n'.format(pos, "true" if is_fma else "false")
# Error check if needed
if (mask != 0xFF):
diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
index 3fa1f5485..2886d3d91 100644
--- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
+++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.c
@@ -30,6 +30,7 @@
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
+#include "panfrost-quirks.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"
@@ -54,7 +55,6 @@ struct draw_data {
nir_ssa_def *index_buf;
nir_ssa_def *restart_index;
nir_ssa_def *vertex_count;
- nir_ssa_def *start_instance;
nir_ssa_def *instance_count;
nir_ssa_def *vertex_start;
nir_ssa_def *index_bias;
@@ -72,9 +72,6 @@ struct jobs_data {
nir_ssa_def *vertex_job;
nir_ssa_def *tiler_job;
nir_ssa_def *base_vertex_offset;
- nir_ssa_def *first_vertex_sysval;
- nir_ssa_def *base_vertex_sysval;
- nir_ssa_def *base_instance_sysval;
nir_ssa_def *offset_start;
nir_ssa_def *invocation;
};
@@ -111,13 +108,6 @@ struct indirect_draw_info {
uint32_t count;
uint32_t instance_count;
uint32_t start;
- uint32_t start_instance;
-};
-
-struct indirect_indexed_draw_info {
- uint32_t count;
- uint32_t instance_count;
- uint32_t start;
int32_t index_bias;
uint32_t start_instance;
};
@@ -142,7 +132,7 @@ struct indirect_draw_context {
mali_ptr varying_mem;
};
-/* Indirect draw shader inputs. Those are stored in FAU. */
+/* Indirect draw shader inputs. Those are stored in a UBO. */
struct indirect_draw_inputs {
/* indirect_draw_context pointer */
@@ -160,11 +150,6 @@ struct indirect_draw_inputs {
/* index buffer */
mali_ptr index_buf;
- /* {base,first}_{vertex,instance} sysvals */
- mali_ptr first_vertex_sysval;
- mali_ptr base_vertex_sysval;
- mali_ptr base_instance_sysval;
-
/* Pointers to various cmdstream structs that need to be patched */
mali_ptr vertex_job;
mali_ptr tiler_job;
@@ -175,13 +160,26 @@ struct indirect_draw_inputs {
uint32_t draw_buf_stride;
uint32_t restart_index;
uint32_t attrib_count;
-} PACKED;
+};
+
+static nir_ssa_def *
+get_input_data(nir_builder *b, unsigned offset, unsigned size)
+{
+ assert(!(offset & 0x3));
+ assert(size && !(size & 0x3));
+
+ return nir_load_ubo(b, 1, size,
+ nir_imm_int(b, 0),
+ nir_imm_int(b, offset),
+ .align_mul = 4,
+ .align_offset = 0,
+ .range_base = 0,
+ .range = ~0);
+}
#define get_input_field(b, name) \
- nir_load_push_constant(b, \
- 1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
- nir_imm_int(b, 0), \
- .base = offsetof(struct indirect_draw_inputs, name))
+ get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
+ sizeof(((struct indirect_draw_inputs *)0)->name) * 8)
static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
@@ -282,12 +280,6 @@ update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
offsetof(struct indirect_draw_info, field)), \
1, sizeof(((struct indirect_draw_info *)0)->field) * 8)
-#define get_indexed_draw_field(b, draw_ptr, field) \
- load_global(b, \
- get_address_imm(b, draw_ptr, \
- offsetof(struct indirect_indexed_draw_info, field)), \
- 1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)
-
static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
@@ -309,9 +301,6 @@ extract_inputs(struct indirect_draw_shader_builder *builder)
if (builder->index_min_max_search)
return;
- builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
- builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
- builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
builder->jobs.vertex_job = get_input_field(b, vertex_job);
builder->jobs.tiler_job = get_input_field(b, tiler_job);
builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
@@ -342,49 +331,29 @@ init_shader_builder(struct indirect_draw_shader_builder *builder,
if (index_min_max_search) {
builder->b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
- GENX(pan_shader_get_compiler_options)(),
+ pan_shader_get_compiler_options(dev),
"indirect_draw_min_max_index(index_size=%d)",
builder->index_size);
} else {
builder->b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
- GENX(pan_shader_get_compiler_options)(),
- "indirect_draw(index_size=%d%s%s%s%s)",
+ pan_shader_get_compiler_options(dev),
+ "indirect_draw(index_size=%d%s%s%s)",
builder->index_size,
flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
",psiz" : "",
flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
",primitive_restart" : "",
flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
- ",update_primitive_size" : "",
- flags & PAN_INDIRECT_DRAW_IDVS ?
- ",idvs" : "");
+ ",update_primitive_size" : "");
}
- extract_inputs(builder);
-}
-
-static void
-update_dcd(struct indirect_draw_shader_builder *builder,
- nir_ssa_def *job_ptr,
- unsigned draw_offset)
-{
nir_builder *b = &builder->b;
- nir_ssa_def *draw_w01 =
- load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
- nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);
+ nir_variable_create(b->shader, nir_var_mem_ubo,
+ glsl_uint_type(), "inputs");
+ b->shader->info.num_ubos++;
- /* Update DRAW.{instance_size,offset_start} */
- nir_ssa_def *instance_size =
- nir_bcsel(b,
- nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
- nir_imm_int(b, 0), builder->instance_size.packed);
- draw_w01 = nir_vec2(b,
- nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
- nir_ishl(b, instance_size, nir_imm_int(b, 16))),
- builder->jobs.offset_start);
- store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
- draw_w01, 2);
+ extract_inputs(builder);
}
static void
@@ -402,9 +371,17 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type
unsigned draw_offset =
type == MALI_JOB_TYPE_VERTEX ?
pan_section_offset(COMPUTE_JOB, DRAW) :
- pan_section_offset(TILER_JOB, DRAW);
- unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
- unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
+ pan_is_bifrost(builder->dev) ?
+ pan_section_offset(BIFROST_TILER_JOB, DRAW) :
+ pan_section_offset(MIDGARD_TILER_JOB, DRAW);
+ unsigned prim_offset =
+ pan_is_bifrost(builder->dev) ?
+ pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE) :
+ pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE);
+ unsigned psiz_offset =
+ pan_is_bifrost(builder->dev) ?
+ pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE_SIZE) :
+ pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE_SIZE);
unsigned index_size = builder->index_size;
if (type == MALI_JOB_TYPE_TILER) {
@@ -440,14 +417,21 @@ update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type
builder->varyings.pos_ptr, 2);
}
- update_dcd(builder, job_ptr, draw_offset);
-
- if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
- assert(type == MALI_JOB_TYPE_TILER);
+ nir_ssa_def *draw_w01 =
+ load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
+ nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);
- update_dcd(builder, job_ptr,
- pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
- }
+ /* Update DRAW.{instance_size,offset_start} */
+ nir_ssa_def *instance_size =
+ nir_bcsel(b,
+ nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2)),
+ nir_imm_int(b, 0), builder->instance_size.packed);
+ draw_w01 = nir_vec2(b,
+ nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
+ nir_ishl(b, instance_size, nir_imm_int(b, 16))),
+ builder->jobs.offset_start);
+ store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
+ draw_w01, 2);
}
static void
@@ -463,7 +447,7 @@ split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
half_div64);
nir_ssa_def *fi = nir_idiv(b, f0, div64);
nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
- nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
+ nir_ssa_def *e = nir_bcsel(b, nir_ilt(b, half_div64, ff),
nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
*d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
*r_e = nir_ior(b, r, e);
@@ -504,68 +488,33 @@ update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
}
static void
-zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
- nir_ssa_def *attrib_buf_ptr)
-{
- /* Stride is an unadorned 32-bit uint at word 2 */
- nir_builder *b = &builder->b;
- store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
- nir_imm_int(b, 0), 1);
-}
-
-static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
- nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
- nir_ssa_def *instance_div)
+ nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr)
{
nir_builder *b = &builder->b;
nir_ssa_def *zero = nir_imm_int(b, 0);
nir_ssa_def *two = nir_imm_int(b, 2);
nir_ssa_def *sub_cur_offset =
nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
- nir_uge(b, builder->draw.instance_count, two));
-
- nir_ssa_def *add_base_inst_offset =
- nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
- nir_ine(b, instance_div, zero));
-
- IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
- nir_ssa_def *offset =
- load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
- nir_ssa_def *stride =
- load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);
+ nir_ige(b, builder->draw.instance_count, two));
+ IF (sub_cur_offset) {
/* Per-instance data needs to be offset in response to a
* delayed start in an indexed draw.
*/
+ nir_ssa_def *stride =
+ load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);
+ nir_ssa_def *offset =
+ load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
- IF (add_base_inst_offset) {
- offset = nir_iadd(b, offset,
- nir_idiv(b,
- nir_imul(b, stride,
- builder->draw.start_instance),
- instance_div));
- } ENDIF
-
- IF (sub_cur_offset) {
- offset = nir_isub(b, offset,
- nir_imul(b, stride,
- builder->jobs.offset_start));
- } ENDIF
-
+ offset = nir_isub(b, offset,
+ nir_imul(b, stride,
+ builder->jobs.offset_start));
store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
offset, 1);
} ENDIF
}
-/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */
-
-static nir_ssa_def *
-nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
-{
- return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
-}
-
/* Based on panfrost_emit_vertex_data() */
static void
@@ -576,78 +525,78 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder)
nir_local_variable_create(b->impl, glsl_uint_type(),
"attrib_idx");
nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);
-
-#if PAN_ARCH <= 5
nir_ssa_def *single_instance =
- nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
-#endif
+ nir_ilt(b, builder->draw.instance_count, nir_imm_int(b, 2));
LOOP {
nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
- IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
+ IF (nir_ige(b, attrib_idx, builder->attribs.attrib_count))
BREAK;
ENDIF
nir_ssa_def *attrib_buf_ptr =
get_address(b, builder->attribs.attrib_bufs,
nir_imul_imm(b, attrib_idx,
- 2 * pan_size(ATTRIBUTE_BUFFER)));
+ 2 * MALI_ATTRIBUTE_BUFFER_LENGTH));
nir_ssa_def *attrib_ptr =
get_address(b, builder->attribs.attribs,
nir_imul_imm(b, attrib_idx,
- pan_size(ATTRIBUTE)));
+ MALI_ATTRIBUTE_LENGTH));
nir_ssa_def *r_e, *d;
-#if PAN_ARCH <= 5
- IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
- nir_ssa_def *r_p =
- nir_bcsel(b, single_instance,
- nir_imm_int(b, 0x9f),
- builder->instance_size.packed);
+ if (!pan_is_bifrost(builder->dev)) {
+ IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
+ nir_ssa_def *r_p =
+ nir_bcsel(b, single_instance,
+ nir_imm_int(b, 0x9f),
+ builder->instance_size.packed);
- store_global(b,
- get_address_imm(b, attrib_buf_ptr, WORD(4)),
- nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);
+ store_global(b,
+ get_address_imm(b, attrib_buf_ptr, WORD(4)),
+ nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);
- nir_store_var(b, attrib_idx_var,
- nir_iadd_imm(b, attrib_idx, 1), 1);
- CONTINUE;
- } ENDIF
+ nir_store_var(b, attrib_idx_var,
+ nir_iadd_imm(b, attrib_idx, 1), 1);
+ CONTINUE;
+ } ENDIF
- IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
- split_div(b, builder->instance_size.padded,
- &r_e, &d);
- nir_ssa_def *default_div =
- nir_ior(b, single_instance,
- nir_ult(b,
- builder->instance_size.padded,
- nir_imm_int(b, 2)));
- r_e = nir_bcsel(b, default_div,
- nir_imm_int(b, 0x3f), r_e);
- d = nir_bcsel(b, default_div,
- nir_imm_int(b, (1u << 31) - 1), d);
- store_global(b,
- get_address_imm(b, attrib_buf_ptr, WORD(1)),
- nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
- 2);
- nir_store_var(b, attrib_idx_var,
- nir_iadd_imm(b, attrib_idx, 1), 1);
- CONTINUE;
- } ENDIF
-#endif
+ IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
+ split_div(b, builder->instance_size.padded,
+ &r_e, &d);
+ nir_ssa_def *default_div =
+ nir_ior(b, single_instance,
+ nir_ilt(b,
+ builder->instance_size.padded,
+ nir_imm_int(b, 2)));
+ r_e = nir_bcsel(b, default_div,
+ nir_imm_int(b, 0x3f), r_e);
+ d = nir_bcsel(b, default_div,
+ nir_imm_int(b, (1u << 31) - 1), d);
+ store_global(b,
+ get_address_imm(b, attrib_buf_ptr, WORD(1)),
+ nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
+ 2);
+ nir_store_var(b, attrib_idx_var,
+ nir_iadd_imm(b, attrib_idx, 1), 1);
+ CONTINUE;
+ } ENDIF
+ }
- nir_ssa_def *instance_div =
+ nir_ssa_def *div =
load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);
- nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);
+ div = nir_imul(b, div, builder->instance_size.padded);
nir_ssa_def *multi_instance =
- nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));
+ nir_ige(b, builder->draw.instance_count, nir_imm_int(b, 2));
IF (nir_ine(b, div, nir_imm_int(b, 0))) {
IF (multi_instance) {
- IF (nir_is_power_of_two_or_zero(b, div)) {
+ nir_ssa_def *div_pow2 =
+ nir_ilt(b, nir_bit_count(b, div), nir_imm_int(b, 2));
+
+ IF (div_pow2) {
nir_ssa_def *exp =
nir_imax(b, nir_ufind_msb(b, div),
nir_imm_int(b, 0));
@@ -662,16 +611,26 @@ update_vertex_attribs(struct indirect_draw_shader_builder *builder)
} ENDIF
} ELSE {
/* Single instance with a non-0 divisor: all
- * accesses should point to attribute 0 */
- zero_attrib_buf_stride(builder, attrib_buf_ptr);
+ * accesses should point to attribute 0, pick
+ * the biggest pot divisor.
+ */
+ update_vertex_attrib_buf(builder, attrib_buf_ptr,
+ MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
+ nir_imm_int(b, 31), NULL);
} ENDIF
- adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
- } ELSE IF (multi_instance) {
- update_vertex_attrib_buf(builder, attrib_buf_ptr,
- MALI_ATTRIBUTE_TYPE_1D_MODULUS,
- builder->instance_size.packed, NULL);
- } ENDIF ENDIF
+ adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr);
+ } ELSE {
+ IF (multi_instance) {
+ update_vertex_attrib_buf(builder, attrib_buf_ptr,
+ MALI_ATTRIBUTE_TYPE_1D_MODULUS,
+ builder->instance_size.packed, NULL);
+ } ELSE {
+ update_vertex_attrib_buf(builder, attrib_buf_ptr,
+ MALI_ATTRIBUTE_TYPE_1D,
+ nir_imm_int(b, 0), NULL);
+ } ENDIF
+ } ENDIF
nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
}
@@ -716,19 +675,19 @@ update_varyings(struct indirect_draw_shader_builder *builder)
nir_ssa_def *buf_ptr =
get_address_imm(b, builder->varyings.varying_bufs,
PAN_VARY_GENERAL *
- pan_size(ATTRIBUTE_BUFFER));
+ MALI_ATTRIBUTE_BUFFER_LENGTH);
update_varying_buf(builder, buf_ptr, vertex_count);
buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
PAN_VARY_POSITION *
- pan_size(ATTRIBUTE_BUFFER));
+ MALI_ATTRIBUTE_BUFFER_LENGTH);
builder->varyings.pos_ptr =
update_varying_buf(builder, buf_ptr, vertex_count);
if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
PAN_VARY_PSIZ *
- pan_size(ATTRIBUTE_BUFFER));
+ MALI_ATTRIBUTE_BUFFER_LENGTH);
builder->varyings.psiz_ptr =
update_varying_buf(builder, buf_ptr, vertex_count);
}
@@ -761,14 +720,6 @@ get_invocation(struct indirect_draw_shader_builder *builder)
nir_imm_int(b, 2 << 28)));
}
-static nir_ssa_def *
-nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
-{
- assert(pot != 0 && util_is_power_of_two_or_zero(pot));
-
- return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
-}
-
/* Based on panfrost_padded_vertex_count() */
static nir_ssa_def *
@@ -789,7 +740,7 @@ get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
exp = nir_iadd(b, exp, rshift);
base = nir_ushr(b, base, rshift);
- base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
+ base = nir_iadd(b, base, nir_bcsel(b, nir_ige(b, base, eleven), one, zero));
rshift = nir_imax(b, nir_find_lsb(b, base), zero);
exp = nir_iadd(b, exp, rshift);
base = nir_ushr(b, base, rshift);
@@ -803,28 +754,10 @@ static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
get_invocation(builder);
-
- if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
- update_job(builder, MALI_JOB_TYPE_VERTEX);
-
+ update_job(builder, MALI_JOB_TYPE_VERTEX);
update_job(builder, MALI_JOB_TYPE_TILER);
}
-
-static void
-set_null_job(struct indirect_draw_shader_builder *builder,
- nir_ssa_def *job_ptr)
-{
- nir_builder *b = &builder->b;
- nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
- nir_ssa_def *val = load_global(b, w4, 1, 32);
-
- /* Set job type to NULL (AKA NOOP) */
- val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
- nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
- store_global(b, w4, val, 1);
-}
-
static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
@@ -877,8 +810,8 @@ get_instance_size(struct indirect_draw_shader_builder *builder)
for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
nir_ssa_def *oob =
nir_ior(b,
- nir_ult(b, nir_imm_int(b, i), offset),
- nir_uge(b, nir_imm_int(b, i), end));
+ nir_ilt(b, nir_imm_int(b, i), offset),
+ nir_ige(b, nir_imm_int(b, i), end));
nir_ssa_def *data = nir_iand_imm(b, val, mask);
min = nir_umin(b, min,
@@ -903,7 +836,7 @@ get_instance_size(struct indirect_draw_shader_builder *builder)
nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
- nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
+ nir_ssa_def *oob = nir_ige(b, nir_imm_int(b, i), remaining);
nir_ssa_def *data = nir_iand_imm(b, val, mask);
min = nir_umin(b, min,
@@ -936,68 +869,25 @@ patch(struct indirect_draw_shader_builder *builder)
nir_ssa_def *draw_ptr = builder->draw.draw_buf;
+ builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
+ assert(builder->draw.vertex_count->num_components);
+ builder->draw.instance_count =
+ get_draw_field(b, draw_ptr, instance_count);
+ builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
if (index_size) {
- builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
- builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
- builder->draw.instance_count =
- get_indexed_draw_field(b, draw_ptr, instance_count);
- builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
- builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
- } else {
- builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
- builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
- builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
- builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
+ builder->draw.index_bias =
+ get_draw_field(b, draw_ptr, index_bias);
}
- assert(builder->draw.vertex_count->num_components);
-
- nir_ssa_def *num_vertices =
- nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);
+ get_instance_size(builder);
- IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
- /* If there's nothing to draw, turn the vertex/tiler jobs into
- * null jobs.
- */
- if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
- set_null_job(builder, builder->jobs.vertex_job);
+ builder->instance_size.padded =
+ get_padded_count(b, builder->instance_size.raw,
+ &builder->instance_size.packed);
- set_null_job(builder, builder->jobs.tiler_job);
- } ELSE {
- get_instance_size(builder);
-
- nir_ssa_def *count = builder->instance_size.raw;
-
- /* IDVS requires padding to a multiple of 4 */
- if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
- count = nir_align_pot(b, count, 4);
-
- builder->instance_size.padded =
- get_padded_count(b, count,
- &builder->instance_size.packed);
-
- update_varyings(builder);
- update_jobs(builder);
- update_vertex_attribs(builder);
-
- IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
- store_global(b, builder->jobs.first_vertex_sysval,
- builder->jobs.offset_start, 1);
- } ENDIF
-
- IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
- store_global(b, builder->jobs.base_vertex_sysval,
- index_size ?
- builder->draw.index_bias :
- nir_imm_int(b, 0),
- 1);
- } ENDIF
-
- IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
- store_global(b, builder->jobs.base_instance_sysval,
- builder->draw.start_instance, 1);
- } ENDIF
- } ENDIF
+ update_varyings(builder);
+ update_jobs(builder);
+ update_vertex_attribs(builder);
}
/* Search the min/max index in the range covered by the indirect draw call */
@@ -1046,7 +936,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder)
LOOP {
nir_ssa_def *offset = nir_load_var(b, offset_var);
- IF (nir_uge(b, offset, end))
+ IF (nir_ige(b, offset, end))
BREAK;
ENDIF
@@ -1076,7 +966,7 @@ get_index_min_max(struct indirect_draw_shader_builder *builder)
nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
}
- IF (nir_ult(b, start, end))
+ IF (nir_ilt(b, start, end))
update_min(builder, nir_load_var(b, min_var));
update_max(builder, nir_load_var(b, max_var));
ENDIF
@@ -1093,9 +983,7 @@ get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
return flags;
}
- return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
- PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
- PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
+ return PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX +
util_logbase2(index_size);
}
@@ -1115,46 +1003,42 @@ create_indirect_draw_shader(struct panfrost_device *dev,
else
patch(&builder);
- struct panfrost_compile_inputs inputs = {
- .gpu_id = dev->gpu_id,
- .fixed_sysval_ubo = -1,
- .no_ubo_to_push = true,
- };
+ struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
struct pan_shader_info shader_info;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
- GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);
+ pan_shader_compile(dev, b->shader, &inputs, &binary, &shader_info);
assert(!shader_info.tls_size);
assert(!shader_info.wls_size);
assert(!shader_info.sysvals.sysval_count);
- shader_info.push.count =
- DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);
-
unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
struct pan_indirect_draw_shader *draw_shader =
&dev->indirect_draw_shaders.shaders[shader_id];
void *state = dev->indirect_draw_shaders.states->ptr.cpu +
- (shader_id * pan_size(RENDERER_STATE));
+ (shader_id * MALI_RENDERER_STATE_LENGTH);
pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
if (!draw_shader->rsd) {
mali_ptr address =
- pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
- binary.data, binary.size,
- PAN_ARCH >= 6 ? 128 : 64);
+ panfrost_pool_upload_aligned(&dev->indirect_draw_shaders.bin_pool,
+ binary.data, binary.size,
+ pan_is_bifrost(dev) ? 128 : 64);
+ if (!pan_is_bifrost(dev))
+ address |= shader_info.midgard.first_tag;
util_dynarray_fini(&binary);
pan_pack(state, RENDERER_STATE, cfg) {
- pan_shader_prepare_rsd(&shader_info, address, &cfg);
+ pan_shader_prepare_rsd(dev, &shader_info, address, &cfg);
}
+ pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
draw_shader->push = shader_info.push;
draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
- (shader_id * pan_size(RENDERER_STATE));
+ (shader_id * MALI_RENDERER_STATE_LENGTH);
}
pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
@@ -1182,7 +1066,46 @@ static mali_ptr
get_tls(const struct panfrost_device *dev)
{
return dev->indirect_draw_shaders.states->ptr.gpu +
- (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
+ (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
+}
+
+static mali_ptr
+get_ubos(struct pan_pool *pool,
+ const struct indirect_draw_inputs *inputs)
+{
+ struct panfrost_ptr inputs_buf =
+ panfrost_pool_alloc_aligned(pool, sizeof(inputs), 16);
+
+ memcpy(inputs_buf.cpu, &inputs, sizeof(inputs));
+
+ struct panfrost_ptr ubos_buf =
+ panfrost_pool_alloc_desc(pool, UNIFORM_BUFFER);
+
+ pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
+ cfg.entries = DIV_ROUND_UP(sizeof(inputs), 16);
+ cfg.pointer = inputs_buf.gpu;
+ }
+
+ return ubos_buf.gpu;
+}
+
+static mali_ptr
+get_push_uniforms(struct pan_pool *pool,
+ const struct pan_indirect_draw_shader *shader,
+ const struct indirect_draw_inputs *inputs)
+{
+ if (!shader->push.count)
+ return 0;
+
+ struct panfrost_ptr push_consts_buf =
+ panfrost_pool_alloc_aligned(pool, shader->push.count * 4, 16);
+ uint32_t *out = push_consts_buf.cpu;
+ uint8_t *in = (uint8_t *)inputs;
+
+ for (unsigned i = 0; i < shader->push.count; ++i)
+ memcpy(out + i, in + shader->push.words[i].offset, 4);
+
+ return push_consts_buf.gpu;
}
static void
@@ -1193,15 +1116,15 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
goto out;
unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
- pan_size(RENDERER_STATE)) +
- pan_size(LOCAL_STORAGE);
+ MALI_RENDERER_STATE_LENGTH) +
+ MALI_LOCAL_STORAGE_LENGTH;
dev->indirect_draw_shaders.states =
- panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");
+ panfrost_bo_create(dev, state_bo_size, 0);
/* Prepare the thread storage descriptor now since it's invariant. */
void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
- (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
+ (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
pan_pack(tsd, LOCAL_STORAGE, ls) {
ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
};
@@ -1215,8 +1138,7 @@ panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
*/
dev->indirect_draw_shaders.varying_heap =
panfrost_bo_create(dev, 512 * 1024 * 1024,
- PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
- "Indirect draw varying heap");
+ PAN_BO_INVISIBLE | PAN_BO_GROWABLE);
out:
pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
@@ -1227,7 +1149,8 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool,
struct pan_scoreboard *scoreboard,
const struct pan_indirect_draw_info *draw_info,
const struct indirect_draw_inputs *inputs,
- struct indirect_draw_context *draw_ctx)
+ struct indirect_draw_context *draw_ctx,
+ mali_ptr ubos)
{
struct panfrost_device *dev = pool->dev;
unsigned index_size = draw_info->index_size;
@@ -1238,34 +1161,42 @@ panfrost_emit_index_min_max_search(struct pan_pool *pool,
mali_ptr rsd =
get_renderer_state(dev, draw_info->flags,
draw_info->index_size, true);
+ unsigned shader_id =
+ get_shader_id(draw_info->flags, draw_info->index_size, true);
+ const struct pan_indirect_draw_shader *shader =
+ &dev->indirect_draw_shaders.shaders[shader_id];
struct panfrost_ptr job =
- pan_pool_alloc_desc(pool, COMPUTE_JOB);
+ panfrost_pool_alloc_desc(pool, COMPUTE_JOB);
void *invocation =
pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
panfrost_pack_work_groups_compute(invocation,
1, 1, 1, MIN_MAX_JOBS, 1, 1,
- false, false);
+ false);
pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
cfg.job_task_split = 7;
}
pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
+ cfg.draw_descriptor_is_64b = true;
+ cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
cfg.state = rsd;
cfg.thread_storage = get_tls(pool->dev);
- cfg.push_uniforms =
- pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
+ cfg.uniform_buffers = ubos;
+ cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
}
+ pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);
+
return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
false, false, 0, 0, &job, false);
}
unsigned
-GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- const struct pan_indirect_draw_info *draw_info,
- struct panfrost_ptr *ctx)
+panfrost_emit_indirect_draw(struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ const struct pan_indirect_draw_info *draw_info,
+ struct panfrost_ptr *ctx)
{
struct panfrost_device *dev = pool->dev;
@@ -1277,7 +1208,7 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
panfrost_indirect_draw_alloc_deps(dev);
struct panfrost_ptr job =
- pan_pool_alloc_desc(pool, COMPUTE_JOB);
+ panfrost_pool_alloc_desc(pool, COMPUTE_JOB);
mali_ptr rsd =
get_renderer_state(dev, draw_info->flags,
draw_info->index_size, false);
@@ -1288,18 +1219,15 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
struct panfrost_ptr draw_ctx_ptr = *ctx;
if (!draw_ctx_ptr.cpu) {
- draw_ctx_ptr = pan_pool_alloc_aligned(pool,
- sizeof(draw_ctx),
- sizeof(mali_ptr));
+ draw_ctx_ptr = panfrost_pool_alloc_aligned(pool,
+ sizeof(draw_ctx),
+ sizeof(mali_ptr));
}
struct indirect_draw_inputs inputs = {
.draw_ctx = draw_ctx_ptr.gpu,
.draw_buf = draw_info->draw_buf,
.index_buf = draw_info->index_buf,
- .first_vertex_sysval = draw_info->first_vertex_sysval,
- .base_vertex_sysval = draw_info->base_vertex_sysval,
- .base_instance_sysval = draw_info->base_instance_sysval,
.vertex_job = draw_info->vertex_job,
.tiler_job = draw_info->tiler_job,
.attrib_bufs = draw_info->attrib_bufs,
@@ -1312,9 +1240,9 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
inputs.restart_index = draw_info->restart_index;
struct panfrost_ptr min_max_ctx_ptr =
- pan_pool_alloc_aligned(pool,
- sizeof(struct min_max_context),
- 4);
+ panfrost_pool_alloc_aligned(pool,
+ sizeof(struct min_max_context),
+ 4);
struct min_max_context *ctx = min_max_ctx_ptr.cpu;
ctx->min = UINT32_MAX;
@@ -1322,27 +1250,37 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
inputs.min_max_ctx = min_max_ctx_ptr.gpu;
}
+ unsigned shader_id =
+ get_shader_id(draw_info->flags, draw_info->index_size, false);
+ const struct pan_indirect_draw_shader *shader =
+ &dev->indirect_draw_shaders.shaders[shader_id];
+ mali_ptr ubos = get_ubos(pool, &inputs);
+
void *invocation =
pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
panfrost_pack_work_groups_compute(invocation,
1, 1, 1, 1, 1, 1,
- false, false);
+ false);
pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
cfg.job_task_split = 2;
}
pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
+ cfg.draw_descriptor_is_64b = true;
+ cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
cfg.state = rsd;
cfg.thread_storage = get_tls(pool->dev);
- cfg.push_uniforms =
- pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
+ cfg.uniform_buffers = ubos;
+ cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
}
+ pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);
+
unsigned global_dep = draw_info->last_indirect_draw;
unsigned local_dep =
panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
- &inputs, &draw_ctx);
+ &inputs, &draw_ctx, ubos);
if (!ctx->cpu) {
*ctx = draw_ctx_ptr;
@@ -1355,19 +1293,20 @@ GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
}
void
-GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
- struct pan_pool *bin_pool)
+panfrost_init_indirect_draw_shaders(struct panfrost_device *dev)
{
/* We allocate the states and varying_heap BO lazily to avoid
* reserving memory when indirect draws are not used.
*/
pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
- dev->indirect_draw_shaders.bin_pool = bin_pool;
+ panfrost_pool_init(&dev->indirect_draw_shaders.bin_pool, NULL, dev,
+ PAN_BO_EXECUTE, false);
}
void
-GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
+panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev)
{
+ panfrost_pool_cleanup(&dev->indirect_draw_shaders.bin_pool);
panfrost_bo_unreference(dev->indirect_draw_shaders.states);
panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
diff --git a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
index 6a7737441..28bcd535d 100644
--- a/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
+++ b/lib/mesa/src/panfrost/lib/pan_indirect_draw.h
@@ -24,8 +24,6 @@
#ifndef __PAN_INDIRECT_DRAW_SHADERS_H__
#define __PAN_INDIRECT_DRAW_SHADERS_H__
-#include "genxml/gen_macros.h"
-
struct pan_device;
struct pan_scoreboard;
struct pan_pool;
@@ -33,9 +31,6 @@ struct pan_pool;
struct pan_indirect_draw_info {
mali_ptr draw_buf;
mali_ptr index_buf;
- mali_ptr first_vertex_sysval;
- mali_ptr base_vertex_sysval;
- mali_ptr base_instance_sysval;
mali_ptr vertex_job;
mali_ptr tiler_job;
mali_ptr attrib_bufs;
@@ -49,16 +44,15 @@ struct pan_indirect_draw_info {
};
unsigned
-GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- const struct pan_indirect_draw_info *draw_info,
- struct panfrost_ptr *ctx);
+panfrost_emit_indirect_draw(struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ const struct pan_indirect_draw_info *draw_info,
+ struct panfrost_ptr *ctx);
void
-GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
- struct pan_pool *bin_pool);
+panfrost_init_indirect_draw_shaders(struct panfrost_device *dev);
void
-GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev);
+panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev);
#endif
diff --git a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
index bef81028b..fa6f898e5 100644
--- a/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
+++ b/lib/mesa/src/vulkan/wsi/wsi_common_win32.c
@@ -26,12 +26,9 @@
#include <stdio.h>
#include <string.h>
-#include "vk_format.h"
-#include "vk_instance.h"
-#include "vk_physical_device.h"
#include "vk_util.h"
-#include "wsi_common_entrypoints.h"
#include "wsi_common_private.h"
+#include "wsi_common_win32.h"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" // warning: cast to pointer from integer of different size
@@ -70,37 +67,30 @@ struct wsi_win32_swapchain {
struct wsi_win32_image images[0];
};
-VKAPI_ATTR VkBool32 VKAPI_CALL
-wsi_GetPhysicalDeviceWin32PresentationSupportKHR(VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex)
+VkBool32
+wsi_win32_get_presentation_support(struct wsi_device *wsi_device)
{
return TRUE;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-wsi_CreateWin32SurfaceKHR(VkInstance _instance,
- const VkWin32SurfaceCreateInfoKHR *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkSurfaceKHR *pSurface)
+VkResult
+wsi_create_win32_surface(VkInstance instance,
+ const VkAllocationCallbacks *allocator,
+ const VkWin32SurfaceCreateInfoKHR *create_info,
+ VkSurfaceKHR *surface_khr)
{
- VK_FROM_HANDLE(vk_instance, instance, _instance);
- VkIcdSurfaceWin32 *surface;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR);
-
- surface = vk_zalloc2(&instance->alloc, pAllocator, sizeof(*surface), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ VkIcdSurfaceWin32 *surface = vk_zalloc(allocator, sizeof *surface, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (surface == NULL)
return VK_ERROR_OUT_OF_HOST_MEMORY;
surface->base.platform = VK_ICD_WSI_PLATFORM_WIN32;
- surface->hinstance = pCreateInfo->hinstance;
- surface->hwnd = pCreateInfo->hwnd;
-
- *pSurface = VkIcdSurfaceBase_to_handle(&surface->base);
+ surface->hinstance = create_info->hinstance;
+ surface->hwnd = create_info->hwnd;
+ *surface_khr = VkIcdSurfaceBase_to_handle(&surface->base);
return VK_SUCCESS;
}
@@ -116,24 +106,15 @@ wsi_win32_surface_get_support(VkIcdSurfaceBase *surface,
}
static VkResult
-wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf,
+wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surface,
struct wsi_device *wsi_device,
VkSurfaceCapabilitiesKHR* caps)
{
- VkIcdSurfaceWin32 *surface = (VkIcdSurfaceWin32 *)surf;
-
- RECT win_rect;
- if (!GetClientRect(surface->hwnd, &win_rect))
- return VK_ERROR_SURFACE_LOST_KHR;
-
caps->minImageCount = 1;
/* There is no real maximum */
caps->maxImageCount = 0;
- caps->currentExtent = (VkExtent2D) {
- win_rect.right - win_rect.left,
- win_rect.bottom - win_rect.top
- };
+ caps->currentExtent = (VkExtent2D) { UINT32_MAX, UINT32_MAX };
caps->minImageExtent = (VkExtent2D) { 1, 1 };
caps->maxImageExtent = (VkExtent2D) {
wsi_device->maxImageDimension2D,
@@ -153,8 +134,7 @@ wsi_win32_surface_get_capabilities(VkIcdSurfaceBase *surf,
VK_IMAGE_USAGE_SAMPLED_BIT |
VK_IMAGE_USAGE_TRANSFER_DST_BIT |
VK_IMAGE_USAGE_STORAGE_BIT |
- VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
- VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
return VK_SUCCESS;
}
@@ -228,7 +208,7 @@ wsi_win32_surface_get_formats(VkIcdSurfaceBase *icd_surface,
for (unsigned i = 0; i < ARRAY_SIZE(sorted_formats); i++) {
vk_outarray_append_typed(VkSurfaceFormatKHR, &out, f) {
f->format = sorted_formats[i];
- f->colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
+ f->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
}
}
@@ -251,7 +231,7 @@ wsi_win32_surface_get_formats2(VkIcdSurfaceBase *icd_surface,
vk_outarray_append_typed(VkSurfaceFormat2KHR, &out, f) {
assert(f->sType == VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR);
f->surfaceFormat.format = sorted_formats[i];
- f->surfaceFormat.colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
+ f->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
}
}
@@ -301,16 +281,155 @@ wsi_win32_surface_get_present_rectangles(VkIcdSurfaceBase *surface,
return vk_outarray_status(&out);
}
+static uint32_t
+select_memory_type(const struct wsi_device *wsi,
+ VkMemoryPropertyFlags props,
+ uint32_t type_bits)
+{
+ for (uint32_t i = 0; i < wsi->memory_props.memoryTypeCount; i++) {
+ const VkMemoryType type = wsi->memory_props.memoryTypes[i];
+ if ((type_bits & (1 << i)) && (type.propertyFlags & props) == props)
+ return i;
+ }
+
+ unreachable("No memory type found");
+}
+
+VkResult
+wsi_create_native_image(const struct wsi_swapchain *chain,
+ const VkSwapchainCreateInfoKHR *pCreateInfo,
+ uint32_t num_modifier_lists,
+ const uint32_t *num_modifiers,
+ const uint64_t *const *modifiers,
+ struct wsi_image *image)
+{
+ const struct wsi_device *wsi = chain->wsi;
+ VkResult result;
+
+ memset(image, 0, sizeof(*image));
+ for (int i = 0; i < ARRAY_SIZE(image->fds); i++)
+ image->fds[i] = -1;
+
+ VkImageCreateInfo image_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .flags = 0,
+ .imageType = VK_IMAGE_TYPE_2D,
+ .format = pCreateInfo->imageFormat,
+ .extent = {
+ .width = pCreateInfo->imageExtent.width,
+ .height = pCreateInfo->imageExtent.height,
+ .depth = 1,
+ },
+ .mipLevels = 1,
+ .arrayLayers = 1,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = pCreateInfo->imageUsage,
+ .sharingMode = pCreateInfo->imageSharingMode,
+ .queueFamilyIndexCount = pCreateInfo->queueFamilyIndexCount,
+ .pQueueFamilyIndices = pCreateInfo->pQueueFamilyIndices,
+ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ };
+
+ VkImageFormatListCreateInfoKHR image_format_list;
+ if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR) {
+ image_info.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+ VK_IMAGE_CREATE_EXTENDED_USAGE_BIT_KHR;
+
+ const VkImageFormatListCreateInfoKHR *format_list =
+ vk_find_struct_const(pCreateInfo->pNext,
+ IMAGE_FORMAT_LIST_CREATE_INFO_KHR);
+
+#ifndef NDEBUG
+ assume(format_list && format_list->viewFormatCount > 0);
+ bool format_found = false;
+ for (int i = 0; i < format_list->viewFormatCount; i++)
+ if (pCreateInfo->imageFormat == format_list->pViewFormats[i])
+ format_found = true;
+ assert(format_found);
+#endif
+
+ image_format_list = *format_list;
+ image_format_list.pNext = NULL;
+ __vk_append_struct(&image_info, &image_format_list);
+ }
+
+
+ result = wsi->CreateImage(chain->device, &image_info,
+ &chain->alloc, &image->image);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkMemoryRequirements reqs;
+ wsi->GetImageMemoryRequirements(chain->device, image->image, &reqs);
+
+ const struct wsi_memory_allocate_info memory_wsi_info = {
+ .sType = VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA,
+ .pNext = NULL,
+ .implicit_sync = true,
+ };
+ const VkExportMemoryAllocateInfo memory_export_info = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+ .pNext = &memory_wsi_info,
+ .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+ };
+ const VkMemoryDedicatedAllocateInfo memory_dedicated_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+ .pNext = &memory_export_info,
+ .image = image->image,
+ .buffer = VK_NULL_HANDLE,
+ };
+ const VkMemoryAllocateInfo memory_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .pNext = &memory_dedicated_info,
+ .allocationSize = reqs.size,
+ .memoryTypeIndex = select_memory_type(wsi, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ reqs.memoryTypeBits),
+ };
+ result = wsi->AllocateMemory(chain->device, &memory_info,
+ &chain->alloc, &image->memory);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ result = wsi->BindImageMemory(chain->device, image->image,
+ image->memory, 0);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ const VkImageSubresource image_subresource = {
+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .mipLevel = 0,
+ .arrayLayer = 0,
+ };
+ VkSubresourceLayout image_layout;
+ wsi->GetImageSubresourceLayout(chain->device, image->image,
+ &image_subresource, &image_layout);
+
+ image->num_planes = 1;
+ image->sizes[0] = reqs.size;
+ image->row_pitches[0] = image_layout.rowPitch;
+ image->offsets[0] = 0;
+
+ return VK_SUCCESS;
+
+fail:
+ wsi_destroy_image(chain, image);
+
+ return result;
+}
+
static VkResult
wsi_win32_image_init(VkDevice device_h,
- struct wsi_win32_swapchain *chain,
- const VkSwapchainCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- struct wsi_win32_image *image)
+ struct wsi_swapchain *drv_chain,
+ const VkSwapchainCreateInfoKHR *create_info,
+ const VkAllocationCallbacks *allocator,
+ struct wsi_win32_image *image)
{
- assert(chain->base.use_buffer_blit);
- VkResult result = wsi_create_image(&chain->base, &chain->base.image_info,
- &image->base);
+ struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain;
+
+ VkResult result = wsi_create_native_image(&chain->base, create_info,
+ 0, NULL, NULL,
+ &image->base);
if (result != VK_SUCCESS)
return result;
@@ -345,10 +464,13 @@ wsi_win32_image_init(VkDevice device_h,
}
static void
-wsi_win32_image_finish(struct wsi_win32_swapchain *chain,
- const VkAllocationCallbacks *allocator,
- struct wsi_win32_image *image)
+wsi_win32_image_finish(struct wsi_swapchain *drv_chain,
+ const VkAllocationCallbacks *allocator,
+ struct wsi_win32_image *image)
{
+ struct wsi_win32_swapchain *chain =
+ (struct wsi_win32_swapchain *) drv_chain;
+
DeleteDC(image->dc);
if(image->bmp)
DeleteObject(image->bmp);
@@ -363,7 +485,7 @@ wsi_win32_swapchain_destroy(struct wsi_swapchain *drv_chain,
(struct wsi_win32_swapchain *) drv_chain;
for (uint32_t i = 0; i < chain->base.image_count; i++)
- wsi_win32_image_finish(chain, allocator, &chain->images[i]);
+ wsi_win32_image_finish(drv_chain, allocator, &chain->images[i]);
DeleteDC(chain->chain_dc);
@@ -406,19 +528,30 @@ wsi_win32_queue_present(struct wsi_swapchain *drv_chain,
struct wsi_win32_swapchain *chain = (struct wsi_win32_swapchain *) drv_chain;
assert(image_index < chain->base.image_count);
struct wsi_win32_image *image = &chain->images[image_index];
+ VkResult result;
- assert(chain->base.use_buffer_blit);
-
- char *ptr = image->base.cpu_map;
+ char *ptr;
char *dptr = image->ppvBits;
+ result = chain->base.wsi->MapMemory(chain->base.device,
+ image->base.memory,
+ 0, 0, 0, (void**)&ptr);
for (unsigned h = 0; h < chain->extent.height; h++) {
memcpy(dptr, ptr, chain->extent.width * 4);
dptr += image->bmp_row_pitch;
ptr += image->base.row_pitches[0];
}
- if (!StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY))
- chain->status = VK_ERROR_MEMORY_MAP_FAILED;
+ if(StretchBlt(chain->chain_dc, 0, 0, chain->extent.width, chain->extent.height, image->dc, 0, 0, chain->extent.width, chain->extent.height, SRCCOPY))
+ result = VK_SUCCESS;
+ else
+ result = VK_ERROR_MEMORY_MAP_FAILED;
+
+ chain->base.wsi->UnmapMemory(chain->base.device, image->base.memory);
+ if (result != VK_SUCCESS)
+ chain->status = result;
+
+ if (result != VK_SUCCESS)
+ return result;
return chain->status;
}
@@ -448,13 +581,8 @@ wsi_win32_surface_create_swapchain(
if (chain == NULL)
return VK_ERROR_OUT_OF_HOST_MEMORY;
- struct wsi_cpu_image_params image_params = {
- .base.image_type = WSI_IMAGE_TYPE_CPU,
- };
-
VkResult result = wsi_swapchain_init(wsi_device, &chain->base, device,
- create_info, &image_params.base,
- allocator);
+ create_info, allocator);
if (result != VK_SUCCESS) {
vk_free(allocator, chain);
return result;
@@ -473,20 +601,16 @@ wsi_win32_surface_create_swapchain(
chain->surface = surface;
- assert(wsi_device->sw);
- chain->base.use_buffer_blit = true;
-
for (uint32_t image = 0; image < chain->base.image_count; image++) {
- result = wsi_win32_image_init(device, chain,
- create_info, allocator,
- &chain->images[image]);
+ result = wsi_win32_image_init(device, &chain->base,
+ create_info, allocator,
+ &chain->images[image]);
if (result != VK_SUCCESS) {
while (image > 0) {
--image;
- wsi_win32_image_finish(chain, allocator,
- &chain->images[image]);
+ wsi_win32_image_finish(&chain->base, allocator,
+ &chain->images[image]);
}
- wsi_swapchain_finish(&chain->base);
vk_free(allocator, chain);
goto fail_init_images;
}