summaryrefslogtreecommitdiff
path: root/lib/mesa/src/freedreno/ir3
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/src/freedreno/ir3')
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_dce.c6
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_delay.c223
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_lexer.l60
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c138
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_parser.y170
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_postsched.c244
-rw-r--r--lib/mesa/src/freedreno/ir3/ir3_ra.h2
-rw-r--r--lib/mesa/src/freedreno/ir3/tests/disasm.c368
8 files changed, 598 insertions, 613 deletions
diff --git a/lib/mesa/src/freedreno/ir3/ir3_dce.c b/lib/mesa/src/freedreno/ir3/ir3_dce.c
index 76298e64a..a3ddbe802 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_dce.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_dce.c
@@ -53,8 +53,10 @@ instr_dce(struct ir3_instruction *instr, bool falsedep)
if (ir3_instr_check_mark(instr))
return;
- if (writes_gpr(instr))
- mark_array_use(instr, instr->dsts[0]); /* dst */
+ foreach_dst (dst, instr) {
+ if (is_dest_gpr(dst))
+ mark_array_use(instr, dst);
+ }
foreach_src (reg, instr)
mark_array_use(instr, reg); /* src */
diff --git a/lib/mesa/src/freedreno/ir3/ir3_delay.c b/lib/mesa/src/freedreno/ir3/ir3_delay.c
index 14bb403b9..054f4c831 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_delay.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_delay.c
@@ -30,19 +30,6 @@
*/
#define MAX_NOPS 6
-/* The soft delay for approximating the cost of (ss). On a6xx, it takes the
- * number of delay slots to get a SFU result back (ie. using nop's instead of
- * (ss) is:
- *
- * 8 - single warp
- * 9 - two warps
- * 10 - four warps
- *
- * and so on. Not quite sure where it tapers out (ie. how many warps share an
- * SFU unit). But 10 seems like a reasonable # to choose:
- */
-#define SOFT_SS_NOPS 10
-
/*
* Helpers to figure out the necessary delay slots between instructions. Used
* both in scheduling pass(es) and the final pass to insert any required nop's
@@ -76,11 +63,11 @@ ir3_delayslots(struct ir3_instruction *assigner,
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
- if (soft && is_sfu(assigner))
- return SOFT_SS_NOPS;
+ if (soft && is_ss_producer(assigner))
+ return soft_ss_delay(assigner);
/* handled via sync flags: */
- if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+ if (is_ss_producer(assigner) || is_sy_producer(assigner))
return 0;
/* As far as we know, shader outputs don't need any delay. */
@@ -89,7 +76,7 @@ ir3_delayslots(struct ir3_instruction *assigner,
/* assigner must be alu: */
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
- is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
+ is_mem(consumer)) {
return 6;
} else {
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
@@ -119,74 +106,6 @@ count_instruction(struct ir3_instruction *n)
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
-{
- unsigned d = 0;
-
- /* Note that this relies on incrementally building up the block's
- * instruction list.. but this is how scheduling and nopsched
- * work.
- */
- foreach_instr_rev (n, &block->instr_list) {
- if ((n == instr) || (d >= maxd))
- return MIN2(maxd, d + n->nop);
- if (count_instruction(n))
- d = MIN2(maxd, d + 1 + n->repeat + n->nop);
- }
-
- return maxd;
-}
-
-static unsigned
-delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned srcn)
-{
- unsigned delay = 0;
-
- if (assigner->opc == OPC_META_PHI)
- return 0;
-
- if (is_meta(assigner)) {
- foreach_src_n (src, n, assigner) {
- unsigned d;
-
- if (!src->def)
- continue;
-
- d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
- delay = MAX2(delay, d);
- }
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn, false);
- delay -= distance(block, assigner, delay);
- }
-
- return delay;
-}
-
-/**
- * Calculate delay for instruction before register allocation, using SSA
- * source pointers. This can't handle inter-block dependencies.
- */
-unsigned
-ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
-{
- unsigned delay = 0;
-
- foreach_src_n (src, i, instr) {
- unsigned d = 0;
-
- if (src->def && src->def->instr->block == block) {
- d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
- }
-
- delay = MAX2(delay, d);
- }
-
- return delay;
-}
-
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
* and have to handle relative accesses specially.
*/
@@ -207,35 +126,21 @@ post_ra_reg_num(struct ir3_register *reg)
return reg->num;
}
-static unsigned
-delay_calc_srcn_postra(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned assigner_n,
- unsigned consumer_n, bool soft, bool mergedregs)
+unsigned
+ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer,
+ unsigned assigner_n, unsigned consumer_n)
{
+ unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
+
struct ir3_register *src = consumer->srcs[consumer_n];
struct ir3_register *dst = assigner->dsts[assigner_n];
- bool mismatched_half =
- (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
- /* In the mergedregs case or when the register is a special register,
- * half-registers do not alias with full registers.
- */
- if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
- mismatched_half)
- return 0;
+ if (assigner->repeat == 0 && consumer->repeat == 0)
+ return delay;
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
- unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
- unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
-
- if (dst_start >= src_end || src_start >= dst_end)
- return 0;
-
- unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
-
- if (assigner->repeat == 0 && consumer->repeat == 0)
- return delay;
/* If either side is a relative access, we can't really apply most of the
* reasoning below because we don't know which component aliases which.
@@ -250,6 +155,9 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner,
if (assigner->opc == OPC_MOVMSK)
return delay;
+ bool mismatched_half =
+ (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
/* TODO: Handle the combination of (rpt) and different component sizes
* better like below. This complicates things significantly because the
* components don't line up.
@@ -303,10 +211,41 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner,
}
static unsigned
-delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
- struct ir3_instruction *consumer, unsigned distance,
- bool soft, bool pred, bool mergedregs)
+delay_calc_srcn(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned assigner_n,
+ unsigned consumer_n, bool mergedregs)
+{
+ struct ir3_register *src = consumer->srcs[consumer_n];
+ struct ir3_register *dst = assigner->dsts[assigner_n];
+ bool mismatched_half =
+ (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
+ /* In the mergedregs case or when the register is a special register,
+ * half-registers do not alias with full registers.
+ */
+ if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
+ mismatched_half)
+ return 0;
+
+ unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
+ unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
+ unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
+ unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
+
+ if (dst_start >= src_end || src_start >= dst_end)
+ return 0;
+
+ return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
+}
+
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *start,
+ struct ir3_instruction *consumer, unsigned distance,
+ regmask_t *in_mask, bool mergedregs)
{
+ regmask_t mask;
+ memcpy(&mask, in_mask, sizeof(mask));
+
unsigned delay = 0;
/* Search backwards starting at the instruction before start, unless it's
* NULL then search backwards from the block end.
@@ -318,7 +257,7 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
if (count_instruction(assigner))
distance += assigner->nop;
- if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
+ if (distance + delay >= MAX_NOPS)
return delay;
if (is_meta(assigner))
@@ -329,14 +268,17 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
foreach_dst_n (dst, dst_n, assigner) {
if (dst->wrmask == 0)
continue;
+ if (!regmask_get(&mask, dst))
+ continue;
foreach_src_n (src, src_n, consumer) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
continue;
- unsigned src_delay = delay_calc_srcn_postra(
- assigner, consumer, dst_n, src_n, soft, mergedregs);
+ unsigned src_delay = delay_calc_srcn(
+ assigner, consumer, dst_n, src_n, mergedregs);
new_delay = MAX2(new_delay, src_delay);
}
+ regmask_clear(&mask, dst);
}
new_delay = new_delay > distance ? new_delay - distance : 0;
@@ -360,13 +302,13 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
* However any other recursion would be unnecessary.
*/
- if (pred && block->data != block) {
+ if (block->data != block) {
block->data = block;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
- unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
- soft, pred, mergedregs);
+ unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
+ &mask, mergedregs);
delay = MAX2(delay, pred_delay);
}
@@ -377,50 +319,19 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
}
/**
- * Calculate delay for post-RA scheduling based on physical registers but not
- * exact (i.e. don't recurse into predecessors, and make it possible to
- * estimate impact of sync flags).
- *
- * @soft: If true, add additional delay for situations where they
- * would not be strictly required because a sync flag would be
- * used (but scheduler would prefer to schedule some other
- * instructions first to avoid stalling on sync flag)
- * @mergedregs: True if mergedregs is enabled.
- */
-unsigned
-ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
- bool soft, bool mergedregs)
-{
- return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
-}
-
-/**
* Calculate delay for nop insertion. This must exactly match hardware
* requirements, including recursing into predecessor blocks.
*/
unsigned
-ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
- bool mergedregs)
+ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool mergedregs)
{
- return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
-}
-
-/**
- * Remove nop instructions. The scheduler can insert placeholder nop's
- * so that ir3_delay_calc() can account for nop's that won't be needed
- * due to nop's triggered by a previous instruction. However, before
- * legalize, we want to remove these. The legalize pass can insert
- * some nop's if needed to hold (for example) sync flags. This final
- * remaining nops are inserted by legalize after this.
- */
-void
-ir3_remove_nops(struct ir3 *ir)
-{
- foreach_block (block, &ir->block_list) {
- foreach_instr_safe (instr, &block->instr_list) {
- if (instr->opc == OPC_NOP) {
- list_del(&instr->node);
- }
- }
+ regmask_t mask;
+ regmask_init(&mask, mergedregs);
+ foreach_src (src, instr) {
+ if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
+ regmask_set(&mask, src);
}
+
+ return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_lexer.l b/lib/mesa/src/freedreno/ir3/ir3_lexer.l
index 2d5582e5b..52b977896 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_lexer.l
+++ b/lib/mesa/src/freedreno/ir3/ir3_lexer.l
@@ -72,16 +72,6 @@ static int parse_reg(const char *str)
return num;
}
-static int parse_w(const char *str)
-{
- str++;
- unsigned num = strtol(str, NULL, 10);
- if ((num % 32) != 0)
- yy_fatal_error("w# must be multiple of 32");
- if (num < 32)
- yy_fatal_error("w# must be at least 32");
- return num / 32;
-}
%}
%option noyywrap
@@ -139,7 +129,7 @@ static int parse_w(const char *str)
"a0.x" return T_A0;
"a1.x" return T_A1;
"p0."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_P0;
-"w"[0-9]+ ir3_yylval.num = parse_w(yytext); return T_W;
+"w"[0-9]+ ir3_yylval.num = strtol(yytext+1, NULL, 10); return T_W;
"s#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_SAMP;
"t#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_TEX;
@@ -167,6 +157,7 @@ static int parse_w(const char *str)
"stkr" return TOKEN(T_OP_STKR);
"xset" return TOKEN(T_OP_XSET);
"xclr" return TOKEN(T_OP_XCLR);
+"getlast" return TOKEN(T_OP_GETLAST);
"getone" return TOKEN(T_OP_GETONE);
"dbg" return TOKEN(T_OP_DBG);
"shps" return TOKEN(T_OP_SHPS);
@@ -228,6 +219,7 @@ static int parse_w(const char *str)
"shr.b" return TOKEN(T_OP_SHR_B);
"ashr.b" return TOKEN(T_OP_ASHR_B);
"bary.f" return TOKEN(T_OP_BARY_F);
+"flat.b" return TOKEN(T_OP_FLAT_B);
"mgen.b" return TOKEN(T_OP_MGEN_B);
"getbit.b" return TOKEN(T_OP_GETBIT_B);
"setrm" return TOKEN(T_OP_SETRM);
@@ -252,7 +244,15 @@ static int parse_w(const char *str)
"sel.f32" return TOKEN(T_OP_SEL_F32);
"sad.s16" return TOKEN(T_OP_SAD_S16);
"sad.s32" return TOKEN(T_OP_SAD_S32);
-"shlg.b16" return TOKEN(T_OP_SHLG_B16);
+"shrm" return TOKEN(T_OP_SHRM);
+"shlm" return TOKEN(T_OP_SHLM);
+"shrg" return TOKEN(T_OP_SHRG);
+"shlg" return TOKEN(T_OP_SHLG);
+"andg" return TOKEN(T_OP_ANDG);
+"dp2acc" return TOKEN(T_OP_DP2ACC);
+"dp4acc" return TOKEN(T_OP_DP4ACC);
+"wmm" return TOKEN(T_OP_WMM);
+"wmm.accu" return TOKEN(T_OP_WMM_ACCU);
/* category 4: */
"rcp" return TOKEN(T_OP_RCP);
@@ -295,6 +295,11 @@ static int parse_w(const char *str)
"dsypp.1" return TOKEN(T_OP_DSYPP_1);
"rgetpos" return TOKEN(T_OP_RGETPOS);
"rgetinfo" return TOKEN(T_OP_RGETINFO);
+"brcst.active" return TOKEN(T_OP_BRCST_A);
+"quad_shuffle.brcst" return TOKEN(T_OP_QSHUFFLE_BRCST);
+"quad_shuffle.horiz" return TOKEN(T_OP_QSHUFFLE_H);
+"quad_shuffle.vert" return TOKEN(T_OP_QSHUFFLE_V);
+"quad_shuffle.diag" return TOKEN(T_OP_QSHUFFLE_DIAG);
/* category 6: */
"ldg" return TOKEN(T_OP_LDG);
@@ -338,6 +343,29 @@ static int parse_w(const char *str)
"atomic.b.and" return TOKEN(T_OP_ATOMIC_B_AND);
"atomic.b.or" return TOKEN(T_OP_ATOMIC_B_OR);
"atomic.b.xor" return TOKEN(T_OP_ATOMIC_B_XOR);
+"atomic.s.add" return TOKEN(T_OP_ATOMIC_S_ADD);
+"atomic.s.sub" return TOKEN(T_OP_ATOMIC_S_SUB);
+"atomic.s.xchg" return TOKEN(T_OP_ATOMIC_S_XCHG);
+"atomic.s.inc" return TOKEN(T_OP_ATOMIC_S_INC);
+"atomic.s.dec" return TOKEN(T_OP_ATOMIC_S_DEC);
+"atomic.s.cmpxchg" return TOKEN(T_OP_ATOMIC_S_CMPXCHG);
+"atomic.s.min" return TOKEN(T_OP_ATOMIC_S_MIN);
+"atomic.s.max" return TOKEN(T_OP_ATOMIC_S_MAX);
+"atomic.s.and" return TOKEN(T_OP_ATOMIC_S_AND);
+"atomic.s.or" return TOKEN(T_OP_ATOMIC_S_OR);
+"atomic.s.xor" return TOKEN(T_OP_ATOMIC_S_XOR);
+"atomic.g.add" return TOKEN(T_OP_ATOMIC_G_ADD);
+"atomic.g.sub" return TOKEN(T_OP_ATOMIC_G_SUB);
+"atomic.g.xchg" return TOKEN(T_OP_ATOMIC_G_XCHG);
+"atomic.g.inc" return TOKEN(T_OP_ATOMIC_G_INC);
+"atomic.g.dec" return TOKEN(T_OP_ATOMIC_G_DEC);
+"atomic.g.cmpxchg" return TOKEN(T_OP_ATOMIC_G_CMPXCHG);
+"atomic.g.min" return TOKEN(T_OP_ATOMIC_G_MIN);
+"atomic.g.max" return TOKEN(T_OP_ATOMIC_G_MAX);
+"atomic.g.and" return TOKEN(T_OP_ATOMIC_G_AND);
+"atomic.g.or" return TOKEN(T_OP_ATOMIC_G_OR);
+"atomic.g.xor" return TOKEN(T_OP_ATOMIC_G_XOR);
+
"ldgb" return TOKEN(T_OP_LDGB);
"stgb" return TOKEN(T_OP_STGB);
"stib" return TOKEN(T_OP_STIB);
@@ -345,6 +373,8 @@ static int parse_w(const char *str)
"ldlv" return TOKEN(T_OP_LDLV);
"getspid" return TOKEN(T_OP_GETSPID);
"getwid" return TOKEN(T_OP_GETWID);
+"getfiberid" return TOKEN(T_OP_GETFIBERID);
+"stc" return TOKEN(T_OP_STC);
/* category 7: */
"bar" return TOKEN(T_OP_BAR);
@@ -362,6 +392,11 @@ static int parse_w(const char *str)
"untyped" return TOKEN(T_UNTYPED);
"typed" return TOKEN(T_TYPED);
+"unsigned" return TOKEN(T_UNSIGNED);
+"mixed" return TOKEN(T_MIXED);
+"low" return TOKEN(T_LOW);
+"high" return TOKEN(T_HIGH);
+
"1d" return TOKEN(T_1D);
"2d" return TOKEN(T_2D);
"3d" return TOKEN(T_3D);
@@ -379,6 +414,7 @@ static int parse_w(const char *str)
"p" return 'p';
"s2en" return TOKEN(T_S2EN);
"s" return 's';
+"k" return 'k';
"base"[0-9]+ ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_BASE;
"offset"[0-9]+ ir3_yylval.num = strtol(yytext+6, NULL, 10); return T_OFFSET;
"uniform" return T_UNIFORM;
diff --git a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
deleted file mode 100644
index 37a3dcb26..000000000
--- a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright © 2017 Ilia Mirkin
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "ir3_nir.h"
-#include "compiler/nir/nir_builder.h"
-
-/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
- * gather results, rather than before. As a result, it must be emulated with
- * direct texture calls.
- */
-
-static bool
-lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
-{
- bool progress = false;
-
- static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
-
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- nir_tex_instr *tg4 = (nir_tex_instr *)instr;
-
- if (tg4->op != nir_texop_tg4)
- continue;
-
- b->cursor = nir_before_instr(&tg4->instr);
-
- nir_ssa_def *results[4];
- int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
- for (int i = 0; i < 4; i++) {
- int num_srcs = tg4->num_srcs + 1 /* lod */;
- if (offset_index < 0 && i < 3)
- num_srcs++;
-
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
- tex->op = nir_texop_txl;
- tex->sampler_dim = tg4->sampler_dim;
- tex->coord_components = tg4->coord_components;
- tex->is_array = tg4->is_array;
- tex->is_shadow = tg4->is_shadow;
- tex->is_new_style_shadow = tg4->is_new_style_shadow;
- tex->texture_index = tg4->texture_index;
- tex->sampler_index = tg4->sampler_index;
- tex->dest_type = tg4->dest_type;
-
- for (int j = 0; j < tg4->num_srcs; j++) {
- nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
- tex->src[j].src_type = tg4->src[j].src_type;
- }
- if (i != 3) {
- nir_ssa_def *offset =
- nir_vec2(b, nir_imm_int(b, offsets[i][0]),
- nir_imm_int(b, offsets[i][1]));
- if (offset_index < 0) {
- tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
- tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
- } else {
- assert(nir_tex_instr_src_size(tex, offset_index) == 2);
- nir_ssa_def *orig = nir_ssa_for_src(
- b, tex->src[offset_index].src, 2);
- tex->src[offset_index].src =
- nir_src_for_ssa(nir_iadd(b, orig, offset));
- }
- }
- tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
- tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
-
- nir_ssa_dest_init(&tex->instr, &tex->dest,
- nir_tex_instr_dest_size(tex), 32, NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
- }
-
- nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
- nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
-
- nir_instr_remove(&tg4->instr);
-
- progress = true;
- }
-
- return progress;
-}
-
-static bool
-lower_tg4_func(nir_function_impl *impl)
-{
- void *mem_ctx = ralloc_parent(impl);
- nir_builder b;
- nir_builder_init(&b, impl);
-
- bool progress = false;
- nir_foreach_block_safe(block, impl) {
- progress |= lower_tg4(block, &b, mem_ctx);
- }
-
- if (progress)
- nir_metadata_preserve(impl, nir_metadata_block_index |
- nir_metadata_dominance);
-
- return progress;
-}
-
-bool
-ir3_nir_lower_tg4_to_tex(nir_shader *shader)
-{
- bool progress = false;
-
- nir_foreach_function(function, shader) {
- if (function->impl)
- progress |= lower_tg4_func(function->impl);
- }
-
- return progress;
-}
diff --git a/lib/mesa/src/freedreno/ir3/ir3_parser.y b/lib/mesa/src/freedreno/ir3/ir3_parser.y
index acd94b35a..fd29c639d 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_parser.y
+++ b/lib/mesa/src/freedreno/ir3/ir3_parser.y
@@ -399,6 +399,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_STKR
%token <tok> T_OP_XSET
%token <tok> T_OP_XCLR
+%token <tok> T_OP_GETLAST
%token <tok> T_OP_GETONE
%token <tok> T_OP_DBG
%token <tok> T_OP_SHPS
@@ -458,6 +459,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_SHR_B
%token <tok> T_OP_ASHR_B
%token <tok> T_OP_BARY_F
+%token <tok> T_OP_FLAT_B
%token <tok> T_OP_MGEN_B
%token <tok> T_OP_GETBIT_B
%token <tok> T_OP_SETRM
@@ -482,7 +484,15 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_SEL_F32
%token <tok> T_OP_SAD_S16
%token <tok> T_OP_SAD_S32
-%token <tok> T_OP_SHLG_B16
+%token <tok> T_OP_SHRM
+%token <tok> T_OP_SHLM
+%token <tok> T_OP_SHRG
+%token <tok> T_OP_SHLG
+%token <tok> T_OP_ANDG
+%token <tok> T_OP_DP2ACC
+%token <tok> T_OP_DP4ACC
+%token <tok> T_OP_WMM
+%token <tok> T_OP_WMM_ACCU
/* category 4: */
%token <tok> T_OP_RCP
@@ -525,6 +535,11 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_DSYPP_1
%token <tok> T_OP_RGETPOS
%token <tok> T_OP_RGETINFO
+%token <tok> T_OP_BRCST_A
+%token <tok> T_OP_QSHUFFLE_BRCST
+%token <tok> T_OP_QSHUFFLE_H
+%token <tok> T_OP_QSHUFFLE_V
+%token <tok> T_OP_QSHUFFLE_DIAG
/* category 6: */
%token <tok> T_OP_LDG
@@ -568,6 +583,28 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_ATOMIC_B_AND
%token <tok> T_OP_ATOMIC_B_OR
%token <tok> T_OP_ATOMIC_B_XOR
+%token <tok> T_OP_ATOMIC_S_ADD
+%token <tok> T_OP_ATOMIC_S_SUB
+%token <tok> T_OP_ATOMIC_S_XCHG
+%token <tok> T_OP_ATOMIC_S_INC
+%token <tok> T_OP_ATOMIC_S_DEC
+%token <tok> T_OP_ATOMIC_S_CMPXCHG
+%token <tok> T_OP_ATOMIC_S_MIN
+%token <tok> T_OP_ATOMIC_S_MAX
+%token <tok> T_OP_ATOMIC_S_AND
+%token <tok> T_OP_ATOMIC_S_OR
+%token <tok> T_OP_ATOMIC_S_XOR
+%token <tok> T_OP_ATOMIC_G_ADD
+%token <tok> T_OP_ATOMIC_G_SUB
+%token <tok> T_OP_ATOMIC_G_XCHG
+%token <tok> T_OP_ATOMIC_G_INC
+%token <tok> T_OP_ATOMIC_G_DEC
+%token <tok> T_OP_ATOMIC_G_CMPXCHG
+%token <tok> T_OP_ATOMIC_G_MIN
+%token <tok> T_OP_ATOMIC_G_MAX
+%token <tok> T_OP_ATOMIC_G_AND
+%token <tok> T_OP_ATOMIC_G_OR
+%token <tok> T_OP_ATOMIC_G_XOR
%token <tok> T_OP_LDGB
%token <tok> T_OP_STGB
%token <tok> T_OP_STIB
@@ -575,6 +612,8 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_OP_LDLV
%token <tok> T_OP_GETSPID
%token <tok> T_OP_GETWID
+%token <tok> T_OP_GETFIBERID
+%token <tok> T_OP_STC
/* category 7: */
%token <tok> T_OP_BAR
@@ -593,6 +632,11 @@ static void print_token(FILE *file, int type, YYSTYPE value)
%token <tok> T_UNTYPED
%token <tok> T_TYPED
+%token <tok> T_MIXED
+%token <tok> T_UNSIGNED
+%token <tok> T_LOW
+%token <tok> T_HIGH
+
%token <tok> T_1D
%token <tok> T_2D
%token <tok> T_3D
@@ -746,7 +790,7 @@ iflag: T_SY { iflags.flags |= IR3_INSTR_SY; }
iflags:
| iflag iflags
-instrs: instr instrs
+instrs: instrs instr
| instr
instr: iflags cat0_instr
@@ -800,6 +844,7 @@ cat0_instr: T_OP_NOP { new_instr(OPC_NOP); }
| T_OP_PREDT { new_instr(OPC_PREDT); } cat0_src1
| T_OP_PREDF { new_instr(OPC_PREDF); } cat0_src1
| T_OP_PREDE { new_instr(OPC_PREDE); }
+| T_OP_GETLAST '.' T_W { new_instr(OPC_GETLAST); } cat0_immed
cat1_opc: T_OP_MOV '.' T_CAT1_TYPE_TYPE {
parse_type_type(new_instr(OPC_MOV), $3);
@@ -815,9 +860,16 @@ cat1_movmsk: T_OP_MOVMSK '.' T_W {
new_instr(OPC_MOVMSK);
instr->cat1.src_type = TYPE_U32;
instr->cat1.dst_type = TYPE_U32;
- instr->repeat = $3 - 1;
} dst_reg {
- instr->dsts[0]->wrmask = (1 << $3) - 1;
+ if (($3 % 32) != 0)
+ yyerror("w# must be multiple of 32");
+ if ($3 < 32)
+ yyerror("w# must be at least 32");
+
+ int num = $3 / 32;
+
+ instr->repeat = num - 1;
+ instr->dsts[0]->wrmask = (1 << num) - 1;
}
cat1_mova1: T_OP_MOVA1 T_A1 ',' {
@@ -894,6 +946,7 @@ cat2_opc_2src: T_OP_ADD_F { new_instr(OPC_ADD_F); }
| T_OP_SHR_B { new_instr(OPC_SHR_B); }
| T_OP_ASHR_B { new_instr(OPC_ASHR_B); }
| T_OP_BARY_F { new_instr(OPC_BARY_F); }
+| T_OP_FLAT_B { new_instr(OPC_FLAT_B); }
| T_OP_MGEN_B { new_instr(OPC_MGEN_B); }
| T_OP_GETBIT_B { new_instr(OPC_GETBIT_B); }
| T_OP_SHB { new_instr(OPC_SHB); }
@@ -910,6 +963,12 @@ cat2_instr: cat2_opc_1src dst_reg ',' src_reg_or_const_or_rel_or_imm
| cat2_opc_2src_cnd '.' cond dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
| cat2_opc_2src dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm
+cat3_dp_signedness:'.' T_MIXED { instr->cat3.signedness = IR3_SRC_MIXED; }
+| '.' T_UNSIGNED{ instr->cat3.signedness = IR3_SRC_UNSIGNED; }
+
+cat3_dp_pack: '.' T_LOW { instr->cat3.packed = IR3_SRC_PACKED_LOW; }
+| '.' T_HIGH { instr->cat3.packed = IR3_SRC_PACKED_HIGH; }
+
cat3_opc: T_OP_MAD_U16 { new_instr(OPC_MAD_U16); }
| T_OP_MADSH_U16 { new_instr(OPC_MADSH_U16); }
| T_OP_MAD_S16 { new_instr(OPC_MAD_S16); }
@@ -927,8 +986,22 @@ cat3_opc: T_OP_MAD_U16 { new_instr(OPC_MAD_U16); }
| T_OP_SAD_S16 { new_instr(OPC_SAD_S16); }
| T_OP_SAD_S32 { new_instr(OPC_SAD_S32); }
+cat3_imm_reg_opc: T_OP_SHRM { new_instr(OPC_SHRM); }
+| T_OP_SHLM { new_instr(OPC_SHLM); }
+| T_OP_SHRG { new_instr(OPC_SHRG); }
+| T_OP_SHLG { new_instr(OPC_SHLG); }
+| T_OP_ANDG { new_instr(OPC_ANDG); }
+
+cat3_wmm: T_OP_WMM { new_instr(OPC_WMM); }
+| T_OP_WMM_ACCU { new_instr(OPC_WMM_ACCU); }
+
+cat3_dp: T_OP_DP2ACC { new_instr(OPC_DP2ACC); }
+| T_OP_DP4ACC { new_instr(OPC_DP4ACC); }
+
cat3_instr: cat3_opc dst_reg ',' src_reg_or_const_or_rel ',' src_reg_or_const ',' src_reg_or_const_or_rel
-| T_OP_SHLG_B16 { new_instr(OPC_SHLG_B16); } dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
+| cat3_imm_reg_opc dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
+| cat3_wmm dst_reg ',' src_reg_gpr ',' src_reg ',' immediate
+| cat3_dp cat3_dp_signedness cat3_dp_pack dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm
cat4_opc: T_OP_RCP { new_instr(OPC_RCP); }
| T_OP_RSQ { new_instr(OPC_RSQ); }
@@ -972,6 +1045,11 @@ cat5_opc: T_OP_ISAM { new_instr(OPC_ISAM); }
| T_OP_SAMGP3 { new_instr(OPC_SAMGP3); }
| T_OP_RGETPOS { new_instr(OPC_RGETPOS); }
| T_OP_RGETINFO { new_instr(OPC_RGETINFO); }
+| T_OP_BRCST_A { new_instr(OPC_BRCST_ACTIVE); }
+| T_OP_QSHUFFLE_BRCST { new_instr(OPC_QUAD_SHUFFLE_BRCST); }
+| T_OP_QSHUFFLE_H { new_instr(OPC_QUAD_SHUFFLE_HORIZ); }
+| T_OP_QSHUFFLE_V { new_instr(OPC_QUAD_SHUFFLE_VERT); }
+| T_OP_QSHUFFLE_DIAG { new_instr(OPC_QUAD_SHUFFLE_DIAG); }
cat5_flag: '.' T_3D { instr->flags |= IR3_INSTR_3D; }
| '.' 'a' { instr->flags |= IR3_INSTR_A; }
@@ -979,13 +1057,15 @@ cat5_flag: '.' T_3D { instr->flags |= IR3_INSTR_3D; }
| '.' 'p' { instr->flags |= IR3_INSTR_P; }
| '.' 's' { instr->flags |= IR3_INSTR_S; }
| '.' T_S2EN { instr->flags |= IR3_INSTR_S2EN; }
+| '.' T_UNIFORM { }
| '.' T_NONUNIFORM { instr->flags |= IR3_INSTR_NONUNIF; }
| '.' T_BASE { instr->flags |= IR3_INSTR_B; instr->cat5.tex_base = $2; }
+| '.' T_W { instr->cat5.cluster_size = $2; }
cat5_flags:
| cat5_flag cat5_flags
cat5_samp: T_SAMP { instr->cat5.samp = $1; }
-cat5_tex: T_TEX { if (instr->flags & IR3_INSTR_B) instr->cat5.samp |= ($1 << 4); else instr->cat5.tex = $1; }
+cat5_tex: T_TEX { instr->cat5.tex = $1; }
cat5_type: '(' type ')' { instr->cat5.type = $2; }
cat5_a1: src_reg { instr->flags |= IR3_INSTR_A1EN; }
@@ -1018,7 +1098,7 @@ cat6_imm_offset: offset { new_src(0, IR3_REG_IMMED)->iim_val = $1; }
cat6_offset: cat6_imm_offset
| '+' src
cat6_dst_offset: offset { instr->cat6.dst_offset = $1; }
-| '+' src { instr->flags |= IR3_INSTR_G; }
+| '+' src
cat6_immed: integer { instr->cat6.iim_val = $1; }
@@ -1066,14 +1146,39 @@ cat6_atomic_opc: T_OP_ATOMIC_ADD { new_instr(OPC_ATOMIC_ADD); }
| T_OP_ATOMIC_OR { new_instr(OPC_ATOMIC_OR); }
| T_OP_ATOMIC_XOR { new_instr(OPC_ATOMIC_XOR); }
-cat6_atomic_g: cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src {
- instr->flags |= IR3_INSTR_G;
- }
+cat6_a3xx_atomic_opc: T_OP_ATOMIC_S_ADD { new_instr(OPC_ATOMIC_S_ADD); }
+| T_OP_ATOMIC_S_SUB { new_instr(OPC_ATOMIC_S_SUB); }
+| T_OP_ATOMIC_S_XCHG { new_instr(OPC_ATOMIC_S_XCHG); }
+| T_OP_ATOMIC_S_INC { new_instr(OPC_ATOMIC_S_INC); }
+| T_OP_ATOMIC_S_DEC { new_instr(OPC_ATOMIC_S_DEC); }
+| T_OP_ATOMIC_S_CMPXCHG { new_instr(OPC_ATOMIC_S_CMPXCHG); }
+| T_OP_ATOMIC_S_MIN { new_instr(OPC_ATOMIC_S_MIN); }
+| T_OP_ATOMIC_S_MAX { new_instr(OPC_ATOMIC_S_MAX); }
+| T_OP_ATOMIC_S_AND { new_instr(OPC_ATOMIC_S_AND); }
+| T_OP_ATOMIC_S_OR { new_instr(OPC_ATOMIC_S_OR); }
+| T_OP_ATOMIC_S_XOR { new_instr(OPC_ATOMIC_S_XOR); }
+
+cat6_a6xx_atomic_opc: T_OP_ATOMIC_G_ADD { new_instr(OPC_ATOMIC_G_ADD); }
+| T_OP_ATOMIC_G_SUB { new_instr(OPC_ATOMIC_G_SUB); }
+| T_OP_ATOMIC_G_XCHG { new_instr(OPC_ATOMIC_G_XCHG); }
+| T_OP_ATOMIC_G_INC { new_instr(OPC_ATOMIC_G_INC); }
+| T_OP_ATOMIC_G_DEC { new_instr(OPC_ATOMIC_G_DEC); }
+| T_OP_ATOMIC_G_CMPXCHG { new_instr(OPC_ATOMIC_G_CMPXCHG); }
+| T_OP_ATOMIC_G_MIN { new_instr(OPC_ATOMIC_G_MIN); }
+| T_OP_ATOMIC_G_MAX { new_instr(OPC_ATOMIC_G_MAX); }
+| T_OP_ATOMIC_G_AND { new_instr(OPC_ATOMIC_G_AND); }
+| T_OP_ATOMIC_G_OR { new_instr(OPC_ATOMIC_G_OR); }
+| T_OP_ATOMIC_G_XOR { new_instr(OPC_ATOMIC_G_XOR); }
+
+cat6_a3xx_atomic_s: cat6_a3xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src
+
+cat6_a6xx_atomic_g: cat6_a6xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' src ',' src
cat6_atomic_l: cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'l' dst_reg ',' 'l' '[' cat6_reg_or_immed ']' ',' src
-cat6_atomic: cat6_atomic_g
-| cat6_atomic_l
+cat6_atomic: cat6_atomic_l
+| cat6_a3xx_atomic_s
+| cat6_a6xx_atomic_g
cat6_ibo_opc_1src: T_OP_RESINFO { new_instr(OPC_RESINFO); }
@@ -1087,6 +1192,7 @@ cat6_ibo: cat6_ibo_opc_1src cat6_type cat6_dim dst_reg ',' 'g' '[' cat6
cat6_id_opc:
T_OP_GETSPID { new_instr(OPC_GETSPID); }
| T_OP_GETWID { new_instr(OPC_GETWID); }
+| T_OP_GETFIBERID { new_instr(OPC_GETFIBERID); }
cat6_id: cat6_id_opc cat6_type dst_reg
@@ -1102,17 +1208,17 @@ cat6_reg_or_immed: src
cat6_bindless_ibo_opc_1src: T_OP_RESINFO_B { new_instr(OPC_RESINFO); }
-cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD { new_instr(OPC_ATOMIC_ADD)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_SUB { new_instr(OPC_ATOMIC_SUB)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_XCHG { new_instr(OPC_ATOMIC_XCHG)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_INC { new_instr(OPC_ATOMIC_INC)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_DEC { new_instr(OPC_ATOMIC_DEC)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_CMPXCHG { new_instr(OPC_ATOMIC_CMPXCHG)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_MIN { new_instr(OPC_ATOMIC_MIN)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_MAX { new_instr(OPC_ATOMIC_MAX)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_AND { new_instr(OPC_ATOMIC_AND)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_OR { new_instr(OPC_ATOMIC_OR)->flags |= IR3_INSTR_G; dummy_dst(); }
-| T_OP_ATOMIC_B_XOR { new_instr(OPC_ATOMIC_XOR)->flags |= IR3_INSTR_G; dummy_dst(); }
+cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD { new_instr(OPC_ATOMIC_B_ADD); dummy_dst(); }
+| T_OP_ATOMIC_B_SUB { new_instr(OPC_ATOMIC_B_SUB); dummy_dst(); }
+| T_OP_ATOMIC_B_XCHG { new_instr(OPC_ATOMIC_B_XCHG); dummy_dst(); }
+| T_OP_ATOMIC_B_INC { new_instr(OPC_ATOMIC_B_INC); dummy_dst(); }
+| T_OP_ATOMIC_B_DEC { new_instr(OPC_ATOMIC_B_DEC); dummy_dst(); }
+| T_OP_ATOMIC_B_CMPXCHG { new_instr(OPC_ATOMIC_B_CMPXCHG); dummy_dst(); }
+| T_OP_ATOMIC_B_MIN { new_instr(OPC_ATOMIC_B_MIN); dummy_dst(); }
+| T_OP_ATOMIC_B_MAX { new_instr(OPC_ATOMIC_B_MAX); dummy_dst(); }
+| T_OP_ATOMIC_B_AND { new_instr(OPC_ATOMIC_B_AND); dummy_dst(); }
+| T_OP_ATOMIC_B_OR { new_instr(OPC_ATOMIC_B_OR); dummy_dst(); }
+| T_OP_ATOMIC_B_XOR { new_instr(OPC_ATOMIC_B_XOR); dummy_dst(); }
| T_OP_STIB_B { new_instr(OPC_STIB); dummy_dst(); }
cat6_bindless_ibo_opc_2src_dst: T_OP_LDIB_B { new_instr(OPC_LDIB); }
@@ -1123,13 +1229,23 @@ cat6_bindless_ibo: cat6_bindless_ibo_opc_1src cat6_typed cat6_dim cat6_type '.'
cat6_bindless_ldc_opc: T_OP_LDC { new_instr(OPC_LDC); }
-cat6_bindless_ldc: cat6_bindless_ldc_opc '.' T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg ',' cat6_reg_or_immed ',' cat6_reg_or_immed {
- instr->cat6.d = $3;
+/* This is separated from the opcode to avoid lookahead/shift-reduce conflicts */
+cat6_bindless_ldc_middle:
+ T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg { instr->cat6.d = $1; }
+| cat6_immed '.' 'k' '.' cat6_bindless_mode 'c' '[' T_A1 ']' { instr->opc = OPC_LDC_K; }
+
+cat6_bindless_ldc: cat6_bindless_ldc_opc '.' cat6_bindless_ldc_middle ',' cat6_reg_or_immed ',' cat6_reg_or_immed {
instr->cat6.type = TYPE_U32;
/* TODO cleanup ir3 src order: */
swap(instr->srcs[0], instr->srcs[1]);
}
+stc_dst: integer { new_src(0, IR3_REG_IMMED)->iim_val = $1; }
+| T_A1 { new_src(0, IR3_REG_IMMED)->iim_val = 0; instr->flags |= IR3_INSTR_A1EN; }
+| T_A1 '+' integer { new_src(0, IR3_REG_IMMED)->iim_val = $3; instr->flags |= IR3_INSTR_A1EN; }
+
+cat6_stc: T_OP_STC { new_instr(OPC_STC); } cat6_type 'c' '[' stc_dst ']' ',' src_reg ',' cat6_immed
+
cat6_todo: T_OP_G2L { new_instr(OPC_G2L); }
| T_OP_L2G { new_instr(OPC_L2G); }
| T_OP_RESFMT { new_instr(OPC_RESFMT); }
@@ -1144,6 +1260,7 @@ cat6_instr: cat6_load
| cat6_id
| cat6_bindless_ldc
| cat6_bindless_ibo
+| cat6_stc
| cat6_todo
cat7_scope: '.' 'w' { instr->cat7.w = true; }
@@ -1195,6 +1312,9 @@ src_reg_flags: src_reg_flag
src_reg: src
| src_reg_flags src
+src_reg_gpr: src_reg
+| relative_gpr_src
+
src_const: const
| src_reg_flags const
diff --git a/lib/mesa/src/freedreno/ir3/ir3_postsched.c b/lib/mesa/src/freedreno/ir3/ir3_postsched.c
index 507302a00..39de84add 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_postsched.c
+++ b/lib/mesa/src/freedreno/ir3/ir3_postsched.c
@@ -68,8 +68,10 @@ struct ir3_postsched_ctx {
struct list_head unscheduled_list; /* unscheduled instructions */
- int sfu_delay;
- int tex_delay;
+ unsigned ip;
+
+ int ss_delay;
+ int sy_delay;
};
struct ir3_postsched_node {
@@ -77,7 +79,9 @@ struct ir3_postsched_node {
struct ir3_instruction *instr;
bool partially_evaluated_path;
- bool has_tex_src, has_sfu_src;
+ unsigned earliest_ip;
+
+ bool has_sy_src, has_ss_src;
unsigned delay;
unsigned max_delay;
@@ -87,17 +91,17 @@ struct ir3_postsched_node {
list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)
static bool
-has_tex_src(struct ir3_instruction *instr)
+has_sy_src(struct ir3_instruction *instr)
{
struct ir3_postsched_node *node = instr->data;
- return node->has_tex_src;
+ return node->has_sy_src;
}
static bool
-has_sfu_src(struct ir3_instruction *instr)
+has_ss_src(struct ir3_instruction *instr)
{
struct ir3_postsched_node *node = instr->data;
- return node->has_sfu_src;
+ return node->has_ss_src;
}
static void
@@ -111,28 +115,45 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
di(instr, "schedule");
- list_addtail(&instr->node, &instr->block->instr_list);
+ bool counts_for_delay = is_alu(instr) || is_flow(instr);
+
+ unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;
struct ir3_postsched_node *n = instr->data;
+
+ /* We insert any nop's needed to get to earliest_ip, then advance
+ * delay_cycles by scheduling the instruction.
+ */
+ ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;
+
+ util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+ unsigned delay = (unsigned)(uintptr_t)edge->data;
+ struct ir3_postsched_node *child =
+ container_of(edge->child, struct ir3_postsched_node, dag);
+ child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
+ }
+
+ list_addtail(&instr->node, &instr->block->instr_list);
+
dag_prune_head(ctx->dag, &n->dag);
if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
return;
- if (is_sfu(instr)) {
- ctx->sfu_delay = 8;
- } else if (has_sfu_src(instr)) {
- ctx->sfu_delay = 0;
- } else if (ctx->sfu_delay > 0) {
- ctx->sfu_delay--;
+ if (is_ss_producer(instr)) {
+ ctx->ss_delay = soft_ss_delay(instr);
+ } else if (has_ss_src(instr)) {
+ ctx->ss_delay = 0;
+ } else if (ctx->ss_delay > 0) {
+ ctx->ss_delay--;
}
- if (is_tex_or_prefetch(instr)) {
- ctx->tex_delay = 10;
- } else if (has_tex_src(instr)) {
- ctx->tex_delay = 0;
- } else if (ctx->tex_delay > 0) {
- ctx->tex_delay--;
+ if (is_sy_producer(instr)) {
+ ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
+ } else if (has_sy_src(instr)) {
+ ctx->sy_delay = 0;
+ } else if (ctx->sy_delay > 0) {
+ ctx->sy_delay--;
}
}
@@ -154,25 +175,26 @@ dump_state(struct ir3_postsched_ctx *ctx)
}
}
-/* Determine if this is an instruction that we'd prefer not to schedule
- * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
- * counter, ie. the more cycles it has been since the last SFU, the less
- * costly a sync would be.
- */
-static bool
-would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+static unsigned
+node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
- if (ctx->sfu_delay) {
- if (has_sfu_src(instr))
- return true;
- }
+ return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
+}
- if (ctx->tex_delay) {
- if (has_tex_src(instr))
- return true;
- }
+static unsigned
+node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
+{
+ unsigned delay = node_delay(ctx, n);
+
+ /* This takes into account that when we schedule multiple tex or sfu, the
+ * first user has to wait for all of them to complete.
+ */
+ if (n->has_ss_src)
+ delay = MAX2(delay, ctx->ss_delay);
+ if (n->has_sy_src)
+ delay = MAX2(delay, ctx->sy_delay);
- return false;
+ return delay;
}
/* find instruction to schedule: */
@@ -215,8 +237,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
/* Next prioritize discards: */
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d =
- ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+ unsigned d = node_delay(ctx, n);
if (d > 0)
continue;
@@ -235,13 +256,12 @@ choose_instr(struct ir3_postsched_ctx *ctx)
/* Next prioritize expensive instructions: */
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d =
- ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+ unsigned d = node_delay_soft(ctx, n);
if (d > 0)
continue;
- if (!(is_sfu(n->instr) || is_tex(n->instr)))
+ if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
continue;
if (!chosen || (chosen->max_delay < n->max_delay))
@@ -249,53 +269,36 @@ choose_instr(struct ir3_postsched_ctx *ctx)
}
if (chosen) {
- di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
+ di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
return chosen->instr;
}
- /*
- * Sometimes be better to take a nop, rather than scheduling an
- * instruction that would require an (ss) shortly after another
- * SFU.. ie. if last SFU was just one or two instr ago, and we
- * could choose between taking a nop and then scheduling
- * something else, vs scheduling the immed avail instruction that
- * would require (ss), we are better with the nop.
- */
- for (unsigned delay = 0; delay < 4; delay++) {
- foreach_sched_node (n, &ctx->dag->heads) {
- if (would_sync(ctx, n->instr))
- continue;
-
- unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
- ctx->v->mergedregs);
-
- if (d > delay)
- continue;
-
- if (!chosen || (chosen->max_delay < n->max_delay))
- chosen = n;
- }
-
- if (chosen) {
- di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
- return chosen->instr;
- }
- }
-
/* Next try to find a ready leader w/ soft delay (ie. including extra
* delay for things like tex fetch which can be synchronized w/ sync
* bit (but we probably do want to schedule some other instructions
- * while we wait)
+ * while we wait). We also allow a small amount of nops, to prefer now-nops
+ * over future-nops up to a point, as that gives better results.
*/
+ unsigned chosen_delay = 0;
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d =
- ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
+ unsigned d = node_delay_soft(ctx, n);
- if (d > 0)
+ if (d > 3)
continue;
- if (!chosen || (chosen->max_delay < n->max_delay))
+ if (!chosen || d < chosen_delay) {
+ chosen = n;
+ chosen_delay = d;
+ continue;
+ }
+
+ if (d > chosen_delay)
+ continue;
+
+ if (chosen->max_delay < n->max_delay) {
chosen = n;
+ chosen_delay = d;
+ }
}
if (chosen) {
@@ -308,8 +311,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
* stalls.. but we've already decided there is not a better option.
*/
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d =
- ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+ unsigned d = node_delay(ctx, n);
if (d > 0)
continue;
@@ -324,9 +326,6 @@ choose_instr(struct ir3_postsched_ctx *ctx)
}
/* Otherwise choose leader with maximum cost:
- *
- * TODO should we try to balance cost and delays? I guess it is
- * a balance between now-nop's and future-nop's?
*/
foreach_sched_node (n, &ctx->dag->heads) {
if (!chosen || chosen->max_delay < n->max_delay)
@@ -361,6 +360,7 @@ struct ir3_postsched_deps_state {
* for full precision and 2nd half for half-precision.
*/
struct ir3_postsched_node *regs[2 * 256];
+ unsigned dst_n[2 * 256];
};
/* bounds checking read/write accessors, since OoB access to stuff on
@@ -374,7 +374,8 @@ struct ir3_postsched_deps_state {
static void
add_dep(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *before, struct ir3_postsched_node *after)
+ struct ir3_postsched_node *before, struct ir3_postsched_node *after,
+ unsigned d)
{
if (!before || !after)
return;
@@ -382,30 +383,36 @@ add_dep(struct ir3_postsched_deps_state *state,
assert(before != after);
if (state->direction == F) {
- dag_add_edge(&before->dag, &after->dag, NULL);
+ dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
} else {
- dag_add_edge(&after->dag, &before->dag, NULL);
+ dag_add_edge_max_data(&after->dag, &before->dag, 0);
}
}
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
- struct ir3_postsched_node *node, unsigned num, int src_n)
+ struct ir3_postsched_node *node, unsigned num, int src_n,
+ int dst_n)
{
struct ir3_postsched_node *dep = dep_reg(state, num);
+ unsigned d = 0;
if (src_n >= 0 && dep && state->direction == F) {
- unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
- node->delay = MAX2(node->delay, d);
- if (is_tex_or_prefetch(dep->instr))
- node->has_tex_src = true;
- if (is_tex_or_prefetch(dep->instr))
- node->has_sfu_src = true;
- }
-
- add_dep(state, dep, node);
+ /* get the dst_n this corresponds to */
+ unsigned dst_n = state->dst_n[num];
+ unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
+ d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
+ node->delay = MAX2(node->delay, d_soft);
+ if (is_sy_producer(dep->instr))
+ node->has_sy_src = true;
+ if (is_ss_producer(dep->instr))
+ node->has_ss_src = true;
+ }
+
+ add_dep(state, dep, node, d);
if (src_n < 0) {
dep_reg(state, num) = node;
+ state->dst_n[num] = dst_n;
}
}
@@ -413,15 +420,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state,
* between half and full precision that result in additional dependencies.
* The 'reg' arg is really just to know half vs full precision.
*
- * If non-negative, then this adds a dependency on a source register, and
+ * If src_n is non-negative, then this adds a dependency on a source register, and
* src_n is the index passed into ir3_delayslots() for calculating the delay:
- * If positive, corresponds to node->instr->regs[src_n]. If negative, then
- * this is for a destination register.
+ * it corresponds to node->instr->srcs[src_n]. If src_n is negative, then
+ * this is for the destination register corresponding to dst_n.
*/
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
struct ir3_postsched_node *node, const struct ir3_register *reg,
- unsigned num, int src_n)
+ unsigned num, int src_n, int dst_n)
{
if (state->merged) {
/* Make sure that special registers like a0.x that are written as
@@ -430,16 +437,16 @@ add_reg_dep(struct ir3_postsched_deps_state *state,
*/
if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
/* single conflict in half-reg space: */
- add_single_reg_dep(state, node, num, src_n);
+ add_single_reg_dep(state, node, num, src_n, dst_n);
} else {
/* two conflicts in half-reg space: */
- add_single_reg_dep(state, node, 2 * num + 0, src_n);
- add_single_reg_dep(state, node, 2 * num + 1, src_n);
+ add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n);
+ add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n);
}
} else {
if (reg->flags & IR3_REG_HALF)
num += ARRAY_SIZE(state->regs) / 2;
- add_single_reg_dep(state, node, num, src_n);
+ add_single_reg_dep(state, node, num, src_n, dst_n);
}
}
@@ -457,12 +464,12 @@ calculate_deps(struct ir3_postsched_deps_state *state,
if (reg->flags & IR3_REG_RELATIV) {
/* mark entire array as read: */
for (unsigned j = 0; j < reg->size; j++) {
- add_reg_dep(state, node, reg, reg->array.base + j, i);
+ add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
}
} else {
assert(reg->wrmask >= 1);
u_foreach_bit (b, reg->wrmask) {
- add_reg_dep(state, node, reg, reg->num + b, i);
+ add_reg_dep(state, node, reg, reg->num + b, i, -1);
}
}
}
@@ -470,18 +477,18 @@ calculate_deps(struct ir3_postsched_deps_state *state,
/* And then after we update the state for what this instruction
* wrote:
*/
- foreach_dst (reg, node->instr) {
+ foreach_dst_n (reg, i, node->instr) {
if (reg->wrmask == 0)
continue;
if (reg->flags & IR3_REG_RELATIV) {
/* mark the entire array as written: */
- for (unsigned i = 0; i < reg->size; i++) {
- add_reg_dep(state, node, reg, reg->array.base + i, -1);
+ for (unsigned j = 0; j < reg->size; j++) {
+ add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
}
} else {
assert(reg->wrmask >= 1);
u_foreach_bit (b, reg->wrmask) {
- add_reg_dep(state, node, reg, reg->num + b, -1);
+ add_reg_dep(state, node, reg, reg->num + b, -1, i);
}
}
}
@@ -593,7 +600,7 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
if (src->block != instr->block)
continue;
- dag_add_edge(&sn->dag, &n->dag, NULL);
+ dag_add_edge_max_data(&sn->dag, &n->dag, 0);
}
if (is_input(instr)) {
@@ -602,14 +609,14 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
struct ir3_instruction *input = *instrp;
struct ir3_postsched_node *in = input->data;
- dag_add_edge(&in->dag, &n->dag, NULL);
+ dag_add_edge_max_data(&in->dag, &n->dag, 0);
}
util_dynarray_append(&kills, struct ir3_instruction *, instr);
} else if (is_tex(instr) || is_mem(instr)) {
util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
struct ir3_instruction *kill = *instrp;
struct ir3_postsched_node *kn = kill->data;
- dag_add_edge(&kn->dag, &n->dag, NULL);
+ dag_add_edge_max_data(&kn->dag, &n->dag, 0);
}
}
}
@@ -630,8 +637,8 @@ static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
ctx->block = block;
- ctx->tex_delay = 0;
- ctx->sfu_delay = 0;
+ ctx->sy_delay = 0;
+ ctx->ss_delay = 0;
/* move all instructions to the unscheduled list, and
* empty the block's instruction list (to which we will
@@ -677,18 +684,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
while (!list_is_empty(&ctx->unscheduled_list)) {
struct ir3_instruction *instr = choose_instr(ctx);
- unsigned delay =
- ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
+ unsigned delay = node_delay(ctx, instr->data);
d("delay=%u", delay);
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
- */
debug_assert(delay <= 6);
- while (delay > 0) {
- ir3_NOP(block);
- delay--;
- }
schedule(ctx, instr);
}
@@ -750,7 +749,6 @@ ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
.v = v,
};
- ir3_remove_nops(ir);
cleanup_self_movs(ir);
foreach_block (block, &ir->block_list) {
diff --git a/lib/mesa/src/freedreno/ir3/ir3_ra.h b/lib/mesa/src/freedreno/ir3/ir3_ra.h
index 259341eaa..c6837aaae 100644
--- a/lib/mesa/src/freedreno/ir3/ir3_ra.h
+++ b/lib/mesa/src/freedreno/ir3/ir3_ra.h
@@ -124,7 +124,7 @@ ra_reg_is_dst(const struct ir3_register *reg)
if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
#define ra_foreach_dst_n(__dstreg, __n, __instr) \
- foreach_dst_n(__dstreg, __n, instr) \
+ foreach_dst_n(__dstreg, __n, __instr) \
if (ra_reg_is_dst(__dstreg))
#define ra_foreach_dst(__dstreg, __instr) \
diff --git a/lib/mesa/src/freedreno/ir3/tests/disasm.c b/lib/mesa/src/freedreno/ir3/tests/disasm.c
index 542469aa1..2f1b89f0d 100644
--- a/lib/mesa/src/freedreno/ir3/tests/disasm.c
+++ b/lib/mesa/src/freedreno/ir3/tests/disasm.c
@@ -43,6 +43,8 @@
#include "isa/isa.h"
/* clang-format off */
+/* Note: @anholt's 4xx disasm was done on an a418 Nexus 5x */
+#define INSTR_4XX(i, d, ...) { .gpu_id = 420, .instr = #i, .expected = d, __VA_ARGS__ }
#define INSTR_5XX(i, d, ...) { .gpu_id = 540, .instr = #i, .expected = d, __VA_ARGS__ }
#define INSTR_6XX(i, d, ...) { .gpu_id = 630, .instr = #i, .expected = d, __VA_ARGS__ }
/* clang-format on */
@@ -58,153 +60,185 @@ static const struct test {
bool parse_fail;
} tests[] = {
/* clang-format off */
- /* cat0 */
- INSTR_6XX(00000000_00000000, "nop"),
- INSTR_6XX(00000200_00000000, "(rpt2)nop"),
- INSTR_6XX(03000000_00000000, "end"),
- INSTR_6XX(00800000_00000004, "br p0.x, #4"),
- INSTR_6XX(00900000_00000003, "br !p0.x, #3"),
- INSTR_6XX(03820000_00000015, "shps #21"), /* emit */
- INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */
- INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */
- INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"),
- INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"),
- INSTR_6XX(07820000_00000000, "prede"),
- INSTR_6XX(00800063_0000001e, "brac.3 #30"),
- INSTR_6XX(06820000_00000000, "predt p0.x"),
- INSTR_6XX(07020000_00000000, "predf p0.x"),
- INSTR_6XX(07820000_00000000, "prede"),
-
- /* cat1 */
- INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"),
- INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"),
- INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"),
- INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"),
- INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"),
- INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"),
- INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"),
- INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"),
- INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"),
- /* dEQP-VK.subgroups.ballot.compute.compute */
- INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */
-
- INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"),
- INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"),
- INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"),
- INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"),
-
- INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"),
- INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"),
- INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"),
- INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"),
-
- /* cat2 */
- INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"),
- INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"),
- INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"),
- INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"),
- INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"),
- INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"),
- INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"),
- INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"),
- INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"),
- INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"),
- INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"),
- INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"),
- INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"),
- INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"),
-
- /* cat3 */
- INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"),
- INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"),
- INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"),
- INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"),
- INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"),
- INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"),
- INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"),
- INSTR_6XX(65900820_100cb008, "(nop3) shlg.b16 hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */
- INSTR_6XX(65ae085c_0002a001, "(nop3) shlg.b16 hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */
- INSTR_6XX(65900820_0c0aac05, "(nop3) shlg.b16 hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */
-
- /* cat4 */
- INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"),
-
- /* cat5 */
- /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */
- INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */
- /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */
- INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */
- /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */
- INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
- INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
-
- INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"),
- INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */
- INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */
- INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"),
- INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"),
- INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"),
- INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"),
- INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"),
- /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */
- INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"),
- INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"),
-
-
- /* cat6 */
-
- INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */
- INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1. r1.x is offset in ibo, r0.x is value*/
- /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */
- INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0. r0.z is offset in ibo as src. r1.z */
- /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */
- INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0. r0.w is offset in ibo as src, and dst */
- /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */
- INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */
- /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */
- INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */
- /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */
- INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */
-
- // TODO is this a real instruction? Or float -6.0 ?
- // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true),
- /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */
- INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */
- INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */
- INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"),
- INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"),
- INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"),
-
- /* Customely crafted */
- INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
- INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
-
- INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
- INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
- INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
- INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
- INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
-
- /* Found in TCS/TES shaders of GTA V */
- INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
-
- /* Customely crafted */
- INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
-
- INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
- INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
- INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
- INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
-
- /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */
- INSTR_6XX(c7020020_01800000, "stc c[32], r0.x, 1", .parse_fail=true),
- /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
- INSTR_6XX(c7060020_03800000, "stc c[32], r0.x, 3", .parse_fail=true),
-
- /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
- INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */
-
- INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 r0.z, r0.x, 2"),
+ /* cat0 */
+ INSTR_6XX(00000000_00000000, "nop"),
+ INSTR_6XX(00000200_00000000, "(rpt2)nop"),
+ INSTR_6XX(03000000_00000000, "end"),
+ INSTR_6XX(00800000_00000004, "br p0.x, #4"),
+ INSTR_6XX(00800000_fffffffc, "br p0.x, #-4"),
+ INSTR_6XX(00900000_00000003, "br !p0.x, #3"),
+ INSTR_6XX(03820000_00000015, "shps #21"), /* emit */
+ INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */
+ INSTR_6XX(02220000_00000004, "getlast.w8 #4"),
+ INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */
+ INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"),
+ INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"),
+ INSTR_6XX(07820000_00000000, "prede"),
+ INSTR_6XX(00800063_0000001e, "brac.3 #30"),
+ INSTR_6XX(06820000_00000000, "predt p0.x"),
+ INSTR_6XX(07020000_00000000, "predf p0.x"),
+ INSTR_6XX(07820000_00000000, "prede"),
+
+ /* cat1 */
+ INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"),
+ INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"),
+ INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"),
+ INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"),
+ INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"),
+ INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"),
+ INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"),
+ INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"),
+ INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"),
+ /* dEQP-VK.subgroups.ballot.compute.compute */
+ INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */
+
+ INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"),
+ INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"),
+ INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"),
+ INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"),
+
+ INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"),
+ INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"),
+ INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"),
+ INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"),
+
+ /* cat2 */
+ INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"),
+ INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"),
+ INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"),
+ INSTR_6XX(47348000_00002000, "flat.b (ei)r0.x, 0, r0.x"),
+ INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"),
+ INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"),
+ INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"),
+ INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"),
+ INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"),
+ INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"),
+ INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"),
+ INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"),
+ INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"),
+ INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"),
+ INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"),
+
+ /* cat3 */
+ INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"),
+ INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"),
+ INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"),
+ INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"),
+ INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"),
+ INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"),
+ INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"),
+ INSTR_6XX(65900820_100cb008, "(nop3) shlg hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */
+ INSTR_6XX(65ae085c_0002a001, "(nop3) shlg hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */
+ INSTR_6XX(65900820_0c0aac05, "(nop3) shlg hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */
+ INSTR_6XX(65ae0c5c_0002a001, "(nop3) shlg r23.x, r0.y, r23.x, r0.z"), /* (nop3) shlg.b32 r23.x, (r)r0.y, (r)r23.x, r0.z */
+ INSTR_6XX(64018802_0002e003, "(nop3) shrm hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+ INSTR_6XX(64818802_0002e003, "(nop3) shlm hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+ INSTR_6XX(65018802_0002e003, "(nop3) shrg hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+ INSTR_6XX(66018802_0002e003, "(nop3) andg hr0.z, (neg)hr0.w, hr0.w, hr0.z"),
+ INSTR_6XX(67018802_1002e003, "(nop3) wmm hr0.z, (neg)hr0.w, hr0.w, 2"), /* (nop3) wmm.f16f16 hr0.z, (abs)(r)hr0.w, (r)hr0.w, 2 */
+ INSTR_6XX(67018c02_1002e003, "(nop3) wmm.accu hr0.z, (neg)hr0.w, hr0.w, 2"),
+ INSTR_6XX(6701c802_9002a003, "(nop3) wmm r0.z, r0.w, r0.w, 2"), /* (nop3) wmm.f32f32 r0.z, (r)r0.w, (r)r0.w, 2 */
+ /* custom test with qcom_dot8 function from cl_qcom_dot_product8 */
+ INSTR_6XX(66818c02_0002e003, "(sat)(nop3) dp2acc.mixed.low r0.z, r0.w, r0.w, r0.z"), /* (nop3) dp2acc (sat)r0.z, (signed)(low)(r)r0.w, (low)(r)r0.w, r0.z */
+ INSTR_6XX(6681c802_8002a003, "(nop3) dp4acc.unsigned.low r0.z, r0.w, r0.w, (neg)r0.z"), /* (nop3) dp4acc r0.z, (unsigned)(r)r0.w, (r)r0.w, (neg)r0.z */
+
+ /* cat4 */
+ INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"),
+
+ /* cat5 */
+ /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */
+ INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */
+ /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */
+ INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */
+ /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */
+ INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
+ INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */
+
+ INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"),
+ INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */
+ INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */
+ INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"),
+ INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"),
+ INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"),
+ INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"),
+ INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"),
+ /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */
+ INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"),
+ INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"),
+ INSTR_6XX(a0c81108_e2000001, "sam.base0 (f32)(x)r2.x, r0.x, s#16, a1.x"),
+ INSTR_6XX(a048d107_cc080a07, "isaml.base3 (s32)(x)r1.w, r0.w, r1.y, s#0, t#6"),
+
+
+ /* dEQP-VK.subgroups.arithmetic.compute.subgroupadd_float */
+ INSTR_6XX(a7c03102_00100003, "brcst.active.w8 (u32)(x)r0.z, r0.y"), /* brcst.active.w8 (u32)(xOOO)r0.z, r0.y */
+ /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */
+ INSTR_6XX(b7e03107_00000401, "(sy)quad_shuffle.brcst (u32)(x)r1.w, r0.x, r0.z"), /* (sy)quad_shuffle.brcst (u32)(xOOO)r1.w, r0.x, r0.z */
+ /* dEQP-VK.subgroups.quad.graphics.subgroupquadswapdiagonal_int */
+ INSTR_6XX(b7e03104_00180001, "(sy)quad_shuffle.diag (u32)(x)r1.x, r0.x"), /* (sy)quad_shuffle.diag (u32)(xOOO)r1.x, r0.x */
+
+ /* cat6 */
+
+ INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */
+ INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1. r1.x is offset in ibo, r0.x is value*/
+ /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */
+ INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0. r0.z is offset in ibo as src. r1.z */
+ /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */
+ INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0. r0.w is offset in ibo as src, and dst */
+ /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */
+ INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */
+ /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */
+ INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */
+ /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */
+ INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */
+
+ // TODO is this a real instruction? Or float -6.0 ?
+ // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true),
+ /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */
+ INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */
+ INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */
+ INSTR_6XX(c0dc052e_01800042, "stg.a.u8 g[r0.z+(r11.z)<<2], hr8.y, 1"),
+ INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"),
+ INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"),
+ INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"),
+ INSTR_5XX(c0ce0100_02800000, "stg.s8 g[r0.x], hr0.x, 2"),
+ INSTR_5XX(c0c00100_02800000, "stg.f16 g[r0.x], hr0.x, 2"),
+
+ /* Custom-crafted */
+ INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
+ INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
+
+ INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+ INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+ INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+ INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+ INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
+ INSTR_6XX(c0040003_0180c269, "ldg.u16 hr0.w, g[r0.w+308], 1"),
+
+ /* Found in TCS/TES shaders of GTA V */
+ INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
+
+ /* Custom-crafted */
+ INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
+
+ INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+ INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+ INSTR_6XX(c0000006_01c18017, "ldg.a.f16 hr1.z, g[r1.z+(r2.w)<<2], 1"),
+ INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+ INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+
+ /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */
+ INSTR_6XX(c7020020_01800000, "stc.f32 c[32], r0.x, 1"), /* stc c[32], r0.x, 1 */
+ /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
+ INSTR_6XX(c7060020_03800000, "stc.u32 c[32], r0.x, 3"), /* stc c[32], r0.x, 3 */
+
+ /* custom */
+ INSTR_6XX(c7060100_03800000, "stc.u32 c[a1.x], r0.x, 3"), /* stc c[a1.x], r0.x, 3 */
+ INSTR_6XX(c7060120_03800000, "stc.u32 c[a1.x+32], r0.x, 3"), /* stc c[a1.x+32], r0.x, 3 */
+
+ /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
+ INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */
+
+ INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 hr0.z, r0.x, 2"),
#if 0
/* TODO blob sometimes/frequently sets b0, although there does not seem
* to be an obvious pattern and our encoding never sets it. AFAICT it
@@ -298,6 +332,13 @@ static const struct test {
INSTR_6XX(c0260000_00c78080, "ldc.offset0.1.nonuniform r0.x, 0, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */
INSTR_6XX(c0260201_00c78080, "ldc.offset0.1.nonuniform r0.y, 0, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */
+ /* a4xx-a5xx has the exact same instrs in
+ * dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.(dynamically_)uniform_fragment
+ * with no change based on the mode. Note that we can't decode this yet.
+ */
+ /* INSTR_4XX(c7860000_00810001), */ /* ldc.1 r0.x, g[r1.x], 0, r0.x */
+ /* INSTR_5XX(c7860000_00800000), */ /* ldc.a.1 r0.x, g[r0.x], 0, r0.x */
+
/* custom */
INSTR_6XX(c0260201_ffc78080, "ldc.offset0.1.nonuniform r0.y, 255, r0.y"), /* ldc.1.mode2.base0 r0.y, 255, r0.y */
@@ -307,6 +348,11 @@ static const struct test {
INSTR_6XX(c0260000_00478400, "ldc.offset2.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */
INSTR_6XX(c0260000_00478600, "ldc.offset3.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */
+ /* dEQP-VK.glsl.conditionals.if.if_else_vertex */
+ INSTR_6XX(c0360000_00c78100, "ldc.1.k.imm.base0 c[a1.x], 0, 0"), /* ldc.1.k.mode4.base0 c[a1.x], 0, 0 */
+ /* custom */
+ INSTR_6XX(c0360003_00c78100, "ldc.4.k.imm.base0 c[a1.x], 0, 0"), /* ldc.4.k.mode4.base0 c[a1.x], 0, 0 */
+
/* dEQP-VK.glsl.struct.local.nested_struct_array_dynamic_index_fragment */
INSTR_6XX(c1425b50_01803e02, "stp.f32 p[r11.y-176], r0.y, 1"),
INSTR_6XX(c1425b98_02803e14, "stp.f32 p[r11.y-104], r2.z, 2"),
@@ -318,14 +364,17 @@ static const struct test {
/* Atomic: */
#if 0
/* TODO our encoding differs in b53 for these two */
- INSTR_5XX(c4d60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
- INSTR_5XX(c4160205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
+ INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
+ INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
#else
- INSTR_5XX(c4f60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
- INSTR_5XX(c4360205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
+ INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"),
+ INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"),
#endif
INSTR_6XX(d5c60003_03008001, "(sy)atomic.max.untyped.1d.u32.1.l r0.w, l[r0.z], r0.w"),
+ /* dEQP-VK.glsl.atomic_operations.add_unsigned_compute_reference */
+ INSTR_6XX(c4160002_02000001, "atomic.g.add.untyped.1d.u32.1.g r0.z, r0.x, r0.z"),
+
/* Bindless atomic: */
INSTR_6XX(c03a0003_01640000, "atomic.b.add.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.add.g.s32.1d.mode0.base0 r0.w,r0.y,0 */
INSTR_6XX(c03a0003_01660000, "atomic.b.and.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.and.g.s32.1d.mode0.base0 r0.w,r0.y,0 */
@@ -333,10 +382,14 @@ static const struct test {
/* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_literal.fragment.sampler2d */
INSTR_6XX(a0c01f04_0cc00005, "sam (f32)(xyzw)r1.x, r0.z, s#6, t#6"),
- /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d (looks like maybe the compiler didn't figure out */
- INSTR_6XX(a0c81f07_0100000b, "sam.s2en (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */
+
+ /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d */
+ INSTR_4XX(a0c81f02_00800001, "sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.mode0 (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */
+ INSTR_6XX(a0c81f07_0100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */
+
/* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.dynamically_uniform.fragment.sampler2d */
- INSTR_6XX(a0c81f07_8100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x", .parse_fail=true), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */
+ INSTR_4XX(a0c81f02_80800001, "sam.s2en.nonuniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */
+ INSTR_6XX(a0c81f07_8100000b, "sam.s2en.nonuniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */
/* NonUniform: */
/* dEQP-VK.descriptor_indexing.storage_buffer */
@@ -349,6 +402,9 @@ static const struct test {
/* dEQP-VK.descriptor_indexing.sampler */
INSTR_6XX(a0c81f00_40000005, "sam.s2en.nonuniform.base0 (f32)(xyzw)r0.x, r0.z, r0.x"),
+ /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */
+ INSTR_6XX(c0260001_00c98000, "getfiberid.u32 r0.y"),
+
/* Custom test since we've never seen the blob emit these. */
INSTR_6XX(c0260004_00490000, "getspid.u32 r1.x"),
INSTR_6XX(c0260005_00494000, "getwid.u32 r1.y"),
@@ -416,7 +472,6 @@ main(int argc, char **argv)
printf(" Got: \"%s\"\n", disasm_output);
retval = 1;
decode_fails++;
- continue;
}
/*
@@ -426,7 +481,8 @@ main(int argc, char **argv)
unsigned gen = test->gpu_id / 100;
if (!compilers[gen]) {
dev_ids[gen].gpu_id = test->gpu_id;
- compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen], false);
+ compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen],
+ &(struct ir3_compiler_options){});
}
FILE *fasm =