diff options
Diffstat (limited to 'lib/mesa/src/freedreno/ir3')
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_dce.c | 6 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_delay.c | 223 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_lexer.l | 60 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c | 138 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_parser.y | 170 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_postsched.c | 244 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/ir3_ra.h | 2 | ||||
-rw-r--r-- | lib/mesa/src/freedreno/ir3/tests/disasm.c | 368 |
8 files changed, 598 insertions, 613 deletions
diff --git a/lib/mesa/src/freedreno/ir3/ir3_dce.c b/lib/mesa/src/freedreno/ir3/ir3_dce.c index 76298e64a..a3ddbe802 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_dce.c +++ b/lib/mesa/src/freedreno/ir3/ir3_dce.c @@ -53,8 +53,10 @@ instr_dce(struct ir3_instruction *instr, bool falsedep) if (ir3_instr_check_mark(instr)) return; - if (writes_gpr(instr)) - mark_array_use(instr, instr->dsts[0]); /* dst */ + foreach_dst (dst, instr) { + if (is_dest_gpr(dst)) + mark_array_use(instr, dst); + } foreach_src (reg, instr) mark_array_use(instr, reg); /* src */ diff --git a/lib/mesa/src/freedreno/ir3/ir3_delay.c b/lib/mesa/src/freedreno/ir3/ir3_delay.c index 14bb403b9..054f4c831 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_delay.c +++ b/lib/mesa/src/freedreno/ir3/ir3_delay.c @@ -30,19 +30,6 @@ */ #define MAX_NOPS 6 -/* The soft delay for approximating the cost of (ss). On a6xx, it takes the - * number of delay slots to get a SFU result back (ie. using nop's instead of - * (ss) is: - * - * 8 - single warp - * 9 - two warps - * 10 - four warps - * - * and so on. Not quite sure where it tapers out (ie. how many warps share an - * SFU unit). But 10 seems like a reasonable # to choose: - */ -#define SOFT_SS_NOPS 10 - /* * Helpers to figure out the necessary delay slots between instructions. Used * both in scheduling pass(es) and the final pass to insert any required nop's @@ -76,11 +63,11 @@ ir3_delayslots(struct ir3_instruction *assigner, if (writes_addr0(assigner) || writes_addr1(assigner)) return 6; - if (soft && is_sfu(assigner)) - return SOFT_SS_NOPS; + if (soft && is_ss_producer(assigner)) + return soft_ss_delay(assigner); /* handled via sync flags: */ - if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) + if (is_ss_producer(assigner) || is_sy_producer(assigner)) return 0; /* As far as we know, shader outputs don't need any delay. */ @@ -89,7 +76,7 @@ ir3_delayslots(struct ir3_instruction *assigner, /* assigner must be alu: */ if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || - is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) { + is_mem(consumer)) { return 6; } else { /* In mergedregs mode, there is an extra 2-cycle penalty when half of @@ -119,74 +106,6 @@ count_instruction(struct ir3_instruction *n) (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B)); } -static unsigned -distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd) -{ - unsigned d = 0; - - /* Note that this relies on incrementally building up the block's - * instruction list.. but this is how scheduling and nopsched - * work. - */ - foreach_instr_rev (n, &block->instr_list) { - if ((n == instr) || (d >= maxd)) - return MIN2(maxd, d + n->nop); - if (count_instruction(n)) - d = MIN2(maxd, d + 1 + n->repeat + n->nop); - } - - return maxd; -} - -static unsigned -delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned srcn) -{ - unsigned delay = 0; - - if (assigner->opc == OPC_META_PHI) - return 0; - - if (is_meta(assigner)) { - foreach_src_n (src, n, assigner) { - unsigned d; - - if (!src->def) - continue; - - d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn); - delay = MAX2(delay, d); - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn, false); - delay -= distance(block, assigner, delay); - } - - return delay; -} - -/** - * Calculate delay for instruction before register allocation, using SSA - * source pointers. This can't handle inter-block dependencies. - */ -unsigned -ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr) -{ - unsigned delay = 0; - - foreach_src_n (src, i, instr) { - unsigned d = 0; - - if (src->def && src->def->instr->block == block) { - d = delay_calc_srcn_prera(block, src->def->instr, instr, i); - } - - delay = MAX2(delay, d); - } - - return delay; -} - /* Post-RA, we don't have arrays any more, so we have to be a bit careful here * and have to handle relative accesses specially. */ @@ -207,35 +126,21 @@ post_ra_reg_num(struct ir3_register *reg) return reg->num; } -static unsigned -delay_calc_srcn_postra(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned assigner_n, - unsigned consumer_n, bool soft, bool mergedregs) +unsigned +ir3_delayslots_with_repeat(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, + unsigned assigner_n, unsigned consumer_n) { + unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false); + struct ir3_register *src = consumer->srcs[consumer_n]; struct ir3_register *dst = assigner->dsts[assigner_n]; - bool mismatched_half = - (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF); - /* In the mergedregs case or when the register is a special register, - * half-registers do not alias with full registers. - */ - if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) && - mismatched_half) - return 0; + if (assigner->repeat == 0 && consumer->repeat == 0) + return delay; unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src); - unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src); unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst); - unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst); - - if (dst_start >= src_end || src_start >= dst_end) - return 0; - - unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft); - - if (assigner->repeat == 0 && consumer->repeat == 0) - return delay; /* If either side is a relative access, we can't really apply most of the * reasoning below because we don't know which component aliases which. @@ -250,6 +155,9 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner, if (assigner->opc == OPC_MOVMSK) return delay; + bool mismatched_half = + (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF); + /* TODO: Handle the combination of (rpt) and different component sizes * better like below. This complicates things significantly because the * components don't line up. @@ -303,10 +211,41 @@ delay_calc_srcn_postra(struct ir3_instruction *assigner, } static unsigned -delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start, - struct ir3_instruction *consumer, unsigned distance, - bool soft, bool pred, bool mergedregs) +delay_calc_srcn(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned assigner_n, + unsigned consumer_n, bool mergedregs) +{ + struct ir3_register *src = consumer->srcs[consumer_n]; + struct ir3_register *dst = assigner->dsts[assigner_n]; + bool mismatched_half = + (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF); + + /* In the mergedregs case or when the register is a special register, + * half-registers do not alias with full registers. + */ + if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) && + mismatched_half) + return 0; + + unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src); + unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src); + unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst); + unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst); + + if (dst_start >= src_end || src_start >= dst_end) + return 0; + + return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n); +} + +static unsigned +delay_calc(struct ir3_block *block, struct ir3_instruction *start, + struct ir3_instruction *consumer, unsigned distance, + regmask_t *in_mask, bool mergedregs) { + regmask_t mask; + memcpy(&mask, in_mask, sizeof(mask)); + unsigned delay = 0; /* Search backwards starting at the instruction before start, unless it's * NULL then search backwards from the block end. @@ -318,7 +257,7 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start, if (count_instruction(assigner)) distance += assigner->nop; - if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS)) + if (distance + delay >= MAX_NOPS) return delay; if (is_meta(assigner)) @@ -329,14 +268,17 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start, foreach_dst_n (dst, dst_n, assigner) { if (dst->wrmask == 0) continue; + if (!regmask_get(&mask, dst)) + continue; foreach_src_n (src, src_n, consumer) { if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) continue; - unsigned src_delay = delay_calc_srcn_postra( - assigner, consumer, dst_n, src_n, soft, mergedregs); + unsigned src_delay = delay_calc_srcn( + assigner, consumer, dst_n, src_n, mergedregs); new_delay = MAX2(new_delay, src_delay); } + regmask_clear(&mask, dst); } new_delay = new_delay > distance ? new_delay - distance : 0; @@ -360,13 +302,13 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start, * However any other recursion would be unnecessary. */ - if (pred && block->data != block) { + if (block->data != block) { block->data = block; for (unsigned i = 0; i < block->predecessors_count; i++) { struct ir3_block *pred = block->predecessors[i]; - unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance, - soft, pred, mergedregs); + unsigned pred_delay = delay_calc(pred, NULL, consumer, distance, + &mask, mergedregs); delay = MAX2(delay, pred_delay); } @@ -377,50 +319,19 @@ delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start, } /** - * Calculate delay for post-RA scheduling based on physical registers but not - * exact (i.e. don't recurse into predecessors, and make it possible to - * estimate impact of sync flags). - * - * @soft: If true, add additional delay for situations where they - * would not be strictly required because a sync flag would be - * used (but scheduler would prefer to schedule some other - * instructions first to avoid stalling on sync flag) - * @mergedregs: True if mergedregs is enabled. - */ -unsigned -ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr, - bool soft, bool mergedregs) -{ - return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs); -} - -/** * Calculate delay for nop insertion. This must exactly match hardware * requirements, including recursing into predecessor blocks. */ unsigned -ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr, - bool mergedregs) +ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, + bool mergedregs) { - return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs); -} - -/** - * Remove nop instructions. The scheduler can insert placeholder nop's - * so that ir3_delay_calc() can account for nop's that won't be needed - * due to nop's triggered by a previous instruction. However, before - * legalize, we want to remove these. The legalize pass can insert - * some nop's if needed to hold (for example) sync flags. This final - * remaining nops are inserted by legalize after this. - */ -void -ir3_remove_nops(struct ir3 *ir) -{ - foreach_block (block, &ir->block_list) { - foreach_instr_safe (instr, &block->instr_list) { - if (instr->opc == OPC_NOP) { - list_del(&instr->node); - } - } + regmask_t mask; + regmask_init(&mask, mergedregs); + foreach_src (src, instr) { + if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST))) + regmask_set(&mask, src); } + + return delay_calc(block, NULL, instr, 0, &mask, mergedregs); } diff --git a/lib/mesa/src/freedreno/ir3/ir3_lexer.l b/lib/mesa/src/freedreno/ir3/ir3_lexer.l index 2d5582e5b..52b977896 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_lexer.l +++ b/lib/mesa/src/freedreno/ir3/ir3_lexer.l @@ -72,16 +72,6 @@ static int parse_reg(const char *str) return num; } -static int parse_w(const char *str) -{ - str++; - unsigned num = strtol(str, NULL, 10); - if ((num % 32) != 0) - yy_fatal_error("w# must be multiple of 32"); - if (num < 32) - yy_fatal_error("w# must be at least 32"); - return num / 32; -} %} %option noyywrap @@ -139,7 +129,7 @@ static int parse_w(const char *str) "a0.x" return T_A0; "a1.x" return T_A1; "p0."[xyzw] ir3_yylval.num = parse_reg(yytext); return T_P0; -"w"[0-9]+ ir3_yylval.num = parse_w(yytext); return T_W; +"w"[0-9]+ ir3_yylval.num = strtol(yytext+1, NULL, 10); return T_W; "s#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_SAMP; "t#"[0-9]+ ir3_yylval.num = strtol(yytext+2, NULL, 10); return T_TEX; @@ -167,6 +157,7 @@ static int parse_w(const char *str) "stkr" return TOKEN(T_OP_STKR); "xset" return TOKEN(T_OP_XSET); "xclr" return TOKEN(T_OP_XCLR); +"getlast" return TOKEN(T_OP_GETLAST); "getone" return TOKEN(T_OP_GETONE); "dbg" return TOKEN(T_OP_DBG); "shps" return TOKEN(T_OP_SHPS); @@ -228,6 +219,7 @@ static int parse_w(const char *str) "shr.b" return TOKEN(T_OP_SHR_B); "ashr.b" return TOKEN(T_OP_ASHR_B); "bary.f" return TOKEN(T_OP_BARY_F); +"flat.b" return TOKEN(T_OP_FLAT_B); "mgen.b" return TOKEN(T_OP_MGEN_B); "getbit.b" return TOKEN(T_OP_GETBIT_B); "setrm" return TOKEN(T_OP_SETRM); @@ -252,7 +244,15 @@ static int parse_w(const char *str) "sel.f32" return TOKEN(T_OP_SEL_F32); "sad.s16" return TOKEN(T_OP_SAD_S16); "sad.s32" return TOKEN(T_OP_SAD_S32); -"shlg.b16" return TOKEN(T_OP_SHLG_B16); +"shrm" return TOKEN(T_OP_SHRM); +"shlm" return TOKEN(T_OP_SHLM); +"shrg" return TOKEN(T_OP_SHRG); +"shlg" return TOKEN(T_OP_SHLG); +"andg" return TOKEN(T_OP_ANDG); +"dp2acc" return TOKEN(T_OP_DP2ACC); +"dp4acc" return TOKEN(T_OP_DP4ACC); +"wmm" return TOKEN(T_OP_WMM); +"wmm.accu" return TOKEN(T_OP_WMM_ACCU); /* category 4: */ "rcp" return TOKEN(T_OP_RCP); @@ -295,6 +295,11 @@ static int parse_w(const char *str) "dsypp.1" return TOKEN(T_OP_DSYPP_1); "rgetpos" return TOKEN(T_OP_RGETPOS); "rgetinfo" return TOKEN(T_OP_RGETINFO); +"brcst.active" return TOKEN(T_OP_BRCST_A); +"quad_shuffle.brcst" return TOKEN(T_OP_QSHUFFLE_BRCST); +"quad_shuffle.horiz" return TOKEN(T_OP_QSHUFFLE_H); +"quad_shuffle.vert" return TOKEN(T_OP_QSHUFFLE_V); +"quad_shuffle.diag" return TOKEN(T_OP_QSHUFFLE_DIAG); /* category 6: */ "ldg" return TOKEN(T_OP_LDG); @@ -338,6 +343,29 @@ static int parse_w(const char *str) "atomic.b.and" return TOKEN(T_OP_ATOMIC_B_AND); "atomic.b.or" return TOKEN(T_OP_ATOMIC_B_OR); "atomic.b.xor" return TOKEN(T_OP_ATOMIC_B_XOR); +"atomic.s.add" return TOKEN(T_OP_ATOMIC_S_ADD); +"atomic.s.sub" return TOKEN(T_OP_ATOMIC_S_SUB); +"atomic.s.xchg" return TOKEN(T_OP_ATOMIC_S_XCHG); +"atomic.s.inc" return TOKEN(T_OP_ATOMIC_S_INC); +"atomic.s.dec" return TOKEN(T_OP_ATOMIC_S_DEC); +"atomic.s.cmpxchg" return TOKEN(T_OP_ATOMIC_S_CMPXCHG); +"atomic.s.min" return TOKEN(T_OP_ATOMIC_S_MIN); +"atomic.s.max" return TOKEN(T_OP_ATOMIC_S_MAX); +"atomic.s.and" return TOKEN(T_OP_ATOMIC_S_AND); +"atomic.s.or" return TOKEN(T_OP_ATOMIC_S_OR); +"atomic.s.xor" return TOKEN(T_OP_ATOMIC_S_XOR); +"atomic.g.add" return TOKEN(T_OP_ATOMIC_G_ADD); +"atomic.g.sub" return TOKEN(T_OP_ATOMIC_G_SUB); +"atomic.g.xchg" return TOKEN(T_OP_ATOMIC_G_XCHG); +"atomic.g.inc" return TOKEN(T_OP_ATOMIC_G_INC); +"atomic.g.dec" return TOKEN(T_OP_ATOMIC_G_DEC); +"atomic.g.cmpxchg" return TOKEN(T_OP_ATOMIC_G_CMPXCHG); +"atomic.g.min" return TOKEN(T_OP_ATOMIC_G_MIN); +"atomic.g.max" return TOKEN(T_OP_ATOMIC_G_MAX); +"atomic.g.and" return TOKEN(T_OP_ATOMIC_G_AND); +"atomic.g.or" return TOKEN(T_OP_ATOMIC_G_OR); +"atomic.g.xor" return TOKEN(T_OP_ATOMIC_G_XOR); + "ldgb" return TOKEN(T_OP_LDGB); "stgb" return TOKEN(T_OP_STGB); "stib" return TOKEN(T_OP_STIB); @@ -345,6 +373,8 @@ static int parse_w(const char *str) "ldlv" return TOKEN(T_OP_LDLV); "getspid" return TOKEN(T_OP_GETSPID); "getwid" return TOKEN(T_OP_GETWID); +"getfiberid" return TOKEN(T_OP_GETFIBERID); +"stc" return TOKEN(T_OP_STC); /* category 7: */ "bar" return TOKEN(T_OP_BAR); @@ -362,6 +392,11 @@ static int parse_w(const char *str) "untyped" return TOKEN(T_UNTYPED); "typed" return TOKEN(T_TYPED); +"unsigned" return TOKEN(T_UNSIGNED); +"mixed" return TOKEN(T_MIXED); +"low" return TOKEN(T_LOW); +"high" return TOKEN(T_HIGH); + "1d" return TOKEN(T_1D); "2d" return TOKEN(T_2D); "3d" return TOKEN(T_3D); @@ -379,6 +414,7 @@ static int parse_w(const char *str) "p" return 'p'; "s2en" return TOKEN(T_S2EN); "s" return 's'; +"k" return 'k'; "base"[0-9]+ ir3_yylval.num = strtol(yytext+4, NULL, 10); return T_BASE; "offset"[0-9]+ ir3_yylval.num = strtol(yytext+6, NULL, 10); return T_OFFSET; "uniform" return T_UNIFORM; diff --git a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c deleted file mode 100644 index 37a3dcb26..000000000 --- a/lib/mesa/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright © 2017 Ilia Mirkin - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "ir3_nir.h" -#include "compiler/nir/nir_builder.h" - -/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the - * gather results, rather than before. As a result, it must be emulated with - * direct texture calls. - */ - -static bool -lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx) -{ - bool progress = false; - - static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} }; - - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_tex) - continue; - - nir_tex_instr *tg4 = (nir_tex_instr *)instr; - - if (tg4->op != nir_texop_tg4) - continue; - - b->cursor = nir_before_instr(&tg4->instr); - - nir_ssa_def *results[4]; - int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset); - for (int i = 0; i < 4; i++) { - int num_srcs = tg4->num_srcs + 1 /* lod */; - if (offset_index < 0 && i < 3) - num_srcs++; - - nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); - tex->op = nir_texop_txl; - tex->sampler_dim = tg4->sampler_dim; - tex->coord_components = tg4->coord_components; - tex->is_array = tg4->is_array; - tex->is_shadow = tg4->is_shadow; - tex->is_new_style_shadow = tg4->is_new_style_shadow; - tex->texture_index = tg4->texture_index; - tex->sampler_index = tg4->sampler_index; - tex->dest_type = tg4->dest_type; - - for (int j = 0; j < tg4->num_srcs; j++) { - nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex); - tex->src[j].src_type = tg4->src[j].src_type; - } - if (i != 3) { - nir_ssa_def *offset = - nir_vec2(b, nir_imm_int(b, offsets[i][0]), - nir_imm_int(b, offsets[i][1])); - if (offset_index < 0) { - tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset); - tex->src[tg4->num_srcs].src_type = nir_tex_src_offset; - } else { - assert(nir_tex_instr_src_size(tex, offset_index) == 2); - nir_ssa_def *orig = nir_ssa_for_src( - b, tex->src[offset_index].src, 2); - tex->src[offset_index].src = - nir_src_for_ssa(nir_iadd(b, orig, offset)); - } - } - tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0)); - tex->src[num_srcs - 1].src_type = nir_tex_src_lod; - - nir_ssa_dest_init(&tex->instr, &tex->dest, - nir_tex_instr_dest_size(tex), 32, NULL); - nir_builder_instr_insert(b, &tex->instr); - - results[i] = nir_channel(b, &tex->dest.ssa, tg4->component); - } - - nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]); - nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result)); - - nir_instr_remove(&tg4->instr); - - progress = true; - } - - return progress; -} - -static bool -lower_tg4_func(nir_function_impl *impl) -{ - void *mem_ctx = ralloc_parent(impl); - nir_builder b; - nir_builder_init(&b, impl); - - bool progress = false; - nir_foreach_block_safe(block, impl) { - progress |= lower_tg4(block, &b, mem_ctx); - } - - if (progress) - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); - - return progress; -} - -bool -ir3_nir_lower_tg4_to_tex(nir_shader *shader) -{ - bool progress = false; - - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lower_tg4_func(function->impl); - } - - return progress; -} diff --git a/lib/mesa/src/freedreno/ir3/ir3_parser.y b/lib/mesa/src/freedreno/ir3/ir3_parser.y index acd94b35a..fd29c639d 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_parser.y +++ b/lib/mesa/src/freedreno/ir3/ir3_parser.y @@ -399,6 +399,7 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_STKR %token <tok> T_OP_XSET %token <tok> T_OP_XCLR +%token <tok> T_OP_GETLAST %token <tok> T_OP_GETONE %token <tok> T_OP_DBG %token <tok> T_OP_SHPS @@ -458,6 +459,7 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_SHR_B %token <tok> T_OP_ASHR_B %token <tok> T_OP_BARY_F +%token <tok> T_OP_FLAT_B %token <tok> T_OP_MGEN_B %token <tok> T_OP_GETBIT_B %token <tok> T_OP_SETRM @@ -482,7 +484,15 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_SEL_F32 %token <tok> T_OP_SAD_S16 %token <tok> T_OP_SAD_S32 -%token <tok> T_OP_SHLG_B16 +%token <tok> T_OP_SHRM +%token <tok> T_OP_SHLM +%token <tok> T_OP_SHRG +%token <tok> T_OP_SHLG +%token <tok> T_OP_ANDG +%token <tok> T_OP_DP2ACC +%token <tok> T_OP_DP4ACC +%token <tok> T_OP_WMM +%token <tok> T_OP_WMM_ACCU /* category 4: */ %token <tok> T_OP_RCP @@ -525,6 +535,11 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_DSYPP_1 %token <tok> T_OP_RGETPOS %token <tok> T_OP_RGETINFO +%token <tok> T_OP_BRCST_A +%token <tok> T_OP_QSHUFFLE_BRCST +%token <tok> T_OP_QSHUFFLE_H +%token <tok> T_OP_QSHUFFLE_V +%token <tok> T_OP_QSHUFFLE_DIAG /* category 6: */ %token <tok> T_OP_LDG @@ -568,6 +583,28 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_ATOMIC_B_AND %token <tok> T_OP_ATOMIC_B_OR %token <tok> T_OP_ATOMIC_B_XOR +%token <tok> T_OP_ATOMIC_S_ADD +%token <tok> T_OP_ATOMIC_S_SUB +%token <tok> T_OP_ATOMIC_S_XCHG +%token <tok> T_OP_ATOMIC_S_INC +%token <tok> T_OP_ATOMIC_S_DEC +%token <tok> T_OP_ATOMIC_S_CMPXCHG +%token <tok> T_OP_ATOMIC_S_MIN +%token <tok> T_OP_ATOMIC_S_MAX +%token <tok> T_OP_ATOMIC_S_AND +%token <tok> T_OP_ATOMIC_S_OR +%token <tok> T_OP_ATOMIC_S_XOR +%token <tok> T_OP_ATOMIC_G_ADD +%token <tok> T_OP_ATOMIC_G_SUB +%token <tok> T_OP_ATOMIC_G_XCHG +%token <tok> T_OP_ATOMIC_G_INC +%token <tok> T_OP_ATOMIC_G_DEC +%token <tok> T_OP_ATOMIC_G_CMPXCHG +%token <tok> T_OP_ATOMIC_G_MIN +%token <tok> T_OP_ATOMIC_G_MAX +%token <tok> T_OP_ATOMIC_G_AND +%token <tok> T_OP_ATOMIC_G_OR +%token <tok> T_OP_ATOMIC_G_XOR %token <tok> T_OP_LDGB %token <tok> T_OP_STGB %token <tok> T_OP_STIB @@ -575,6 +612,8 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_OP_LDLV %token <tok> T_OP_GETSPID %token <tok> T_OP_GETWID +%token <tok> T_OP_GETFIBERID +%token <tok> T_OP_STC /* category 7: */ %token <tok> T_OP_BAR @@ -593,6 +632,11 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token <tok> T_UNTYPED %token <tok> T_TYPED +%token <tok> T_MIXED +%token <tok> T_UNSIGNED +%token <tok> T_LOW +%token <tok> T_HIGH + %token <tok> T_1D %token <tok> T_2D %token <tok> T_3D @@ -746,7 +790,7 @@ iflag: T_SY { iflags.flags |= IR3_INSTR_SY; } iflags: | iflag iflags -instrs: instr instrs +instrs: instrs instr | instr instr: iflags cat0_instr @@ -800,6 +844,7 @@ cat0_instr: T_OP_NOP { new_instr(OPC_NOP); } | T_OP_PREDT { new_instr(OPC_PREDT); } cat0_src1 | T_OP_PREDF { new_instr(OPC_PREDF); } cat0_src1 | T_OP_PREDE { new_instr(OPC_PREDE); } +| T_OP_GETLAST '.' T_W { new_instr(OPC_GETLAST); } cat0_immed cat1_opc: T_OP_MOV '.' T_CAT1_TYPE_TYPE { parse_type_type(new_instr(OPC_MOV), $3); @@ -815,9 +860,16 @@ cat1_movmsk: T_OP_MOVMSK '.' T_W { new_instr(OPC_MOVMSK); instr->cat1.src_type = TYPE_U32; instr->cat1.dst_type = TYPE_U32; - instr->repeat = $3 - 1; } dst_reg { - instr->dsts[0]->wrmask = (1 << $3) - 1; + if (($3 % 32) != 0) + yyerror("w# must be multiple of 32"); + if ($3 < 32) + yyerror("w# must be at least 32"); + + int num = $3 / 32; + + instr->repeat = num - 1; + instr->dsts[0]->wrmask = (1 << num) - 1; } cat1_mova1: T_OP_MOVA1 T_A1 ',' { @@ -894,6 +946,7 @@ cat2_opc_2src: T_OP_ADD_F { new_instr(OPC_ADD_F); } | T_OP_SHR_B { new_instr(OPC_SHR_B); } | T_OP_ASHR_B { new_instr(OPC_ASHR_B); } | T_OP_BARY_F { new_instr(OPC_BARY_F); } +| T_OP_FLAT_B { new_instr(OPC_FLAT_B); } | T_OP_MGEN_B { new_instr(OPC_MGEN_B); } | T_OP_GETBIT_B { new_instr(OPC_GETBIT_B); } | T_OP_SHB { new_instr(OPC_SHB); } @@ -910,6 +963,12 @@ cat2_instr: cat2_opc_1src dst_reg ',' src_reg_or_const_or_rel_or_imm | cat2_opc_2src_cnd '.' cond dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm | cat2_opc_2src dst_reg ',' src_reg_or_const_or_rel_or_imm ',' src_reg_or_const_or_rel_or_imm +cat3_dp_signedness:'.' T_MIXED { instr->cat3.signedness = IR3_SRC_MIXED; } +| '.' T_UNSIGNED{ instr->cat3.signedness = IR3_SRC_UNSIGNED; } + +cat3_dp_pack: '.' T_LOW { instr->cat3.packed = IR3_SRC_PACKED_LOW; } +| '.' T_HIGH { instr->cat3.packed = IR3_SRC_PACKED_HIGH; } + cat3_opc: T_OP_MAD_U16 { new_instr(OPC_MAD_U16); } | T_OP_MADSH_U16 { new_instr(OPC_MADSH_U16); } | T_OP_MAD_S16 { new_instr(OPC_MAD_S16); } @@ -927,8 +986,22 @@ cat3_opc: T_OP_MAD_U16 { new_instr(OPC_MAD_U16); } | T_OP_SAD_S16 { new_instr(OPC_SAD_S16); } | T_OP_SAD_S32 { new_instr(OPC_SAD_S32); } +cat3_imm_reg_opc: T_OP_SHRM { new_instr(OPC_SHRM); } +| T_OP_SHLM { new_instr(OPC_SHLM); } +| T_OP_SHRG { new_instr(OPC_SHRG); } +| T_OP_SHLG { new_instr(OPC_SHLG); } +| T_OP_ANDG { new_instr(OPC_ANDG); } + +cat3_wmm: T_OP_WMM { new_instr(OPC_WMM); } +| T_OP_WMM_ACCU { new_instr(OPC_WMM_ACCU); } + +cat3_dp: T_OP_DP2ACC { new_instr(OPC_DP2ACC); } +| T_OP_DP4ACC { new_instr(OPC_DP4ACC); } + cat3_instr: cat3_opc dst_reg ',' src_reg_or_const_or_rel ',' src_reg_or_const ',' src_reg_or_const_or_rel -| T_OP_SHLG_B16 { new_instr(OPC_SHLG_B16); } dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm +| cat3_imm_reg_opc dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm +| cat3_wmm dst_reg ',' src_reg_gpr ',' src_reg ',' immediate +| cat3_dp cat3_dp_signedness cat3_dp_pack dst_reg ',' src_reg_or_rel_or_imm ',' src_reg_or_const ',' src_reg_or_rel_or_imm cat4_opc: T_OP_RCP { new_instr(OPC_RCP); } | T_OP_RSQ { new_instr(OPC_RSQ); } @@ -972,6 +1045,11 @@ cat5_opc: T_OP_ISAM { new_instr(OPC_ISAM); } | T_OP_SAMGP3 { new_instr(OPC_SAMGP3); } | T_OP_RGETPOS { new_instr(OPC_RGETPOS); } | T_OP_RGETINFO { new_instr(OPC_RGETINFO); } +| T_OP_BRCST_A { new_instr(OPC_BRCST_ACTIVE); } +| T_OP_QSHUFFLE_BRCST { new_instr(OPC_QUAD_SHUFFLE_BRCST); } +| T_OP_QSHUFFLE_H { new_instr(OPC_QUAD_SHUFFLE_HORIZ); } +| T_OP_QSHUFFLE_V { new_instr(OPC_QUAD_SHUFFLE_VERT); } +| T_OP_QSHUFFLE_DIAG { new_instr(OPC_QUAD_SHUFFLE_DIAG); } cat5_flag: '.' T_3D { instr->flags |= IR3_INSTR_3D; } | '.' 'a' { instr->flags |= IR3_INSTR_A; } @@ -979,13 +1057,15 @@ cat5_flag: '.' T_3D { instr->flags |= IR3_INSTR_3D; } | '.' 'p' { instr->flags |= IR3_INSTR_P; } | '.' 's' { instr->flags |= IR3_INSTR_S; } | '.' T_S2EN { instr->flags |= IR3_INSTR_S2EN; } +| '.' T_UNIFORM { } | '.' T_NONUNIFORM { instr->flags |= IR3_INSTR_NONUNIF; } | '.' T_BASE { instr->flags |= IR3_INSTR_B; instr->cat5.tex_base = $2; } +| '.' T_W { instr->cat5.cluster_size = $2; } cat5_flags: | cat5_flag cat5_flags cat5_samp: T_SAMP { instr->cat5.samp = $1; } -cat5_tex: T_TEX { if (instr->flags & IR3_INSTR_B) instr->cat5.samp |= ($1 << 4); else instr->cat5.tex = $1; } +cat5_tex: T_TEX { instr->cat5.tex = $1; } cat5_type: '(' type ')' { instr->cat5.type = $2; } cat5_a1: src_reg { instr->flags |= IR3_INSTR_A1EN; } @@ -1018,7 +1098,7 @@ cat6_imm_offset: offset { new_src(0, IR3_REG_IMMED)->iim_val = $1; } cat6_offset: cat6_imm_offset | '+' src cat6_dst_offset: offset { instr->cat6.dst_offset = $1; } -| '+' src { instr->flags |= IR3_INSTR_G; } +| '+' src cat6_immed: integer { instr->cat6.iim_val = $1; } @@ -1066,14 +1146,39 @@ cat6_atomic_opc: T_OP_ATOMIC_ADD { new_instr(OPC_ATOMIC_ADD); } | T_OP_ATOMIC_OR { new_instr(OPC_ATOMIC_OR); } | T_OP_ATOMIC_XOR { new_instr(OPC_ATOMIC_XOR); } -cat6_atomic_g: cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src { - instr->flags |= IR3_INSTR_G; - } +cat6_a3xx_atomic_opc: T_OP_ATOMIC_S_ADD { new_instr(OPC_ATOMIC_S_ADD); } +| T_OP_ATOMIC_S_SUB { new_instr(OPC_ATOMIC_S_SUB); } +| T_OP_ATOMIC_S_XCHG { new_instr(OPC_ATOMIC_S_XCHG); } +| T_OP_ATOMIC_S_INC { new_instr(OPC_ATOMIC_S_INC); } +| T_OP_ATOMIC_S_DEC { new_instr(OPC_ATOMIC_S_DEC); } +| T_OP_ATOMIC_S_CMPXCHG { new_instr(OPC_ATOMIC_S_CMPXCHG); } +| T_OP_ATOMIC_S_MIN { new_instr(OPC_ATOMIC_S_MIN); } +| T_OP_ATOMIC_S_MAX { new_instr(OPC_ATOMIC_S_MAX); } +| T_OP_ATOMIC_S_AND { new_instr(OPC_ATOMIC_S_AND); } +| T_OP_ATOMIC_S_OR { new_instr(OPC_ATOMIC_S_OR); } +| T_OP_ATOMIC_S_XOR { new_instr(OPC_ATOMIC_S_XOR); } + +cat6_a6xx_atomic_opc: T_OP_ATOMIC_G_ADD { new_instr(OPC_ATOMIC_G_ADD); } +| T_OP_ATOMIC_G_SUB { new_instr(OPC_ATOMIC_G_SUB); } +| T_OP_ATOMIC_G_XCHG { new_instr(OPC_ATOMIC_G_XCHG); } +| T_OP_ATOMIC_G_INC { new_instr(OPC_ATOMIC_G_INC); } +| T_OP_ATOMIC_G_DEC { new_instr(OPC_ATOMIC_G_DEC); } +| T_OP_ATOMIC_G_CMPXCHG { new_instr(OPC_ATOMIC_G_CMPXCHG); } +| T_OP_ATOMIC_G_MIN { new_instr(OPC_ATOMIC_G_MIN); } +| T_OP_ATOMIC_G_MAX { new_instr(OPC_ATOMIC_G_MAX); } +| T_OP_ATOMIC_G_AND { new_instr(OPC_ATOMIC_G_AND); } +| T_OP_ATOMIC_G_OR { new_instr(OPC_ATOMIC_G_OR); } +| T_OP_ATOMIC_G_XOR { new_instr(OPC_ATOMIC_G_XOR); } + +cat6_a3xx_atomic_s: cat6_a3xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' 'g' '[' cat6_reg_or_immed ']' ',' src ',' src ',' src + +cat6_a6xx_atomic_g: cat6_a6xx_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'g' dst_reg ',' src ',' src cat6_atomic_l: cat6_atomic_opc cat6_typed cat6_dim cat6_type '.' cat6_immed '.' 'l' dst_reg ',' 'l' '[' cat6_reg_or_immed ']' ',' src -cat6_atomic: cat6_atomic_g -| cat6_atomic_l +cat6_atomic: cat6_atomic_l +| cat6_a3xx_atomic_s +| cat6_a6xx_atomic_g cat6_ibo_opc_1src: T_OP_RESINFO { new_instr(OPC_RESINFO); } @@ -1087,6 +1192,7 @@ cat6_ibo: cat6_ibo_opc_1src cat6_type cat6_dim dst_reg ',' 'g' '[' cat6 cat6_id_opc: T_OP_GETSPID { new_instr(OPC_GETSPID); } | T_OP_GETWID { new_instr(OPC_GETWID); } +| T_OP_GETFIBERID { new_instr(OPC_GETFIBERID); } cat6_id: cat6_id_opc cat6_type dst_reg @@ -1102,17 +1208,17 @@ cat6_reg_or_immed: src cat6_bindless_ibo_opc_1src: T_OP_RESINFO_B { new_instr(OPC_RESINFO); } -cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD { new_instr(OPC_ATOMIC_ADD)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_SUB { new_instr(OPC_ATOMIC_SUB)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_XCHG { new_instr(OPC_ATOMIC_XCHG)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_INC { new_instr(OPC_ATOMIC_INC)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_DEC { new_instr(OPC_ATOMIC_DEC)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_CMPXCHG { new_instr(OPC_ATOMIC_CMPXCHG)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_MIN { new_instr(OPC_ATOMIC_MIN)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_MAX { new_instr(OPC_ATOMIC_MAX)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_AND { new_instr(OPC_ATOMIC_AND)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_OR { new_instr(OPC_ATOMIC_OR)->flags |= IR3_INSTR_G; dummy_dst(); } -| T_OP_ATOMIC_B_XOR { new_instr(OPC_ATOMIC_XOR)->flags |= IR3_INSTR_G; dummy_dst(); } +cat6_bindless_ibo_opc_2src: T_OP_ATOMIC_B_ADD { new_instr(OPC_ATOMIC_B_ADD); dummy_dst(); } +| T_OP_ATOMIC_B_SUB { new_instr(OPC_ATOMIC_B_SUB); dummy_dst(); } +| T_OP_ATOMIC_B_XCHG { new_instr(OPC_ATOMIC_B_XCHG); dummy_dst(); } +| T_OP_ATOMIC_B_INC { new_instr(OPC_ATOMIC_B_INC); dummy_dst(); } +| T_OP_ATOMIC_B_DEC { new_instr(OPC_ATOMIC_B_DEC); dummy_dst(); } +| T_OP_ATOMIC_B_CMPXCHG { new_instr(OPC_ATOMIC_B_CMPXCHG); dummy_dst(); } +| T_OP_ATOMIC_B_MIN { new_instr(OPC_ATOMIC_B_MIN); dummy_dst(); } +| T_OP_ATOMIC_B_MAX { new_instr(OPC_ATOMIC_B_MAX); dummy_dst(); } +| T_OP_ATOMIC_B_AND { new_instr(OPC_ATOMIC_B_AND); dummy_dst(); } +| T_OP_ATOMIC_B_OR { new_instr(OPC_ATOMIC_B_OR); dummy_dst(); } +| T_OP_ATOMIC_B_XOR { new_instr(OPC_ATOMIC_B_XOR); dummy_dst(); } | T_OP_STIB_B { new_instr(OPC_STIB); dummy_dst(); } cat6_bindless_ibo_opc_2src_dst: T_OP_LDIB_B { new_instr(OPC_LDIB); } @@ -1123,13 +1229,23 @@ cat6_bindless_ibo: cat6_bindless_ibo_opc_1src cat6_typed cat6_dim cat6_type '.' cat6_bindless_ldc_opc: T_OP_LDC { new_instr(OPC_LDC); } -cat6_bindless_ldc: cat6_bindless_ldc_opc '.' T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg ',' cat6_reg_or_immed ',' cat6_reg_or_immed { - instr->cat6.d = $3; +/* This is separated from the opcode to avoid lookahead/shift-reduce conflicts */ +cat6_bindless_ldc_middle: + T_OFFSET '.' cat6_immed '.' cat6_bindless_mode dst_reg { instr->cat6.d = $1; } +| cat6_immed '.' 'k' '.' cat6_bindless_mode 'c' '[' T_A1 ']' { instr->opc = OPC_LDC_K; } + +cat6_bindless_ldc: cat6_bindless_ldc_opc '.' cat6_bindless_ldc_middle ',' cat6_reg_or_immed ',' cat6_reg_or_immed { instr->cat6.type = TYPE_U32; /* TODO cleanup ir3 src order: */ swap(instr->srcs[0], instr->srcs[1]); } +stc_dst: integer { new_src(0, IR3_REG_IMMED)->iim_val = $1; } +| T_A1 { new_src(0, IR3_REG_IMMED)->iim_val = 0; instr->flags |= IR3_INSTR_A1EN; } +| T_A1 '+' integer { new_src(0, IR3_REG_IMMED)->iim_val = $3; instr->flags |= IR3_INSTR_A1EN; } + +cat6_stc: T_OP_STC { new_instr(OPC_STC); } cat6_type 'c' '[' stc_dst ']' ',' src_reg ',' cat6_immed + cat6_todo: T_OP_G2L { new_instr(OPC_G2L); } | T_OP_L2G { new_instr(OPC_L2G); } | T_OP_RESFMT { new_instr(OPC_RESFMT); } @@ -1144,6 +1260,7 @@ cat6_instr: cat6_load | cat6_id | cat6_bindless_ldc | cat6_bindless_ibo +| cat6_stc | cat6_todo cat7_scope: '.' 'w' { instr->cat7.w = true; } @@ -1195,6 +1312,9 @@ src_reg_flags: src_reg_flag src_reg: src | src_reg_flags src +src_reg_gpr: src_reg +| relative_gpr_src + src_const: const | src_reg_flags const diff --git a/lib/mesa/src/freedreno/ir3/ir3_postsched.c b/lib/mesa/src/freedreno/ir3/ir3_postsched.c index 507302a00..39de84add 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_postsched.c +++ b/lib/mesa/src/freedreno/ir3/ir3_postsched.c @@ -68,8 +68,10 @@ struct ir3_postsched_ctx { struct list_head unscheduled_list; /* unscheduled instructions */ - int sfu_delay; - int tex_delay; + unsigned ip; + + int ss_delay; + int sy_delay; }; struct ir3_postsched_node { @@ -77,7 +79,9 @@ struct ir3_postsched_node { struct ir3_instruction *instr; bool partially_evaluated_path; - bool has_tex_src, has_sfu_src; + unsigned earliest_ip; + + bool has_sy_src, has_ss_src; unsigned delay; unsigned max_delay; @@ -87,17 +91,17 @@ struct ir3_postsched_node { list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link) static bool -has_tex_src(struct ir3_instruction *instr) +has_sy_src(struct ir3_instruction *instr) { struct ir3_postsched_node *node = instr->data; - return node->has_tex_src; + return node->has_sy_src; } static bool -has_sfu_src(struct ir3_instruction *instr) +has_ss_src(struct ir3_instruction *instr) { struct ir3_postsched_node *node = instr->data; - return node->has_sfu_src; + return node->has_ss_src; } static void @@ -111,28 +115,45 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) di(instr, "schedule"); - list_addtail(&instr->node, &instr->block->instr_list); + bool counts_for_delay = is_alu(instr) || is_flow(instr); + + unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0; struct ir3_postsched_node *n = instr->data; + + /* We insert any nop's needed to get to earliest_ip, then advance + * delay_cycles by scheduling the instruction. + */ + ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles; + + util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { + unsigned delay = (unsigned)(uintptr_t)edge->data; + struct ir3_postsched_node *child = + container_of(edge->child, struct ir3_postsched_node, dag); + child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay); + } + + list_addtail(&instr->node, &instr->block->instr_list); + dag_prune_head(ctx->dag, &n->dag); if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH)) return; - if (is_sfu(instr)) { - ctx->sfu_delay = 8; - } else if (has_sfu_src(instr)) { - ctx->sfu_delay = 0; - } else if (ctx->sfu_delay > 0) { - ctx->sfu_delay--; + if (is_ss_producer(instr)) { + ctx->ss_delay = soft_ss_delay(instr); + } else if (has_ss_src(instr)) { + ctx->ss_delay = 0; + } else if (ctx->ss_delay > 0) { + ctx->ss_delay--; } - if (is_tex_or_prefetch(instr)) { - ctx->tex_delay = 10; - } else if (has_tex_src(instr)) { - ctx->tex_delay = 0; - } else if (ctx->tex_delay > 0) { - ctx->tex_delay--; + if (is_sy_producer(instr)) { + ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader); + } else if (has_sy_src(instr)) { + ctx->sy_delay = 0; + } else if (ctx->sy_delay > 0) { + ctx->sy_delay--; } } @@ -154,25 +175,26 @@ dump_state(struct ir3_postsched_ctx *ctx) } } -/* Determine if this is an instruction that we'd prefer not to schedule - * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay - * counter, ie. the more cycles it has been since the last SFU, the less - * costly a sync would be. - */ -static bool -would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) +static unsigned +node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n) { - if (ctx->sfu_delay) { - if (has_sfu_src(instr)) - return true; - } + return MAX2(n->earliest_ip, ctx->ip) - ctx->ip; +} - if (ctx->tex_delay) { - if (has_tex_src(instr)) - return true; - } +static unsigned +node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n) +{ + unsigned delay = node_delay(ctx, n); + + /* This takes into account that as when we schedule multiple tex or sfu, the + * first user has to wait for all of them to complete. + */ + if (n->has_ss_src) + delay = MAX2(delay, ctx->ss_delay); + if (n->has_sy_src) + delay = MAX2(delay, ctx->sy_delay); - return false; + return delay; } /* find instruction to schedule: */ @@ -215,8 +237,7 @@ choose_instr(struct ir3_postsched_ctx *ctx) /* Next prioritize discards: */ foreach_sched_node (n, &ctx->dag->heads) { - unsigned d = - ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); + unsigned d = node_delay(ctx, n); if (d > 0) continue; @@ -235,13 +256,12 @@ choose_instr(struct ir3_postsched_ctx *ctx) /* Next prioritize expensive instructions: */ foreach_sched_node (n, &ctx->dag->heads) { - unsigned d = - ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); + unsigned d = node_delay_soft(ctx, n); if (d > 0) continue; - if (!(is_sfu(n->instr) || is_tex(n->instr))) + if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr))) continue; if (!chosen || (chosen->max_delay < n->max_delay)) @@ -249,53 +269,36 @@ choose_instr(struct ir3_postsched_ctx *ctx) } if (chosen) { - di(chosen->instr, "csp: chose (sfu/tex, hard ready)"); + di(chosen->instr, "csp: chose (sfu/tex, soft ready)"); return chosen->instr; } - /* - * Sometimes be better to take a nop, rather than scheduling an - * instruction that would require an (ss) shortly after another - * SFU.. ie. if last SFU was just one or two instr ago, and we - * could choose between taking a nop and then scheduling - * something else, vs scheduling the immed avail instruction that - * would require (ss), we are better with the nop. - */ - for (unsigned delay = 0; delay < 4; delay++) { - foreach_sched_node (n, &ctx->dag->heads) { - if (would_sync(ctx, n->instr)) - continue; - - unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true, - ctx->v->mergedregs); - - if (d > delay) - continue; - - if (!chosen || (chosen->max_delay < n->max_delay)) - chosen = n; - } - - if (chosen) { - di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay); - return chosen->instr; - } - } - /* Next try to find a ready leader w/ soft delay (ie. including extra * delay for things like tex fetch which can be synchronized w/ sync * bit (but we probably do want to schedule some other instructions - * while we wait) + * while we wait). We also allow a small amount of nops, to prefer now-nops + * over future-nops up to a point, as that gives better results. */ + unsigned chosen_delay = 0; foreach_sched_node (n, &ctx->dag->heads) { - unsigned d = - ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs); + unsigned d = node_delay_soft(ctx, n); - if (d > 0) + if (d > 3) continue; - if (!chosen || (chosen->max_delay < n->max_delay)) + if (!chosen || d < chosen_delay) { + chosen = n; + chosen_delay = d; + continue; + } + + if (d > chosen_delay) + continue; + + if (chosen->max_delay < n->max_delay) { chosen = n; + chosen_delay = d; + } } if (chosen) { @@ -308,8 +311,7 @@ choose_instr(struct ir3_postsched_ctx *ctx) * stalls.. but we've already decided there is not a better option. */ foreach_sched_node (n, &ctx->dag->heads) { - unsigned d = - ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); + unsigned d = node_delay(ctx, n); if (d > 0) continue; @@ -324,9 +326,6 @@ choose_instr(struct ir3_postsched_ctx *ctx) } /* Otherwise choose leader with maximum cost: - * - * TODO should we try to balance cost and delays? I guess it is - * a balance between now-nop's and future-nop's? */ foreach_sched_node (n, &ctx->dag->heads) { if (!chosen || chosen->max_delay < n->max_delay) @@ -361,6 +360,7 @@ struct ir3_postsched_deps_state { * for full precision and 2nd half for half-precision. */ struct ir3_postsched_node *regs[2 * 256]; + unsigned dst_n[2 * 256]; }; /* bounds checking read/write accessors, since OoB access to stuff on @@ -374,7 +374,8 @@ struct ir3_postsched_deps_state { static void add_dep(struct ir3_postsched_deps_state *state, - struct ir3_postsched_node *before, struct ir3_postsched_node *after) + struct ir3_postsched_node *before, struct ir3_postsched_node *after, + unsigned d) { if (!before || !after) return; @@ -382,30 +383,36 @@ add_dep(struct ir3_postsched_deps_state *state, assert(before != after); if (state->direction == F) { - dag_add_edge(&before->dag, &after->dag, NULL); + dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d); } else { - dag_add_edge(&after->dag, &before->dag, NULL); + dag_add_edge_max_data(&after->dag, &before->dag, 0); } } static void add_single_reg_dep(struct ir3_postsched_deps_state *state, - struct ir3_postsched_node *node, unsigned num, int src_n) + struct ir3_postsched_node *node, unsigned num, int src_n, + int dst_n) { struct ir3_postsched_node *dep = dep_reg(state, num); + unsigned d = 0; if (src_n >= 0 && dep && state->direction == F) { - unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true); - node->delay = MAX2(node->delay, d); - if (is_tex_or_prefetch(dep->instr)) - node->has_tex_src = true; - if (is_tex_or_prefetch(dep->instr)) - node->has_sfu_src = true; - } - - add_dep(state, dep, node); + /* get the dst_n this corresponds to */ + unsigned dst_n = state->dst_n[num]; + unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true); + d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n); + node->delay = MAX2(node->delay, d_soft); + if (is_sy_producer(dep->instr)) + node->has_sy_src = true; + if (is_ss_producer(dep->instr)) + node->has_ss_src = true; + } + + add_dep(state, dep, node, d); if (src_n < 0) { dep_reg(state, num) = node; + state->dst_n[num] = dst_n; } } @@ -413,15 +420,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state, * between half and full precision that result in additional dependencies. * The 'reg' arg is really just to know half vs full precision. * - * If non-negative, then this adds a dependency on a source register, and + * If src_n is positive, then this adds a dependency on a source register, and * src_n is the index passed into ir3_delayslots() for calculating the delay: - * If positive, corresponds to node->instr->regs[src_n]. If negative, then - * this is for a destination register. + * it corresponds to node->instr->srcs[src_n]. If src_n is negative, then + * this is for the destination register corresponding to dst_n. */ static void add_reg_dep(struct ir3_postsched_deps_state *state, struct ir3_postsched_node *node, const struct ir3_register *reg, - unsigned num, int src_n) + unsigned num, int src_n, int dst_n) { if (state->merged) { /* Make sure that special registers like a0.x that are written as @@ -430,16 +437,16 @@ add_reg_dep(struct ir3_postsched_deps_state *state, */ if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) { /* single conflict in half-reg space: */ - add_single_reg_dep(state, node, num, src_n); + add_single_reg_dep(state, node, num, src_n, dst_n); } else { /* two conflicts in half-reg space: */ - add_single_reg_dep(state, node, 2 * num + 0, src_n); - add_single_reg_dep(state, node, 2 * num + 1, src_n); + add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n); + add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n); } } else { if (reg->flags & IR3_REG_HALF) num += ARRAY_SIZE(state->regs) / 2; - add_single_reg_dep(state, node, num, src_n); + add_single_reg_dep(state, node, num, src_n, dst_n); } } @@ -457,12 +464,12 @@ calculate_deps(struct ir3_postsched_deps_state *state, if (reg->flags & IR3_REG_RELATIV) { /* mark entire array as read: */ for (unsigned j = 0; j < reg->size; j++) { - add_reg_dep(state, node, reg, reg->array.base + j, i); + add_reg_dep(state, node, reg, reg->array.base + j, i, -1); } } else { assert(reg->wrmask >= 1); u_foreach_bit (b, reg->wrmask) { - add_reg_dep(state, node, reg, reg->num + b, i); + add_reg_dep(state, node, reg, reg->num + b, i, -1); } } } @@ -470,18 +477,18 @@ calculate_deps(struct ir3_postsched_deps_state *state, /* And then after we update the state for what this instruction * wrote: */ - foreach_dst (reg, node->instr) { + foreach_dst_n (reg, i, node->instr) { if (reg->wrmask == 0) continue; if (reg->flags & IR3_REG_RELATIV) { /* mark the entire array as written: */ - for (unsigned i = 0; i < reg->size; i++) { - add_reg_dep(state, node, reg, reg->array.base + i, -1); + for (unsigned j = 0; j < reg->size; j++) { + add_reg_dep(state, node, reg, reg->array.base + j, -1, i); } } else { assert(reg->wrmask >= 1); u_foreach_bit (b, reg->wrmask) { - add_reg_dep(state, node, reg, reg->num + b, -1); + add_reg_dep(state, node, reg, reg->num + b, -1, i); } } } @@ -593,7 +600,7 @@ sched_dag_init(struct ir3_postsched_ctx *ctx) if (src->block != instr->block) continue; - dag_add_edge(&sn->dag, &n->dag, NULL); + dag_add_edge_max_data(&sn->dag, &n->dag, 0); } if (is_input(instr)) { @@ -602,14 +609,14 @@ sched_dag_init(struct ir3_postsched_ctx *ctx) util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) { struct ir3_instruction *input = *instrp; struct ir3_postsched_node *in = input->data; - dag_add_edge(&in->dag, &n->dag, NULL); + dag_add_edge_max_data(&in->dag, &n->dag, 0); } util_dynarray_append(&kills, struct ir3_instruction *, instr); } else if (is_tex(instr) || is_mem(instr)) { util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) { struct ir3_instruction *kill = *instrp; struct ir3_postsched_node *kn = kill->data; - dag_add_edge(&kn->dag, &n->dag, NULL); + dag_add_edge_max_data(&kn->dag, &n->dag, 0); } } } @@ -630,8 +637,8 @@ static void sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) { ctx->block = block; - ctx->tex_delay = 0; - ctx->sfu_delay = 0; + ctx->sy_delay = 0; + ctx->ss_delay = 0; /* move all instructions to the unscheduled list, and * empty the block's instruction list (to which we will @@ -677,18 +684,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) while (!list_is_empty(&ctx->unscheduled_list)) { struct ir3_instruction *instr = choose_instr(ctx); - unsigned delay = - ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs); + unsigned delay = node_delay(ctx, instr->data); d("delay=%u", delay); - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: - */ debug_assert(delay <= 6); - while (delay > 0) { - ir3_NOP(block); - delay--; - } schedule(ctx, instr); } @@ -750,7 +749,6 @@ ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v) .v = v, }; - ir3_remove_nops(ir); cleanup_self_movs(ir); foreach_block (block, &ir->block_list) { diff --git a/lib/mesa/src/freedreno/ir3/ir3_ra.h b/lib/mesa/src/freedreno/ir3/ir3_ra.h index 259341eaa..c6837aaae 100644 --- a/lib/mesa/src/freedreno/ir3/ir3_ra.h +++ b/lib/mesa/src/freedreno/ir3/ir3_ra.h @@ -124,7 +124,7 @@ ra_reg_is_dst(const struct ir3_register *reg) if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i]))) #define ra_foreach_dst_n(__dstreg, __n, __instr) \ - foreach_dst_n(__dstreg, __n, instr) \ + foreach_dst_n(__dstreg, __n, __instr) \ if (ra_reg_is_dst(__dstreg)) #define ra_foreach_dst(__dstreg, __instr) \ diff --git a/lib/mesa/src/freedreno/ir3/tests/disasm.c b/lib/mesa/src/freedreno/ir3/tests/disasm.c index 542469aa1..2f1b89f0d 100644 --- a/lib/mesa/src/freedreno/ir3/tests/disasm.c +++ b/lib/mesa/src/freedreno/ir3/tests/disasm.c @@ -43,6 +43,8 @@ #include "isa/isa.h" /* clang-format off */ +/* Note: @anholt's 4xx disasm was done on an a418 Nexus 5x */ +#define INSTR_4XX(i, d, ...) { .gpu_id = 420, .instr = #i, .expected = d, __VA_ARGS__ } #define INSTR_5XX(i, d, ...) { .gpu_id = 540, .instr = #i, .expected = d, __VA_ARGS__ } #define INSTR_6XX(i, d, ...) { .gpu_id = 630, .instr = #i, .expected = d, __VA_ARGS__ } /* clang-format on */ @@ -58,153 +60,185 @@ static const struct test { bool parse_fail; } tests[] = { /* clang-format off */ - /* cat0 */ - INSTR_6XX(00000000_00000000, "nop"), - INSTR_6XX(00000200_00000000, "(rpt2)nop"), - INSTR_6XX(03000000_00000000, "end"), - INSTR_6XX(00800000_00000004, "br p0.x, #4"), - INSTR_6XX(00900000_00000003, "br !p0.x, #3"), - INSTR_6XX(03820000_00000015, "shps #21"), /* emit */ - INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */ - INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */ - INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"), - INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"), - INSTR_6XX(07820000_00000000, "prede"), - INSTR_6XX(00800063_0000001e, "brac.3 #30"), - INSTR_6XX(06820000_00000000, "predt p0.x"), - INSTR_6XX(07020000_00000000, "predf p0.x"), - INSTR_6XX(07820000_00000000, "prede"), - - /* cat1 */ - INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"), - INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"), - INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"), - INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"), - INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"), - INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"), - INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"), - INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"), - INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"), - /* dEQP-VK.subgroups.ballot.compute.compute */ - INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */ - - INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"), - INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"), - INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"), - INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"), - - INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"), - INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"), - INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"), - INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"), - - /* cat2 */ - INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"), - INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"), - INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"), - INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"), - INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"), - INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"), - INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"), - INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"), - INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"), - INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"), - INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"), - INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"), - INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"), - INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"), - - /* cat3 */ - INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"), - INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"), - INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"), - INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"), - INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"), - INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"), - INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"), - INSTR_6XX(65900820_100cb008, "(nop3) shlg.b16 hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */ - INSTR_6XX(65ae085c_0002a001, "(nop3) shlg.b16 hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */ - INSTR_6XX(65900820_0c0aac05, "(nop3) shlg.b16 hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */ - - /* cat4 */ - INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"), - - /* cat5 */ - /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */ - INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */ - /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */ - INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */ - /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */ - INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ - INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ - - INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"), - INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */ - INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */ - INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"), - INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"), - INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"), - INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"), - INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"), - /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */ - INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"), - INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"), - - - /* cat6 */ - - INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */ - INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1. r1.x is offset in ibo, r0.x is value*/ - /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */ - INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0. r0.z is offset in ibo as src. r1.z */ - /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */ - INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0. r0.w is offset in ibo as src, and dst */ - /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */ - INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */ - /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */ - INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */ - /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */ - INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */ - - // TODO is this a real instruction? Or float -6.0 ? - // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true), - /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */ - INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */ - INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */ - INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"), - INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"), - INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"), - - /* Customely crafted */ - INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"), - INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"), - - INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */ - INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */ - INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"), - INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"), - INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"), - - /* Found in TCS/TES shaders of GTA V */ - INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */ - - /* Customely crafted */ - INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"), - - INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */ - INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */ - INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"), - INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"), - - /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */ - INSTR_6XX(c7020020_01800000, "stc c[32], r0.x, 1", .parse_fail=true), - /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ - INSTR_6XX(c7060020_03800000, "stc c[32], r0.x, 3", .parse_fail=true), - - /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ - INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */ - - INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 r0.z, r0.x, 2"), + /* cat0 */ + INSTR_6XX(00000000_00000000, "nop"), + INSTR_6XX(00000200_00000000, "(rpt2)nop"), + INSTR_6XX(03000000_00000000, "end"), + INSTR_6XX(00800000_00000004, "br p0.x, #4"), + INSTR_6XX(00800000_fffffffc, "br p0.x, #-4"), + INSTR_6XX(00900000_00000003, "br !p0.x, #3"), + INSTR_6XX(03820000_00000015, "shps #21"), /* emit */ + INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */ + INSTR_6XX(02220000_00000004, "getlast.w8 #4"), + INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */ + INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"), + INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"), + INSTR_6XX(07820000_00000000, "prede"), + INSTR_6XX(00800063_0000001e, "brac.3 #30"), + INSTR_6XX(06820000_00000000, "predt p0.x"), + INSTR_6XX(07020000_00000000, "predf p0.x"), + INSTR_6XX(07820000_00000000, "prede"), + + /* cat1 */ + INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"), + INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"), + INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"), + INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c<a0.x + 17>"), + INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"), + INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"), + INSTR_6XX(20174004_00000008, "mov.s32s32 r<a0.x + 4>, r2.x"), + INSTR_6XX(20130000_00000005, "mov.s16s16 hr<a0.x>, hr1.y"), + INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr<a0.x>"), + /* dEQP-VK.subgroups.ballot.compute.compute */ + INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */ + + INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"), + INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"), + INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"), + INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"), + + INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"), + INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"), + INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"), + INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"), + + /* cat2 */ + INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c<a0.x + 33>"), + INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"), + INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"), + INSTR_6XX(47348000_00002000, "flat.b (ei)r0.x, 0, r0.x"), + INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"), + INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"), + INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"), + INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"), + INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"), + INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"), + INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"), + INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"), + INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr<a0.x + 128>"), + INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"), + INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"), + + /* cat3 */ + INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"), + INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"), + INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"), + INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"), + INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"), + INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"), + INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"), + INSTR_6XX(65900820_100cb008, "(nop3) shlg hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */ + INSTR_6XX(65ae085c_0002a001, "(nop3) shlg hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */ + INSTR_6XX(65900820_0c0aac05, "(nop3) shlg hr8.x, hc<a0.x + 5>, hr8.x, hc<a0.x + 10>"), /* not seen in blob */ + INSTR_6XX(65ae0c5c_0002a001, "(nop3) shlg r23.x, r0.y, r23.x, r0.z"), /* (nop3) shlg.b32 r23.x, (r)r0.y, (r)r23.x, r0.z */ + INSTR_6XX(64018802_0002e003, "(nop3) shrm hr0.z, (neg)hr0.w, hr0.w, hr0.z"), + INSTR_6XX(64818802_0002e003, "(nop3) shlm hr0.z, (neg)hr0.w, hr0.w, hr0.z"), + INSTR_6XX(65018802_0002e003, "(nop3) shrg hr0.z, (neg)hr0.w, hr0.w, hr0.z"), + INSTR_6XX(66018802_0002e003, "(nop3) andg hr0.z, (neg)hr0.w, hr0.w, hr0.z"), + INSTR_6XX(67018802_1002e003, "(nop3) wmm hr0.z, (neg)hr0.w, hr0.w, 2"), /* (nop3) wmm.f16f16 hr0.z, (abs)(r)hr0.w, (r)hr0.w, 2 */ + INSTR_6XX(67018c02_1002e003, "(nop3) wmm.accu hr0.z, (neg)hr0.w, hr0.w, 2"), + INSTR_6XX(6701c802_9002a003, "(nop3) wmm r0.z, r0.w, r0.w, 2"), /* (nop3) wmm.f32f32 r0.z, (r)r0.w, (r)r0.w, 2 */ + /* custom test with qcom_dot8 function from cl_qcom_dot_product8 */ + INSTR_6XX(66818c02_0002e003, "(sat)(nop3) dp2acc.mixed.low r0.z, r0.w, r0.w, r0.z"), /* (nop3) dp2acc (sat)r0.z, (signed)(low)(r)r0.w, (low)(r)r0.w, r0.z */ + INSTR_6XX(6681c802_8002a003, "(nop3) dp4acc.unsigned.low r0.z, r0.w, r0.w, (neg)r0.z"), /* (nop3) dp4acc r0.z, (unsigned)(r)r0.w, (r)r0.w, (neg)r0.z */ + + /* cat4 */ + INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"), + + /* cat5 */ + /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */ + INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */ + /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */ + INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */ + /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */ + INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ + INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ + + INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"), + INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */ + INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */ + INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"), + INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"), + INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"), + INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"), + INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"), + /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */ + INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"), + INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"), + INSTR_6XX(a0c81108_e2000001, "sam.base0 (f32)(x)r2.x, r0.x, s#16, a1.x"), + INSTR_6XX(a048d107_cc080a07, "isaml.base3 (s32)(x)r1.w, r0.w, r1.y, s#0, t#6"), + + + /* dEQP-VK.subgroups.arithmetic.compute.subgroupadd_float */ + INSTR_6XX(a7c03102_00100003, "brcst.active.w8 (u32)(x)r0.z, r0.y"), /* brcst.active.w8 (u32)(xOOO)r0.z, r0.y */ + /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */ + INSTR_6XX(b7e03107_00000401, "(sy)quad_shuffle.brcst (u32)(x)r1.w, r0.x, r0.z"), /* (sy)quad_shuffle.brcst (u32)(xOOO)r1.w, r0.x, r0.z */ + /* dEQP-VK.subgroups.quad.graphics.subgroupquadswapdiagonal_int */ + INSTR_6XX(b7e03104_00180001, "(sy)quad_shuffle.diag (u32)(x)r1.x, r0.x"), /* (sy)quad_shuffle.diag (u32)(xOOO)r1.x, r0.x */ + + /* cat6 */ + + INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */ + INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1. r1.x is offset in ibo, r0.x is value*/ + /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */ + INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0. r0.z is offset in ibo as src. r1.z */ + /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */ + INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0. r0.w is offset in ibo as src, and dst */ + /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */ + INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */ + /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */ + INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */ + /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */ + INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */ + + // TODO is this a real instruction? Or float -6.0 ? + // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true), + /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */ + INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */ + INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */ + INSTR_6XX(c0dc052e_01800042, "stg.a.u8 g[r0.z+(r11.z)<<2], hr8.y, 1"), + INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"), + INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"), + INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"), + INSTR_5XX(c0ce0100_02800000, "stg.s8 g[r0.x], hr0.x, 2"), + INSTR_5XX(c0c00100_02800000, "stg.f16 g[r0.x], hr0.x, 2"), + + /* Customely crafted */ + INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"), + INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"), + + INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */ + INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */ + INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"), + INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"), + INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"), + INSTR_6XX(c0040003_0180c269, "ldg.u16 hr0.w, g[r0.w+308], 1"), + + /* Found in TCS/TES shaders of GTA V */ + INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */ + + /* Customely crafted */ + INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"), + + INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */ + INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */ + INSTR_6XX(c0000006_01c18017, "ldg.a.f16 hr1.z, g[r1.z+(r2.w)<<2], 1"), + INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"), + INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"), + + /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */ + INSTR_6XX(c7020020_01800000, "stc.f32 c[32], r0.x, 1"), /* stc c[32], r0.x, 1 */ + /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ + INSTR_6XX(c7060020_03800000, "stc.u32 c[32], r0.x, 3"), /* stc c[32], r0.x, 3 */ + + /* custom */ + INSTR_6XX(c7060100_03800000, "stc.u32 c[a1.x], r0.x, 3"), /* stc c[a1.x], r0.x, 3 */ + INSTR_6XX(c7060120_03800000, "stc.u32 c[a1.x+32], r0.x, 3"), /* stc c[a1.x+32], r0.x, 3 */ + + /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ + INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */ + + INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 hr0.z, r0.x, 2"), #if 0 /* TODO blob sometimes/frequently sets b0, although there does not seem * to be an obvious pattern and our encoding never sets it. AFAICT it @@ -298,6 +332,13 @@ static const struct test { INSTR_6XX(c0260000_00c78080, "ldc.offset0.1.nonuniform r0.x, 0, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */ INSTR_6XX(c0260201_00c78080, "ldc.offset0.1.nonuniform r0.y, 0, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */ + /* a4xx-a5xx has the exact same instrs in + * dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.(dynamically_)uniform_fragment + * with no change based on the mode. Note that we can't decode this yet. + */ + /* INSTR_4XX(c7860000_00810001), */ /* ldc.1 r0.x, g[r1.x], 0, r0.x */ + /* INSTR_5XX(c7860000_00800000), */ /* ldc.a.1 r0.x, g[r0.x], 0, r0.x */ + /* custom */ INSTR_6XX(c0260201_ffc78080, "ldc.offset0.1.nonuniform r0.y, 255, r0.y"), /* ldc.1.mode2.base0 r0.y, 255, r0.y */ @@ -307,6 +348,11 @@ static const struct test { INSTR_6XX(c0260000_00478400, "ldc.offset2.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */ INSTR_6XX(c0260000_00478600, "ldc.offset3.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */ + /* dEQP-VK.glsl.conditionals.if.if_else_vertex */ + INSTR_6XX(c0360000_00c78100, "ldc.1.k.imm.base0 c[a1.x], 0, 0"), /* ldc.1.k.mode4.base0 c[a1.x], 0, 0 */ + /* custom */ + INSTR_6XX(c0360003_00c78100, "ldc.4.k.imm.base0 c[a1.x], 0, 0"), /* ldc.4.k.mode4.base0 c[a1.x], 0, 0 */ + /* dEQP-VK.glsl.struct.local.nested_struct_array_dynamic_index_fragment */ INSTR_6XX(c1425b50_01803e02, "stp.f32 p[r11.y-176], r0.y, 1"), INSTR_6XX(c1425b98_02803e14, "stp.f32 p[r11.y-104], r2.z, 2"), @@ -318,14 +364,17 @@ static const struct test { /* Atomic: */ #if 0 /* TODO our encoding differs in b53 for these two */ - INSTR_5XX(c4d60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"), - INSTR_5XX(c4160205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"), + INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"), + INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"), #else - INSTR_5XX(c4f60002_00008001, "atomic.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"), - INSTR_5XX(c4360205_03000001, "atomic.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"), + INSTR_5XX(c4f60002_00008001, "atomic.s.inc.untyped.1d.u32.1.g r0.z, g[0], r0.z, r0.x, r0.x"), + INSTR_5XX(c4360205_03000001, "atomic.s.add.untyped.1d.u32.1.g r1.y, g[1], r0.x, r0.w, r0.x"), #endif INSTR_6XX(d5c60003_03008001, "(sy)atomic.max.untyped.1d.u32.1.l r0.w, l[r0.z], r0.w"), + /* dEQP-VK.glsl.atomic_operations.add_unsigned_compute_reference */ + INSTR_6XX(c4160002_02000001, "atomic.g.add.untyped.1d.u32.1.g r0.z, r0.x, r0.z"), + /* Bindless atomic: */ INSTR_6XX(c03a0003_01640000, "atomic.b.add.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.add.g.s32.1d.mode0.base0 r0.w,r0.y,0 */ INSTR_6XX(c03a0003_01660000, "atomic.b.and.untyped.1d.s32.1.imm r0.w, r0.y, 0"), /* atomic.b.and.g.s32.1d.mode0.base0 r0.w,r0.y,0 */ @@ -333,10 +382,14 @@ static const struct test { /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_literal.fragment.sampler2d */ INSTR_6XX(a0c01f04_0cc00005, "sam (f32)(xyzw)r1.x, r0.z, s#6, t#6"), - /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d (looks like maybe the compiler didn't figure out */ - INSTR_6XX(a0c81f07_0100000b, "sam.s2en (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */ + + /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.fragment.sampler2d */ + INSTR_4XX(a0c81f02_00800001, "sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.mode0 (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */ + INSTR_6XX(a0c81f07_0100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode0 (f32)(xyzw)r1.w, r1.y, hr2.x */ + /* dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.dynamically_uniform.fragment.sampler2d */ - INSTR_6XX(a0c81f07_8100000b, "sam.s2en.uniform (f32)(xyzw)r1.w, r1.y, hr2.x", .parse_fail=true), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */ + INSTR_4XX(a0c81f02_80800001, "sam.s2en.nonuniform (f32)(xyzw)r0.z, r0.x, hr1.x"), /* sam.s2en.uniform (f32)(xyzw)r0.z, r0.x, hr1.x */ /* same for 5xx */ + INSTR_6XX(a0c81f07_8100000b, "sam.s2en.nonuniform (f32)(xyzw)r1.w, r1.y, hr2.x"), /* sam.s2en.mode4 (f32)(xyzw)r1.w, r1.y, hr2.x */ /* NonUniform: */ /* dEQP-VK.descriptor_indexing.storage_buffer */ @@ -349,6 +402,9 @@ static const struct test { /* dEQP-VK.descriptor_indexing.sampler */ INSTR_6XX(a0c81f00_40000005, "sam.s2en.nonuniform.base0 (f32)(xyzw)r0.x, r0.z, r0.x"), + /* dEQP-VK.subgroups.quad.graphics.subgroupquadbroadcast_int */ + INSTR_6XX(c0260001_00c98000, "getfiberid.u32 r0.y"), + /* Custom test since we've never seen the blob emit these. */ INSTR_6XX(c0260004_00490000, "getspid.u32 r1.x"), INSTR_6XX(c0260005_00494000, "getwid.u32 r1.y"), @@ -416,7 +472,6 @@ main(int argc, char **argv) printf(" Got: \"%s\"\n", disasm_output); retval = 1; decode_fails++; - continue; } /* @@ -426,7 +481,8 @@ main(int argc, char **argv) unsigned gen = test->gpu_id / 100; if (!compilers[gen]) { dev_ids[gen].gpu_id = test->gpu_id; - compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen], false); + compilers[gen] = ir3_compiler_create(NULL, &dev_ids[gen], + &(struct ir3_compiler_options){}); } FILE *fasm = |