| field | value | date |
|---|---|---|
| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2016-05-29 10:22:51 +0000 |
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2016-05-29 10:22:51 +0000 |
| commit | c9223eed3c16cd3e98a8f56dda953d8f299de0e3 (patch) | |
| tree | 53e2a1c3f13bcf6b4ed201d7bc135e7213c94ebe /lib/mesa/src/gallium/drivers/freedreno/ir3 | |
| parent | 6e8f2d062ab9c198239b9283b2b7ed12f4ea17d8 (diff) | |
Import Mesa 11.2.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/freedreno/ir3')
16 files changed, 1431 insertions, 1028 deletions
```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 83ed5ffdc..599872470 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr)
 		else if (cat1->off > 0)
 			printf("%c<a0.x + %d>", type, cat1->off);
 		else
-			printf("c<a0.x>");
+			printf("%c<a0.x>", type);
 	} else {
 		print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
 				cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
@@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr)
 /* size of largest OPC field of all the instruction categories: */
 #define NOPC_BITS 6

-struct opc_info {
+static const struct opc_info {
 	uint16_t cat;
 	uint16_t opc;
 	const char *name;
```

```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index c3fb68d51..1b1f1f0a7 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -261,6 +261,7 @@ typedef union PACKED {
 	/* to make compiler happy: */
 	uint32_t dummy32;
 	uint32_t dummy10 : 10;
+	int32_t idummy10 : 10;
 	uint32_t dummy11 : 11;
 	uint32_t dummy12 : 12;
 	uint32_t dummy13 : 13;
```

```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
index b24825cff..7d89142d7 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler,
 	shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);

 	list_inithead(&shader->block_list);
+	list_inithead(&shader->array_list);

 	return shader;
 }
@@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
 		val.iim_val = reg->iim_val;
 	} else {
 		unsigned components;
+		int16_t max;

 		if (reg->flags & IR3_REG_RELATIV) {
 			components = reg->size;
-			val.dummy10 = reg->offset;
+			val.idummy10 = reg->array.offset;
+			max = (reg->array.offset + repeat + components - 1) >> 2;
 		} else {
 			components = util_last_bit(reg->wrmask);
 			val.comp = reg->num & 0x3;
 			val.num = reg->num >> 2;
+			max = (reg->num + repeat + components - 1) >> 2;
 		}

-		int16_t max = (reg->num + repeat + components - 1) >> 2;
-
 		if (reg->flags & IR3_REG_CONST) {
 			info->max_const = MAX2(info->max_const, max);
 		} else if (val.num == 63) {
@@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
 	iassert((instr->regs_count == 2) || (instr->regs_count == 3));

 	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->num < (1 << 10));
+		iassert(src1->array.offset < (1 << 10));
 		cat2->rel1.src1 = reg(src1, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
 				!((src1->flags ^ src2->flags) & IR3_REG_HALF));

 		if (src2->flags & IR3_REG_RELATIV) {
-			iassert(src2->num < (1 << 10));
+			iassert(src2->array.offset < (1 << 10));
 			cat2->rel2.src2 = reg(src2, info, instr->repeat,
 					IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 					IR3_REG_HALF | absneg);
@@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
 	iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));

 	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->num < (1 << 10));
+		iassert(src1->array.offset < (1 << 10));
 		cat3->rel1.src1 = reg(src1, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
 	if (src3->flags & IR3_REG_RELATIV) {
-		iassert(src3->num < (1 << 10));
+		iassert(src3->array.offset < (1 << 10));
 		cat3->rel2.src3 = reg(src3, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr,
 	iassert(instr->regs_count == 2);

 	if (src->flags & IR3_REG_RELATIV) {
-		iassert(src->num < (1 << 10));
+		iassert(src->array.offset < (1 << 10));
 		cat4->rel.src = reg(src, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
 				IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
@@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 	return reg;
 }

+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg)
+{
+	struct ir3_register *new_reg = reg_create(shader, 0, 0);
+	*new_reg = *reg;
+	return new_reg;
+}
+
 void
 ir3_instr_set_address(struct ir3_instruction *instr,
 		struct ir3_instruction *addr)
@@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir)
 	}
 	return cnt;
 }
+
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
+		if (arr->id == id)
+			return arr;
+	return NULL;
+}
```
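The `max` computation that this hunk moves into each branch of `reg()` tracks the highest vec4 slot an access touches: scalar register numbers are shifted right by two because consts and registers are accounted in vec4 units, and for relative accesses the base must come from `reg->array.offset` rather than `reg->num` (which is meaningless for a relative register). A minimal standalone sketch of that arithmetic, with made-up values that are not from the patch:

```c
#include <stdio.h>

/* Illustration (not the driver's code) of the footprint math in reg():
 * a scalar access starting at `base`, spanning `repeat + components`
 * scalars, touches vec4 slots up to (base + repeat + components - 1) / 4.
 */
static int max_vec4_slot(int base, int repeat, int components)
{
	return (base + repeat + components - 1) >> 2;
}

int main(void)
{
	/* scalar reg 10 with (rpt2) and 1 component covers regs 10..12,
	 * i.e. up to vec4 slot 3: */
	printf("%d\n", max_vec4_slot(10, 2, 1));
	return 0;
}
```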
```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
index 12f2ebe18..1a109d880 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -83,7 +83,8 @@ struct ir3_register {
 	 * before register assignment is done:
 	 */
 	IR3_REG_SSA = 0x2000,   /* 'instr' is ptr to assigning instr */
-	IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
+	IR3_REG_ARRAY = 0x4000,
+	IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */
 } flags;
 union {
@@ -97,11 +98,18 @@ struct ir3_register {
 	uint32_t uim_val;
 	float    fim_val;
 	/* relative: */
-	int offset;
+	struct {
+		uint16_t id;
+		int16_t offset;
+	} array;
 };

-/* for IR3_REG_SSA, src registers contain ptr back to
- * assigning instruction.
+/* For IR3_REG_SSA, src registers contain ptr back to assigning
+ * instruction.
+ *
+ * For IR3_REG_ARRAY, the pointer is back to the last dependent
+ * array access (although the net effect is the same, it points
+ * back to a previous instruction that we depend on).
 */
 struct ir3_instruction *instr;
@@ -177,6 +185,7 @@ struct ir3_instruction {
 	 * before register assignment is done:
 	 */
 	IR3_INSTR_MARK = 0x1000,
+	IR3_INSTR_UNUSED= 0x2000,
 } flags;
 int repeat;
 #ifdef DEBUG
@@ -221,9 +230,6 @@ struct ir3_instruction {
 		int off;              /* component/offset */
 	} fo;
 	struct {
-		int aid;
-	} fi;
-	struct {
 		/* used to temporarily hold reference to nir_phi_instr
 		 * until we resolve the phi srcs
 		 */
@@ -243,11 +249,7 @@ struct ir3_instruction {
 	 * result of moving a const to a reg would have a low cost, so to
 	 * it could make sense to duplicate the instruction at various
 	 * points where the result is needed to reduce register footprint.
-	 *
-	 * DEPTH_UNUSED used to mark unused instructions after depth
-	 * calculation pass.
 	 */
-#define DEPTH_UNUSED ~0
 	unsigned depth;
 	/* When we get to the RA stage, we no longer need depth, but
 	 * we do need instruction's position/name:
@@ -258,6 +260,10 @@ struct ir3_instruction {
 	};
 };

+/* used for per-pass extra instruction data.
+ */
+void *data;
+
 /* Used during CP and RA stages. For fanin and shader inputs/
  * outputs where we need a sequence of consecutive registers,
  * keep track of each src instructions left (ie 'n-1') and right
@@ -292,19 +298,6 @@ struct ir3_instruction {
  */
 struct ir3_instruction *address;

-/* in case of a instruction with relative dst instruction, we need to
- * capture the dependency on the fanin for the previous values of
- * the array elements. Since we don't know at compile time actually
- * which array elements are written, this serves to preserve the
- * unconditional write to array elements prior to the conditional
- * write.
- *
- * TODO only cat1 can do indirect write.. we could maybe move this
- * into instr->cat1.fanin (but would require the frontend to insert
- * the extra mov)
- */
-struct ir3_instruction *fanin;
-
 /* Entry in ir3_block's instruction list: */
 struct list_head node;
@@ -378,10 +371,41 @@ struct ir3 {
 	/* List of blocks: */
 	struct list_head block_list;

+	/* List of ir3_array's: */
+	struct list_head array_list;
+
 	unsigned heap_idx;
 	struct ir3_heap_chunk *chunk;
 };

+typedef struct nir_variable nir_variable;
+
+struct ir3_array {
+	struct list_head node;
+	unsigned length;
+	unsigned id;
+
+	nir_variable *var;
+
+	/* We track the last write and last access (read or write) to
+	 * setup dependencies on instructions that read or write the
+	 * array. Reads can be re-ordered wrt. other reads, but should
+	 * not be re-ordered wrt. to writes. Writes cannot be reordered
+	 * wrt. any other access to the array.
+	 *
+	 * So array reads depend on last write, and array writes depend
+	 * on the last access.
+	 */
+	struct ir3_instruction *last_write, *last_access;
+
+	/* extra stuff used in RA pass: */
+	unsigned base;      /* base vreg name */
+	unsigned reg;       /* base physical reg */
+	uint16_t start_ip, end_ip;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+
 typedef struct nir_block nir_block;

 struct ir3_block {
@@ -404,7 +428,7 @@ struct ir3_block {
 	/* used for per-pass extra block data. Mainly used right
 	 * now in RA step to track livein/liveout.
 	 */
-	void *bd;
+	void *data;

 #ifdef DEBUG
 	uint32_t serialno;
@@ -429,6 +453,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr);

 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg);

 void ir3_instr_set_address(struct ir3_instruction *instr,
 		struct ir3_instruction *addr);
@@ -509,6 +535,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
 	if (dst->num == regid(REG_A0, 0))
 		return false;

+	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+		return false;
+
 	if ((instr->category == 1) &&
 			(instr->cat1.src_type == instr->cat1.dst_type))
 		return true;
@@ -622,8 +651,10 @@ static inline bool writes_pred(struct ir3_instruction *instr)
 /* TODO better name */
 static inline struct ir3_instruction *ssa(struct ir3_register *reg)
 {
-	if (reg->flags & IR3_REG_SSA)
+	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
+		debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED)));
 		return reg->instr;
+	}
 	return NULL;
 }
@@ -812,8 +843,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 {
-	if (instr->fanin)
-		return instr->regs_count + 2;
 	if (instr->address)
 		return instr->regs_count + 1;
 	return instr->regs_count;
@@ -821,8 +850,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
 {
-	if (n == (instr->regs_count + 1))
-		return instr->fanin;
 	if (n == (instr->regs_count + 0))
 		return instr->address;
 	return ssa(instr->regs[n]);
@@ -833,8 +860,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
 /* iterator for an instruction's SSA sources (instr), also returns src #: */
 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
 	if ((__instr)->regs_count) \
-		for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \
-			if ((__srcinst = __ssa_src_n(__instr, __n + 1)))
+		for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+			if ((__srcinst = __ssa_src_n(__instr, __n)))

 /* iterator for an instruction's SSA sources (instr): */
 #define foreach_ssa_src(__srcinst, __instr) \
@@ -877,7 +904,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
 	struct ir3_instruction *instr = ir3_instr_create(block, 1, 0);
 	ir3_reg_create(instr, 0, 0);   /* dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+	if (src->regs[0]->flags & IR3_REG_ARRAY) {
+		struct ir3_register *src_reg =
+			ir3_reg_create(instr, 0, IR3_REG_ARRAY);
+		src_reg->array = src->regs[0]->array;
+		src_reg->instr = src;
+	} else {
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+	}
+	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
 	instr->cat1.src_type = type;
 	instr->cat1.dst_type = type;
 	return instr;
@@ -893,6 +928,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 	instr->cat1.src_type = src_type;
 	instr->cat1.dst_type = dst_type;
+	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
 	return instr;
 }
@@ -1082,7 +1118,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8];

 static inline unsigned regmask_idx(struct ir3_register *reg)
 {
-	unsigned num = reg->num;
+	unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
 	debug_assert(num < MAX_REG);
 	if (reg->flags & IR3_REG_HALF)
 		num += MAX_REG;
```
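The ordering rule documented in the new `struct ir3_array` (reads may be reordered against other reads but not against writes; writes may not be reordered against anything) reduces to two bookkeeping updates. A hedged sketch of that rule, with hypothetical helper names that are not part of the patch:

```c
/* Illustration only: the two-pointer dependency scheme described in
 * the ir3_array comment above. Reads depend on last_write; writes
 * depend on last_access (the most recent read OR write).
 */
struct ir3_instruction;  /* opaque here, as in the real IR */

struct array_state {
	struct ir3_instruction *last_write;  /* most recent store */
	struct ir3_instruction *last_access; /* most recent load or store */
};

static void note_array_read(struct array_state *a, struct ir3_instruction *ld)
{
	/* read-after-write: the load would record a->last_write as its
	 * dependency (hypothetical: ld->dep = a->last_write;) */
	a->last_access = ld;
}

static void note_array_write(struct array_state *a, struct ir3_instruction *st)
{
	/* write-after-read/write: the store would record a->last_access
	 * (hypothetical: st->dep = a->last_access;) */
	a->last_write = a->last_access = st;
}
```

This is exactly the shape of the `create_var_load()`/`create_var_store()` helpers added in ir3_compiler_nir.c further down.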
```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ede29f445..481859efb 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -40,6 +40,7 @@
 #include "freedreno_util.h"

 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "instr-a3xx.h"
 #include "ir3.h"
@@ -94,6 +95,8 @@ static void print_usage(void)
 	printf("    --saturate-t MASK - bitmask of samplers to saturate T coord\n");
 	printf("    --saturate-r MASK - bitmask of samplers to saturate R coord\n");
 	printf("    --stream-out - enable stream-out (aka transform feedback)\n");
+	printf("    --ucp MASK - bitmask of enabled user-clip-planes\n");
+	printf("    --gpu GPU_ID - specify gpu-id (default 320)\n");
 	printf("    --help - show this message\n");
 }
@@ -103,16 +106,15 @@ int main(int argc, char **argv)
 	const char *filename;
 	struct tgsi_token toks[65536];
 	struct tgsi_parse_context parse;
-	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
 	struct ir3_shader s;
 	struct ir3_shader_key key = {};
+	/* TODO cmdline option to target different gpus: */
+	unsigned gpu_id = 320;
 	const char *info;
 	void *ptr;
 	size_t size;

-	fd_mesa_debug |= FD_DBG_DISASM;
-
 	memset(&s, 0, sizeof(s));
 	memset(&v, 0, sizeof(v));
@@ -125,7 +127,7 @@ int main(int argc, char **argv)
 	while (n < argc) {
 		if (!strcmp(argv[n], "--verbose")) {
-			fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
+			fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS | FD_DBG_DISASM;
 			n++;
 			continue;
 		}
@@ -190,6 +192,20 @@ int main(int argc, char **argv)
 			continue;
 		}

+		if (!strcmp(argv[n], "--ucp")) {
+			debug_printf(" %s %s", argv[n], argv[n+1]);
+			key.ucp_enables = strtol(argv[n+1], NULL, 0);
+			n += 2;
+			continue;
+		}
+
+		if (!strcmp(argv[n], "--gpu")) {
+			debug_printf(" %s %s", argv[n], argv[n+1]);
+			gpu_id = strtol(argv[n+1], NULL, 0);
+			n += 2;
+			continue;
+		}
+
 		if (!strcmp(argv[n], "--help")) {
 			print_usage();
 			return 0;
@@ -213,7 +229,12 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);

-	s.tokens = toks;
+	if (fd_mesa_debug & FD_DBG_OPTMSGS)
+		tgsi_dump(toks, 0);
+
+	nir_shader *nir = ir3_tgsi_to_nir(toks);
+	s.compiler = ir3_compiler_create(gpu_id);
+	s.nir = ir3_optimize_nir(&s, nir, NULL);

 	v.key = key;
 	v.shader = &s;
@@ -231,11 +252,8 @@ int main(int argc, char **argv)
 		break;
 	}

-	/* TODO cmdline option to target different gpus: */
-	compiler = ir3_compiler_create(320);
-
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v);
+	ret = ir3_compile_shader_nir(s.compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
```
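Both new options are parsed with `strtol(..., NULL, 0)`, so base 0 auto-detects the radix and decimal, octal, and hex spellings all work. A quick standalone check of that behavior:

```c
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* base 0 auto-detects the radix, matching the option parsing above */
	printf("%ld\n", strtol("0x3", NULL, 0)); /* 3: clip planes 0 and 1 */
	printf("%ld\n", strtol("420", NULL, 0)); /* gpu-id 420 */
	return 0;
}
```

So an invocation along the lines of `ir3_compiler --gpu 420 --ucp 0x3 shader.glsl` (binary and file names shown purely for illustration) would target an a4xx gpu-id with the first two user clip planes enabled.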
```diff
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 13c395f3c..7a1812f25 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,11 +32,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
-#include "glsl/shader_enums.h"

 #include "freedreno_util.h"
@@ -51,7 +46,6 @@ struct ir3_compile {
 	struct ir3_compiler *compiler;

-	const struct tgsi_token *tokens;
 	struct nir_shader *s;

 	struct ir3 *ir;
@@ -80,8 +74,6 @@ struct ir3_compile {
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;

-	/* mapping from nir_variable to ir3_array: */
-	struct hash_table *var_ht;
 	unsigned num_arrays;

 	/* a common pattern for indirect addressing is to request the
@@ -97,9 +89,6 @@ struct ir3_compile {
 	 */
 	struct hash_table *block_ht;

-	/* for calculating input/output positions/linkages: */
-	unsigned next_inloc;
-
 	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
 	 * so we need to use ldlv.u32 to load the varying directly:
 	 */
@@ -127,101 +116,12 @@ struct ir3_compile {
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);

-static struct nir_shader *to_nir(const struct tgsi_token *tokens)
-{
-	struct nir_shader_compiler_options options = {
-			.lower_fpow = true,
-			.lower_fsat = true,
-			.lower_scmp = true,
-			.lower_flrp = true,
-			.native_integers = true,
-	};
-	bool progress;
-
-	struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
-
-	do {
-		progress = false;
-
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
-
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
-
-	} while (progress);
-
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	return s;
-}
-
-/* TODO nir doesn't lower everything for us yet, but ideally it would: */
-static const struct tgsi_token *
-lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens,
-		struct ir3_shader_variant *so)
-{
-	struct tgsi_shader_info info;
-	struct tgsi_lowering_config lconfig = {
-			.color_two_side = so->key.color_two_side,
-			.lower_FRC = true,
-	};
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		lconfig.saturate_s = so->key.fsaturate_s;
-		lconfig.saturate_t = so->key.fsaturate_t;
-		lconfig.saturate_r = so->key.fsaturate_r;
-		break;
-	case SHADER_VERTEX:
-		lconfig.saturate_s = so->key.vsaturate_s;
-		lconfig.saturate_t = so->key.vsaturate_t;
-		lconfig.saturate_r = so->key.vsaturate_r;
-		break;
-	}
-
-	if (ctx->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		lconfig.lower_TXP = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
-	}
-
-	return tgsi_transform_lowering(&lconfig, tokens, &info);
-}

 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
-	const struct tgsi_token *lowered_tokens;

 	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
@@ -238,23 +138,33 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->compiler = compiler;
 	ctx->ir = so->ir;
 	ctx->so = so;
-	ctx->next_inloc = 8;
 	ctx->def_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	ctx->var_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	ctx->addr_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);

-	lowered_tokens = lower_tgsi(ctx, tokens, so);
-	if (!lowered_tokens)
-		lowered_tokens = tokens;
-	ctx->s = to_nir(lowered_tokens);
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}

-	if (lowered_tokens != tokens)
-		free((void *)lowered_tokens);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.binning_pass, so->key.color_two_side,
+			so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}

 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
@@ -263,7 +173,7 @@ compile_init(struct ir3_compiler *compiler,
 	 *    num_uniform * vec4  -  user consts
 	 *    4 * vec4            -  UBO addresses
 	 *    if (vertex shader) {
-	 *        1 * vec4        -  driver params (IR3_DP_*)
+	 *        N * vec4        -  driver params (IR3_DP_*)
 	 *        1 * vec4        -  stream-out addresses
 	 *    }
 	 *
@@ -275,8 +185,8 @@ compile_init(struct ir3_compiler *compiler,
 	so->first_immediate += 4;

 	if (so->type == SHADER_VERTEX) {
-		/* one (vec4) slot for driver params (see ir3_driver_param): */
-		so->first_immediate++;
+		/* driver params (see ir3_driver_param): */
+		so->first_immediate += IR3_DP_COUNT/4;  /* convert to vec4 */
 		/* one (vec4) slot for stream-output base addresses: */
 		so->first_immediate++;
 	}
@@ -306,206 +216,26 @@ compile_free(struct ir3_compile *ctx)
 	ralloc_free(ctx);
 }

-/* global per-array information: */
-struct ir3_array {
-	unsigned length, aid;
-};
-
-/* per-block array state: */
-struct ir3_array_value {
-	/* TODO drop length/aid, and just have ptr back to ir3_array */
-	unsigned length, aid;
-	/* initial array element values are phi's, other than for the
-	 * entry block.  The phi src's get added later in a resolve step
-	 * after we have visited all the blocks, to account for back
-	 * edges in the cfg.
-	 */
-	struct ir3_instruction **phis;
-	/* current array element values (as block is processed).  When
-	 * the array phi's are resolved, it will contain the array state
-	 * at exit of block, so successor blocks can use it to add their
-	 * phi srcs.
-	 */
-	struct ir3_instruction *arr[];
-};
-
-/* track array assignments per basic block.  When an array is read
- * outside of the same basic block, we can use NIR's dominance-frontier
- * information to figure out where phi nodes are needed.
- */
-struct ir3_nir_block_data {
-	unsigned foo;
-	/* indexed by array-id (aid): */
-	struct ir3_array_value *arrs[];
-};
-
-static struct ir3_nir_block_data *
-get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	if (!block->bd) {
-		struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
-				((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
-		block->bd = bd;
-	}
-	return block->bd;
-}
-
 static void
 declare_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
 	struct ir3_array *arr = ralloc(ctx, struct ir3_array);
+	arr->id = ++ctx->num_arrays;
 	arr->length = length;
-	arr->aid = ++ctx->num_arrays;
-	_mesa_hash_table_insert(ctx->var_ht, var, arr);
-}
-
-static nir_block *
-nir_block_pred(nir_block *block)
-{
-	assert(block->predecessors->entries < 2);
-	if (block->predecessors->entries == 0)
-		return NULL;
-	return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+	arr->var = var;
+	list_addtail(&arr->node, &ctx->ir->array_list);
 }

-static struct ir3_array_value *
+static struct ir3_array *
 get_var(struct ir3_compile *ctx, nir_variable *var)
 {
-	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
-	struct ir3_block *block = ctx->block;
-	struct ir3_nir_block_data *bd = get_block_data(ctx, block);
-	struct ir3_array *arr = entry->data;
-
-	if (!bd->arrs[arr->aid]) {
-		struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
-				(arr->length * sizeof(av->arr[0])));
-		struct ir3_array_value *defn = NULL;
-		nir_block *pred_block;
-
-		av->length = arr->length;
-		av->aid = arr->aid;
-
-		/* For loops, we have to consider that we have not visited some
-		 * of the blocks who should feed into the phi (ie. back-edges in
-		 * the cfg).. for example:
-		 *
-		 *   loop {
-		 *      block { load_var; ... }
-		 *      if then block {} else block {}
-		 *      block { store_var; ... }
-		 *      if then block {} else block {}
-		 *      block {...}
-		 *   }
-		 *
-		 * We can skip the phi if we can chase the block predecessors
-		 * until finding the block previously defining the array without
-		 * crossing a block that has more than one predecessor.
-		 *
-		 * Otherwise create phi's and resolve them as a post-pass after
-		 * all the blocks have been visited (to handle back-edges).
-		 */
-
-		for (pred_block = block->nblock;
-				pred_block && (pred_block->predecessors->entries < 2) && !defn;
-				pred_block = nir_block_pred(pred_block)) {
-			struct ir3_block *pblock = get_block(ctx, pred_block);
-			struct ir3_nir_block_data *pbd = pblock->bd;
-			if (!pbd)
-				continue;
-			defn = pbd->arrs[arr->aid];
-		}
-
-		if (defn) {
-			/* only one possible definer: */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = defn->arr[i];
-		} else if (pred_block) {
-			/* not the first block, and multiple potential definers: */
-			av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
-
-			for (unsigned i = 0; i < arr->length; i++) {
-				struct ir3_instruction *phi;
-
-				phi = ir3_instr_create2(block, -1, OPC_META_PHI,
-						1 + ctx->impl->num_blocks);
-				ir3_reg_create(phi, 0, 0);  /* dst */
-
-				/* phi's should go at head of block: */
-				list_delinit(&phi->node);
-				list_add(&phi->node, &block->instr_list);
-
-				av->phis[i] = av->arr[i] = phi;
-			}
-		} else {
-			/* Some shaders end up reading array elements without
-			 * first writing.. so initialize things to prevent null
-			 * instr ptrs later:
-			 */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = create_immed(block, 0);
-		}
-
-		bd->arrs[arr->aid] = av;
-	}
-
-	return bd->arrs[arr->aid];
-}
-
-static void
-add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
-		struct ir3_array_value *av, BITSET_WORD *visited)
-{
-	struct ir3_block *block;
-	struct ir3_nir_block_data *bd;
-
-	if (BITSET_TEST(visited, nblock->index))
-		return;
-
-	BITSET_SET(visited, nblock->index);
-
-	block = get_block(ctx, nblock);
-	bd = block->bd;
-
-	if (bd && bd->arrs[av->aid]) {
-		struct ir3_array_value *dav = bd->arrs[av->aid];
-		for (unsigned i = 0; i < av->length; i++) {
-			ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
-					dav->arr[i];
-		}
-	} else {
-		/* didn't find defn, recurse predecessors: */
-		struct set_entry *entry;
-		set_foreach(nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
-	}
-}
-
-static void
-resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	struct ir3_nir_block_data *bd = block->bd;
-	unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
-
-	if (!bd)
-		return;
-
-	/* TODO use nir dom_frontier to help us with this? */
-
-	for (unsigned i = 1; i <= ctx->num_arrays; i++) {
-		struct ir3_array_value *av = bd->arrs[i];
-		BITSET_WORD visited[bitset_words];
-		struct set_entry *entry;
-
-		if (!(av && av->phis))
-			continue;
-
-		memset(visited, 0, sizeof(visited));
-		set_foreach(block->nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		if (arr->var == var)
+			return arr;
 	}
+	compile_error(ctx, "bogus var: %s\n", var->name);
+	return NULL;
 }

 /* allocate a n element value array (to be populated by caller) and
@@ -523,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
 static struct ir3_instruction **
 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
 {
+	compile_assert(ctx, dst->is_ssa);
 	if (dst->is_ssa) {
 		return __get_dst(ctx, &dst->ssa, n);
 	} else {
@@ -540,6 +271,7 @@ static struct ir3_instruction **
 get_src(struct ir3_compile *ctx, nir_src *src)
 {
 	struct hash_entry *entry;
+	compile_assert(ctx, src->is_ssa);
 	if (src->is_ssa) {
 		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
 	} else {
@@ -596,12 +328,17 @@ static struct ir3_instruction *
 get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
 {
 	struct ir3_instruction *addr;
-	struct hash_entry *entry;
-	entry = _mesa_hash_table_search(ctx->addr_ht, src);
-	if (entry)
-		return entry->data;

-	/* TODO do we need to cache per block? */
+	if (!ctx->addr_ht) {
+		ctx->addr_ht = _mesa_hash_table_create(ctx,
+				_mesa_hash_pointer, _mesa_key_pointer_equal);
+	} else {
+		struct hash_entry *entry;
+		entry = _mesa_hash_table_search(ctx->addr_ht, src);
+		if (entry)
+			return entry->data;
+	}
+
 	addr = create_addr(ctx->block, src);
 	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
@@ -640,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 }

 static struct ir3_instruction *
-create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
+create_uniform_indirect(struct ir3_compile *ctx, int n,
 		struct ir3_instruction *address)
 {
 	struct ir3_instruction *mov;
@@ -649,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, int n,
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
-	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
+	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

 	ir3_instr_set_address(mov, address);
@@ -674,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 }

 static struct ir3_instruction *
-create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
+create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 		struct ir3_instruction *address, struct ir3_instruction *collect)
 {
 	struct ir3_block *block = ctx->block;
@@ -688,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
 	src->instr = collect;
 	src->size = arrsz;
-	src->offset = n;
+	src->array.offset = n;

 	ir3_instr_set_address(mov, address);

 	return mov;
 }

+/* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
-		struct ir3_instruction *src, struct ir3_instruction *address,
-		struct ir3_instruction *collect)
+create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, 1, 0);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);
+	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	src->instr = arr->last_write;
+	src->size = arr->length;
+	src->array.id = arr->id;
+	src->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_access = mov;
+
+	return mov;
+}
+
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
@@ -707,14 +472,18 @@ create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 	mov = ir3_instr_create(block, 1, 0);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
-	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
-	dst->size = arrsz;
-	dst->offset = n;
+	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	dst->instr = arr->last_access;
+	dst->size = arr->length;
+	dst->array.id = arr->id;
+	dst->array.offset = n;
 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-	mov->fanin = collect;

 	ir3_instr_set_address(mov, address);

+	arr->last_write = arr->last_access = mov;
+
 	return mov;
 }
@@ -731,11 +500,12 @@ create_input(struct ir3_block *block, unsigned n)
 }

 static struct ir3_instruction *
-create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
+create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *instr;
-	struct ir3_instruction *inloc = create_immed(block, n);
+	/* actual inloc is assigned and fixed up later: */
+	struct ir3_instruction *inloc = create_immed(block, 0);

 	if (use_ldlv) {
 		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
@@ -786,6 +556,10 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
 	}
 }

+/* NOTE: this creates the "TGSI" style fragface (ie. input slot
+ * VARYING_SLOT_FACE).  For NIR style nir_intrinsic_load_front_face
+ * we can just use the value from hw directly (since it is boolean)
+ */
 static struct ir3_instruction *
 create_frag_face(struct ir3_compile *ctx, unsigned comp)
 {
@@ -828,7 +602,9 @@ static struct ir3_instruction *
 create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
 {
 	/* first four vec4 sysval's reserved for UBOs: */
-	unsigned r = regid(ctx->so->first_driver_param + 4, dp);
+	/* NOTE: dp is in scalar, but there can be >4 dp components: */
+	unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF;
+	unsigned r = regid(n + dp / 4, dp % 4);
 	return create_uniform(ctx, r);
 }
@@ -1184,6 +960,33 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
 		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
 		break;

+	case nir_op_bit_count:
+		dst[0] = ir3_CBITS_B(b, src[0], 0);
+		break;
+	case nir_op_ifind_msb: {
+		struct ir3_instruction *cmp;
+		dst[0] = ir3_CLZ_S(b, src[0], 0);
+		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+		cmp->cat2.condition = IR3_COND_GE;
+		dst[0] = ir3_SEL_B32(b,
+				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+				cmp, 0, dst[0], 0);
+		break;
+	}
+	case nir_op_ufind_msb:
+		dst[0] = ir3_CLZ_B(b, src[0], 0);
+		dst[0] = ir3_SEL_B32(b,
+				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+				src[0], 0, dst[0], 0);
+		break;
+	case nir_op_find_lsb:
+		dst[0] = ir3_BFREV_B(b, src[0], 0);
+		dst[0] = ir3_CLZ_B(b, dst[0], 0);
+		break;
+	case nir_op_bitfield_reverse:
+		dst[0] = ir3_BFREV_B(b, src[0], 0);
+		break;
+
 	default:
 		compile_error(ctx, "Unhandled ALU op: %s\n",
 				nir_op_infos[alu->op].name);
@@ -1198,9 +1001,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 {
 	struct ir3_block *b = ctx->block;
 	struct ir3_instruction *addr, *src0, *src1;
+	nir_const_value *const_offset;
 	/* UBO addresses are the first driver params: */
-	unsigned ubo = regid(ctx->so->first_driver_param, 0);
-	unsigned off = intr->const_index[0];
+	unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
+	int off = 0;

 	/* First src is ubo index, which could either be an immed or not: */
 	src0 = get_src(ctx, &intr->src[0])[0];
@@ -1211,7 +1015,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
 	}

-	if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	if (const_offset) {
+		off += const_offset->u[0];
+	} else {
 		/* For load_ubo_indirect, second src is indirect offset: */
 		src1 = get_src(ctx, &intr->src[1])[0];
@@ -1240,12 +1047,12 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 /* handles array reads: */
 static void
-emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
+	struct ir3_array *arr = get_var(ctx, dvar->var);

 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1256,19 +1063,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = arr->arr[n];
+			dst[i] = create_var_load(ctx, arr, n, NULL);
 		}
 		break;
 	case nir_deref_array_type_indirect: {
 		/* for indirect, we need to collect all the array elements: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
 		struct ir3_instruction *addr =
 				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
+			dst[i] = create_var_load(ctx, arr, n, addr);
 		}
 		break;
 	}
@@ -1281,12 +1086,13 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 /* handles array writes: */
 static void
-emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
-	struct ir3_instruction **src;
+	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_instruction *addr, **src;
+	unsigned wrmask = nir_intrinsic_write_mask(intr);

 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1295,71 +1101,38 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)

 	switch (darr->deref_array_type) {
 	case nir_deref_array_type_direct:
-		/* direct access does not require anything special: */
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-			arr->arr[n] = src[i];
-		}
+		addr = NULL;
 		break;
-	case nir_deref_array_type_indirect: {
-		/* for indirect, create indirect-store and fan that out: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
-		struct ir3_instruction *addr =
-				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			struct ir3_instruction *store;
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-
-			store = create_indirect_store(ctx, arr->length,
-					n, src[i], addr, collect);
-
-			store->fanin->fi.aid = arr->aid;
-
-			/* TODO: probably split this out to be used for
-			 * store_output_indirect? or move this into
-			 * create_indirect_store()?
-			 */
-			for (int j = i; j < arr->length; j += intr->num_components) {
-				struct ir3_instruction *split;
-
-				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
-				split->fo.off = j;
-				ir3_reg_create(split, 0, 0);
-				ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
-
-				arr->arr[j] = split;
-			}
-		}
-		/* fixup fanout/split neighbors: */
-		for (int i = 0; i < arr->length; i++) {
-			arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
-					arr->arr[i+1] : NULL;
-			arr->arr[i]->cp.left = (i > 0) ?
-					arr->arr[i-1] : NULL;
-		}
+	case nir_deref_array_type_indirect:
+		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		break;
-	}
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
 		break;
 	}
+
+	for (int i = 0; i < intr->num_components; i++) {
+		if (!(wrmask & (1 << i)))
+			continue;
+		unsigned n = darr->base_offset * 4 + i;
+		compile_assert(ctx, n < arr->length);
+		create_var_store(ctx, arr, n, src[i], addr);
+	}
 }

-static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
+static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 		struct ir3_instruction *instr)
 {
 	struct ir3_shader_variant *so = ctx->so;
 	unsigned r = regid(so->inputs_count, 0);
 	unsigned n = so->inputs_count++;

-	so->inputs[n].semantic = ir3_semantic_name(name, 0);
+	so->inputs[n].sysval = true;
+	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = 1;
 	so->inputs[n].regid = r;
-	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
+	so->inputs[n].interpolate = INTERP_QUALIFIER_FLAT;
 	so->total_in++;

 	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
@@ -1367,12 +1140,13 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 }

 static void
-emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
 	struct ir3_instruction **dst, **src;
 	struct ir3_block *b = ctx->block;
-	unsigned idx = intr->const_index[0];
+	nir_const_value *const_offset;
+	int idx;

 	if (info->has_dest) {
 		dst = get_dst(ctx, &intr->dest, intr->num_components);
@@ -1382,52 +1156,65 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)

 	switch (intr->intrinsic) {
 	case nir_intrinsic_load_uniform:
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i;
-			dst[i] = create_uniform(ctx, n);
+		idx = nir_intrinsic_base(intr);
+		const_offset = nir_src_as_const_value(intr->src[0]);
+		if (const_offset) {
+			idx += const_offset->u[0];
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i;
+				dst[i] = create_uniform(ctx, n);
+			}
+		} else {
+			src = get_src(ctx, &intr->src[0]);
+			for (int i = 0; i < intr->num_components; i++) {
+				int n = idx * 4 + i;
+				dst[i] = create_uniform_indirect(ctx, n,
+						get_addr(ctx, src[0]));
+			}
+			/* NOTE: if relative addressing is used, we set
+			 * constlen in the compiler (to worst-case value)
+			 * since we don't know in the assembler what the max
+			 * addr reg value can be:
+			 */
+			ctx->so->constlen = ctx->s->num_uniforms;
 		}
 		break;
-	case nir_intrinsic_load_uniform_indirect:
-		src = get_src(ctx, &intr->src[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i;
-			dst[i] = create_uniform_indirect(ctx, n,
-					get_addr(ctx, src[0]));
-		}
-		/* NOTE: if relative addressing is used, we set constlen in
-		 * the compiler (to worst-case value) since we don't know in
-		 * the assembler what the max addr reg value can be:
-		 */
-		ctx->so->constlen = ctx->s->num_uniforms;
-		break;
 	case nir_intrinsic_load_ubo:
-	case nir_intrinsic_load_ubo_indirect:
 		emit_intrinsic_load_ubo(ctx, intr, dst);
 		break;
 	case nir_intrinsic_load_input:
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i;
-			dst[i] = ctx->ir->inputs[n];
-		}
-		break;
-	case nir_intrinsic_load_input_indirect:
-		src = get_src(ctx, &intr->src[0]);
-		struct ir3_instruction *collect =
-				create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
-		struct ir3_instruction *addr = get_addr(ctx, src[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i;
-			dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
-					n, addr, collect);
+		idx = nir_intrinsic_base(intr);
+		const_offset = nir_src_as_const_value(intr->src[0]);
+		if (const_offset) {
+			idx += const_offset->u[0];
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i;
+				dst[i] = ctx->ir->inputs[n];
+			}
+		} else {
+			src = get_src(ctx, &intr->src[0]);
+			struct ir3_instruction *collect =
+					create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
+			struct ir3_instruction *addr = get_addr(ctx, src[0]);
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i;
+				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+						n, addr, collect);
+			}
 		}
 		break;
 	case nir_intrinsic_load_var:
-		emit_intrinisic_load_var(ctx, intr, dst);
+		emit_intrinsic_load_var(ctx, intr, dst);
 		break;
 	case nir_intrinsic_store_var:
-		emit_intrinisic_store_var(ctx, intr);
+		emit_intrinsic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
+		idx = nir_intrinsic_base(intr);
+		const_offset = nir_src_as_const_value(intr->src[1]);
+		compile_assert(ctx, const_offset != NULL);
+		idx += const_offset->u[0];
+
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
@@ -1437,27 +1224,42 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	case nir_intrinsic_load_base_vertex:
 		if (!ctx->basevertex) {
 			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
-			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
+			add_sysval_input(ctx, SYSTEM_VALUE_BASE_VERTEX,
 					ctx->basevertex);
 		}
 		dst[0] = ctx->basevertex;
 		break;
 	case nir_intrinsic_load_vertex_id_zero_base:
 		if (!ctx->vertex_id) {
-			ctx->vertex_id = create_input(ctx->block, 0);
-			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
+			ctx->vertex_id = create_input(b, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
 					ctx->vertex_id);
 		}
 		dst[0] = ctx->vertex_id;
 		break;
 	case nir_intrinsic_load_instance_id:
 		if (!ctx->instance_id) {
-			ctx->instance_id = create_input(ctx->block, 0);
-			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
+			ctx->instance_id = create_input(b, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
 					ctx->instance_id);
 		}
 		dst[0] = ctx->instance_id;
 		break;
+	case nir_intrinsic_load_user_clip_plane:
+		idx = nir_intrinsic_ucp_id(intr);
+		for (int i = 0; i < intr->num_components; i++) {
+			unsigned n = idx * 4 + i;
+			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+		}
+		break;
+	case nir_intrinsic_load_front_face:
+		if (!ctx->frag_face) {
+			ctx->so->frag_face = true;
+			ctx->frag_face = create_input(b, 0);
+			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+		}
+		dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
 		struct ir3_instruction *cond, *kill;
@@ -1547,10 +1349,10 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
 		unreachable("bad sampler_dim");
 	}

-	if (tex->is_shadow)
+	if (tex->is_shadow && tex->op != nir_texop_lod)
 		flags |= IR3_INSTR_S;

-	if (tex->is_array)
+	if (tex->is_array && tex->op != nir_texop_lod)
 		flags |= IR3_INSTR_A;

 	*flagsp = flags;
@@ -1606,7 +1408,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			ddy = get_src(ctx, &tex->src[i].src);
 			break;
 		default:
-			compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
+			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
 					tex->src[i].src_type);
 			return;
 		}
@@ -1618,11 +1420,13 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	case nir_texop_txl:      opc = OPC_SAML;     break;
 	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
 	case nir_texop_txf:      opc = OPC_ISAML;    break;
+	case nir_texop_lod:      opc = OPC_GETLOD;   break;
 	case nir_texop_txf_ms:
 	case nir_texop_txs:
-	case nir_texop_lod:
 	case nir_texop_tg4:
 	case nir_texop_query_levels:
+	case nir_texop_texture_samples:
+	case nir_texop_samples_identical:
 		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
 		return;
 	}
@@ -1664,10 +1468,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 		src0[nsrc0++] = create_immed(b, fui(0.5));
 	}

-	if (tex->is_shadow)
+	if (tex->is_shadow && tex->op != nir_texop_lod)
 		src0[nsrc0++] = compare;

-	if (tex->is_array)
+	if (tex->is_array && tex->op != nir_texop_lod)
 		src0[nsrc0++] = coord[coords];

 	if (has_proj) {
@@ -1716,7 +1520,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	case nir_type_int:
 		type = TYPE_S32;
 		break;
-	case nir_type_unsigned:
+	case nir_type_uint:
 	case nir_type_bool:
 		type = TYPE_U32;
 		break;
@@ -1724,12 +1528,26 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 		unreachable("bad dest_type");
 	}

+	if (opc == OPC_GETLOD)
+		type = TYPE_U32;
+
 	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
-			flags, tex->sampler_index, tex->sampler_index,
+			flags, tex->texture_index, tex->texture_index,
 			create_collect(b, src0, nsrc0),
 			create_collect(b, src1, nsrc1));

 	split_dest(b, dst, sam, 4);
+
+	/* GETLOD returns results in 4.8 fixed point */
+	if (opc == OPC_GETLOD) {
+		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+		compile_assert(ctx, tex->dest_type == nir_type_float);
+		for (i = 0; i < 2; i++) {
+			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+					factor, 0);
+		}
+	}
 }

 static void
@@ -1741,7 +1559,7 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
 	dst = get_dst(ctx, &tex->dest, 1);

 	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
-			tex->sampler_index, tex->sampler_index, NULL, NULL);
+			tex->texture_index, tex->texture_index, NULL, NULL);

 	/* even though there is only one component, since it ends
 	 * up in .z rather than .x, we need a split_dest()
@@ -1778,7 +1596,7 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 	lod = get_src(ctx, &tex->src[0].src)[0];

 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
-			tex->sampler_index, tex->sampler_index, lod, NULL);
+			tex->texture_index, tex->texture_index, lod, NULL);

 	split_dest(b, dst, sam, 4);
@@ -1840,8 +1658,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 		}
 	}
-
-	resolve_array_phis(ctx, block);
 }

 static void
@@ -1869,7 +1685,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		emit_alu(ctx, nir_instr_as_alu(instr));
 		break;
 	case nir_instr_type_intrinsic:
-		emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
 		break;
 	case nir_instr_type_load_const:
 		emit_load_const(ctx, nir_instr_as_load_const(instr));
@@ -1938,6 +1754,10 @@ emit_block(struct ir3_compile *ctx, nir_block *nblock)
 	ctx->block = block;
 	list_addtail(&block->node, &ctx->ir->block_list);

+	/* re-emit addr register in each block if needed: */
+	_mesa_hash_table_destroy(ctx->addr_ht, NULL);
+	ctx->addr_ht = NULL;
+
 	nir_foreach_instr(nblock, instr) {
 		emit_instr(ctx, instr);
 		if (ctx->error)
@@ -2020,7 +1840,7 @@ emit_stream_out(struct ir3_compile *ctx)
 	 * of the shader:
 	 */
 	vtxcnt = create_input(ctx->in_block, 0);
-	add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);

 	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
@@ -2064,7 +1884,7 @@ emit_stream_out(struct ir3_compile *ctx)
 		unsigned stride = strmout->stride[i];
 		struct ir3_instruction *base, *off;

-		base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+		base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i));

 		/* 24-bit should be enough: */
 		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
@@ -2098,6 +1918,8 @@ emit_stream_out(struct ir3_compile *ctx)
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+	nir_metadata_require(impl, nir_metadata_block_index);
+
 	emit_cf_list(ctx, &impl->body);
 	emit_block(ctx, impl->end_block);
@@ -2132,90 +1954,73 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 	struct ir3_shader_variant *so = ctx->so;
 	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
 	unsigned ncomp = glsl_get_components(in->type);
-	/* XXX: map loc slots to semantics */
-	unsigned semantic_name = in->data.location;
-	unsigned semantic_index = in->data.index;
 	unsigned n = in->data.driver_location;
+	unsigned slot = in->data.location;

-	DBG("; in: %u:%u, len=%ux%u, loc=%u",
-			semantic_name, semantic_index, array_len,
-			ncomp, n);
+	DBG("; in: slot=%u, len=%ux%u, drvloc=%u",
+			slot, array_len, ncomp, n);

-	so->inputs[n].semantic =
-			ir3_semantic_name(semantic_name, semantic_index);
+	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = (1 << ncomp) - 1;
-	so->inputs[n].inloc = ctx->next_inloc;
-	so->inputs[n].interpolate = 0;
 	so->inputs_count = MAX2(so->inputs_count, n + 1);
+	so->inputs[n].interpolate = in->data.interpolation;

-	/* the fdN_program_emit() code expects tgsi consts here, so map
-	 * things back to tgsi for now:
-	 */
-	switch (in->data.interpolation) {
-	case INTERP_QUALIFIER_FLAT:
-		so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
-		break;
-	case INTERP_QUALIFIER_NOPERSPECTIVE:
-		so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
-		break;
-	case INTERP_QUALIFIER_SMOOTH:
-		so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
-		break;
-	}
-
-	for (int i = 0; i < ncomp; i++) {
-		struct ir3_instruction *instr = NULL;
-		unsigned idx = (n * 4) + i;
+	if (ctx->so->type == SHADER_FRAGMENT) {
+		for (int i = 0; i < ncomp; i++) {
+			struct ir3_instruction *instr = NULL;
+			unsigned idx = (n * 4) + i;

-		if (ctx->so->type == SHADER_FRAGMENT) {
-			if (semantic_name == TGSI_SEMANTIC_POSITION) {
+			if (slot == VARYING_SLOT_POS) {
 				so->inputs[n].bary = false;
 				so->frag_coord = true;
 				instr = create_frag_coord(ctx, i);
-			} else if (semantic_name == TGSI_SEMANTIC_FACE) {
+			} else if (slot == VARYING_SLOT_FACE) {
 				so->inputs[n].bary = false;
 				so->frag_face = true;
 				instr = create_frag_face(ctx, i);
 			} else {
 				bool use_ldlv = false;

-				/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
-				 * from the semantic name:
+				/* detect the special case for front/back colors where
+				 * we need to do flat vs smooth shading depending on
+				 * rast state:
 				 */
-				if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
-						((semantic_name == TGSI_SEMANTIC_COLOR) ||
-							(semantic_name == TGSI_SEMANTIC_BCOLOR)))
-					so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;
+				if (in->data.interpolation == INTERP_QUALIFIER_NONE) {
+					switch (slot) {
+					case VARYING_SLOT_COL0:
+					case VARYING_SLOT_COL1:
+					case VARYING_SLOT_BFC0:
+					case VARYING_SLOT_BFC1:
+						so->inputs[n].rasterflat = true;
+						break;
+					default:
+						break;
+					}
+				}

 				if (ctx->flat_bypass) {
-					/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
-					 * from the semantic name:
-					 */
-					switch (so->inputs[n].interpolate) {
-					case TGSI_INTERPOLATE_COLOR:
-						if (!ctx->so->key.rasterflat)
-							break;
-						/* fallthrough */
-					case TGSI_INTERPOLATE_CONSTANT:
+					if ((so->inputs[n].interpolate == INTERP_QUALIFIER_FLAT) ||
+							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
 						use_ldlv = true;
-						break;
-					}
 				}

 				so->inputs[n].bary = true;

-				instr = create_frag_input(ctx,
-						so->inputs[n].inloc + i - 8, use_ldlv);
+				instr = create_frag_input(ctx, use_ldlv);
 			}
-		} else {
-			instr = create_input(ctx->block, idx);
-		}

-		ctx->ir->inputs[idx] = instr;
+			ctx->ir->inputs[idx] = instr;
+		}
+	} else if (ctx->so->type == SHADER_VERTEX) {
+		for (int i = 0; i < ncomp; i++) {
+			unsigned idx = (n * 4) + i;
+			ctx->ir->inputs[idx] = create_input(ctx->block, idx);
+		}
+	} else {
+		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
 	}

 	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
-		ctx->next_inloc += ncomp;
 		so->total_in += ncomp;
 	}
 }
@@ -2226,56 +2031,62 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 	struct ir3_shader_variant *so = ctx->so;
 	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
 	unsigned ncomp = glsl_get_components(out->type);
-	/* XXX: map loc slots to semantics */
-	unsigned semantic_name = out->data.location;
-	unsigned semantic_index = out->data.index;
 	unsigned n = out->data.driver_location;
+	unsigned slot = out->data.location;
 	unsigned comp = 0;

-	DBG("; out: %u:%u, len=%ux%u, loc=%u",
-			semantic_name, semantic_index, array_len,
-			ncomp, n);
+	DBG("; out: slot=%u, len=%ux%u, drvloc=%u",
+			slot, array_len, ncomp, n);

-	if (ctx->so->type == SHADER_VERTEX) {
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_POSITION:
+	if (ctx->so->type == SHADER_FRAGMENT) {
+		switch (slot) {
+		case FRAG_RESULT_DEPTH:
+			comp = 2;  /* tgsi will write to .z component */
 			so->writes_pos = true;
 			break;
-		case TGSI_SEMANTIC_PSIZE:
-			so->writes_psize = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-		case TGSI_SEMANTIC_BCOLOR:
-		case TGSI_SEMANTIC_GENERIC:
-		case TGSI_SEMANTIC_FOG:
-		case TGSI_SEMANTIC_TEXCOORD:
+		case FRAG_RESULT_COLOR:
+			so->color0_mrt = 1;
 			break;
 		default:
-			compile_error(ctx, "unknown VS semantic name: %s\n",
-					tgsi_semantic_names[semantic_name]);
+			if (slot >= FRAG_RESULT_DATA0)
+				break;
+			compile_error(ctx, "unknown FS output name: %s\n",
+					gl_frag_result_name(slot));
 		}
-	} else {
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_POSITION:
-			comp = 2;  /* tgsi will write to .z component */
+	} else if (ctx->so->type == SHADER_VERTEX) {
+		switch (slot) {
+		case VARYING_SLOT_POS:
 			so->writes_pos = true;
 			break;
-		case TGSI_SEMANTIC_COLOR:
-			if (semantic_index == -1) {
-				semantic_index = 0;
-				so->color0_mrt = 1;
-			}
+		case VARYING_SLOT_PSIZ:
+			so->writes_psize = true;
 			break;
+		case VARYING_SLOT_COL0:
+		case VARYING_SLOT_COL1:
+		case VARYING_SLOT_BFC0:
+		case VARYING_SLOT_BFC1:
+		case VARYING_SLOT_FOGC:
+		case VARYING_SLOT_CLIP_DIST0:
+		case VARYING_SLOT_CLIP_DIST1:
+			break;
+		case VARYING_SLOT_CLIP_VERTEX:
+			/* handled entirely in nir_lower_clip: */
+			return;
 		default:
-			compile_error(ctx, "unknown FS semantic name: %s\n",
-					tgsi_semantic_names[semantic_name]);
+			if (slot >= VARYING_SLOT_VAR0)
+				break;
+			if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
+				break;
+			compile_error(ctx, "unknown VS output name: %s\n",
+					gl_varying_slot_name(slot));
 		}
+	} else {
+		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
 	}

 	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

-	so->outputs[n].semantic =
-			ir3_semantic_name(semantic_name, semantic_index);
+	so->outputs[n].slot = slot;
 	so->outputs[n].regid = regid(n, comp);
 	so->outputs_count = MAX2(so->outputs_count, n + 1);
@@ -2293,10 +2104,10 @@ emit_instructions(struct ir3_compile *ctx)
 	nir_function_impl *fxn = NULL;

 	/* Find the main function: */
-	nir_foreach_overload(ctx->s, overload) {
-		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
-		compile_assert(ctx, overload->impl);
-		fxn = overload->impl;
+	nir_foreach_function(ctx->s, function) {
+		compile_assert(ctx, strcmp(function->name, "main") == 0);
+		compile_assert(ctx, function->impl);
+		fxn = function->impl;
 		break;
 	}
@@ -2312,7 +2123,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);

 	/* Create inputs in first block: */
-	ctx->block = get_block(ctx, fxn->start_block);
+	ctx->block = get_block(ctx, nir_start_block(fxn));
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
@@ -2334,17 +2145,23 @@ emit_instructions(struct ir3_compile *ctx)
 	}

 	/* Setup inputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
+	nir_foreach_variable(var, &ctx->s->inputs) {
 		setup_input(ctx, var);
 	}

 	/* Setup outputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
+	nir_foreach_variable(var, &ctx->s->outputs) {
 		setup_output(ctx, var);
 	}

-	/* Setup variables (which should only be arrays): */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
+	/* Setup global variables (which should only be arrays): */
+	nir_foreach_variable(var, &ctx->s->globals) {
+		declare_var(ctx, var);
+	}
+
+	/* Setup local variables (which should only be arrays): */
+	/* NOTE: need to do something more clever when we support >1 fxn */
+	nir_foreach_variable(var, &fxn->locals) {
 		declare_var(ctx, var);
 	}
@@ -2436,12 +2253,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	struct ir3_compile *ctx;
 	struct ir3 *ir;
 	struct ir3_instruction **inputs;
-	unsigned i, j, actual_in;
+	unsigned i, j, actual_in, inloc;
 	int ret = 0, max_bary;

 	assert(!so->ir);

-	ctx = compile_init(compiler, so, so->shader->tokens);
+	ctx = compile_init(compiler, so);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
@@ -2468,12 +2285,10 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	/* at this point, for binning pass, throw away unneeded outputs: */
 	if (so->key.binning_pass) {
 		for (i = 0, j = 0; i < so->outputs_count; i++) {
-			unsigned name = sem2name(so->outputs[i].semantic);
-			unsigned idx = sem2idx(so->outputs[i].semantic);
+			unsigned slot = so->outputs[i].slot;

 			/* throw away everything but first position/psize */
-			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
-					(name == TGSI_SEMANTIC_PSIZE))) {
+			if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
 				if (i != j) {
 					so->outputs[j] = so->outputs[i];
 					ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
@@ -2558,13 +2373,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		ir3_print(ir);
 	}

-	ir3_legalize(ir, &so->has_samp, &max_bary);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER LEGALIZE:\n");
-		ir3_print(ir);
-	}
-
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
 		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
@@ -2572,38 +2380,52 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		 * but what we give the hw is the scalar register:
 		 */
 		if ((so->type == SHADER_FRAGMENT) &&
-			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+			(so->outputs[i].slot == FRAG_RESULT_DEPTH))
 			so->outputs[i].regid += 2;
 	}

 	/* Note that some or all channels of an input may be unused: */
 	actual_in = 0;
+	inloc = 0;
 	for (i = 0; i < so->inputs_count; i++) {
 		unsigned j, regid = ~0, compmask = 0;
 		so->inputs[i].ncomp = 0;
+		so->inputs[i].inloc = inloc + 8;
 		for (j = 0; j < 4; j++) {
 			struct ir3_instruction *in = inputs[(i*4) + j];
-			if (in) {
+			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
 				compmask |= (1 << j);
 				regid = in->regs[0]->num - j;
 				actual_in++;
 				so->inputs[i].ncomp++;
+				if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
+					/* assign inloc: */
+					assert(in->regs[1]->flags & IR3_REG_IMMED);
+					in->regs[1]->iim_val = inloc++;
+				}
 			}
 		}
+		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary)
+			so->varying_in++;
 		so->inputs[i].regid = regid;
 		so->inputs[i].compmask = compmask;
 	}

-	/* fragment shader always gets full vec4's even if it doesn't
-	 * fetch all components, but vertex shader we need to update
-	 * with the actual number of components fetch, otherwise thing
-	 * will hang due to mismaptch between VFD_DECODE's and
-	 * TOTALATTRTOVS
+	/* We need to do legalize after (for frag shader's) the "bary.f"
+	 * offsets (inloc) have been assigned.
 	 */
+	ir3_legalize(ir, &so->has_samp, &max_bary);
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER LEGALIZE:\n");
+		ir3_print(ir);
+	}
+
+	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
 	if (so->type == SHADER_VERTEX)
 		so->total_in = actual_in;
 	else
-		so->total_in = align(max_bary + 1, 4);
+		so->total_in = max_bary + 1;

 out:
 	if (ret) {
```
FRAG_RESULT_DEPTH)) so->outputs[i].regid += 2; } /* Note that some or all channels of an input may be unused: */ actual_in = 0; + inloc = 0; for (i = 0; i < so->inputs_count; i++) { unsigned j, regid = ~0, compmask = 0; so->inputs[i].ncomp = 0; + so->inputs[i].inloc = inloc + 8; for (j = 0; j < 4; j++) { struct ir3_instruction *in = inputs[(i*4) + j]; - if (in) { + if (in && !(in->flags & IR3_INSTR_UNUSED)) { compmask |= (1 << j); regid = in->regs[0]->num - j; actual_in++; so->inputs[i].ncomp++; + if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) { + /* assign inloc: */ + assert(in->regs[1]->flags & IR3_REG_IMMED); + in->regs[1]->iim_val = inloc++; + } } } + if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) + so->varying_in++; so->inputs[i].regid = regid; so->inputs[i].compmask = compmask; } - /* fragment shader always gets full vec4's even if it doesn't - * fetch all components, but vertex shader we need to update - * with the actual number of components fetch, otherwise thing - * will hang due to mismaptch between VFD_DECODE's and - * TOTALATTRTOVS + /* We need to do legalize after (for frag shader's) the "bary.f" + * offsets (inloc) have been assigned. */ + ir3_legalize(ir, &so->has_samp, &max_bary); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + + /* Note that actual_in counts inputs that are not bary.f'd for FS: */ if (so->type == SHADER_VERTEX) so->total_in = actual_in; else - so->total_in = align(max_bary + 1, 4); + so->total_in = max_bary + 1; out: if (ret) { diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c index be4e4e811..1cc211a76 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; struct ir3_instruction *src_instr = ssa(src); + + /* only if mov src is SSA (not const/immed): */ + if (!src_instr) + return false; + + /* no indirect: */ if (dst->flags & IR3_REG_RELATIV) return false; if (src->flags & IR3_REG_RELATIV) return false; + if (!allow_flags) if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) return false; - if (!src_instr) - return false; + /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; @@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n, unsigned valid_flags; flags = cp_flags(flags); + /* If destination is indirect, then source cannot be.. at least + * I don't think so.. 
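+	 * (presumably because both dst and src would then need an
+	 * a0.x-relative access encoded in the same instruction)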
+ */ + if ((instr->regs[0]->flags & IR3_REG_RELATIV) && + (flags & IR3_REG_RELATIV)) + return false; + /* clear flags that are 'ok' */ switch (instr->category) { case 1: - valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV; + valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; if (flags & ~valid_flags) return false; break; @@ -183,9 +196,14 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags) *dstflags ^= IR3_REG_SNEG; if (srcflags & IR3_REG_BNOT) *dstflags ^= IR3_REG_BNOT; -} -static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags); + *dstflags &= ~IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_CONST; + *dstflags |= srcflags & IR3_REG_IMMED; + *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; +} /* the "plain" MAD's (ie. the ones that don't shift first src prior to * multiply) can swap their first two srcs if src[0] is !CONST and @@ -206,52 +224,35 @@ static bool is_valid_mad(struct ir3_instruction *instr) static void reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) { - unsigned src_flags = 0, new_flags; - struct ir3_instruction *src_instr; + struct ir3_instruction *src = ssa(reg); - if (is_meta(instr)) { - /* meta instructions cannot fold up register - * flags.. they are usually src for texture - * fetch, etc, where we cannot specify abs/neg - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } - - src_instr = instr_cp(reg->instr, &src_flags); + if (is_eligible_mov(src, true)) { + /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - new_flags = reg->flags; - combine_flags(&new_flags, src_flags); + combine_flags(&new_flags, src_reg->flags); - reg->flags = new_flags; - reg->instr = src_instr; - - if (!valid_flags(instr, n, reg->flags)) { - /* insert an absneg.f */ - if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) { - debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS))); - reg->instr = ir3_ABSNEG_S(instr->block, - reg->instr, cp_flags(src_flags)); - } else { - debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))); - reg->instr = ir3_ABSNEG_F(instr->block, - reg->instr, cp_flags(src_flags)); + if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } + reg->flags = new_flags; + reg->instr = ssa(src_reg); } - reg->flags &= ~cp_flags(src_flags); - debug_assert(valid_flags(instr, n, reg->flags)); - /* send it through instr_cp() again since - * the absneg src might be a mov from const - * that could be cleaned up: - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } - if (is_same_type_mov(reg->instr)) { - struct ir3_register *src_reg = reg->instr->regs[1]; - unsigned new_flags = src_reg->flags; + src = ssa(reg); /* could be null for IR3_REG_ARRAY case */ + if (!src) + return; + } else if (is_same_type_mov(src) && + /* cannot collapse const/immed/etc into meta instrs: */ + !is_meta(instr)) { + /* immed/const/etc cases, which require some special handling: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - combine_flags(&new_flags, reg->flags); + combine_flags(&new_flags, src_reg->flags); if (!valid_flags(instr, n, new_flags)) { /* special case for "normal" mad instructions, we can @@ -287,6 +288,16 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register 
*reg, unsigned n) conflicts(instr->address, reg->instr->address)) return; + /* This seems to be a hw bug, or something where the timings + * just somehow don't work out. This restriction may only + * apply if the first src is also CONST. + */ + if ((instr->category == 3) && (n == 2) && + (src_reg->flags & IR3_REG_RELATIV) && + (src_reg->array.offset == 0)) + return; + + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; @@ -298,6 +309,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if ((src_reg->flags & IR3_REG_RELATIV) && !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; ir3_instr_set_address(instr, reg->instr->address); @@ -330,8 +342,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if (new_flags & IR3_REG_BNOT) iim_val = ~iim_val; - if (!(iim_val & ~0x3ff)) { + /* other than category 1 (mov) we can only encode up to 10 bits: */ + if ((instr->category == 1) || !(iim_val & ~0x3ff)) { new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; src_reg->iim_val = iim_val; instr->regs[n+1] = src_reg; @@ -342,56 +356,68 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) } } -/** - * Given an SSA src (instruction), return the one with extraneous - * mov's removed, ie, for (to copy NIR syntax): - * - * vec1 ssa1 = fadd <something>, <somethingelse> - * vec1 ssa2 = fabs ssa1 - * vec1 ssa3 = fneg ssa1 - * - * then calling instr_cp(ssa3, &flags) would return ssa1 with - * (IR3_REG_ABS | IR3_REG_NEGATE) in flags. If flags is NULL, - * then disallow eliminating copies which would require flag - * propagation (for example, we cannot propagate abs/neg into - * an output). +/* Handle special case of eliminating output mov, and similar cases where + * there isn't a normal "consuming" instruction. In this case we cannot + * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot + * be eliminated) */ static struct ir3_instruction * -instr_cp(struct ir3_instruction *instr, unsigned *flags) +eliminate_output_mov(struct ir3_instruction *instr) { - struct ir3_register *reg; - - if (is_eligible_mov(instr, !!flags)) { + if (is_eligible_mov(instr, false)) { struct ir3_register *reg = instr->regs[1]; - struct ir3_instruction *src_instr = ssa(reg); - if (flags) - combine_flags(flags, reg->flags); - return instr_cp(src_instr, flags); + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } } + return instr; +} + +/** + * Find instruction src's which are mov's that can be collapsed, replacing + * the mov dst with the mov src + */ +static void +instr_cp(struct ir3_instruction *instr) +{ + struct ir3_register *reg; + + if (instr->regs_count == 0) + return; - /* Check termination condition before walking children (rather - * than before checking eligible-mov). A mov instruction may - * appear as ssa-src for multiple other instructions, and we - * want to consider it for removal for each, rather than just - * the first one. (But regardless of how many places it shows - * up as a src, we only need to recursively walk the children - * once.) 
- */ if (ir3_instr_check_mark(instr)) - return instr; + return; /* walk down the graph from each src: */ foreach_src_n(reg, n, instr) { - if (!(reg->flags & IR3_REG_SSA)) + struct ir3_instruction *src = ssa(reg); + + if (!src) + continue; + + instr_cp(src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. + */ + if (reg->flags & IR3_REG_ARRAY) continue; reg_cp(instr, reg, n); } - if (instr->address) - ir3_instr_set_address(instr, instr_cp(instr->address, NULL)); + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(src); + } - return instr; + if (instr->address) { + instr_cp(instr->address); + ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); + } } void @@ -401,19 +427,20 @@ ir3_cp(struct ir3 *ir) for (unsigned i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { - struct ir3_instruction *out = - instr_cp(ir->outputs[i], NULL); - - ir->outputs[i] = out; + instr_cp(ir->outputs[i]); + ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); } } for (unsigned i = 0; i < ir->keeps_count; i++) { - ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + instr_cp(ir->keeps[i]); + ir->keeps[i] = eliminate_output_mov(ir->keeps[i]); } list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - if (block->condition) - block->condition = instr_cp(block->condition, NULL); + if (block->condition) { + instr_cp(block->condition); + block->condition = eliminate_output_mov(block->condition); + } } } diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 97df0c2ac..6d294f1a4 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner, return 6; } else if ((consumer->category == 3) && (is_mad(consumer->opc) || is_madsh(consumer->opc)) && - (n == 2)) { + (n == 3)) { /* special case, 3rd src to cat3 not required on first cycle */ return 1; } else { @@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr) /* visit child to compute it's depth: */ ir3_instr_depth(src); + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + sd = ir3_delayslots(src, instr, i) + src->depth; instr->depth = MAX2(instr->depth, sd); @@ -139,7 +143,7 @@ remove_unused_by_block(struct ir3_block *block) /* mark it, in case it is input, so we can * remove unused inputs: */ - instr->depth = DEPTH_UNUSED; + instr->flags |= IR3_INSTR_UNUSED; /* and remove from instruction list: */ list_delinit(&instr->node); } @@ -175,14 +179,14 @@ ir3_depth(struct ir3 *ir) */ for (i = 0; i < ir->indirects_count; i++) { struct ir3_instruction *instr = ir->indirects[i]; - if (instr->depth == DEPTH_UNUSED) + if (instr->flags & IR3_INSTR_UNUSED) ir->indirects[i] = NULL; } /* cleanup unused inputs: */ for (i = 0; i < ir->ninputs; i++) { struct ir3_instruction *in = ir->inputs[i]; - if (in && (in->depth == DEPTH_UNUSED)) + if (in && (in->flags & IR3_INSTR_UNUSED)) ir->inputs[i] = NULL; } } diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c new file mode 100644 index 000000000..565b9c32c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -0,0 +1,153 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2015 Rob Clark 
<robclark@freedesktop.org> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <robclark@freedesktop.org> + */ + + +#include "freedreno_util.h" + +#include "ir3_nir.h" +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "nir/tgsi_to_nir.h" + +struct nir_shader * +ir3_tgsi_to_nir(const struct tgsi_token *tokens) +{ + static const nir_shader_compiler_options options = { + .lower_fpow = true, + .lower_fsat = true, + .lower_scmp = true, + .lower_flrp = true, + .lower_ffract = true, + .native_integers = true, + }; + return tgsi_to_nir(tokens, &options); +} + +/* for given shader key, are any steps handled in nir? */ +bool +ir3_key_lowers_nir(const struct ir3_shader_key *key) +{ + return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r | + key->vsaturate_s | key->vsaturate_t | key->vsaturate_r | + key->ucp_enables | key->color_two_side; +} + +#define OPT(nir, pass, ...) ({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ +}) + +#define OPT_V(nir, pass, ...) 
NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +struct nir_shader * +ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = 0, + }; + bool progress; + + if (key) { + switch (shader->type) { + case SHADER_FRAGMENT: + case SHADER_COMPUTE: + tex_options.saturate_s = key->fsaturate_s; + tex_options.saturate_t = key->fsaturate_t; + tex_options.saturate_r = key->fsaturate_r; + break; + case SHADER_VERTEX: + tex_options.saturate_s = key->vsaturate_s; + tex_options.saturate_t = key->vsaturate_t; + tex_options.saturate_r = key->vsaturate_r; + break; + } + } + + if (shader->compiler->gpu_id >= 400) { + /* a4xx seems to have *no* sam.p */ + tex_options.lower_txp = ~0; /* lower all txp */ + } else { + /* a3xx just needs to avoid sam.p for 3d tex */ + tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); + } + + if (fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_opt_global_to_local); + OPT_V(s, nir_convert_to_ssa); + + if (key) { + if (s->stage == MESA_SHADER_VERTEX) { + OPT_V(s, nir_lower_clip_vs, key->ucp_enables); + } else if (s->stage == MESA_SHADER_FRAGMENT) { + OPT_V(s, nir_lower_clip_fs, key->ucp_enables); + } + if (key->color_two_side) { + OPT_V(s, nir_lower_two_sided_color); + } + } + + OPT_V(s, nir_lower_tex, &tex_options); + OPT_V(s, nir_lower_idiv); + OPT_V(s, nir_lower_load_const_to_scalar); + + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + OPT_V(s, nir_lower_alu_to_scalar); + OPT_V(s, nir_lower_phis_to_scalar); + + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + progress |= OPT(s, ir3_nir_lower_if_else); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + + } while (progress); + + OPT_V(s, nir_remove_dead_variables); + + if (fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + nir_sweep(s); + + return s; +} diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h index f3d3075e6..e2d885960 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h @@ -29,8 +29,16 @@ #ifndef IR3_NIR_H_ #define IR3_NIR_H_ -#include "glsl/nir/nir.h" +#include "compiler/nir/nir.h" +#include "compiler/shader_enums.h" + +#include "ir3_shader.h" bool ir3_nir_lower_if_else(nir_shader *shader); +struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens); +bool ir3_key_lowers_nir(const struct ir3_shader_key *key); +struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key); + #endif /* IR3_NIR_H_ */ diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c index dc9e4626f..8815ac981 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c @@ -28,7 +28,8 @@ */ #include "ir3_nir.h" -#include "glsl/nir/nir_builder.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_control_flow.h" /* Based on nir_opt_peephole_select, and hacked up to more aggressively * flatten anything that can be 
flattened @@ -171,7 +172,7 @@ flatten_block(nir_builder *bld, nir_block *if_block, nir_block *prev_block, (intr->intrinsic == nir_intrinsic_discard_if)) { nir_ssa_def *discard_cond; - nir_builder_insert_after_instr(bld, + bld->cursor = nir_after_instr( nir_block_last_instr(prev_block)); if (invert) { @@ -293,8 +294,7 @@ lower_if_else_block(nir_block *block, void *void_state) sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; nir_ssa_def_rewrite_uses(&phi->dest.ssa, - nir_src_for_ssa(&sel->dest.dest.ssa), - state->mem_ctx); + nir_src_for_ssa(&sel->dest.dest.ssa)); nir_instr_insert_before(&phi->instr, &sel->instr); nir_instr_remove(&phi->instr); @@ -328,9 +328,9 @@ ir3_nir_lower_if_else(nir_shader *shader) { bool progress = false; - nir_foreach_overload(shader, overload) { - if (overload->impl) - progress |= lower_if_else_impl(overload->impl); + nir_foreach_function(shader, function) { + if (function->impl) + progress |= lower_if_else_impl(function->impl); } return progress; diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c index a84e7989c..ba0c4a57a 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg, bool followssa) +static void print_reg_name(struct ir3_register *reg) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - printf("_"); - if (followssa) { - printf("["); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%d, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. + */ + if (reg->instr) { + printf(", _["); print_instr_name(reg->instr); printf("]"); } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c<a0.x + %u>", reg->num); + printf("c<a0.x + %d>", reg->array.offset); else - printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size); + printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? 
", " : " "); - print_reg_name(reg, !!i); + print_reg_name(reg); } if (instr->address) { @@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->fanin) { - printf(", fanin=_"); - printf("["); - print_instr_name(instr->fanin); - printf("]"); - } - if (instr->cp.left) { printf(", left=_"); printf("["); @@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl) if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); - } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { - printf(", aid=%d", instr->fi.aid); } } diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 88018398e..bcad96e8a 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -68,25 +68,24 @@ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after * register assignment. But for us that is horrible from a scheduling * standpoint. Instead what we do is use idea of 'definer' instruction. - * Ie. the first instruction (lowest ip) to write to the array is the + * Ie. the first instruction (lowest ip) to write to the variable is the * one we consider from use/def perspective when building interference - * graph. (Other instructions which write other array elements just - * define the variable some more.) + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) */ static const unsigned class_sizes[] = { 1, 2, 3, 4, 4 + 4, /* txd + 1d/2d */ 4 + 6, /* txd + 3d */ - /* temporary: until we can assign arrays, create classes so we - * can round up array to fit. NOTE with tgsi arrays should - * really all be multiples of four: - */ - 4 * 4, - 4 * 8, - 4 * 16, - 4 * 32, - }; #define class_count ARRAY_SIZE(class_sizes) @@ -265,13 +264,21 @@ struct ir3_ra_ctx { struct ir3_ra_reg_set *set; struct ra_graph *g; unsigned alloc_count; - unsigned class_alloc_count[total_class_count]; - unsigned class_base[total_class_count]; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ struct ir3_ra_instr_data *instrd; }; +/* does it conflict? 
*/ +static inline bool +intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) +{ + return !((a_start >= b_end) || (b_start >= a_end)); +} + static bool is_half(struct ir3_instruction *instr) { @@ -314,6 +321,14 @@ writes_gpr(struct ir3_instruction *instr) return is_temp(instr->regs[0]); } +static bool +instr_before(struct ir3_instruction *a, struct ir3_instruction *b) +{ + if (a->flags & IR3_INSTR_UNUSED) + return false; + return (a->ip < b->ip); +} + static struct ir3_instruction * get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, int *sz, int *off) @@ -321,9 +336,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; - if (instr->fanin) - return get_definer(ctx, instr->fanin, sz, off); - if (id->defn) { *sz = id->sz; *off = id->off; @@ -348,7 +360,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, dd = get_definer(ctx, src->instr, &dsz, &doff); - if ((!d) || (dd->ip < d->ip)) { + if ((!d) || instr_before(dd, d)) { d = dd; *sz = dsz; *off = doff - n; @@ -369,9 +381,14 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, */ int cnt = 0; - d = f; + /* need to skip over unused in the group: */ + while (f && (f->flags & IR3_INSTR_UNUSED)) { + f = f->cp.right; + cnt++; + } + while (f) { - if (f->ip < d->ip) + if ((!d) || instr_before(f, d)) d = f; if (f == instr) *off = cnt; @@ -414,7 +431,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, *sz = MAX2(*sz, dsz); *off = doff; - if (dd->ip < d->ip) { + if (instr_before(dd, d)) { d = dd; } } @@ -432,7 +449,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, foreach_src(src, d) { if (!src->instr) continue; - if (src->instr->ip < dd->ip) + if (instr_before(src->instr, dd)) dd = src->instr; } @@ -446,7 +463,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); /* by definition, should come before: */ - debug_assert(dd->ip < d->ip); + debug_assert(instr_before(dd, d)); *sz = MAX2(*sz, dsz); @@ -472,10 +489,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* couple special cases: */ if (writes_addr(instr) || writes_pred(instr)) { id->cls = -1; - continue; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + id->defn = instr; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); } - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = size_to_class(id->sz, is_half(id->defn)); } } @@ -505,8 +525,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: - * - * TODO but we still need to allocate names for them, don't we?? 
*/ if (id->cls >= 0) { instr->name = ctx->class_alloc_count[id->cls]++; @@ -518,7 +536,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { - unsigned n; + unsigned n, base; ir3_clear_mark(ctx->ir); n = ir3_count_instructions(ctx->ir); @@ -537,11 +555,20 @@ ra_init(struct ir3_ra_ctx *ctx) * actual ra name is class_base[cls] + instr->name; */ ctx->class_base[0] = 0; - for (unsigned i = 1; i < total_class_count; i++) { + for (unsigned i = 1; i <= total_class_count; i++) { ctx->class_base[i] = ctx->class_base[i-1] + ctx->class_alloc_count[i-1]; } + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); @@ -549,15 +576,23 @@ ra_init(struct ir3_ra_ctx *ctx) } static unsigned -ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) { unsigned name; debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. */ name = ctx->class_base[cls] + defn->name; debug_assert(name < ctx->alloc_count); return name; } +static int +ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) +{ + /* TODO handle name mapping for arrays */ + return __ra_name(ctx, id->cls, id->defn); +} + static void ra_destroy(struct ir3_ra_ctx *ctx) { @@ -570,6 +605,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_ra_block_data *bd; unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); +#define def(name, instr) \ + do { \ + /* defined on first write: */ \ + if (!ctx->def[name]) \ + ctx->def[name] = instr->ip; \ + ctx->use[name] = instr->ip; \ + BITSET_SET(bd->def, name); \ + } while(0); + +#define use(name, instr) \ + do { \ + ctx->use[name] = MAX2(ctx->use[name], instr->ip); \ + if (!BITSET_TEST(bd->def, name)) \ + BITSET_SET(bd->use, name); \ + } while(0); + bd = rzalloc(ctx->g, struct ir3_ra_block_data); bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); @@ -577,10 +628,11 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); - block->bd = bd; + block->data = bd; list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; + struct ir3_register *reg; if (instr->regs_count == 0) continue; @@ -612,61 +664,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (writes_gpr(instr)) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; - if (id->defn == instr) { - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: - */ - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; - ctx->def[name] = id->defn->ip; - ctx->use[name] = id->defn->ip; + debug_assert(!(dst->flags & IR3_REG_PHI_SRC)); - /* since we 
are in SSA at this point: */ - debug_assert(!BITSET_TEST(bd->use, name)); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); - BITSET_SET(bd->def, name); + /* set the node class now.. in case we don't encounter + * this array dst again. From register_alloc algo's + * perspective, these are all single/scalar regs: + */ + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } - if (is_half(id->defn)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - class_count]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } + + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + def(name, id->defn); + + if (is_half(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[id->cls - class_count]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[id->cls]); + } - /* extend the live range for phi srcs, which may come - * from the bottom of the loop - */ - if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { - struct ir3_instruction *phi = id->defn->regs[0]->instr; - foreach_ssa_src(src, phi) { - /* if src is after phi, then we need to extend - * the liverange to the end of src's block: - */ - if (src->ip > phi->ip) { - struct ir3_instruction *last = + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = id->defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = list_last_entry(&src->block->instr_list, - struct ir3_instruction, node); - ctx->use[name] = MAX2(ctx->use[name], last->ip); - } + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); } } } } } - foreach_ssa_src(src, instr) { - if (writes_gpr(src)) { - struct ir3_ra_instr_data *id = &ctx->instrd[src->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + /* indirect read is treated like a read fromall array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + debug_assert(reg->array.offset < arr->length); } + } else if ((src = ssa(reg)) && writes_gpr(src)) { + unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); + use(name, instr); } } } @@ -679,7 +771,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx 
*ctx) bool progress = false; list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { - struct ir3_ra_block_data *bd = block->bd; + struct ir3_ra_block_data *bd = block->data; /* update livein: */ for (unsigned i = 0; i < bitset_words; i++) { @@ -700,7 +792,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) if (!succ) continue; - succ_bd = succ->bd; + succ_bd = succ->data; for (unsigned i = 0; i < bitset_words; i++) { BITSET_WORD new_liveout = @@ -722,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx) { struct ir3 *ir = ctx->ir; + /* initialize array live ranges: */ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + arr->start_ip = ~0; + arr->end_ip = 0; + } + /* compute live ranges (use/def) on a block level, also updating * block's def/use bitmasks (used below to calculate per-block * livein/liveout): @@ -736,7 +834,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* extend start/end ranges based on livein/liveout info from cfg: */ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - struct ir3_ra_block_data *bd = block->bd; + struct ir3_ra_block_data *bd = block->data; for (unsigned i = 0; i < bitset_words; i++) { if (BITSET_TEST(bd->livein, i)) { @@ -754,18 +852,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { struct ir3_instruction *instr = ir->outputs[i]; - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = ctx->instr_cnt; - } + unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); + ctx->use[name] = ctx->instr_cnt; } for (unsigned i = 0; i < ctx->alloc_count; i++) { for (unsigned j = 0; j < ctx->alloc_count; j++) { - if (!((ctx->def[i] >= ctx->use[j]) || - (ctx->def[j] >= ctx->use[i]))) { + if (intersects(ctx->def[i], ctx->use[i], + ctx->def[j], ctx->use[j])) { ra_add_node_interference(ctx->g, i, j); } } @@ -823,19 +917,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) } } +/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_ra_instr_data *id; + + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + } - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { + unsigned name = ra_name(ctx, id); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; - if (reg->flags & IR3_REG_RELATIV) - num += reg->offset; + debug_assert(!(reg->flags & IR3_REG_RELATIV)); reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); @@ -862,9 +973,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_src_n(reg, n, instr) { struct ir3_instruction *src = reg->instr; - if (!src) + /* Note: reg->instr could be null for 
IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) continue; - reg_assign(ctx, instr->regs[n+1], src); if (instr->regs[n+1]->flags & IR3_REG_HALF) fixup_half_instr_src(instr); @@ -875,6 +986,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static int ra_alloc(struct ir3_ra_ctx *ctx) { + unsigned n = 0; + /* frag shader inputs get pre-assigned, since we have some * constraints/unknowns about setup for some of these regs: */ @@ -884,7 +997,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { struct ir3_instruction *instr = ir->inputs[i]; int cls = size_to_class(1, true); - unsigned name = ra_name(ctx, cls, instr); + unsigned name = __ra_name(ctx, cls, instr); unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; /* if we have frag_face, it gets hr0.x */ @@ -892,7 +1005,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) i += 4; } - for (j = 0; i < ir->ninputs; i++) { + j = 0; + for (; i < ir->ninputs; i++) { struct ir3_instruction *instr = ir->inputs[i]; if (instr) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; @@ -900,7 +1014,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (id->defn == instr) { unsigned name, reg; - name = ra_name(ctx, id->cls, id->defn); + name = ra_name(ctx, id); reg = ctx->set->gpr_to_ra_reg[id->cls][j]; ra_set_node_reg(ctx->g, name, reg); @@ -908,6 +1022,46 @@ ra_alloc(struct ir3_ra_ctx *ctx) } } } + n = j; + } + + /* pre-assign array elements: + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned base = n; + + if (arr->end_ip == 0) + continue; + + /* figure out what else we conflict with which has already + * been assigned: + */ +retry: + list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { + if (arr2 == arr) + break; + if (arr2->end_ip == 0) + continue; + /* if it intersects with liverange AND register range.. */ + if (intersects(arr->start_ip, arr->end_ip, + arr2->start_ip, arr2->end_ip) && + intersects(base, base + arr->length, + arr2->reg, arr2->reg + arr2->length)) { + base = MAX2(base, arr2->reg + arr2->length); + goto retry; + } + } + + arr->reg = base; + + for (unsigned i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][base++]; + + ra_set_node_reg(ctx->g, name, reg); + } } if (!ra_allocate(ctx->g)) diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 2ee325518..8f640febc 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -34,11 +34,12 @@ /* * Instruction Scheduling: * - * A priority-queue based scheduling algo. Add eligible instructions, - * ie. ones with all their dependencies scheduled, to the priority - * (depth) sorted queue (list). Pop highest priority instruction off - * the queue and schedule it, add newly eligible instructions to the - * priority queue, rinse, repeat. + * A recursive depth based scheduling algo. Recursively find an eligible + * instruction to schedule from the deepest instruction (recursing through + * it's unscheduled src instructions). Normally this would result in a + * lot of re-traversal of the same instructions, so we cache results in + * instr->data (and clear cached results that would be no longer valid + * after scheduling an instruction). * * There are a few special cases that need to be handled, since sched * is currently independent of register allocation. 
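 * (see ctx->addr/ctx->pred below: the scheduler allows only one
 * outstanding producer of each at a time.)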
Usages of address @@ -52,6 +53,7 @@ struct ir3_sched_ctx { struct ir3_block *block; /* the current block */ + struct list_head depth_list; /* depth sorted unscheduled instrs */ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ struct ir3_instruction *addr; /* current a0.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ @@ -63,6 +65,17 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr) return is_sfu(instr) || is_mem(instr); } +#define NULL_INSTR ((void *)~0) + +static void +clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { + if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) + instr2->data = NULL; + } +} + static void schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { @@ -93,6 +106,34 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_addtail(&instr->node, &instr->block->instr_list); ctx->scheduled = instr; + + if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { + clear_cache(ctx, NULL); + } else { + /* invalidate only the necessary entries.. */ + clear_cache(ctx, instr); + } +} + +static struct ir3_instruction * +deepest(struct ir3_instruction **srcs, unsigned nsrcs) +{ + struct ir3_instruction *d = NULL; + unsigned i = 0, id = 0; + + while ((i < nsrcs) && !(d = srcs[id = i])) + i++; + + if (!d) + return NULL; + + for (; i < nsrcs; i++) + if (srcs[i] && (srcs[i]->depth > d->depth)) + d = srcs[id = i]; + + srcs[id] = NULL; + + return d; } static unsigned @@ -146,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) foreach_ssa_src_n(src, i, instr) { unsigned d; + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; if (src->block != instr->block) continue; d = delay_calc_srcn(ctx, src, instr, i); @@ -171,10 +215,51 @@ static bool is_scheduled(struct ir3_instruction *instr) return !!(instr->flags & IR3_INSTR_MARK); } +/* could an instruction be scheduled if specified ssa src was scheduled? */ static bool -check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + +/* Check if instruction is ok to schedule. Make sure it is not blocked + * by use of addr/predicate register, etc. + */ +static bool +check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, struct ir3_instruction *instr) { + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. 
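+	 * (without this check we could commit to an a0.x writer whose users
+	 * are all blocked on other unscheduled srcs, and then stall with the
+	 * address register held but nothing left to schedule)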
+ */ + if (writes_addr(instr)) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + return false; + } + /* if this is a write to address/predicate register, and that * register is currently in use, we need to defer until it is * free: @@ -182,52 +267,15 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, if (writes_addr(instr) && ctx->addr) { debug_assert(ctx->addr != instr); notes->addr_conflict = true; - return true; + return false; } if (writes_pred(instr) && ctx->pred) { debug_assert(ctx->pred != instr); notes->pred_conflict = true; - return true; + return false; } - return false; -} - -/* is this instruction ready to be scheduled? Return negative for not - * ready (updating notes if needed), or >= 0 to indicate number of - * delay slots needed. - */ -static int -instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - unsigned delay = 0; - - /* Phi instructions can have a dependency on something not - * scheduled yet (for ex, loops). But OTOH we don't really - * care. By definition phi's should appear at the top of - * the block, and it's sources should be values from the - * previously executing block, so they are always ready to - * be scheduled: - */ - if (is_meta(instr) && (instr->opc == OPC_META_PHI)) - return 0; - - foreach_ssa_src(src, instr) { - /* if dependency not scheduled, we aren't ready yet: */ - if (!is_scheduled(src)) - return -1; - } - - /* all our dependents are scheduled, figure out if - * we have enough delay slots to schedule ourself: - */ - delay = delay_calc(ctx, instr); - if (delay) - return delay; - /* if the instruction is a kill, we need to ensure *every* * bary.f is scheduled. The hw seems unhappy if the thread * gets killed before the end-input (ei) flag is hit. @@ -246,80 +294,109 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, for (unsigned i = 0; i < ir->baryfs_count; i++) { struct ir3_instruction *baryf = ir->baryfs[i]; - if (baryf->depth == DEPTH_UNUSED) + if (baryf->flags & IR3_INSTR_UNUSED) continue; if (!is_scheduled(baryf)) { notes->blocked_kill = true; - return -1; + return false; } } } - if (check_conflict(ctx, notes, instr)) - return -1; - - return 0; + return true; } -/* could an instruction be scheduled if specified ssa src was scheduled? */ -static bool -could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +/* Find the best instruction to schedule from specified instruction or + * recursively it's ssa sources. + */ +static struct ir3_instruction * +find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) { - struct ir3_instruction *other_src; - foreach_ssa_src(other_src, instr) { - /* if dependency not scheduled, we aren't ready yet: */ - if ((src != other_src) && !is_scheduled(other_src)) { - return false; + struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; + struct ir3_instruction *src; + unsigned nsrcs = 0; + + if (is_scheduled(instr)) + return NULL; + + /* use instr->data to cache the results of recursing up the + * instr src's. Otherwise the recursive algo can scale quite + * badly w/ shader size. 
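+	 * A summary of the cache values used by the logic below:
+	 *
+	 *    NULL       - not visited yet
+	 *    NULL_INSTR - visited, nothing schedulable beneath this instr
+	 *    <instr>    - cached best candidate found beneath this instr
+	 *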
But this takes some care to clear + * the cache appropriately when instructions are scheduled. + */ + if (instr->data) { + if (instr->data == NULL_INSTR) + return NULL; + return instr->data; + } + + /* find unscheduled srcs: */ + foreach_ssa_src(src, instr) { + if (!is_scheduled(src)) { + debug_assert(nsrcs < ARRAY_SIZE(srcs)); + srcs[nsrcs++] = src; } } - return true; + + /* if all our src's are already scheduled: */ + if (nsrcs == 0) { + if (check_instr(ctx, notes, instr)) { + instr->data = instr; + return instr; + } + return NULL; + } + + while ((src = deepest(srcs, nsrcs))) { + struct ir3_instruction *candidate; + + candidate = find_instr_recursive(ctx, notes, src); + if (!candidate) + continue; + + if (check_instr(ctx, notes, candidate)) { + instr->data = candidate; + return candidate; + } + } + + instr->data = NULL_INSTR; + return NULL; } -/* move eligible instructions to the priority list: */ -static unsigned -add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct list_head *prio_queue, struct list_head *unscheduled_list) +/* find instruction to schedule: */ +static struct ir3_instruction * +find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) { + struct ir3_instruction *best_instr = NULL; unsigned min_delay = ~0; - list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) { - int e = instr_eligibility(ctx, notes, instr); - if (e < 0) - continue; + /* TODO we'd really rather use the list/array of block outputs. But we + * don't have such a thing. Recursing *every* instruction in the list + * will result in a lot of repeated traversal, since instructions will + * get traversed both when they appear as ssa src to a later instruction + * as well as where they appear in the depth_list. + */ + list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { + struct ir3_instruction *candidate; + unsigned delay; - /* For instructions that write address register we need to - * make sure there is at least one instruction that uses the - * addr value which is otherwise ready. - * - * TODO if any instructions use pred register and have other - * src args, we would need to do the same for writes_pred().. 
- */ - if (unlikely(writes_addr(instr))) { - struct ir3 *ir = instr->block->shader; - bool ready = false; - for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; - if (!indirect) - continue; - if (indirect->address != instr) - continue; - ready = could_sched(indirect, instr); - } + candidate = find_instr_recursive(ctx, notes, instr); + if (!candidate) + continue; - /* nothing could be scheduled, so keep looking: */ - if (!ready) - continue; + delay = delay_calc(ctx, candidate); + if (delay < min_delay) { + best_instr = candidate; + min_delay = delay; } - min_delay = MIN2(min_delay, e); - if (e == 0) { - /* remove from unscheduled list and into priority queue: */ - list_delinit(&instr->node); - ir3_insert_by_depth(instr, prio_queue); - } + if (min_delay == 0) + break; } - return min_delay; + return best_instr; } /* "spill" the address register by remapping any unscheduled @@ -413,50 +490,55 @@ split_pred(struct ir3_sched_ctx *ctx) static void sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) { - struct list_head unscheduled_list, prio_queue; + struct list_head unscheduled_list; ctx->block = block; + /* addr/pred writes are per-block: */ + ctx->addr = NULL; + ctx->pred = NULL; + /* move all instructions to the unscheduled list, and * empty the block's instruction list (to which we will - * be inserting. + * be inserting). */ list_replace(&block->instr_list, &unscheduled_list); list_inithead(&block->instr_list); - list_inithead(&prio_queue); + list_inithead(&ctx->depth_list); /* first a pre-pass to schedule all meta:input/phi instructions * (which need to appear first so that RA knows the register is - * occupied: + * occupied), and move remaining to depth sorted list: */ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) || - (instr->opc == OPC_META_PHI))) + (instr->opc == OPC_META_PHI))) { schedule(ctx, instr); + } else { + ir3_insert_by_depth(instr, &ctx->depth_list); + } } - while (!(list_empty(&unscheduled_list) && - list_empty(&prio_queue))) { + while (!list_empty(&ctx->depth_list)) { struct ir3_sched_notes notes = {0}; - unsigned delay; + struct ir3_instruction *instr; + + instr = find_eligible_instr(ctx, ¬es); - delay = add_eligible_instrs(ctx, ¬es, &prio_queue, &unscheduled_list); + if (instr) { + unsigned delay = delay_calc(ctx, instr); - if (!list_empty(&prio_queue)) { - struct ir3_instruction *instr = list_last_entry(&prio_queue, - struct ir3_instruction, node); - /* ugg, this is a bit ugly, but between the time when - * the instruction became eligible and now, a new - * conflict may have arose.. + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: */ - if (check_conflict(ctx, ¬es, instr)) { - list_del(&instr->node); - list_addtail(&instr->node, &unscheduled_list); - continue; + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; } schedule(ctx, instr); - } else if (delay == ~0) { + } else { struct ir3_instruction *new_instr = NULL; /* nothing available to schedule.. if we are blocked on @@ -475,23 +557,17 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } if (new_instr) { - list_del(&new_instr->node); - list_addtail(&new_instr->node, &unscheduled_list); + /* clearing current addr/pred can change what is + * available to schedule, so clear cache.. 
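+			 * (a NULL argument wipes every cached entry, not just the
+			 * entries pointing at one particular instruction)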
+ */ + clear_cache(ctx, NULL); + + ir3_insert_by_depth(new_instr, &ctx->depth_list); /* the original instr that wrote addr/pred may have * originated from a different block: */ new_instr->block = block; } - - } else { - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: - */ - debug_assert(delay <= 6); - while (delay > 0) { - ir3_NOP(block); - delay--; - } } } diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 312174c0c..7d17f426a 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -39,7 +39,7 @@ #include "ir3_shader.h" #include "ir3_compiler.h" - +#include "ir3_nir.h" static void delete_variant(struct ir3_shader_variant *v) @@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key) v->key = key; v->type = shader->type; - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type, - key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(shader->tokens, 0); - } - ret = ir3_compile_shader_nir(shader->compiler, v); if (ret) { debug_error("compile failed!"); @@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader) v = v->next; delete_variant(t); } - free((void *)shader->tokens); + ralloc_free(shader->nir); free(shader); } @@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx, shader->id = ++shader->compiler->shader_count; shader->pctx = pctx; shader->type = type; - shader->tokens = tgsi_dup_tokens(cso->tokens); + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump tgsi: type=%d", shader->type); + tgsi_dump(cso->tokens, 0); + } + nir_shader *nir = ir3_tgsi_to_nir(cso->tokens); + /* do first pass optimization, ignoring the key: */ + shader->nir = ir3_optimize_nir(shader, nir, NULL); + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump nir%d: type=%d", shader->id, shader->type); + nir_print_shader(shader->nir, stdout); + } shader->stream_output = cso->stream_output; if (fd_mesa_debug & FD_DBG_SHADERDB) { /* if shader-db run, create a standard variant immediately * (as otherwise nothing will trigger the shader to be * actually compiled) */ - static struct ir3_shader_key key = {}; + static struct ir3_shader_key key = {0}; ir3_shader_variant(shader, key); } return shader; @@ -300,11 +304,11 @@ static void dump_reg(const char *name, uint32_t r) debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); } -static void dump_semantic(struct ir3_shader_variant *so, - unsigned sem, const char *name) +static void dump_output(struct ir3_shader_variant *so, + unsigned slot, const char *name) { uint32_t regid; - regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0)); + regid = ir3_find_output_regid(so, slot); dump_reg(name, regid); } @@ -355,27 +359,51 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) disasm_a3xx(bin, so->info.sizedwords, 0, so->type); - debug_printf("; %s: outputs:", type); - for (i = 0; i < so->outputs_count; i++) { - uint8_t regid = so->outputs[i].regid; - ir3_semantic sem = so->outputs[i].semantic; - debug_printf(" r%d.%c (%u:%u)", - (regid >> 2), "xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem)); - } - debug_printf("\n"); - debug_printf("; %s: inputs:", type); - for (i = 0; i < so->inputs_count; i++) { - uint8_t regid = so->inputs[i].regid; - ir3_semantic sem = so->inputs[i].semantic; - debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)", - (regid >> 2), 
"xyzw"[regid & 0x3], - sem2name(sem), sem2idx(sem), - so->inputs[i].compmask, - so->inputs[i].inloc, - so->inputs[i].bary); + switch (so->type) { + case SHADER_VERTEX: + debug_printf("; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + debug_printf(" r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->outputs[i].slot)); + } + debug_printf("\n"); + debug_printf("; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + debug_printf(" r%d.%c (cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + debug_printf("\n"); + break; + case SHADER_FRAGMENT: + debug_printf("; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + debug_printf(" r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_frag_result_name(so->outputs[i].slot)); + } + debug_printf("\n"); + debug_printf("; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + debug_printf(" r%d.%c (%s,cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->inputs[i].slot), + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + debug_printf("\n"); + break; + case SHADER_COMPUTE: + break; } - debug_printf("\n"); /* print generic shader info: */ debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n", @@ -391,13 +419,24 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) /* print shader type specific info: */ switch (so->type) { case SHADER_VERTEX: - dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos"); - dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize"); + dump_output(so, VARYING_SLOT_POS, "pos"); + dump_output(so, VARYING_SLOT_PSIZ, "psize"); break; case SHADER_FRAGMENT: dump_reg("pos (bary)", so->pos_regid); - dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz"); - dump_semantic(so, TGSI_SEMANTIC_COLOR, "color"); + dump_output(so, FRAG_RESULT_DEPTH, "posz"); + if (so->color0_mrt) { + dump_output(so, FRAG_RESULT_COLOR, "color"); + } else { + dump_output(so, FRAG_RESULT_DATA0, "data0"); + dump_output(so, FRAG_RESULT_DATA1, "data1"); + dump_output(so, FRAG_RESULT_DATA2, "data2"); + dump_output(so, FRAG_RESULT_DATA3, "data3"); + dump_output(so, FRAG_RESULT_DATA4, "data4"); + dump_output(so, FRAG_RESULT_DATA5, "data5"); + dump_output(so, FRAG_RESULT_DATA6, "data6"); + dump_output(so, FRAG_RESULT_DATA7, "data7"); + } /* these two are hard-coded since we don't know how to * program them to anything but all 0's... 
*/ @@ -466,7 +505,7 @@ static void emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - uint32_t offset = v->first_driver_param; /* UBOs after user consts */ + uint32_t offset = v->first_driver_param + IR3_UBOS_OFF; if (v->constlen > offset) { struct fd_context *ctx = fd_context(v->shader->pctx); uint32_t params = MIN2(4, v->constlen - offset) * 4; @@ -519,7 +558,8 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) static void emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) { - uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/ + /* streamout addresses after driver-params: */ + uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF; if (v->constlen > offset) { struct fd_context *ctx = fd_context(v->shader->pctx); struct fd_streamout_stateobj *so = &ctx->streamout; @@ -622,17 +662,33 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, /* emit driver params every time: */ /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */ if (info && (v->type == SHADER_VERTEX)) { - uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */ + uint32_t offset = v->first_driver_param + IR3_DRIVER_PARAM_OFF; if (v->constlen >= offset) { - uint32_t vertex_params[4] = { + uint32_t vertex_params[IR3_DP_COUNT] = { [IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start, [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), }; + /* if no user-clip-planes, we don't need to emit the + * entire thing: + */ + uint32_t vertex_params_size = 4; + + if (v->key.ucp_enables) { + struct pipe_clip_state *ucp = &ctx->ucp; + unsigned pos = IR3_DP_UCP0_X; + for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) { + for (unsigned j = 0; j < 4; j++) { + vertex_params[pos] = fui(ucp->ucp[i][j]); + pos++; + } + } + vertex_params_size = ARRAY_SIZE(vertex_params); + } fd_wfi(ctx, ring); ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0, - ARRAY_SIZE(vertex_params), vertex_params, NULL); + vertex_params_size, vertex_params, NULL); /* if needed, emit stream-out buffer addresses: */ if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 1bbbdbd22..03d4fa2e9 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -30,6 +30,7 @@ #define IR3_SHADER_H_ #include "pipe/p_state.h" +#include "compiler/shader_enums.h" #include "ir3.h" #include "disasm.h" @@ -38,29 +39,28 @@ enum ir3_driver_param { IR3_DP_VTXID_BASE = 0, IR3_DP_VTXCNT_MAX = 1, + /* user-clip-plane components, up to 8x vec4's: */ + IR3_DP_UCP0_X = 4, + /* .... */ + IR3_DP_UCP7_W = 35, + IR3_DP_COUNT = 36 /* must be aligned to vec4 */ }; -/* internal semantic used for passing vtxcnt to vertex shader to - * implement transform feedback: +/* Layout of constant registers: + * + * num_uniform * vec4 - user consts + * 4 * vec4 - UBO addresses + * if (vertex shader) { + * N * vec4 - driver params (IR3_DP_*) + * 1 * vec4 - stream-out addresses + * } + * + * TODO this could be made more dynamic, to at least skip sections + * that we don't need.. 
*/ -#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0) - -typedef uint16_t ir3_semantic; /* semantic name + index */ -static inline ir3_semantic -ir3_semantic_name(uint8_t name, uint16_t index) -{ - return (name << 8) | (index & 0xff); -} - -static inline uint8_t sem2name(ir3_semantic sem) -{ - return sem >> 8; -} - -static inline uint16_t sem2idx(ir3_semantic sem) -{ - return sem & 0xff; -} +#define IR3_UBOS_OFF 0 /* UBOs after user consts */ +#define IR3_DRIVER_PARAM_OFF 4 /* driver params after UBOs */ +#define IR3_TFBOS_OFF (IR3_DRIVER_PARAM_OFF + IR3_DP_COUNT/4) /* Configuration key used to identify a shader variant.. different * shader variants can be used to implement features not supported @@ -69,6 +69,11 @@ static inline uint16_t sem2idx(ir3_semantic sem) struct ir3_shader_key { union { struct { + /* + * Combined Vertex/Fragment shader parameters: + */ + unsigned ucp_enables : 8; + /* do we need to check {v,f}saturate_{s,t,r}? */ unsigned has_per_samp : 1; @@ -82,8 +87,8 @@ struct ir3_shader_key { */ unsigned color_two_side : 1; unsigned half_precision : 1; - /* used when shader needs to handle flat varyings (a4xx), - * for TGSI_INTERPOLATE_COLOR: + /* used when shader needs to handle flat varyings (a4xx) + * for front/back color inputs to frag shader: */ unsigned rasterflat : 1; }; @@ -147,18 +152,26 @@ struct ir3_shader_variant { uint8_t pos_regid; bool frag_coord, frag_face, color0_mrt; + /* NOTE: for input/outputs, slot is: + * gl_vert_attrib - for VS inputs + * gl_varying_slot - for VS output / FS input + * gl_frag_result - for FS output + */ + /* varyings/outputs: */ unsigned outputs_count; struct { - ir3_semantic semantic; + uint8_t slot; uint8_t regid; } outputs[16 + 2]; /* +POSITION +PSIZE */ bool writes_pos, writes_psize; - /* vertices/inputs: */ + /* attributes (VS) / varyings (FS): + * Note that sysval's should come *after* normal inputs. + */ unsigned inputs_count; struct { - ir3_semantic semantic; + uint8_t slot; uint8_t regid; uint8_t compmask; uint8_t ncomp; @@ -174,11 +187,23 @@ struct ir3_shader_variant { * spots where inloc is used. */ uint8_t inloc; - uint8_t bary; - uint8_t interpolate; + /* vertex shader specific: */ + bool sysval : 1; /* slot is a gl_system_value */ + /* fragment shader specific: */ + bool bary : 1; /* fetched varying (vs one loaded into reg) */ + bool rasterflat : 1; /* special handling for emit->rasterflat */ + enum glsl_interp_qualifier interpolate; } inputs[16 + 2]; /* +POSITION +FACE */ - unsigned total_in; /* sum of inputs (scalar) */ + /* sum of input components (scalar). For frag shaders, it only counts + * the varying inputs: + */ + unsigned total_in; + + /* For frag shaders, the total number of inputs (not scalar, + * ie. 
SP_VS_PARAM_REG.TOTALVSOUTVAR)
+	 */
+	unsigned varying_in;
 
 	/* do we have one or more texture sample instructions: */
 	bool has_samp;
@@ -205,6 +230,8 @@ struct ir3_shader_variant {
 	struct ir3_shader *shader;
 };
 
+typedef struct nir_shader nir_shader;
+
 struct ir3_shader {
 	enum shader_t type;
 
@@ -214,8 +241,8 @@ struct ir3_shader {
 
 	struct ir3_compiler *compiler;
 
-	struct pipe_context *pctx;
-	const struct tgsi_token *tokens;
+	struct pipe_context *pctx;	/* TODO replace w/ pipe_screen */
+	nir_shader *nir;
 
 	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
@@ -254,12 +281,12 @@ ir3_shader_stage(struct ir3_shader *shader)
 #include "pipe/p_shader_tokens.h"
 
 static inline int
-ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
+ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
 {
 	int j;
 
 	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].semantic == semantic)
+		if (so->outputs[j].slot == slot)
 			return j;
 
 	/* it seems optional to have an OUT.BCOLOR[n] for each OUT.COLOR[n]
@@ -269,18 +296,20 @@ ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
 	 * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
 	 * an OUT.BCOLOR[n] but no matching OUT.COLOR[n]
 	 */
-	if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) {
-		unsigned idx = sem2idx(semantic);
-		semantic = ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx);
-	} else if (sem2name(semantic) == TGSI_SEMANTIC_COLOR) {
-		unsigned idx = sem2idx(semantic);
-		semantic = ir3_semantic_name(TGSI_SEMANTIC_BCOLOR, idx);
+	if (slot == VARYING_SLOT_BFC0) {
+		slot = VARYING_SLOT_COL0;
+	} else if (slot == VARYING_SLOT_BFC1) {
+		slot = VARYING_SLOT_COL1;
+	} else if (slot == VARYING_SLOT_COL0) {
+		slot = VARYING_SLOT_BFC0;
+	} else if (slot == VARYING_SLOT_COL1) {
+		slot = VARYING_SLOT_BFC1;
 	} else {
 		return 0;
 	}
 
 	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].semantic == semantic)
+		if (so->outputs[j].slot == slot)
 			return j;
 
 	debug_assert(0);
@@ -298,11 +327,11 @@ ir3_next_varying(const struct ir3_shader_variant *so, int i)
}
 
 static inline uint32_t
-ir3_find_output_regid(const struct ir3_shader_variant *so, ir3_semantic semantic)
+ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
 {
 	int j;
 	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].semantic == semantic)
+		if (so->outputs[j].slot == slot)
 			return so->outputs[j].regid;
 	return regid(63, 0);
 }
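
The new scheduler in ir3_sched.c memoizes the result of find_instr_recursive() in instr->data, using the NULL_INSTR sentinel so that NULL can keep meaning "not yet visited" while a failed search is still remembered. A minimal sketch of that caching pattern, with illustrative names (node, search, SEARCH_FAILED are stand-ins, not driver symbols):

#include <stddef.h>

/* sentinel: any non-NULL value that can never alias a real node */
#define SEARCH_FAILED ((struct node *)-1)

struct node {
	struct node *cache;    /* NULL = not visited yet */
	struct node *src[4];   /* illustrative source edges */
	unsigned nsrc;
	int ready;             /* stands in for check_instr() */
};

static struct node *search(struct node *n)
{
	if (n->cache)          /* cache hit, possibly the failure sentinel */
		return (n->cache == SEARCH_FAILED) ? NULL : n->cache;

	if (n->ready) {        /* schedulable as-is */
		n->cache = n;
		return n;
	}

	/* otherwise look for a schedulable instruction among the srcs: */
	for (unsigned i = 0; i < n->nsrc; i++) {
		struct node *c = search(n->src[i]);
		if (c) {
			n->cache = c;
			return c;
		}
	}

	n->cache = SEARCH_FAILED;  /* remember the miss */
	return NULL;
}

As the in-tree comment warns, the cache only stays valid until scheduling changes what is legal, which is why sched_block() calls clear_cache() after spilling the addr/pred register.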
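
find_eligible_instr() then walks the depth-sorted list in reverse, keeping the candidate with the smallest issue delay and stopping early once it sees a delay of zero, since that cannot be beaten. The selection loop reduces to roughly this shape (pick_min_delay and its function-pointer signature are hypothetical simplifications of the in-tree loop):

#include <stddef.h>

struct node;

static struct node *pick_min_delay(struct node **cands, unsigned n,
		unsigned (*delay)(struct node *))
{
	struct node *best = NULL;
	unsigned best_delay = ~0u;

	for (unsigned i = 0; i < n; i++) {
		unsigned d = delay(cands[i]);
		if (d < best_delay) {
			best = cands[i];
			best_delay = d;
		}
		if (best_delay == 0)
			break;   /* can't do better than zero delay */
	}
	return best;
}

The zero-delay early exit keeps the common case cheap despite the repeated traversal that the TODO comment in the patch calls out.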
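
In ir3_shader.h the new IR3_*_OFF macros pin down the constant-file layout that the comment describes; the offsets are in vec4 units relative to first_driver_param, and the emit paths in ir3_shader.c scale by 4 (offset * 4) where scalar component offsets are needed. With IR3_DP_COUNT = 36 the arithmetic works out as below (a hypothetical standalone file, only checking the math):

#include <assert.h>

#define IR3_UBOS_OFF          0   /* UBO addresses right after user consts */
#define IR3_DRIVER_PARAM_OFF  4   /* 4 vec4's worth of UBO addresses */
#define IR3_DP_COUNT          36  /* driver params, must stay vec4 aligned */
#define IR3_TFBOS_OFF  (IR3_DRIVER_PARAM_OFF + IR3_DP_COUNT / 4)

/* 36 scalar driver params are 9 vec4's, so the stream-out (TFBO)
 * addresses start 13 vec4's past first_driver_param:
 */
static_assert(IR3_TFBOS_OFF == 13, "constant layout mismatch");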
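
The user-clip-plane driver params added to ir3_emit_consts() flatten up to eight vec4 planes into vertex_params[IR3_DP_UCP0_X .. IR3_DP_UCP7_W] as raw float bits; when key.ucp_enables is non-zero the full 36-entry block is emitted rather than compacting to just the enabled planes. A self-contained sketch of that flattening (fui_ and pack_ucp are illustrative stand-ins for gallium's fui() and the in-line loop):

#include <stdint.h>
#include <string.h>

/* bit-cast a float to its uint32_t representation, like gallium's fui() */
static uint32_t fui_(float f)
{
	uint32_t u;
	memcpy(&u, &f, sizeof(u));
	return u;
}

/* flatten 8 vec4 clip planes into params[4..35], i.e.
 * IR3_DP_UCP0_X through IR3_DP_UCP7_W:
 */
static void pack_ucp(uint32_t params[36], const float ucp[8][4])
{
	unsigned pos = 4;   /* IR3_DP_UCP0_X */

	for (unsigned i = 0; i < 8; i++)
		for (unsigned j = 0; j < 4; j++)
			params[pos++] = fui_(ucp[i][j]);
}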
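
Finally, ir3_find_output()'s front/back color fallback is symmetric under the new gl_varying_slot naming: a lookup for COLn that misses retries as BFCn, and vice versa, mirroring the comment about matching OUT.COLOR[n] to IN.BCOLOR[n]. The same mapping expressed as a helper (color_fallback is hypothetical; the header open-codes the if/else chain):

#include "compiler/shader_enums.h"   /* gl_varying_slot, VARYING_SLOT_* */

static gl_varying_slot color_fallback(gl_varying_slot slot)
{
	switch (slot) {
	case VARYING_SLOT_BFC0: return VARYING_SLOT_COL0;
	case VARYING_SLOT_BFC1: return VARYING_SLOT_COL1;
	case VARYING_SLOT_COL0: return VARYING_SLOT_BFC0;
	case VARYING_SLOT_COL1: return VARYING_SLOT_BFC1;
	default:                return slot;   /* no fallback for other slots */
	}
}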