5 files changed, 550 insertions, 979 deletions
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h b/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h index f26bb2ffc..6b1e9f7e8 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org> * diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c index af9811864..2b62b3ae2 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c @@ -35,13 +35,19 @@ #define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__) #define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__) +#define REG_MASK 0x3f + +static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr); + static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, uint32_t idx, struct ir2_shader_info *info); -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n); -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); +static void reg_update_stats(struct ir2_register *reg, + struct ir2_shader_info *info, bool dest); +static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n); +static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg); +static uint32_t reg_alu_dst_swiz(struct ir2_register *reg); +static uint32_t reg_alu_src_swiz(struct ir2_register *reg); /* simple allocator to carve allocations out of an up-front allocated heap, * so that we can free everything easily in one shot. @@ -49,7 +55,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); static void * ir2_alloc(struct ir2_shader *shader, int sz) { void *ptr = &shader->heap[shader->heap_idx]; - shader->heap_idx += align(sz, 4) / 4; + shader->heap_idx += align(sz, 4); return ptr; } @@ -68,9 +74,7 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str) struct ir2_shader * ir2_shader_create(void) { DEBUG_MSG(""); - struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader)); - shader->max_reg = -1; - return shader; + return calloc(1, sizeof(struct ir2_shader)); } void ir2_shader_destroy(struct ir2_shader *shader) @@ -79,344 +83,189 @@ void ir2_shader_destroy(struct ir2_shader *shader) free(shader); } -/* check if an instruction is a simple MOV - */ -static struct ir2_instruction * simple_mov(struct ir2_instruction *instr, - bool output) +/* resolve addr/cnt/sequence fields in the individual CF's */ +static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info) { - struct ir2_src_register *src_reg = instr->src_reg; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_register *reg; - unsigned i; - - /* MAXv used for MOV */ - if (instr->instr_type != IR2_ALU_VECTOR || - instr->alu_vector.opc != MAXv) - return NULL; - - /* non identical srcs */ - if (src_reg[0].num != src_reg[1].num) - return NULL; - - /* flags */ - int flags = IR2_REG_NEGATE | IR2_REG_ABS; - if (output) - flags |= IR2_REG_INPUT | IR2_REG_CONST; - if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags)) - return NULL; - - /* clamping */ - if (instr->alu_vector.clamp) - return NULL; - - /* swizzling */ - for (i = 0; i < 4; i++) { - char swiz = (dst_reg->swizzle ? 
dst_reg->swizzle : "xyzw")[i]; - if (swiz == '_') - continue; - - if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] || - swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i]) - return NULL; - } - - if (output) - reg = &instr->shader->reg[src_reg[0].num]; - else - reg = &instr->shader->reg[dst_reg->num]; - - assert(reg->write_idx >= 0); - if (reg->write_idx != reg->write_idx2) - return NULL; - - if (!output) - return instr; - - instr = instr->shader->instr[reg->write_idx]; - return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr; -} + uint32_t addr; + unsigned i; + int j; + + addr = shader->cfs_count / 2; + for (i = 0; i < shader->cfs_count; i++) { + struct ir2_cf *cf = shader->cfs[i]; + if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { + uint32_t sequence = 0; + + if (cf->exec.addr && (cf->exec.addr != addr)) + WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i); + if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count)) + WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i); + + for (j = cf->exec.instrs_count - 1; j >= 0; j--) { + struct ir2_instruction *instr = cf->exec.instrs[j]; + sequence <<= 2; + if (instr->instr_type == IR2_FETCH) + sequence |= 0x1; + if (instr->sync) + sequence |= 0x2; + } -static int src_to_reg(struct ir2_instruction *instr, - struct ir2_src_register *reg) -{ - if (reg->flags & IR2_REG_CONST) - return reg->num; + cf->exec.addr = addr; + cf->exec.cnt = cf->exec.instrs_count; + cf->exec.sequence = sequence; - return instr->shader->reg[reg->num].reg; -} - -static int dst_to_reg(struct ir2_instruction *instr, - struct ir2_dst_register *reg) -{ - if (reg->flags & IR2_REG_EXPORT) - return reg->num; + addr += cf->exec.instrs_count; + } + } - return instr->shader->reg[reg->num].reg; -} + info->sizedwords = 3 * addr; -static bool mask_get(uint32_t *mask, unsigned index) -{ - return !!(mask[index / 32] & 1 << index % 32); + return 0; } -static void mask_set(uint32_t *mask, struct ir2_register *reg, int index) +void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info) { - if (reg) { - unsigned i; - for (i = 0; i < ARRAY_SIZE(reg->regmask); i++) - mask[i] |= reg->regmask[i]; + uint32_t i, j; + uint32_t *ptr, *dwords = NULL; + uint32_t idx = 0; + int ret; + + info->sizedwords = 0; + info->max_reg = -1; + info->max_input_reg = 0; + info->regs_written = 0; + + /* we need an even # of CF's.. insert a NOP if needed */ + if (shader->cfs_count != align(shader->cfs_count, 2)) + ir2_cf_create(shader, NOP); + + /* first pass, resolve sizes and addresses: */ + ret = shader_resolve(shader, info); + if (ret) { + ERROR_MSG("resolve failed: %d", ret); + goto fail; } - if (index >= 0) - mask[index / 32] |= 1 << index % 32; -} -static bool sets_pred(struct ir2_instruction *instr) -{ - return instr->instr_type == IR2_ALU_SCALAR && - instr->alu_scalar.opc >= PRED_SETEs && - instr->alu_scalar.opc <= PRED_SET_RESTOREs; -} + ptr = dwords = calloc(4, info->sizedwords); - - -void* ir2_shader_assemble(struct ir2_shader *shader, - struct ir2_shader_info *info) -{ - /* NOTES - * blob compiler seems to always puts PRED_* instrs in a CF by - * themselves, and wont combine EQ/NE in the same CF - * (not doing this - doesn't seem to make a difference) - * - * TODO: implement scheduling for combining vector+scalar instructions - * -some vector instructions can be replaced by scalar - */ - - /* first step: - * 1. remove "NOP" MOV instructions generated by TGSI for input/output: - * 2. 
track information for register allocation, and to remove - * the dead code when some exports are not needed - * 3. add additional instructions for a20x hw binning if needed - * NOTE: modifies the shader instrs - * this step could be done as instructions are added by compiler instead - */ - - /* mask of exports that must be generated - * used to avoid calculating ps exports with hw binning - */ - uint64_t export = ~0ull; - /* bitmask of variables required for exports defined by "export" */ - uint32_t export_mask[REG_MASK/32+1] = {}; - - unsigned idx, reg_idx; - unsigned max_input = 0; - int export_size = -1; - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx], *prev; - struct ir2_dst_register dst_reg = instr->dst_reg; - - if (dst_reg.flags & IR2_REG_EXPORT) { - if (dst_reg.num < 32) - export_size++; - - if ((prev = simple_mov(instr, true))) { - /* copy instruction but keep dst */ - *instr = *prev; - instr->dst_reg = dst_reg; - } + /* second pass, emit CF program in pairs: */ + for (i = 0; i < shader->cfs_count; i += 2) { + instr_cf_t *cfs = (instr_cf_t *)ptr; + ret = cf_emit(shader->cfs[i], &cfs[0]); + if (ret) { + ERROR_MSG("CF emit failed: %d\n", ret); + goto fail; } - - for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) { - struct ir2_src_register *src_reg = &instr->src_reg[reg_idx]; - struct ir2_register *reg; - int num; - - if (src_reg->flags & IR2_REG_CONST) - continue; - - num = src_reg->num; - reg = &shader->reg[num]; - reg->read_idx = idx; - - if (src_reg->flags & IR2_REG_INPUT) { - max_input = MAX2(max_input, num); - } else { - /* bypass simple mov used to set src_reg */ - assert(reg->write_idx >= 0); - prev = shader->instr[reg->write_idx]; - if (simple_mov(prev, false)) { - *src_reg = prev->src_reg[0]; - /* process same src_reg again */ - reg_idx -= 1; - continue; - } - } - - /* update dependencies */ - uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ? 
- export_mask : shader->reg[dst_reg.num].regmask; - mask_set(mask, reg, num); - if (sets_pred(instr)) - mask_set(export_mask, reg, num); + ret = cf_emit(shader->cfs[i+1], &cfs[1]); + if (ret) { + ERROR_MSG("CF emit failed: %d\n", ret); + goto fail; } + ptr += 3; + assert((ptr - dwords) <= info->sizedwords); } - /* second step: - * emit instructions (with CFs) + RA - */ - instr_cf_t cfs[128], *cf = cfs; - uint32_t alufetch[3*256], *af = alufetch; - - /* RA is done on write, so inputs must be allocated here */ - for (reg_idx = 0; reg_idx <= max_input; reg_idx++) - shader->reg[reg_idx].reg = reg_idx; - info->max_reg = max_input; - - /* CF instr state */ - instr_cf_exec_t exec = { .opc = EXEC }; - instr_cf_alloc_t alloc = { .opc = ALLOC }; - bool need_alloc = 0; - bool pos_export = 0; - - export_size = MAX2(export_size, 0); - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx]; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - unsigned num = dst_reg->num; - struct ir2_register *reg; - - /* a2xx only has 64 registers, so we can use a single 64-bit mask */ - uint64_t regmask = 0ull; - - /* compute the current regmask */ - for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) { - reg = &shader->reg[reg_idx]; - if ((int) idx > reg->write_idx && idx < reg->read_idx) - regmask |= (1ull << reg->reg); - } - - if (dst_reg->flags & IR2_REG_EXPORT) { - /* skip if export is not needed */ - if (!(export & (1ull << num))) - continue; - - /* ALLOC CF: - * want to alloc all < 32 at once - * 32/33 and 62/63 come in pairs - * XXX assuming all 3 types are never interleaved - */ - if (num < 32) { - alloc.size = export_size; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - need_alloc = export_size >= 0; - export_size = -1; - } else if (num == 32 || num == 33) { - alloc.size = 0; - alloc.buffer_select = SQ_MEMORY; - need_alloc = num != 33; - } else { - alloc.size = 0; - alloc.buffer_select = SQ_POSITION; - need_alloc = !pos_export; - pos_export = true; - } - - } else { - /* skip if dst register not needed to compute exports */ - if (!mask_get(export_mask, num)) - continue; - - /* RA on first write */ - reg = &shader->reg[num]; - if (reg->write_idx == idx) { - reg->reg = ffsll(~regmask) - 1; - info->max_reg = MAX2(info->max_reg, reg->reg); + /* third pass, emit ALU/FETCH: */ + for (i = 0; i < shader->cfs_count; i++) { + struct ir2_cf *cf = shader->cfs[i]; + if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { + for (j = 0; j < cf->exec.instrs_count; j++) { + ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info); + if (ret) { + ERROR_MSG("instruction emit failed: %d", ret); + goto fail; + } + ptr += 3; + assert((ptr - dwords) <= info->sizedwords); } } - - if (exec.count == 6 || (exec.count && need_alloc)) { - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; - } - - if (need_alloc) { - *cf++ = *(instr_cf_t*) &alloc; - need_alloc = false; - } - - int ret = instr_emit(instr, af, idx, info); af += 3; - assert(!ret); - - if (instr->instr_type == IR2_FETCH) - exec.serialize |= 0x1 << exec.count * 2; - if (instr->sync) - exec.serialize |= 0x2 << exec.count * 2; - exec.count += 1; } + return dwords; - exec.opc = !export_size ? 
EXEC : EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; +fail: + free(dwords); + return NULL; +} - /* GPU will hang without at least one pixel alloc */ - if (!export_size) { - alloc.size = 0; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - *cf++ = *(instr_cf_t*) &alloc; - exec.opc = EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - } +struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type) +{ + struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf)); + DEBUG_MSG("%d", cf_type); + cf->shader = shader; + cf->cf_type = cf_type; + assert(shader->cfs_count < ARRAY_SIZE(shader->cfs)); + shader->cfs[shader->cfs_count++] = cf; + return cf; +} - unsigned num_cfs = cf - cfs; - /* insert nop to get an even # of CFs */ - if (num_cfs % 2) { - *cf++ = (instr_cf_t) { .opc = NOP }; - num_cfs++; - } +/* + * CF instructions: + */ - /* offset cf addrs */ - for (idx = 0; idx < num_cfs; idx++) { - switch (cfs[idx].opc) { - case EXEC: - case EXEC_END: - cfs[idx].exec.address += num_cfs / 2; +static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr) +{ + memset(instr, 0, sizeof(*instr)); + + instr->opc = cf->cf_type; + + switch (cf->cf_type) { + case NOP: + break; + case EXEC: + case EXEC_END: + assert(cf->exec.addr <= 0x1ff); + assert(cf->exec.cnt <= 0x6); + assert(cf->exec.sequence <= 0xfff); + instr->exec.address = cf->exec.addr; + instr->exec.count = cf->exec.cnt; + instr->exec.serialize = cf->exec.sequence; + break; + case ALLOC: + assert(cf->alloc.size <= 0xf); + instr->alloc.size = cf->alloc.size; + switch (cf->alloc.type) { + case SQ_POSITION: + case SQ_PARAMETER_PIXEL: + instr->alloc.buffer_select = cf->alloc.type; break; default: - break; - /* XXX and any other address using cf that gets implemented */ + ERROR_MSG("invalid alloc type: %d", cf->alloc.type); + return -1; } + break; + case COND_EXEC: + case COND_EXEC_END: + case COND_PRED_EXEC: + case COND_PRED_EXEC_END: + case LOOP_START: + case LOOP_END: + case COND_CALL: + case RETURN: + case COND_JMP: + case COND_EXEC_PRED_CLEAN: + case COND_EXEC_PRED_CLEAN_END: + case MARK_VS_FETCH_DONE: + ERROR_MSG("TODO"); + return -1; } - /* concatenate cfs+alufetchs */ - uint32_t cfdwords = num_cfs / 2 * 3; - uint32_t alufetchdwords = exec.address * 3; - info->sizedwords = cfdwords + alufetchdwords; - uint32_t *dwords = malloc(info->sizedwords * 4); - assert(dwords); - memcpy(dwords, cfs, cfdwords * 4); - memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4); - return dwords; + return 0; } -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type) + +struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type) { struct ir2_instruction *instr = - ir2_alloc(shader, sizeof(struct ir2_instruction)); + ir2_alloc(cf->shader, sizeof(struct ir2_instruction)); DEBUG_MSG("%d", instr_type); - instr->shader = shader; - instr->idx = shader->instr_count; - instr->pred = shader->pred; + instr->shader = cf->shader; + instr->pred = cf->shader->pred; instr->instr_type = instr_type; - shader->instr[shader->instr_count++] = instr; + assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs)); + cf->exec.instrs[cf->exec.instrs_count++] = instr; return instr; } @@ -430,11 +279,15 @@ static int instr_emit_fetch(struct ir2_instruction *instr, struct ir2_shader_info *info) { instr_fetch_t *fetch = (instr_fetch_t *)dwords; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_src_register *src_reg = &instr->src_reg[0]; + int reg = 0; + struct 
ir2_register *dst_reg = instr->regs[reg++]; + struct ir2_register *src_reg = instr->regs[reg++]; memset(fetch, 0, sizeof(*fetch)); + reg_update_stats(dst_reg, info, true); + reg_update_stats(src_reg, info, false); + fetch->opc = instr->fetch.opc; if (instr->fetch.opc == VTX_FETCH) { @@ -445,9 +298,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); assert(instr->fetch.const_idx_sel <= 0x3); - vtx->src_reg = src_to_reg(instr, src_reg); + vtx->src_reg = src_reg->num; vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1); - vtx->dst_reg = dst_to_reg(instr, dst_reg); + vtx->dst_reg = dst_reg->num; vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg); vtx->must_be_one = 1; vtx->const_index = instr->fetch.const_idx; @@ -473,9 +326,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); - tex->src_reg = src_to_reg(instr, src_reg); + tex->src_reg = src_reg->num; tex->src_swiz = reg_fetch_src_swiz(src_reg, 3); - tex->dst_reg = dst_to_reg(instr, dst_reg); + tex->dst_reg = dst_reg->num; tex->dst_swiz = reg_fetch_dst_swiz(dst_reg); tex->const_idx = instr->fetch.const_idx; tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; @@ -488,7 +341,6 @@ static int instr_emit_fetch(struct ir2_instruction *instr, tex->use_comp_lod = 1; tex->use_reg_lod = !instr->fetch.is_cube; tex->sample_location = SAMPLE_CENTER; - tex->tx_coord_denorm = instr->fetch.is_rect; if (instr->pred != IR2_PRED_NONE) { tex->pred_select = 1; @@ -507,62 +359,95 @@ static int instr_emit_fetch(struct ir2_instruction *instr, * ALU instructions: */ -static int instr_emit_alu(struct ir2_instruction *instr_v, - struct ir2_instruction *instr_s, uint32_t *dwords, +static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords, struct ir2_shader_info *info) { + int reg = 0; instr_alu_t *alu = (instr_alu_t *)dwords; - struct ir2_dst_register *vdst_reg, *sdst_reg; - struct ir2_src_register *src1_reg, *src2_reg, *src3_reg; - struct ir2_shader *shader = instr_v ? instr_v->shader : instr_s->shader; - enum ir2_pred pred = IR2_PRED_NONE; + struct ir2_register *dst_reg = instr->regs[reg++]; + struct ir2_register *src1_reg; + struct ir2_register *src2_reg; + struct ir2_register *src3_reg; memset(alu, 0, sizeof(*alu)); - vdst_reg = NULL; - sdst_reg = NULL; - src1_reg = NULL; - src2_reg = NULL; - src3_reg = NULL; - - if (instr_v) { - vdst_reg = &instr_v->dst_reg; - assert(instr_v->src_reg_count >= 2); - src1_reg = &instr_v->src_reg[0]; - src2_reg = &instr_v->src_reg[1]; - if (instr_v->src_reg_count > 2) - src3_reg = &instr_v->src_reg[2]; - pred = instr_v->pred; + /* handle instructions w/ 3 src operands: */ + switch (instr->alu.vector_opc) { + case MULADDv: + case CNDEv: + case CNDGTEv: + case CNDGTv: + case DOT2ADDv: + /* note: disassembler lists 3rd src first, ie: + * MULADDv Rdst = Rsrc3 + (Rsrc1 * Rsrc2) + * which is the reason for this strange ordering. 
+ */ + src3_reg = instr->regs[reg++]; + break; + default: + src3_reg = NULL; + break; } - if (instr_s) { - sdst_reg = &instr_s->dst_reg; - assert(instr_s->src_reg_count == 1); - assert(!instr_v || vdst_reg->flags == sdst_reg->flags); - assert(!instr_v || pred == instr_s->pred); - if (src3_reg) { - assert(src3_reg->flags == instr_s->src_reg[0].flags); - assert(src3_reg->num == instr_s->src_reg[0].num); - assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle)); - } - src3_reg = &instr_s->src_reg[0]; - pred = instr_s->pred; - } + src1_reg = instr->regs[reg++]; + src2_reg = instr->regs[reg++]; - if (vdst_reg) { - assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0); - assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4)); - alu->vector_opc = instr_v->alu_vector.opc; - alu->vector_write_mask = reg_alu_dst_swiz(vdst_reg); - alu->vector_dest = dst_to_reg(instr_v, vdst_reg); - } else { + reg_update_stats(dst_reg, info, true); + reg_update_stats(src1_reg, info, false); + reg_update_stats(src2_reg, info, false); + + assert((dst_reg->flags & ~IR2_REG_EXPORT) == 0); + assert(!dst_reg->swizzle || (strlen(dst_reg->swizzle) == 4)); + assert((src1_reg->flags & IR2_REG_EXPORT) == 0); + assert(!src1_reg->swizzle || (strlen(src1_reg->swizzle) == 4)); + assert((src2_reg->flags & IR2_REG_EXPORT) == 0); + assert(!src2_reg->swizzle || (strlen(src2_reg->swizzle) == 4)); + + if (instr->alu.vector_opc == ~0) { alu->vector_opc = MAXv; + alu->vector_write_mask = 0; + } else { + alu->vector_opc = instr->alu.vector_opc; + alu->vector_write_mask = reg_alu_dst_swiz(dst_reg); } - if (sdst_reg) { - alu->scalar_opc = instr_s->alu_scalar.opc; + alu->vector_dest = dst_reg->num; + alu->export_data = !!(dst_reg->flags & IR2_REG_EXPORT); + + // TODO predicate case/condition.. need to add to parser + + alu->src2_reg = src2_reg->num; + alu->src2_swiz = reg_alu_src_swiz(src2_reg); + alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); + alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); + alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); + + alu->src1_reg = src1_reg->num; + alu->src1_swiz = reg_alu_src_swiz(src1_reg); + alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); + alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); + alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); + + alu->vector_clamp = instr->alu.vector_clamp; + alu->scalar_clamp = instr->alu.scalar_clamp; + + if (instr->alu.scalar_opc != ~0) { + struct ir2_register *sdst_reg = instr->regs[reg++]; + + reg_update_stats(sdst_reg, info, true); + + assert(sdst_reg->flags == dst_reg->flags); + + if (src3_reg) { + assert(src3_reg == instr->regs[reg]); + reg++; + } else { + src3_reg = instr->regs[reg++]; + } + + alu->scalar_dest = sdst_reg->num; alu->scalar_write_mask = reg_alu_dst_swiz(sdst_reg); - alu->scalar_dest = dst_to_reg(instr_s, sdst_reg); + alu->scalar_opc = instr->alu.scalar_opc; } else { /* not sure if this is required, but adreno compiler seems * to always set scalar opc to MAXs if it is not used: @@ -570,58 +455,13 @@ static int instr_emit_alu(struct ir2_instruction *instr_v, alu->scalar_opc = MAXs; } - alu->export_data = - !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT); + if (src3_reg) { + reg_update_stats(src3_reg, info, false); - /* export32 has this bit set.. 
it seems to do more than just set - * the base address of the constants used to zero - * TODO make this less of a hack - */ - if (alu->export_data && alu->vector_dest == 32) { - assert(!instr_s); - alu->relative_addr = 1; - } - - if (src1_reg) { - if (src1_reg->flags & IR2_REG_CONST) { - assert(!(src1_reg->flags & IR2_REG_ABS)); - alu->src1_reg_const = src1_reg->num; - } else { - alu->src1_reg = shader->reg[src1_reg->num].reg; - alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); - } - alu->src1_swiz = reg_alu_src_swiz(src1_reg); - alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); - alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); - } else { - alu->src1_sel = 1; - } - - if (src2_reg) { - if (src2_reg->flags & IR2_REG_CONST) { - assert(!(src2_reg->flags & IR2_REG_ABS)); - alu->src2_reg_const = src2_reg->num; - } else { - alu->src2_reg = shader->reg[src2_reg->num].reg; - alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); - } - alu->src2_swiz = reg_alu_src_swiz(src2_reg); - alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); - alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); - } else { - alu->src2_sel = 1; - } - - if (src3_reg) { - if (src3_reg->flags & IR2_REG_CONST) { - assert(!(src3_reg->flags & IR2_REG_ABS)); - alu->src3_reg_const = src3_reg->num; - } else { - alu->src3_reg = shader->reg[src3_reg->num].reg; - alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); - } + alu->src3_reg = src3_reg->num; alu->src3_swiz = reg_alu_src_swiz(src3_reg); alu->src3_reg_negate = !!(src3_reg->flags & IR2_REG_NEGATE); + alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); alu->src3_sel = !(src3_reg->flags & IR2_REG_CONST); } else { /* not sure if this is required, but adreno compiler seems @@ -630,11 +470,9 @@ static int instr_emit_alu(struct ir2_instruction *instr_v, alu->src3_sel = 1; } - alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0; - alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0; - - if (pred != IR2_PRED_NONE) - alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2; + if (instr->pred != IR2_PRED_NONE) { + alu->pred_select = (instr->pred == IR2_PRED_EQ) ? 
3 : 2; + } return 0; } @@ -644,63 +482,51 @@ static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, { switch (instr->instr_type) { case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info); - case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info); - case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info); + case IR2_ALU: return instr_emit_alu(instr, dwords, info); } return -1; } -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, + +struct ir2_register * ir2_reg_create(struct ir2_instruction *instr, int num, const char *swizzle, int flags) { - if (!(flags & IR2_REG_EXPORT)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; - - if (reg->write_idx < 0) - reg->write_idx = instr->idx; - reg->write_idx2 = instr->idx; - } - - struct ir2_dst_register *reg = &instr->dst_reg; + struct ir2_register *reg = + ir2_alloc(instr->shader, sizeof(struct ir2_register)); + DEBUG_MSG("%x, %d, %s", flags, num, swizzle); + assert(num <= REG_MASK); reg->flags = flags; reg->num = num; reg->swizzle = ir2_strdup(instr->shader, swizzle); + assert(instr->regs_count < ARRAY_SIZE(instr->regs)); + instr->regs[instr->regs_count++] = reg; return reg; } -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags) +static void reg_update_stats(struct ir2_register *reg, + struct ir2_shader_info *info, bool dest) { - assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg)); - if (!(flags & IR2_REG_CONST)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - reg->read_idx = instr->idx; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; + if (!(reg->flags & (IR2_REG_CONST|IR2_REG_EXPORT))) { + info->max_reg = MAX2(info->max_reg, reg->num); + + if (dest) { + info->regs_written |= (1 << reg->num); + } else if (!(info->regs_written & (1 << reg->num))) { + /* for registers that haven't been written, they must be an + * input register that the thread scheduler (presumably?) 
+ * needs to know about: + */ + info->max_input_reg = MAX2(info->max_input_reg, reg->num); + } } - - struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++]; - reg->flags = flags; - reg->num = num; - reg->swizzle = ir2_strdup(instr->shader, swizzle); - return reg; } -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) +static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n) { uint32_t swiz = 0; int i; - assert((reg->flags & ~IR2_REG_INPUT) == 0); + assert(reg->flags == 0); assert(reg->swizzle); DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle); @@ -720,7 +546,7 @@ static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) return swiz; } -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) +static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; @@ -753,7 +579,7 @@ static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) } /* actually, a write-mask */ -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) +static uint32_t reg_alu_dst_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; @@ -780,11 +606,12 @@ static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) return swiz; } -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg) +static uint32_t reg_alu_src_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; + assert((reg->flags & IR2_REG_EXPORT) == 0); assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle); diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h index ac2931266..822e5ec4c 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h @@ -33,25 +33,17 @@ struct ir2_shader; -#define REG_MASK 0xff - struct ir2_shader_info { uint16_t sizedwords; int8_t max_reg; /* highest GPR # used by shader */ + uint8_t max_input_reg; + uint64_t regs_written; }; struct ir2_register { - int16_t write_idx, write_idx2, read_idx, reg; - /* bitmask of variables on which this one depends - * XXX: use bitmask util? 
- */ - uint32_t regmask[REG_MASK/32+1]; -}; - -struct ir2_src_register { enum { - IR2_REG_INPUT = 0x1, - IR2_REG_CONST = 0x2, + IR2_REG_CONST = 0x1, + IR2_REG_EXPORT = 0x2, IR2_REG_NEGATE = 0x4, IR2_REG_ABS = 0x8, } flags; @@ -59,14 +51,6 @@ struct ir2_src_register { char *swizzle; }; -struct ir2_dst_register { - enum { - IR2_REG_EXPORT = 0x1, - } flags; - int num; - char *swizzle; -}; - enum ir2_pred { IR2_PRED_NONE = 0, IR2_PRED_EQ = 1, @@ -75,17 +59,14 @@ enum ir2_pred { struct ir2_instruction { struct ir2_shader *shader; - unsigned idx; enum { IR2_FETCH, - IR2_ALU_VECTOR, - IR2_ALU_SCALAR, + IR2_ALU, } instr_type; enum ir2_pred pred; int sync; - unsigned src_reg_count; - struct ir2_dst_register dst_reg; - struct ir2_src_register src_reg[3]; + unsigned regs_count; + struct ir2_register *regs[5]; union { /* FETCH specific: */ struct { @@ -93,7 +74,6 @@ struct ir2_instruction { unsigned const_idx; /* texture fetch specific: */ bool is_cube : 1; - bool is_rect : 1; /* vertex fetch specific: */ unsigned const_idx_sel; enum a2xx_sq_surfaceformat fmt; @@ -102,25 +82,38 @@ struct ir2_instruction { uint32_t stride; uint32_t offset; } fetch; - /* ALU-Vector specific: */ + /* ALU specific: */ struct { - instr_vector_opc_t opc; - bool clamp; - } alu_vector; - /* ALU-Scalar specific: */ + instr_vector_opc_t vector_opc; + instr_scalar_opc_t scalar_opc; + bool vector_clamp : 1; + bool scalar_clamp : 1; + } alu; + }; +}; + +struct ir2_cf { + struct ir2_shader *shader; + instr_cf_opc_t cf_type; + + union { + /* EXEC/EXEC_END specific: */ + struct { + unsigned instrs_count; + struct ir2_instruction *instrs[6]; + uint32_t addr, cnt, sequence; + } exec; + /* ALLOC specific: */ struct { - instr_scalar_opc_t opc; - bool clamp; - } alu_scalar; + instr_alloc_type_t type; /* SQ_POSITION or SQ_PARAMETER_PIXEL */ + int size; + } alloc; }; }; struct ir2_shader { - unsigned instr_count; - int max_reg; - struct ir2_register reg[REG_MASK+1]; - - struct ir2_instruction *instr[0x200]; + unsigned cfs_count; + struct ir2_cf *cfs[0x56]; uint32_t heap[100 * 4096]; unsigned heap_idx; @@ -132,41 +125,40 @@ void ir2_shader_destroy(struct ir2_shader *shader); void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info); -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type); +struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type); -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags); -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, +struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type); + +struct ir2_register * ir2_reg_create(struct ir2_instruction *instr, int num, const char *swizzle, int flags); /* some helper fxns: */ -static inline struct ir2_instruction * -ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop) +static inline struct ir2_cf * +ir2_cf_create_alloc(struct ir2_shader *shader, instr_alloc_type_t type, int size) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR); - if (!instr) - return instr; - instr->alu_vector.opc = vop; - return instr; + struct ir2_cf *cf = ir2_cf_create(shader, ALLOC); + if (!cf) + return cf; + cf->alloc.type = type; + cf->alloc.size = size; + return cf; } - static inline struct ir2_instruction * -ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop) +ir2_instr_create_alu(struct ir2_cf *cf, instr_vector_opc_t vop, instr_scalar_opc_t sop) 
{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR); + struct ir2_instruction *instr = ir2_instr_create(cf, IR2_ALU); if (!instr) return instr; - instr->alu_scalar.opc = sop; + instr->alu.vector_opc = vop; + instr->alu.scalar_opc = sop; return instr; } - static inline struct ir2_instruction * -ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis, +ir2_instr_create_vtx_fetch(struct ir2_cf *cf, int ci, int cis, enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); + struct ir2_instruction *instr = instr = ir2_instr_create(cf, IR2_FETCH); instr->fetch.opc = VTX_FETCH; instr->fetch.const_idx = ci; instr->fetch.const_idx_sel = cis; @@ -176,9 +168,9 @@ ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis, return instr; } static inline struct ir2_instruction * -ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci) +ir2_instr_create_tex_fetch(struct ir2_cf *cf, int ci) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); + struct ir2_instruction *instr = instr = ir2_instr_create(cf, IR2_FETCH); instr->fetch.opc = TEX_FETCH; instr->fetch.const_idx = ci; return instr; diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c index b6ef6e4b5..07e03d269 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> * @@ -33,13 +35,9 @@ static void print_instr_name(struct ir3_instruction *instr) { - if (!instr) - return; #ifdef DEBUG printf("%04u:", instr->serialno); #endif - printf("%04u:", instr->name); - printf("%04u:", instr->ip); printf("%03u: ", instr->depth); if (instr->flags & IR3_INSTR_SY) @@ -48,15 +46,22 @@ static void print_instr_name(struct ir3_instruction *instr) printf("(ss)"); if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: printf("_meta:in"); break; - case OPC_META_FO: printf("_meta:fo"); break; - case OPC_META_FI: printf("_meta:fi"); break; - - /* shouldn't hit here.. just for debugging: */ - default: printf("_meta:%d", instr->opc); break; + switch(instr->opc) { + case OPC_META_PHI: + printf("Φ"); + break; + default: + /* shouldn't hit here.. 
just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: printf("_meta:in"); break; + case OPC_META_FO: printf("_meta:fo"); break; + case OPC_META_FI: printf("_meta:fi"); break; + + default: printf("_meta:%d", instr->opc); break; + } + break; } - } else if (instr->opc == OPC_MOV) { + } else if (instr->category == 1) { static const char *type[] = { [TYPE_F16] = "f16", [TYPE_F32] = "f32", @@ -89,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg) +static void print_reg_name(struct ir3_register *reg, bool followssa) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -101,29 +106,20 @@ static void print_reg_name(struct ir3_register *reg) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_ARRAY) { - printf("arr[id=%u, offset=%d, size=%u", reg->array.id, - reg->array.offset, reg->size); - /* for ARRAY we could have null src, for example first write - * instruction.. - */ - if (reg->instr) { - printf(", _["); + } else if (reg->flags & IR3_REG_SSA) { + printf("_"); + if (followssa) { + printf("["); print_instr_name(reg->instr); printf("]"); } - printf("]"); - } else if (reg->flags & IR3_REG_SSA) { - printf("_["); - print_instr_name(reg->instr); - printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c<a0.x + %d>", reg->array.offset); + printf("c<a0.x + %u>", reg->num); else - printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); + printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -141,6 +137,16 @@ tab(int lvl) printf("\t"); } +static uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(uint64_t)block; +#endif +} + static void print_instr(struct ir3_instruction *instr, int lvl) { @@ -152,7 +158,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? 
", " : " "); - print_reg_name(reg); + print_reg_name(reg, !!i); } if (instr->address) { @@ -162,6 +168,13 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } + if (instr->fanin) { + printf(", fanin=_"); + printf("["); + print_instr_name(instr->fanin); + printf("]"); + } + if (instr->cp.left) { printf(", left=_"); printf("["); @@ -176,8 +189,12 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->opc == OPC_META_FO) { - printf(", off=%d", instr->fo.off); + if (is_meta(instr)) { + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { + printf(", aid=%d", instr->fi.aid); + } } if (is_flow(instr) && instr->cat0.target) { @@ -188,17 +205,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf(", target=block%u", block_id(instr->cat0.target)); } - if (instr->deps_count) { - printf(", false-deps:"); - for (unsigned i = 0; i < instr->deps_count; i++) { - if (i > 0) - printf(", "); - printf("_["); - print_instr_name(instr->deps[i]); - printf("]"); - } - } - printf("\n"); } @@ -211,28 +217,9 @@ static void print_block(struct ir3_block *block, int lvl) { tab(lvl); printf("block%u {\n", block_id(block)); - - if (block->predecessors_count > 0) { - tab(lvl+1); - printf("pred: "); - for (unsigned i = 0; i < block->predecessors_count; i++) { - if (i) - printf(", "); - printf("block%u", block_id(block->predecessors[i])); - } - printf("\n"); - } - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { print_instr(instr, lvl+1); } - - tab(lvl+1); printf("/* keeps:\n"); - for (unsigned i = 0; i < block->keeps_count; i++) { - print_instr(block->keeps[i], lvl+2); - } - tab(lvl+1); printf(" */\n"); - if (block->successors[1]) { /* leading into if/else: */ tab(lvl+1); diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 6552980d9..2ee325518 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> * @@ -32,12 +34,11 @@ /* * Instruction Scheduling: * - * A recursive depth based scheduling algo. Recursively find an eligible - * instruction to schedule from the deepest instruction (recursing through - * it's unscheduled src instructions). Normally this would result in a - * lot of re-traversal of the same instructions, so we cache results in - * instr->data (and clear cached results that would be no longer valid - * after scheduling an instruction). + * A priority-queue based scheduling algo. Add eligible instructions, + * ie. ones with all their dependencies scheduled, to the priority + * (depth) sorted queue (list). Pop highest priority instruction off + * the queue and schedule it, add newly eligible instructions to the + * priority queue, rinse, repeat. * * There are a few special cases that need to be handled, since sched * is currently independent of register allocation. 
Usages of address @@ -51,7 +52,6 @@ struct ir3_sched_ctx { struct ir3_block *block; /* the current block */ - struct list_head depth_list; /* depth sorted unscheduled instrs */ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ struct ir3_instruction *addr; /* current a0.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ @@ -63,17 +63,6 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr) return is_sfu(instr) || is_mem(instr); } -#define NULL_INSTR ((void *)~0) - -static void -clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { - if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) - instr2->data = NULL; - } -} - static void schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { @@ -104,103 +93,30 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_addtail(&instr->node, &instr->block->instr_list); ctx->scheduled = instr; - - if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { - clear_cache(ctx, NULL); - } else { - /* invalidate only the necessary entries.. */ - clear_cache(ctx, instr); - } -} - -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; - - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; } -/** - * @block: the block to search in, starting from end; in first pass, - * this will be the block the instruction would be inserted into - * (but has not yet, ie. it only contains already scheduled - * instructions). For intra-block scheduling (second pass), this - * would be one of the predecessor blocks. - * @instr: the instruction to search for - * @maxd: max distance, bail after searching this # of instruction - * slots, since it means the instruction we are looking for is - * far enough away - * @pred: if true, recursively search into predecessor blocks to - * find the worst case (shortest) distance (only possible after - * individual blocks are all scheduled - */ static unsigned -distance(struct ir3_block *block, struct ir3_instruction *instr, - unsigned maxd, bool pred) +distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, + unsigned maxd) { + struct list_head *instr_list = &ctx->block->instr_list; unsigned d = 0; - list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) { + list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) { if ((n == instr) || (d >= maxd)) - return d; - /* NOTE: don't count branch/jump since we don't know yet if they will - * be eliminated later in resolve_jumps().. really should do that - * earlier so we don't have this constraint. - */ - if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) + break; + if (is_alu(n) || is_flow(n)) d++; } - /* if coming from a predecessor block, assume it is assigned far - * enough away.. we'll fix up later. 
- */ - if (!pred) - return maxd; - - if (pred && (block->data != block)) { - /* Search into predecessor blocks, finding the one with the - * shortest distance, since that will be the worst case - */ - unsigned min = maxd - d; - - /* (ab)use block->data to prevent recursion: */ - block->data = block; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned n; - - n = distance(block->predecessors[i], instr, min, pred); - - min = MIN2(min, n); - } - - block->data = NULL; - d += min; - } - return d; } /* calculate delay for specified src: */ static unsigned -delay_calc_srcn(struct ir3_block *block, +delay_calc_srcn(struct ir3_sched_ctx *ctx, struct ir3_instruction *assigner, - struct ir3_instruction *consumer, - unsigned srcn, bool soft, bool pred) + struct ir3_instruction *consumer, unsigned srcn) { unsigned delay = 0; @@ -208,20 +124,14 @@ delay_calc_srcn(struct ir3_block *block, struct ir3_instruction *src; foreach_ssa_src(src, assigner) { unsigned d; - d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); + if (src->block != assigner->block) + break; + d = delay_calc_srcn(ctx, src, consumer, srcn); delay = MAX2(delay, d); } } else { - if (soft) { - if (is_sfu(assigner)) { - delay = 4; - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - delay -= distance(block, assigner, delay, pred); + delay = ir3_delayslots(assigner, consumer, srcn); + delay -= distance(ctx, assigner, delay); } return delay; @@ -229,15 +139,16 @@ delay_calc_srcn(struct ir3_block *block, /* calculate delay for instruction (maximum of delay for all srcs): */ static unsigned -delay_calc(struct ir3_block *block, struct ir3_instruction *instr, - bool soft, bool pred) +delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { unsigned delay = 0; struct ir3_instruction *src; foreach_ssa_src_n(src, i, instr) { unsigned d; - d = delay_calc_srcn(block, src, instr, i, soft, pred); + if (src->block != instr->block) + continue; + d = delay_calc_srcn(ctx, src, instr, i); delay = MAX2(delay, d); } @@ -260,51 +171,10 @@ static bool is_scheduled(struct ir3_instruction *instr) return !!(instr->flags & IR3_INSTR_MARK); } -/* could an instruction be scheduled if specified ssa src was scheduled? */ -static bool -could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) -{ - struct ir3_instruction *other_src; - foreach_ssa_src(other_src, instr) { - /* if dependency not scheduled, we aren't ready yet: */ - if ((src != other_src) && !is_scheduled(other_src)) { - return false; - } - } - return true; -} - -/* Check if instruction is ok to schedule. Make sure it is not blocked - * by use of addr/predicate register, etc. - */ static bool -check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, +check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, struct ir3_instruction *instr) { - /* For instructions that write address register we need to - * make sure there is at least one instruction that uses the - * addr value which is otherwise ready. - * - * TODO if any instructions use pred register and have other - * src args, we would need to do the same for writes_pred().. 
- */ - if (writes_addr(instr)) { - struct ir3 *ir = instr->block->shader; - bool ready = false; - for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; - if (!indirect) - continue; - if (indirect->address != instr) - continue; - ready = could_sched(indirect, instr); - } - - /* nothing could be scheduled, so keep looking: */ - if (!ready) - return false; - } - /* if this is a write to address/predicate register, and that * register is currently in use, we need to defer until it is * free: @@ -312,15 +182,52 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, if (writes_addr(instr) && ctx->addr) { debug_assert(ctx->addr != instr); notes->addr_conflict = true; - return false; + return true; } if (writes_pred(instr) && ctx->pred) { debug_assert(ctx->pred != instr); notes->pred_conflict = true; - return false; + return true; + } + + return false; +} + +/* is this instruction ready to be scheduled? Return negative for not + * ready (updating notes if needed), or >= 0 to indicate number of + * delay slots needed. + */ +static int +instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + unsigned delay = 0; + + /* Phi instructions can have a dependency on something not + * scheduled yet (for ex, loops). But OTOH we don't really + * care. By definition phi's should appear at the top of + * the block, and it's sources should be values from the + * previously executing block, so they are always ready to + * be scheduled: + */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + return 0; + + foreach_ssa_src(src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if (!is_scheduled(src)) + return -1; } + /* all our dependents are scheduled, figure out if + * we have enough delay slots to schedule ourself: + */ + delay = delay_calc(ctx, instr); + if (delay) + return delay; + /* if the instruction is a kill, we need to ensure *every* * bary.f is scheduled. The hw seems unhappy if the thread * gets killed before the end-input (ei) flag is hit. @@ -339,110 +246,80 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, for (unsigned i = 0; i < ir->baryfs_count; i++) { struct ir3_instruction *baryf = ir->baryfs[i]; - if (baryf->flags & IR3_INSTR_UNUSED) + if (baryf->depth == DEPTH_UNUSED) continue; if (!is_scheduled(baryf)) { notes->blocked_kill = true; - return false; + return -1; } } } - return true; + if (check_conflict(ctx, notes, instr)) + return -1; + + return 0; } -/* Find the best instruction to schedule from specified instruction or - * recursively it's ssa sources. - */ -static struct ir3_instruction * -find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) +/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) { - struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; - struct ir3_instruction *src; - unsigned nsrcs = 0; - - if (is_scheduled(instr)) - return NULL; - - /* use instr->data to cache the results of recursing up the - * instr src's. Otherwise the recursive algo can scale quite - * badly w/ shader size. But this takes some care to clear - * the cache appropriately when instructions are scheduled. 
- */ - if (instr->data) { - if (instr->data == NULL_INSTR) - return NULL; - return instr->data; - } - - /* find unscheduled srcs: */ - foreach_ssa_src(src, instr) { - if (!is_scheduled(src)) { - debug_assert(nsrcs < ARRAY_SIZE(srcs)); - srcs[nsrcs++] = src; - } - } - - /* if all our src's are already scheduled: */ - if (nsrcs == 0) { - if (check_instr(ctx, notes, instr)) { - instr->data = instr; - return instr; - } - return NULL; - } - - while ((src = deepest(srcs, nsrcs))) { - struct ir3_instruction *candidate; - - candidate = find_instr_recursive(ctx, notes, src); - if (!candidate) - continue; - - if (check_instr(ctx, notes, candidate)) { - instr->data = candidate; - return candidate; + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; } } - - instr->data = NULL_INSTR; - return NULL; + return true; } -/* find instruction to schedule: */ -static struct ir3_instruction * -find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - bool soft) +/* move eligible instructions to the priority list: */ +static unsigned +add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct list_head *prio_queue, struct list_head *unscheduled_list) { - struct ir3_instruction *best_instr = NULL; unsigned min_delay = ~0; - /* TODO we'd really rather use the list/array of block outputs. But we - * don't have such a thing. Recursing *every* instruction in the list - * will result in a lot of repeated traversal, since instructions will - * get traversed both when they appear as ssa src to a later instruction - * as well as where they appear in the depth_list. - */ - list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { - struct ir3_instruction *candidate; - unsigned delay; - - candidate = find_instr_recursive(ctx, notes, instr); - if (!candidate) + list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) { + int e = instr_eligibility(ctx, notes, instr); + if (e < 0) continue; - delay = delay_calc(ctx->block, candidate, soft, false); - if (delay < min_delay) { - best_instr = candidate; - min_delay = delay; + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. 
+ */ + if (unlikely(writes_addr(instr))) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + continue; } - if (min_delay == 0) - break; + min_delay = MIN2(min_delay, e); + if (e == 0) { + /* remove from unscheduled list and into priority queue: */ + list_delinit(&instr->node); + ir3_insert_by_depth(instr, prio_queue); + } } - return best_instr; + return min_delay; } /* "spill" the address register by remapping any unscheduled @@ -536,56 +413,50 @@ split_pred(struct ir3_sched_ctx *ctx) static void sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) { - struct list_head unscheduled_list; + struct list_head unscheduled_list, prio_queue; ctx->block = block; - /* addr/pred writes are per-block: */ - ctx->addr = NULL; - ctx->pred = NULL; - /* move all instructions to the unscheduled list, and * empty the block's instruction list (to which we will - * be inserting). + * be inserting. */ list_replace(&block->instr_list, &unscheduled_list); list_inithead(&block->instr_list); - list_inithead(&ctx->depth_list); + list_inithead(&prio_queue); - /* first a pre-pass to schedule all meta:input instructions + /* first a pre-pass to schedule all meta:input/phi instructions * (which need to appear first so that RA knows the register is - * occupied), and move remaining to depth sorted list: + * occupied: */ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { - if (instr->opc == OPC_META_INPUT) { + if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) || + (instr->opc == OPC_META_PHI))) schedule(ctx, instr); - } else { - ir3_insert_by_depth(instr, &ctx->depth_list); - } } - while (!list_empty(&ctx->depth_list)) { + while (!(list_empty(&unscheduled_list) && + list_empty(&prio_queue))) { struct ir3_sched_notes notes = {0}; - struct ir3_instruction *instr; - - instr = find_eligible_instr(ctx, ¬es, true); - if (!instr) - instr = find_eligible_instr(ctx, ¬es, false); + unsigned delay; - if (instr) { - unsigned delay = delay_calc(ctx->block, instr, false, false); + delay = add_eligible_instrs(ctx, ¬es, &prio_queue, &unscheduled_list); - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: + if (!list_empty(&prio_queue)) { + struct ir3_instruction *instr = list_last_entry(&prio_queue, + struct ir3_instruction, node); + /* ugg, this is a bit ugly, but between the time when + * the instruction became eligible and now, a new + * conflict may have arose.. */ - debug_assert(delay <= 6); - while (delay > 0) { - ir3_NOP(block); - delay--; + if (check_conflict(ctx, ¬es, instr)) { + list_del(&instr->node); + list_addtail(&instr->node, &unscheduled_list); + continue; } schedule(ctx, instr); - } else { + } else if (delay == ~0) { struct ir3_instruction *new_instr = NULL; /* nothing available to schedule.. if we are blocked on @@ -604,17 +475,23 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } if (new_instr) { - /* clearing current addr/pred can change what is - * available to schedule, so clear cache.. 
- */ - clear_cache(ctx, NULL); - - ir3_insert_by_depth(new_instr, &ctx->depth_list); + list_del(&new_instr->node); + list_addtail(&new_instr->node, &unscheduled_list); /* the original instr that wrote addr/pred may have * originated from a different block: */ new_instr->block = block; } + + } else { + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } } } @@ -630,7 +507,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) debug_assert(ctx->pred); debug_assert(block->condition); - delay -= distance(ctx->block, ctx->pred, delay, false); + delay -= distance(ctx, ctx->pred, delay); while (delay > 0) { ir3_NOP(block); @@ -669,150 +546,36 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) */ } -/* After scheduling individual blocks, we still could have cases where - * one (or more) paths into a block, a value produced by a previous - * has too few delay slots to be legal. We can't deal with this in the - * first pass, because loops (ie. we can't ensure all predecessor blocks - * are already scheduled in the first pass). All we can really do at - * this point is stuff in extra nop's until things are legal. - */ +/* this is needed to ensure later RA stage succeeds: */ static void -sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +sched_insert_parallel_copies(struct ir3_block *block) { - unsigned n = 0; - - ctx->block = block; - - list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { - unsigned delay = 0; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned d = delay_calc(block->predecessors[i], instr, false, true); - delay = MAX2(d, delay); - } - - while (delay > n) { - struct ir3_instruction *nop = ir3_NOP(block); - - /* move to before instr: */ - list_delinit(&nop->node); - list_addtail(&nop->node, &instr->node); - - n++; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } } - - /* we can bail once we hit worst case delay: */ - if (++n > 6) - break; } } int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; - - ir3_clear_mark(ir); - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_block(&ctx, block); + sched_insert_parallel_copies(block); } - + ir3_clear_mark(ir); list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_intra_block(&ctx, block); + sched_block(&ctx, block); } - if (ctx.error) return -1; return 0; } - -/* does instruction 'prior' need to be scheduled before 'instr'? 
*/ -static bool -depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) -{ - /* TODO for dependencies that are related to a specific object, ie - * a specific SSBO/image/array, we could relax this constraint to - * make accesses to unrelated objects not depend on each other (at - * least as long as not declared coherent) - */ - if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) || - ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)) - return true; - return !!(instr->barrier_class & prior->barrier_conflict); -} - -static void -add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) -{ - struct list_head *prev = instr->node.prev; - struct list_head *next = instr->node.next; - - /* add dependencies on previous instructions that must be scheduled - * prior to the current instruction - */ - while (prev != &block->instr_list) { - struct ir3_instruction *pi = - LIST_ENTRY(struct ir3_instruction, prev, node); - - prev = prev->prev; - - if (is_meta(pi)) - continue; - - if (instr->barrier_class == pi->barrier_class) { - ir3_instr_add_dep(instr, pi); - break; - } - - if (depends_on(instr, pi)) - ir3_instr_add_dep(instr, pi); - } - - /* add dependencies on this instruction to following instructions - * that must be scheduled after the current instruction: - */ - while (next != &block->instr_list) { - struct ir3_instruction *ni = - LIST_ENTRY(struct ir3_instruction, next, node); - - next = next->next; - - if (is_meta(ni)) - continue; - - if (instr->barrier_class == ni->barrier_class) { - ir3_instr_add_dep(ni, instr); - break; - } - - if (depends_on(ni, instr)) - ir3_instr_add_dep(ni, instr); - } -} - -/* before scheduling a block, we need to add any necessary false-dependencies - * to ensure that: - * - * (1) barriers are scheduled in the right order wrt instructions related - * to the barrier - * - * (2) reads that come before a write actually get scheduled before the - * write - */ -static void -calculate_deps(struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (instr->barrier_class) { - add_barrier_deps(block, instr); - } - } -} - -void -ir3_sched_add_deps(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - calculate_deps(block); - } -}
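
For reference, the sequence word built in shader_resolve() packs two control bits per EXEC instruction, scanned in reverse so that instruction 0 lands in the low-order bits. A minimal standalone sketch of that packing, where is_fetch and needs_sync are hypothetical stand-ins for instr->instr_type == IR2_FETCH and instr->sync:

#include <stdbool.h>
#include <stdint.h>

/* Each instruction in an EXEC clause contributes two bits to the CF
 * "sequence" word: bit 0 set means a FETCH (vs ALU) instruction,
 * bit 1 requests a sync.  Scanning backwards shifts instruction 0
 * into the low bits, matching the loop in shader_resolve().  With at
 * most six instructions the result fits the 12-bit field asserted in
 * cf_emit() (sequence <= 0xfff).
 */
static uint32_t pack_sequence(const bool *is_fetch, const bool *needs_sync,
		int count)
{
	uint32_t sequence = 0;
	for (int j = count - 1; j >= 0; j--) {
		sequence <<= 2;
		if (is_fetch[j])
			sequence |= 0x1;
		if (needs_sync[j])
			sequence |= 0x2;
	}
	return sequence;
}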
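
The NOP padding in ir2_shader_assemble() exists because CF words are emitted in pairs, two 48-bit CFs per three dwords; the ALU/FETCH section then begins right after the CF section, which is the "addr" base that shader_resolve() starts counting from. A small sketch of that layout math, assuming ALIGN behaves like Mesa's align():

#include <stdint.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

/* CF count is first padded to an even number (hence the NOP insert),
 * then every pair of 48-bit CF words occupies three 32-bit dwords.
 * The value returned here is the dword offset at which ALU/FETCH
 * instruction slots begin.
 */
static uint32_t cf_section_dwords(uint32_t cfs_count)
{
	return ALIGN(cfs_count, 2) / 2 * 3;
}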
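
reg_update_stats() infers input registers from write order: a GPR that is read before the shader ever writes it must have been pre-loaded by the thread scheduler. A sketch of that rule, assuming 64 GPRs so a single 64-bit mask suffices; the hypothetical struct shader_stats mirrors the fields added to ir2_shader_info:

#include <stdbool.h>
#include <stdint.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

struct shader_stats {
	int8_t   max_reg;       /* highest GPR number used */
	uint8_t  max_input_reg; /* highest GPR read before any write */
	uint64_t regs_written;  /* one bit per GPR */
};

/* Called once per non-const, non-export register operand: writes set
 * the corresponding bit, and a read whose bit is still clear is
 * treated as an input register.
 */
static void track_reg(struct shader_stats *s, unsigned num, bool dest)
{
	s->max_reg = MAX2(s->max_reg, (int8_t)num);
	if (dest)
		s->regs_written |= (1ull << num);
	else if (!(s->regs_written & (1ull << num)))
		s->max_input_reg = MAX2(s->max_input_reg, (uint8_t)num);
}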
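
The new header comment in ir3_sched.c describes a priority-queue list scheduler: instructions whose ssa sources are all scheduled become eligible, the deepest eligible instruction is emitted, and the process repeats. A sketch of that rinse-repeat selection, using simplified stand-in types (struct insn and a flat array instead of ir3's depth-sorted list):

#include <stdbool.h>
#include <stddef.h>

struct insn {
	int depth;              /* priority: depth from the end of the program */
	int nsrcs;
	struct insn *src[4];    /* ssa sources */
	bool scheduled;
};

/* eligible once every ssa source has already been emitted */
static bool ready(const struct insn *in)
{
	for (int i = 0; i < in->nsrcs; i++)
		if (!in->src[i]->scheduled)
			return false;
	return true;
}

/* One round of the loop: pick the deepest eligible instruction, or
 * return NULL so the caller emits a nop (or resolves an addr/pred
 * conflict) instead.
 */
static struct insn *pick_next(struct insn **unscheduled, size_t n)
{
	struct insn *best = NULL;
	for (size_t i = 0; i < n; i++) {
		struct insn *in = unscheduled[i];
		if (!in || in->scheduled || !ready(in))
			continue;
		if (!best || in->depth > best->depth)
			best = in;
	}
	if (best)
		best->scheduled = true;
	return best;
}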
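
Finally, the per-source delay computed by delay_calc_srcn() reduces to: the producer's required delay slots, minus how far back it already sits in the emitted instruction stream, floored at zero because distance() is capped at the required delay. delay_calc() takes the maximum over all sources, and any remainder is filled with nops (at most 6, per the debug_assert in sched_block()). As a one-function sketch, with required_slots standing in for ir3_delayslots() and distance for distance():

/* delay still owed to a producer: its hardware delay-slot requirement
 * minus the alu/flow instructions already emitted since it.
 */
static unsigned src_delay(unsigned required_slots, unsigned distance)
{
	return (distance >= required_slots) ? 0 : required_slots - distance;
}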