| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2021-07-22 10:17:30 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2021-07-22 10:17:30 +0000 |
| commit | ca11beabae33eb59fb981b8adf50b1d47a2a98f0 (patch) | |
| tree | 3e4691a396e6e54cd54224a190663d5cf976625b /lib/mesa/src/broadcom/compiler/qpu_schedule.c | |
| parent | 27c8a50e8bbde7d28b1fc46d715a4c469e24f2c4 (diff) | |
Import Mesa 21.1.5
Diffstat (limited to 'lib/mesa/src/broadcom/compiler/qpu_schedule.c')
-rw-r--r-- | lib/mesa/src/broadcom/compiler/qpu_schedule.c | 1042 |
1 file changed, 924 insertions, 118 deletions
diff --git a/lib/mesa/src/broadcom/compiler/qpu_schedule.c b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
index 9f11fe27e..8af2e8ef2 100644
--- a/lib/mesa/src/broadcom/compiler/qpu_schedule.c
+++ b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
@@ -79,10 +79,12 @@ struct schedule_state {
         struct schedule_node *last_vpm_read;
         struct schedule_node *last_tmu_write;
         struct schedule_node *last_tmu_config;
+        struct schedule_node *last_tmu_read;
         struct schedule_node *last_tlb;
         struct schedule_node *last_vpm;
         struct schedule_node *last_unif;
         struct schedule_node *last_rtop;
+        struct schedule_node *last_unifa;
         enum direction dir;
         /* Estimated cycle when the current instruction would start. */
         uint32_t time;
@@ -167,6 +169,36 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
         }
 }
 
+static bool
+tmu_write_is_sequence_terminator(uint32_t waddr)
+{
+        switch (waddr) {
+        case V3D_QPU_WADDR_TMUS:
+        case V3D_QPU_WADDR_TMUSCM:
+        case V3D_QPU_WADDR_TMUSF:
+        case V3D_QPU_WADDR_TMUSLOD:
+        case V3D_QPU_WADDR_TMUA:
+        case V3D_QPU_WADDR_TMUAU:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static bool
+can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
+{
+        if (devinfo->ver < 40)
+                return false;
+
+        if (tmu_write_is_sequence_terminator(waddr))
+                return false;
+
+        if (waddr == V3D_QPU_WADDR_TMUD)
+                return false;
+
+        return true;
+}
+
 static void
 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
@@ -174,21 +206,14 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
 {
         if (!magic) {
                 add_write_dep(state, &state->last_rf[waddr], n);
-        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
-                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
-                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
-                 */
-                add_write_dep(state, &state->last_tmu_write, n);
-                switch (waddr) {
-                case V3D_QPU_WADDR_TMUS:
-                case V3D_QPU_WADDR_TMUSCM:
-                case V3D_QPU_WADDR_TMUSF:
-                case V3D_QPU_WADDR_TMUSLOD:
+        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
+                if (can_reorder_tmu_write(state->devinfo, waddr))
+                        add_read_dep(state, state->last_tmu_write, n);
+                else
+                        add_write_dep(state, &state->last_tmu_write, n);
+
+                if (tmu_write_is_sequence_terminator(waddr))
                         add_write_dep(state, &state->last_tmu_config, n);
-                        break;
-                default:
-                        break;
-                }
         } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                 /* Handled by v3d_qpu_writes_r4() check. */
         } else {
@@ -224,6 +249,12 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                          * barriers to affect ALU operations.
                          */
                         add_write_dep(state, &state->last_tmu_write, n);
+                        add_write_dep(state, &state->last_tmu_read, n);
+                        break;
+
+                case V3D_QPU_WADDR_UNIFA:
+                        if (state->devinfo->ver >= 40)
+                                add_write_dep(state, &state->last_unifa, n);
                         break;
 
                 case V3D_QPU_WADDR_NOP:
@@ -354,6 +385,9 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
         if (v3d_qpu_writes_r5(devinfo, inst))
                 add_write_dep(state, &state->last_r[5], n);
 
+        /* If we add any more dependencies here we should consider whether we
+         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
+         */
         if (inst->sig.thrsw) {
                 /* All accumulator contents and flags are undefined after the
                  * switch.
@@ -375,11 +409,17 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
 
         if (v3d_qpu_waits_on_tmu(inst)) {
                 /* TMU loads are coming from a FIFO, so ordering is important.
                  */
-                add_write_dep(state, &state->last_tmu_write, n);
+                add_write_dep(state, &state->last_tmu_read, n);
+                /* Keep TMU loads after their TMU lookup terminator */
+                add_read_dep(state, state->last_tmu_config, n);
         }
 
+        /* Allow wrtmuc to be reordered with other instructions in the
+         * same TMU sequence by using a read dependency on the last TMU
+         * sequence terminator.
+         */
         if (inst->sig.wrtmuc)
-                add_write_dep(state, &state->last_tmu_config, n);
+                add_read_dep(state, state->last_tmu_config, n);
 
         if (inst->sig.ldtlb | inst->sig.ldtlbu)
                 add_write_dep(state, &state->last_tlb, n);
@@ -398,6 +438,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
         if (vir_has_uniform(qinst))
                 add_write_dep(state, &state->last_unif, n);
 
+        /* Both unifa and ldunifa must preserve ordering */
+        if (inst->sig.ldunifa || inst->sig.ldunifarf)
+                add_write_dep(state, &state->last_unifa, n);
+
         if (v3d_qpu_reads_flags(inst))
                 add_read_dep(state, state->last_sf, n);
         if (v3d_qpu_writes_flags(inst))
@@ -443,9 +487,14 @@ struct choose_scoreboard {
         int last_stallable_sfu_reg;
         int last_stallable_sfu_tick;
         int last_ldvary_tick;
+        int last_unifa_write_tick;
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
+        int last_branch_tick;
+        int last_setmsf_tick;
         bool tlb_locked;
+        bool fixup_ldvary;
+        int ldvary_count;
 };
 
 static bool
@@ -566,7 +615,8 @@ mux_read_stalls(struct choose_scoreboard *scoreboard,
 #define MAX_SCHEDULE_PRIORITY 16
 
 static int
-get_instruction_priority(const struct v3d_qpu_instr *inst)
+get_instruction_priority(const struct v3d_device_info *devinfo,
+                         const struct v3d_qpu_instr *inst)
 {
         uint32_t baseline_score;
         uint32_t next_score = 0;
@@ -588,7 +638,7 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
         next_score++;
 
         /* Schedule texture read setup early to hide their latency better.
          */
-        if (v3d_qpu_writes_tmu(inst))
+        if (v3d_qpu_writes_tmu(devinfo, inst))
                 return next_score;
         next_score++;
@@ -599,9 +649,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
 }
 
 static bool
-qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
+qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
+                          enum v3d_qpu_waddr waddr)
 {
-        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
+        return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
                 v3d_qpu_magic_waddr_is_sfu(waddr) ||
                 v3d_qpu_magic_waddr_is_tlb(waddr) ||
                 v3d_qpu_magic_waddr_is_vpm(waddr) ||
@@ -609,7 +660,8 @@ qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
 }
 
 static bool
-qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
+qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
+                        const struct v3d_qpu_instr *inst)
 {
         if (v3d_qpu_uses_vpm(inst))
                 return true;
@@ -619,7 +671,7 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
                     inst->alu.add.magic_write &&
-                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
+                    qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
                         return true;
                 }
 
@@ -628,7 +680,7 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
 
                 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                     inst->alu.mul.magic_write &&
-                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
+                    qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
                         return true;
                 }
         }
@@ -645,8 +697,8 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                  const struct v3d_qpu_instr *a,
                                  const struct v3d_qpu_instr *b)
 {
-        const bool a_uses_peripheral = qpu_accesses_peripheral(a);
-        const bool b_uses_peripheral = qpu_accesses_peripheral(b);
+        const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
+        const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
 
         /* We can always do one peripheral access per instruction. */
         if (!a_uses_peripheral || !b_uses_peripheral)
@@ -658,19 +710,169 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
         /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
          * WRTMUC with a TMU magic register write (other than tmuc).
          */
-        if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) ||
-            (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) {
+        if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
+            (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
                 return true;
         }
 
-        if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) ||
-            (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) {
+        if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+            (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
                 return true;
         }
 
         return false;
 }
 
+/* Compute a bitmask of which rf registers are used between
+ * the two instructions.
+ */
+static uint64_t
+qpu_raddrs_used(const struct v3d_qpu_instr *a,
+                const struct v3d_qpu_instr *b)
+{
+        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
+
+        uint64_t raddrs_used = 0;
+        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
+                raddrs_used |= (1ll << a->raddr_a);
+        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+                raddrs_used |= (1ll << a->raddr_b);
+        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
+                raddrs_used |= (1ll << b->raddr_a);
+        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+                raddrs_used |= (1ll << b->raddr_b);
+
+        return raddrs_used;
+}
+
+/* Take two instructions and attempt to merge their raddr fields
+ * into one merged instruction. Returns false if the two instructions
+ * access more than two different rf registers between them, or more
+ * than one rf register and one small immediate.
+ */
+static bool
+qpu_merge_raddrs(struct v3d_qpu_instr *result,
+                 const struct v3d_qpu_instr *add_instr,
+                 const struct v3d_qpu_instr *mul_instr)
+{
+        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
+        int naddrs = util_bitcount64(raddrs_used);
+
+        if (naddrs > 2)
+                return false;
+
+        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+                if (naddrs > 1)
+                        return false;
+
+                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+                        if (add_instr->raddr_b != mul_instr->raddr_b)
+                                return false;
+
+                result->sig.small_imm = true;
+                result->raddr_b = add_instr->sig.small_imm ?
+                        add_instr->raddr_b : mul_instr->raddr_b;
+        }
+
+        if (naddrs == 0)
+                return true;
+
+        int raddr_a = ffsll(raddrs_used) - 1;
+        raddrs_used &= ~(1ll << raddr_a);
+        result->raddr_a = raddr_a;
+
+        if (!result->sig.small_imm) {
+                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
+                    raddr_a == add_instr->raddr_b) {
+                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
+                                result->alu.add.a = V3D_QPU_MUX_A;
+                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
+                                result->alu.add.b = V3D_QPU_MUX_A;
+                        }
+                }
+                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
+                    raddr_a == mul_instr->raddr_b) {
+                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
+                                result->alu.mul.a = V3D_QPU_MUX_A;
+                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
+                                result->alu.mul.b = V3D_QPU_MUX_A;
+                        }
+                }
+        }
+        if (!raddrs_used)
+                return true;
+
+        int raddr_b = ffsll(raddrs_used) - 1;
+        result->raddr_b = raddr_b;
+        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
+            raddr_b == add_instr->raddr_a) {
+                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
+                        result->alu.add.a = V3D_QPU_MUX_B;
+                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
+                        result->alu.add.b = V3D_QPU_MUX_B;
+                }
+        }
+        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
+            raddr_b == mul_instr->raddr_a) {
+                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
+                        result->alu.mul.a = V3D_QPU_MUX_B;
+                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
+                        result->alu.mul.b = V3D_QPU_MUX_B;
+                }
+        }
+
+        return true;
+}
+
+static bool
+can_do_add_as_mul(enum v3d_qpu_add_op op)
+{
+        switch (op) {
+        case V3D_QPU_A_ADD:
+        case V3D_QPU_A_SUB:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static enum v3d_qpu_mul_op
+add_op_as_mul_op(enum v3d_qpu_add_op op)
+{
+        switch (op) {
+        case V3D_QPU_A_ADD:
+                return V3D_QPU_M_ADD;
+        case V3D_QPU_A_SUB:
+                return V3D_QPU_M_SUB;
+        default:
+                unreachable("unexpected add opcode");
+        }
+}
+
+static void
+qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+{
+        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
+        assert(inst->alu.add.op != V3D_QPU_A_NOP);
+        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
+
+        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
+        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
+        inst->alu.add.op = V3D_QPU_A_NOP;
+
+        inst->flags.mc = inst->flags.ac;
+        inst->flags.mpf = inst->flags.apf;
+        inst->flags.muf = inst->flags.auf;
+        inst->flags.ac = V3D_QPU_PF_NONE;
+        inst->flags.apf = V3D_QPU_PF_NONE;
+        inst->flags.auf = V3D_QPU_PF_NONE;
+}
+
 static bool
 qpu_merge_inst(const struct v3d_device_info *devinfo,
                struct v3d_qpu_instr *result,
@@ -686,15 +888,54 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
                 return false;
 
         struct v3d_qpu_instr merge = *a;
+        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
 
+        struct v3d_qpu_instr mul_inst;
         if (b->alu.add.op != V3D_QPU_A_NOP) {
-                if (a->alu.add.op != V3D_QPU_A_NOP)
-                        return false;
-                merge.alu.add = b->alu.add;
+                if (a->alu.add.op == V3D_QPU_A_NOP) {
+                        merge.alu.add = b->alu.add;
 
-                merge.flags.ac = b->flags.ac;
-                merge.flags.apf = b->flags.apf;
-                merge.flags.auf = b->flags.auf;
+                        merge.flags.ac = b->flags.ac;
+                        merge.flags.apf = b->flags.apf;
+                        merge.flags.auf = b->flags.auf;
+
+                        add_instr = b;
+                        mul_instr = a;
+                }
+                /* If a's add op is used but its mul op is not, then see if we
+                 * can convert either a's add op or b's add op to a mul op
+                 * so we can merge.
+                 */
+                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                         can_do_add_as_mul(b->alu.add.op)) {
+                        mul_inst = *b;
+                        qpu_convert_add_to_mul(&mul_inst);
+
+                        merge.alu.mul = mul_inst.alu.mul;
+
+                        merge.flags.mc = b->flags.ac;
+                        merge.flags.mpf = b->flags.apf;
+                        merge.flags.muf = b->flags.auf;
+
+                        add_instr = a;
+                        mul_instr = &mul_inst;
+                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                           can_do_add_as_mul(a->alu.add.op)) {
+                        mul_inst = *a;
+                        qpu_convert_add_to_mul(&mul_inst);
+
+                        merge = mul_inst;
+                        merge.alu.add = b->alu.add;
+
+                        merge.flags.ac = b->flags.ac;
+                        merge.flags.apf = b->flags.apf;
+                        merge.flags.auf = b->flags.auf;
+
+                        add_instr = b;
+                        mul_instr = &mul_inst;
+                } else {
+                        return false;
+                }
         }
 
         if (b->alu.mul.op != V3D_QPU_M_NOP) {
@@ -705,23 +946,14 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
                 merge.flags.mc = b->flags.mc;
                 merge.flags.mpf = b->flags.mpf;
                 merge.flags.muf = b->flags.muf;
-        }
 
-        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
-                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
-                    a->raddr_a != b->raddr_a) {
-                        return false;
-                }
-                merge.raddr_a = b->raddr_a;
+                mul_instr = b;
+                add_instr = a;
         }
 
-        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
-                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
-                    (a->raddr_b != b->raddr_b ||
-                     a->sig.small_imm != b->sig.small_imm)) {
+        if (add_instr && mul_instr &&
+            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
                         return false;
-                }
-                merge.raddr_b = b->raddr_b;
         }
 
         merge.sig.thrsw |= b->sig.thrsw;
@@ -755,8 +987,19 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
         return ok;
 }
 
+static inline bool
+try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
+{
+        return inst->sig.ldunif || inst->sig.ldunifrf;
+}
+
+static bool
+qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+                                         struct choose_scoreboard *scoreboard,
+                                         const struct qinst *qinst);
+
 static struct schedule_node *
-choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
+choose_instruction_to_schedule(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct schedule_node *prev_inst)
 {
@@ -771,10 +1014,19 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                 return NULL;
         }
 
+        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
+                scoreboard->ldvary_count < c->num_inputs;
+        bool skipped_insts_for_ldvary_pipelining = false;
+retry:
         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                             dag.link) {
                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
 
+                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
+                        skipped_insts_for_ldvary_pipelining = true;
+                        continue;
+                }
+
                 /* Don't choose the branch instruction until it's the last one
                  * left.  We'll move it up to fit its delay slots after we
                  * choose it.
@@ -784,6 +1036,13 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                         continue;
                 }
 
+                /* We need to have 3 delay slots between a write to unifa and
+                 * a follow-up ldunifa.
+                 */
+                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
+                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
+                        continue;
+
                 /* "An instruction must not read from a location in physical
                  * regfile A or B that was written to by the previous
                  * instruction."
@@ -791,7 +1050,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                 if (reads_too_soon_after_write(scoreboard, n->inst))
                         continue;
 
-                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
+                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                         continue;
 
                 /* "A scoreboard wait must not occur in the first two
@@ -806,18 +1065,42 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
-                 *
-                 * XXX perf: To get good pipelining of a sequence of varying
-                 * loads, we need to figure out how to pair the ldvary signal
-                 * up to the instruction before the last r5 user in the
-                 * previous ldvary sequence.  Currently, it usually pairs with
-                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                         continue;
                 }
 
+                /* If we are in a thrsw delay slot check that this instruction
+                 * is valid for that.
+                 */
+                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
+                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
+                                                              n->inst)) {
+                        continue;
+                }
+
+                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                        /* Don't try to put a branch in the delay slots of another
+                         * branch or a unifa write.
+                         */
+                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
+                                continue;
+                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
+                                continue;
+
+                        /* No branch with cond != 0,2,3 and msfign != 0 after
+                         * setmsf.
+                         */
+                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
+                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+                                continue;
+                        }
+                }
+
                 /* If we're trying to pair with another instruction, check
                  * that they're compatible.
                  */
@@ -832,6 +1115,22 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                             n->inst->uniform != -1)
                                 continue;
 
+                        /* Simulator complains if we have two uniforms loaded in
+                         * the same instruction, which could happen if we
+                         * have a ldunif or sideband uniform and we pair that
+                         * with ldunifa.
+                         */
+                        if (vir_has_uniform(prev_inst->inst) &&
+                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
+                                continue;
+                        }
+
+                        if ((prev_inst->inst->qpu.sig.ldunifa ||
+                             prev_inst->inst->qpu.sig.ldunifarf) &&
+                            vir_has_uniform(n->inst)) {
+                                continue;
+                        }
+
                         /* Don't merge in something that will lock the TLB.
                          * Hopefully what we have in inst will release some
                          * other instructions, allowing us to delay the
@@ -840,14 +1139,27 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                         if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                 continue;
 
+                        /* When we successfully pair up an ldvary we then try
+                         * to merge it into the previous instruction if
+                         * possible to improve pipelining.  Don't pick up the
+                         * ldvary now if the follow-up fixup would place
+                         * it in the delay slots of a thrsw, which is not
+                         * allowed and would prevent the fixup from being
+                         * successful.
+                         */
+                        if (inst->sig.ldvary &&
+                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+                                continue;
+                        }
+
                         struct v3d_qpu_instr merged_inst;
-                        if (!qpu_merge_inst(devinfo, &merged_inst,
+                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                             &prev_inst->inst->qpu, inst)) {
                                 continue;
                         }
                 }
 
-                int prio = get_instruction_priority(inst);
+                int prio = get_instruction_priority(c->devinfo, inst);
 
                 if (mux_read_stalls(scoreboard, inst)) {
                         /* Don't merge an instruction that stalls */
@@ -885,15 +1197,36 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                 }
         }
 
+        /* If we did not find any instruction to schedule but we discarded
+         * some of them to prioritize ldvary pipelining, try again.
+         */
+        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
+                skipped_insts_for_ldvary_pipelining = false;
+                ldvary_pipelining = false;
+                goto retry;
+        }
+
+        if (chosen && chosen->inst->qpu.sig.ldvary) {
+                scoreboard->ldvary_count++;
+                /* If we are pairing an ldvary, flag it so we can fix it up for
+                 * optimal pipelining of ldvary sequences.
+                 */
+                if (prev_inst)
+                        scoreboard->fixup_ldvary = true;
+        }
+
         return chosen;
 }
 
 static void
 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
-                                  enum v3d_qpu_waddr waddr)
+                                  enum v3d_qpu_waddr waddr,
+                                  const struct v3d_device_info *devinfo)
 {
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
+        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+                scoreboard->last_unifa_write_tick = scoreboard->tick;
 }
 
 static void
@@ -908,7 +1241,8 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 
 static void
 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
-                             const struct v3d_qpu_instr *inst)
+                             const struct v3d_qpu_instr *inst,
+                             const struct v3d_device_info *devinfo)
 {
         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                 return;
@@ -918,17 +1252,22 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
         if (inst->alu.add.op != V3D_QPU_A_NOP) {
                 if (inst->alu.add.magic_write) {
                         update_scoreboard_for_magic_waddr(scoreboard,
-                                                          inst->alu.add.waddr);
+                                                          inst->alu.add.waddr,
+                                                          devinfo);
                 } else {
                         update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                               inst);
                 }
+
+                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
+                        scoreboard->last_setmsf_tick = scoreboard->tick;
         }
 
         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                 if (inst->alu.mul.magic_write) {
                         update_scoreboard_for_magic_waddr(scoreboard,
-                                                          inst->alu.mul.waddr);
+                                                          inst->alu.mul.waddr,
+                                                          devinfo);
                 }
         }
 
@@ -962,7 +1301,8 @@ dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
         }
 }
 
-static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
+static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
+                                    enum v3d_qpu_waddr waddr,
                                     const struct v3d_qpu_instr *after)
 {
         /* Apply some huge latency between texture fetch requests and getting
@@ -988,8 +1328,10 @@ static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
          *
          * because we associate the first load_tmu0 with the *second* tmu0_s.
          */
-        if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after))
+        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
+            v3d_qpu_waits_on_tmu(after)) {
                 return 100;
+        }
 
         /* Assume that anything depending on us is consuming the SFU result.
          */
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
@@ -999,7 +1341,8 @@
 static uint32_t
-instruction_latency(struct schedule_node *before, struct schedule_node *after)
+instruction_latency(const struct v3d_device_info *devinfo,
+                    struct schedule_node *before, struct schedule_node *after)
 {
         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
@@ -1011,13 +1354,15 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after)
 
         if (before_inst->alu.add.magic_write) {
                 latency = MAX2(latency,
-                               magic_waddr_latency(before_inst->alu.add.waddr,
+                               magic_waddr_latency(devinfo,
+                                                   before_inst->alu.add.waddr,
                                                    after_inst));
         }
 
         if (before_inst->alu.mul.magic_write) {
                 latency = MAX2(latency,
-                               magic_waddr_latency(before_inst->alu.mul.waddr,
+                               magic_waddr_latency(devinfo,
+                                                   before_inst->alu.mul.waddr,
                                                    after_inst));
         }
 
@@ -1032,6 +1377,7 @@ static void
 compute_delay(struct dag_node *node, void *state)
 {
         struct schedule_node *n = (struct schedule_node *)node;
+        struct v3d_compile *c = (struct v3d_compile *) state;
 
         n->delay = 1;
 
@@ -1040,7 +1386,8 @@ compute_delay(struct dag_node *node, void *state)
                         (struct schedule_node *)edge->child;
 
                 n->delay = MAX2(n->delay,
                                 (child->delay +
-                                 instruction_latency(n, child)));
+                                 instruction_latency(c->devinfo, n,
+                                                     child)));
         }
 }
 
@@ -1059,7 +1406,8 @@ pre_remove_head(struct dag *dag, struct schedule_node *n)
 }
 
 static void
-mark_instruction_scheduled(struct dag *dag,
+mark_instruction_scheduled(const struct v3d_device_info *devinfo,
+                           struct dag *dag,
                            uint32_t time,
                            struct schedule_node *node)
 {
@@ -1073,7 +1421,7 @@ mark_instruction_scheduled(struct dag *dag,
                 if (!child)
                         continue;
 
-                uint32_t latency = instruction_latency(node, child);
+                uint32_t latency = instruction_latency(devinfo, node, child);
 
                 child->unblocked_time = MAX2(child->unblocked_time,
                                              time + latency);
@@ -1089,7 +1437,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
 {
         list_addtail(&inst->link, &block->instructions);
 
-        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
+        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
         c->qpu_inst_count++;
         scoreboard->tick++;
 }
@@ -1111,8 +1459,8 @@ emit_nop(struct v3d_compile *c, struct qblock *block,
 }
 
 static bool
-qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
-                                     const struct qinst *qinst, int slot)
+qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+                              const struct qinst *qinst, int slot)
 {
         const struct v3d_qpu_instr *inst = &qinst->qpu;
 
@@ -1169,6 +1517,140 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
         return true;
 }
 
+/**
+ * This is called when trying to merge a thrsw back into the instruction stream
+ * of instructions that were scheduled *before* the thrsw signal to fill its
+ * delay slots.  Because the actual execution of the thrsw happens after the
+ * delay slots, it is usually safe to do this, but there are some cases that
+ * need special care.
+ */
+static bool
+qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+                                          const struct qinst *qinst,
+                                          uint32_t slot)
+{
+        /* No scheduling SFU when the result would land in the other
+         * thread.  The simulator complains for safety, though it
+         * would only occur for dead code in our case.
+         */
+        if (slot > 0 &&
+            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
+             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
+                return false;
+        }
+
+        if (slot > 0 && qinst->qpu.sig.ldvary)
+                return false;
+
+        /* unifa and the following 3 instructions can't overlap a
+         * thread switch/end.  The docs further clarify that this means
+         * the cycle at which the actual thread switch/end happens
+         * and not when the thrsw instruction is processed, which would
+         * be after the 2 delay slots following the thrsw instruction.
+         * This means that we can move a thrsw up to the instruction
+         * right after unifa:
+         *
+         * unifa, r5
+         * thrsw
+         * delay slot 1
+         * delay slot 2
+         * Thread switch happens here, 4 instructions away from unifa
+         */
+        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
+                return false;
+
+        return true;
+}
+
+/**
+ * This is called for instructions scheduled *after* a thrsw signal that may
+ * land in the delay slots of the thrsw.  Because these instructions were
+ * scheduled after the thrsw, we need to be careful when placing them into
+ * the delay slots, since that means that we are moving them ahead of the
+ * thread switch and we need to ensure that is not a problem.
+ */
+static bool
+qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+                                         struct choose_scoreboard *scoreboard,
+                                         const struct qinst *qinst)
+{
+        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
+        assert(slot <= 2);
+
+        /* We merge thrsw instructions back into the instruction stream
+         * manually, so any instructions scheduled after a thrsw should be
+         * in the actual delay slots and not in the same slot as the thrsw.
+         */
+        assert(slot >= 1);
+
+        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
+        if (qinst->qpu.sig.thrsw)
+                return false;
+
+        /* The restrictions for instructions scheduled before the thrsw
+         * also apply to instructions scheduled after the thrsw that we want
+         * to place in its delay slots.
+         */
+        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+                return false;
+
+        /* TLB access is disallowed until scoreboard wait is executed, which
+         * we do on the last thread switch.
+         */
+        if (qpu_inst_is_tlb(&qinst->qpu))
+                return false;
+
+        /* Instruction sequence restrictions: Branch is not allowed in delay
+         * slots of a thrsw.
+         */
+        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return false;
+
+        /* Miscellaneous restrictions: At the point of a thrsw we need to have
+         * at least one outstanding lookup or TSY wait.
+         *
+         * So avoid placing TMU instructions scheduled after the thrsw into
+         * its delay slots or we may be compromising the integrity of our TMU
+         * sequences.  Also, notice that if we moved these instructions into
+         * the delay slots of a previous thrsw we could overflow our TMU output
+         * fifo, since we could be effectively pipelining a lookup scheduled
+         * after the thrsw into the sequence before the thrsw.
+         */
+        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
+            qinst->qpu.sig.wrtmuc) {
+                return false;
+        }
+
+        /* Don't move instructions that wait on the TMU before the thread switch
+         * happens since that would make the current thread stall before the
+         * switch, which is exactly what we want to avoid with the thrsw
+         * instruction.
+         */
+        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
+                return false;
+
+        /* A thread switch invalidates all accumulators, so don't place any
+         * instructions that write accumulators into the delay slots.
+         */
+        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
+                return false;
+
+        /* Multop has an implicit write to the rtop register which is a
+         * specialized accumulator that is only used with this instruction.
+         */
+        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
+                return false;
+
+        /* Flags are invalidated across a thread switch, so don't place
+         * instructions that write flags into delay slots.
+         */
+        if (v3d_qpu_writes_flags(&qinst->qpu))
+                return false;
+
+        return true;
+}
+
 static bool
 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                      struct qinst *qinst, int instructions_in_sequence,
@@ -1181,22 +1663,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
         }
 
         for (int slot = 0; slot < instructions_in_sequence; slot++) {
-                /* No scheduling SFU when the result would land in the other
-                 * thread.  The simulator complains for safety, though it
-                 * would only occur for dead code in our case.
-                 */
-                if (slot > 0 &&
-                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
-                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
-                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
-                        return false;
-                }
-
-                if (slot > 0 && qinst->qpu.sig.ldvary)
+                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                         return false;
 
                 if (is_thrend &&
-                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
+                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                         return false;
                 }
 
@@ -1229,6 +1700,18 @@ emit_thrsw(struct v3d_compile *c,
         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
 
+        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
+         * or branch.
+         */
+        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
+                emit_nop(c, block, scoreboard);
+                time++;
+        }
+        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
+                emit_nop(c, block, scoreboard);
+                time++;
+        }
+
         /* Find how far back into previous instructions we can put the THRSW. */
         int slots_filled = 0;
         struct qinst *merge_inst = NULL;
@@ -1264,21 +1747,27 @@ emit_thrsw(struct v3d_compile *c,
                 merge_inst = inst;
         }
 
-        /* Insert any extra delay slot NOPs we need. */
-        for (int i = 0; i < 3 - slots_filled; i++) {
-                emit_nop(c, block, scoreboard);
-                time++;
-        }
-
         /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
         if (inst->is_last_thrsw) {
+                if (slots_filled <= 1) {
+                        emit_nop(c, block, scoreboard);
+                        time++;
+                }
                 struct qinst *second_inst =
                         (struct qinst *)merge_inst->link.next;
                 second_inst->qpu.sig.thrsw = true;
         }
 
+        /* Make sure the thread end executes within the program lifespan */
+        if (is_thrend) {
+                for (int i = 0; i < 3 - slots_filled; i++) {
+                        emit_nop(c, block, scoreboard);
+                        time++;
+                }
+        }
+
         /* If we put our THRSW into another instruction, free up the
          * instruction that didn't end up scheduled into the list.
*/ @@ -1288,6 +1777,273 @@ emit_thrsw(struct v3d_compile *c, return time; } +static bool +qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst) +{ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) + return false; + + if (inst->qpu.sig.thrsw) + return false; + + if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu)) + return false; + + if (vir_has_uniform(inst)) + return false; + + return true; +} + +static void +emit_branch(struct v3d_compile *c, + struct qblock *block, + struct choose_scoreboard *scoreboard, + struct qinst *inst) +{ + assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); + + /* We should've not picked up a branch for the delay slots of a previous + * thrsw, branch or unifa write instruction. + */ + int branch_tick = scoreboard->tick; + assert(scoreboard->last_thrsw_tick + 2 < branch_tick); + assert(scoreboard->last_branch_tick + 3 < branch_tick); + assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); + + /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + * setmsf. + */ + bool is_safe_msf_branch = + inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || + inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0; + assert(scoreboard->last_setmsf_tick != branch_tick - 1 || + is_safe_msf_branch); + + /* Insert the branch instruction */ + insert_scheduled_instruction(c, block, scoreboard, inst); + + /* Now see if we can move the branch instruction back into the + * instruction stream to fill its delay slots + */ + int slots_filled = 0; + while (slots_filled < 3 && block->instructions.next != &inst->link) { + struct qinst *prev_inst = (struct qinst *) inst->link.prev; + assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH); + + /* Can't move the branch instruction if that would place it + * in the delay slots of other instructions. + */ + if (scoreboard->last_branch_tick + 3 >= + branch_tick - slots_filled - 1) { + break; + } + + if (scoreboard->last_thrsw_tick + 2 >= + branch_tick - slots_filled - 1) { + break; + } + + if (scoreboard->last_unifa_write_tick + 3 >= + branch_tick - slots_filled - 1) { + break; + } + + /* Can't move a conditional branch before the instruction + * that writes the flags for its condition. + */ + if (v3d_qpu_writes_flags(&prev_inst->qpu) && + inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) { + break; + } + + if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst)) + break; + + if (!is_safe_msf_branch) { + struct qinst *prev_prev_inst = + (struct qinst *) prev_inst->link.prev; + if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) { + break; + } + } + + list_del(&prev_inst->link); + list_add(&prev_inst->link, &inst->link); + slots_filled++; + } + + block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled; + scoreboard->last_branch_tick = branch_tick - slots_filled; + + /* Fill any remaining delay slots. + * + * For unconditional branches we'll try to fill these with the + * first instructions in the successor block after scheduling + * all blocks when setting up branch targets. 
+         */
+        for (int i = 0; i < 3 - slots_filled; i++)
+                emit_nop(c, block, scoreboard);
+}
+
+static bool
+alu_reads_register(struct v3d_qpu_instr *inst,
+                   bool add, bool magic, uint32_t index)
+{
+        uint32_t num_src;
+        enum v3d_qpu_mux mux_a, mux_b;
+
+        if (add) {
+                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+                mux_a = inst->alu.add.a;
+                mux_b = inst->alu.add.b;
+        } else {
+                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+                mux_a = inst->alu.mul.a;
+                mux_b = inst->alu.mul.b;
+        }
+
+        for (int i = 0; i < num_src; i++) {
+                if (magic) {
+                        if (i == 0 && mux_a == index)
+                                return true;
+                        if (i == 1 && mux_b == index)
+                                return true;
+                } else {
+                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                }
+        }
+
+        return false;
+}
+
+/**
+ * This takes an ldvary signal merged into 'inst' and tries to move it up to
+ * the previous instruction to get good pipelining of ldvary sequences,
+ * transforming this:
+ *
+ * nop                ; nop              ; ldvary.r4
+ * nop                ; fmul r0, r4, rf0 ;
+ * fadd rf13, r0, r5  ; nop;             ; ldvary.r1  <-- inst
+ *
+ * into:
+ *
+ * nop                ; nop              ; ldvary.r4
+ * nop                ; fmul r0, r4, rf0 ; ldvary.r1
+ * fadd rf13, r0, r5  ; nop;             ;            <-- inst
+ *
+ * If we manage to do this successfully (we return true here), then flagging
+ * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
+ * we will be able to pick up to merge into 'inst', leading to code like this:
+ *
+ * nop                ; nop              ; ldvary.r4
+ * nop                ; fmul r0, r4, rf0 ; ldvary.r1
+ * fadd rf13, r0, r5  ; fmul r2, r1, rf0 ;            <-- inst
+ */
+static bool
+fixup_pipelined_ldvary(struct v3d_compile *c,
+                       struct choose_scoreboard *scoreboard,
+                       struct qblock *block,
+                       struct v3d_qpu_instr *inst)
+{
+        /* We only call this if we have successfully merged an ldvary into a
+         * previous instruction.
+         */
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(inst->sig.ldvary);
+        uint32_t ldvary_magic = inst->sig_magic;
+        uint32_t ldvary_index = inst->sig_addr;
+
+        /* The instruction in which we merged the ldvary cannot read
+         * the ldvary destination, if it does, then moving the ldvary before
+         * it would overwrite it.
+         */
+        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+                return false;
+        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+                return false;
+
+        /* The previous instruction can't write to the same destination as the
+         * ldvary.
+         */
+        struct qinst *prev = (struct qinst *) block->instructions.prev;
+        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
+                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
+                    prev->qpu.alu.add.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
+                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
+                    prev->qpu.alu.mul.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        /* The previous instruction cannot have a conflicting signal */
+        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+                return false;
+
+        /* The previous instruction cannot use flags since ldvary uses the
+         * 'cond' instruction field to store the destination.
+         */
+        if (v3d_qpu_writes_flags(&prev->qpu))
+                return false;
+        if (v3d_qpu_reads_flags(&prev->qpu))
+                return false;
+
+        /* We can't put an ldvary in the delay slots of a thrsw.  We should've
+         * prevented this when pairing up the ldvary with another instruction
+         * and flagging it for a fixup.
+         */
+        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+
+        /* Move the ldvary to the previous instruction and remove it from the
+         * current one.
+         */
+        prev->qpu.sig.ldvary = true;
+        prev->qpu.sig_magic = ldvary_magic;
+        prev->qpu.sig_addr = ldvary_index;
+        scoreboard->last_ldvary_tick = scoreboard->tick - 1;
+
+        inst->sig.ldvary = false;
+        inst->sig_magic = false;
+        inst->sig_addr = 0;
+
+        /* By moving ldvary to the previous instruction we make it update
+         * r5 in the current one, so nothing else in it should write r5.
+         * This should've been prevented by our dependency tracking, which
+         * would not allow ldvary to be paired up with an instruction that
+         * writes r5 (since our dependency tracking doesn't know that the
+         * ldvary write to r5 happens in the next instruction).
+         */
+        assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+
+        return true;
+}
+
 static uint32_t
 schedule_instructions(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
@@ -1301,9 +2057,7 @@ schedule_instructions(struct v3d_compile *c,
 
         while (!list_is_empty(&scoreboard->dag->heads)) {
                 struct schedule_node *chosen =
-                        choose_instruction_to_schedule(devinfo,
-                                                       scoreboard,
-                                                       NULL);
+                        choose_instruction_to_schedule(c, scoreboard, NULL);
                 struct schedule_node *merge = NULL;
 
                 /* If there are no valid instructions to schedule, drop a NOP
@@ -1336,11 +2090,10 @@ schedule_instructions(struct v3d_compile *c,
                         pre_remove_head(scoreboard->dag, chosen);
 
                         while ((merge =
-                                choose_instruction_to_schedule(devinfo,
-                                                               scoreboard,
+                                choose_instruction_to_schedule(c, scoreboard,
                                                                chosen))) {
                                 time = MAX2(merge->unblocked_time, time);
-                                pre_remove_head(scoreboard->dag, chosen);
+                                pre_remove_head(scoreboard->dag, merge);
                                 list_addtail(&merge->link, &merged_list);
                                 (void)qpu_merge_inst(devinfo, inst,
                                                      inst, &merge->inst->qpu);
@@ -1358,6 +2111,21 @@ schedule_instructions(struct v3d_compile *c,
                                 v3d_qpu_dump(devinfo, inst);
                                 fprintf(stderr, "\n");
                         }
+
+                        if (scoreboard->fixup_ldvary) {
+                                scoreboard->fixup_ldvary = false;
+                                if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
+                                        /* Flag the ldvary as scheduled
+                                         * now so we can try to merge the
+                                         * follow-up instruction in the
+                                         * ldvary sequence into the
+                                         * current instruction.
+                                         */
+                                        mark_instruction_scheduled(
+                                                devinfo, scoreboard->dag,
+                                                time, merge);
+                                }
+                        }
                 }
                 if (mux_read_stalls(scoreboard, inst))
                         c->qpu_inst_stalled_count++;
@@ -1388,10 +2156,10 @@ schedule_instructions(struct v3d_compile *c,
                  * be scheduled.  Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                mark_instruction_scheduled(scoreboard->dag, time, chosen);
+                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                 list_for_each_entry(struct schedule_node, merge, &merged_list,
                                     link) {
-                        mark_instruction_scheduled(scoreboard->dag, time, merge);
+                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
 
                         /* The merged VIR instruction doesn't get re-added to the
                          * block, so free it now.
@@ -1401,23 +2169,11 @@ schedule_instructions(struct v3d_compile *c,
 
                 if (inst->sig.thrsw) {
                         time += emit_thrsw(c, block, scoreboard, qinst, false);
+                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                        emit_branch(c, block, scoreboard, qinst);
                 } else {
                         insert_scheduled_instruction(c, block, scoreboard, qinst);
-
-                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
-                                block->branch_qpu_ip = c->qpu_inst_count - 1;
-                                /* Fill the delay slots.
-                                 *
-                                 * We should fill these with actual instructions,
-                                 * instead, but that will probably need to be done
-                                 * after this, once we know what the leading
-                                 * instructions of the successors are (so we can
-                                 * handle A/B register file write latency)
-                                 */
-                                for (int i = 0; i < 3; i++)
-                                        emit_nop(c, block, scoreboard);
-                        }
                 }
         }
 
@@ -1454,7 +2210,7 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
         calculate_forward_deps(c, scoreboard->dag, &setup_list);
         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
 
-        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
+        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
 
         uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                 orig_uniform_contents,
@@ -1487,11 +2243,30 @@ qpu_set_branch_targets(struct v3d_compile *c)
                 /* Walk back through the delay slots to find the branch
                  * instr.
                  */
+                struct qinst *branch = NULL;
                 struct list_head *entry = block->instructions.prev;
-                for (int i = 0; i < 3; i++)
+                int32_t delay_slot_count = -1;
+                struct qinst *delay_slots_start = NULL;
+                for (int i = 0; i < 3; i++) {
                         entry = entry->prev;
-                struct qinst *branch = container_of(entry, branch, link);
-                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+                        struct qinst *inst =
+                                container_of(entry, struct qinst, link);
+
+                        if (delay_slot_count == -1) {
+                                if (!v3d_qpu_is_nop(&inst->qpu))
+                                        delay_slot_count = i;
+                                else
+                                        delay_slots_start = inst;
+                        }
+
+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                                branch = inst;
+                                break;
+                        }
+                }
+                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
+                assert(delay_slot_count == 0 || delay_slots_start != NULL);
 
                 /* Make sure that the if-we-don't-jump
                  * successor was scheduled just after the
@@ -1517,6 +2292,34 @@ qpu_set_branch_targets(struct v3d_compile *c)
                 c->uniform_data[branch->uniform] =
                         (block->successors[0]->start_uniform -
                          (block->branch_uniform + 1)) * 4;
+
+                /* If this is an unconditional branch, try to fill any remaining
+                 * delay slots with the initial instructions of the successor
+                 * block.
+                 *
+                 * FIXME: we can do the same for conditional branches if we
+                 * predicate the instructions to match the branch condition.
+                 */
+                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
+                        struct list_head *successor_insts =
+                                &block->successors[0]->instructions;
+                        delay_slot_count = MIN2(delay_slot_count,
+                                                list_length(successor_insts));
+                        struct qinst *s_inst =
+                                (struct qinst *) successor_insts->next;
+                        struct qinst *slot = delay_slots_start;
+                        int slots_filled = 0;
+                        while (slots_filled < delay_slot_count &&
+                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
+                                memcpy(&slot->qpu, &s_inst->qpu,
+                                       sizeof(slot->qpu));
+                                s_inst = (struct qinst *) s_inst->link.next;
+                                slot = (struct qinst *) slot->link.next;
+                                slots_filled++;
+                        }
+                        branch->qpu.branch.offset +=
+                                slots_filled * sizeof(uint64_t);
+                }
         }
 }
 
@@ -1541,9 +2344,12 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         struct choose_scoreboard scoreboard;
         memset(&scoreboard, 0, sizeof(scoreboard));
         scoreboard.last_ldvary_tick = -10;
+        scoreboard.last_unifa_write_tick = -10;
         scoreboard.last_magic_sfu_write_tick = -10;
         scoreboard.last_uniforms_reset_tick = -10;
         scoreboard.last_thrsw_tick = -10;
+        scoreboard.last_branch_tick = -10;
+        scoreboard.last_setmsf_tick = -10;
         scoreboard.last_stallable_sfu_tick = -10;
 
         if (debug) {
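Several of the hazards this import handles reduce to one idiom: record the tick of the last hazardous write in the scoreboard, then refuse to schedule a dependent instruction until enough ticks have passed. The sketch below restates the unifa -> ldunifa spacing rule from `choose_instruction_to_schedule()` as standalone C; `mini_scoreboard` and `ldunifa_allowed` are illustrative stand-ins for this note, not Mesa's actual `choose_scoreboard` API.

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-in for the scheduler's scoreboard: only the two
 * fields needed to express the unifa -> ldunifa spacing rule.
 */
struct mini_scoreboard {
        int32_t tick;                  /* slot currently being scheduled */
        int32_t last_unifa_write_tick; /* tick of the most recent unifa write */
};

/* A write to unifa needs 3 delay slots before a follow-up ldunifa,
 * mirroring the "tick - last_unifa_write_tick <= 3" skip in the diff.
 */
static bool
ldunifa_allowed(const struct mini_scoreboard *sb)
{
        return sb->tick - sb->last_unifa_write_tick > 3;
}
```

The same tick-distance pattern drives the new branch and thrsw delay-slot checks above (`last_branch_tick + 3`, `last_thrsw_tick + 2`), and each tick field is initialized to -10 in `v3d_qpu_schedule_instructions()` so the first instructions of a program never trip the comparison.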