author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2015-11-22 02:46:45 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2015-11-22 02:46:45 +0000
commit    | 0784c49c0f8fcc8b3abd4c9286d9fd8bc089dd7d (patch)
tree      | a6394e3e264a0f80b57f4ce0f5d9526aa543d4b0 /lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
parent    | d91d0007eecf589ea5699e34aa4d748fce2c57b2 (diff)
import Mesa 11.0.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c')
-rw-r--r-- | lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c | 581
1 file changed, 581 insertions, 0 deletions
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
new file mode 100644
index 000000000..2ee325518
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,581 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Scheduling:
+ *
+ * A priority-queue based scheduling algo. Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list). Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation. Usages of address
+ * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
+ * if you have two pairs of instructions that write the same special
+ * register and then read it, then those pairs cannot be interleaved.
+ * To solve this, when we are in such a scheduling "critical section",
+ * and we encounter a conflicting write to a special register, we try
+ * to schedule any remaining instructions that use that value first.
+ */
+
+struct ir3_sched_ctx {
+ struct ir3_block *block; /* the current block */
+ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
+ struct ir3_instruction *addr; /* current a0.x user, if any */
+ struct ir3_instruction *pred; /* current p0.x user, if any */
+ bool error;
+};
+
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+ return is_sfu(instr) || is_mem(instr);
+}
+
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ debug_assert(ctx->block == instr->block);
+
+ /* maybe there is a better way to handle this than just stuffing
+ * a nop.. ideally we'd know about this constraint in the
+ * scheduling and depth calculation..
+ */ + if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) + ir3_NOP(ctx->block); + + /* remove from depth list: + */ + list_delinit(&instr->node); + + if (writes_addr(instr)) { + debug_assert(ctx->addr == NULL); + ctx->addr = instr; + } + + if (writes_pred(instr)) { + debug_assert(ctx->pred == NULL); + ctx->pred = instr; + } + + instr->flags |= IR3_INSTR_MARK; + + list_addtail(&instr->node, &instr->block->instr_list); + ctx->scheduled = instr; +} + +static unsigned +distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, + unsigned maxd) +{ + struct list_head *instr_list = &ctx->block->instr_list; + unsigned d = 0; + + list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) { + if ((n == instr) || (d >= maxd)) + break; + if (is_alu(n) || is_flow(n)) + d++; + } + + return d; +} + +/* calculate delay for specified src: */ +static unsigned +delay_calc_srcn(struct ir3_sched_ctx *ctx, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned srcn) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + struct ir3_instruction *src; + foreach_ssa_src(src, assigner) { + unsigned d; + if (src->block != assigner->block) + break; + d = delay_calc_srcn(ctx, src, consumer, srcn); + delay = MAX2(delay, d); + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + delay -= distance(ctx, assigner, delay); + } + + return delay; +} + +/* calculate delay for instruction (maximum of delay for all srcs): */ +static unsigned +delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + unsigned delay = 0; + struct ir3_instruction *src; + + foreach_ssa_src_n(src, i, instr) { + unsigned d; + if (src->block != instr->block) + continue; + d = delay_calc_srcn(ctx, src, instr, i); + delay = MAX2(delay, d); + } + + return delay; +} + +struct ir3_sched_notes { + /* there is at least one kill which could be scheduled, except + * for unscheduled bary.f's: + */ + bool blocked_kill; + /* there is at least one instruction that could be scheduled, + * except for conflicting address/predicate register usage: + */ + bool addr_conflict, pred_conflict; +}; + +static bool is_scheduled(struct ir3_instruction *instr) +{ + return !!(instr->flags & IR3_INSTR_MARK); +} + +static bool +check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + /* if this is a write to address/predicate register, and that + * register is currently in use, we need to defer until it is + * free: + */ + if (writes_addr(instr) && ctx->addr) { + debug_assert(ctx->addr != instr); + notes->addr_conflict = true; + return true; + } + + if (writes_pred(instr) && ctx->pred) { + debug_assert(ctx->pred != instr); + notes->pred_conflict = true; + return true; + } + + return false; +} + +/* is this instruction ready to be scheduled? Return negative for not + * ready (updating notes if needed), or >= 0 to indicate number of + * delay slots needed. + */ +static int +instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + unsigned delay = 0; + + /* Phi instructions can have a dependency on something not + * scheduled yet (for ex, loops). But OTOH we don't really + * care. 
By definition phi's should appear at the top of + * the block, and it's sources should be values from the + * previously executing block, so they are always ready to + * be scheduled: + */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + return 0; + + foreach_ssa_src(src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if (!is_scheduled(src)) + return -1; + } + + /* all our dependents are scheduled, figure out if + * we have enough delay slots to schedule ourself: + */ + delay = delay_calc(ctx, instr); + if (delay) + return delay; + + /* if the instruction is a kill, we need to ensure *every* + * bary.f is scheduled. The hw seems unhappy if the thread + * gets killed before the end-input (ei) flag is hit. + * + * We could do this by adding each bary.f instruction as + * virtual ssa src for the kill instruction. But we have + * fixed length instr->regs[]. + * + * TODO this wouldn't be quite right if we had multiple + * basic blocks, if any block was conditional. We'd need + * to schedule the bary.f's outside of any block which + * was conditional that contained a kill.. I think.. + */ + if (is_kill(instr)) { + struct ir3 *ir = instr->block->shader; + + for (unsigned i = 0; i < ir->baryfs_count; i++) { + struct ir3_instruction *baryf = ir->baryfs[i]; + if (baryf->depth == DEPTH_UNUSED) + continue; + if (!is_scheduled(baryf)) { + notes->blocked_kill = true; + return -1; + } + } + } + + if (check_conflict(ctx, notes, instr)) + return -1; + + return 0; +} + +/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + +/* move eligible instructions to the priority list: */ +static unsigned +add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct list_head *prio_queue, struct list_head *unscheduled_list) +{ + unsigned min_delay = ~0; + + list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) { + int e = instr_eligibility(ctx, notes, instr); + if (e < 0) + continue; + + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. + */ + if (unlikely(writes_addr(instr))) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + continue; + } + + min_delay = MIN2(min_delay, e); + if (e == 0) { + /* remove from unscheduled list and into priority queue: */ + list_delinit(&instr->node); + ir3_insert_by_depth(instr, prio_queue); + } + } + + return min_delay; +} + +/* "spill" the address register by remapping any unscheduled + * instructions which depend on the current address register + * to a clone of the instruction which wrote the address reg. 
+ */ +static struct ir3_instruction * +split_addr(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_addr = NULL; + unsigned i; + + debug_assert(ctx->addr); + + ir = ctx->addr->block->shader; + + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + + if (!indirect) + continue; + + /* skip instructions already scheduled: */ + if (is_scheduled(indirect)) + continue; + + /* remap remaining instructions using current addr + * to new addr: + */ + if (indirect->address == ctx->addr) { + if (!new_addr) { + new_addr = ir3_instr_clone(ctx->addr); + /* original addr is scheduled, but new one isn't: */ + new_addr->flags &= ~IR3_INSTR_MARK; + } + ir3_instr_set_address(indirect, new_addr); + } + } + + /* all remaining indirects remapped to new addr: */ + ctx->addr = NULL; + + return new_addr; +} + +/* "spill" the predicate register by remapping any unscheduled + * instructions which depend on the current predicate register + * to a clone of the instruction which wrote the address reg. + */ +static struct ir3_instruction * +split_pred(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_pred = NULL; + unsigned i; + + debug_assert(ctx->pred); + + ir = ctx->pred->block->shader; + + for (i = 0; i < ir->predicates_count; i++) { + struct ir3_instruction *predicated = ir->predicates[i]; + + /* skip instructions already scheduled: */ + if (is_scheduled(predicated)) + continue; + + /* remap remaining instructions using current pred + * to new pred: + * + * TODO is there ever a case when pred isn't first + * (and only) src? + */ + if (ssa(predicated->regs[1]) == ctx->pred) { + if (!new_pred) { + new_pred = ir3_instr_clone(ctx->pred); + /* original pred is scheduled, but new one isn't: */ + new_pred->flags &= ~IR3_INSTR_MARK; + } + predicated->regs[1]->instr = new_pred; + } + } + + /* all remaining predicated remapped to new pred: */ + ctx->pred = NULL; + + return new_pred; +} + +static void +sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + struct list_head unscheduled_list, prio_queue; + + ctx->block = block; + + /* move all instructions to the unscheduled list, and + * empty the block's instruction list (to which we will + * be inserting. + */ + list_replace(&block->instr_list, &unscheduled_list); + list_inithead(&block->instr_list); + list_inithead(&prio_queue); + + /* first a pre-pass to schedule all meta:input/phi instructions + * (which need to appear first so that RA knows the register is + * occupied: + */ + list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { + if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) || + (instr->opc == OPC_META_PHI))) + schedule(ctx, instr); + } + + while (!(list_empty(&unscheduled_list) && + list_empty(&prio_queue))) { + struct ir3_sched_notes notes = {0}; + unsigned delay; + + delay = add_eligible_instrs(ctx, ¬es, &prio_queue, &unscheduled_list); + + if (!list_empty(&prio_queue)) { + struct ir3_instruction *instr = list_last_entry(&prio_queue, + struct ir3_instruction, node); + /* ugg, this is a bit ugly, but between the time when + * the instruction became eligible and now, a new + * conflict may have arose.. + */ + if (check_conflict(ctx, ¬es, instr)) { + list_del(&instr->node); + list_addtail(&instr->node, &unscheduled_list); + continue; + } + + schedule(ctx, instr); + } else if (delay == ~0) { + struct ir3_instruction *new_instr = NULL; + + /* nothing available to schedule.. 
if we are blocked on + * address/predicate register conflict, then break the + * deadlock by cloning the instruction that wrote that + * reg: + */ + if (notes.addr_conflict) { + new_instr = split_addr(ctx); + } else if (notes.pred_conflict) { + new_instr = split_pred(ctx); + } else { + debug_assert(0); + ctx->error = true; + return; + } + + if (new_instr) { + list_del(&new_instr->node); + list_addtail(&new_instr->node, &unscheduled_list); + /* the original instr that wrote addr/pred may have + * originated from a different block: + */ + new_instr->block = block; + } + + } else { + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } + } + } + + /* And lastly, insert branch/jump instructions to take us to + * the next block. Later we'll strip back out the branches + * that simply jump to next instruction. + */ + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + unsigned delay = 6; + + debug_assert(ctx->pred); + debug_assert(block->condition); + + delay -= distance(ctx, ctx->pred, delay); + + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_BR(block); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* NOTE: we have to hard code delay of 6 above, since + * we want to insert the nop's before constructing the + * branch. Throw in an assert so we notice if this + * ever breaks on future generation: + */ + debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); + + br = ir3_BR(block); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + + /* NOTE: if we kept track of the predecessors, we could do a better + * job w/ (jp) flags.. every node w/ > predecessor is a join point. + * Note that as we eliminate blocks which contain only an unconditional + * jump we probably need to propagate (jp) flag.. + */ +} + +/* this is needed to ensure later RA stage succeeds: */ +static void +sched_insert_parallel_copies(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } + } + } +} + +int ir3_sched(struct ir3 *ir) +{ + struct ir3_sched_ctx ctx = {0}; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_insert_parallel_copies(block); + } + ir3_clear_mark(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } + if (ctx.error) + return -1; + return 0; +} |
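
[Editor's illustration; not part of the imported file.] The header comment of ir3_sched.c describes the core loop: keep a depth-sorted list of instructions whose dependencies are all scheduled, repeatedly pop the deepest one, and pad with nop's when a consumer would otherwise issue before its producer's delay slots have elapsed. The standalone sketch below models that loop in plain C under simplifying assumptions (a single block, a fixed 3-slot latency, no special-register handling); every type and name in it is invented for illustration and is not Mesa's ir3 API.

/* Editor's sketch: minimal list scheduler in the style described by the
 * ir3_sched.c header comment. All names are illustrative, not ir3's. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_SRCS 2

struct instr {
	const char *name;
	int depth;                /* priority: distance to the shader's end */
	int delay;                /* delay slots consumers of this value need */
	int nsrcs;
	struct instr *srcs[MAX_SRCS];
	bool scheduled;
	int cycle;                /* cycle at which it was issued */
};

/* eligible once every source has been scheduled: */
static bool ready(const struct instr *in)
{
	for (int i = 0; i < in->nsrcs; i++)
		if (!in->srcs[i]->scheduled)
			return false;
	return true;
}

/* earliest issue cycle honoring each source's delay slots: */
static int earliest_cycle(const struct instr *in)
{
	int cycle = 0;
	for (int i = 0; i < in->nsrcs; i++) {
		int avail = in->srcs[i]->cycle + in->srcs[i]->delay + 1;
		if (avail > cycle)
			cycle = avail;
	}
	return cycle;
}

static void schedule_all(struct instr **list, int n)
{
	int cycle = 0;
	for (int done = 0; done < n; done++) {
		/* pop the deepest ready instruction (the real scheduler keeps
		 * a depth-sorted list instead of rescanning): */
		struct instr *best = NULL;
		for (int i = 0; i < n; i++) {
			struct instr *in = list[i];
			if (in->scheduled || !ready(in))
				continue;
			if (!best || in->depth > best->depth)
				best = in;
		}
		if (!best)
			break;  /* would mean a cycle in the DAG */
		/* pad with nop's until the winner's sources are available: */
		while (cycle < earliest_cycle(best))
			printf("%4d: nop\n", cycle++);
		best->scheduled = true;
		best->cycle = cycle;
		printf("%4d: %s\n", cycle++, best->name);
	}
}

int main(void)
{
	struct instr a = { .name = "mov a", .depth = 3, .delay = 3 };
	struct instr b = { .name = "mov b", .depth = 3, .delay = 3 };
	struct instr c = { .name = "add c, a, b", .depth = 2, .delay = 3,
	                   .nsrcs = 2, .srcs = { &a, &b } };
	struct instr *prog[] = { &c, &a, &b };
	schedule_all(prog, 3);
	return 0;
}

With the assumed 3-slot latency this prints the two movs at cycles 0 and 1, three nop's, and the add at cycle 5, mirroring how sched_block() emits nop's when delay_calc() reports outstanding delay slots.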
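
A second editor's sketch, again invented rather than ir3 code, models the deadlock-breaking trick in split_addr()/split_pred() above: when every schedulable instruction conflicts on a0.x, clone the original address writer and remap the still-unscheduled readers to the clone, so the two write/read pairs no longer have to interleave.

/* Editor's sketch: breaking an a0.x scheduling deadlock by cloning the
 * address writer, in the spirit of split_addr(). Not ir3 code. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_instr {
	const char *name;
	struct toy_instr *address; /* the a0.x writer this instr reads */
	bool scheduled;
};

/* clone 'addr' and remap every still-unscheduled reader to the clone;
 * returns the clone so the caller can queue it for scheduling: */
static struct toy_instr *
toy_split_addr(struct toy_instr *addr, struct toy_instr **users, size_t n)
{
	struct toy_instr *clone = NULL;
	for (size_t i = 0; i < n; i++) {
		struct toy_instr *u = users[i];
		if (u->scheduled || u->address != addr)
			continue;
		if (!clone) {
			clone = malloc(sizeof(*clone));
			memcpy(clone, addr, sizeof(*clone));
			clone->scheduled = false; /* original is scheduled, clone isn't */
		}
		u->address = clone;
	}
	return clone;
}

int main(void)
{
	struct toy_instr a0 = { .name = "mova",  .scheduled = true };
	struct toy_instr u1 = { .name = "load1", .address = &a0, .scheduled = true };
	struct toy_instr u2 = { .name = "load2", .address = &a0 };
	struct toy_instr *users[] = { &u1, &u2 };

	struct toy_instr *clone = toy_split_addr(&a0, users, 2);
	printf("%s now reads %s\n", u2.name,
	       u2.address == clone ? "the clone" : "the original");
	free(clone);
	return 0;
}

As in the real pass, only readers that have not yet been scheduled are remapped, and the clone starts unmarked so it re-enters the pool of schedulable instructions.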