diff options
Diffstat (limited to 'lib/mesa/src/gallium/drivers/r300/compiler')
24 files changed, 3281 insertions, 639 deletions
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c new file mode 100644 index 000000000..0fda015ce --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c @@ -0,0 +1,2519 @@ +/* + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_deref.h"
+#include "compiler/nir/nir_legacy.h"
+#include "compiler/nir/nir_worklist.h"
+#include "nir_to_rc.h"
+#include "r300_nir.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_util.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_dynarray.h"
+/* One buffered TGSI instruction: an opcode, up to two destinations and up to
+ * four sources, plus the texture and memory fields. Instructions are appended
+ * to the current block's list by ntr_insn().
+ */
+struct ntr_insn {
+   enum tgsi_opcode opcode;
+   struct ureg_dst dst[2];
+   struct ureg_src src[4];
+   enum tgsi_texture_type tex_target;
+   enum tgsi_return_type tex_return_type;
+   struct tgsi_texture_offset tex_offset[4];
+
+   unsigned mem_qualifier;
+   enum pipe_format mem_format;
+
+   bool is_tex : 1;
+   bool precise : 1;
+};
+/* The instructions buffered for one NIR block, together with the instruction
+ * indices (start_ip/end_ip) assigned to it by ntr_live_reg_setup_def_use().
+ */
+struct ntr_block {
+   /* Array of struct ntr_insn */
+   struct util_dynarray insns;
+   int start_ip;
+   int end_ip;
+};
+/* Live interval of one temp, as first/last instruction index. An interval
+ * with start > end marks a temp that is never referenced.
+ */
+struct ntr_reg_interval {
+   uint32_t start, end;
+};
+/* State carried through the translation of one shader. */
+struct ntr_compile {
+   nir_shader *s;
+   nir_function_impl *impl;
+   const struct nir_to_rc_options *options;
+   struct pipe_screen *screen;
+   struct ureg_program *ureg;
+
+   bool addr_declared[3];
+   struct ureg_dst addr_reg[3];
+
+   /* if condition set up at the end of a block, for ntr_emit_if(). */
+   struct ureg_src if_cond;
+
+   /* TGSI temps for our NIR SSA and register values. */
+   struct ureg_dst *reg_temp;
+   struct ureg_src *ssa_temp;
+
+   struct ntr_reg_interval *liveness;
+
+   /* Map from nir_block to ntr_block */
+   struct hash_table *blocks;
+   struct ntr_block *cur_block;
+   unsigned current_if_else;
+   unsigned cf_label;
+
+   /* Whether we're currently emitting instructions for a precise NIR instruction. */
+   bool precise;
+
+   unsigned num_temps;
+   unsigned first_non_array_temp;
+
+   /* Mappings from driver_location to TGSI input/output number.
+    *
+    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
+    * their numbers assigned incrementally, unlike inputs or constants.
+    */
+   struct ureg_src *input_index_map;
+   uint64_t centroid_inputs;
+
+   uint32_t first_ubo;
+};
+/* Hands out the next virtual temporary: a TGSI_FILE_TEMPORARY destination
+ * whose index is the current c->num_temps, which is then incremented.
+ */
+static struct ureg_dst
+ntr_temp(struct ntr_compile *c)
+{
+   return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
+}
+/* Looks up the ntr_block recorded for a nir_block in c->blocks. The hash
+ * entry is dereferenced without a NULL check, so the block must already be
+ * present in the table.
+ */
+static struct ntr_block *
+ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
+{
+   struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
+   return entry->data;
+}
+/* Forward declarations of the control-flow emitters. */
+static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
+static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);
+/* Appends one instruction to the block currently being emitted
+ * (c->cur_block) and returns a pointer to the stored copy.
+ *
+ * Only dst[0] is supplied by the caller; dst[1] is set to undef. The precise
+ * flag is copied from c->precise, and every field not named in the
+ * initializer below starts out zeroed.
+ */
+static struct ntr_insn *
+ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode,
+         struct ureg_dst dst,
+         struct ureg_src src0, struct ureg_src src1,
+         struct ureg_src src2, struct ureg_src src3)
+{
+   struct ntr_insn insn = {
+      .opcode = opcode,
+      .dst = { dst, ureg_dst_undef() },
+      .src = { src0, src1, src2, src3 },
+      .precise = c->precise,
+   };
+   util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn);
+   return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn);
+}
+/* OPxy(op) defines ntr_<op>(), a thin wrapper around ntr_insn() that takes x
+ * destinations and y sources. Operands the opcode does not have are passed
+ * as undef.
+ */
+#define OP00( op ) \
+static inline void ntr_##op(struct ntr_compile *c) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP01( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_src src0) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+
+#define OP10( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_dst dst) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP11( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_dst dst, \
+                            struct ureg_src src0) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP12( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_dst dst, \
+                            struct ureg_src src0, \
+                            struct ureg_src src1) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP13( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_dst dst, \
+                            struct ureg_src src0, \
+                            struct ureg_src src1, \
+                            struct ureg_src src2) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
+}
+
+#define OP14( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+                            struct ureg_dst dst, \
+                            struct ureg_src src0, \
+                            struct ureg_src src1, \
+                            struct ureg_src src2, \
+                            struct ureg_src src3) \
+{ \
+   ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
+}
+
+/* We hand-craft our tex instructions */
+#define OP12_TEX(op)
+#define OP14_TEX(op)
+
+/* Use a template include to generate a correctly-typed ntr_OP()
+ * function for each TGSI opcode:
+ */
+#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
+
+/**
+ * Interprets a nir_load_const used as a NIR src as a uint.
+ *
+ * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
+ * instruction (or in a phi-web used by an integer ALU instruction) were
+ * converted to floats and the ALU instruction swapped to the float equivalent.
+ * However, this means that integer load_consts used by intrinsics (which don't
+ * normally get that conversion) may have been reformatted to be floats. Given
+ * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
+ * we can just look and see if they look like floats and convert them back to
+ * ints.
+ */
+static uint32_t
+ntr_src_as_uint(struct ntr_compile *c, nir_src src)
+{
+   uint32_t val = nir_src_as_uint(src);
+   if (val >= fui(1.0)) /* bits at or above those of 1.0f: treat them as a float */
+      val = (uint32_t)uif(val);
+   return val;
+}
+
+/* Per-channel masks of def/use within the block, and the per-channel
+ * livein/liveout for the block as a whole.
+ */
+struct ntr_live_reg_block_state {
+   uint8_t *def, *use, *livein, *liveout, *defin, *defout;
+};
+/* Scratch state for ntr_live_regs(). Of these fields, only blocks and
+ * worklist are referenced by the liveness functions below.
+ */
+struct ntr_live_reg_state {
+   unsigned bitset_words;
+
+   struct ntr_reg_interval *regs;
+
+   /* Used in propagate_across_edge() */
+   BITSET_WORD *tmp_live;
+
+   struct ntr_live_reg_block_state *blocks;
+
+   nir_block_worklist worklist;
+};
+/* Records a read of temp `index` at instruction `ip`: the channels in
+ * used_mask that have no earlier def in this block are added to use[], and
+ * the temp's live interval is widened to include ip.
+ */
+static void
+ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs,
+                      int ip, unsigned index, unsigned used_mask)
+{
+   bs->use[index] |= used_mask & ~bs->def[index];
+
+   c->liveness[index].start = MIN2(c->liveness[index].start, ip);
+   c->liveness[index].end = MAX2(c->liveness[index].end, ip);
+
+}
+static void
+ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl, struct ntr_live_reg_state *state)
+{ /* Allocates the six per-block channel-mask arrays (one byte per temp, with
+   * state->blocks as their ralloc parent), then walks every buffered
+   * instruction in block order, numbering instructions with `ip` and filling
+   * use[]/def[]/defout[] plus the initial live intervals in c->liveness.
+   */
+   for (int i = 0; i < impl->num_blocks; i++) {
+      state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+      state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+      state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+      state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+      state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+      state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+   }
+
+   int ip = 0;
+   nir_foreach_block(block, impl) {
+      struct ntr_live_reg_block_state *bs = &state->blocks[block->index];
+      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+
+      ntr_block->start_ip = ip;
+
+      util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
+         const struct tgsi_opcode_info *opcode_info =
+            tgsi_get_opcode_info(insn->opcode);
+
+         /* Set up use[] for the srcs.
+          *
+          * Uses are the channels of the reg read in the block that don't have a
+          * preceding def to screen them off.  Note that we don't do per-element
+          * tracking of array regs, so they're never screened off.
+          */
+         for (int i = 0; i < opcode_info->num_src; i++) {
+            if (insn->src[i].File != TGSI_FILE_TEMPORARY)
+               continue;
+            int index = insn->src[i].Index;
+
+            uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
+                                                              insn->dst->WriteMask,
+                                                              insn->src[i].SwizzleX,
+                                                              insn->src[i].SwizzleY,
+                                                              insn->src[i].SwizzleZ,
+                                                              insn->src[i].SwizzleW,
+                                                              insn->tex_target,
+                                                              insn->tex_target);
+
+            assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
+            ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
+         }
+
+         if (insn->is_tex) {
+            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+               if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
+                  ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
+            }
+         }
+
+         /* Set up def[] for the dsts.
+          *
+          * Defs are the unconditionally-written (not R/M/W) channels of the reg in
+          * the block that don't have a preceding use.
+          */
+         for (int i = 0; i < opcode_info->num_dst; i++) {
+            if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
+               continue;
+            int index = insn->dst[i].Index;
+            uint32_t writemask = insn->dst[i].WriteMask;
+
+            bs->def[index] |= writemask & ~bs->use[index];
+            bs->defout[index] |= writemask;
+
+            assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
+            c->liveness[index].start = MIN2(c->liveness[index].start, ip);
+            c->liveness[index].end = MAX2(c->liveness[index].end, ip);
+         }
+         ip++;
+      }
+
+      ntr_block->end_ip = ip;
+   }
+}
+/* Computes c->liveness[] (one interval per temp) in three steps: local
+ * def/use collection, a forward pass pushing defin/defout to successors, and
+ * a backward livein/liveout pass that stretches intervals to block
+ * boundaries where a value is live across them.
+ */
+static void
+ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
+{
+   nir_metadata_require(impl, nir_metadata_block_index);
+
+   c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);
+
+   struct ntr_live_reg_state state = {
+      .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
+   };
+
+   /* The intervals start out with start > end (indicating unused) */
+   for (int i = 0; i < c->num_temps; i++)
+      c->liveness[i].start = ~0;
+
+   ntr_live_reg_setup_def_use(c, impl, &state);
+
+   /* Make a forward-order worklist of all the blocks. */
+   nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
+   nir_foreach_block(block, impl) {
+      nir_block_worklist_push_tail(&state.worklist, block);
+   }
+
+   /* Propagate defin/defout down the CFG to calculate the live variables
+    * potentially defined along any possible control flow path.  We'll use this
+    * to keep things like conditional defs of the reg (or array regs where we
+    * don't track defs!) from making the reg's live range extend back to the
+    * start of the program.
+    */
+   while (!nir_block_worklist_is_empty(&state.worklist)) {
+      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
+      for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
+         nir_block *succ = block->successors[j];
+         if (!succ || succ->index == impl->num_blocks)
+            continue;
+
+         for (int i = 0; i < c->num_temps; i++) {
+            uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
+
+            if (new_def) {
+               state.blocks[succ->index].defin[i] |= new_def;
+               state.blocks[succ->index].defout[i] |= new_def;
+               nir_block_worklist_push_tail(&state.worklist, succ);
+            }
+         }
+      }
+   }
+
+   /* Make a reverse-order worklist of all the blocks. */
+   nir_foreach_block(block, impl) {
+      nir_block_worklist_push_head(&state.worklist, block);
+   }
+
+   /* We're now ready to work through the worklist and update the liveness sets
+    * of each of the blocks.  As long as we keep the worklist up-to-date as we
+    * go, everything will get covered.
+    */
+   while (!nir_block_worklist_is_empty(&state.worklist)) {
+      /* We pop them off in the reverse order we pushed them on.  This way
+       * the first walk of the instructions is backwards so we only walk
+       * once in the case of no control flow.
+       */
+      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
+      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+      struct ntr_live_reg_block_state *bs = &state.blocks[block->index];
+
+      for (int i = 0; i < c->num_temps; i++) {
+         /* Collect livein from our successors to include in our liveout. */
+         for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
+            nir_block *succ = block->successors[j];
+            if (!succ || succ->index == impl->num_blocks)
+               continue;
+            struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index];
+
+            uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
+            if (new_liveout) {
+               if (state.blocks[block->index].defout[i])
+                  c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip);
+               bs->liveout[i] |= sbs->livein[i];
+            }
+         }
+
+         /* Propagate use requests from either our block's uses or our
+          * non-screened-off liveout up to our predecessors.
+          */
+         uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
+                               ~bs->livein[i]);
+         if (new_livein) {
+            bs->livein[i] |= new_livein;
+            set_foreach(block->predecessors, entry) {
+               nir_block *pred = (void *)entry->key;
+               nir_block_worklist_push_tail(&state.worklist, pred);
+            }
+
+            if (new_livein & state.blocks[block->index].defin[i])
+               c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip);
+         }
+      }
+   }
+
+   ralloc_free(state.blocks);
+   nir_block_worklist_fini(&state.worklist);
+}
+/* Allocation step for temp `index` at instruction `ip`. Array temps (below
+ * first_non_array_temp) are skipped. A temp whose interval starts at ip and
+ * that has no mapping yet (ra_map entry is ~0) gets a fresh ureg temporary;
+ * a temp whose interval ends at ip is released, at most once.
+ */
+static void
+ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
+{
+   if (index < c->first_non_array_temp)
+      return;
+
+   if (c->liveness[index].start == ip && ra_map[index] == ~0)
+      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
+
+   if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
+      ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
+      BITSET_SET(released, index);
+   }
+}
+/* Runs the liveness analysis, then walks the buffered instructions in the
+ * same order and rewrites every TGSI_FILE_TEMPORARY src, tex_offset and dst
+ * index through ra_map. All temps are also checked at the start and end of
+ * each block, where the liveness pass may have placed interval endpoints.
+ */
+static void
+ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl)
+{
+   ntr_live_regs(c, impl);
+
+   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
+   unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
+
+   /* No RA on NIR array regs */
+   for (int i = 0; i < c->first_non_array_temp; i++)
+      ra_map[i] = i;
+
+   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
+      ra_map[i] = ~0;
+
+   int ip = 0;
+   nir_foreach_block(block, impl) {
+      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+
+      for (int i = 0; i < c->num_temps; i++)
+         ntr_ra_check(c, ra_map, released, ip, i);
+
+      util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
+         const struct tgsi_opcode_info *opcode_info =
+            tgsi_get_opcode_info(insn->opcode);
+
+         for (int i = 0; i < opcode_info->num_src; i++) {
+            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
+               ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index);
+               insn->src[i].Index = ra_map[insn->src[i].Index];
+            }
+         }
+
+         if (insn->is_tex) {
+            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+               if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
+                  ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
+                  insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
+               }
+            }
+         }
+
+         for (int i = 0; i < opcode_info->num_dst; i++) {
+            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
+               ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
+               insn->dst[i].Index = ra_map[insn->dst[i].Index];
+            }
+         }
+         ip++;
+      }
+
+      for (int i = 0; i < c->num_temps; i++)
+         ntr_ra_check(c, ra_map, released, ip, i);
+   }
+}
+/* Fallback without liveness analysis: declares one ureg temporary per
+ * non-array virtual temp and leaves instruction indices untouched. `impl` is
+ * not used.
+ */
+static void
+ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
+{
+   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
+      ureg_DECL_temporary(c->ureg);
+}
+
+/* TGSI varying declarations have a component usage mask associated (used by
+ * r600 and svga).
+ */
+static uint32_t
+ntr_tgsi_var_usage_mask(const struct nir_variable *var)
+{
+   const struct glsl_type *type_without_array =
+      glsl_without_array(var->type);
+   unsigned num_components = glsl_get_vector_elements(type_without_array);
+   if (num_components == 0) /* structs */
+      num_components = 4;
+
+   return u_bit_consecutive(var->data.location_frac, num_components);
+}
+
+/* Declares the TGSI output accessed by the I/O intrinsic `instr` and returns
+ * it with a writemask covering the channels the intrinsic touches.
+ *
+ * *frac receives the first channel: the intrinsic's component, overridden to
+ * 2 (.z) for FRAG_RESULT_DEPTH and to 1 (.y) for FRAG_RESULT_STENCIL.
+ *
+ * Fragment outputs are declared by semantic only; every other stage uses the
+ * full layout declaration (base, usage mask, GS streams, slot count,
+ * invariant flag).
+ */
+static struct ureg_dst
+ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
+{
+   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+   int base = nir_intrinsic_base(instr);
+   *frac = nir_intrinsic_component(instr);
+
+   struct ureg_dst out;
+   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(semantics.location,
+                                       &semantic_name, &semantic_index);
+      semantic_index += semantics.dual_source_blend_index;
+
+      switch (semantics.location) {
+      case FRAG_RESULT_DEPTH:
+         *frac = 2; /* the depth write goes to the .z channel in TGSI */
+         break;
+      case FRAG_RESULT_STENCIL:
+         *frac = 1;
+         break;
+      default:
+         break;
+      }
+
+      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+   } else {
+      unsigned semantic_name, semantic_index;
+
+      tgsi_get_gl_varying_semantic(semantics.location, true,
+                                   &semantic_name, &semantic_index);
+
+      /* Keep only the 2-bit stream selectors of the channels actually used. */
+      uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
+      uint32_t gs_streams = semantics.gs_streams;
+      for (int i = 0; i < 4; i++) {
+         if (!(usage_mask & (1 << i)))
+            gs_streams &= ~(0x3 << 2 * i);
+      }
+
+      /* No driver appears to use array_id of outputs. */
+      unsigned array_id = 0;
+
+      /* Pass the invariant flag from the I/O semantics through to the
+       * declaration.
+       */
+      bool invariant = semantics.invariant;
+
+      out = ureg_DECL_output_layout(c->ureg,
+                                    semantic_name, semantic_index,
+                                    gs_streams,
+                                    base,
+                                    usage_mask,
+                                    array_id,
+                                    semantics.num_slots,
+                                    invariant);
+   }
+
+   /* Build the mask relative to the first accessed component, then move it to
+    * its absolute channel position exactly once.  The fallback used to be
+    * pre-shifted by *frac and then shifted again below, which selected the
+    * wrong (or no) channels whenever *frac was non-zero.
+    */
+   unsigned write_mask;
+   if (nir_intrinsic_has_write_mask(instr))
+      write_mask = nir_intrinsic_write_mask(instr);
+   else
+      write_mask = (1 << instr->num_components) - 1;
+
+   write_mask = write_mask << *frac;
+   return ureg_writemask(out, write_mask);
+}
+
+/* Checks whether `src` is a use by a store_output intrinsic with a constant
+ * offset (src[1]); if so, declares that output and points *dst at it, with
+ * Index advanced by the constant offset.
+ *
+ * Returns true only when the store also starts at channel 0.  *dst is left
+ * undef when the use is an if-condition, not an intrinsic, not a
+ * store_output, or has a non-constant offset.  Note that for a store starting
+ * at a non-zero channel the output has already been declared and *dst set
+ * when false is returned; both callers overwrite *dst in that case.
+ */
+static bool
+ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c,
+                                      struct ureg_dst *dst,
+                                      nir_src *src)
+{
+   *dst = ureg_dst_undef();
+
+   if (nir_src_is_if(src))
+      return false;
+
+   if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
+   if (intr->intrinsic != nir_intrinsic_store_output ||
+       !nir_src_is_const(intr->src[1])) {
+      return false;
+   }
+
+   uint32_t frac;
+   *dst = ntr_output_decl(c, intr, &frac);
+   dst->Index += ntr_src_as_uint(c, intr->src[1]);
+
+   return frac == 0;
+}
+
+/* If this reg is used only for storing an output, then in the simple
+ * cases we can write directly to the TGSI output instead of having
+ * store_output emit its own MOV.
+ */
+static bool
+ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
+                                 nir_intrinsic_instr *reg_decl)
+{
+   nir_src *only_use = NULL;
+
+   assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
+   *dst = ureg_dst_undef();
+
+   /* Walk every use of every load of the register; finding a second use of
+    * any kind disqualifies the register.
+    */
+   nir_foreach_reg_load(load_src, reg_decl) {
+      nir_intrinsic_instr *load_instr =
+         nir_instr_as_intrinsic(nir_src_parent_instr(load_src));
+
+      nir_foreach_use_including_if(candidate, &load_instr->def) {
+         if (only_use)
+            return false;
+         only_use = candidate;
+      }
+   }
+
+   /* A register that is never read can't be an output store either. */
+   if (!only_use)
+      return false;
+
+   return ntr_try_store_in_tgsi_output_with_use(c, dst, only_use);
+}
+
+/* SSA flavour of the above: when the def has exactly one use, try to make
+ * that use's store_output target the storage for the def, so no separate MOV
+ * is needed.
+ */
+static bool
+ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
+                                 nir_def *def)
+{
+   *dst = ureg_dst_undef();
+
+   if (list_is_singular(&def->uses)) {
+      /* The loop body runs for the single use and returns immediately. */
+      nir_foreach_use_including_if(only_use, def) {
+         return ntr_try_store_in_tgsi_output_with_use(c, dst, only_use);
+      }
+      unreachable("We have one use");
+   }
+
+   return false;
+}
+
+/* Declares the fragment shader inputs and fills c->input_index_map, which maps
+ * each driver_location slot to the TGSI source declared for it.  Does nothing
+ * for other shader stages.
+ */
+static void
+ntr_setup_inputs(struct ntr_compile *c)
+{
+   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   int array_count = 0;
+   unsigned map_size = 0;
+
+   /* First pass: find one past the highest slot so the map can be sized. */
+   nir_foreach_shader_in_variable(var, c->s) {
+      unsigned slots = glsl_count_attribute_slots(var->type, false);
+
+      map_size = MAX2(map_size, var->data.driver_location + slots);
+   }
+
+   c->input_index_map = ralloc_array(c, struct ureg_src, map_size);
+
+   /* Second pass: declare each input and record it for all of its slots. */
+   nir_foreach_shader_in_variable(var, c->s) {
+      const struct glsl_type *var_type = var->type;
+      unsigned slots = glsl_count_attribute_slots(var_type, false);
+
+      /* The stage was already checked on entry, so the interpolation mode
+       * always comes from the variable; the position input is forced to
+       * linear.
+       */
+      unsigned interp_mode =
+         tgsi_get_interp_mode(var->data.interpolation,
+                              var->data.location == VARYING_SLOT_COL0 ||
+                              var->data.location == VARYING_SLOT_COL1);
+      if (var->data.location == VARYING_SLOT_POS)
+         interp_mode = TGSI_INTERPOLATE_LINEAR;
+
+      unsigned sem_name, sem_index;
+      tgsi_get_gl_varying_semantic(var->data.location, true,
+                                   &sem_name, &sem_index);
+
+      unsigned interp_loc;
+      if (var->data.sample) {
+         interp_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
+      } else if (var->data.centroid) {
+         interp_loc = TGSI_INTERPOLATE_LOC_CENTROID;
+         c->centroid_inputs |= (BITSET_MASK(slots) <<
+                                var->data.driver_location);
+      } else {
+         interp_loc = TGSI_INTERPOLATE_LOC_CENTER;
+      }
+
+      /* Array inputs are numbered from 1; non-arrays use array id 0. */
+      unsigned array_id = glsl_type_is_array(var_type) ? ++array_count : 0;
+      uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);
+
+      struct ureg_src input =
+         ureg_DECL_fs_input_centroid_layout(c->ureg,
+                                            sem_name,
+                                            sem_index,
+                                            interp_mode,
+                                            interp_loc,
+                                            var->data.driver_location,
+                                            usage_mask,
+                                            array_id, slots);
+
+      if (sem_name == TGSI_SEMANTIC_FACE) {
+         /* The TGSI docs describe a floating point FACE as positive for front
+          * faces and negative for back faces, but GLSL-to-TGSI used a MOV_SAT
+          * to turn it into 0.0 vs 1.0.  Keep doing that: some drivers (r300)
+          * report a 0.0 vs 1.0 back face, and no known driver reports a
+          * non-1.0 front face.
+          */
+         struct ureg_dst face = ntr_temp(c);
+
+         face.Saturate = true;
+         ntr_MOV(c, face, input);
+         input = ureg_src(face);
+      }
+
+      for (unsigned slot = 0; slot < slots; slot++) {
+         struct ureg_src *entry =
+            &c->input_index_map[var->data.driver_location + slot];
+
+         *entry = input;
+         entry->Index += slot;
+      }
+   }
+}
+
+/* Comparison callback ordering variables by ascending data.location. */
+static int
+ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
+{
+   int loc_a = a->data.location;
+   int loc_b = b->data.location;
+
+   return loc_a - loc_b;
+}
+
+/**
+ * Workaround for virglrenderer requiring that TGSI FS output color variables
+ * are declared in order.
Besides, it's a lot nicer to read the TGSI this way.
+ */
+static void
+ntr_setup_outputs(struct ntr_compile *c)
+{
+   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);
+
+   nir_foreach_shader_out_variable(var, c->s) {
+      if (var->data.location == FRAG_RESULT_COLOR)
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
+
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(var->data.location,
+                                       &semantic_name, &semantic_index);
+
+      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+   }
+}
+/* Maps a GLSL sampler dimension plus the array/shadow flags to a TGSI
+ * texture target.  EXTERNAL is handled like 2D; 3D and BUF ignore both
+ * flags, RECT ignores is_array and MS ignores is_shadow.
+ */
+static enum tgsi_texture_type
+tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
+      else
+         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
+      else
+         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
+   case GLSL_SAMPLER_DIM_3D:
+      return TGSI_TEXTURE_3D;
+   case GLSL_SAMPLER_DIM_CUBE:
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
+      else
+         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
+   case GLSL_SAMPLER_DIM_RECT:
+      if (is_shadow)
+         return TGSI_TEXTURE_SHADOWRECT;
+      else
+         return TGSI_TEXTURE_RECT;
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
+   case GLSL_SAMPLER_DIM_BUF:
+      return TGSI_TEXTURE_BUFFER;
+   default:
+      unreachable("unknown sampler dim");
+   }
+}
+/* Maps the INT/UINT/FLOAT GLSL base types to the TGSI sampler return type;
+ * any other base type is treated as unreachable.
+ */
+static enum tgsi_return_type
+tgsi_return_type_from_base_type(enum glsl_base_type type)
+{
+   switch (type) {
+   case GLSL_TYPE_INT:
+      return TGSI_RETURN_TYPE_SINT;
+   case GLSL_TYPE_UINT:
+      return TGSI_RETURN_TYPE_UINT;
+   case GLSL_TYPE_FLOAT:
+      return TGSI_RETURN_TYPE_FLOAT;
+   default:
+      unreachable("unexpected texture type");
+   }
+}
+/* Declares a sampler view and a sampler for every binding of each
+ * sampler/texture uniform, then declares a 2D constant range for every UBO
+ * variable with a non-zero explicit size.  c->first_ubo ends up as the lowest
+ * UBO index seen, not counting index 0 when first_ubo_is_default_ubo is set
+ * (~0 if there is none).
+ */
+static void
+ntr_setup_uniforms(struct ntr_compile *c)
+{
+   nir_foreach_uniform_variable(var, c->s) {
+      if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
+          glsl_type_is_texture(glsl_without_array(var->type))) {
+         /* Don't use this size for the check for samplers -- arrays of structs
+          * containing samplers should be ignored, and just the separate lowered
+          * sampler uniform decl used.
+          */
+         int size = glsl_type_get_sampler_count(var->type) +
+                    glsl_type_get_texture_count(var->type);
+
+         const struct glsl_type *stype = glsl_without_array(var->type);
+         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
+                                                                            glsl_sampler_type_is_array(stype),
+                                                                            glsl_sampler_type_is_shadow(stype));
+         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
+         for (int i = 0; i < size; i++) {
+            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
+               target, ret_type, ret_type, ret_type, ret_type);
+            ureg_DECL_sampler(c->ureg, var->data.binding + i);
+         }
+
+         /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
+          * size declaration happens with other UBOs below.
+          */
+      }
+   }
+
+   c->first_ubo = ~0;
+
+   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
+   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
+      int ubo = var->data.driver_location;
+      if (ubo == -1)
+         continue;
+
+      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
+         c->first_ubo = MIN2(c->first_ubo, ubo);
+
+      unsigned size = glsl_get_explicit_size(var->interface_type, false);
+      ubo_sizes[ubo] = size; /* NOTE(review): ubo is not range-checked against PIPE_MAX_CONSTANT_BUFFERS - confirm callers guarantee it */
+   }
+
+   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
+      if (ubo_sizes[i])
+         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
+   }
+}
+/* Assigns a TGSI destination to every NIR register declaration.  Array
+ * registers are declared first as ureg array temporaries, so they occupy the
+ * temp indices below first_non_array_temp; the remaining registers get either
+ * a direct alias to a TGSI output or a virtual temp from ntr_temp().
+ */
+static void
+ntr_setup_registers(struct ntr_compile *c)
+{
+   assert(c->num_temps == 0);
+
+   nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
+      /* Permanently allocate all the array regs at the start. */
+      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
+      unsigned index = nir_reg->def.index;
+
+      if (num_array_elems != 0) {
+         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
+         c->reg_temp[index] = decl;
+         assert(c->num_temps == decl.Index);
+         c->num_temps += num_array_elems;
+      }
+   }
+   c->first_non_array_temp = c->num_temps;
+
+   /* After that, allocate non-array regs in our virtual space that we'll
+    * register-allocate before ureg emit.
+    */
+   nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
+      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
+      unsigned num_components = nir_intrinsic_num_components(nir_reg);
+      unsigned index = nir_reg->def.index;
+
+      /* We already handled arrays */
+      if (num_array_elems == 0) {
+         struct ureg_dst decl;
+         uint32_t write_mask = BITFIELD_MASK(num_components);
+
+         if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
+            decl = ureg_writemask(ntr_temp(c), write_mask);
+         }
+         c->reg_temp[index] = decl;
+      }
+   }
+}
+/* Declares a TGSI immediate holding the constant's 32-bit values, with their
+ * bits reinterpreted as floats.
+ * NOTE(review): values[] has room for 4 entries and num_components is not
+ * checked against it - presumably constants are at most vec4 here; confirm.
+ */
+static struct ureg_src
+ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
+{
+   int num_components = instr->def.num_components;
+
+   float values[4];
+   assert(instr->def.bit_size == 32);
+   for (int i = 0; i < num_components; i++)
+      values[i] = uif(instr->value[i].u32);
+
+   return ureg_DECL_immediate(c->ureg, values, num_components);
+}
+/* Emits an ARL of `addr` into address register `addr_index` and returns that
+ * register's channel 0 as a scalar source.  Address registers 0..addr_index
+ * are declared (write-masked to .x) the first time they are needed.
+ */
+static struct ureg_src
+ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
+{
+   assert(addr_index < ARRAY_SIZE(c->addr_reg));
+
+   for (int i = 0; i <= addr_index; i++) {
+      if (!c->addr_declared[i]) {
+         c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
+                                             TGSI_WRITEMASK_X);
+         c->addr_declared[i] = true;
+      }
+   }
+
+   ntr_ARL(c, c->addr_reg[addr_index], addr);
+   return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
+}
+
+/* Forward declare for recursion with indirects */
+static struct ureg_src
+ntr_get_src(struct ntr_compile *c, nir_src src);
+/* Translates an already-chased NIR source.  SSA sources produced by a
+ * load_const become a new immediate; other SSA sources come from ssa_temp[].
+ * Register sources come from reg_temp[] plus the base offset, with an
+ * indirect through address register 0 when the access is indirect.
+ */
+static struct ureg_src
+ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
+{
+   if (src->is_ssa) {
+      if (src->ssa->parent_instr->type == nir_instr_type_load_const)
+         return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
+
+      return c->ssa_temp[src->ssa->index];
+   } else {
+      struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
+      reg_temp.Index += src->reg.base_offset;
+
+      if (src->reg.indirect) {
+         struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
+         return ureg_src_indirect(ureg_src(reg_temp),
+                                  ntr_reladdr(c, offset, 0));
+      } else {
+         return ureg_src(reg_temp);
+      }
+   }
+}
+/* Chases a plain NIR source and translates the result. */
+static struct ureg_src
+ntr_get_src(struct ntr_compile *c, nir_src src)
+{
+   nir_legacy_src chased = nir_legacy_chase_src(&src);
+   return ntr_get_chased_src(c, &chased);
+}
+/* Translates source `i` of an ALU instruction, applying its swizzle and any
+ * fabs/fneg modifiers reported by the legacy chase.
+ */
+static struct ureg_src
+ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
+{
+   /* We only support 32-bit float modifiers.  The only other modifier type
+    * officially supported by TGSI is 32-bit integer negates, but even those are
+    * broken on virglrenderer, so skip lowering all integer and f64 float mods.
+    *
+    * The options->lower_fabs requests that we not have native source modifiers
+    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
+    */
+   nir_legacy_alu_src src =
+      nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
+   struct ureg_src usrc = ntr_get_chased_src(c, &src.src);
+
+   usrc = ureg_swizzle(usrc,
+                       src.swizzle[0],
+                       src.swizzle[1],
+                       src.swizzle[2],
+                       src.swizzle[3]);
+
+   if (src.fabs)
+      usrc = ureg_abs(usrc);
+   if (src.fneg)
+      usrc = ureg_negate(usrc);
+
+   return usrc;
+}
+
+/* Reswizzles a source so that the unset channels in the write mask still refer
+ * to one of the channels present in the write mask.
+ */
+static struct ureg_src
+ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
+{
+   assert(write_mask);
+   int first_chan = ffs(write_mask) - 1;
+   return ureg_swizzle(src,
+                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
+                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
+                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
+                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
+}
+/* Picks the storage for an SSA def: a TGSI output when the def's only use is
+ * a suitable store_output, otherwise a new virtual temp.  The matching source
+ * is recorded in ssa_temp[], and the returned destination is write-masked to
+ * the def's component count.
+ */
+static struct ureg_dst
+ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
+{
+   uint32_t writemask = BITSET_MASK(ssa->num_components);
+
+   struct ureg_dst dst;
+   if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
+      dst = ntr_temp(c);
+
+   c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);
+
+   return ureg_writemask(dst, writemask);
+}
+/* Returns the base destination for a chased dest: newly chosen storage for an
+ * SSA def, or the register's entry in reg_temp[].
+ */
+static struct ureg_dst
+ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
+{
+   if (dest->is_ssa)
+      return ntr_get_ssa_def_decl(c, dest->ssa);
+   else
+      return c->reg_temp[dest->reg.handle->index];
+}
+/* As ntr_get_chased_dest_decl(), but for register dests also applies the base
+ * offset and, when present, the indirect through address register 0.
+ */
+static struct ureg_dst
+ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
+{
+   struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);
+
+   if (!dest->is_ssa) {
+      dst.Index += dest->reg.base_offset;
+
+      if (dest->reg.indirect) {
+         struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
+         dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0));
+      }
+   }
+
+   return dst;
+}
+/* Chases a def to its destination and translates it. */
+static struct ureg_dst
+ntr_get_dest(struct ntr_compile *c, nir_def *def)
+{
+   nir_legacy_dest chased = nir_legacy_chase_dest(def);
+   return ntr_get_chased_dest(c, &chased);
+}
+/* ALU flavour of ntr_get_dest(): also carries over a folded fsat as the
+ * Saturate flag and, for register dests, the chased write mask.
+ */
+static struct ureg_dst
+ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
+{
+   nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
+   struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);
+
+   if (chased.fsat)
+      dst.Saturate = true;
+
+   /* Only registers get write masks */
+   if (chased.dest.is_ssa)
+      return dst;
+
+   return ureg_writemask(dst, chased.write_mask);
+}
+
+/* For an SSA dest being populated by a constant src, replace the storage with
+ * a copy of the ureg_src.
+ */ +static void +ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src) +{ + if (!src.Indirect && !src.DimIndirect) { + switch (src.File) { + case TGSI_FILE_IMMEDIATE: + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_SYSTEM_VALUE: + c->ssa_temp[def->index] = src; + return; + } + } + + ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src); +} + +static void +ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src) +{ + nir_legacy_dest chased = nir_legacy_chase_dest(def); + + if (chased.is_ssa) + ntr_store_def(c, chased.ssa, src); + else { + struct ureg_dst dst = ntr_get_chased_dest(c, &chased); + ntr_MOV(c, dst, src); + } +} + +static void +ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op, + struct ureg_dst dst, + struct ureg_src src0, + struct ureg_src src1) +{ + unsigned i; + + /* POW is the only 2-operand scalar op. */ + if (tgsi_op != TGSI_OPCODE_POW) + src1 = src0; + + for (i = 0; i < 4; i++) { + if (dst.WriteMask & (1 << i)) { + ntr_insn(c, tgsi_op, + ureg_writemask(dst, 1 << i), + ureg_scalar(src0, i), + ureg_scalar(src1, i), + ureg_src_undef(), ureg_src_undef()); + } + } +} + +static void +ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr) +{ + struct ureg_src src[4]; + struct ureg_dst dst; + unsigned i; + int num_srcs = nir_op_infos[instr->op].num_inputs; + + /* Don't try to translate folded fsat since their source won't be valid */ + if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr)) + return; + + c->precise = instr->exact; + + assert(num_srcs <= ARRAY_SIZE(src)); + for (i = 0; i < num_srcs; i++) + src[i] = ntr_get_alu_src(c, instr, i); + for (; i < ARRAY_SIZE(src); i++) + src[i] = ureg_src_undef(); + + dst = ntr_get_alu_dest(c, &instr->def); + + static enum tgsi_opcode op_map[] = { + [nir_op_mov] = TGSI_OPCODE_MOV, + + [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2, + [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3, + [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4, + [nir_op_ffloor] = 
TGSI_OPCODE_FLR, + [nir_op_ffract] = TGSI_OPCODE_FRC, + [nir_op_fceil] = TGSI_OPCODE_CEIL, + [nir_op_fround_even] = TGSI_OPCODE_ROUND, + + [nir_op_slt] = TGSI_OPCODE_SLT, + [nir_op_sge] = TGSI_OPCODE_SGE, + [nir_op_seq] = TGSI_OPCODE_SEQ, + [nir_op_sne] = TGSI_OPCODE_SNE, + + [nir_op_ftrunc] = TGSI_OPCODE_TRUNC, + [nir_op_fddx] = TGSI_OPCODE_DDX, + [nir_op_fddy] = TGSI_OPCODE_DDY, + [nir_op_fddx_coarse] = TGSI_OPCODE_DDX, + [nir_op_fddy_coarse] = TGSI_OPCODE_DDY, + [nir_op_fadd] = TGSI_OPCODE_ADD, + [nir_op_fmul] = TGSI_OPCODE_MUL, + + [nir_op_fmin] = TGSI_OPCODE_MIN, + [nir_op_fmax] = TGSI_OPCODE_MAX, + [nir_op_ffma] = TGSI_OPCODE_MAD, + }; + + if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) { + /* The normal path for NIR to TGSI ALU op translation */ + ntr_insn(c, op_map[instr->op], + dst, src[0], src[1], src[2], src[3]); + } else { + /* Special cases for NIR to TGSI ALU op translation. */ + + /* TODO: Use something like the ntr_store() path for the MOV calls so we + * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm. + */ + + switch (instr->op) { + case nir_op_fabs: + /* Try to eliminate */ + if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr)) + break; + + if (c->options->lower_fabs) + ntr_MAX(c, dst, src[0], ureg_negate(src[0])); + else + ntr_MOV(c, dst, ureg_abs(src[0])); + break; + + case nir_op_fsat: + ntr_MOV(c, ureg_saturate(dst), src[0]); + break; + + case nir_op_fneg: + /* Try to eliminate */ + if (nir_legacy_float_mod_folds(instr)) + break; + + ntr_MOV(c, dst, ureg_negate(src[0])); + break; + + /* NOTE: TGSI 32-bit math ops have the old "one source channel + * replicated to all dst channels" behavior, while 64 is normal mapping + * of src channels to dst. 
+ */ + case nir_op_frcp: + ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef()); + break; + + case nir_op_frsq: + ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fexp2: + ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef()); + break; + + case nir_op_flog2: + ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fsin: + ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fcos: + ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fsub: + ntr_ADD(c, dst, src[0], ureg_negate(src[1])); + break; + + case nir_op_fmod: + unreachable("should be handled by .lower_fmod = true"); + break; + + case nir_op_fpow: + ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]); + break; + + case nir_op_flrp: + ntr_LRP(c, dst, src[2], src[1], src[0]); + break; + + case nir_op_fcsel: + /* If CMP isn't supported, then the flags that enable NIR to generate + * this opcode should also not be set. + */ + assert(!c->options->lower_cmp); + + /* Implement this as CMP(-abs(src0), src1, src2). */ + ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]); + break; + + case nir_op_fcsel_gt: + /* If CMP isn't supported, then the flags that enable NIR to generate + * these opcodes should also not be set. + */ + assert(!c->options->lower_cmp); + + ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]); + break; + + case nir_op_fcsel_ge: + /* If CMP isn't supported, then the flags that enable NIR to generate + * these opcodes should also not be set. + */ + assert(!c->options->lower_cmp); + + /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. 
*/ + ntr_CMP(c, dst, src[0], src[2], src[1]); + break; + + case nir_op_vec4: + case nir_op_vec3: + case nir_op_vec2: + unreachable("covered by nir_lower_vec_to_movs()"); + + default: + fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name); + unreachable("Unknown NIR opcode"); + } + } + + c->precise = false; +} + +static struct ureg_src +ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc, + nir_src src, int addr_reg) +{ + if (nir_src_is_const(src)) { + usrc.Index += ntr_src_as_uint(c, src); + return usrc; + } else { + return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg)); + } +} + +static struct ureg_dst +ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst, + nir_src src) +{ + if (nir_src_is_const(src)) { + dst.Index += ntr_src_as_uint(c, src); + return dst; + } else { + return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0)); + } +} + +static struct ureg_dst +ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst, + nir_src src) +{ + if (nir_src_is_const(src)) { + return ureg_dst_dimension(udst, ntr_src_as_uint(c, src)); + } else { + return ureg_dst_dimension_indirect(udst, + ntr_reladdr(c, ntr_get_src(c, src), 1), + 0); + } +} +/* Some load operations in NIR will have a fractional offset that we need to + * swizzle down before storing to the result register. 
+ */ +static struct ureg_src +ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components) +{ + return ureg_swizzle(src, + frac, + frac + MIN2(num_components - 1, 1), + frac + MIN2(num_components - 1, 2), + frac + MIN2(num_components - 1, 3)); +} + + +static void +ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0); + + struct ureg_dst addr_temp = ureg_dst_undef(); + + if (nir_src_is_const(instr->src[0])) { + src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0])); + } else { + /* virglrenderer requires that indirect UBO references have the UBO + * array's base index in the Index field, not added to the indirect + * address. + * + * Many nir intrinsics have a base address const value for the start of + * their array indirection, but load_ubo doesn't. We fake it by + * subtracting it off here. + */ + addr_temp = ntr_temp(c); + ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo)); + src = ureg_src_dimension_indirect(src, + ntr_reladdr(c, ureg_src(addr_temp), 1), + c->first_ubo); + } + + /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const + * file. 
+ */ + src.Index = nir_intrinsic_base(instr); + + if (nir_src_is_const(instr->src[1])) { + src.Index += ntr_src_as_uint(c, instr->src[1]); + } else { + src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0)); + } + + int start_component = nir_intrinsic_component(instr); + + src = ntr_shift_by_frac(src, start_component, instr->num_components); + + ntr_store(c, &instr->def, src); +} + +static void +ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + uint32_t frac = nir_intrinsic_component(instr); + uint32_t num_components = instr->num_components; + unsigned base = nir_intrinsic_base(instr); + struct ureg_src input; + nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); + + if (c->s->info.stage == MESA_SHADER_VERTEX) { + input = ureg_DECL_vs_input(c->ureg, base); + for (int i = 1; i < semantics.num_slots; i++) + ureg_DECL_vs_input(c->ureg, base + i); + } else { + input = c->input_index_map[base]; + } + + input = ntr_shift_by_frac(input, frac, num_components); + + switch (instr->intrinsic) { + case nir_intrinsic_load_input: + input = ntr_ureg_src_indirect(c, input, instr->src[0], 0); + ntr_store(c, &instr->def, input); + break; + + case nir_intrinsic_load_interpolated_input: { + input = ntr_ureg_src_indirect(c, input, instr->src[1], 0); + + nir_intrinsic_instr *bary_instr = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + + switch (bary_instr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_sample: + /* For these, we know that the barycentric load matches the + * interpolation on the input declaration, so we can use it directly. + */ + ntr_store(c, &instr->def, input); + break; + + case nir_intrinsic_load_barycentric_centroid: + /* If the input was declared centroid, then there's no need to + * emit the extra TGSI interp instruction, we can just read the + * input. 
+ */ + if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) { + ntr_store(c, &instr->def, input); + } else { + ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input); + } + break; + + case nir_intrinsic_load_barycentric_at_sample: + /* We stored the sample in the fake "bary" dest. */ + ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input, + ntr_get_src(c, instr->src[0])); + break; + + case nir_intrinsic_load_barycentric_at_offset: + /* We stored the offset in the fake "bary" dest. */ + ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input, + ntr_get_src(c, instr->src[0])); + break; + + default: + unreachable("bad barycentric interp intrinsic\n"); + } + break; + } + + default: + unreachable("bad load input intrinsic\n"); + } +} + +static void +ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + struct ureg_src src = ntr_get_src(c, instr->src[0]); + + if (src.File == TGSI_FILE_OUTPUT) { + /* If our src is the output file, that's an indication that we were able + * to emit the output stores in the generating instructions and we have + * nothing to do here. + */ + return; + } + + uint32_t frac; + struct ureg_dst out = ntr_output_decl(c, instr, &frac); + + if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) { + out = ntr_ureg_dst_indirect(c, out, instr->src[2]); + out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]); + } else { + out = ntr_ureg_dst_indirect(c, out, instr->src[1]); + } + + uint8_t swizzle[4] = { 0, 0, 0, 0 }; + for (int i = frac; i < 4; i++) { + if (out.WriteMask & (1 << i)) + swizzle[i] = i - frac; + } + + src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + + ntr_MOV(c, out, src); +} + +static void +ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); + + /* ntr_try_store_in_tgsi_output() optimization is not valid if normal + * load_output is present. 
+ */ + assert(c->s->info.stage != MESA_SHADER_VERTEX && + (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output)); + + uint32_t frac; + struct ureg_dst out = ntr_output_decl(c, instr, &frac); + + if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) { + out = ntr_ureg_dst_indirect(c, out, instr->src[1]); + out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]); + } else { + out = ntr_ureg_dst_indirect(c, out, instr->src[0]); + } + + struct ureg_dst dst = ntr_get_dest(c, &instr->def); + struct ureg_src out_src = ureg_src(out); + + /* Don't swizzle unavailable channels of the output in the writemasked-out + * components. Avoids compile failures in virglrenderer with + * TESS_LEVEL_INNER. + */ + int fill_channel = ffs(dst.WriteMask) - 1; + uint8_t swizzles[4] = { 0, 1, 2, 3 }; + for (int i = 0; i < 4; i++) + if (!(dst.WriteMask & (1 << i))) + swizzles[i] = fill_channel; + out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]); + + if (semantics.fb_fetch_output) + ntr_FBFETCH(c, dst, out_src); + else + ntr_MOV(c, dst, out_src); +} + +static void +ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic); + enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval); + struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0); + + /* virglrenderer doesn't like references to channels of the sysval that + * aren't defined, even if they aren't really read. (GLSL compile fails on + * gl_NumWorkGroups.w, for example). + */ + uint32_t write_mask = BITSET_MASK(instr->def.num_components); + sv = ntr_swizzle_for_write_mask(sv, write_mask); + + /* TGSI and NIR define these intrinsics as always loading ints, but they can + * still appear on hardware with non-native-integers fragment shaders using + * the draw path (i915g). 
In that case, having called nir_lower_int_to_float + * means that we actually want floats instead. + */ + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_instance_id: + ntr_U2F(c, ntr_get_dest(c, &instr->def), sv); + return; + + default: + break; + } + + ntr_store(c, &instr->def, sv); +} + +static void +ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_vec4: + ntr_emit_load_ubo(c, instr); + break; + + /* Vertex */ + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_point_coord: + case nir_intrinsic_load_front_face: + ntr_emit_load_sysval(c, instr); + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_interpolated_input: + ntr_emit_load_input(c, instr); + break; + + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + ntr_emit_store_output(c, instr); + break; + + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + ntr_emit_load_output(c, instr); + break; + + case nir_intrinsic_discard: + ntr_KILL(c); + break; + + case nir_intrinsic_discard_if: { + struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0); + /* For !native_integers, the bool got lowered to 1.0 or 0.0. */ + ntr_KILL_IF(c, ureg_negate(cond)); + break; + } + /* In TGSI we don't actually generate the barycentric coords, and emit + * interp intrinsics later. However, we do need to store the + * load_barycentric_at_* argument so that we can use it at that point. 
+ */ + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + break; + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0])); + break; + + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_load_reg_indirect: + case nir_intrinsic_store_reg: + case nir_intrinsic_store_reg_indirect: + /* fully consumed */ + break; + + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } +} + +struct ntr_tex_operand_state { + struct ureg_src srcs[4]; + unsigned i; +}; + +static void +ntr_push_tex_arg(struct ntr_compile *c, + nir_tex_instr *instr, + nir_tex_src_type tex_src_type, + struct ntr_tex_operand_state *s) +{ + int tex_src = nir_tex_instr_src_index(instr, tex_src_type); + if (tex_src < 0) + return; + + nir_src *src = &instr->src[tex_src].src; + s->srcs[s->i++] = ntr_get_src(c, *src); +} + +static void +ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr) +{ + struct ureg_dst dst = ntr_get_dest(c, &instr->def); + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow); + unsigned tex_opcode; + + int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle); + int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle); + + struct ureg_src sampler; + if (tex_handle_src >= 0 && sampler_handle_src >= 0) { + /* It seems we can't get separate tex/sampler on GL, just use one of the handles */ + sampler = ntr_get_src(c, instr->src[tex_handle_src].src); + assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); + } else { + assert(tex_handle_src == -1 && sampler_handle_src == -1); + sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index); + int sampler_src = 
nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset); + if (sampler_src >= 0) { + struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src); + sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2)); + } + } + + switch (instr->op) { + case nir_texop_tex: + if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) > + MAX2(instr->coord_components, 2) + instr->is_shadow) + tex_opcode = TGSI_OPCODE_TXP; + else + tex_opcode = TGSI_OPCODE_TEX; + break; + case nir_texop_txl: + tex_opcode = TGSI_OPCODE_TXL; + break; + case nir_texop_txb: + tex_opcode = TGSI_OPCODE_TXB; + break; + case nir_texop_txd: + tex_opcode = TGSI_OPCODE_TXD; + break; + case nir_texop_txs: + tex_opcode = TGSI_OPCODE_TXQ; + break; + case nir_texop_tg4: + tex_opcode = TGSI_OPCODE_TG4; + break; + case nir_texop_query_levels: + tex_opcode = TGSI_OPCODE_TXQ; + break; + case nir_texop_lod: + tex_opcode = TGSI_OPCODE_LODQ; + break; + case nir_texop_texture_samples: + tex_opcode = TGSI_OPCODE_TXQS; + break; + default: + unreachable("unsupported tex op"); + } + + struct ntr_tex_operand_state s = { .i = 0 }; + ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s); + ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s); + + /* non-coord arg for TXQ */ + if (tex_opcode == TGSI_OPCODE_TXQ) { + ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s); + /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's + * scalar + */ + s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0); + } + + if (s.i > 1) { + if (tex_opcode == TGSI_OPCODE_TEX) + tex_opcode = TGSI_OPCODE_TEX2; + if (tex_opcode == TGSI_OPCODE_TXB) + tex_opcode = TGSI_OPCODE_TXB2; + if (tex_opcode == TGSI_OPCODE_TXL) + tex_opcode = TGSI_OPCODE_TXL2; + } + + if (instr->op == nir_texop_txd) { + /* Derivs appear in their own src args */ + int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx); + int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy); + s.srcs[s.i++] = ntr_get_src(c, 
instr->src[ddx].src); + s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src); + } + + if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) { + if (c->screen->get_param(c->screen, + PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) { + sampler = ureg_scalar(sampler, instr->component); + s.srcs[s.i++] = ureg_src_undef(); + } else { + s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component); + } + } + + s.srcs[s.i++] = sampler; + + enum tgsi_return_type tex_type; + switch (instr->dest_type) { + case nir_type_float32: + tex_type = TGSI_RETURN_TYPE_FLOAT; + break; + case nir_type_int32: + tex_type = TGSI_RETURN_TYPE_SINT; + break; + case nir_type_uint32: + tex_type = TGSI_RETURN_TYPE_UINT; + break; + default: + unreachable("unknown texture type"); + } + + struct ureg_dst tex_dst; + if (instr->op == nir_texop_query_levels) + tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W); + else + tex_dst = dst; + + while (s.i < 4) + s.srcs[s.i++] = ureg_src_undef(); + + struct ntr_insn *insn = ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]); + insn->tex_target = target; + insn->tex_return_type = tex_type; + insn->is_tex = true; + + int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset); + if (tex_offset_src >= 0) { + struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src); + + insn->tex_offset[0].File = offset.File; + insn->tex_offset[0].Index = offset.Index; + insn->tex_offset[0].SwizzleX = offset.SwizzleX; + insn->tex_offset[0].SwizzleY = offset.SwizzleY; + insn->tex_offset[0].SwizzleZ = offset.SwizzleZ; + insn->tex_offset[0].Padding = 0; + } + + if (nir_tex_instr_has_explicit_tg4_offsets(instr)) { + for (uint8_t i = 0; i < 4; ++i) { + struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]); + insn->tex_offset[i].File = imm.File; + insn->tex_offset[i].Index = imm.Index; + insn->tex_offset[i].SwizzleX = imm.SwizzleX; + insn->tex_offset[i].SwizzleY = imm.SwizzleY; + 
insn->tex_offset[i].SwizzleZ = imm.SwizzleZ; + } + } + + if (instr->op == nir_texop_query_levels) + ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3)); +} + +static void +ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + ntr_BRK(c); + break; + + case nir_jump_continue: + ntr_CONT(c); + break; + + default: + fprintf(stderr, "Unknown jump instruction: "); + nir_print_instr(&jump->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr) +{ + /* Nothing to do but make sure that we have some storage to deref. */ + (void)ntr_get_ssa_def_decl(c, &instr->def); +} + +static void +ntr_emit_instr(struct ntr_compile *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_deref: + /* ignored, will be walked by nir_intrinsic_image_*_deref. */ + break; + + case nir_instr_type_alu: + ntr_emit_alu(c, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_load_const: + /* Nothing to do here, as load consts are done directly from + * ntr_get_src() (since many constant NIR srcs will often get folded + * directly into a register file index instead of as a TGSI src). 
+ */ + break; + + case nir_instr_type_tex: + ntr_emit_texture(c, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + ntr_emit_jump(c, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_undef: + ntr_emit_ssa_undef(c, nir_instr_as_undef(instr)); + break; + + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt) +{ + ntr_IF(c, c->if_cond); + + ntr_emit_cf_list(c, &if_stmt->then_list); + + if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { + ntr_ELSE(c); + ntr_emit_cf_list(c, &if_stmt->else_list); + } + + ntr_ENDIF(c); +} + +static void +ntr_emit_loop(struct ntr_compile *c, nir_loop *loop) +{ + assert(!nir_loop_has_continue_construct(loop)); + ntr_BGNLOOP(c); + ntr_emit_cf_list(c, &loop->body); + ntr_ENDLOOP(c); +} + +static void +ntr_emit_block(struct ntr_compile *c, nir_block *block) +{ + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + c->cur_block = ntr_block; + + nir_foreach_instr(instr, block) { + ntr_emit_instr(c, instr); + + /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */ + if (ureg_get_instruction_number(c->ureg) != 0) { + fprintf(stderr, "Emitted ureg insn during: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + unreachable("emitted ureg insn"); + } + } + + /* Set up the if condition for ntr_emit_if(), which we have to do before + * freeing up the temps (the "if" is treated as inside the block for liveness + * purposes, despite not being an instruction) + * + * Note that, while IF and UIF are supposed to look at only .x, virglrenderer + * looks at all of .xyzw. No harm in working around the bug. 
+ */ + nir_if *nif = nir_block_get_following_if(block); + if (nif) + c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X); +} + +static void +ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntr_emit_block(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntr_emit_if(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + ntr_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + default: + unreachable("unknown CF type"); + } + } +} + +static void +ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block) +{ + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + + /* Emit the ntr insns to tgsi_ureg. */ + util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) { + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(insn->opcode); + + switch (insn->opcode) { + case TGSI_OPCODE_IF: + ureg_IF(c->ureg, insn->src[0], &c->cf_label); + break; + + case TGSI_OPCODE_ELSE: + ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); + ureg_ELSE(c->ureg, &c->cf_label); + c->current_if_else = c->cf_label; + break; + + case TGSI_OPCODE_ENDIF: + ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); + ureg_ENDIF(c->ureg); + break; + + case TGSI_OPCODE_BGNLOOP: + /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx + * does reference BGNLOOP's. Follow the former behavior unless something comes up + * with a need. 
+ */ + ureg_BGNLOOP(c->ureg, &c->cf_label); + break; + + case TGSI_OPCODE_ENDLOOP: + ureg_ENDLOOP(c->ureg, &c->cf_label); + break; + + default: + if (insn->is_tex) { + int num_offsets = 0; + for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) { + if (insn->tex_offset[i].File != TGSI_FILE_NULL) + num_offsets = i + 1; + } + ureg_tex_insn(c->ureg, insn->opcode, + insn->dst, opcode_info->num_dst, + insn->tex_target, insn->tex_return_type, + insn->tex_offset, + num_offsets, + insn->src, opcode_info->num_src); + } else { + ureg_insn(c->ureg, insn->opcode, + insn->dst, opcode_info->num_dst, + insn->src, opcode_info->num_src, + insn->precise); + } + } + } +} + +static void +ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt) +{ + /* Note: the last block emitted our IF opcode. */ + + int if_stack = c->current_if_else; + c->current_if_else = c->cf_label; + + /* Either the then or else block includes the ENDIF, which will fix up the + * IF(/ELSE)'s label for jumping + */ + ntr_emit_cf_list_ureg(c, &if_stmt->then_list); + ntr_emit_cf_list_ureg(c, &if_stmt->else_list); + + c->current_if_else = if_stack; +} + +static void +ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntr_emit_block_ureg(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntr_emit_if_ureg(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx + * does reference BGNLOOP's. Follow the former behavior unless something comes up + * with a need. 
+ */ + ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body); + break; + + default: + unreachable("unknown CF type"); + } + } +} + +static void +ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl) +{ + c->impl = impl; + + c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc); + c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc); + + /* Set up the struct ntr_blocks to put insns in */ + c->blocks = _mesa_pointer_hash_table_create(c); + nir_foreach_block(block, impl) { + struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block); + util_dynarray_init(&ntr_block->insns, ntr_block); + _mesa_hash_table_insert(c->blocks, block, ntr_block); + } + + + ntr_setup_registers(c); + + c->cur_block = ntr_block_from_nir(c, nir_start_block(impl)); + ntr_setup_inputs(c); + ntr_setup_outputs(c); + ntr_setup_uniforms(c); + + /* Emit the ntr insns */ + ntr_emit_cf_list(c, &impl->body); + + /* Don't do optimized RA if the driver requests it, unless the number of + * temps is too large to be covered by the 16 bit signed int that TGSI + * allocates for the register index */ + if (!c->options->unoptimized_ra || c->num_temps > 0x7fff) + ntr_allocate_regs(c, impl); + else + ntr_allocate_regs_unoptimized(c, impl); + + /* Turn the ntr insns into actual TGSI tokens */ + ntr_emit_cf_list_ureg(c, &impl->body); + + ralloc_free(c->liveness); + c->liveness = NULL; + +} + +static int +type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* Allow vectorizing of ALU instructions. 
+ */ +static uint8_t +ntr_should_vectorize_instr(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return 0; + + return 4; +} + +static bool +ntr_should_vectorize_io(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, + void *data) +{ + if (bit_size != 32) + return false; + + /* Our offset alignment should always be at least 4 bytes */ + if (align < 4) + return false; + + /* No wrapping off the end of a TGSI reg. We could do a bit better by + * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't + * need this restriction. + */ + unsigned worst_start_component = align == 4 ? 3 : align / 4; + if (worst_start_component + num_components > 4) + return false; + + return true; +} + +static nir_variable_mode +ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen) +{ + unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage); + unsigned indirect_mask = 0; + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) { + indirect_mask |= nir_var_shader_in; + } + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) { + indirect_mask |= nir_var_shader_out; + } + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) { + indirect_mask |= nir_var_function_temp; + } + + return indirect_mask; +} + +struct ntr_lower_tex_state { + nir_scalar channels[8]; + unsigned i; +}; + +static void +nir_to_rc_lower_tex_instr_arg(nir_builder *b, + nir_tex_instr *instr, + nir_tex_src_type tex_src_type, + struct ntr_lower_tex_state *s) +{ + int tex_src = nir_tex_instr_src_index(instr, tex_src_type); + if (tex_src < 0) + return; + + nir_def *def = instr->src[tex_src].src.ssa; + for (int i = 0; i < def->num_components; i++) { + s->channels[s->i++] = nir_get_scalar(def, i); + } + + nir_tex_instr_remove_src(instr, tex_src); +} + +/** + * 
Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex + * src. This lets NIR handle the coalescing of the vec4 rather than trying to + * manage it on our own, and may lead to more vectorization. + */ +static bool +nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0) + return false; + + b->cursor = nir_before_instr(instr); + + struct ntr_lower_tex_state s = {0}; + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s); + /* We always have at least two slots for the coordinate, even on 1D. */ + s.i = MAX2(s.i, 2); + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s); + s.i = MAX2(s.i, 3); + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s); + + /* XXX: LZ */ + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s); + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s); + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s); + + /* No need to pack undefs in unused channels of the tex instr */ + while (!s.channels[s.i - 1].def) + s.i--; + + /* Instead of putting undefs in the unused slots of the vecs, just put in + * another used channel. Otherwise, we'll get unnecessary moves into + * registers. 
+ */ + assert(s.channels[0].def != NULL); + for (int i = 1; i < s.i; i++) { + if (!s.channels[i].def) + s.channels[i] = s.channels[0]; + } + + nir_tex_instr_add_src(tex, nir_tex_src_backend1, + nir_vec_scalars(b, s.channels, MIN2(s.i, 4))); + if (s.i > 4) + nir_tex_instr_add_src(tex, nir_tex_src_backend2, + nir_vec_scalars(b, &s.channels[4], s.i - 4)); + + return true; +} + +static bool +nir_to_rc_lower_tex(nir_shader *s) +{ + return nir_shader_instructions_pass(s, + nir_to_rc_lower_tex_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */ +static void +nir_to_rc_lower_txp(nir_shader *s) +{ + nir_lower_tex_options lower_tex_options = { + .lower_txp = 0, + }; + + nir_foreach_block(block, nir_shader_get_entrypoint(s)) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0) + continue; + + bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0; + bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT; + bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0; + + /* We can do TXP for any tex (not txg) where we can fit all the + * coordinates and comparator and projector in one vec4 without any + * other modifiers to add on. + * + * nir_lower_tex() only handles the lowering on a sampler-dim basis, so + * if we get any funny projectors then we just blow them all away. + */ + if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare)) + lower_tex_options.lower_txp |= 1 << tex->sampler_dim; + } + } + + /* nir_lower_tex must be run even if no options are set, because we need the + * LOD to be set for query_levels and for non-fragment shaders. 
+ */ + NIR_PASS_V(s, nir_lower_tex, &lower_tex_options); +} + +const void * +nir_to_rc(struct nir_shader *s, + struct pipe_screen *screen) +{ + static const struct nir_to_rc_options default_ntr_options = {0}; + return nir_to_rc_options(s, screen, &default_ntr_options); +} + +/** + * Translates the NIR shader to TGSI. + * + * This requires some lowering of the NIR shader to prepare it for translation. + * We take ownership of the NIR shader passed, returning a reference to the new + * TGSI tokens instead. If you need to keep the NIR, then pass us a clone. + */ +const void *nir_to_rc_options(struct nir_shader *s, + struct pipe_screen *screen, + const struct nir_to_rc_options *options) +{ + struct ntr_compile *c; + const void *tgsi_tokens; + nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen); + + /* Lower array indexing on FS inputs. Since we don't set + * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to + * elements by ureg, and so dynamically indexing them would be invalid. + * Ideally we would set that ureg flag based on + * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st + * splitting NIR VS outputs to elements even if the FS doesn't get the + * corresponding splitting, and virgl depends on TGSI across link boundaries + * having matching declarations. 
+ */ + if (s->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX); + NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL); + } + + NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size, (nir_lower_io_options)0); + + nir_to_rc_lower_txp(s); + NIR_PASS_V(s, nir_to_rc_lower_tex); + + if (!s->options->lower_uniforms_to_ubo) { + NIR_PASS_V(s, nir_lower_uniforms_to_ubo, + screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS), + true); + } + + if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF)) + NIR_PASS_V(s, nir_lower_ubo_vec4); + + bool progress; + NIR_PASS_V(s, nir_opt_constant_folding); + + /* Clean up after trigonometric input normalization. */ + NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL); + do { + progress = false; + NIR_PASS(progress, s, nir_opt_shrink_vectors); + } while (progress); + NIR_PASS_V(s, nir_copy_prop); + NIR_PASS_V(s, nir_opt_cse); + NIR_PASS_V(s, nir_opt_dce); + NIR_PASS_V(s, nir_opt_shrink_stores, true); + + NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX); + + /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */ + NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf); + + NIR_PASS_V(s, nir_lower_frexp); + + do { + progress = false; + NIR_PASS(progress, s, nir_opt_algebraic_late); + if (progress) { + NIR_PASS_V(s, nir_copy_prop); + NIR_PASS_V(s, nir_opt_dce); + NIR_PASS_V(s, nir_opt_cse); + } + } while (progress); + + if (s->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(s, r300_nir_prepare_presubtract); + NIR_PASS_V(s, r300_nir_clean_double_fneg); + } + + NIR_PASS_V(s, nir_lower_int_to_float); + NIR_PASS_V(s, nir_lower_bool_to_float, + !options->lower_cmp && !options->lower_fabs); + /* bool_to_float generates MOVs for b2f32 that we want to clean up. 
*/ + NIR_PASS_V(s, nir_copy_prop); + NIR_PASS_V(s, nir_opt_dce); + + nir_move_options move_all = + nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | + nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; + + NIR_PASS_V(s, nir_opt_move, move_all); + NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true); + + NIR_PASS_V(s, nir_convert_from_ssa, true); + NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL); + + /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up. + */ + NIR_PASS_V(s, nir_lower_locals_to_regs, 32); + NIR_PASS_V(s, nir_opt_dce); + + /* See comment in ntr_get_alu_src for supported modifiers */ + NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs); + + if (NIR_DEBUG(TGSI)) { + fprintf(stderr, "NIR before translation to TGSI:\n"); + nir_print_shader(s, stderr); + } + + c = rzalloc(NULL, struct ntr_compile); + c->screen = screen; + c->options = options; + + c->s = s; + c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage)); + ureg_setup_shader_info(c->ureg, &s->info); + if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES)) + ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1); + + if (s->info.stage == MESA_SHADER_FRAGMENT) { + /* The draw module's polygon stipple layer doesn't respect the chosen + * coordinate mode, so leave it as unspecified unless we're actually + * reading the position in the shader already. See + * gl-2.1-polygon-stipple-fs on softpipe. + */ + if ((s->info.inputs_read & VARYING_BIT_POS) || + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) { + ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN, + s->info.fs.origin_upper_left ? + TGSI_FS_COORD_ORIGIN_UPPER_LEFT : + TGSI_FS_COORD_ORIGIN_LOWER_LEFT); + + ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER, + s->info.fs.pixel_center_integer ? 
+ TGSI_FS_COORD_PIXEL_CENTER_INTEGER : + TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER); + } + } + /* Emit the main function */ + nir_function_impl *impl = nir_shader_get_entrypoint(c->s); + ntr_emit_impl(c, impl); + ureg_END(c->ureg); + + tgsi_tokens = ureg_get_tokens(c->ureg, NULL); + + if (NIR_DEBUG(TGSI)) { + fprintf(stderr, "TGSI after translation from NIR:\n"); + tgsi_dump(tgsi_tokens, 0); + } + + ureg_destroy(c->ureg); + + ralloc_free(c); + ralloc_free(s); + + return tgsi_tokens; +} diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h new file mode 100644 index 000000000..ebbe87770 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef NIR_TO_RC_H +#define NIR_TO_RC_H + +#include <stdbool.h> +#include "pipe/p_defines.h" + +struct nir_shader; +struct pipe_screen; +struct pipe_shader_state; + +struct nir_to_rc_options { + bool lower_cmp; + /* Emit MAX(a,-a) instead of abs src modifier) */ + bool lower_fabs; + bool unoptimized_ra; + bool lower_ssbo_bindings; + uint32_t ubo_vec4_max; +}; + +const void *nir_to_rc(struct nir_shader *s, + struct pipe_screen *screen); + +const void *nir_to_rc_options(struct nir_shader *s, + struct pipe_screen *screen, + const struct nir_to_rc_options *ntr_options); + +#endif /* NIR_TO_RC_H */ diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c new file mode 100644 index 000000000..05f7b8c59 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c @@ -0,0 +1,182 @@ +/* + * Copyright 2023 Pavel Ondračka <pavel.ondracka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_nir.h" + +#include "r300_screen.h" + +static unsigned char +r300_should_vectorize_instr(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return 0; + + return 4; +} + +static bool +r300_should_vectorize_io(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, + void *data) +{ + if (bit_size != 32) + return false; + + /* Our offset alignment should always be at least 4 bytes */ + if (align < 4) + return false; + + /* No wrapping off the end of a TGSI reg. We could do a bit better by + * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't + * need this restriction. + */ + unsigned worst_start_component = align == 4 ? 
3 : align / 4; + if (worst_start_component + num_components > 4) + return false; + + return true; +} + +static void +r300_optimize_nir(struct nir_shader *s, struct pipe_screen *screen) +{ + bool is_r500 = r300_screen(screen)->caps.is_r500; + + bool progress; + do { + progress = false; + + NIR_PASS_V(s, nir_lower_vars_to_ssa); + + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_algebraic); + if (s->info.stage == MESA_SHADER_VERTEX) { + if (!is_r500) + NIR_PASS(progress, s, r300_nir_lower_bool_to_float); + NIR_PASS(progress, s, r300_nir_fuse_fround_d3d9); + } + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_conditional_discard); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_find_array_copies); + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + + NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false); + NIR_PASS(progress, s, nir_opt_peephole_select, is_r500 ? 
8 : ~0, true, true); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + nir_load_store_vectorize_options vectorize_opts = { + .modes = nir_var_mem_ubo, + .callback = r300_should_vectorize_io, + .robust_modes = 0, + }; + NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + NIR_PASS(progress, s, nir_opt_shrink_stores, true); + NIR_PASS(progress, s, nir_opt_shrink_vectors); + NIR_PASS(progress, s, nir_opt_trivial_continues); + NIR_PASS(progress, s, nir_opt_vectorize, r300_should_vectorize_instr, NULL); + NIR_PASS(progress, s, nir_opt_undef); + if(!progress) + NIR_PASS(progress, s, nir_lower_undef_to_zero); + NIR_PASS(progress, s, nir_opt_loop_unroll); + + /* Try to fold addressing math into ubo_vec4's base to avoid load_consts + * and ALU ops for it. + */ + nir_opt_offsets_options offset_options = { + .ubo_vec4_max = 255, + + /* No const offset in TGSI for shared accesses. */ + .shared_max = 0, + + /* unused intrinsics */ + .uniform_max = 0, + .buffer_max = 0, + }; + + NIR_PASS(progress, s, nir_opt_offsets, &offset_options); + } while (progress); + + NIR_PASS_V(s, nir_lower_var_copies); + NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp, + NULL); +} + +static char *r300_check_control_flow(nir_shader *s) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(s); + nir_block *first = nir_start_block(impl); + nir_cf_node *next = nir_cf_node_next(&first->cf_node); + + if (next) { + switch (next->type) { + case nir_cf_node_if: + return "If/then statements not supported by R300/R400 shaders, should have been flattened by peephole_select."; + case nir_cf_node_loop: + return "Looping not supported R300/R400 shaders, all loops must be statically unrollable."; + default: + return "Unknown control flow type"; + } + } + + return NULL; +} + +char * +r300_finalize_nir(struct pipe_screen *pscreen, void *nir) +{ + nir_shader *s = nir; + + r300_optimize_nir(s, pscreen); + + /* st_program.c's 
parameter list optimization requires that future nir + * variants don't reallocate the uniform storage, so we have to remove + * uniforms that occupy storage. But we don't want to remove samplers, + * because they're needed for YUV variant lowering. + */ + nir_remove_dead_derefs(s); + nir_foreach_uniform_variable_safe(var, s) { + if (var->data.mode == nir_var_uniform && + (glsl_type_get_image_count(var->type) || + glsl_type_get_sampler_count(var->type))) + continue; + + exec_node_remove(&var->node); + } + nir_validate_shader(s, "after uniform var removal"); + + nir_sweep(s); + + if (!r300_screen(pscreen)->caps.is_r500 && + (r300_screen(pscreen)->caps.has_tcl || s->info.stage == MESA_SHADER_FRAGMENT)) { + char *msg = r300_check_control_flow(s); + if (msg) + return strdup(msg); + } + + return NULL; +} diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h new file mode 100644 index 000000000..916eb08fd --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h @@ -0,0 +1,64 @@ +/* + * Copyright 2023 Pavel Ondračka <pavel.ondracka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_NIR_H +#define R300_NIR_H + +#include "pipe/p_screen.h" +#include "compiler/nir/nir.h" + +static inline bool +is_ubo_or_input(UNUSED struct hash_table *ht, const nir_alu_instr *instr, + unsigned src, unsigned num_components, + const uint8_t *swizzle) +{ + nir_instr *parent = instr->src[src].src.ssa->parent_instr; + if (parent->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(parent); + + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_ubo_vec4: + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + return true; + default: + return false; + } +} + +char *r300_finalize_nir(struct pipe_screen *pscreen, void *nir); + +extern bool r300_transform_vs_trig_input(struct nir_shader *shader); + +extern bool r300_transform_fs_trig_input(struct nir_shader *shader); + +extern bool r300_nir_fuse_fround_d3d9(struct nir_shader *shader); + +extern bool r300_nir_lower_bool_to_float(struct nir_shader *shader); + +extern bool r300_nir_prepare_presubtract(struct nir_shader *shader); + +extern bool r300_nir_clean_double_fneg(struct nir_shader *shader); + +#endif /* R300_NIR_H */ diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py index ec6f85adf..f03b8eaf6 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py @@ -25,6 +25,13 @@ import argparse import sys from math import pi +# Convenience variables +a = 'a' +b = 'b' +c = 'c' +d = 'd' +e = 'e' + # Transform input to range [-PI, PI]: # # y = frac(x / 2PI + 0.5) * 2PI 
- PI @@ -43,6 +50,50 @@ transform_trig_input_fs_r500 = [ (('fcos', 'a'), ('fcos', ('ffract', ('fmul', 'a', 1 / (2 * pi))))), ] +# This is a pattern produced by wined3d for A0 register load. +# The specific pattern wined3d emits looks like this +# A0.x = (int(floor(abs(R0.x) + 0.5) * sign(R0.x))); +# however we lower both sign and floor so here we check for the already lowered +# sequence. +r300_nir_fuse_fround_d3d9 = [ + (('fmul', ('fadd', ('fadd', ('fabs', 'a') , 0.5), + ('fneg', ('ffract', ('fadd', ('fabs', 'a') , 0.5)))), + ('fadd', ('b2f', ('!flt', 0.0, 'a')), + ('fneg', ('b2f', ('!flt', 'a', 0.0))))), + ('fround_even', 'a')) +] + +# Here are some specific optimizations for code reordering such that the backend +# has an easier task recognizing output modifiers and presubtract patterns. +r300_nir_prepare_presubtract = [ + # Backend can only recognize 1 - x pattern. + (('fadd', ('fneg', a), 1.0), ('fadd', 1.0, ('fneg', a))), + (('fadd', a, -1.0), ('fneg', ('fadd', 1.0, ('fneg', a)))), + (('fadd', -1.0, a), ('fneg', ('fadd', 1.0, ('fneg', a)))), + # Bias presubtract 1 - 2 * x expects MAD -a 2.0 1.0 form. + (('ffma', 2.0, ('fneg', a), 1.0), ('ffma', ('fneg', a), 2.0, 1.0)), + (('ffma', a, -2.0, 1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))), + (('ffma', -2.0, a, 1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))), + (('ffma', 2.0, a, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))), + (('ffma', a, 2.0, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))), + # x * 2 can usually be folded into the output modifier for the previous + # instruction, but that only works if x is a temporary. If it is input or + # constant just convert it to add instead. 
+ (('fmul', 'a(is_ubo_or_input)', 2.0), ('fadd', a, a)), +] + +for multiplier in [2.0, 4.0, 8.0, 16.0, 0.5, 0.25, 0.125, 0.0625]: + r300_nir_prepare_presubtract.extend([ + (('fmul', a, ('fmul(is_used_once)', 'b(is_ubo_or_input)', multiplier)), ('fmul', multiplier, ('fmul', a, b))), +]) + +# Previous prepare_presubtract pass can sometimes produce double fneg patterns. +# The backend copy propagate could handle it, but the nir to tgsi translation +# does not and blows up. Just run a simple pass to clean it up. +r300_nir_clean_double_fneg = [ + (('fneg', ('fneg', a)), a) +] + def main(): parser = argparse.ArgumentParser() parser.add_argument('-p', '--import-path', required=True) @@ -51,9 +102,25 @@ def main(): sys.path.insert(0, args.import_path) import nir_algebraic # pylint: disable=import-error + ignore_exact = nir_algebraic.ignore_exact + + r300_nir_lower_bool_to_float = [ + (('bcsel@32(is_only_used_as_float)', ignore_exact('feq', 'a@32', 'b@32'), c, d), + ('fadd', ('fmul', c, ('seq', a, b)), ('fsub', d, ('fmul', d, ('seq', a, b)))), + "!options->has_fused_comp_and_csel"), + (('bcsel@32(is_only_used_as_float)', ignore_exact('fneu', 'a@32', 'b@32'), c, d), + ('fadd', ('fmul', c, ('sne', a, b)), ('fsub', d, ('fmul', d, ('sne', a, b)))), + "!options->has_fused_comp_and_csel"), + (('bcsel@32(is_only_used_as_float)', ignore_exact('flt', 'a@32', 'b@32'), c, d), + ('fadd', ('fmul', c, ('slt', a, b)), ('fsub', d, ('fmul', d, ('slt', a, b)))), + "!options->has_fused_comp_and_csel"), + (('bcsel@32(is_only_used_as_float)', ignore_exact('fge', 'a@32', 'b@32'), c, d), + ('fadd', ('fmul', c, ('sge', a, b)), ('fsub', d, ('fmul', d, ('sge', a, b)))), + "!options->has_fused_comp_and_csel"), +] with open(args.output, 'w') as f: - f.write('#include "r300_vs.h"') + f.write('#include "compiler/r300_nir.h"') f.write(nir_algebraic.AlgebraicPass("r300_transform_vs_trig_input", transform_trig_input_vs_r500).render()) @@ -61,6 +128,17 @@ def main(): 
f.write(nir_algebraic.AlgebraicPass("r300_transform_fs_trig_input", transform_trig_input_fs_r500).render()) + f.write(nir_algebraic.AlgebraicPass("r300_nir_fuse_fround_d3d9", + r300_nir_fuse_fround_d3d9).render()) + + f.write(nir_algebraic.AlgebraicPass("r300_nir_lower_bool_to_float", + r300_nir_lower_bool_to_float).render()) + + f.write(nir_algebraic.AlgebraicPass("r300_nir_prepare_presubtract", + r300_nir_prepare_presubtract).render()) + + f.write(nir_algebraic.AlgebraicPass("r300_nir_clean_double_fneg", + r300_nir_clean_double_fneg).render()) if __name__ == '__main__': main() diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c index 9f058e781..676809152 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c @@ -30,6 +30,8 @@ #include "radeon_program_tex.h" #include "radeon_rename_regs.h" #include "radeon_remove_constants.h" +#include "radeon_variable.h" +#include "radeon_list.h" #include "r300_fragprog.h" #include "r300_fragprog_swizzle.h" #include "r500_fragprog.h" @@ -65,6 +67,48 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user) } } +/** + * This function will try to convert rgb instructions into alpha instructions + * and vice versa. While this is already attempted during the pair scheduling, + * it is much simpler to do it before pair conversion, so do it here at least for + * the simple cases. + * + * Currently only math opcodes writing to rgb (and with no friends) are + * converted to alpha. + * + * This function assumes all the instructions are still of type + * RC_INSTRUCTION_NORMAL, the conversion is much simpler. + * + * Beware that this needs to be also called before doing presubtract, because + * rc_get_variables can't get properly readers for normal instructions if presubtract + * is present (it works fine for pair instructions). 
+ */ +static void rc_convert_rgb_alpha(struct radeon_compiler *c, void *user) +{ + struct rc_list * variables; + struct rc_list * var_ptr; + + variables = rc_get_variables(c); + + for (var_ptr = variables; var_ptr; var_ptr = var_ptr->Next) { + struct rc_variable * var = var_ptr->Item; + + if (var->Inst->U.I.DstReg.File != RC_FILE_TEMPORARY) { + continue; + } + + /* Only rewrite scalar opcodes that are used separately for now. */ + if (var->Friend) + continue; + + const struct rc_opcode_info * opcode = rc_get_opcode_info(var->Inst->U.I.Opcode); + if (opcode->IsStandardScalar && var->Dst.WriteMask != RC_MASK_W) { + unsigned index = rc_find_free_temporary(c); + rc_variable_change_dst(var, index, RC_MASK_W); + } + } +} + void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) { int is_r500 = c->Base.is_r500; @@ -85,14 +129,12 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) struct radeon_program_transformation native_rewrite_r500[] = { { &radeonTransformALU, NULL }, { &radeonTransformDeriv, NULL }, - { &radeonTransformTrigScale, NULL }, { NULL, NULL } }; struct radeon_program_transformation native_rewrite_r300[] = { { &radeonTransformALU, NULL }, { &radeonStubDeriv, NULL }, - { &r300_transform_trig_simple, NULL }, { NULL, NULL } }; @@ -106,6 +148,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) {"native rewrite", 1, is_r500, rc_local_transform, native_rewrite_r500}, {"native rewrite", 1, !is_r500, rc_local_transform, native_rewrite_r300}, {"deadcode", 1, opt, rc_dataflow_deadcode, NULL}, + {"convert rgb<->alpha", 1, opt, rc_convert_rgb_alpha, NULL}, {"register rename", 1, !is_r500 || opt, rc_rename_regs, NULL}, {"dataflow optimize", 1, opt, rc_optimize, NULL}, {"inline literals", 1, is_r500 && opt, rc_inline_literals, NULL}, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index f322785ab..a02147a82 
100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -237,6 +237,36 @@ static void ei_math1(struct r300_vertex_program_code *vp, inst[3] = __CONST(0, RC_SWIZZLE_ZERO); } +static void ei_cmp(struct r300_vertex_program_code *vp, + struct rc_sub_instruction *vpi, + unsigned int * inst) +{ + inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE, + 0, + 0, + t_dst_index(vp, &vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask), + t_dst_class(vpi->DstReg.File), + vpi->SaturateMode == RC_SATURATE_ZERO_ONE); + + /* Arguments with constant swizzles still count as a unique + * temporary, so we should make sure these arguments share a + * register index with one of the other arguments. */ + for (unsigned i = 0; i < 3; i++) { + unsigned j = (i + 1) % 3; + if (vpi->SrcReg[i].File == RC_FILE_NONE && + (vpi->SrcReg[j].File == RC_FILE_NONE || + vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) { + vpi->SrcReg[i].Index = vpi->SrcReg[j].Index; + break; + } + } + + inst[1] = t_src(vp, &vpi->SrcReg[0]); + inst[2] = t_src(vp, &vpi->SrcReg[2]); + inst[3] = t_src(vp, &vpi->SrcReg[1]); +} + static void ei_lit(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int * inst) @@ -414,6 +444,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break; case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; + case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break; case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; @@ -471,11 +502,15 @@ static void translate_vertex_program(struct radeon_compiler *c, 
void *user) "Too many flow control instructions."); return; } + /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here + * we reduce it to half to avoid occasional hangs on RV516 + * and downclocked RV530. + */ if (compiler->Base.is_r500) { compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].lw = R500_PVS_FC_ACT_ADRS(act_addr) - | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) + | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080) ; compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].uw = @@ -805,18 +840,12 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) int opt = !c->Base.disable_optimizations; /* Lists of instruction transformations. */ - struct radeon_program_transformation alu_rewrite_r500[] = { - { &r300_transform_vertex_alu, NULL }, - { NULL, NULL } - }; - - struct radeon_program_transformation alu_rewrite_r300[] = { + struct radeon_program_transformation alu_rewrite[] = { { &r300_transform_vertex_alu, NULL }, - { &r300_transform_trig_simple, NULL }, { NULL, NULL } }; - /* Note: These passes have to be done seperately from ALU rewrite, + /* Note: These passes have to be done separately from ALU rewrite, * otherwise non-native ALU instructions with source conflits * or non-native modifiers will not be treated properly. 
*/ @@ -834,8 +863,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) struct radeon_compiler_pass vs_list[] = { /* NAME DUMP PREDICATE FUNCTION PARAM */ {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, - {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, - {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300}, + {"native rewrite", 1, 1, rc_local_transform, alu_rewrite}, {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers}, {"deadcode", 1, opt, rc_dataflow_deadcode, NULL}, {"dataflow optimize", 1, opt, rc_optimize, NULL}, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c b/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c index 258b873d8..28c05ada6 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c @@ -281,8 +281,10 @@ static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex); code->inst[ip].inst5 |= R500_ALU_RGBA_ADDRD(inst->RGB.DestIndex); - use_temporary(code, inst->Alpha.DestIndex); - use_temporary(code, inst->RGB.DestIndex); + if (inst->Alpha.WriteMask) + use_temporary(code, inst->Alpha.DestIndex); + if (inst->RGB.WriteMask) + use_temporary(code, inst->RGB.DestIndex); if (inst->RGB.Saturate) code->inst[ip].inst0 |= R500_INST_RGB_CLAMP; diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c index ab36513e0..fddc23702 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c @@ -357,17 +357,24 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s) { struct rc_instruction * tmp; memset(s, 0, sizeof(*s)); + unsigned ip = 0; + int last_begintex = -1; for(tmp = 
c->Program.Instructions.Next; tmp != &c->Program.Instructions; - tmp = tmp->Next){ + tmp = tmp->Next, ip++){ const struct rc_opcode_info * info; rc_for_all_reads_mask(tmp, reg_count_callback, s); if (tmp->Type == RC_INSTRUCTION_NORMAL) { info = rc_get_opcode_info(tmp->U.I.Opcode); - if (info->Opcode == RC_OPCODE_BEGIN_TEX) + if (info->Opcode == RC_OPCODE_BEGIN_TEX) { + /* The R5xx docs mention ~30 cycles in section 8.3.1 */ + s->num_cycles += 30; + last_begintex = ip; continue; - if (tmp->U.I.PreSub.Opcode != RC_PRESUB_NONE) - s->num_presub_ops++; + } + if (info->Opcode == RC_OPCODE_MAD && + rc_inst_has_three_diff_temp_srcs(tmp)) + s->num_cycles++; } else { if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used) s->num_presub_ops++; @@ -387,6 +394,15 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s) tmp->U.P.Alpha.Omod != RC_OMOD_DISABLE) { s->num_omod_ops++; } + if (tmp->U.P.Nop) + s->num_cycles++; + /* SemWait has effect only on R500, the more instructions we can put + * between the tex block and the first texture semaphore, the better. + */ + if (tmp->U.P.SemWait && c->is_r500 && last_begintex != -1) { + s->num_cycles -= MIN2(30, ip - last_begintex); + last_begintex = -1; + } info = rc_get_opcode_info(tmp->U.P.RGB.Opcode); } if (info->IsFlowControl) { @@ -402,6 +418,7 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s) if (info->HasTexture) s->num_tex_insts++; s->num_insts++; + s->num_cycles++; } /* Increment here because the reg_count_callback store the max * temporary reg index in s->nun_temp_regs. */ @@ -415,14 +432,17 @@ static void print_stats(struct radeon_compiler * c) rc_get_stats(c, &s); /* Note that we print some dummy values for instruction categories that - * only the FS has, becasue shader-db's report.py wants all shaders to + * only the FS has, because shader-db's report.py wants all shaders to * have the same set. 
*/ - util_debug_message(c->debug, SHADER_INFO, "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol, %u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits", + util_debug_message(c->debug, SHADER_INFO, + "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol, " + "%u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits, %u cycles", c->type == RC_VERTEX_PROGRAM ? "VS" : "FS", s.num_insts, s.num_rgb_insts, s.num_alpha_insts, s.num_pred_insts, s.num_fc_insts, s.num_loops, s.num_tex_insts, s.num_presub_ops, - s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals); + s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals, + s.num_cycles); } static const char *shader_name[RC_NUM_PROGRAM_TYPES] = { diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h index 100f43423..0e4321fae 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h @@ -56,7 +56,6 @@ struct radeon_compiler { unsigned has_presub:1; unsigned has_omod:1; unsigned disable_optimizations:1; - unsigned needs_trig_input_transform:1; unsigned max_temp_regs; unsigned max_constants; int max_alu_insts; @@ -148,6 +147,7 @@ struct radeon_compiler_pass { }; struct rc_program_stats { + unsigned num_cycles; unsigned num_consts; unsigned num_insts; unsigned num_fc_insts; diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c index 2a2542a47..17cb498b1 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c @@ -758,3 +758,13 @@ unsigned int rc_get_scalar_src_swz(unsigned int swizzle) assert(swz != RC_SWIZZLE_UNUSED); return swz; } + +bool rc_inst_has_three_diff_temp_srcs(struct rc_instruction 
*inst) +{ + return (inst->U.I.SrcReg[0].File == RC_FILE_TEMPORARY && + inst->U.I.SrcReg[1].File == RC_FILE_TEMPORARY && + inst->U.I.SrcReg[2].File == RC_FILE_TEMPORARY && + inst->U.I.SrcReg[0].Index != inst->U.I.SrcReg[1].Index && + inst->U.I.SrcReg[1].Index != inst->U.I.SrcReg[2].Index && + inst->U.I.SrcReg[0].Index != inst->U.I.SrcReg[2].Index); +} diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h index 7c1d6bbc9..c16f768e8 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h @@ -30,6 +30,8 @@ #ifndef RADEON_PROGRAM_UTIL_H #define RADEON_PROGRAM_UTIL_H +#include <stdbool.h> + #include "radeon_opcodes.h" struct radeon_compiler; @@ -126,4 +128,5 @@ float rc_get_constant_value( unsigned int rc_get_scalar_src_swz(unsigned int swizzle); +bool rc_inst_has_three_diff_temp_srcs(struct rc_instruction *inst); #endif /* RADEON_PROGRAM_UTIL_H */ diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h index 0c7bf8adf..09e0a9608 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h @@ -99,7 +99,7 @@ struct rc_reader_data { unsigned int ReadersReserved; struct rc_reader * Readers; - /* If this flag is enabled, rc_get_readers will exit as soon possbile + /* If this flag is enabled, rc_get_readers will exit as soon possible * after the Abort flag is set.*/ unsigned int ExitOnAbort; void * CbData; diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c index 7ca4cdfef..1458d03aa 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c @@ -61,13 +61,6 @@ const struct 
rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .HasDstReg = 1 }, { - .Opcode = RC_OPCODE_CEIL, - .Name = "CEIL", - .NumSrcRegs = 1, - .HasDstReg = 1, - .IsComponentwise = 1 - }, - { .Opcode = RC_OPCODE_CMP, .Name = "CMP", .NumSrcRegs = 3, @@ -140,13 +133,6 @@ const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .HasDstReg = 1 }, { - .Opcode = RC_OPCODE_FLR, - .Name = "FLR", - .NumSrcRegs = 1, - .HasDstReg = 1, - .IsComponentwise = 1 - }, - { .Opcode = RC_OPCODE_FRC, .Name = "FRC", .NumSrcRegs = 1, @@ -297,13 +283,6 @@ const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .IsComponentwise = 1 }, { - .Opcode = RC_OPCODE_SSG, - .Name = "SSG", - .NumSrcRegs = 1, - .HasDstReg = 1, - .IsComponentwise = 1 - }, - { .Opcode = RC_OPCODE_SUB, .Name = "SUB", .NumSrcRegs = 2, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h index acce9f527..88d6f212b 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h @@ -48,9 +48,6 @@ typedef enum { * dst.x = round(src.x), where dst must be an address register */ RC_OPCODE_ARR, - /** vec4 instruction: dst.c = ceil(src0.c) */ - RC_OPCODE_CEIL, - /** vec4 instruction: dst.c = src0.c < 0.0 ? src1.c : src2.c */ RC_OPCODE_CMP, @@ -86,9 +83,6 @@ typedef enum { /** special instruction, see ARB_vertex_program */ RC_OPCODE_EXP, - /** vec4 instruction: dst.c = floor(src0.c) */ - RC_OPCODE_FLR, - /** vec4 instruction: dst.c = src0.c - floor(src0.c) */ RC_OPCODE_FRC, @@ -155,9 +149,6 @@ typedef enum { /** vec4 instruction: dst.c = (src0.c != src1.c) ? 1.0 : 0.0 */ RC_OPCODE_SNE, - /** vec4 instruction: dst.c = (src0.c < 0 ?) 
-1 : ((src0.c > 0) : 1 : 0) */ - RC_OPCODE_SSG, - /** vec4 instruction: dst.c = src0.c - src1.c */ RC_OPCODE_SUB, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c index 02a937c69..fc475f135 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c @@ -242,82 +242,6 @@ static int is_src_uniform_constant(struct rc_src_register src, return 1; } -static void constant_folding_mad(struct rc_instruction * inst) -{ - rc_swizzle swz = 0; - unsigned int negate= 0; - - if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) { - if (swz == RC_SWIZZLE_ZERO) { - inst->U.I.Opcode = RC_OPCODE_MUL; - return; - } - } - - if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) { - if (swz == RC_SWIZZLE_ONE) { - inst->U.I.Opcode = RC_OPCODE_ADD; - if (negate) - inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW; - inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2]; - return; - } else if (swz == RC_SWIZZLE_ZERO) { - inst->U.I.Opcode = RC_OPCODE_MOV; - inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; - return; - } - } - - if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) { - if (swz == RC_SWIZZLE_ONE) { - inst->U.I.Opcode = RC_OPCODE_ADD; - if (negate) - inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; - inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; - return; - } else if (swz == RC_SWIZZLE_ZERO) { - inst->U.I.Opcode = RC_OPCODE_MOV; - inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; - return; - } - } -} - -static void constant_folding_mul(struct rc_instruction * inst) -{ - rc_swizzle swz = 0; - unsigned int negate = 0; - - if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) { - if (swz == RC_SWIZZLE_ONE) { - inst->U.I.Opcode = RC_OPCODE_MOV; - inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; - if (negate) - inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW; - return; - } else if (swz == RC_SWIZZLE_ZERO) { - 
inst->U.I.Opcode = RC_OPCODE_MOV; - inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000; - inst->U.I.SrcReg[0].File = RC_FILE_NONE; - return; - } - } - - if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) { - if (swz == RC_SWIZZLE_ONE) { - inst->U.I.Opcode = RC_OPCODE_MOV; - if (negate) - inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW; - return; - } else if (swz == RC_SWIZZLE_ZERO) { - inst->U.I.Opcode = RC_OPCODE_MOV; - inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000; - inst->U.I.SrcReg[0].File = RC_FILE_NONE; - return; - } - } -} - static void constant_folding_add(struct rc_instruction * inst) { rc_swizzle swz = 0; @@ -420,14 +344,8 @@ static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst->U.I.SrcReg[src] = newsrc; } - /* Simplify instructions based on constants */ - if (inst->U.I.Opcode == RC_OPCODE_MAD) - constant_folding_mad(inst); - - /* note: MAD can simplify to MUL or ADD */ - if (inst->U.I.Opcode == RC_OPCODE_MUL) - constant_folding_mul(inst); - else if (inst->U.I.Opcode == RC_OPCODE_ADD) + if (c->type == RC_FRAGMENT_PROGRAM && + inst->U.I.Opcode == RC_OPCODE_ADD) constant_folding_add(inst); /* In case this instruction has been converted, make sure all of the @@ -563,7 +481,7 @@ static int is_presub_candidate( unsigned int i; unsigned int is_constant[2] = {0, 0}; - assert(inst->U.I.Opcode == RC_OPCODE_ADD); + assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD); if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode @@ -572,7 +490,7 @@ static int is_presub_candidate( return 0; } - /* If both sources use a constant swizzle, then we can't convert it to + /* If first two sources use a constant swizzle, then we can't convert it to * a presubtract operation. In fact for the ADD and SUB presubtract * operations neither source can contain a constant swizzle. 
This * specific case is checked in peephole_add_presub_add() when @@ -655,10 +573,27 @@ static void presub_replace_inv( inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV; } +static void presub_replace_bias( + struct rc_instruction * inst_mad, + struct rc_instruction * inst_reader, + unsigned int src_index) +{ + /* We must be careful not to modify inst_mad, since it + * is possible it will remain part of the program.*/ + inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0]; + inst_reader->U.I.PreSub.SrcReg[0].Negate = 0; + inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS; + inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index], + inst_reader->U.I.PreSub.SrcReg[0]); + + inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB; + inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS; +} + /** * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1] * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source - * of the add instruction must have the constatnt 1 swizzle. This function + * of the add instruction must have the constant 1 swizzle. This function * does not check const registers to see if their value is 1.0, so it should * be called after the constant_folding optimization. * @return @@ -690,8 +625,6 @@ static int peephole_add_presub_inv( if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) != inst_add->U.I.DstReg.WriteMask || inst_add->U.I.SrcReg[1].Abs - || (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY - && inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT) || src_has_const_swz(inst_add->U.I.SrcReg[1])) { return 0; @@ -704,6 +637,66 @@ static int peephole_add_presub_inv( return 0; } +/** + * PRESUB_BIAS: MAD TEMP[0], -TEMP[1], 2.0, none.1 + * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The third source + * of the MAD instruction must have the constant 1 swizzle. 
This function + * does not check const registers to see if their value is 1.0, so it should + * be called after the constant_folding optimization. + * @return + * 0 if the MAD instruction is still part of the program. + * 1 if the MAD instruction is no longer part of the program. + */ +static int peephole_mad_presub_bias( + struct radeon_compiler * c, + struct rc_instruction * inst_mad) +{ + unsigned int i, swz; + + if (!is_presub_candidate(c, inst_mad)) + return 0; + + /* Check if src2 is 1. */ + for(i = 0; i < 4; i++ ) { + if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i))) + continue; + + swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i); + if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i)) + return 0; + } + + /* Check if src1 is an immediate 2.0; only a directly addressed RC_FILE_CONSTANT source may index the constant table. */ + struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1]; + if (src1_reg.File != RC_FILE_CONSTANT || src1_reg.RelAddr || (src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs) + return 0; + struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index]; + if (constant->Type != RC_CONSTANT_IMMEDIATE) + return 0; + for (i = 0; i < 4; i++) { + if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i))) + continue; + swz = GET_SWZ(src1_reg.Swizzle, i); + if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0) + return 0; + } + + /* Check src0. 
*/ + if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) != + inst_mad->U.I.DstReg.WriteMask + || inst_mad->U.I.SrcReg[0].Abs + || src_has_const_swz(inst_mad->U.I.SrcReg[0])) { + + return 0; + } + + if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) { + rc_remove_instruction(inst_mad); + return 1; + } + return 0; +} + struct peephole_mul_cb_data { struct rc_dst_register * Writer; unsigned int Clobbered; @@ -891,15 +884,24 @@ static int peephole_mul_omod( */ static int peephole(struct radeon_compiler * c, struct rc_instruction * inst) { - switch(inst->U.I.Opcode){ + if (!c->has_presub) + return 0; + + switch(inst->U.I.Opcode) { case RC_OPCODE_ADD: - if (c->has_presub) { - if(peephole_add_presub_inv(c, inst)) - return 1; - if(peephole_add_presub_add(c, inst)) - return 1; - } + { + if (peephole_add_presub_inv(c, inst)) + return 1; + if (peephole_add_presub_add(c, inst)) + return 1; + break; + } + case RC_OPCODE_MAD: + { + if (peephole_mad_presub_bias(c, inst)) + return 1; break; + } default: break; } @@ -1331,6 +1333,126 @@ static void merge_channels(struct radeon_compiler * c, struct rc_instruction * i } } +/** + * Searches for duplicate ARLs/ARRs + * + * Only a very trivial case is now optimized where if a second one is detected which reads from + * the same register as the first one and source is the same, just remove the second one. + */ +static void merge_A0_loads( + struct radeon_compiler * c, + struct rc_instruction * inst, + bool is_ARL) +{ + unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index; + unsigned int A0_src_file = inst->U.I.SrcReg[0].File; + unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle; + int cf_depth = 0; + + struct rc_instruction * cur = inst; + while (cur != &c->Program.Instructions) { + cur = cur->Next; + const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode); + + /* Keep it simple for now and stop when encountering any + * control flow besides simple ifs. 
+ */ + if (opcode->IsFlowControl) { + switch (cur->U.I.Opcode) { + case RC_OPCODE_IF: + { + cf_depth++; + break; + } + case RC_OPCODE_ELSE: + { + if (cf_depth < 1) + return; + break; + } + case RC_OPCODE_ENDIF: + { + cf_depth--; + break; + } + default: + return; + } + } + + /* Stop when a component read by the original A0 load is overwritten */ + if (A0_src_reg == cur->U.I.DstReg.Index && + A0_src_file == cur->U.I.DstReg.File && + (cur->U.I.DstReg.WriteMask & rc_swizzle_to_writemask(A0_src_swizzle))) + return; + + /* Wrong A0 load type. */ + if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) || + (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL)) + return; + + if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) { + if (A0_src_reg == cur->U.I.SrcReg[0].Index && + A0_src_file == cur->U.I.SrcReg[0].File && + A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) { + struct rc_instruction * prev = cur->Prev; /* resume from the predecessor so the instruction after the removed load is still examined */ + rc_remove_instruction(cur); + cur = prev; + } else { + return; + } + } + } +} + +/** + * According to the GLSL spec, round is only 1.30 and up + * so the only reason why we should ever see round is if it actually + * is lowered ARR (from nine->ttn). In that case we want to reconstruct + * the ARR instead of lowering the round. + */ +static void transform_vertex_ROUND(struct radeon_compiler* c, + struct rc_instruction* inst) +{ + struct rc_reader_data readers; + rc_get_readers(c, inst, &readers, NULL, NULL, NULL); + + assert(readers.ReaderCount > 0); + for (unsigned i = 0; i < readers.ReaderCount; i++) { + struct rc_instruction *reader = readers.Readers[i].Inst; + if (reader->U.I.Opcode != RC_OPCODE_ARL) { + assert(!"Unable to convert ROUND+ARL to ARR\n"); + return; + } + } + + /* Only ARL readers, convert all to ARR */ + for (unsigned i = 0; i < readers.ReaderCount; i++) { + readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR; + } + /* Switch ROUND to MOV and let copy propagate sort it out later. 
*/ + inst->U.I.Opcode = RC_OPCODE_MOV; +} + +/** + * Apply various optimizations specific to the A0 address register loads. + */ +static void optimize_A0_loads(struct radeon_compiler * c) { + struct rc_instruction * inst = c->Program.Instructions.Next; + + while (inst != &c->Program.Instructions) { + struct rc_instruction * cur = inst; + inst = inst->Next; + if (cur->U.I.Opcode == RC_OPCODE_ARL) { + merge_A0_loads(c, cur, true); + } else if (cur->U.I.Opcode == RC_OPCODE_ARR) { + merge_A0_loads(c, cur, false); + } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) { + transform_vertex_ROUND(c, cur); + } + } +} + void rc_optimize(struct radeon_compiler * c, void *user) { struct rc_instruction * inst = c->Program.Instructions.Next; @@ -1350,8 +1472,12 @@ void rc_optimize(struct radeon_compiler * c, void *user) } } + if (c->type == RC_VERTEX_PROGRAM) { + optimize_A0_loads(c); + } + /* Merge MOVs to same source in different channels using the constant - * swizzles. + * swizzle. */ if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) { inst = c->Program.Instructions.Next; @@ -1376,6 +1502,10 @@ void rc_optimize(struct radeon_compiler * c, void *user) } } + if (c->type != RC_FRAGMENT_PROGRAM) { + return; + } + /* Presubtract operations. */ inst = c->Program.Instructions.Next; while(inst != &c->Program.Instructions) { @@ -1384,10 +1514,7 @@ void rc_optimize(struct radeon_compiler * c, void *user) peephole(c, cur); } - if (!c->has_omod) { - return; - } - + /* Output modifiers. 
*/ inst = c->Program.Instructions.Next; struct rc_list * var_list = NULL; while(inst != &c->Program.Instructions) { diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c index e232e93f0..428bf471c 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c @@ -494,7 +494,7 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo * dst_full is an rgb instruction, meaning that it has a vector instruction(rgb) * but no scalar instruction (alpha). * @return 0 if merging the presubtract sources fails. - * @retrun 1 if merging the presubtract sources succeeds. + * @return 1 if merging the presubtract sources succeeds. */ static int merge_presub_sources( struct rc_pair_instruction * dst_full, @@ -571,7 +571,7 @@ static int merge_presub_sources( for(arg = 0; arg < info->NumSrcRegs; arg++) { /* If the arg does read both from rgb and alpha, then we need to rewrite * both sources and the code currently doesn't handle this. - * FIXME: This is definitelly solvable, however shader-db shows it is + * FIXME: This is definitely solvable, however shader-db shows it is * not worth the effort. */ if (rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle) & RC_SOURCE_ALPHA && @@ -844,7 +844,7 @@ static void is_rgb_to_alpha_possible( } /* Make sure the source only reads the register component that we - * are going to be convering from. It is OK if the instruction uses + * are going to be converting from. It is OK if the instruction uses * this component more than once. * XXX If the index we will be converting to is the same as the * current index, then it is OK to read from more than one component. 
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h index 67be1b9f2..41af9815f 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h @@ -42,7 +42,7 @@ struct rc_src_register { unsigned int File:4; /** Negative values may be used for relative addressing. */ - signed int Index:(RC_REGISTER_INDEX_BITS+1); + unsigned int Index:RC_REGISTER_INDEX_BITS; unsigned int RelAddr:1; unsigned int Swizzle:12; diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c index c6d682b40..a56d81c62 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c @@ -37,6 +37,7 @@ #include "radeon_compiler.h" #include "radeon_compiler_util.h" +#include "radeon_dataflow.h" #include "util/log.h" @@ -117,12 +118,6 @@ static const struct rc_src_register builtin_one = { .Swizzle = RC_SWIZZLE_1111 }; -static const struct rc_src_register builtin_half = { - .File = RC_FILE_NONE, - .Index = 0, - .Swizzle = RC_SWIZZLE_HHHH -}; - static const struct rc_src_register srcreg_undefined = { .File = RC_FILE_NONE, .Index = 0, @@ -202,26 +197,6 @@ static struct rc_dst_register new_dst_reg(struct radeon_compiler *c, return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask); } -static void transform_CEIL(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - /* Assuming: - * ceil(x) = -floor(-x) - * - * After inlining floor: - * ceil(x) = -(-x-frac(-x)) - * - * After simplification: - * ceil(x) = x+frac(-x) - */ - - struct rc_dst_register dst = new_dst_reg(c, inst); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, negate(inst->U.I.SrcReg[0])); - emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg, - inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index)); - 
rc_remove_instruction(inst); -} - static void transform_DP2(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -237,29 +212,6 @@ static void transform_DP2(struct radeon_compiler* c, rc_remove_instruction(inst); } -/** - * [1, src0.y*src1.y, src0.z, src1.w] - * So basically MUL with lotsa swizzling. - */ -static void transform_DST(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg, - swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE), - swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W)); - rc_remove_instruction(inst); -} - -static void transform_FLR(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - struct rc_dst_register dst = new_dst_reg(c, inst); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, inst->U.I.SrcReg[0]); - emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg, - inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index))); - rc_remove_instruction(inst); -} - static void transform_TRUNC(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -296,89 +248,6 @@ static void transform_TRUNC(struct radeon_compiler* c, rc_remove_instruction(inst); } -/** - * Definition of LIT (from ARB_fragment_program): - * - * tmp = VectorLoad(op0); - * if (tmp.x < 0) tmp.x = 0; - * if (tmp.y < 0) tmp.y = 0; - * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); - * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; - * result.x = 1.0; - * result.y = tmp.x; - * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; - * result.w = 1.0; - * - * The longest path of computation is the one leading to result.z, - * consisting of 5 operations. This implementation of LIT takes - * 5 slots, if the subsequent optimization passes are clever enough - * to pair instructions correctly. 
- */ -static void transform_LIT(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - unsigned int constant; - unsigned int constant_swizzle; - unsigned int temp; - struct rc_src_register srctemp; - - constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle); - - if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) { - struct rc_instruction * inst_mov; - - inst_mov = emit1(c, inst, - RC_OPCODE_MOV, NULL, inst->U.I.DstReg, - srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c))); - - inst->U.I.DstReg.File = RC_FILE_TEMPORARY; - inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index; - inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; - } - - temp = inst->U.I.DstReg.Index; - srctemp = srcreg(RC_FILE_TEMPORARY, temp); - - /* tmp.x = max(0.0, Src.x); */ - /* tmp.y = max(0.0, Src.y); */ - /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */ - emit2(c, inst->Prev, RC_OPCODE_MAX, NULL, - dstregtmpmask(temp, RC_MASK_XYW), - inst->U.I.SrcReg[0], - swizzle(srcreg(RC_FILE_CONSTANT, constant), - RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3)); - emit2(c, inst->Prev, RC_OPCODE_MIN, NULL, - dstregtmpmask(temp, RC_MASK_Z), - swizzle_wwww(srctemp), - negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle))); - - /* tmp.w = Pow(tmp.y, tmp.w) */ - emit1(c, inst->Prev, RC_OPCODE_LG2, NULL, - dstregtmpmask(temp, RC_MASK_W), - swizzle_yyyy(srctemp)); - emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, - dstregtmpmask(temp, RC_MASK_W), - swizzle_wwww(srctemp), - swizzle_zzzz(srctemp)); - emit1(c, inst->Prev, RC_OPCODE_EX2, NULL, - dstregtmpmask(temp, RC_MASK_W), - swizzle_wwww(srctemp)); - - /* tmp.z = (tmp.x > 0) ? 
tmp.w : 0.0 */ - emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, - dstregtmpmask(temp, RC_MASK_Z), - negate(swizzle_xxxx(srctemp)), - swizzle_wwww(srctemp), - builtin_zero); - - /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */ - emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, - dstregtmpmask(temp, RC_MASK_XYW), - swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE)); - - rc_remove_instruction(inst); -} - static void transform_LRP(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -394,58 +263,6 @@ static void transform_LRP(struct radeon_compiler* c, rc_remove_instruction(inst); } -static void transform_POW(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - struct rc_dst_register tempdst = new_dst_reg(c, inst); - struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index); - tempdst.WriteMask = RC_MASK_W; - tempsrc.Swizzle = RC_SWIZZLE_WWWW; - - emit1(c, inst->Prev, RC_OPCODE_LG2, NULL, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0])); - emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1])); - emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc); - - rc_remove_instruction(inst); -} - -/* dst = ROUND(src) : - * add = src + .5 - * frac = FRC(add) - * dst = add - frac - * - * According to the GLSL spec, the implementor can decide which way to round - * when the fraction is .5. We round down for .5. 
- * - */ -static void transform_ROUND(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - unsigned int mask = inst->U.I.DstReg.WriteMask; - unsigned int frac_index, add_index; - struct rc_dst_register frac_dst, add_dst; - struct rc_src_register frac_src, add_src; - - /* add = src + .5 */ - add_index = rc_find_free_temporary(c); - add_dst = dstregtmpmask(add_index, mask); - emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, add_dst, inst->U.I.SrcReg[0], - builtin_half); - add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index); - - - /* frac = FRC(add) */ - frac_index = rc_find_free_temporary(c); - frac_dst = dstregtmpmask(frac_index, mask); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, frac_dst, add_src); - frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index); - - /* dst = add - frac */ - emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, inst->U.I.DstReg, - add_src, negate(frac_src)); - rc_remove_instruction(inst); -} - static void transform_RSQ(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -524,44 +341,6 @@ static void transform_SNE(struct radeon_compiler* c, rc_remove_instruction(inst); } -static void transform_SSG(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - /* result = sign(x) - * - * CMP tmp0, -x, 1, 0 - * CMP tmp1, x, 1, 0 - * ADD result, tmp0, -tmp1; - */ - struct rc_dst_register dst0; - unsigned tmp1; - - /* 0 < x */ - dst0 = new_dst_reg(c, inst); - emit3(c, inst->Prev, RC_OPCODE_CMP, NULL, - dst0, - negate(inst->U.I.SrcReg[0]), - builtin_one, - builtin_zero); - - /* x < 0 */ - tmp1 = rc_find_free_temporary(c); - emit3(c, inst->Prev, RC_OPCODE_CMP, NULL, - dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask), - inst->U.I.SrcReg[0], - builtin_one, - builtin_zero); - - /* Either both are zero, or one of them is one and the other is zero. 
*/ - /* result = tmp0 - tmp1 */ - emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, - inst->U.I.DstReg, - srcreg(RC_FILE_TEMPORARY, dst0.Index), - negate(srcreg(RC_FILE_TEMPORARY, tmp1))); - - rc_remove_instruction(inst); -} - static void transform_SUB(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -581,7 +360,7 @@ static void transform_KILP(struct radeon_compiler * c, * no userData necessary. * * Eliminates the following ALU instructions: - * CEIL, DST, FLR, LIT, LRP, POW, SEQ, SGE, SGT, SLE, SLT, SNE, SUB + * LRP, SEQ, SGE, SGT, SLE, SLT, SNE, SUB * using: * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP * @@ -596,15 +375,9 @@ int radeonTransformALU( void* unused) { switch(inst->U.I.Opcode) { - case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1; case RC_OPCODE_DP2: transform_DP2(c, inst); return 1; - case RC_OPCODE_DST: transform_DST(c, inst); return 1; - case RC_OPCODE_FLR: transform_FLR(c, inst); return 1; case RC_OPCODE_KILP: transform_KILP(c, inst); return 1; - case RC_OPCODE_LIT: transform_LIT(c, inst); return 1; case RC_OPCODE_LRP: transform_LRP(c, inst); return 1; - case RC_OPCODE_POW: transform_POW(c, inst); return 1; - case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1; case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1; case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1; case RC_OPCODE_SGE: transform_SGE(c, inst); return 1; @@ -612,7 +385,6 @@ int radeonTransformALU( case RC_OPCODE_SLE: transform_SLE(c, inst); return 1; case RC_OPCODE_SLT: transform_SLT(c, inst); return 1; case RC_OPCODE_SNE: transform_SNE(c, inst); return 1; - case RC_OPCODE_SSG: transform_SSG(c, inst); return 1; case RC_OPCODE_SUB: transform_SUB(c, inst); return 1; case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1; default: @@ -623,7 +395,12 @@ int radeonTransformALU( static void transform_r300_vertex_CMP(struct radeon_compiler* c, struct rc_instruction* inst) { - /* There is no decent CMP available, so let's rig one up. 
+ /* R5xx has a CMP, but we can use it only if it reads from less than + * three different temps. */ + if (c->is_r500 && !rc_inst_has_three_diff_temp_srcs(inst)) + return; + + /* There is no decent CMP available on r300, so let's rig one up. * CMP is defined as dst = src0 < 0.0 ? src1 : src2 * The following sequence consumes zero to two temps and two extra slots * (the second temp and the second slot is consumed by transform_LRP), @@ -768,42 +545,6 @@ static void transform_r300_vertex_SLE(struct radeon_compiler* c, inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; } -static void transform_r300_vertex_SSG(struct radeon_compiler* c, - struct rc_instruction* inst) -{ - /* result = sign(x) - * - * SLT tmp0, 0, x; - * SLT tmp1, x, 0; - * ADD result, tmp0, -tmp1; - */ - struct rc_dst_register dst0; - unsigned tmp1; - - /* 0 < x */ - dst0 = new_dst_reg(c, inst); - emit2(c, inst->Prev, RC_OPCODE_SLT, NULL, - dst0, - builtin_zero, - inst->U.I.SrcReg[0]); - - /* x < 0 */ - tmp1 = rc_find_free_temporary(c); - emit2(c, inst->Prev, RC_OPCODE_SLT, NULL, - dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask), - inst->U.I.SrcReg[0], - builtin_zero); - - /* Either both are zero, or one of them is one and the other is zero. 
*/ - /* result = tmp0 - tmp1 */ - emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, - inst->U.I.DstReg, - srcreg(RC_FILE_TEMPORARY, dst0.Index), - negate(srcreg(RC_FILE_TEMPORARY, tmp1))); - - rc_remove_instruction(inst); -} - static void transform_vertex_TRUNC(struct radeon_compiler* c, struct rc_instruction* inst) { @@ -825,11 +566,9 @@ int r300_transform_vertex_alu( void* unused) { switch(inst->U.I.Opcode) { - case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1; case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1; case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1; case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1; - case RC_OPCODE_FLR: transform_FLR(c, inst); return 1; case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1; case RC_OPCODE_LRP: transform_LRP(c, inst); return 1; case RC_OPCODE_SEQ: @@ -846,7 +585,6 @@ int r300_transform_vertex_alu( return 1; } return 0; - case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1; case RC_OPCODE_SUB: transform_SUB(c, inst); return 1; case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1; default: @@ -854,196 +592,6 @@ int r300_transform_vertex_alu( } } -static void sincos_constants(struct radeon_compiler* c, unsigned int *constants) -{ - static const float SinCosConsts[2][4] = { - { - 1.273239545, /* 4/PI */ - -0.405284735, /* -4/(PI*PI) */ - 3.141592654, /* PI */ - 0.2225 /* weight */ - }, - { - 0.75, - 0.5, - 0.159154943, /* 1/(2*PI) */ - 6.283185307 /* 2*PI */ - } - }; - int i; - - for(i = 0; i < 2; ++i) - constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]); -} - -/** - * Approximate sin(x), where x is clamped to (-pi/2, pi/2). 
- * - * MUL tmp.xy, src, { 4/PI, -4/(PI^2) } - * MAD tmp.x, tmp.y, |src|, tmp.x - * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x - * MAD dest, tmp.y, weight, tmp.x - */ -static void sin_approx( - struct radeon_compiler* c, struct rc_instruction * inst, - struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants) -{ - unsigned int tempreg = rc_find_free_temporary(c); - - emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(tempreg, RC_MASK_XY), - swizzle_xxxx(src), - srcreg(RC_FILE_CONSTANT, constants[0])); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_X), - swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), - absolute(swizzle_xxxx(src)), - swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_Y), - swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)), - absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))), - negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)))); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dst, - swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), - swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])), - swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))); -} - -/** - * Translate the trigonometric functions COS and SIN - * using only the basic instructions - * MOV, ADD, MUL, MAD, FRC - */ -int r300_transform_trig_simple(struct radeon_compiler* c, - struct rc_instruction* inst, - void* unused) -{ - unsigned int constants[2]; - unsigned int tempreg; - - if (inst->U.I.Opcode != RC_OPCODE_COS && - inst->U.I.Opcode != RC_OPCODE_SIN) - return 0; - - tempreg = rc_find_free_temporary(c); - - sincos_constants(c, constants); - - if (inst->U.I.Opcode == RC_OPCODE_COS) { - /* MAD tmp.x, src, 1/(2*PI), 0.75 */ - /* FRC tmp.x, tmp.x */ - /* MAD tmp.z, tmp.x, 2*PI, -PI */ - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_xxxx(inst->U.I.SrcReg[0]), - swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), - 
swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1]))); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg))); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), - swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), - negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); - - sin_approx(c, inst, inst->U.I.DstReg, - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), - constants); - } else if (inst->U.I.Opcode == RC_OPCODE_SIN) { - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_xxxx(inst->U.I.SrcReg[0]), - swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), - swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1]))); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg))); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W), - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), - swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), - negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); - - sin_approx(c, inst, inst->U.I.DstReg, - swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), - constants); - } else { - struct rc_dst_register dst; - - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY), - swizzle_xxxx(inst->U.I.SrcReg[0]), - swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), - swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W)); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_XY), - srcreg(RC_FILE_TEMPORARY, tempreg)); - emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY), - srcreg(RC_FILE_TEMPORARY, tempreg), - swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), - negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); 
- - dst = inst->U.I.DstReg; - - dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X; - sin_approx(c, inst, dst, - swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)), - constants); - - dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y; - sin_approx(c, inst, dst, - swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), - constants); - } - - rc_remove_instruction(inst); - - return 1; -} - -static void r300_transform_SIN_COS(struct radeon_compiler *c, - struct rc_instruction *inst, - unsigned srctmp) -{ - if (inst->U.I.Opcode == RC_OPCODE_COS) { - emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg, - srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); - } else if (inst->U.I.Opcode == RC_OPCODE_SIN) { - emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, - inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); - } - - rc_remove_instruction(inst); -} - - -/** - * Transform the trigonometric functions COS and SIN - * to include pre-scaling by 1/(2*PI) and taking the fractional - * part, so that the input to COS and SIN is always in the range [0,1). - * - * @warning This transformation implicitly changes the semantics of SIN and COS! 
- */ -int radeonTransformTrigScale(struct radeon_compiler* c, - struct rc_instruction* inst, - void* unused) -{ - static const float RCP_2PI = 0.15915494309189535; - unsigned int temp; - unsigned int constant; - unsigned int constant_swizzle; - - if (inst->U.I.Opcode != RC_OPCODE_COS && - inst->U.I.Opcode != RC_OPCODE_SIN) - return 0; - - if (!c->needs_trig_input_transform) - return 1; - - temp = rc_find_free_temporary(c); - constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle); - - emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(temp, RC_MASK_W), - swizzle_xxxx(inst->U.I.SrcReg[0]), - srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)); - emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(temp, RC_MASK_W), - srcreg(RC_FILE_TEMPORARY, temp)); - - r300_transform_SIN_COS(c, inst, temp); - return 1; -} - /** * Replaces DDX/DDY instructions with MOV 0 to avoid using dummy shaders on r300/r400. * diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h index eb522b2ea..861a6a39d 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h @@ -40,21 +40,6 @@ int r300_transform_vertex_alu( struct rc_instruction * inst, void*); -int r300_transform_trig_simple( - struct radeon_compiler * c, - struct rc_instruction * inst, - void*); - -int radeonTransformTrigScale( - struct radeon_compiler * c, - struct rc_instruction * inst, - void*); - -int r300_transform_trig_scale_vertex( - struct radeon_compiler *c, - struct rc_instruction *inst, - void*); - int radeonStubDeriv( struct radeon_compiler * c, struct rc_instruction * inst, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h index 6a8cbe333..4c12c5f2e 100644 --- 
a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h @@ -101,7 +101,7 @@ enum { RC_NUM_SPECIAL_REGISTERS }; -#define RC_REGISTER_INDEX_BITS 10 +#define RC_REGISTER_INDEX_BITS 11 #define RC_REGISTER_MAX_INDEX (1 << RC_REGISTER_INDEX_BITS) typedef enum { diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c index 4882527bf..9995d5158 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c @@ -230,7 +230,7 @@ int radeonTransformTEX( else inst_add->U.I.SrcReg[0].Negate = inst_add->U.I.SrcReg[0].Negate ^ RC_MASK_XYZW; - /* This negates the whole expresion: */ + /* This negates the whole expression: */ if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GREATER || comparefunc == RC_COMPARE_FUNC_NOTEQUAL) { pass = 1; diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c index 4c276a4c1..30e1232f5 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c @@ -383,7 +383,7 @@ struct rc_list * rc_get_variables(struct radeon_compiler * c) * as the src1.xyz and src1.w of the instruction where the value is used are * in theory independent. They are not because the same register is written * also by the texture instruction in the other branch and TEX can't write xyz - * and w separatelly. + * and w separately. 
* * Therefore first search for RC_INSTRUCTION_NORMAL to create variables from * the texture instruction and than the pair instructions will be properly @@ -401,7 +401,19 @@ struct rc_list * rc_get_variables(struct radeon_compiler * c) memset(&reader_data, 0, sizeof(reader_data)); rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL); if (reader_data.ReaderCount == 0) { - continue; + /* Variable is only returned if there is both writer + * and reader. This means dead writes will not get + * register allocated as a result and can overwrite random + * registers. Assert on dead writes instead so we can improve + * the DCE. + */ + const struct rc_opcode_info *opcode = + rc_get_opcode_info(inst->U.I.Opcode); + assert(c->type == RC_FRAGMENT_PROGRAM || + !opcode->HasDstReg || + inst->U.I.DstReg.File == RC_FILE_OUTPUT || + inst->U.I.DstReg.File == RC_FILE_ADDRESS); + continue; } new_var = rc_variable(c, inst->U.I.DstReg.File, inst->U.I.DstReg.Index, diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c b/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c index 008bf5d31..0c85579ca 100644 --- a/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c +++ b/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c @@ -513,6 +513,7 @@ void init_compiler( rc_init_regalloc_state(rs, program_type); rc_init(c, rs); + c->type = program_type; c->is_r500 = is_r500; c->max_temp_regs = is_r500 ? 128 : (is_r400 ? 64 : 32); c->max_constants = is_r500 ? 256 : 32; |