summaryrefslogtreecommitdiff
path: root/lib/mesa/src/gallium/drivers/r300/compiler
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/src/gallium/drivers/r300/compiler')
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c2519
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h50
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c182
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h64
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py80
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c47
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c50
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c6
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c34
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h2
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c10
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h3
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h2
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c21
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h9
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c329
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c6
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h2
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c468
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h15
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h2
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c2
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c16
-rw-r--r--lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c1
24 files changed, 3281 insertions, 639 deletions
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c
new file mode 100644
index 000000000..0fda015ce
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.c
@@ -0,0 +1,2519 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_deref.h"
+#include "compiler/nir/nir_legacy.h"
+#include "compiler/nir/nir_worklist.h"
+#include "nir_to_rc.h"
+#include "r300_nir.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_util.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_dynarray.h"
+
+/* One buffered TGSI instruction.  Instructions are appended to the current
+ * ntr_block by ntr_insn() and later rewritten in place by register allocation.
+ */
+struct ntr_insn {
+ enum tgsi_opcode opcode;
+ /* ntr_insn() fills dst[0] and leaves dst[1] undefined. */
+ struct ureg_dst dst[2];
+ struct ureg_src src[4];
+ /* Texture state; tex_offset[] is only scanned when is_tex is set. */
+ enum tgsi_texture_type tex_target;
+ enum tgsi_return_type tex_return_type;
+ struct tgsi_texture_offset tex_offset[4];
+
+ /* NOTE(review): not referenced in the part of the file visible here --
+  * presumably memory-access qualifiers; confirm against the emit code.
+  */
+ unsigned mem_qualifier;
+ enum pipe_format mem_format;
+
+ bool is_tex : 1;
+ /* Copied from ntr_compile::precise when the instruction is created. */
+ bool precise : 1;
+};
+
+/* Per-nir_block instruction buffer plus the instruction-pointer range it
+ * covers once the liveness pass has numbered the instructions.
+ */
+struct ntr_block {
+ /* Array of struct ntr_insn */
+ struct util_dynarray insns;
+ /* ip of the first instruction in the block. */
+ int start_ip;
+ /* ip one past the last instruction in the block. */
+ int end_ip;
+};
+
+/* Live range of a virtual temp, in instruction-pointer units.  An interval
+ * with start > end marks a temp that is never referenced.
+ */
+struct ntr_reg_interval {
+ uint32_t start, end;
+};
+
+/* State carried through the translation of one NIR shader. */
+struct ntr_compile {
+ nir_shader *s;
+ nir_function_impl *impl;
+ const struct nir_to_rc_options *options;
+ struct pipe_screen *screen;
+ struct ureg_program *ureg;
+
+ /* Address registers, declared lazily by ntr_reladdr(). */
+ bool addr_declared[3];
+ struct ureg_dst addr_reg[3];
+
+ /* if condition set up at the end of a block, for ntr_emit_if(). */
+ struct ureg_src if_cond;
+
+ /* TGSI temps for our NIR SSA and register values. */
+ struct ureg_dst *reg_temp;
+ struct ureg_src *ssa_temp;
+
+ /* Per-virtual-temp live intervals, filled in by ntr_live_regs(). */
+ struct ntr_reg_interval *liveness;
+
+ /* Map from nir_block to ntr_block */
+ struct hash_table *blocks;
+ /* Block that ntr_insn() currently appends to. */
+ struct ntr_block *cur_block;
+ unsigned current_if_else;
+ unsigned cf_label;
+
+ /* Whether we're currently emitting instructions for a precise NIR instruction. */
+ bool precise;
+
+ /* Size of the virtual temp space handed out by ntr_temp(); temps below
+  * first_non_array_temp are array temps that bypass register allocation.
+  */
+ unsigned num_temps;
+ unsigned first_non_array_temp;
+
+ /* Mappings from driver_location to TGSI input/output number.
+ *
+ * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
+ * their numbers assigned incrementally, unlike inputs or constants.
+ */
+ struct ureg_src *input_index_map;
+ /* Bitmask, indexed by driver_location, of centroid-interpolated inputs. */
+ uint64_t centroid_inputs;
+
+ /* Lowest UBO driver_location seen other than the default UBO, or ~0. */
+ uint32_t first_ubo;
+};
+
+/* Hands out the next index of the virtual temporary space.  These indices are
+ * remapped to real TGSI temporaries later by register allocation.
+ */
+static struct ureg_dst
+ntr_temp(struct ntr_compile *c)
+{
+   unsigned index = c->num_temps;
+
+   c->num_temps = index + 1;
+   return ureg_dst_register(TGSI_FILE_TEMPORARY, index);
+}
+
+/* Looks up the ntr_block registered for a nir_block in c->blocks.  The block
+ * must already be present in the table; a missing entry is not handled.
+ */
+static struct ntr_block *
+ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
+{
+   return _mesa_hash_table_search(c->blocks, block)->data;
+}
+
+/* Forward declarations of the control-flow list emitters; they are defined
+ * later in the file.
+ */
+static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
+static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);
+
+/* Appends a new instruction to the current block and returns a pointer to the
+ * stored copy so that callers can fill in further fields (e.g. texture state).
+ *
+ * Only dst[0] is taken from the caller; dst[1] is left undefined and every
+ * other field starts out zeroed.  The precise flag is inherited from the
+ * compile context.
+ */
+static struct ntr_insn *
+ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode,
+         struct ureg_dst dst,
+         struct ureg_src src0, struct ureg_src src1,
+         struct ureg_src src2, struct ureg_src src3)
+{
+   struct util_dynarray *insns = &c->cur_block->insns;
+   struct ntr_insn new_insn = {0};
+
+   new_insn.opcode = opcode;
+   new_insn.dst[0] = dst;
+   new_insn.dst[1] = ureg_dst_undef();
+   new_insn.src[0] = src0;
+   new_insn.src[1] = src1;
+   new_insn.src[2] = src2;
+   new_insn.src[3] = src3;
+   new_insn.precise = c->precise;
+
+   util_dynarray_append(insns, struct ntr_insn, new_insn);
+   return util_dynarray_top_ptr(insns, struct ntr_insn);
+}
+
+/* Emit-helper generators.  Each OPxy(op) macro defines a static inline
+ * ntr_<op>() wrapper around ntr_insn() taking x destinations and y sources;
+ * unused operand slots are filled with undefined registers.  The macros are
+ * expanded by the template header included at the end of this group.
+ */
+#define OP00( op ) \
+static inline void ntr_##op(struct ntr_compile *c) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP01( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_src src0) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+
+#define OP10( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_dst dst) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP11( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_dst dst, \
+ struct ureg_src src0) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP12( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_dst dst, \
+ struct ureg_src src0, \
+ struct ureg_src src1) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
+}
+
+#define OP13( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_dst dst, \
+ struct ureg_src src0, \
+ struct ureg_src src1, \
+ struct ureg_src src2) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
+}
+
+#define OP14( op ) \
+static inline void ntr_##op(struct ntr_compile *c, \
+ struct ureg_dst dst, \
+ struct ureg_src src0, \
+ struct ureg_src src1, \
+ struct ureg_src src2, \
+ struct ureg_src src3) \
+{ \
+ ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
+}
+
+/* We hand-craft our tex instructions, so the texture variants expand to
+ * nothing and no ntr_<op>() wrapper is generated for them.
+ */
+#define OP12_TEX(op)
+#define OP14_TEX(op)
+
+/* Use a template include to generate a correctly-typed ntr_OP()
+ * function for each TGSI opcode:
+ */
+#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
+
+/**
+ * Reads a constant NIR source as an unsigned integer.
+ *
+ * On drivers without native integers, load_consts feeding integer ALU
+ * instructions (directly or through a phi web) are rewritten as floats, and
+ * the ALU instruction is swapped for its float equivalent.  Intrinsics do not
+ * normally get that treatment, so an integer constant consumed by an
+ * intrinsic may arrive here reformatted as a float.
+ *
+ * All values we read this way are expected to be small, so any bit pattern at
+ * or above that of 1.0f is assumed to be a float and is converted back to an
+ * integer.
+ */
+static uint32_t
+ntr_src_as_uint(struct ntr_compile *c, nir_src src)
+{
+   const uint32_t raw = nir_src_as_uint(src);
+
+   /* Anything below the bit pattern of 1.0f is taken as a genuine integer. */
+   if (raw < fui(1.0))
+      return raw;
+
+   return (uint32_t)uif(raw);
+}
+
+/* Per-channel masks of def/use within the block, and the per-channel
+ * livein/liveout for the block as a whole.
+ *
+ * Each pointer is an array with one byte per virtual temp.  defin/defout track
+ * channels that may have been written on some path reaching the block's
+ * entry/exit.
+ */
+struct ntr_live_reg_block_state {
+ uint8_t *def, *use, *livein, *liveout, *defin, *defout;
+};
+
+/* Scratch state for one run of the liveness analysis.
+ *
+ * NOTE(review): bitset_words, regs and tmp_live are not referenced in the
+ * part of the file visible here -- confirm whether they are still needed.
+ */
+struct ntr_live_reg_state {
+ unsigned bitset_words;
+
+ struct ntr_reg_interval *regs;
+
+ /* Used in propagate_across_edge() */
+ BITSET_WORD *tmp_live;
+
+ /* One entry per NIR block, indexed by nir_block::index. */
+ struct ntr_live_reg_block_state *blocks;
+
+ nir_block_worklist worklist;
+};
+
+/* Records a read of temp `index` at instruction `ip`.
+ *
+ * Channels already defined earlier in the block are screened off from the
+ * block's use set, while the temp's live interval is always widened to
+ * include `ip`.
+ */
+static void
+ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs,
+                      int ip, unsigned index, unsigned used_mask)
+{
+   struct ntr_reg_interval *interval = &c->liveness[index];
+
+   bs->use[index] |= used_mask & ~bs->def[index];
+
+   interval->start = MIN2(interval->start, ip);
+   interval->end = MAX2(interval->end, ip);
+}
+/* Builds the per-block def/use sets and the initial live intervals.
+ *
+ * Allocates the six per-temp byte arrays of every block (parented to
+ * state->blocks), then walks all instructions in block order, numbering them
+ * with a running ip.  Each ntr_block gets its start_ip and end_ip (one past
+ * its last instruction) recorded along the way.
+ */
+static void
+ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl, struct ntr_live_reg_state *state)
+{
+ for (int i = 0; i < impl->num_blocks; i++) {
+ state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
+ }
+
+ int ip = 0;
+ nir_foreach_block(block, impl) {
+ struct ntr_live_reg_block_state *bs = &state->blocks[block->index];
+ struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+
+ ntr_block->start_ip = ip;
+
+ util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
+ const struct tgsi_opcode_info *opcode_info =
+ tgsi_get_opcode_info(insn->opcode);
+
+ /* Set up use[] for the srcs.
+ *
+ * Uses are the channels of the reg read in the block that don't have a
+ * preceding def to screen them off. Note that we don't do per-element
+ * tracking of array regs, so they're never screened off.
+ */
+ for (int i = 0; i < opcode_info->num_src; i++) {
+ if (insn->src[i].File != TGSI_FILE_TEMPORARY)
+ continue;
+ int index = insn->src[i].Index;
+
+ /* NOTE(review): tex_target is passed for both trailing arguments --
+  * confirm this is what tgsi_util_get_src_usage_mask() expects.
+  */
+ uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
+ insn->dst->WriteMask,
+ insn->src[i].SwizzleX,
+ insn->src[i].SwizzleY,
+ insn->src[i].SwizzleZ,
+ insn->src[i].SwizzleW,
+ insn->tex_target,
+ insn->tex_target);
+
+ assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
+ ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
+ }
+
+ /* Texture offsets read all four channels of their temp. */
+ if (insn->is_tex) {
+ for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+ if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
+ ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
+ }
+ }
+
+ /* Set up def[] for the dsts.
+ *
+ * Defs are the unconditionally-written (not R/M/W) channels of the reg in
+ * the block that don't have a preceding use.
+ */
+ for (int i = 0; i < opcode_info->num_dst; i++) {
+ if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
+ continue;
+ int index = insn->dst[i].Index;
+ uint32_t writemask = insn->dst[i].WriteMask;
+
+ bs->def[index] |= writemask & ~bs->use[index];
+ bs->defout[index] |= writemask;
+
+ assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
+ c->liveness[index].start = MIN2(c->liveness[index].start, ip);
+ c->liveness[index].end = MAX2(c->liveness[index].end, ip);
+ }
+ ip++;
+ }
+
+ ntr_block->end_ip = ip;
+ }
+}
+
+/* Computes c->liveness[], the live interval of every virtual temp.
+ *
+ * Runs in three stages: per-block def/use setup, a forward pass that
+ * propagates "possibly defined" channels (defin/defout) down the CFG, and a
+ * backward pass that propagates livein/liveout and widens the intervals to
+ * block boundaries where a value is live across them.  The per-block scratch
+ * state is freed before returning; c->liveness is allocated on `c` and kept.
+ */
+static void
+ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
+{
+ nir_metadata_require(impl, nir_metadata_block_index);
+
+ c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);
+
+ struct ntr_live_reg_state state = {
+ .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
+ };
+
+ /* The intervals start out with start > end (indicating unused) */
+ for (int i = 0; i < c->num_temps; i++)
+ c->liveness[i].start = ~0;
+
+ ntr_live_reg_setup_def_use(c, impl, &state);
+
+ /* Make a forward-order worklist of all the blocks. */
+ nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
+ nir_foreach_block(block, impl) {
+ nir_block_worklist_push_tail(&state.worklist, block);
+ }
+
+ /* Propagate defin/defout down the CFG to calculate the live variables
+ * potentially defined along any possible control flow path. We'll use this
+ * to keep things like conditional defs of the reg (or array regs where we
+ * don't track defs!) from making the reg's live range extend back to the
+ * start of the program.
+ */
+ while (!nir_block_worklist_is_empty(&state.worklist)) {
+ nir_block *block = nir_block_worklist_pop_head(&state.worklist);
+ for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ nir_block *succ = block->successors[j];
+ /* Skip missing successors and the one whose index is num_blocks,
+  * which has no per-block state allocated.
+  */
+ if (!succ || succ->index == impl->num_blocks)
+ continue;
+
+ for (int i = 0; i < c->num_temps; i++) {
+ uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
+
+ if (new_def) {
+ state.blocks[succ->index].defin[i] |= new_def;
+ state.blocks[succ->index].defout[i] |= new_def;
+ nir_block_worklist_push_tail(&state.worklist, succ);
+ }
+ }
+ }
+ }
+
+ /* Make a reverse-order worklist of all the blocks. */
+ nir_foreach_block(block, impl) {
+ nir_block_worklist_push_head(&state.worklist, block);
+ }
+
+ /* We're now ready to work through the worklist and update the liveness sets
+ * of each of the blocks. As long as we keep the worklist up-to-date as we
+ * go, everything will get covered.
+ */
+ while (!nir_block_worklist_is_empty(&state.worklist)) {
+ /* We pop them off in the reverse order we pushed them on. This way
+ * the first walk of the instructions is backwards so we only walk
+ * once in the case of no control flow.
+ */
+ nir_block *block = nir_block_worklist_pop_head(&state.worklist);
+ struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+ struct ntr_live_reg_block_state *bs = &state.blocks[block->index];
+
+ for (int i = 0; i < c->num_temps; i++) {
+ /* Collect livein from our successors to include in our liveout. */
+ for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ nir_block *succ = block->successors[j];
+ if (!succ || succ->index == impl->num_blocks)
+ continue;
+ struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index];
+
+ uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
+ if (new_liveout) {
+ /* The value leaves this block alive, so if it may have been
+  * defined by now its interval must reach the block's end.
+  */
+ if (state.blocks[block->index].defout[i])
+ c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip);
+ bs->liveout[i] |= sbs->livein[i];
+ }
+ }
+
+ /* Propagate use requests from either our block's uses or our
+ * non-screened-off liveout up to our predecessors.
+ */
+ uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
+ ~bs->livein[i]);
+ if (new_livein) {
+ bs->livein[i] |= new_livein;
+ set_foreach(block->predecessors, entry) {
+ nir_block *pred = (void *)entry->key;
+ nir_block_worklist_push_tail(&state.worklist, pred);
+ }
+
+ /* Live on entry and possibly defined before the block: the
+  * interval must reach back to the block's start.
+  */
+ if (new_livein & state.blocks[block->index].defin[i])
+ c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip);
+ }
+ }
+ }
+
+ ralloc_free(state.blocks);
+ nir_block_worklist_fini(&state.worklist);
+}
+
+/* Allocates or releases the real TGSI temp backing virtual temp `index` when
+ * `ip` is the first or last instruction of its live interval.
+ *
+ * ra_map[] holds the virtual-to-real mapping, with ~0 meaning "not allocated
+ * yet"; `released` remembers which temps were already handed back so that
+ * each one is released only once.
+ */
+static void
+ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
+{
+   /* Array temps keep their up-front declaration and are never remapped. */
+   if (index < c->first_non_array_temp)
+      return;
+
+   const struct ntr_reg_interval *live = &c->liveness[index];
+
+   if (live->start == ip && ra_map[index] == ~0)
+      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
+
+   if (live->end == ip && !BITSET_TEST(released, index)) {
+      struct ureg_dst hw_temp = ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]);
+
+      ureg_release_temporary(c->ureg, hw_temp);
+      BITSET_SET(released, index);
+   }
+}
+
+/* Maps the virtual temps referenced by the buffered instructions onto real
+ * TGSI temporaries, using the live intervals computed by ntr_live_regs().
+ *
+ * Instructions are walked in the same order (and with the same ip numbering)
+ * as the liveness pass.  A temp is declared when its interval starts and
+ * released when it ends, so non-overlapping virtual temps can share a real
+ * register.  Intervals are also checked at every block boundary, because the
+ * liveness pass may have widened them to a block's start_ip or end_ip.
+ */
+static void
+ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl)
+{
+   ntr_live_regs(c, impl);
+
+   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
+   /* Declared as BITSET_WORD to match both the allocation's element type and
+    * the parameter type of ntr_ra_check().
+    */
+   BITSET_WORD *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
+
+   /* No RA on NIR array regs */
+   for (int i = 0; i < c->first_non_array_temp; i++)
+      ra_map[i] = i;
+
+   /* ~0 marks a virtual temp with no real register assigned yet. */
+   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
+      ra_map[i] = ~0;
+
+   int ip = 0;
+   nir_foreach_block(block, impl) {
+      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+
+      /* Intervals that begin or end exactly at the block's start_ip. */
+      for (int i = 0; i < c->num_temps; i++)
+         ntr_ra_check(c, ra_map, released, ip, i);
+
+      util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
+         const struct tgsi_opcode_info *opcode_info =
+            tgsi_get_opcode_info(insn->opcode);
+
+         for (int i = 0; i < opcode_info->num_src; i++) {
+            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
+               ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index);
+               insn->src[i].Index = ra_map[insn->src[i].Index];
+            }
+         }
+
+         if (insn->is_tex) {
+            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+               if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
+                  ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
+                  insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
+               }
+            }
+         }
+
+         for (int i = 0; i < opcode_info->num_dst; i++) {
+            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
+               ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
+               insn->dst[i].Index = ra_map[insn->dst[i].Index];
+            }
+         }
+         ip++;
+      }
+
+      /* Intervals that begin or end exactly at the block's end_ip. */
+      for (int i = 0; i < c->num_temps; i++)
+         ntr_ra_check(c, ra_map, released, ip, i);
+   }
+}
+
+/* Register "allocation" without liveness analysis: declares one real TGSI
+ * temporary per non-array virtual temp and leaves the buffered instructions
+ * untouched.
+ */
+static void
+ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
+{
+   unsigned temp = c->first_non_array_temp;
+
+   while (temp < c->num_temps) {
+      ureg_DECL_temporary(c->ureg);
+      temp++;
+   }
+}
+
+/* Computes the component usage mask that TGSI varying declarations carry
+ * (consumed by r600 and svga).
+ *
+ * The mask spans the variable's vector width starting at its location_frac.
+ * Types that report zero vector elements (structs) count as four components.
+ */
+static uint32_t
+ntr_tgsi_var_usage_mask(const struct nir_variable *var)
+{
+   const struct glsl_type *elem_type = glsl_without_array(var->type);
+   unsigned vec_width = glsl_get_vector_elements(elem_type);
+
+   return u_bit_consecutive(var->data.location_frac,
+                            vec_width ? vec_width : 4);
+}
+
+/**
+ * Declares the TGSI output targeted by an output-store intrinsic.
+ *
+ * \param c      compile context
+ * \param instr  intrinsic carrying io_semantics, base and component indices
+ * \param frac   out: first TGSI channel written.  This is the intrinsic's
+ *               component, except for fragment depth (.z) and stencil (.y).
+ * \return the declared output register with its write mask applied.  The mask
+ *         is expressed in TGSI channels, i.e. already shifted by *frac.
+ */
+static struct ureg_dst
+ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
+{
+   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+   int base = nir_intrinsic_base(instr);
+   *frac = nir_intrinsic_component(instr);
+
+   struct ureg_dst out;
+   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(semantics.location,
+                                       &semantic_name, &semantic_index);
+      semantic_index += semantics.dual_source_blend_index;
+
+      switch (semantics.location) {
+      case FRAG_RESULT_DEPTH:
+         *frac = 2; /* the z write goes to the .z channel in TGSI */
+         break;
+      case FRAG_RESULT_STENCIL:
+         *frac = 1;
+         break;
+      default:
+         break;
+      }
+
+      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+   } else {
+      unsigned semantic_name, semantic_index;
+
+      tgsi_get_gl_varying_semantic(semantics.location, true,
+                                   &semantic_name, &semantic_index);
+
+      /* Keep only the stream bits (two per channel) of the channels that
+       * this store actually covers.
+       */
+      uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
+      uint32_t gs_streams = semantics.gs_streams;
+      for (int i = 0; i < 4; i++) {
+         if (!(usage_mask & (1 << i)))
+            gs_streams &= ~(0x3 << 2 * i);
+      }
+
+      /* No driver appears to use array_id of outputs. */
+      unsigned array_id = 0;
+
+      /* Taken straight from the i/o semantics. */
+      bool invariant = semantics.invariant;
+
+      out = ureg_DECL_output_layout(c->ureg,
+                                    semantic_name, semantic_index,
+                                    gs_streams,
+                                    base,
+                                    usage_mask,
+                                    array_id,
+                                    semantics.num_slots,
+                                    invariant);
+   }
+
+   /* Build the mask relative to the first written component, then move it to
+    * the TGSI channels exactly once.  The fallback used to be pre-shifted by
+    * *frac as well, which shifted it twice.
+    */
+   unsigned write_mask;
+   if (nir_intrinsic_has_write_mask(instr))
+      write_mask = nir_intrinsic_write_mask(instr);
+   else
+      write_mask = (1 << instr->num_components) - 1;
+
+   write_mask = write_mask << *frac;
+   return ureg_writemask(out, write_mask);
+}
+
+/* Checks whether the single use `src` is a store_output with a constant
+ * offset, and if so declares the matching TGSI output in *dst.
+ *
+ * Returns true only when the store also starts at component 0.  Note that
+ * *dst may already hold the output declaration when false is returned
+ * because of a non-zero component; in every other failing case it is left
+ * undefined.
+ */
+static bool
+ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c,
+                                      struct ureg_dst *dst,
+                                      nir_src *src)
+{
+   *dst = ureg_dst_undef();
+
+   if (nir_src_is_if(src))
+      return false;
+
+   nir_instr *parent = nir_src_parent_instr(src);
+   if (parent->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *store = nir_instr_as_intrinsic(parent);
+   if (store->intrinsic != nir_intrinsic_store_output)
+      return false;
+   if (!nir_src_is_const(store->src[1]))
+      return false;
+
+   uint32_t frac;
+   *dst = ntr_output_decl(c, store, &frac);
+   dst->Index += ntr_src_as_uint(c, store->src[1]);
+
+   return frac == 0;
+}
+
+/* When a NIR register's value is consumed by exactly one instruction and that
+ * instruction is a simple output store, the register can live directly in the
+ * TGSI output, sparing store_output its own MOV.
+ *
+ * Returns true and sets *dst to the output when that is possible.
+ */
+static bool
+ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
+                                 nir_intrinsic_instr *reg_decl)
+{
+   assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
+
+   *dst = ureg_dst_undef();
+
+   /* Collect the uses of every load of this register; more than one in total
+    * disqualifies it.
+    */
+   nir_src *only_use = NULL;
+   nir_foreach_reg_load(load_src, reg_decl) {
+      nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(load_src));
+      nir_foreach_use_including_if(candidate, &load->def) {
+         if (only_use != NULL)
+            return false;
+
+         only_use = candidate;
+      }
+   }
+
+   if (only_use == NULL)
+      return false;
+
+   return ntr_try_store_in_tgsi_output_with_use(c, dst, only_use);
+}
+
+/* SSA counterpart of the register case: when the def has exactly one use and
+ * that use is a simple output store, the value can be written straight to the
+ * TGSI output, sparing store_output its own MOV.
+ *
+ * Returns true and sets *dst to the output when that is possible.
+ */
+static bool
+ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
+                                 nir_def *def)
+{
+   *dst = ureg_dst_undef();
+
+   if (list_is_singular(&def->uses)) {
+      /* The loop body runs for the one and only use and returns from it. */
+      nir_foreach_use_including_if(sole_use, def) {
+         return ntr_try_store_in_tgsi_output_with_use(c, dst, sole_use);
+      }
+      unreachable("We have one use");
+   }
+
+   return false;
+}
+
+/* Declares the TGSI inputs of a fragment shader and fills
+ * c->input_index_map, which maps each driver_location to its declared input.
+ *
+ * Does nothing for other shader stages.  Centroid-interpolated inputs are
+ * also recorded in c->centroid_inputs, and the FACE input is routed through a
+ * saturating MOV into a temp.
+ */
+static void
+ntr_setup_inputs(struct ntr_compile *c)
+{
+   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   unsigned num_inputs = 0;
+   int num_input_arrays = 0;
+
+   /* First pass: size the driver_location -> input map. */
+   nir_foreach_shader_in_variable(var, c->s) {
+      const struct glsl_type *type = var->type;
+      unsigned array_len =
+         glsl_count_attribute_slots(type, false);
+
+      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
+   }
+
+   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
+
+   /* Second pass: declare each input. */
+   nir_foreach_shader_in_variable(var, c->s) {
+      const struct glsl_type *type = var->type;
+      unsigned array_len =
+         glsl_count_attribute_slots(type, false);
+
+      unsigned sample_loc;
+      struct ureg_src decl;
+
+      /* We only get here for fragment shaders, so the interpolation mode is
+       * always taken from the variable.
+       */
+      unsigned interpolation =
+         tgsi_get_interp_mode(var->data.interpolation,
+                              var->data.location == VARYING_SLOT_COL0 ||
+                              var->data.location == VARYING_SLOT_COL1);
+
+      if (var->data.location == VARYING_SLOT_POS)
+         interpolation = TGSI_INTERPOLATE_LINEAR;
+
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_varying_semantic(var->data.location, true,
+                                   &semantic_name, &semantic_index);
+
+      if (var->data.sample) {
+         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
+      } else if (var->data.centroid) {
+         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
+         /* Widen before shifting: centroid_inputs is 64 bits wide, so the
+          * shift must not be carried out at the mask's 32-bit width.
+          */
+         c->centroid_inputs |= ((uint64_t)BITSET_MASK(array_len) <<
+                                var->data.driver_location);
+      } else {
+         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
+      }
+
+      unsigned array_id = 0;
+      if (glsl_type_is_array(type))
+         array_id = ++num_input_arrays;
+
+      uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);
+
+      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
+                                                semantic_name,
+                                                semantic_index,
+                                                interpolation,
+                                                sample_loc,
+                                                var->data.driver_location,
+                                                usage_mask,
+                                                array_id, array_len);
+
+      if (semantic_name == TGSI_SEMANTIC_FACE) {
+         struct ureg_dst temp = ntr_temp(c);
+         /* tgsi docs say that floating point FACE will be positive for
+          * frontface and negative for backface, but realistically
+          * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
+          * Copy that behavior, since some drivers (r300) have been doing a
+          * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
+          * front face).
+          */
+         temp.Saturate = true;
+         ntr_MOV(c, temp, decl);
+         decl = ureg_src(temp);
+      }
+
+      /* Every slot of the variable maps to consecutive input indices. */
+      for (unsigned i = 0; i < array_len; i++) {
+         c->input_index_map[var->data.driver_location + i] = decl;
+         c->input_index_map[var->data.driver_location + i].Index += i;
+      }
+   }
+}
+
+/* Comparison callback ordering variables by ascending data.location. */
+static int
+ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
+{
+   const int loc_a = a->data.location;
+   const int loc_b = b->data.location;
+
+   return loc_a - loc_b;
+}
+
+/**
+ * Pre-declares the fragment shader outputs sorted by location.
+ *
+ * This works around virglrenderer requiring TGSI FS output color variables to
+ * be declared in order, and makes the resulting TGSI nicer to read.  Other
+ * shader stages are left alone.
+ */
+static void
+ntr_setup_outputs(struct ntr_compile *c)
+{
+   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+      nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);
+
+      nir_foreach_shader_out_variable(var, c->s) {
+         const int location = var->data.location;
+         unsigned semantic_name, semantic_index;
+
+         if (location == FRAG_RESULT_COLOR)
+            ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
+
+         tgsi_get_gl_frag_result_semantic(location,
+                                          &semantic_name, &semantic_index);
+
+         (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+      }
+   }
+}
+
+/* Translates a GLSL sampler dimension plus its array/shadow flags into the
+ * TGSI texture target.  3D, MS and BUF ignore the shadow flag, and RECT, 3D
+ * and BUF ignore the array flag.
+ */
+static enum tgsi_texture_type
+tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (is_array)
+         return is_shadow ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_1D_ARRAY;
+      return is_shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D;
+
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      if (is_array)
+         return is_shadow ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_2D_ARRAY;
+      return is_shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D;
+
+   case GLSL_SAMPLER_DIM_3D:
+      return TGSI_TEXTURE_3D;
+
+   case GLSL_SAMPLER_DIM_CUBE:
+      if (is_array)
+         return is_shadow ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_CUBE_ARRAY;
+      return is_shadow ? TGSI_TEXTURE_SHADOWCUBE : TGSI_TEXTURE_CUBE;
+
+   case GLSL_SAMPLER_DIM_RECT:
+      return is_shadow ? TGSI_TEXTURE_SHADOWRECT : TGSI_TEXTURE_RECT;
+
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
+
+   case GLSL_SAMPLER_DIM_BUF:
+      return TGSI_TEXTURE_BUFFER;
+
+   default:
+      unreachable("unknown sampler dim");
+   }
+}
+
+/* Maps the GLSL base type of a sampler result to the TGSI return type.  Only
+ * int, uint and float are expected here.
+ */
+static enum tgsi_return_type
+tgsi_return_type_from_base_type(enum glsl_base_type type)
+{
+   if (type == GLSL_TYPE_INT)
+      return TGSI_RETURN_TYPE_SINT;
+   if (type == GLSL_TYPE_UINT)
+      return TGSI_RETURN_TYPE_UINT;
+   if (type == GLSL_TYPE_FLOAT)
+      return TGSI_RETURN_TYPE_FLOAT;
+
+   unreachable("unexpected texture type");
+}
+
+/* Declares samplers, sampler views and constant buffers.
+ *
+ * Sampler/texture uniforms get one sampler and one sampler view per binding.
+ * Every UBO with a valid driver_location gets a 2D constant declaration sized
+ * in 16-byte slots, and c->first_ubo records the lowest UBO index other than
+ * the default UBO (or ~0 when there is none).
+ */
+static void
+ntr_setup_uniforms(struct ntr_compile *c)
+{
+   nir_foreach_uniform_variable(var, c->s) {
+      const struct glsl_type *stype = glsl_without_array(var->type);
+
+      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
+       * size declaration happens with other UBOs below.
+       */
+      if (!glsl_type_is_sampler(stype) && !glsl_type_is_texture(stype))
+         continue;
+
+      /* Counted on the full type: arrays of structs containing samplers
+       * should be ignored, and just the separate lowered sampler uniform
+       * decl used.
+       */
+      int count = glsl_type_get_sampler_count(var->type) +
+                  glsl_type_get_texture_count(var->type);
+
+      enum tgsi_texture_type target =
+         tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
+                                            glsl_sampler_type_is_array(stype),
+                                            glsl_sampler_type_is_shadow(stype));
+      enum tgsi_return_type ret_type =
+         tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
+
+      for (int i = 0; i < count; i++) {
+         ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
+                                target, ret_type, ret_type, ret_type, ret_type);
+         ureg_DECL_sampler(c->ureg, var->data.binding + i);
+      }
+   }
+
+   c->first_ubo = ~0;
+
+   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
+   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
+      int ubo = var->data.driver_location;
+      if (ubo == -1)
+         continue;
+
+      /* The default UBO at index 0 does not count as the first real UBO. */
+      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
+         c->first_ubo = MIN2(c->first_ubo, ubo);
+
+      ubo_sizes[ubo] = glsl_get_explicit_size(var->interface_type, false);
+   }
+
+   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
+      if (!ubo_sizes[i])
+         continue;
+
+      /* Sizes are in bytes; the declaration takes the last 16-byte slot. */
+      ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
+   }
+}
+
+/* Assigns a TGSI destination to every NIR register declaration.
+ *
+ * Array registers are declared as real TGSI array temporaries first, so they
+ * occupy the low end of the temp space and are excluded from register
+ * allocation.  Each remaining register is either aliased to a TGSI output
+ * (when its only use is a simple output store) or given a virtual temp.
+ */
+static void
+ntr_setup_registers(struct ntr_compile *c)
+{
+   assert(c->num_temps == 0);
+
+   /* Permanently allocate all the array regs at the start. */
+   nir_foreach_reg_decl_safe(array_reg, nir_shader_get_entrypoint(c->s)) {
+      unsigned array_len = nir_intrinsic_num_array_elems(array_reg);
+
+      if (array_len != 0) {
+         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, array_len, true);
+
+         c->reg_temp[array_reg->def.index] = decl;
+         assert(c->num_temps == decl.Index);
+         c->num_temps += array_len;
+      }
+   }
+   c->first_non_array_temp = c->num_temps;
+
+   /* After that, allocate non-array regs in our virtual space that we'll
+    * register-allocate before ureg emit.
+    */
+   nir_foreach_reg_decl_safe(scalar_reg, nir_shader_get_entrypoint(c->s)) {
+      /* We already handled arrays */
+      if (nir_intrinsic_num_array_elems(scalar_reg) == 0) {
+         unsigned num_components = nir_intrinsic_num_components(scalar_reg);
+         struct ureg_dst decl;
+
+         if (!ntr_try_store_reg_in_tgsi_output(c, &decl, scalar_reg))
+            decl = ureg_writemask(ntr_temp(c), BITFIELD_MASK(num_components));
+
+         c->reg_temp[scalar_reg->def.index] = decl;
+      }
+   }
+}
+
+/* Declares a TGSI immediate holding the raw 32-bit values of a load_const.
+ * The bits are reinterpreted as floats, not converted.
+ */
+static struct ureg_src
+ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
+{
+   const int count = instr->def.num_components;
+   float values[4];
+
+   assert(instr->def.bit_size == 32);
+
+   for (int chan = 0; chan < count; chan++)
+      values[chan] = uif(instr->value[chan].u32);
+
+   return ureg_DECL_immediate(c->ureg, values, count);
+}
+
+/* Loads `addr` into address register `addr_index` with ARL and returns the
+ * .x scalar of that register for use as an indirect index.
+ *
+ * Address registers are declared on demand; all registers up to and including
+ * the requested one get declared.
+ */
+static struct ureg_src
+ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
+{
+   assert(addr_index < ARRAY_SIZE(c->addr_reg));
+
+   int reg = 0;
+   while (reg <= addr_index) {
+      if (!c->addr_declared[reg]) {
+         c->addr_reg[reg] = ureg_writemask(ureg_DECL_address(c->ureg),
+                                           TGSI_WRITEMASK_X);
+         c->addr_declared[reg] = true;
+      }
+      reg++;
+   }
+
+   struct ureg_dst addr_dst = c->addr_reg[addr_index];
+
+   ntr_ARL(c, addr_dst, addr);
+   return ureg_scalar(ureg_src(addr_dst), 0);
+}
+
+/* Forward declaration: resolving an indirect register source needs to
+ * resolve the source that holds the indirect offset, which recurses.
+ */
+static struct ureg_src
+ntr_get_src(struct ntr_compile *c, nir_src src);
+
+/* Translates an already-chased legacy source into a TGSI source.
+ *
+ * SSA values come from an immediate (for load_const) or from the SSA temp
+ * table.  Register values come from the register's temp, offset by
+ * base_offset and, when an indirect is present, addressed through address
+ * register 0.
+ */
+static struct ureg_src
+ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
+{
+   if (src->is_ssa) {
+      nir_instr *parent = src->ssa->parent_instr;
+
+      if (parent->type == nir_instr_type_load_const)
+         return ntr_get_load_const_src(c, nir_instr_as_load_const(parent));
+
+      return c->ssa_temp[src->ssa->index];
+   }
+
+   struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
+   reg_temp.Index += src->reg.base_offset;
+
+   if (!src->reg.indirect)
+      return ureg_src(reg_temp);
+
+   struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
+   return ureg_src_indirect(ureg_src(reg_temp),
+                            ntr_reladdr(c, offset, 0));
+}
+
+/* Chases a NIR source with nir_legacy_chase_src() and translates the result
+ * with ntr_get_chased_src().
+ */
+static struct ureg_src
+ntr_get_src(struct ntr_compile *c, nir_src src)
+{
+   nir_legacy_src legacy = nir_legacy_chase_src(&src);
+
+   return ntr_get_chased_src(c, &legacy);
+}
+
+/* Builds the ureg_src for source i of an ALU instruction.
+ *
+ * The source is chased first, then the NIR swizzle is applied, and finally
+ * the abs and negate modifiers reported by the chase (abs before negate).
+ */
+static struct ureg_src
+ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
+{
+ /* We only support 32-bit float modifiers. The only other modifier type
+ * officially supported by TGSI is 32-bit integer negates, but even those are
+ * broken on virglrenderer, so skip lowering all integer and f64 float mods.
+ *
+ * The options->lower_fabs requests that we not have native source modifiers
+ * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
+ */
+ nir_legacy_alu_src src =
+ nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
+ struct ureg_src usrc = ntr_get_chased_src(c, &src.src);
+
+ usrc = ureg_swizzle(usrc,
+ src.swizzle[0],
+ src.swizzle[1],
+ src.swizzle[2],
+ src.swizzle[3]);
+
+ if (src.fabs)
+ usrc = ureg_abs(usrc);
+ if (src.fneg)
+ usrc = ureg_negate(usrc);
+
+ return usrc;
+}
+
+/* Returns src reswizzled so that each channel missing from write_mask reads
+ * the lowest channel that is present in it, while channels in the mask keep
+ * their own swizzle.  write_mask must be non-zero.
+ */
+static struct ureg_src
+ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
+{
+   assert(write_mask);
+
+   /* Lowest enabled channel: the stand-in for every disabled one. */
+   const int fallback = ffs(write_mask) - 1;
+
+   const int x = (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : fallback;
+   const int y = (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : fallback;
+   const int z = (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : fallback;
+   const int w = (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : fallback;
+
+   return ureg_swizzle(src, x, y, z, w);
+}
+
+/* Allocates storage for an SSA def and records how to read it back.
+ *
+ * The storage is a TGSI output when ntr_try_store_ssa_in_tgsi_output()
+ * succeeds, otherwise a fresh temp from ntr_temp().  c->ssa_temp[] receives
+ * the matching source, reswizzled so that channels beyond the def's component
+ * count alias a written channel.  The returned destination has its write mask
+ * limited to the def's components.
+ */
+static struct ureg_dst
+ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
+{
+ uint32_t writemask = BITSET_MASK(ssa->num_components);
+
+ struct ureg_dst dst;
+ if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
+ dst = ntr_temp(c);
+
+ c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);
+
+ return ureg_writemask(dst, writemask);
+}
+
+/* Returns the base storage for a chased destination: newly allocated storage
+ * for an SSA def, or the c->reg_temp[] entry of the register being written.
+ * No base offset or indirect addressing is applied here.
+ */
+static struct ureg_dst
+ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
+{
+   return dest->is_ssa ? ntr_get_ssa_def_decl(c, dest->ssa)
+                       : c->reg_temp[dest->reg.handle->index];
+}
+
+/* Returns the destination for a chased dest.  For register destinations the
+ * base offset is added to the index and, if the access is indirect, the
+ * offset is loaded into address register 0 and attached to the destination.
+ */
+static struct ureg_dst
+ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
+{
+   struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);
+
+   /* SSA destinations need no register addressing. */
+   if (dest->is_ssa)
+      return dst;
+
+   dst.Index += dest->reg.base_offset;
+   if (!dest->reg.indirect)
+      return dst;
+
+   struct ureg_src rel = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
+   return ureg_dst_indirect(dst, ntr_reladdr(c, rel, 0));
+}
+
+/* Chases a def with nir_legacy_chase_dest() and returns the ureg destination
+ * produced by ntr_get_chased_dest() for it.
+ */
+static struct ureg_dst
+ntr_get_dest(struct ntr_compile *c, nir_def *def)
+{
+   nir_legacy_dest legacy = nir_legacy_chase_dest(def);
+
+   return ntr_get_chased_dest(c, &legacy);
+}
+
+/* Returns the destination for an ALU instruction's def.
+ *
+ * The def is chased as an ALU dest; Saturate is set on the result when the
+ * chase reports fsat.  Register destinations additionally get the chased
+ * write mask; SSA destinations are returned as they come from
+ * ntr_get_chased_dest().
+ */
+static struct ureg_dst
+ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
+{
+ nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
+ struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);
+
+ if (chased.fsat)
+ dst.Saturate = true;
+
+ /* Only registers get write masks */
+ if (chased.dest.is_ssa)
+ return dst;
+
+ return ureg_writemask(dst, chased.write_mask);
+}
+
+/* For an SSA dest being populated by a constant src, replace the storage with
+ * a copy of the ureg_src.
+ *
+ * Concretely: a source that is not indirectly addressed and lives in the
+ * IMMEDIATE, INPUT, CONSTANT or SYSTEM_VALUE file is recorded directly in
+ * c->ssa_temp[] and no instruction is emitted.  Any other source is copied
+ * with a MOV into storage allocated by ntr_get_ssa_def_decl().
+ */
+static void
+ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src)
+{
+ if (!src.Indirect && !src.DimIndirect) {
+ switch (src.File) {
+ case TGSI_FILE_IMMEDIATE:
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_CONSTANT:
+ case TGSI_FILE_SYSTEM_VALUE:
+ c->ssa_temp[def->index] = src;
+ return;
+ }
+ }
+
+ ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src);
+}
+
+/* Stores src into def.  SSA defs go through ntr_store_def(), which may avoid
+ * emitting an instruction; register destinations always get a MOV.
+ */
+static void
+ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src)
+{
+   nir_legacy_dest legacy = nir_legacy_chase_dest(def);
+
+   if (legacy.is_ssa) {
+      ntr_store_def(c, legacy.ssa, src);
+      return;
+   }
+
+   struct ureg_dst reg_dst = ntr_get_chased_dest(c, &legacy);
+   ntr_MOV(c, reg_dst, src);
+}
+
+/* Emits a scalar TGSI op once for every channel enabled in dst's write mask.
+ *
+ * Each emitted instruction writes a single channel of dst and reads the same
+ * channel of src0 and src1, replicated with ureg_scalar().  src1 is only
+ * honoured for POW; for every other opcode it is replaced by src0.
+ */
+static void
+ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op,
+ struct ureg_dst dst,
+ struct ureg_src src0,
+ struct ureg_src src1)
+{
+ unsigned i;
+
+ /* POW is the only 2-operand scalar op. */
+ if (tgsi_op != TGSI_OPCODE_POW)
+ src1 = src0;
+
+ for (i = 0; i < 4; i++) {
+ if (dst.WriteMask & (1 << i)) {
+ ntr_insn(c, tgsi_op,
+ ureg_writemask(dst, 1 << i),
+ ureg_scalar(src0, i),
+ ureg_scalar(src1, i),
+ ureg_src_undef(), ureg_src_undef());
+ }
+ }
+}
+
+/* Translates one NIR ALU instruction into ntr instructions.
+ *
+ * Opcodes listed in op_map are emitted directly with the mapped TGSI opcode.
+ * Everything else goes through the switch below: source modifiers that were
+ * folded into their users emit nothing, scalar math ops are emitted per
+ * channel through ntr_emit_scalar(), and the remaining ops are expressed with
+ * MOV/ADD/MAX/LRP/CMP.  An fsat that folds into its source instruction is
+ * skipped entirely.
+ *
+ * c->precise mirrors instr->exact while the instruction is being emitted and
+ * is reset to false before a normal return.
+ */
+static void
+ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr)
+{
+   struct ureg_src src[4];
+   struct ureg_dst dst;
+   unsigned i;
+   int num_srcs = nir_op_infos[instr->op].num_inputs;
+
+   /* Don't try to translate folded fsat since their source won't be valid */
+   if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
+      return;
+
+   c->precise = instr->exact;
+
+   assert(num_srcs <= ARRAY_SIZE(src));
+   for (i = 0; i < num_srcs; i++)
+      src[i] = ntr_get_alu_src(c, instr, i);
+   for (; i < ARRAY_SIZE(src); i++)
+      src[i] = ureg_src_undef();
+
+   dst = ntr_get_alu_dest(c, &instr->def);
+
+   /* Read-only lookup table; entries not listed here are zero and fall
+    * through to the special-case switch below.
+    */
+   static const enum tgsi_opcode op_map[] = {
+      [nir_op_mov] = TGSI_OPCODE_MOV,
+
+      [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2,
+      [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3,
+      [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4,
+      [nir_op_ffloor] = TGSI_OPCODE_FLR,
+      [nir_op_ffract] = TGSI_OPCODE_FRC,
+      [nir_op_fceil] = TGSI_OPCODE_CEIL,
+      [nir_op_fround_even] = TGSI_OPCODE_ROUND,
+
+      [nir_op_slt] = TGSI_OPCODE_SLT,
+      [nir_op_sge] = TGSI_OPCODE_SGE,
+      [nir_op_seq] = TGSI_OPCODE_SEQ,
+      [nir_op_sne] = TGSI_OPCODE_SNE,
+
+      [nir_op_ftrunc] = TGSI_OPCODE_TRUNC,
+      [nir_op_fddx] = TGSI_OPCODE_DDX,
+      [nir_op_fddy] = TGSI_OPCODE_DDY,
+      [nir_op_fddx_coarse] = TGSI_OPCODE_DDX,
+      [nir_op_fddy_coarse] = TGSI_OPCODE_DDY,
+      [nir_op_fadd] = TGSI_OPCODE_ADD,
+      [nir_op_fmul] = TGSI_OPCODE_MUL,
+
+      [nir_op_fmin] = TGSI_OPCODE_MIN,
+      [nir_op_fmax] = TGSI_OPCODE_MAX,
+      [nir_op_ffma] = TGSI_OPCODE_MAD,
+   };
+
+   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) {
+      /* The normal path for NIR to TGSI ALU op translation */
+      ntr_insn(c, op_map[instr->op],
+               dst, src[0], src[1], src[2], src[3]);
+   } else {
+      /* Special cases for NIR to TGSI ALU op translation. */
+
+      /* TODO: Use something like the ntr_store() path for the MOV calls so we
+       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
+       */
+
+      switch (instr->op) {
+      case nir_op_fabs:
+         /* Try to eliminate */
+         if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
+            break;
+
+         if (c->options->lower_fabs)
+            ntr_MAX(c, dst, src[0], ureg_negate(src[0]));
+         else
+            ntr_MOV(c, dst, ureg_abs(src[0]));
+         break;
+
+      case nir_op_fsat:
+         ntr_MOV(c, ureg_saturate(dst), src[0]);
+         break;
+
+      case nir_op_fneg:
+         /* Try to eliminate */
+         if (nir_legacy_float_mod_folds(instr))
+            break;
+
+         ntr_MOV(c, dst, ureg_negate(src[0]));
+         break;
+
+         /* NOTE: TGSI 32-bit math ops have the old "one source channel
+          * replicated to all dst channels" behavior, while 64 is normal mapping
+          * of src channels to dst.
+          */
+      case nir_op_frcp:
+         ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_frsq:
+         ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_fexp2:
+         ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_flog2:
+         ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_fsin:
+         ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_fcos:
+         ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
+         break;
+
+      case nir_op_fsub:
+         ntr_ADD(c, dst, src[0], ureg_negate(src[1]));
+         break;
+
+      case nir_op_fmod:
+         unreachable("should be handled by .lower_fmod = true");
+         break;
+
+      case nir_op_fpow:
+         ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
+         break;
+
+      case nir_op_flrp:
+         ntr_LRP(c, dst, src[2], src[1], src[0]);
+         break;
+
+      case nir_op_fcsel:
+         /* If CMP isn't supported, then the flags that enable NIR to generate
+          * this opcode should also not be set.
+          */
+         assert(!c->options->lower_cmp);
+
+         /* Implement this as CMP(-abs(src0), src1, src2). */
+         ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
+         break;
+
+      case nir_op_fcsel_gt:
+         /* If CMP isn't supported, then the flags that enable NIR to generate
+          * these opcodes should also not be set.
+          */
+         assert(!c->options->lower_cmp);
+
+         ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
+         break;
+
+      case nir_op_fcsel_ge:
+         /* If CMP isn't supported, then the flags that enable NIR to generate
+          * these opcodes should also not be set.
+          */
+         assert(!c->options->lower_cmp);
+
+         /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
+         ntr_CMP(c, dst, src[0], src[2], src[1]);
+         break;
+
+      case nir_op_vec4:
+      case nir_op_vec3:
+      case nir_op_vec2:
+         unreachable("covered by nir_lower_vec_to_movs()");
+
+      default:
+         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
+         unreachable("Unknown NIR opcode");
+      }
+   }
+
+   c->precise = false;
+}
+
+/* Applies the offset in src to usrc.  A constant offset is folded into
+ * usrc.Index; anything else is loaded into address register addr_reg and
+ * attached as an indirect reference.
+ */
+static struct ureg_src
+ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc,
+                      nir_src src, int addr_reg)
+{
+   if (!nir_src_is_const(src)) {
+      struct ureg_src rel = ntr_reladdr(c, ntr_get_src(c, src), addr_reg);
+      return ureg_src_indirect(usrc, rel);
+   }
+
+   usrc.Index += ntr_src_as_uint(c, src);
+   return usrc;
+}
+
+/* Applies the offset in src to dst.  A constant offset is folded into
+ * dst.Index; anything else is loaded into address register 0 and attached as
+ * an indirect reference.
+ */
+static struct ureg_dst
+ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst,
+                      nir_src src)
+{
+   if (!nir_src_is_const(src)) {
+      struct ureg_src rel = ntr_reladdr(c, ntr_get_src(c, src), 0);
+      return ureg_dst_indirect(dst, rel);
+   }
+
+   dst.Index += ntr_src_as_uint(c, src);
+   return dst;
+}
+
+/* Sets the dimension of udst from src.  A constant src becomes a direct
+ * dimension index; anything else is loaded into address register 1 and used
+ * as an indirect dimension with a base of 0.
+ */
+static struct ureg_dst
+ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst,
+                                nir_src src)
+{
+   if (!nir_src_is_const(src)) {
+      struct ureg_src rel = ntr_reladdr(c, ntr_get_src(c, src), 1);
+      return ureg_dst_dimension_indirect(udst, rel, 0);
+   }
+
+   return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
+}
+/* Some load operations in NIR will have a fractional offset that we need to
+ * swizzle down before storing to the result register.
+ *
+ * Result channel k reads source channel frac + MIN2(num_components - 1, k),
+ * so channels past the last loaded component repeat that last component.
+ */
+static struct ureg_src
+ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
+{
+ return ureg_swizzle(src,
+ frac,
+ frac + MIN2(num_components - 1, 1),
+ frac + MIN2(num_components - 1, 2),
+ frac + MIN2(num_components - 1, 3));
+}
+
+
+/* Emits a UBO load as a reference to the TGSI CONSTANT file.
+ *
+ * instr->src[0] selects the buffer (the dimension) and instr->src[1] is added
+ * to the register index; each may be constant or indirect.  The intrinsic's
+ * component index is applied as a swizzle before the value is stored to the
+ * def.
+ */
+static void
+ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
+
+ struct ureg_dst addr_temp = ureg_dst_undef();
+
+ if (nir_src_is_const(instr->src[0])) {
+ src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
+ } else {
+ /* virglrenderer requires that indirect UBO references have the UBO
+ * array's base index in the Index field, not added to the indirect
+ * address.
+ *
+ * Many nir intrinsics have a base address const value for the start of
+ * their array indirection, but load_ubo doesn't. We fake it by
+ * subtracting it off here.
+ */
+ addr_temp = ntr_temp(c);
+ ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
+ src = ureg_src_dimension_indirect(src,
+ ntr_reladdr(c, ureg_src(addr_temp), 1),
+ c->first_ubo);
+ }
+
+ /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
+ * file.
+ */
+ src.Index = nir_intrinsic_base(instr);
+
+ if (nir_src_is_const(instr->src[1])) {
+ src.Index += ntr_src_as_uint(c, instr->src[1]);
+ } else {
+ src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0));
+ }
+
+ int start_component = nir_intrinsic_component(instr);
+
+ src = ntr_shift_by_frac(src, start_component, instr->num_components);
+
+ ntr_store(c, &instr->def, src);
+}
+
+/* Emits a load_input or load_interpolated_input intrinsic.
+ *
+ * In vertex shaders the input (and every further slot counted by
+ * semantics.num_slots) is declared on demand; in other stages the input comes
+ * from c->input_index_map[].  The component offset is applied as a swizzle
+ * and the offset source as a constant or indirect index.
+ *
+ * For interpolated inputs the barycentric intrinsic feeding src[0] decides
+ * whether the input is read directly or through an INTERP_* instruction.
+ * Any other intrinsic reaching this function hits unreachable().
+ */
+static void
+ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ uint32_t frac = nir_intrinsic_component(instr);
+ uint32_t num_components = instr->num_components;
+ unsigned base = nir_intrinsic_base(instr);
+ struct ureg_src input;
+ nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+
+ if (c->s->info.stage == MESA_SHADER_VERTEX) {
+ input = ureg_DECL_vs_input(c->ureg, base);
+ for (int i = 1; i < semantics.num_slots; i++)
+ ureg_DECL_vs_input(c->ureg, base + i);
+ } else {
+ input = c->input_index_map[base];
+ }
+
+ input = ntr_shift_by_frac(input, frac, num_components);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_input:
+ input = ntr_ureg_src_indirect(c, input, instr->src[0], 0);
+ ntr_store(c, &instr->def, input);
+ break;
+
+ case nir_intrinsic_load_interpolated_input: {
+ input = ntr_ureg_src_indirect(c, input, instr->src[1], 0);
+
+ nir_intrinsic_instr *bary_instr =
+ nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+
+ switch (bary_instr->intrinsic) {
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_sample:
+ /* For these, we know that the barycentric load matches the
+ * interpolation on the input declaration, so we can use it directly.
+ */
+ ntr_store(c, &instr->def, input);
+ break;
+
+ case nir_intrinsic_load_barycentric_centroid:
+ /* If the input was declared centroid, then there's no need to
+ * emit the extra TGSI interp instruction, we can just read the
+ * input.
+ */
+ if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
+ ntr_store(c, &instr->def, input);
+ } else {
+ ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input);
+ }
+ break;
+
+ case nir_intrinsic_load_barycentric_at_sample:
+ /* We stored the sample in the fake "bary" dest. */
+ ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input,
+ ntr_get_src(c, instr->src[0]));
+ break;
+
+ case nir_intrinsic_load_barycentric_at_offset:
+ /* We stored the offset in the fake "bary" dest. */
+ ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input,
+ ntr_get_src(c, instr->src[0]));
+ break;
+
+ default:
+ unreachable("bad barycentric interp intrinsic\n");
+ }
+ break;
+ }
+
+ default:
+ unreachable("bad load input intrinsic\n");
+ }
+}
+
+/* Emits a store_output / store_per_vertex_output intrinsic as a MOV into the
+ * output declaration.
+ *
+ * Nothing is emitted when the value being stored already lives in the OUTPUT
+ * file.  Otherwise the output is indexed by the offset source (plus the
+ * vertex index as a dimension for the per-vertex variant) and the source is
+ * reswizzled so that each written output channel i at or above frac reads
+ * source channel i - frac.
+ */
+static void
+ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ struct ureg_src src = ntr_get_src(c, instr->src[0]);
+
+ if (src.File == TGSI_FILE_OUTPUT) {
+ /* If our src is the output file, that's an indication that we were able
+ * to emit the output stores in the generating instructions and we have
+ * nothing to do here.
+ */
+ return;
+ }
+
+ uint32_t frac;
+ struct ureg_dst out = ntr_output_decl(c, instr, &frac);
+
+ if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
+ out = ntr_ureg_dst_indirect(c, out, instr->src[2]);
+ out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]);
+ } else {
+ out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
+ }
+
+ uint8_t swizzle[4] = { 0, 0, 0, 0 };
+ for (int i = frac; i < 4; i++) {
+ if (out.WriteMask & (1 << i))
+ swizzle[i] = i - frac;
+ }
+
+ src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+
+ ntr_MOV(c, out, src);
+}
+
+/* Emits a load_output / load_per_vertex_output intrinsic.
+ *
+ * The output declaration is indexed like in ntr_emit_store_output(), turned
+ * into a source, and read with FBFETCH when semantics.fb_fetch_output is set
+ * or with a plain MOV otherwise.
+ */
+static void
+ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+
+ /* ntr_try_store_in_tgsi_output() optimization is not valid if normal
+ * load_output is present.
+ */
+ assert(c->s->info.stage != MESA_SHADER_VERTEX &&
+ (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
+
+ uint32_t frac;
+ struct ureg_dst out = ntr_output_decl(c, instr, &frac);
+
+ if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
+ out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
+ out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]);
+ } else {
+ out = ntr_ureg_dst_indirect(c, out, instr->src[0]);
+ }
+
+ struct ureg_dst dst = ntr_get_dest(c, &instr->def);
+ struct ureg_src out_src = ureg_src(out);
+
+ /* Don't swizzle unavailable channels of the output into the writemasked-out
+ * components: point them at the first written channel instead. Avoids
+ * compile failures in virglrenderer with TESS_LEVEL_INNER.
+ */
+ int fill_channel = ffs(dst.WriteMask) - 1;
+ uint8_t swizzles[4] = { 0, 1, 2, 3 };
+ for (int i = 0; i < 4; i++)
+ if (!(dst.WriteMask & (1 << i)))
+ swizzles[i] = fill_channel;
+ out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
+
+ if (semantics.fb_fetch_output)
+ ntr_FBFETCH(c, dst, out_src);
+ else
+ ntr_MOV(c, dst, out_src);
+}
+
+/* Emits a load of a system value.
+ *
+ * The intrinsic is mapped to a TGSI semantic, the system value is declared,
+ * and its swizzle is restricted to the channels the def actually has.
+ * load_vertex_id and load_instance_id are converted with U2F; every other
+ * system value is stored as is.
+ */
+static void
+ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
+ enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
+ struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
+
+ /* virglrenderer doesn't like references to channels of the sysval that
+ * aren't defined, even if they aren't really read. (GLSL compile fails on
+ * gl_NumWorkGroups.w, for example).
+ */
+ uint32_t write_mask = BITSET_MASK(instr->def.num_components);
+ sv = ntr_swizzle_for_write_mask(sv, write_mask);
+
+ /* TGSI and NIR define these intrinsics as always loading ints, but they can
+ * still appear on hardware with non-native-integers fragment shaders using
+ * the draw path (i915g). In that case, having called nir_lower_int_to_float
+ * means that we actually want floats instead.
+ */
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_vertex_id:
+ case nir_intrinsic_load_instance_id:
+ ntr_U2F(c, ntr_get_dest(c, &instr->def), sv);
+ return;
+
+ default:
+ break;
+ }
+
+ ntr_store(c, &instr->def, sv);
+}
+
+/* Dispatches a NIR intrinsic to the matching emitter.
+ *
+ * Register intrinsics and the plain barycentric loads emit nothing here.
+ * An intrinsic that is not listed is printed to stderr and otherwise ignored;
+ * unlike ntr_emit_instr() and ntr_emit_jump(), this function does not abort.
+ */
+static void
+ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ubo_vec4:
+ ntr_emit_load_ubo(c, instr);
+ break;
+
+ /* Vertex */
+ case nir_intrinsic_load_draw_id:
+ case nir_intrinsic_load_invocation_id:
+ case nir_intrinsic_load_frag_coord:
+ case nir_intrinsic_load_point_coord:
+ case nir_intrinsic_load_front_face:
+ ntr_emit_load_sysval(c, instr);
+ break;
+
+ /* NOTE(review): load_per_vertex_input is routed to ntr_emit_load_input(),
+ * whose switch only handles load_input and load_interpolated_input and
+ * treats anything else as unreachable - confirm it cannot reach here.
+ */
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_per_vertex_input:
+ case nir_intrinsic_load_interpolated_input:
+ ntr_emit_load_input(c, instr);
+ break;
+
+ case nir_intrinsic_store_output:
+ case nir_intrinsic_store_per_vertex_output:
+ ntr_emit_store_output(c, instr);
+ break;
+
+ case nir_intrinsic_load_output:
+ case nir_intrinsic_load_per_vertex_output:
+ ntr_emit_load_output(c, instr);
+ break;
+
+ case nir_intrinsic_discard:
+ ntr_KILL(c);
+ break;
+
+ case nir_intrinsic_discard_if: {
+ struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0);
+ /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
+ ntr_KILL_IF(c, ureg_negate(cond));
+ break;
+ }
+ /* In TGSI we don't actually generate the barycentric coords, and emit
+ * interp intrinsics later. However, we do need to store the
+ * load_barycentric_at_* argument so that we can use it at that point.
+ */
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_centroid:
+ case nir_intrinsic_load_barycentric_sample:
+ break;
+ case nir_intrinsic_load_barycentric_at_sample:
+ case nir_intrinsic_load_barycentric_at_offset:
+ ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0]));
+ break;
+
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_load_reg_indirect:
+ case nir_intrinsic_store_reg:
+ case nir_intrinsic_store_reg_indirect:
+ /* fully consumed */
+ break;
+
+ default:
+ fprintf(stderr, "Unknown intrinsic: ");
+ nir_print_instr(&instr->instr, stderr);
+ fprintf(stderr, "\n");
+ break;
+ }
+}
+
+/* Accumulates the source operands of a texture instruction while it is being
+ * built.
+ */
+struct ntr_tex_operand_state {
+ struct ureg_src srcs[4]; /* operands collected so far */
+ unsigned i; /* number of srcs[] entries in use */
+};
+
+/* Appends the texture source of type tex_src_type to s, if the instruction
+ * has one; does nothing otherwise.
+ *
+ * There is no bounds check against the size of s->srcs[] here.
+ */
+static void
+ntr_push_tex_arg(struct ntr_compile *c,
+ nir_tex_instr *instr,
+ nir_tex_src_type tex_src_type,
+ struct ntr_tex_operand_state *s)
+{
+ int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
+ if (tex_src < 0)
+ return;
+
+ nir_src *src = &instr->src[tex_src].src;
+ s->srcs[s->i++] = ntr_get_src(c, *src);
+}
+
+/* Translates a NIR texture instruction into one TGSI texture instruction
+ * (followed by a MOV for query_levels).
+ *
+ * Steps, in order:
+ *  - pick the sampler: a handle source when both texture and sampler handles
+ *    are present, otherwise a declared sampler with an optional indirect
+ *    offset through address register 2;
+ *  - map the NIR texop to a TGSI opcode;
+ *  - collect the operands: backend1/backend2, the LOD for TXQ, derivatives for
+ *    txd, the tg4 component, then the sampler, padding with undef up to four;
+ *  - emit the instruction and fill in target, return type and offsets.
+ */
+static void
+ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr)
+{
+   struct ureg_dst dst = ntr_get_dest(c, &instr->def);
+   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
+   unsigned tex_opcode;
+
+   int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
+   int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
+
+   struct ureg_src sampler;
+   if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
+      /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
+      sampler = ntr_get_src(c, instr->src[tex_handle_src].src);
+      assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
+   } else {
+      assert(tex_handle_src == -1 && sampler_handle_src == -1);
+      sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
+      int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
+      if (sampler_src >= 0) {
+         struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src);
+         sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2));
+      }
+   }
+
+   switch (instr->op) {
+   case nir_texop_tex:
+      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
+          MAX2(instr->coord_components, 2) + instr->is_shadow)
+         tex_opcode = TGSI_OPCODE_TXP;
+      else
+         tex_opcode = TGSI_OPCODE_TEX;
+      break;
+   case nir_texop_txl:
+      tex_opcode = TGSI_OPCODE_TXL;
+      break;
+   case nir_texop_txb:
+      tex_opcode = TGSI_OPCODE_TXB;
+      break;
+   case nir_texop_txd:
+      tex_opcode = TGSI_OPCODE_TXD;
+      break;
+   case nir_texop_txs:
+      tex_opcode = TGSI_OPCODE_TXQ;
+      break;
+   case nir_texop_tg4:
+      tex_opcode = TGSI_OPCODE_TG4;
+      break;
+   case nir_texop_query_levels:
+      tex_opcode = TGSI_OPCODE_TXQ;
+      break;
+   case nir_texop_lod:
+      tex_opcode = TGSI_OPCODE_LODQ;
+      break;
+   case nir_texop_texture_samples:
+      tex_opcode = TGSI_OPCODE_TXQS;
+      break;
+   default:
+      unreachable("unsupported tex op");
+   }
+
+   struct ntr_tex_operand_state s = { .i = 0 };
+   ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
+   ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
+
+   /* non-coord arg for TXQ */
+   if (tex_opcode == TGSI_OPCODE_TXQ) {
+      ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s);
+      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
+       * scalar.
+       *
+       * ntr_push_tex_arg() pushes nothing when the instruction has no LOD
+       * source, so s.i may still be 0 here; s.i is unsigned, and indexing
+       * s.srcs[s.i - 1] in that case would read and write out of bounds.
+       */
+      if (s.i > 0)
+         s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
+   }
+
+   if (s.i > 1) {
+      if (tex_opcode == TGSI_OPCODE_TEX)
+         tex_opcode = TGSI_OPCODE_TEX2;
+      if (tex_opcode == TGSI_OPCODE_TXB)
+         tex_opcode = TGSI_OPCODE_TXB2;
+      if (tex_opcode == TGSI_OPCODE_TXL)
+         tex_opcode = TGSI_OPCODE_TXL2;
+   }
+
+   if (instr->op == nir_texop_txd) {
+      /* Derivs appear in their own src args */
+      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
+      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
+      s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src);
+      s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src);
+   }
+
+   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
+      if (c->screen->get_param(c->screen,
+                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
+         sampler = ureg_scalar(sampler, instr->component);
+         s.srcs[s.i++] = ureg_src_undef();
+      } else {
+         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
+      }
+   }
+
+   s.srcs[s.i++] = sampler;
+
+   enum tgsi_return_type tex_type;
+   switch (instr->dest_type) {
+   case nir_type_float32:
+      tex_type = TGSI_RETURN_TYPE_FLOAT;
+      break;
+   case nir_type_int32:
+      tex_type = TGSI_RETURN_TYPE_SINT;
+      break;
+   case nir_type_uint32:
+      tex_type = TGSI_RETURN_TYPE_UINT;
+      break;
+   default:
+      unreachable("unknown texture type");
+   }
+
+   /* query_levels is read from .w of a scratch temp and moved to dst below. */
+   struct ureg_dst tex_dst;
+   if (instr->op == nir_texop_query_levels)
+      tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W);
+   else
+      tex_dst = dst;
+
+   while (s.i < 4)
+      s.srcs[s.i++] = ureg_src_undef();
+
+   struct ntr_insn *insn = ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
+   insn->tex_target = target;
+   insn->tex_return_type = tex_type;
+   insn->is_tex = true;
+
+   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
+   if (tex_offset_src >= 0) {
+      struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src);
+
+      insn->tex_offset[0].File = offset.File;
+      insn->tex_offset[0].Index = offset.Index;
+      insn->tex_offset[0].SwizzleX = offset.SwizzleX;
+      insn->tex_offset[0].SwizzleY = offset.SwizzleY;
+      insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
+      insn->tex_offset[0].Padding = 0;
+   }
+
+   if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
+      for (uint8_t i = 0; i < 4; ++i) {
+         struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
+         insn->tex_offset[i].File = imm.File;
+         insn->tex_offset[i].Index = imm.Index;
+         insn->tex_offset[i].SwizzleX = imm.SwizzleX;
+         insn->tex_offset[i].SwizzleY = imm.SwizzleY;
+         insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
+      }
+   }
+
+   if (instr->op == nir_texop_query_levels)
+      ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
+}
+
+/* Emits a NIR jump: break becomes BRK and continue becomes CONT.  Any other
+ * jump type is printed to stderr and the process aborts.
+ */
+static void
+ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump)
+{
+ switch (jump->type) {
+ case nir_jump_break:
+ ntr_BRK(c);
+ break;
+
+ case nir_jump_continue:
+ ntr_CONT(c);
+ break;
+
+ default:
+ fprintf(stderr, "Unknown jump instruction: ");
+ nir_print_instr(&jump->instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+}
+
+/* Handles an undef instruction by allocating storage for its def; no
+ * instruction is emitted and the returned destination is discarded.
+ */
+static void
+ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr)
+{
+ /* Nothing to do but make sure that we have some storage to deref. */
+ (void)ntr_get_ssa_def_decl(c, &instr->def);
+}
+
+/* Dispatches one NIR instruction to the emitter for its type.  deref and
+ * load_const instructions emit nothing here; an instruction type that is not
+ * listed is printed to stderr and the process aborts.
+ */
+static void
+ntr_emit_instr(struct ntr_compile *c, nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_deref:
+ /* ignored, will be walked by nir_intrinsic_image_*_deref. */
+ break;
+
+ case nir_instr_type_alu:
+ ntr_emit_alu(c, nir_instr_as_alu(instr));
+ break;
+
+ case nir_instr_type_intrinsic:
+ ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
+ break;
+
+ case nir_instr_type_load_const:
+ /* Nothing to do here, as load consts are done directly from
+ * ntr_get_src() (since many constant NIR srcs will often get folded
+ * directly into a register file index instead of as a TGSI src).
+ */
+ break;
+
+ case nir_instr_type_tex:
+ ntr_emit_texture(c, nir_instr_as_tex(instr));
+ break;
+
+ case nir_instr_type_jump:
+ ntr_emit_jump(c, nir_instr_as_jump(instr));
+ break;
+
+ case nir_instr_type_undef:
+ ntr_emit_ssa_undef(c, nir_instr_as_undef(instr));
+ break;
+
+ default:
+ fprintf(stderr, "Unknown NIR instr type: ");
+ nir_print_instr(instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+}
+
+/* Emits IF / [ELSE] / ENDIF around the two branches of a NIR if.
+ *
+ * The condition is taken from c->if_cond, which ntr_emit_block() sets when it
+ * processes a block that is followed by an if.  ELSE is only emitted when the
+ * else list is not an empty block.
+ */
+static void
+ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt)
+{
+ ntr_IF(c, c->if_cond);
+
+ ntr_emit_cf_list(c, &if_stmt->then_list);
+
+ if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
+ ntr_ELSE(c);
+ ntr_emit_cf_list(c, &if_stmt->else_list);
+ }
+
+ ntr_ENDIF(c);
+}
+
+/* Emits BGNLOOP, the loop body, and ENDLOOP.  Loops with a continue construct
+ * are not supported (asserted).
+ */
+static void
+ntr_emit_loop(struct ntr_compile *c, nir_loop *loop)
+{
+ assert(!nir_loop_has_continue_construct(loop));
+ ntr_BGNLOOP(c);
+ ntr_emit_cf_list(c, &loop->body);
+ ntr_ENDLOOP(c);
+}
+
+/* Emits the ntr instructions for one NIR block.
+ *
+ * c->cur_block is pointed at the block's ntr_block, every instruction is
+ * translated, and after each one the ureg instruction count is checked to
+ * still be zero.  If the block is followed by an if, its condition is cached
+ * in c->if_cond for ntr_emit_if().
+ */
+static void
+ntr_emit_block(struct ntr_compile *c, nir_block *block)
+{
+ struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+ c->cur_block = ntr_block;
+
+ nir_foreach_instr(instr, block) {
+ ntr_emit_instr(c, instr);
+
+ /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */
+ if (ureg_get_instruction_number(c->ureg) != 0) {
+ fprintf(stderr, "Emitted ureg insn during: ");
+ nir_print_instr(instr, stderr);
+ fprintf(stderr, "\n");
+ unreachable("emitted ureg insn");
+ }
+ }
+
+ /* Set up the if condition for ntr_emit_if(), which we have to do before
+ * freeing up the temps (the "if" is treated as inside the block for liveness
+ * purposes, despite not being an instruction)
+ *
+ * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
+ * looks at all of .xyzw. No harm in working around the bug.
+ */
+ nir_if *nif = nir_block_get_following_if(block);
+ if (nif)
+ c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X);
+}
+
+/* Walks a control-flow list and emits ntr instructions for every node,
+ * dispatching blocks, ifs and loops to their emitters.
+ */
+static void
+ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list)
+{
+ foreach_list_typed(nir_cf_node, node, node, list) {
+ switch (node->type) {
+ case nir_cf_node_block:
+ ntr_emit_block(c, nir_cf_node_as_block(node));
+ break;
+
+ case nir_cf_node_if:
+ ntr_emit_if(c, nir_cf_node_as_if(node));
+ break;
+
+ case nir_cf_node_loop:
+ ntr_emit_loop(c, nir_cf_node_as_loop(node));
+ break;
+
+ default:
+ unreachable("unknown CF type");
+ }
+ }
+}
+
+/* Replays the ntr instructions recorded for a block into tgsi_ureg.
+ *
+ * Control-flow opcodes go through their dedicated ureg helpers so that labels
+ * can be recorded in c->cf_label and fixed up through c->current_if_else.
+ * Texture instructions use ureg_tex_insn() with as many offsets as the
+ * highest populated tex_offset[] entry requires; everything else is a plain
+ * ureg_insn().
+ */
+static void
+ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block)
+{
+ struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
+
+ /* Emit the ntr insns to tgsi_ureg. */
+ util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
+ const struct tgsi_opcode_info *opcode_info =
+ tgsi_get_opcode_info(insn->opcode);
+
+ switch (insn->opcode) {
+ case TGSI_OPCODE_IF:
+ ureg_IF(c->ureg, insn->src[0], &c->cf_label);
+ break;
+
+ case TGSI_OPCODE_ELSE:
+ ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
+ ureg_ELSE(c->ureg, &c->cf_label);
+ c->current_if_else = c->cf_label;
+ break;
+
+ case TGSI_OPCODE_ENDIF:
+ ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
+ ureg_ENDIF(c->ureg);
+ break;
+
+ case TGSI_OPCODE_BGNLOOP:
+ /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
+ * does reference BGNLOOP's. Follow the former behavior unless something comes up
+ * with a need.
+ */
+ ureg_BGNLOOP(c->ureg, &c->cf_label);
+ break;
+
+ case TGSI_OPCODE_ENDLOOP:
+ ureg_ENDLOOP(c->ureg, &c->cf_label);
+ break;
+
+ default:
+ if (insn->is_tex) {
+ /* Offsets up to the last entry whose file is not NULL are passed. */
+ int num_offsets = 0;
+ for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+ if (insn->tex_offset[i].File != TGSI_FILE_NULL)
+ num_offsets = i + 1;
+ }
+ ureg_tex_insn(c->ureg, insn->opcode,
+ insn->dst, opcode_info->num_dst,
+ insn->tex_target, insn->tex_return_type,
+ insn->tex_offset,
+ num_offsets,
+ insn->src, opcode_info->num_src);
+ } else {
+ ureg_insn(c->ureg, insn->opcode,
+ insn->dst, opcode_info->num_dst,
+ insn->src, opcode_info->num_src,
+ insn->precise);
+ }
+ }
+ }
+}
+
+/* Emits the ureg tokens for both branches of an if.
+ *
+ * c->current_if_else is saved, pointed at the label recorded for this if's IF
+ * opcode, and restored afterwards so that enclosing ifs keep their own label.
+ */
+static void
+ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt)
+{
+ /* Note: the last block emitted our IF opcode. */
+
+ int if_stack = c->current_if_else;
+ c->current_if_else = c->cf_label;
+
+ /* Either the then or else block includes the ENDIF, which will fix up the
+ * IF(/ELSE)'s label for jumping
+ */
+ ntr_emit_cf_list_ureg(c, &if_stmt->then_list);
+ ntr_emit_cf_list_ureg(c, &if_stmt->else_list);
+
+ c->current_if_else = if_stack;
+}
+
+/* Walks a control-flow list and emits the ureg tokens for every node.  Loops
+ * need no work of their own here: only their body is walked.
+ */
+static void
+ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list)
+{
+ foreach_list_typed(nir_cf_node, node, node, list) {
+ switch (node->type) {
+ case nir_cf_node_block:
+ ntr_emit_block_ureg(c, nir_cf_node_as_block(node));
+ break;
+
+ case nir_cf_node_if:
+ ntr_emit_if_ureg(c, nir_cf_node_as_if(node));
+ break;
+
+ case nir_cf_node_loop:
+ /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
+ * does reference BGNLOOP's. Follow the former behavior unless something comes up
+ * with a need.
+ */
+ ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
+ break;
+
+ default:
+ unreachable("unknown CF type");
+ }
+ }
+}
+
+/* Translates one function implementation.
+ *
+ * Allocates the per-impl SSA and register tables (both sized by
+ * impl->ssa_alloc) and one ntr_block per NIR block, sets up registers,
+ * inputs, outputs and uniforms, emits the ntr instructions, runs register
+ * allocation, and finally turns the ntr instructions into ureg tokens.
+ * c->liveness is freed and cleared at the end.
+ */
+static void
+ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl)
+{
+ c->impl = impl;
+
+ c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
+ c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
+
+ /* Set up the struct ntr_blocks to put insns in */
+ c->blocks = _mesa_pointer_hash_table_create(c);
+ nir_foreach_block(block, impl) {
+ struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block);
+ util_dynarray_init(&ntr_block->insns, ntr_block);
+ _mesa_hash_table_insert(c->blocks, block, ntr_block);
+ }
+
+
+ ntr_setup_registers(c);
+
+ c->cur_block = ntr_block_from_nir(c, nir_start_block(impl));
+ ntr_setup_inputs(c);
+ ntr_setup_outputs(c);
+ ntr_setup_uniforms(c);
+
+ /* Emit the ntr insns */
+ ntr_emit_cf_list(c, &impl->body);
+
+ /* Don't do optimized RA if the driver requests it, unless the number of
+ * temps is too large to be covered by the 16 bit signed int that TGSI
+ * allocates for the register index */
+ if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
+ ntr_allocate_regs(c, impl);
+ else
+ ntr_allocate_regs_unoptimized(c, impl);
+
+ /* Turn the ntr insns into actual TGSI tokens */
+ ntr_emit_cf_list_ureg(c, &impl->body);
+
+ ralloc_free(c->liveness);
+ c->liveness = NULL;
+
+}
+
+ /* Size callback handed to nir_lower_io: returns the number of attribute slots
+  * of the type. The bindless argument is ignored.
+  */
+ static int
+ type_size(const struct glsl_type *type, bool bindless)
+ {
+ return glsl_count_attribute_slots(type, false);
+ }
+
+ /* Allow vectorizing of ALU instructions: returns 4 for an ALU instruction and
+  * 0 for every other instruction type. The data pointer is unused.
+  */
+ static uint8_t
+ ntr_should_vectorize_instr(const nir_instr *instr, const void *data)
+ {
+    return instr->type == nir_instr_type_alu ? 4 : 0;
+ }
+
+ /* Decides whether two memory accesses may be merged into one vector access.
+  * Only align, bit_size and num_components are examined; the remaining
+  * arguments are unused.
+  */
+ static bool
+ ntr_should_vectorize_io(unsigned align, unsigned bit_size,
+                         unsigned num_components, unsigned high_offset,
+                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                         void *data)
+ {
+    /* Only 32-bit accesses qualify, and our offset alignment should always be
+     * at least 4 bytes.
+     */
+    if (bit_size != 32 || align < 4)
+       return false;
+
+    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
+     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
+     * need this restriction.
+     */
+    unsigned worst_start_component = (align == 4) ? 3 : align / 4;
+
+    return worst_start_component + num_components <= 4;
+ }
+
+ /* Builds the mask of variable modes for which the screen reports no indirect
+  * addressing support for this shader's stage: shader inputs, shader outputs
+  * and function temporaries are each queried through get_shader_param().
+  */
+ static nir_variable_mode
+ ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
+ {
+ unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
+ unsigned indirect_mask = 0;
+
+ if (!screen->get_shader_param(screen, pipe_stage,
+ PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
+ indirect_mask |= nir_var_shader_in;
+ }
+
+ if (!screen->get_shader_param(screen, pipe_stage,
+ PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
+ indirect_mask |= nir_var_shader_out;
+ }
+
+ if (!screen->get_shader_param(screen, pipe_stage,
+ PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
+ indirect_mask |= nir_var_function_temp;
+ }
+
+ return indirect_mask;
+ }
+
+ /* Accumulator used while gathering texture sources into backend vectors. */
+ struct ntr_lower_tex_state {
+ /* Scalars gathered so far; slots that were skipped stay zero-initialized. */
+ nir_scalar channels[8];
+ /* Index of the next slot to fill in channels. */
+ unsigned i;
+ };
+
+ /* Moves one texture source into the channel accumulator.
+  *
+  * If the instruction has a source of type tex_src_type, every component of
+  * it is appended to s->channels (advancing s->i) and the source is removed
+  * from the instruction.  Nothing happens when the source is absent.  The
+  * builder argument is unused.
+  */
+ static void
+ nir_to_rc_lower_tex_instr_arg(nir_builder *b,
+                               nir_tex_instr *instr,
+                               nir_tex_src_type tex_src_type,
+                               struct ntr_lower_tex_state *s)
+ {
+    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
+    if (tex_src < 0)
+       return;
+
+    nir_def *def = instr->src[tex_src].src.ssa;
+    for (int i = 0; i < def->num_components; i++) {
+       /* channels is a fixed-size array; catch overflow in debug builds. */
+       assert(s->i < ARRAY_SIZE(s->channels));
+       s->channels[s->i++] = nir_get_scalar(def, i);
+    }
+
+    nir_tex_instr_remove_src(instr, tex_src);
+ }
+
+/**
+ * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
+ * src. This lets NIR handle the coalescing of the vec4 rather than trying to
+ * manage it on our own, and may lead to more vectorization.
+ */
+static bool
+nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_tex)
+ return false;
+
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+ if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
+ return false;
+
+ b->cursor = nir_before_instr(instr);
+
+ struct ntr_lower_tex_state s = {0};
+
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
+ /* We always have at least two slots for the coordinate, even on 1D. */
+ s.i = MAX2(s.i, 2);
+
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
+ s.i = MAX2(s.i, 3);
+
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
+
+ /* XXX: LZ */
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
+ nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
+
+ /* No need to pack undefs in unused channels of the tex instr */
+ while (!s.channels[s.i - 1].def)
+ s.i--;
+
+ /* Instead of putting undefs in the unused slots of the vecs, just put in
+ * another used channel. Otherwise, we'll get unnecessary moves into
+ * registers.
+ */
+ assert(s.channels[0].def != NULL);
+ for (int i = 1; i < s.i; i++) {
+ if (!s.channels[i].def)
+ s.channels[i] = s.channels[0];
+ }
+
+ nir_tex_instr_add_src(tex, nir_tex_src_backend1,
+ nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
+ if (s.i > 4)
+ nir_tex_instr_add_src(tex, nir_tex_src_backend2,
+ nir_vec_scalars(b, &s.channels[4], s.i - 4));
+
+ return true;
+}
+
+ /* Runs nir_to_rc_lower_tex_instr over every instruction of the shader and
+  * returns the result of nir_shader_instructions_pass.  Block-index and
+  * dominance metadata are declared as preserved.
+  */
+ static bool
+ nir_to_rc_lower_tex(nir_shader *s)
+ {
+ return nir_shader_instructions_pass(s,
+ nir_to_rc_lower_tex_instr,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ NULL);
+ }
+
+ /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP.
+  *
+  * Scans the entrypoint for texture instructions that carry a projector and
+  * collects, per sampler dimension, a bit in lower_txp for those that cannot
+  * be expressed directly; nir_lower_tex is then run with that mask.
+  */
+ static void
+ nir_to_rc_lower_txp(nir_shader *s)
+ {
+ nir_lower_tex_options lower_tex_options = {
+ .lower_txp = 0,
+ };
+
+ nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_tex)
+ continue;
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+ if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
+ continue;
+
+ bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
+ /* Every non-fragment stage is treated as having an LOD. */
+ bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
+ bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
+
+ /* We can do TXP for any tex (not txg) where we can fit all the
+ * coordinates and comparator and projector in one vec4 without any
+ * other modifiers to add on.
+ *
+ * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
+ * if we get any funny projectors then we just blow them all away.
+ */
+ if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
+ lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
+ }
+ }
+
+ /* nir_lower_tex must be run even if no options are set, because we need the
+ * LOD to be set for query_levels and for non-fragment shaders.
+ */
+ NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
+ }
+
+ /* Convenience wrapper around nir_to_rc_options() that passes an all-zero
+  * option set.
+  */
+ const void *
+ nir_to_rc(struct nir_shader *s,
+ struct pipe_screen *screen)
+ {
+ static const struct nir_to_rc_options default_ntr_options = {0};
+ return nir_to_rc_options(s, screen, &default_ntr_options);
+ }
+
+/**
+ * Translates the NIR shader to TGSI.
+ *
+ * This requires some lowering of the NIR shader to prepare it for translation.
+ * We take ownership of the NIR shader passed, returning a reference to the new
+ * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
+ */
+const void *nir_to_rc_options(struct nir_shader *s,
+ struct pipe_screen *screen,
+ const struct nir_to_rc_options *options)
+{
+ struct ntr_compile *c;
+ const void *tgsi_tokens;
+ nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen);
+
+ /* Lower array indexing on FS inputs. Since we don't set
+ * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
+ * elements by ureg, and so dynamically indexing them would be invalid.
+ * Ideally we would set that ureg flag based on
+ * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
+ * splitting NIR VS outputs to elements even if the FS doesn't get the
+ * corresponding splitting, and virgl depends on TGSI across link boundaries
+ * having matching declarations.
+ */
+ if (s->info.stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
+ NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ }
+
+ NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size, (nir_lower_io_options)0);
+
+ nir_to_rc_lower_txp(s);
+ NIR_PASS_V(s, nir_to_rc_lower_tex);
+
+ if (!s->options->lower_uniforms_to_ubo) {
+ NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
+ screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
+ true);
+ }
+
+ if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
+ NIR_PASS_V(s, nir_lower_ubo_vec4);
+
+ bool progress;
+ NIR_PASS_V(s, nir_opt_constant_folding);
+
+ /* Clean up after triginometric input normalization. */
+ NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
+ do {
+ progress = false;
+ NIR_PASS(progress, s, nir_opt_shrink_vectors);
+ } while (progress);
+ NIR_PASS_V(s, nir_copy_prop);
+ NIR_PASS_V(s, nir_opt_cse);
+ NIR_PASS_V(s, nir_opt_dce);
+ NIR_PASS_V(s, nir_opt_shrink_stores, true);
+
+ NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
+
+ /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
+ NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
+
+ NIR_PASS_V(s, nir_lower_frexp);
+
+ do {
+ progress = false;
+ NIR_PASS(progress, s, nir_opt_algebraic_late);
+ if (progress) {
+ NIR_PASS_V(s, nir_copy_prop);
+ NIR_PASS_V(s, nir_opt_dce);
+ NIR_PASS_V(s, nir_opt_cse);
+ }
+ } while (progress);
+
+ if (s->info.stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS_V(s, r300_nir_prepare_presubtract);
+ NIR_PASS_V(s, r300_nir_clean_double_fneg);
+ }
+
+ NIR_PASS_V(s, nir_lower_int_to_float);
+ NIR_PASS_V(s, nir_lower_bool_to_float,
+ !options->lower_cmp && !options->lower_fabs);
+ /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
+ NIR_PASS_V(s, nir_copy_prop);
+ NIR_PASS_V(s, nir_opt_dce);
+
+ nir_move_options move_all =
+ nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
+ nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
+
+ NIR_PASS_V(s, nir_opt_move, move_all);
+ NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true);
+
+ NIR_PASS_V(s, nir_convert_from_ssa, true);
+ NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL);
+
+ /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
+ */
+ NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
+ NIR_PASS_V(s, nir_opt_dce);
+
+ /* See comment in ntr_get_alu_src for supported modifiers */
+ NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
+
+ if (NIR_DEBUG(TGSI)) {
+ fprintf(stderr, "NIR before translation to TGSI:\n");
+ nir_print_shader(s, stderr);
+ }
+
+ c = rzalloc(NULL, struct ntr_compile);
+ c->screen = screen;
+ c->options = options;
+
+ c->s = s;
+ c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
+ ureg_setup_shader_info(c->ureg, &s->info);
+ if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
+ ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
+
+ if (s->info.stage == MESA_SHADER_FRAGMENT) {
+ /* The draw module's polygon stipple layer doesn't respect the chosen
+ * coordinate mode, so leave it as unspecified unless we're actually
+ * reading the position in the shader already. See
+ * gl-2.1-polygon-stipple-fs on softpipe.
+ */
+ if ((s->info.inputs_read & VARYING_BIT_POS) ||
+ BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
+ ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
+ s->info.fs.origin_upper_left ?
+ TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
+ TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
+
+ ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
+ s->info.fs.pixel_center_integer ?
+ TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
+ TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
+ }
+ }
+ /* Emit the main function */
+ nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
+ ntr_emit_impl(c, impl);
+ ureg_END(c->ureg);
+
+ tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
+
+ if (NIR_DEBUG(TGSI)) {
+ fprintf(stderr, "TGSI after translation from NIR:\n");
+ tgsi_dump(tgsi_tokens, 0);
+ }
+
+ ureg_destroy(c->ureg);
+
+ ralloc_free(c);
+ ralloc_free(s);
+
+ return tgsi_tokens;
+}
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h
new file mode 100644
index 000000000..ebbe87770
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/nir_to_rc.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NIR_TO_RC_H
+#define NIR_TO_RC_H
+
+ #include <stdbool.h>
+ #include <stdint.h>
+ #include "pipe/p_defines.h"
+
+ struct nir_shader;
+ struct pipe_screen;
+ struct pipe_shader_state;
+
+ /* Options controlling the NIR translation done by nir_to_rc_options(). */
+ struct nir_to_rc_options {
+    /* Together with lower_fabs, decides the flag passed to
+     * nir_lower_bool_to_float (set only when both are false).
+     */
+    bool lower_cmp;
+    /* Emit MAX(a,-a) instead of abs src modifier */
+    bool lower_fabs;
+    /* Request the unoptimized register allocator; ignored when the number of
+     * temporaries exceeds 0x7fff.
+     */
+    bool unoptimized_ra;
+    bool lower_ssbo_bindings;
+    uint32_t ubo_vec4_max;
+ };
+
+const void *nir_to_rc(struct nir_shader *s,
+ struct pipe_screen *screen);
+
+const void *nir_to_rc_options(struct nir_shader *s,
+ struct pipe_screen *screen,
+ const struct nir_to_rc_options *ntr_options);
+
+#endif /* NIR_TO_RC_H */
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c
new file mode 100644
index 000000000..05f7b8c59
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2023 Pavel Ondračka <pavel.ondracka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_nir.h"
+
+#include "r300_screen.h"
+
+ /* Returns 4 for ALU instructions and 0 for any other instruction type.  The
+  * data pointer is unused.
+  */
+ static unsigned char
+ r300_should_vectorize_instr(const nir_instr *instr, const void *data)
+ {
+    return instr->type == nir_instr_type_alu ? 4 : 0;
+ }
+
+ /* Callback for nir_opt_load_store_vectorize deciding whether two accesses
+  * may be merged.  Only align, bit_size and num_components are examined.
+  */
+ static bool
+ r300_should_vectorize_io(unsigned align, unsigned bit_size,
+                          unsigned num_components, unsigned high_offset,
+                          nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                          void *data)
+ {
+    /* Only 32-bit accesses qualify, and our offset alignment should always be
+     * at least 4 bytes.
+     */
+    if (bit_size != 32 || align < 4)
+       return false;
+
+    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
+     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
+     * need this restriction.
+     */
+    unsigned worst_start_component = (align == 4) ? 3 : align / 4;
+
+    return worst_start_component + num_components <= 4;
+ }
+
+ /* Runs the driver's NIR optimization loop.
+  *
+  * The passes inside the do/while are repeated until none of them reports
+  * progress; afterwards variable copies are lowered and dead function
+  * temporaries are removed.
+  */
+ static void
+ r300_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
+ {
+ bool is_r500 = r300_screen(screen)->caps.is_r500;
+
+ bool progress;
+ do {
+ progress = false;
+
+ NIR_PASS_V(s, nir_lower_vars_to_ssa);
+
+ NIR_PASS(progress, s, nir_copy_prop);
+ NIR_PASS(progress, s, nir_opt_algebraic);
+ /* Vertex-shader-only passes; bool-to-float lowering is skipped on R500. */
+ if (s->info.stage == MESA_SHADER_VERTEX) {
+ if (!is_r500)
+ NIR_PASS(progress, s, r300_nir_lower_bool_to_float);
+ NIR_PASS(progress, s, r300_nir_fuse_fround_d3d9);
+ }
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ NIR_PASS(progress, s, nir_opt_remove_phis);
+ NIR_PASS(progress, s, nir_opt_conditional_discard);
+ NIR_PASS(progress, s, nir_opt_dce);
+ NIR_PASS(progress, s, nir_opt_dead_cf);
+ NIR_PASS(progress, s, nir_opt_cse);
+ NIR_PASS(progress, s, nir_opt_find_array_copies);
+ NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, s, nir_opt_dead_write_vars);
+
+ NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false);
+ /* NOTE(review): non-R500 passes ~0 as the limit, presumably so that every
+ * if gets flattened (see r300_check_control_flow) - confirm.
+ */
+ NIR_PASS(progress, s, nir_opt_peephole_select, is_r500 ? 8 : ~0, true, true);
+ NIR_PASS(progress, s, nir_opt_algebraic);
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ nir_load_store_vectorize_options vectorize_opts = {
+ .modes = nir_var_mem_ubo,
+ .callback = r300_should_vectorize_io,
+ .robust_modes = 0,
+ };
+ NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
+ NIR_PASS(progress, s, nir_opt_shrink_stores, true);
+ NIR_PASS(progress, s, nir_opt_shrink_vectors);
+ NIR_PASS(progress, s, nir_opt_trivial_continues);
+ NIR_PASS(progress, s, nir_opt_vectorize, r300_should_vectorize_instr, NULL);
+ NIR_PASS(progress, s, nir_opt_undef);
+ /* Undefs are only lowered to zero once no earlier pass in this iteration
+ * has made progress.
+ */
+ if(!progress)
+ NIR_PASS(progress, s, nir_lower_undef_to_zero);
+ NIR_PASS(progress, s, nir_opt_loop_unroll);
+
+ /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
+ * and ALU ops for it.
+ */
+ nir_opt_offsets_options offset_options = {
+ .ubo_vec4_max = 255,
+
+ /* No const offset in TGSI for shared accesses. */
+ .shared_max = 0,
+
+ /* unused intrinsics */
+ .uniform_max = 0,
+ .buffer_max = 0,
+ };
+
+ NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
+ } while (progress);
+
+ NIR_PASS_V(s, nir_lower_var_copies);
+ NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp,
+ NULL);
+ }
+
+ /* Checks whether the entrypoint consists of a single block.
+  *
+  * Looks at the control-flow node that follows the start block: if there is
+  * one, a static message describing it is returned, otherwise NULL.  The
+  * returned strings are literals and must not be modified or freed.
+  */
+ static char *r300_check_control_flow(nir_shader *s)
+ {
+ nir_function_impl *impl = nir_shader_get_entrypoint(s);
+ nir_block *first = nir_start_block(impl);
+ nir_cf_node *next = nir_cf_node_next(&first->cf_node);
+
+ if (next) {
+ switch (next->type) {
+ case nir_cf_node_if:
+ return "If/then statements not supported by R300/R400 shaders, should have been flattened by peephole_select.";
+ case nir_cf_node_loop:
+ return "Looping not supported R300/R400 shaders, all loops must be statically unrollable.";
+ default:
+ return "Unknown control flow type";
+ }
+ }
+
+ return NULL;
+ }
+
+ /* Optimizes the shader, strips uniform variables and validates control flow.
+  *
+  * Returns NULL on success.  On non-R500 hardware, when the screen has TCL or
+  * the shader is a fragment shader, leftover control flow yields a
+  * strdup()-allocated error message instead.
+  */
+ char *
+ r300_finalize_nir(struct pipe_screen *pscreen, void *nir)
+ {
+ nir_shader *s = nir;
+
+ r300_optimize_nir(s, pscreen);
+
+ /* st_program.c's parameter list optimization requires that future nir
+ * variants don't reallocate the uniform storage, so we have to remove
+ * uniforms that occupy storage. But we don't want to remove samplers,
+ * because they're needed for YUV variant lowering.
+ */
+ nir_remove_dead_derefs(s);
+ nir_foreach_uniform_variable_safe(var, s) {
+ /* Keep uniform-mode variables that contain images or samplers. */
+ if (var->data.mode == nir_var_uniform &&
+ (glsl_type_get_image_count(var->type) ||
+ glsl_type_get_sampler_count(var->type)))
+ continue;
+
+ exec_node_remove(&var->node);
+ }
+ nir_validate_shader(s, "after uniform var removal");
+
+ nir_sweep(s);
+
+ if (!r300_screen(pscreen)->caps.is_r500 &&
+ (r300_screen(pscreen)->caps.has_tcl || s->info.stage == MESA_SHADER_FRAGMENT)) {
+ char *msg = r300_check_control_flow(s);
+ /* msg is a string literal; hand back a heap copy. */
+ if (msg)
+ return strdup(msg);
+ }
+
+ return NULL;
+ }
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h
new file mode 100644
index 000000000..916eb08fd
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2023 Pavel Ondračka <pavel.ondracka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_NIR_H
+#define R300_NIR_H
+
+#include "pipe/p_screen.h"
+#include "compiler/nir/nir.h"
+
+ /* Source predicate referenced from r300_nir_algebraic.py as
+  * 'is_ubo_or_input': returns true when source number src of the ALU
+  * instruction is produced by a load_ubo_vec4, load_input or
+  * load_interpolated_input intrinsic.  The ht, num_components and swizzle
+  * arguments are not used.
+  */
+ static inline bool
+ is_ubo_or_input(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
+ unsigned src, unsigned num_components,
+ const uint8_t *swizzle)
+ {
+ nir_instr *parent = instr->src[src].src.ssa->parent_instr;
+ if (parent->type != nir_instr_type_intrinsic)
+ return false;
+
+ nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(parent);
+
+ switch (intrinsic->intrinsic) {
+ case nir_intrinsic_load_ubo_vec4:
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_interpolated_input:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+char *r300_finalize_nir(struct pipe_screen *pscreen, void *nir);
+
+extern bool r300_transform_vs_trig_input(struct nir_shader *shader);
+
+extern bool r300_transform_fs_trig_input(struct nir_shader *shader);
+
+extern bool r300_nir_fuse_fround_d3d9(struct nir_shader *shader);
+
+extern bool r300_nir_lower_bool_to_float(struct nir_shader *shader);
+
+extern bool r300_nir_prepare_presubtract(struct nir_shader *shader);
+
+extern bool r300_nir_clean_double_fneg(struct nir_shader *shader);
+
+#endif /* R300_NIR_H */
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
index ec6f85adf..f03b8eaf6 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
@@ -25,6 +25,13 @@ import argparse
import sys
from math import pi
+ # Convenience variables: bare names for the single-letter pattern variables
+ # used in the search/replace expressions below.
+ a = 'a'
+ b = 'b'
+ c = 'c'
+ d = 'd'
+ e = 'e'
+
# Transform input to range [-PI, PI]:
#
# y = frac(x / 2PI + 0.5) * 2PI - PI
@@ -43,6 +50,50 @@ transform_trig_input_fs_r500 = [
(('fcos', 'a'), ('fcos', ('ffract', ('fmul', 'a', 1 / (2 * pi))))),
]
+ # This is a pattern produced by wined3d for A0 register load.
+ # The specific pattern wined3d emits looks like this
+ # A0.x = (int(floor(abs(R0.x) + 0.5) * sign(R0.x)));
+ # however we lower both sign and floor so here we check for the already lowered
+ # sequence.
+ r300_nir_fuse_fround_d3d9 = [
+ (('fmul', ('fadd', ('fadd', ('fabs', 'a') , 0.5),
+ ('fneg', ('ffract', ('fadd', ('fabs', 'a') , 0.5)))),
+ ('fadd', ('b2f', ('!flt', 0.0, 'a')),
+ ('fneg', ('b2f', ('!flt', 'a', 0.0))))),
+ ('fround_even', 'a'))
+ ]
+
+ # Here are some specific optimizations for code reordering such that the backend
+ # has an easier task of recognizing output modifiers and presubtract patterns.
+ # Every replacement must evaluate to the same value as the expression it
+ # replaces; only the operand order and the placement of negations change.
+ r300_nir_prepare_presubtract = [
+     # Backend can only recognize 1 - x pattern.
+     (('fadd', ('fneg', a), 1.0), ('fadd', 1.0, ('fneg', a))),
+     (('fadd', a, -1.0), ('fneg', ('fadd', 1.0, ('fneg', a)))),
+     (('fadd', -1.0, a), ('fneg', ('fadd', 1.0, ('fneg', a)))),
+     # Bias presubtract 1 - 2 * x expects MAD -a 2.0 1.0 form.
+     (('ffma', 2.0, ('fneg', a), 1.0), ('ffma', ('fneg', a), 2.0, 1.0)),
+     # a * -2 + 1 is already 1 - 2 * a, so no outer negation here.
+     (('ffma', a, -2.0, 1.0), ('ffma', ('fneg', a), 2.0, 1.0)),
+     (('ffma', -2.0, a, 1.0), ('ffma', ('fneg', a), 2.0, 1.0)),
+     # 2 * a - 1 is -(1 - 2 * a), hence the outer fneg.
+     (('ffma', 2.0, a, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
+     (('ffma', a, 2.0, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
+     # x * 2 can be usually folded into output modifier for the previous
+     # instruction, but that only works if x is a temporary. If it is input or
+     # constant just convert it to add instead.
+     (('fmul', 'a(is_ubo_or_input)', 2.0), ('fadd', a, a)),
+ ]
+
+ # For each listed multiplier, move the constant multiplication from the
+ # ubo/input operand to the outer product: a * (b * m) -> m * (a * b).
+ for multiplier in [2.0, 4.0, 8.0, 16.0, 0.5, 0.25, 0.125, 0.0625]:
+     r300_nir_prepare_presubtract.extend([
+         (('fmul', a, ('fmul(is_used_once)', 'b(is_ubo_or_input)', multiplier)), ('fmul', multiplier, ('fmul', a, b))),
+     ])
+
+ # The previous prepare_presubtract pass can sometimes produce double fneg
+ # patterns. The backend copy propagate could handle it, but the nir to tgsi
+ # translation does not and blows up. Just run a simple pass to clean it up.
+ r300_nir_clean_double_fneg = [
+ (('fneg', ('fneg', a)), a)
+ ]
+
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--import-path', required=True)
@@ -51,9 +102,25 @@ def main():
sys.path.insert(0, args.import_path)
import nir_algebraic # pylint: disable=import-error
+ ignore_exact = nir_algebraic.ignore_exact
+
+ r300_nir_lower_bool_to_float = [
+ (('bcsel@32(is_only_used_as_float)', ignore_exact('feq', 'a@32', 'b@32'), c, d),
+ ('fadd', ('fmul', c, ('seq', a, b)), ('fsub', d, ('fmul', d, ('seq', a, b)))),
+ "!options->has_fused_comp_and_csel"),
+ (('bcsel@32(is_only_used_as_float)', ignore_exact('fneu', 'a@32', 'b@32'), c, d),
+ ('fadd', ('fmul', c, ('sne', a, b)), ('fsub', d, ('fmul', d, ('sne', a, b)))),
+ "!options->has_fused_comp_and_csel"),
+ (('bcsel@32(is_only_used_as_float)', ignore_exact('flt', 'a@32', 'b@32'), c, d),
+ ('fadd', ('fmul', c, ('slt', a, b)), ('fsub', d, ('fmul', d, ('slt', a, b)))),
+ "!options->has_fused_comp_and_csel"),
+ (('bcsel@32(is_only_used_as_float)', ignore_exact('fge', 'a@32', 'b@32'), c, d),
+ ('fadd', ('fmul', c, ('sge', a, b)), ('fsub', d, ('fmul', d, ('sge', a, b)))),
+ "!options->has_fused_comp_and_csel"),
+]
with open(args.output, 'w') as f:
- f.write('#include "r300_vs.h"')
+ f.write('#include "compiler/r300_nir.h"')
f.write(nir_algebraic.AlgebraicPass("r300_transform_vs_trig_input",
transform_trig_input_vs_r500).render())
@@ -61,6 +128,17 @@ def main():
f.write(nir_algebraic.AlgebraicPass("r300_transform_fs_trig_input",
transform_trig_input_fs_r500).render())
+ f.write(nir_algebraic.AlgebraicPass("r300_nir_fuse_fround_d3d9",
+ r300_nir_fuse_fround_d3d9).render())
+
+ f.write(nir_algebraic.AlgebraicPass("r300_nir_lower_bool_to_float",
+ r300_nir_lower_bool_to_float).render())
+
+ f.write(nir_algebraic.AlgebraicPass("r300_nir_prepare_presubtract",
+ r300_nir_prepare_presubtract).render())
+
+ f.write(nir_algebraic.AlgebraicPass("r300_nir_clean_double_fneg",
+ r300_nir_clean_double_fneg).render())
if __name__ == '__main__':
main()
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c
index 9f058e781..676809152 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_fragprog.c
@@ -30,6 +30,8 @@
#include "radeon_program_tex.h"
#include "radeon_rename_regs.h"
#include "radeon_remove_constants.h"
+#include "radeon_variable.h"
+#include "radeon_list.h"
#include "r300_fragprog.h"
#include "r300_fragprog_swizzle.h"
#include "r500_fragprog.h"
@@ -65,6 +67,48 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
}
}
+/**
+ * This function will try to convert rgb instructions into alpha instructions
+ * and vice versa. While this is already attempted during the pair scheduling,
+ * it is much simpler to do it before pair conversion, so do it here at least for
+ * the simple cases.
+ *
+ * Currently only math opcodes writing to rgb (and with no friends) are
+ * converted to alpha.
+ *
+ * This function assumes all the instructions are still of type
+ * RC_INSTRUCTION_NORMAL, the conversion is much simpler.
+ *
+ * Beware that this needs to be also called before doing presubtract, because
+ * rc_get_variables can't get properly readers for normal instructions if presubtract
+ * is present (it works fine for pair instructions).
+ */
+static void rc_convert_rgb_alpha(struct radeon_compiler *c, void *user)
+{
+ struct rc_list * variables;
+ struct rc_list * var_ptr;
+
+ variables = rc_get_variables(c);
+
+ for (var_ptr = variables; var_ptr; var_ptr = var_ptr->Next) {
+ struct rc_variable * var = var_ptr->Item;
+
+ /* Only variables written to a temporary register are considered. */
+ if (var->Inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
+ continue;
+ }
+
+ /* Only rewrite scalar opcodes that are used separately for now. */
+ if (var->Friend)
+ continue;
+
+ const struct rc_opcode_info * opcode = rc_get_opcode_info(var->Inst->U.I.Opcode);
+ /* Move the result to the W channel of a free temporary. */
+ if (opcode->IsStandardScalar && var->Dst.WriteMask != RC_MASK_W) {
+ unsigned index = rc_find_free_temporary(c);
+ rc_variable_change_dst(var, index, RC_MASK_W);
+ }
+ }
+}
+
+
void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
{
int is_r500 = c->Base.is_r500;
@@ -85,14 +129,12 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
struct radeon_program_transformation native_rewrite_r500[] = {
{ &radeonTransformALU, NULL },
{ &radeonTransformDeriv, NULL },
- { &radeonTransformTrigScale, NULL },
{ NULL, NULL }
};
struct radeon_program_transformation native_rewrite_r300[] = {
{ &radeonTransformALU, NULL },
{ &radeonStubDeriv, NULL },
- { &r300_transform_trig_simple, NULL },
{ NULL, NULL }
};
@@ -106,6 +148,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
{"native rewrite", 1, is_r500, rc_local_transform, native_rewrite_r500},
{"native rewrite", 1, !is_r500, rc_local_transform, native_rewrite_r300},
{"deadcode", 1, opt, rc_dataflow_deadcode, NULL},
+ {"convert rgb<->alpha", 1, opt, rc_convert_rgb_alpha, NULL},
{"register rename", 1, !is_r500 || opt, rc_rename_regs, NULL},
{"dataflow optimize", 1, opt, rc_optimize, NULL},
{"inline literals", 1, is_r500 && opt, rc_inline_literals, NULL},
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
index f322785ab..a02147a82 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
@@ -237,6 +237,36 @@ static void ei_math1(struct r300_vertex_program_code *vp,
inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}
+/* Encodes an RC CMP instruction as a VE_COND_MUX_GTE vector operation.
+ *
+ * inst receives four words: the opcode/destination word followed by three
+ * source words.  Note that vpi may be modified (see the index fix-up below).
+ */
+static void ei_cmp(struct r300_vertex_program_code *vp,
+ struct rc_sub_instruction *vpi,
+ unsigned int * inst)
+{
+ inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
+ 0,
+ 0,
+ t_dst_index(vp, &vpi->DstReg),
+ t_dst_mask(vpi->DstReg.WriteMask),
+ t_dst_class(vpi->DstReg.File),
+ vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
+
+ /* Arguments with constant swizzles still count as a unique
+ * temporary, so we should make sure these arguments share a
+ * register index with one of the other arguments. */
+ for (unsigned i = 0; i < 3; i++) {
+ unsigned j = (i + 1) % 3;
+ if (vpi->SrcReg[i].File == RC_FILE_NONE &&
+ (vpi->SrcReg[j].File == RC_FILE_NONE ||
+ vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
+ vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
+ break;
+ }
+ }
+
+ /* Sources 1 and 2 are emitted in swapped order.
+ * NOTE(review): presumably the hardware operand order differs from RC's
+ * CMP - confirm.
+ */
+ inst[1] = t_src(vp, &vpi->SrcReg[0]);
+ inst[2] = t_src(vp, &vpi->SrcReg[2]);
+ inst[3] = t_src(vp, &vpi->SrcReg[1]);
+}
+
+
static void ei_lit(struct r300_vertex_program_code *vp,
struct rc_sub_instruction *vpi,
unsigned int * inst)
@@ -414,6 +444,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
+ case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
@@ -471,11 +502,15 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
"Too many flow control instructions.");
return;
}
+ /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
+ * we reduce it to half to avoid occasional hangs on RV516
+ * and downclocked RV530.
+ */
if (compiler->Base.is_r500) {
compiler->code->fc_op_addrs.r500
[compiler->code->num_fc_ops].lw =
R500_PVS_FC_ACT_ADRS(act_addr)
- | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
+ | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080)
;
compiler->code->fc_op_addrs.r500
[compiler->code->num_fc_ops].uw =
@@ -805,18 +840,12 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
int opt = !c->Base.disable_optimizations;
/* Lists of instruction transformations. */
- struct radeon_program_transformation alu_rewrite_r500[] = {
- { &r300_transform_vertex_alu, NULL },
- { NULL, NULL }
- };
-
- struct radeon_program_transformation alu_rewrite_r300[] = {
+ struct radeon_program_transformation alu_rewrite[] = {
{ &r300_transform_vertex_alu, NULL },
- { &r300_transform_trig_simple, NULL },
{ NULL, NULL }
};
- /* Note: These passes have to be done seperately from ALU rewrite,
+ /* Note: These passes have to be done separately from ALU rewrite,
* otherwise non-native ALU instructions with source conflits
* or non-native modifiers will not be treated properly.
*/
@@ -834,8 +863,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
struct radeon_compiler_pass vs_list[] = {
/* NAME DUMP PREDICATE FUNCTION PARAM */
{"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
- {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
- {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300},
+ {"native rewrite", 1, 1, rc_local_transform, alu_rewrite},
{"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers},
{"deadcode", 1, opt, rc_dataflow_deadcode, NULL},
{"dataflow optimize", 1, opt, rc_optimize, NULL},
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c b/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c
index 258b873d8..28c05ada6 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c
@@ -281,8 +281,10 @@ static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair
code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex);
code->inst[ip].inst5 |= R500_ALU_RGBA_ADDRD(inst->RGB.DestIndex);
- use_temporary(code, inst->Alpha.DestIndex);
- use_temporary(code, inst->RGB.DestIndex);
+ if (inst->Alpha.WriteMask)
+ use_temporary(code, inst->Alpha.DestIndex);
+ if (inst->RGB.WriteMask)
+ use_temporary(code, inst->RGB.DestIndex);
if (inst->RGB.Saturate)
code->inst[ip].inst0 |= R500_INST_RGB_CLAMP;
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c
index ab36513e0..fddc23702 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.c
@@ -357,17 +357,24 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
{
struct rc_instruction * tmp;
memset(s, 0, sizeof(*s));
+ unsigned ip = 0;
+ int last_begintex = -1;
for(tmp = c->Program.Instructions.Next; tmp != &c->Program.Instructions;
- tmp = tmp->Next){
+ tmp = tmp->Next, ip++){
const struct rc_opcode_info * info;
rc_for_all_reads_mask(tmp, reg_count_callback, s);
if (tmp->Type == RC_INSTRUCTION_NORMAL) {
info = rc_get_opcode_info(tmp->U.I.Opcode);
- if (info->Opcode == RC_OPCODE_BEGIN_TEX)
+ if (info->Opcode == RC_OPCODE_BEGIN_TEX) {
+ /* The R5xx docs mention ~30 cycles in section 8.3.1 */
+ s->num_cycles += 30;
+ last_begintex = ip;
continue;
- if (tmp->U.I.PreSub.Opcode != RC_PRESUB_NONE)
- s->num_presub_ops++;
+ }
+ if (info->Opcode == RC_OPCODE_MAD &&
+ rc_inst_has_three_diff_temp_srcs(tmp))
+ s->num_cycles++;
} else {
if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used)
s->num_presub_ops++;
@@ -387,6 +394,15 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
tmp->U.P.Alpha.Omod != RC_OMOD_DISABLE) {
s->num_omod_ops++;
}
+ if (tmp->U.P.Nop)
+ s->num_cycles++;
+ /* SemWait has effect only on R500, the more instructions we can put
+ * between the tex block and the first texture semaphore, the better.
+ */
+ if (tmp->U.P.SemWait && c->is_r500 && last_begintex != -1) {
+ s->num_cycles -= MIN2(30, ip - last_begintex);
+ last_begintex = -1;
+ }
info = rc_get_opcode_info(tmp->U.P.RGB.Opcode);
}
if (info->IsFlowControl) {
@@ -402,6 +418,7 @@ void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
if (info->HasTexture)
s->num_tex_insts++;
s->num_insts++;
+ s->num_cycles++;
}
/* Increment here because the reg_count_callback store the max
* temporary reg index in s->nun_temp_regs. */
@@ -415,14 +432,17 @@ static void print_stats(struct radeon_compiler * c)
rc_get_stats(c, &s);
/* Note that we print some dummy values for instruction categories that
- * only the FS has, becasue shader-db's report.py wants all shaders to
+ * only the FS has, because shader-db's report.py wants all shaders to
* have the same set.
*/
- util_debug_message(c->debug, SHADER_INFO, "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol, %u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits",
+ util_debug_message(c->debug, SHADER_INFO,
+ "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol,"
+ "%u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits, %u cycles",
c->type == RC_VERTEX_PROGRAM ? "VS" : "FS",
s.num_insts, s.num_rgb_insts, s.num_alpha_insts, s.num_pred_insts,
s.num_fc_insts, s.num_loops, s.num_tex_insts, s.num_presub_ops,
- s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals);
+ s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals,
+ s.num_cycles);
}
static const char *shader_name[RC_NUM_PROGRAM_TYPES] = {
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h
index 100f43423..0e4321fae 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler.h
@@ -56,7 +56,6 @@ struct radeon_compiler {
unsigned has_presub:1;
unsigned has_omod:1;
unsigned disable_optimizations:1;
- unsigned needs_trig_input_transform:1;
unsigned max_temp_regs;
unsigned max_constants;
int max_alu_insts;
@@ -148,6 +147,7 @@ struct radeon_compiler_pass {
};
struct rc_program_stats {
+ unsigned num_cycles;
unsigned num_consts;
unsigned num_insts;
unsigned num_fc_insts;
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c
index 2a2542a47..17cb498b1 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.c
@@ -758,3 +758,13 @@ unsigned int rc_get_scalar_src_swz(unsigned int swizzle)
assert(swz != RC_SWIZZLE_UNUSED);
return swz;
}
+
+/**
+ * @return true when all three sources of \p inst are read from the
+ * temporary register file and no two of them share a register index.
+ */
+bool rc_inst_has_three_diff_temp_srcs(struct rc_instruction *inst)
+{
+	const struct rc_src_register *src = inst->U.I.SrcReg;
+	unsigned int i;
+
+	/* Every source has to be a temporary... */
+	for (i = 0; i < 3; i++) {
+		if (src[i].File != RC_FILE_TEMPORARY)
+			return false;
+	}
+
+	/* ...and the three indices have to be pairwise different. */
+	if (src[0].Index == src[1].Index)
+		return false;
+	if (src[1].Index == src[2].Index)
+		return false;
+	return src[0].Index != src[2].Index;
+}
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h
index 7c1d6bbc9..c16f768e8 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_compiler_util.h
@@ -30,6 +30,8 @@
#ifndef RADEON_PROGRAM_UTIL_H
#define RADEON_PROGRAM_UTIL_H
+#include <stdbool.h>
+
#include "radeon_opcodes.h"
struct radeon_compiler;
@@ -126,4 +128,5 @@ float rc_get_constant_value(
unsigned int rc_get_scalar_src_swz(unsigned int swizzle);
+bool rc_inst_has_three_diff_temp_srcs(struct rc_instruction *inst);
#endif /* RADEON_PROGRAM_UTIL_H */
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h
index 0c7bf8adf..09e0a9608 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_dataflow.h
@@ -99,7 +99,7 @@ struct rc_reader_data {
unsigned int ReadersReserved;
struct rc_reader * Readers;
- /* If this flag is enabled, rc_get_readers will exit as soon possbile
+ /* If this flag is enabled, rc_get_readers will exit as soon possible
* after the Abort flag is set.*/
unsigned int ExitOnAbort;
void * CbData;
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c
index 7ca4cdfef..1458d03aa 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.c
@@ -61,13 +61,6 @@ const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
.HasDstReg = 1
},
{
- .Opcode = RC_OPCODE_CEIL,
- .Name = "CEIL",
- .NumSrcRegs = 1,
- .HasDstReg = 1,
- .IsComponentwise = 1
- },
- {
.Opcode = RC_OPCODE_CMP,
.Name = "CMP",
.NumSrcRegs = 3,
@@ -140,13 +133,6 @@ const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
.HasDstReg = 1
},
{
- .Opcode = RC_OPCODE_FLR,
- .Name = "FLR",
- .NumSrcRegs = 1,
- .HasDstReg = 1,
- .IsComponentwise = 1
- },
- {
.Opcode = RC_OPCODE_FRC,
.Name = "FRC",
.NumSrcRegs = 1,
@@ -297,13 +283,6 @@ const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
.IsComponentwise = 1
},
{
- .Opcode = RC_OPCODE_SSG,
- .Name = "SSG",
- .NumSrcRegs = 1,
- .HasDstReg = 1,
- .IsComponentwise = 1
- },
- {
.Opcode = RC_OPCODE_SUB,
.Name = "SUB",
.NumSrcRegs = 2,
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h
index acce9f527..88d6f212b 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_opcodes.h
@@ -48,9 +48,6 @@ typedef enum {
* dst.x = round(src.x), where dst must be an address register */
RC_OPCODE_ARR,
- /** vec4 instruction: dst.c = ceil(src0.c) */
- RC_OPCODE_CEIL,
-
/** vec4 instruction: dst.c = src0.c < 0.0 ? src1.c : src2.c */
RC_OPCODE_CMP,
@@ -86,9 +83,6 @@ typedef enum {
/** special instruction, see ARB_vertex_program */
RC_OPCODE_EXP,
- /** vec4 instruction: dst.c = floor(src0.c) */
- RC_OPCODE_FLR,
-
/** vec4 instruction: dst.c = src0.c - floor(src0.c) */
RC_OPCODE_FRC,
@@ -155,9 +149,6 @@ typedef enum {
/** vec4 instruction: dst.c = (src0.c != src1.c) ? 1.0 : 0.0 */
RC_OPCODE_SNE,
- /** vec4 instruction: dst.c = (src0.c < 0 ?) -1 : ((src0.c > 0) : 1 : 0) */
- RC_OPCODE_SSG,
-
/** vec4 instruction: dst.c = src0.c - src1.c */
RC_OPCODE_SUB,
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c
index 02a937c69..fc475f135 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_optimize.c
@@ -242,82 +242,6 @@ static int is_src_uniform_constant(struct rc_src_register src,
return 1;
}
-static void constant_folding_mad(struct rc_instruction * inst)
-{
- rc_swizzle swz = 0;
- unsigned int negate= 0;
-
- if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
- if (swz == RC_SWIZZLE_ZERO) {
- inst->U.I.Opcode = RC_OPCODE_MUL;
- return;
- }
- }
-
- if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
- if (swz == RC_SWIZZLE_ONE) {
- inst->U.I.Opcode = RC_OPCODE_ADD;
- if (negate)
- inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
- inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
- return;
- } else if (swz == RC_SWIZZLE_ZERO) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
- return;
- }
- }
-
- if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
- if (swz == RC_SWIZZLE_ONE) {
- inst->U.I.Opcode = RC_OPCODE_ADD;
- if (negate)
- inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
- inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
- return;
- } else if (swz == RC_SWIZZLE_ZERO) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
- return;
- }
- }
-}
-
-static void constant_folding_mul(struct rc_instruction * inst)
-{
- rc_swizzle swz = 0;
- unsigned int negate = 0;
-
- if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
- if (swz == RC_SWIZZLE_ONE) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
- if (negate)
- inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
- return;
- } else if (swz == RC_SWIZZLE_ZERO) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
- inst->U.I.SrcReg[0].File = RC_FILE_NONE;
- return;
- }
- }
-
- if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
- if (swz == RC_SWIZZLE_ONE) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- if (negate)
- inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
- return;
- } else if (swz == RC_SWIZZLE_ZERO) {
- inst->U.I.Opcode = RC_OPCODE_MOV;
- inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
- inst->U.I.SrcReg[0].File = RC_FILE_NONE;
- return;
- }
- }
-}
-
static void constant_folding_add(struct rc_instruction * inst)
{
rc_swizzle swz = 0;
@@ -420,14 +344,8 @@ static void constant_folding(struct radeon_compiler * c, struct rc_instruction *
inst->U.I.SrcReg[src] = newsrc;
}
- /* Simplify instructions based on constants */
- if (inst->U.I.Opcode == RC_OPCODE_MAD)
- constant_folding_mad(inst);
-
- /* note: MAD can simplify to MUL or ADD */
- if (inst->U.I.Opcode == RC_OPCODE_MUL)
- constant_folding_mul(inst);
- else if (inst->U.I.Opcode == RC_OPCODE_ADD)
+ if (c->type == RC_FRAGMENT_PROGRAM &&
+ inst->U.I.Opcode == RC_OPCODE_ADD)
constant_folding_add(inst);
/* In case this instruction has been converted, make sure all of the
@@ -563,7 +481,7 @@ static int is_presub_candidate(
unsigned int i;
unsigned int is_constant[2] = {0, 0};
- assert(inst->U.I.Opcode == RC_OPCODE_ADD);
+ assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
|| inst->U.I.SaturateMode
@@ -572,7 +490,7 @@ static int is_presub_candidate(
return 0;
}
- /* If both sources use a constant swizzle, then we can't convert it to
+ /* If the first two sources use a constant swizzle, then we can't convert it to
* a presubtract operation. In fact for the ADD and SUB presubtract
* operations neither source can contain a constant swizzle. This
* specific case is checked in peephole_add_presub_add() when
@@ -655,10 +573,27 @@ static void presub_replace_inv(
inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
}
+/**
+ * Rewrite source \p src_index of \p inst_reader so that it reads the
+ * RC_PRESUB_BIAS presubtract result instead of the MAD destination.
+ *
+ * The presubtract operand is src0 of \p inst_mad with its negate bits
+ * cleared.
+ */
+static void presub_replace_bias(
+	struct rc_instruction * inst_mad,
+	struct rc_instruction * inst_reader,
+	unsigned int src_index)
+{
+	struct rc_src_register * reader_src = &inst_reader->U.I.SrcReg[src_index];
+	/* Work on a copy: inst_mad must stay untouched because it may
+	 * remain part of the program. */
+	struct rc_src_register presub_src = inst_mad->U.I.SrcReg[0];
+
+	presub_src.Negate = 0;
+
+	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
+	inst_reader->U.I.PreSub.SrcReg[0] = presub_src;
+
+	/* Fold the reader's own swizzle/modifiers over the presubtract
+	 * operand, then redirect the source to the presubtract unit. */
+	*reader_src = chain_srcregs(*reader_src, presub_src);
+	reader_src->File = RC_FILE_PRESUB;
+	reader_src->Index = RC_PRESUB_BIAS;
+}
+
/**
* PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
* Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
- * of the add instruction must have the constatnt 1 swizzle. This function
+ * of the add instruction must have the constant 1 swizzle. This function
* does not check const registers to see if their value is 1.0, so it should
* be called after the constant_folding optimization.
* @return
@@ -690,8 +625,6 @@ static int peephole_add_presub_inv(
if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
inst_add->U.I.DstReg.WriteMask
|| inst_add->U.I.SrcReg[1].Abs
- || (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY
- && inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT)
|| src_has_const_swz(inst_add->U.I.SrcReg[1])) {
return 0;
@@ -704,6 +637,66 @@ static int peephole_add_presub_inv(
return 0;
}
+/**
+ * PRESUB_BIAS: MAD TEMP[0], -TEMP[1], 2.0, none.1
+ * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The third source
+ * of the MAD instruction must have the constant 1 swizzle, the second source
+ * must be an immediate constant equal to 2.0 and the first source must be
+ * negated on every written channel. This function does not check const
+ * registers to see if their value is 1.0, so it should be called after the
+ * constant_folding optimization.
+ * @return
+ * 	0 if the MAD instruction is still part of the program.
+ * 	1 if the MAD instruction is no longer part of the program.
+ */
+static int peephole_mad_presub_bias(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst_mad)
+{
+	unsigned int i, swz;
+
+	if (!is_presub_candidate(c, inst_mad))
+		return 0;
+
+	/* Check if src2 is 1. */
+	for(i = 0; i < 4; i++ ) {
+		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
+			continue;
+
+		swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
+		if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
+			return 0;
+	}
+
+	/* Check if src1 is 2. */
+	struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
+	if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
+		return 0;
+	/* Only a directly addressed constant register can be looked up in the
+	 * constant table; for any other file the index means something else
+	 * and must not be used to index Constants[]. */
+	if (src1_reg.File != RC_FILE_CONSTANT || src1_reg.RelAddr)
+		return 0;
+	struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
+	if (constant->Type != RC_CONSTANT_IMMEDIATE)
+		return 0;
+	for (i = 0; i < 4; i++) {
+		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
+			continue;
+		swz = GET_SWZ(src1_reg.Swizzle, i);
+		/* Constant swizzles (0, 1, ...) do not select an immediate
+		 * component and can never be 2.0. */
+		if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
+			return 0;
+	}
+
+	/* Check src0: negated on all written channels, no abs, and a real
+	 * register read rather than a constant swizzle. */
+	if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
+						inst_mad->U.I.DstReg.WriteMask
+		|| inst_mad->U.I.SrcReg[0].Abs
+		|| src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
+
+		return 0;
+	}
+
+	if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
+		rc_remove_instruction(inst_mad);
+		return 1;
+	}
+	return 0;
+}
+
struct peephole_mul_cb_data {
struct rc_dst_register * Writer;
unsigned int Clobbered;
@@ -891,15 +884,24 @@ static int peephole_mul_omod(
*/
static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
{
- switch(inst->U.I.Opcode){
+ if (!c->has_presub)
+ return 0;
+
+ switch(inst->U.I.Opcode) {
case RC_OPCODE_ADD:
- if (c->has_presub) {
- if(peephole_add_presub_inv(c, inst))
- return 1;
- if(peephole_add_presub_add(c, inst))
- return 1;
- }
+ {
+ if (peephole_add_presub_inv(c, inst))
+ return 1;
+ if (peephole_add_presub_add(c, inst))
+ return 1;
+ break;
+ }
+ case RC_OPCODE_MAD:
+ {
+ if (peephole_mad_presub_bias(c, inst))
+ return 1;
break;
+ }
default:
break;
}
@@ -1331,6 +1333,126 @@ static void merge_channels(struct radeon_compiler * c, struct rc_instruction * i
}
}
+/**
+ * Searches for duplicate ARLs/ARRs
+ *
+ * Only a very trivial case is now optimized: if a later load of the same kind
+ * reads the same register with the same swizzle as \p inst, and the source was
+ * not overwritten in between, the later load is removed.
+ *
+ * \param inst   the ARL/ARR whose duplicates are searched for; never removed.
+ * \param is_ARL true if \p inst is an ARL, false if it is an ARR.
+ */
+static void merge_A0_loads(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	bool is_ARL)
+{
+	unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
+	unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
+	unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
+	int cf_depth = 0;
+	struct rc_instruction * cur;
+	struct rc_instruction * next;
+
+	/* The successor is fetched before the body runs, so removing cur
+	 * neither skips the following instruction nor requires touching the
+	 * list head, which is not a real instruction. */
+	for (cur = inst->Next; cur != &c->Program.Instructions; cur = next) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
+		next = cur->Next;
+
+		/* Keep it simple for now and stop when encountering any
+		 * control flow besides simple ifs.
+		 */
+		if (opcode->IsFlowControl) {
+			switch (cur->U.I.Opcode) {
+			case RC_OPCODE_IF:
+				cf_depth++;
+				break;
+			case RC_OPCODE_ELSE:
+				/* ELSE of the block the original load is in. */
+				if (cf_depth < 1)
+					return;
+				break;
+			case RC_OPCODE_ENDIF:
+				/* Leaving the block the original load is in:
+				 * past this point the load is no longer
+				 * guaranteed to have been executed. */
+				if (cf_depth < 1)
+					return;
+				cf_depth--;
+				break;
+			default:
+				return;
+			}
+		}
+
+		/* Stop when a component read by the original load is
+		 * overwritten. */
+		if (A0_src_reg == cur->U.I.DstReg.Index &&
+		    A0_src_file == cur->U.I.DstReg.File &&
+		    (cur->U.I.DstReg.WriteMask & rc_swizzle_to_writemask(A0_src_swizzle)))
+			return;
+
+		/* Wrong A0 load type. */
+		if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
+		    (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
+			return;
+
+		if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
+			/* A load from a different source changes A0. */
+			if (A0_src_reg != cur->U.I.SrcReg[0].Index ||
+			    A0_src_file != cur->U.I.SrcReg[0].File ||
+			    A0_src_swizzle != cur->U.I.SrcReg[0].Swizzle)
+				return;
+
+			rc_remove_instruction(cur);
+		}
+	}
+}
+
+/**
+ * Fold a ROUND that only feeds ARL instructions back into ARR.
+ *
+ * According to the GLSL spec, round only exists from 1.30 onwards, so a
+ * ROUND showing up here can only be a lowered ARR (from nine->ttn).
+ * Instead of lowering the ROUND, every ARL reading it is turned into an
+ * ARR and the ROUND itself becomes a plain MOV.
+ *
+ * Nothing is changed if any reader is not an ARL.
+ */
+static void transform_vertex_ROUND(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct rc_reader_data readers;
+	unsigned idx;
+
+	rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
+	assert(readers.ReaderCount > 0);
+
+	/* First make sure the conversion is possible for every reader. */
+	idx = 0;
+	while (idx < readers.ReaderCount) {
+		if (readers.Readers[idx].Inst->U.I.Opcode != RC_OPCODE_ARL) {
+			assert(!"Unable to convert ROUND+ARL to ARR\n");
+			return;
+		}
+		idx++;
+	}
+
+	/* Only ARL readers were found: the rounding moves into the loads. */
+	for (idx = readers.ReaderCount; idx > 0; idx--)
+		readers.Readers[idx - 1].Inst->U.I.Opcode = RC_OPCODE_ARR;
+
+	/* The ROUND degenerates to a MOV; copy propagation can clean it up
+	 * later. */
+	inst->U.I.Opcode = RC_OPCODE_MOV;
+}
+
+/**
+ * Run the optimizations specific to loads of the A0 address register over
+ * the whole program: duplicate ARL/ARR merging and ROUND+ARL -> ARR folding.
+ */
+static void optimize_A0_loads(struct radeon_compiler * c) {
+	struct rc_instruction * cur;
+	struct rc_instruction * next;
+
+	for (cur = c->Program.Instructions.Next;
+	     cur != &c->Program.Instructions;
+	     cur = next) {
+		/* The successor is read before cur is processed. */
+		next = cur->Next;
+
+		switch (cur->U.I.Opcode) {
+		case RC_OPCODE_ARL:
+			merge_A0_loads(c, cur, true);
+			break;
+		case RC_OPCODE_ARR:
+			merge_A0_loads(c, cur, false);
+			break;
+		case RC_OPCODE_ROUND:
+			transform_vertex_ROUND(c, cur);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
void rc_optimize(struct radeon_compiler * c, void *user)
{
struct rc_instruction * inst = c->Program.Instructions.Next;
@@ -1350,8 +1472,12 @@ void rc_optimize(struct radeon_compiler * c, void *user)
}
}
+ if (c->type == RC_VERTEX_PROGRAM) {
+ optimize_A0_loads(c);
+ }
+
/* Merge MOVs to same source in different channels using the constant
- * swizzles.
+ * swizzle.
*/
if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
inst = c->Program.Instructions.Next;
@@ -1376,6 +1502,10 @@ void rc_optimize(struct radeon_compiler * c, void *user)
}
}
+ if (c->type != RC_FRAGMENT_PROGRAM) {
+ return;
+ }
+
/* Presubtract operations. */
inst = c->Program.Instructions.Next;
while(inst != &c->Program.Instructions) {
@@ -1384,10 +1514,7 @@ void rc_optimize(struct radeon_compiler * c, void *user)
peephole(c, cur);
}
- if (!c->has_omod) {
- return;
- }
-
+ /* Output modifiers. */
inst = c->Program.Instructions.Next;
struct rc_list * var_list = NULL;
while(inst != &c->Program.Instructions) {
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c
index e232e93f0..428bf471c 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c
@@ -494,7 +494,7 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo
* dst_full is an rgb instruction, meaning that it has a vector instruction(rgb)
* but no scalar instruction (alpha).
* @return 0 if merging the presubtract sources fails.
- * @retrun 1 if merging the presubtract sources succeeds.
+ * @return 1 if merging the presubtract sources succeeds.
*/
static int merge_presub_sources(
struct rc_pair_instruction * dst_full,
@@ -571,7 +571,7 @@ static int merge_presub_sources(
for(arg = 0; arg < info->NumSrcRegs; arg++) {
/* If the arg does read both from rgb and alpha, then we need to rewrite
* both sources and the code currently doesn't handle this.
- * FIXME: This is definitelly solvable, however shader-db shows it is
+ * FIXME: This is definitely solvable, however shader-db shows it is
* not worth the effort.
*/
if (rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle) & RC_SOURCE_ALPHA &&
@@ -844,7 +844,7 @@ static void is_rgb_to_alpha_possible(
}
/* Make sure the source only reads the register component that we
- * are going to be convering from. It is OK if the instruction uses
+ * are going to be converting from. It is OK if the instruction uses
* this component more than once.
* XXX If the index we will be converting to is the same as the
* current index, then it is OK to read from more than one component.
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h
index 67be1b9f2..41af9815f 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program.h
@@ -42,7 +42,7 @@ struct rc_src_register {
unsigned int File:4;
/** Negative values may be used for relative addressing. */
- signed int Index:(RC_REGISTER_INDEX_BITS+1);
+ unsigned int Index:RC_REGISTER_INDEX_BITS;
unsigned int RelAddr:1;
unsigned int Swizzle:12;
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c
index c6d682b40..a56d81c62 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.c
@@ -37,6 +37,7 @@
#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
+#include "radeon_dataflow.h"
#include "util/log.h"
@@ -117,12 +118,6 @@ static const struct rc_src_register builtin_one = {
.Swizzle = RC_SWIZZLE_1111
};
-static const struct rc_src_register builtin_half = {
- .File = RC_FILE_NONE,
- .Index = 0,
- .Swizzle = RC_SWIZZLE_HHHH
-};
-
static const struct rc_src_register srcreg_undefined = {
.File = RC_FILE_NONE,
.Index = 0,
@@ -202,26 +197,6 @@ static struct rc_dst_register new_dst_reg(struct radeon_compiler *c,
return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
}
-static void transform_CEIL(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- /* Assuming:
- * ceil(x) = -floor(-x)
- *
- * After inlining floor:
- * ceil(x) = -(-x-frac(-x))
- *
- * After simplification:
- * ceil(x) = x+frac(-x)
- */
-
- struct rc_dst_register dst = new_dst_reg(c, inst);
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, negate(inst->U.I.SrcReg[0]));
- emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
- inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
- rc_remove_instruction(inst);
-}
-
static void transform_DP2(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -237,29 +212,6 @@ static void transform_DP2(struct radeon_compiler* c,
rc_remove_instruction(inst);
}
-/**
- * [1, src0.y*src1.y, src0.z, src1.w]
- * So basically MUL with lotsa swizzling.
- */
-static void transform_DST(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
- swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
- swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
- rc_remove_instruction(inst);
-}
-
-static void transform_FLR(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- struct rc_dst_register dst = new_dst_reg(c, inst);
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dst, inst->U.I.SrcReg[0]);
- emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
- inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
- rc_remove_instruction(inst);
-}
-
static void transform_TRUNC(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -296,89 +248,6 @@ static void transform_TRUNC(struct radeon_compiler* c,
rc_remove_instruction(inst);
}
-/**
- * Definition of LIT (from ARB_fragment_program):
- *
- * tmp = VectorLoad(op0);
- * if (tmp.x < 0) tmp.x = 0;
- * if (tmp.y < 0) tmp.y = 0;
- * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- * result.x = 1.0;
- * result.y = tmp.x;
- * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- * result.w = 1.0;
- *
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots, if the subsequent optimization passes are clever enough
- * to pair instructions correctly.
- */
-static void transform_LIT(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- unsigned int constant;
- unsigned int constant_swizzle;
- unsigned int temp;
- struct rc_src_register srctemp;
-
- constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
-
- if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
- struct rc_instruction * inst_mov;
-
- inst_mov = emit1(c, inst,
- RC_OPCODE_MOV, NULL, inst->U.I.DstReg,
- srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
-
- inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
- inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
- inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
- }
-
- temp = inst->U.I.DstReg.Index;
- srctemp = srcreg(RC_FILE_TEMPORARY, temp);
-
- /* tmp.x = max(0.0, Src.x); */
- /* tmp.y = max(0.0, Src.y); */
- /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
- emit2(c, inst->Prev, RC_OPCODE_MAX, NULL,
- dstregtmpmask(temp, RC_MASK_XYW),
- inst->U.I.SrcReg[0],
- swizzle(srcreg(RC_FILE_CONSTANT, constant),
- RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
- emit2(c, inst->Prev, RC_OPCODE_MIN, NULL,
- dstregtmpmask(temp, RC_MASK_Z),
- swizzle_wwww(srctemp),
- negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
-
- /* tmp.w = Pow(tmp.y, tmp.w) */
- emit1(c, inst->Prev, RC_OPCODE_LG2, NULL,
- dstregtmpmask(temp, RC_MASK_W),
- swizzle_yyyy(srctemp));
- emit2(c, inst->Prev, RC_OPCODE_MUL, NULL,
- dstregtmpmask(temp, RC_MASK_W),
- swizzle_wwww(srctemp),
- swizzle_zzzz(srctemp));
- emit1(c, inst->Prev, RC_OPCODE_EX2, NULL,
- dstregtmpmask(temp, RC_MASK_W),
- swizzle_wwww(srctemp));
-
- /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
- emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
- dstregtmpmask(temp, RC_MASK_Z),
- negate(swizzle_xxxx(srctemp)),
- swizzle_wwww(srctemp),
- builtin_zero);
-
- /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
- emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
- dstregtmpmask(temp, RC_MASK_XYW),
- swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
-
- rc_remove_instruction(inst);
-}
-
static void transform_LRP(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -394,58 +263,6 @@ static void transform_LRP(struct radeon_compiler* c,
rc_remove_instruction(inst);
}
-static void transform_POW(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- struct rc_dst_register tempdst = new_dst_reg(c, inst);
- struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
- tempdst.WriteMask = RC_MASK_W;
- tempsrc.Swizzle = RC_SWIZZLE_WWWW;
-
- emit1(c, inst->Prev, RC_OPCODE_LG2, NULL, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
- emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
- emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
-
- rc_remove_instruction(inst);
-}
-
-/* dst = ROUND(src) :
- * add = src + .5
- * frac = FRC(add)
- * dst = add - frac
- *
- * According to the GLSL spec, the implementor can decide which way to round
- * when the fraction is .5. We round down for .5.
- *
- */
-static void transform_ROUND(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- unsigned int mask = inst->U.I.DstReg.WriteMask;
- unsigned int frac_index, add_index;
- struct rc_dst_register frac_dst, add_dst;
- struct rc_src_register frac_src, add_src;
-
- /* add = src + .5 */
- add_index = rc_find_free_temporary(c);
- add_dst = dstregtmpmask(add_index, mask);
- emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, add_dst, inst->U.I.SrcReg[0],
- builtin_half);
- add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
-
-
- /* frac = FRC(add) */
- frac_index = rc_find_free_temporary(c);
- frac_dst = dstregtmpmask(frac_index, mask);
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, frac_dst, add_src);
- frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
-
- /* dst = add - frac */
- emit2(c, inst->Prev, RC_OPCODE_ADD, NULL, inst->U.I.DstReg,
- add_src, negate(frac_src));
- rc_remove_instruction(inst);
-}
-
static void transform_RSQ(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -524,44 +341,6 @@ static void transform_SNE(struct radeon_compiler* c,
rc_remove_instruction(inst);
}
-static void transform_SSG(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- /* result = sign(x)
- *
- * CMP tmp0, -x, 1, 0
- * CMP tmp1, x, 1, 0
- * ADD result, tmp0, -tmp1;
- */
- struct rc_dst_register dst0;
- unsigned tmp1;
-
- /* 0 < x */
- dst0 = new_dst_reg(c, inst);
- emit3(c, inst->Prev, RC_OPCODE_CMP, NULL,
- dst0,
- negate(inst->U.I.SrcReg[0]),
- builtin_one,
- builtin_zero);
-
- /* x < 0 */
- tmp1 = rc_find_free_temporary(c);
- emit3(c, inst->Prev, RC_OPCODE_CMP, NULL,
- dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
- inst->U.I.SrcReg[0],
- builtin_one,
- builtin_zero);
-
- /* Either both are zero, or one of them is one and the other is zero. */
- /* result = tmp0 - tmp1 */
- emit2(c, inst->Prev, RC_OPCODE_ADD, NULL,
- inst->U.I.DstReg,
- srcreg(RC_FILE_TEMPORARY, dst0.Index),
- negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
-
- rc_remove_instruction(inst);
-}
-
static void transform_SUB(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -581,7 +360,7 @@ static void transform_KILP(struct radeon_compiler * c,
* no userData necessary.
*
* Eliminates the following ALU instructions:
- * CEIL, DST, FLR, LIT, LRP, POW, SEQ, SGE, SGT, SLE, SLT, SNE, SUB
+ * LRP, SEQ, SGE, SGT, SLE, SLT, SNE, SUB
* using:
* MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
*
@@ -596,15 +375,9 @@ int radeonTransformALU(
void* unused)
{
switch(inst->U.I.Opcode) {
- case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
- case RC_OPCODE_DST: transform_DST(c, inst); return 1;
- case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
case RC_OPCODE_KILP: transform_KILP(c, inst); return 1;
- case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
- case RC_OPCODE_POW: transform_POW(c, inst); return 1;
- case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
@@ -612,7 +385,6 @@ int radeonTransformALU(
case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
- case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
default:
@@ -623,7 +395,12 @@ int radeonTransformALU(
static void transform_r300_vertex_CMP(struct radeon_compiler* c,
struct rc_instruction* inst)
{
- /* There is no decent CMP available, so let's rig one up.
+ /* R5xx has a CMP, but we can use it only if it reads from fewer than
+ * three different temps. */
+ if (c->is_r500 && !rc_inst_has_three_diff_temp_srcs(inst))
+ return;
+
+ /* There is no decent CMP available on r300, so let's rig one up.
* CMP is defined as dst = src0 < 0.0 ? src1 : src2
* The following sequence consumes zero to two temps and two extra slots
* (the second temp and the second slot is consumed by transform_LRP),
@@ -768,42 +545,6 @@ static void transform_r300_vertex_SLE(struct radeon_compiler* c,
inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
}
-static void transform_r300_vertex_SSG(struct radeon_compiler* c,
- struct rc_instruction* inst)
-{
- /* result = sign(x)
- *
- * SLT tmp0, 0, x;
- * SLT tmp1, x, 0;
- * ADD result, tmp0, -tmp1;
- */
- struct rc_dst_register dst0;
- unsigned tmp1;
-
- /* 0 < x */
- dst0 = new_dst_reg(c, inst);
- emit2(c, inst->Prev, RC_OPCODE_SLT, NULL,
- dst0,
- builtin_zero,
- inst->U.I.SrcReg[0]);
-
- /* x < 0 */
- tmp1 = rc_find_free_temporary(c);
- emit2(c, inst->Prev, RC_OPCODE_SLT, NULL,
- dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
- inst->U.I.SrcReg[0],
- builtin_zero);
-
- /* Either both are zero, or one of them is one and the other is zero. */
- /* result = tmp0 - tmp1 */
- emit2(c, inst->Prev, RC_OPCODE_ADD, NULL,
- inst->U.I.DstReg,
- srcreg(RC_FILE_TEMPORARY, dst0.Index),
- negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
-
- rc_remove_instruction(inst);
-}
-
static void transform_vertex_TRUNC(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@@ -825,11 +566,9 @@ int r300_transform_vertex_alu(
void* unused)
{
switch(inst->U.I.Opcode) {
- case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
- case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
case RC_OPCODE_SEQ:
@@ -846,7 +585,6 @@ int r300_transform_vertex_alu(
return 1;
}
return 0;
- case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
default:
@@ -854,196 +592,6 @@ int r300_transform_vertex_alu(
}
}
-static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
-{
- static const float SinCosConsts[2][4] = {
- {
- 1.273239545, /* 4/PI */
- -0.405284735, /* -4/(PI*PI) */
- 3.141592654, /* PI */
- 0.2225 /* weight */
- },
- {
- 0.75,
- 0.5,
- 0.159154943, /* 1/(2*PI) */
- 6.283185307 /* 2*PI */
- }
- };
- int i;
-
- for(i = 0; i < 2; ++i)
- constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
-}
-
-/**
- * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
- *
- * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
- * MAD tmp.x, tmp.y, |src|, tmp.x
- * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
- * MAD dest, tmp.y, weight, tmp.x
- */
-static void sin_approx(
- struct radeon_compiler* c, struct rc_instruction * inst,
- struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
-{
- unsigned int tempreg = rc_find_free_temporary(c);
-
- emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
- swizzle_xxxx(src),
- srcreg(RC_FILE_CONSTANT, constants[0]));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_X),
- swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
- absolute(swizzle_xxxx(src)),
- swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_Y),
- swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
- absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
- negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dst,
- swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
- swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
- swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
-}
-
-/**
- * Translate the trigonometric functions COS and SIN
- * using only the basic instructions
- * MOV, ADD, MUL, MAD, FRC
- */
-int r300_transform_trig_simple(struct radeon_compiler* c,
- struct rc_instruction* inst,
- void* unused)
-{
- unsigned int constants[2];
- unsigned int tempreg;
-
- if (inst->U.I.Opcode != RC_OPCODE_COS &&
- inst->U.I.Opcode != RC_OPCODE_SIN)
- return 0;
-
- tempreg = rc_find_free_temporary(c);
-
- sincos_constants(c, constants);
-
- if (inst->U.I.Opcode == RC_OPCODE_COS) {
- /* MAD tmp.x, src, 1/(2*PI), 0.75 */
- /* FRC tmp.x, tmp.x */
- /* MAD tmp.z, tmp.x, 2*PI, -PI */
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_xxxx(inst->U.I.SrcReg[0]),
- swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
- swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
- swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
- negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
-
- sin_approx(c, inst, inst->U.I.DstReg,
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
- constants);
- } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_xxxx(inst->U.I.SrcReg[0]),
- swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
- swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_W),
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
- swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
- negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
-
- sin_approx(c, inst, inst->U.I.DstReg,
- swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
- constants);
- } else {
- struct rc_dst_register dst;
-
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
- swizzle_xxxx(inst->U.I.SrcReg[0]),
- swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
- swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
- srcreg(RC_FILE_TEMPORARY, tempreg));
- emit3(c, inst->Prev, RC_OPCODE_MAD, NULL, dstregtmpmask(tempreg, RC_MASK_XY),
- srcreg(RC_FILE_TEMPORARY, tempreg),
- swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
- negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
-
- dst = inst->U.I.DstReg;
-
- dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
- sin_approx(c, inst, dst,
- swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
- constants);
-
- dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
- sin_approx(c, inst, dst,
- swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
- constants);
- }
-
- rc_remove_instruction(inst);
-
- return 1;
-}
-
-static void r300_transform_SIN_COS(struct radeon_compiler *c,
- struct rc_instruction *inst,
- unsigned srctmp)
-{
- if (inst->U.I.Opcode == RC_OPCODE_COS) {
- emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
- srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
- } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
- emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
- inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
- }
-
- rc_remove_instruction(inst);
-}
-
-
-/**
- * Transform the trigonometric functions COS and SIN
- * to include pre-scaling by 1/(2*PI) and taking the fractional
- * part, so that the input to COS and SIN is always in the range [0,1).
- *
- * @warning This transformation implicitly changes the semantics of SIN and COS!
- */
-int radeonTransformTrigScale(struct radeon_compiler* c,
- struct rc_instruction* inst,
- void* unused)
-{
- static const float RCP_2PI = 0.15915494309189535;
- unsigned int temp;
- unsigned int constant;
- unsigned int constant_swizzle;
-
- if (inst->U.I.Opcode != RC_OPCODE_COS &&
- inst->U.I.Opcode != RC_OPCODE_SIN)
- return 0;
-
- if (!c->needs_trig_input_transform)
- return 1;
-
- temp = rc_find_free_temporary(c);
- constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
-
- emit2(c, inst->Prev, RC_OPCODE_MUL, NULL, dstregtmpmask(temp, RC_MASK_W),
- swizzle_xxxx(inst->U.I.SrcReg[0]),
- srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
- emit1(c, inst->Prev, RC_OPCODE_FRC, NULL, dstregtmpmask(temp, RC_MASK_W),
- srcreg(RC_FILE_TEMPORARY, temp));
-
- r300_transform_SIN_COS(c, inst, temp);
- return 1;
-}
-
/**
* Replaces DDX/DDY instructions with MOV 0 to avoid using dummy shaders on r300/r400.
*
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h
index eb522b2ea..861a6a39d 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_alu.h
@@ -40,21 +40,6 @@ int r300_transform_vertex_alu(
struct rc_instruction * inst,
void*);
-int r300_transform_trig_simple(
- struct radeon_compiler * c,
- struct rc_instruction * inst,
- void*);
-
-int radeonTransformTrigScale(
- struct radeon_compiler * c,
- struct rc_instruction * inst,
- void*);
-
-int r300_transform_trig_scale_vertex(
- struct radeon_compiler *c,
- struct rc_instruction *inst,
- void*);
-
int radeonStubDeriv(
struct radeon_compiler * c,
struct rc_instruction * inst,
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h
index 6a8cbe333..4c12c5f2e 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_constants.h
@@ -101,7 +101,7 @@ enum {
RC_NUM_SPECIAL_REGISTERS
};
-#define RC_REGISTER_INDEX_BITS 10
+#define RC_REGISTER_INDEX_BITS 11
#define RC_REGISTER_MAX_INDEX (1 << RC_REGISTER_INDEX_BITS)
typedef enum {
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c
index 4882527bf..9995d5158 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_program_tex.c
@@ -230,7 +230,7 @@ int radeonTransformTEX(
else
inst_add->U.I.SrcReg[0].Negate = inst_add->U.I.SrcReg[0].Negate ^ RC_MASK_XYZW;
- /* This negates the whole expresion: */
+ /* This negates the whole expression: */
if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GREATER ||
comparefunc == RC_COMPARE_FUNC_NOTEQUAL) {
pass = 1;
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c
index 4c276a4c1..30e1232f5 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/radeon_variable.c
@@ -383,7 +383,7 @@ struct rc_list * rc_get_variables(struct radeon_compiler * c)
* as the src1.xyz and src1.w of the instruction where the value is used are
* in theory independent. They are not because the same register is written
* also by the texture instruction in the other branch and TEX can't write xyz
- * and w separatelly.
+ * and w separately.
*
* Therefore first search for RC_INSTRUCTION_NORMAL to create variables from
* the texture instruction and than the pair instructions will be properly
@@ -401,7 +401,19 @@ struct rc_list * rc_get_variables(struct radeon_compiler * c)
memset(&reader_data, 0, sizeof(reader_data));
rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL);
if (reader_data.ReaderCount == 0) {
- continue;
+ /* Variable is only returned if there is both writer
+ * and reader. This means dead writes will not get
+ * register allocated as a result and can overwrite random
+ * registers. Assert on dead writes instead so we can improve
+ * the DCE.
+ */
+ const struct rc_opcode_info *opcode =
+ rc_get_opcode_info(inst->U.I.Opcode);
+ assert(c->type == RC_FRAGMENT_PROGRAM ||
+ !opcode->HasDstReg ||
+ inst->U.I.DstReg.File == RC_FILE_OUTPUT ||
+ inst->U.I.DstReg.File == RC_FILE_ADDRESS);
+ continue;
}
new_var = rc_variable(c, inst->U.I.DstReg.File,
inst->U.I.DstReg.Index,
diff --git a/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c b/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c
index 008bf5d31..0c85579ca 100644
--- a/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c
+++ b/lib/mesa/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c
@@ -513,6 +513,7 @@ void init_compiler(
rc_init_regalloc_state(rs, program_type);
rc_init(c, rs);
+ c->type = program_type;
c->is_r500 = is_r500;
c->max_temp_regs = is_r500 ? 128 : (is_r400 ? 64 : 32);
c->max_constants = is_r500 ? 256 : 32;