diff options
Diffstat (limited to 'lib/mesa/src/broadcom/compiler/nir_to_vir.c')
-rw-r--r-- | lib/mesa/src/broadcom/compiler/nir_to_vir.c | 2054 |
1 files changed, 2054 insertions, 0 deletions
diff --git a/lib/mesa/src/broadcom/compiler/nir_to_vir.c b/lib/mesa/src/broadcom/compiler/nir_to_vir.c new file mode 100644 index 000000000..3b032b704 --- /dev/null +++ b/lib/mesa/src/broadcom/compiler/nir_to_vir.c @@ -0,0 +1,2054 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/ralloc.h" +#include "util/hash_table.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "v3d_compiler.h" + +/* We don't do any address packing. */ +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#include "cle/v3d_packet_v33_pack.h" + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i); +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +resize_qreg_array(struct v3d_compile *c, + struct qreg **regs, + uint32_t *size, + uint32_t decl_size) +{ + if (*size >= decl_size) + return; + + uint32_t old_size = *size; + *size = MAX2(*size * 2, decl_size); + *regs = reralloc(c, *regs, struct qreg, *size); + if (!*regs) { + fprintf(stderr, "Malloc failure\n"); + abort(); + } + + for (uint32_t i = old_size; i < *size; i++) + (*regs)[i] = c->undef; +} + +static struct qreg +vir_SFU(struct v3d_compile *c, int waddr, struct qreg src) +{ + vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src); + return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +vir_LDTMU(struct v3d_compile *c) +{ + vir_NOP(c)->qpu.sig.ldtmu = true; + return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr) +{ + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = nir_intrinsic_base(intr); + struct v3d_ubo_range *range = NULL; + unsigned i; + + for (i = 0; i < c->num_ubo_ranges; i++) { + range = &c->ubo_ranges[i]; + if (offset >= range->src_offset && + offset < range->src_offset + range->size) { + break; + } + } + /* The driver-location-based offset always has to be within a declared + * uniform range. + */ + assert(i != c->num_ubo_ranges); + if (!c->ubo_range_used[i]) { + c->ubo_range_used[i] = true; + range->dst_offset = c->next_ubo_dst_offset; + c->next_ubo_dst_offset += range->size; + } + + offset -= range->src_offset; + + if (range->dst_offset + offset != 0) { + indirect_offset = vir_ADD(c, indirect_offset, + vir_uniform_ui(c, range->dst_offset + + offset)); + } + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 0), + indirect_offset); + + return vir_LDTMU(c); +} + +static struct qreg * +ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); + return qregs; +} + +/** + * This function is responsible for getting VIR results into the associated + * storage for a NIR instruction. + * + * If it's a NIR SSA def, then we just set the associated hash table entry to + * the new result. + * + * If it's a NIR reg, then we need to update the existing qreg assigned to the + * NIR destination with the incoming value. To do that without introducing + * new MOVs, we require that the incoming qreg either be a uniform, or be + * SSA-defined by the previous VIR instruction in the block and rewritable by + * this function. That lets us sneak ahead and insert the SF flag beforehand + * (knowing that the previous instruction doesn't depend on flags) and rewrite + * its destination to be the NIR reg's destination + */ +static void +ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, + struct qreg result) +{ + struct qinst *last_inst = NULL; + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + assert(result.file == QFILE_UNIF || + (result.file == QFILE_TEMP && + last_inst && last_inst == c->defs[result.index])); + + if (dest->is_ssa) { + assert(chan < dest->ssa.num_components); + + struct qreg *qregs; + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, &dest->ssa); + + if (entry) + qregs = entry->data; + else + qregs = ntq_init_ssa_def(c, &dest->ssa); + + qregs[chan] = result; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + struct qreg *qregs = entry->data; + + /* Insert a MOV if the source wasn't an SSA def in the + * previous instruction. + */ + if (result.file == QFILE_UNIF) { + result = vir_MOV(c, result); + last_inst = c->defs[result.index]; + } + + /* We know they're both temps, so just rewrite index. */ + c->defs[last_inst->dst.index] = NULL; + last_inst->dst.index = qregs[chan].index; + + /* If we're in control flow, then make this update of the reg + * conditional on the execution mask. + */ + if (c->execute.file != QFILE_NULL) { + last_inst->dst.index = qregs[chan].index; + + /* Set the flags to the current exec mask. To insert + * the flags push, we temporarily remove our SSA + * instruction. + */ + list_del(&last_inst->link); + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + list_addtail(&last_inst->link, + &c->cur_block->instructions); + + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + last_inst->cond_is_exec_mask = true; + } + } +} + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i) +{ + struct hash_entry *entry; + if (src.is_ssa) { + entry = _mesa_hash_table_search(c->def_ht, src.ssa); + assert(i < src.ssa->num_components); + } else { + nir_register *reg = src.reg.reg; + entry = _mesa_hash_table_search(c->def_ht, reg); + assert(reg->num_array_elems == 0); + assert(src.reg.base_offset == 0); + assert(i < reg->num_components); + } + + struct qreg *qregs = entry->data; + return qregs[i]; +} + +static struct qreg +ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, + unsigned src) +{ + assert(util_is_power_of_two(instr->dest.write_mask)); + unsigned chan = ffs(instr->dest.write_mask) - 1; + struct qreg r = ntq_get_src(c, instr->src[src].src, + instr->src[src].swizzle[chan]); + + assert(!instr->src[src].abs); + assert(!instr->src[src].negate); + + return r; +}; + +static inline struct qreg +vir_SAT(struct v3d_compile *c, struct qreg val) +{ + return vir_FMAX(c, + vir_FMIN(c, val, vir_uniform_f(c, 1.0)), + vir_uniform_f(c, 0.0)); +} + +static struct qreg +ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1) +{ + vir_MULTOP(c, src0, src1); + return vir_UMUL24(c, src0, src1); +} + +static struct qreg +ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) +{ + return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); +} + +static void +ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); + int dest_size = nir_tex_instr_dest_size(instr); + + struct qreg lod = c->undef; + if (lod_index != -1) + lod = ntq_get_src(c, instr->src[lod_index].src, 0); + + for (int i = 0; i < dest_size; i++) { + assert(i < 3); + enum quniform_contents contents; + + if (instr->is_array && i == dest_size - 1) + contents = QUNIFORM_TEXTURE_ARRAY_SIZE; + else + contents = QUNIFORM_TEXTURE_WIDTH + i; + + struct qreg size = vir_uniform(c, contents, unit); + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + /* Don't minify the array size. */ + if (!(instr->is_array && i == dest_size - 1)) { + size = ntq_minify(c, size, lod); + } + break; + + case GLSL_SAMPLER_DIM_RECT: + /* There's no LOD field for rects */ + break; + + default: + unreachable("Bad sampler type"); + } + + ntq_store_dest(c, &instr->dest, i, size); + } +} + +static void +ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + + /* Since each texture sampling op requires uploading uniforms to + * reference the texture, there's no HW support for texture size and + * you just upload uniforms containing the size. + */ + switch (instr->op) { + case nir_texop_query_levels: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + return; + case nir_texop_txs: + ntq_emit_txs(c, instr); + return; + default: + break; + } + + struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, + + .fetch_sample_mode = instr->op == nir_texop_txf, + }; + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_1D; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_2D; + break; + case GLSL_SAMPLER_DIM_3D: + p0_unpacked.lookup_type = TEXTURE_3D; + break; + case GLSL_SAMPLER_DIM_CUBE: + p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; + break; + default: + unreachable("Bad sampler type"); + } + + struct qreg coords[5]; + int next_coord = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + for (int j = 0; j < instr->coord_components; j++) { + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, j); + } + if (instr->coord_components < 2) + coords[next_coord++] = vir_uniform_f(c, 0.5); + break; + case nir_tex_src_bias: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.bias_supplied = true; + break; + case nir_tex_src_lod: + /* XXX: Needs base level addition */ + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + if (instr->op != nir_texop_txf && + instr->op != nir_texop_tg4) { + p0_unpacked.disable_autolod_use_bias_only = true; + } + break; + case nir_tex_src_comparator: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.shadow = true; + break; + + case nir_tex_src_offset: { + nir_const_value *offset = + nir_src_as_const_value(instr->src[i].src); + p0_unpacked.texel_offset_for_s_coordinate = + offset->i32[0]; + + if (instr->coord_components >= 2) + p0_unpacked.texel_offset_for_t_coordinate = + offset->i32[1]; + + if (instr->coord_components >= 3) + p0_unpacked.texel_offset_for_r_coordinate = + offset->i32[2]; + break; + } + + default: + unreachable("unknown texture source"); + } + } + + uint32_t p0_packed; + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, + (uint8_t *)&p0_packed, + &p0_unpacked); + + /* There is no native support for GL texture rectangle coordinates, so + * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, + * 1]). + */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + coords[0] = vir_FMUL(c, coords[0], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, + unit)); + coords[1] = vir_FMUL(c, coords[1], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, + unit)); + } + + struct qreg texture_u[] = { + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), + }; + uint32_t next_texture_u = 0; + + for (int i = 0; i < next_coord; i++) { + struct qreg dst; + + if (i == next_coord - 1) + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); + else + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); + + struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); + + if (i < 2) { + tmu->has_implicit_uniform = true; + tmu->src[vir_get_implicit_uniform_src(tmu)] = + texture_u[next_texture_u++]; + } + } + + bool return_16 = (c->key->tex[unit].return_size == 16 || + p0_unpacked.shadow); + + struct qreg return_values[4]; + for (int i = 0; i < c->key->tex[unit].return_channels; i++) + return_values[i] = vir_LDTMU(c); + /* Swizzling .zw of an RG texture should give undefined results, not + * crash the compiler. + */ + for (int i = c->key->tex[unit].return_channels; i < 4; i++) + return_values[i] = c->undef; + + for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) { + struct qreg chan; + + if (return_16) { + STATIC_ASSERT(PIPE_SWIZZLE_X == 0); + chan = return_values[i / 2]; + + enum v3d_qpu_input_unpack unpack; + if (i & 1) + unpack = V3D_QPU_UNPACK_H; + else + unpack = V3D_QPU_UNPACK_L; + + chan = vir_FMOV(c, chan); + vir_set_unpack(c->defs[chan.index], 0, unpack); + } else { + chan = vir_MOV(c, return_values[i]); + } + ntq_store_dest(c, &instr->dest, i, chan); + } +} + +static struct qreg +ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) +{ + struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); + if (is_cos) + input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); + + struct qreg periods = vir_FROUND(c, input); + struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN, + vir_FSUB(c, input, periods)); + return vir_XOR(c, sin_output, vir_SHL(c, + vir_FTOIN(c, periods), + vir_uniform_ui(c, -1))); +} + +static struct qreg +ntq_fsign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); + return vir_MOV(c, t); +} + +static struct qreg +ntq_isign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_ui(c, 0)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1)); + return vir_MOV(c, t); +} + +static void +emit_fragcoord_input(struct v3d_compile *c, int attr) +{ + c->inputs[attr * 4 + 0] = vir_FXCD(c); + c->inputs[attr * 4 + 1] = vir_FYCD(c); + c->inputs[attr * 4 + 2] = c->payload_z; + c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->payload_w); +} + +static struct qreg +emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + uint8_t swizzle) +{ + struct qreg vary = vir_reg(QFILE_VARY, ~0); + struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + + /* For gl_PointCoord input or distance along a line, we'll be called + * with no nir_variable, and we don't count toward VPM size so we + * don't track an input slot. + */ + if (!var) { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + + int i = c->num_inputs++; + c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location, + swizzle); + + switch (var->data.interpolation) { + case INTERP_MODE_NONE: + /* If a gl_FrontColor or gl_BackColor input has no interp + * qualifier, then flag it for glShadeModel() handling by the + * driver. + */ + switch (var->data.location) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + BITSET_SET(c->shade_model_flags, i); + break; + default: + break; + } + /* FALLTHROUGH */ + case INTERP_MODE_SMOOTH: + if (var->data.centroid) { + return vir_FADD(c, vir_FMUL(c, vary, + c->payload_w_centroid), r5); + } else { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + case INTERP_MODE_NOPERSPECTIVE: + /* C appears after the mov from the varying. + XXX: improve ldvary setup. + */ + return vir_FADD(c, vir_MOV(c, vary), r5); + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); + vir_MOV_dest(c, c->undef, vary); + return vir_MOV(c, r5); + default: + unreachable("Bad interp mode"); + } +} + +static void +emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var) +{ + for (int i = 0; i < glsl_get_vector_elements(var->type); i++) { + c->inputs[attr * 4 + i] = + emit_fragment_varying(c, var, i); + } +} + +static void +add_output(struct v3d_compile *c, + uint32_t decl_offset, + uint8_t slot, + uint8_t swizzle) +{ + uint32_t old_array_size = c->outputs_array_size; + resize_qreg_array(c, &c->outputs, &c->outputs_array_size, + decl_offset + 1); + + if (old_array_size != c->outputs_array_size) { + c->output_slots = reralloc(c, + c->output_slots, + struct v3d_varying_slot, + c->outputs_array_size); + } + + c->output_slots[decl_offset] = + v3d_slot_from_slot_and_component(slot, swizzle); +} + +static void +declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) +{ + unsigned array_id = c->num_ubo_ranges++; + if (array_id >= c->ubo_ranges_array_size) { + c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, + array_id + 1); + c->ubo_ranges = reralloc(c, c->ubo_ranges, + struct v3d_ubo_range, + c->ubo_ranges_array_size); + c->ubo_range_used = reralloc(c, c->ubo_range_used, + bool, + c->ubo_ranges_array_size); + } + + c->ubo_ranges[array_id].dst_offset = 0; + c->ubo_ranges[array_id].src_offset = start; + c->ubo_ranges[array_id].size = size; + c->ubo_range_used[array_id] = false; +} + +/** + * If compare_instr is a valid comparison instruction, emits the + * compare_instr's comparison and returns the sel_instr's return value based + * on the compare_instr's result. + */ +static bool +ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest, + nir_alu_instr *compare_instr, + nir_alu_instr *sel_instr) +{ + struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); + struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); + bool cond_invert = false; + + switch (compare_instr->op) { + case nir_op_feq: + case nir_op_seq: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + case nir_op_ieq: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + + case nir_op_fne: + case nir_op_sne: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + case nir_op_ine: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + + case nir_op_fge: + case nir_op_sge: + vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ige: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + case nir_op_uge: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + + case nir_op_slt: + case nir_op_flt: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN); + break; + case nir_op_ilt: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ult: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + break; + + default: + return false; + } + + enum v3d_qpu_cond cond = (cond_invert ? + V3D_QPU_COND_IFNA : + V3D_QPU_COND_IFA); + + switch (sel_instr->op) { + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + *dest = vir_SEL(c, cond, + vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0)); + break; + + case nir_op_bcsel: + *dest = vir_SEL(c, cond, + ntq_get_alu_src(c, sel_instr, 1), + ntq_get_alu_src(c, sel_instr, 2)); + break; + + default: + *dest = vir_SEL(c, cond, + vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0)); + break; + } + + /* Make the temporary for nir_store_dest(). */ + *dest = vir_MOV(c, *dest); + + return true; +} + +/** + * Attempts to fold a comparison generating a boolean result into the + * condition code for selecting between two values, instead of comparing the + * boolean result against 0 to generate the condition code. + */ +static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr, + struct qreg *src) +{ + if (!instr->src[0].src.is_ssa) + goto out; + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + goto out; + nir_alu_instr *compare = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + if (!compare) + goto out; + + struct qreg dest; + if (ntq_emit_comparison(c, &dest, compare, instr)) + return dest; + +out: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2])); +} + + +static void +ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) +{ + /* This should always be lowered to ALU operations for V3D. */ + assert(!instr->dest.saturate); + + /* Vectors are special in that they have non-scalarized writemasks, + * and just take the first swizzle channel for each argument in order + * into each writemask channel. + */ + if (instr->op == nir_op_vec2 || + instr->op == nir_op_vec3 || + instr->op == nir_op_vec4) { + struct qreg srcs[4]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + srcs[i] = ntq_get_src(c, instr->src[i].src, + instr->src[i].swizzle[0]); + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + ntq_store_dest(c, &instr->dest.dest, i, + vir_MOV(c, srcs[i])); + return; + } + + /* General case: We can just grab the one used channel per src. */ + struct qreg src[nir_op_infos[instr->op].num_inputs]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + src[i] = ntq_get_alu_src(c, instr, i); + } + + struct qreg result; + + switch (instr->op) { + case nir_op_fmov: + case nir_op_imov: + result = vir_MOV(c, src[0]); + break; + case nir_op_fmul: + result = vir_FMUL(c, src[0], src[1]); + break; + case nir_op_fadd: + result = vir_FADD(c, src[0], src[1]); + break; + case nir_op_fsub: + result = vir_FSUB(c, src[0], src[1]); + break; + case nir_op_fmin: + result = vir_FMIN(c, src[0], src[1]); + break; + case nir_op_fmax: + result = vir_FMAX(c, src[0], src[1]); + break; + + case nir_op_f2i32: + result = vir_FTOIZ(c, src[0]); + break; + case nir_op_f2u32: + result = vir_FTOUZ(c, src[0]); + break; + case nir_op_i2f32: + result = vir_ITOF(c, src[0]); + break; + case nir_op_u2f32: + result = vir_UTOF(c, src[0]); + break; + case nir_op_b2f: + result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); + break; + case nir_op_b2i: + result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); + break; + case nir_op_i2b: + case nir_op_f2b: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + vir_uniform_ui(c, ~0), + vir_uniform_ui(c, 0))); + break; + + case nir_op_iadd: + result = vir_ADD(c, src[0], src[1]); + break; + case nir_op_ushr: + result = vir_SHR(c, src[0], src[1]); + break; + case nir_op_isub: + result = vir_SUB(c, src[0], src[1]); + break; + case nir_op_ishr: + result = vir_ASR(c, src[0], src[1]); + break; + case nir_op_ishl: + result = vir_SHL(c, src[0], src[1]); + break; + case nir_op_imin: + result = vir_MIN(c, src[0], src[1]); + break; + case nir_op_umin: + result = vir_UMIN(c, src[0], src[1]); + break; + case nir_op_imax: + result = vir_MAX(c, src[0], src[1]); + break; + case nir_op_umax: + result = vir_UMAX(c, src[0], src[1]); + break; + case nir_op_iand: + result = vir_AND(c, src[0], src[1]); + break; + case nir_op_ior: + result = vir_OR(c, src[0], src[1]); + break; + case nir_op_ixor: + result = vir_XOR(c, src[0], src[1]); + break; + case nir_op_inot: + result = vir_NOT(c, src[0]); + break; + + case nir_op_imul: + result = ntq_umul(c, src[0], src[1]); + break; + + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + case nir_op_feq: + case nir_op_fne: + case nir_op_fge: + case nir_op_flt: + case nir_op_ieq: + case nir_op_ine: + case nir_op_ige: + case nir_op_uge: + case nir_op_ilt: + case nir_op_ult: + if (!ntq_emit_comparison(c, &result, instr, instr)) { + fprintf(stderr, "Bad comparison instruction\n"); + } + break; + + case nir_op_bcsel: + result = ntq_emit_bcsel(c, instr, src); + break; + case nir_op_fcsel: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + src[1], src[2])); + break; + + case nir_op_frcp: + result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]); + break; + case nir_op_frsq: + result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]); + break; + case nir_op_fexp2: + result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]); + break; + case nir_op_flog2: + result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]); + break; + + case nir_op_fceil: + result = vir_FCEIL(c, src[0]); + break; + case nir_op_ffloor: + result = vir_FFLOOR(c, src[0]); + break; + case nir_op_fround_even: + result = vir_FROUND(c, src[0]); + break; + case nir_op_ftrunc: + result = vir_FTRUNC(c, src[0]); + break; + case nir_op_ffract: + result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0])); + break; + + case nir_op_fsin: + result = ntq_fsincos(c, src[0], false); + break; + case nir_op_fcos: + result = ntq_fsincos(c, src[0], true); + break; + + case nir_op_fsign: + result = ntq_fsign(c, src[0]); + break; + case nir_op_isign: + result = ntq_isign(c, src[0]); + break; + + case nir_op_fabs: { + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); + break; + } + + case nir_op_iabs: + result = vir_MAX(c, src[0], + vir_SUB(c, vir_uniform_ui(c, 0), src[0])); + break; + + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + result = vir_FDX(c, src[0]); + break; + + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + result = vir_FDY(c, src[0]); + break; + + default: + fprintf(stderr, "unknown NIR ALU inst: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + /* We have a scalar result, so the instruction should only have a + * single channel written to. + */ + assert(util_is_power_of_two(instr->dest.write_mask)); + ntq_store_dest(c, &instr->dest.dest, + ffs(instr->dest.write_mask) - 1, result); +} + +/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit + * specifier. They come from a register that's preloaded with 0xffffffff + * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low + * 8 bits are shifted off the bottom and 0xff shifted in from the top. + */ +#define TLB_TYPE_F16_COLOR (3 << 6) +#define TLB_TYPE_I32_COLOR (1 << 6) +#define TLB_TYPE_F32_COLOR (0 << 6) +#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */ +#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2) +#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2) +#define TLB_F16_SWAP_HI_LO (1 << 1) +#define TLB_VEC_SIZE_4_F16 (1 << 0) +#define TLB_VEC_SIZE_2_F16 (0 << 0) +#define TLB_VEC_SIZE_MINUS_1_SHIFT 0 + +/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z" + * flag is set. + */ +#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4)) +#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */ +#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */ + +/* Stencil is a single 32-bit write. */ +#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4)) + +static void +emit_frag_end(struct v3d_compile *c) +{ + /* XXX + if (c->output_sample_mask_index != -1) { + vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); + } + */ + + if (c->output_position_index != -1) { + struct qinst *inst = vir_MOV_dest(c, + vir_reg(QFILE_TLBU, 0), + c->outputs[c->output_position_index]); + + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, + TLB_TYPE_DEPTH | + TLB_DEPTH_TYPE_PER_PIXEL | + 0xffffff00); + } else if (c->s->info.fs.uses_discard) { + struct qinst *inst = vir_MOV_dest(c, + vir_reg(QFILE_TLBU, 0), + vir_reg(QFILE_NULL, 0)); + + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, + TLB_TYPE_DEPTH | + TLB_DEPTH_TYPE_INVARIANT | + 0xffffff00); + } + + /* XXX: Performance improvement: Merge Z write and color writes TLB + * uniform setup + */ + + for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) { + if (!c->output_color_var[rt]) + continue; + + nir_variable *var = c->output_color_var[rt]; + struct qreg *color = &c->outputs[var->data.driver_location * 4]; + int num_components = glsl_get_vector_elements(var->type); + uint32_t conf = 0xffffff00; + struct qinst *inst; + + conf |= TLB_SAMPLE_MODE_PER_PIXEL; + conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; + + assert(num_components != 0); + switch (glsl_get_base_type(var->type)) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + conf |= TLB_TYPE_I32_COLOR; + conf |= ((num_components - 1) << + TLB_VEC_SIZE_MINUS_1_SHIFT); + + inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, conf); + + for (int i = 1; i < num_components; i++) { + inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), + color[i]); + } + break; + + default: { + struct qreg r = color[0]; + struct qreg g = color[1]; + struct qreg b = color[2]; + struct qreg a = color[3]; + + if (c->fs_key->f32_color_rb) { + conf |= TLB_TYPE_F32_COLOR; + conf |= ((num_components - 1) << + TLB_VEC_SIZE_MINUS_1_SHIFT); + } else { + conf |= TLB_TYPE_F16_COLOR; + conf |= TLB_F16_SWAP_HI_LO; + if (num_components >= 3) + conf |= TLB_VEC_SIZE_4_F16; + else + conf |= TLB_VEC_SIZE_2_F16; + } + + if (c->fs_key->swap_color_rb & (1 << rt)) { + r = color[2]; + b = color[0]; + } + + if (c->fs_key->f32_color_rb & (1 << rt)) { + inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, conf); + + for (int i = 1; i < num_components; i++) { + inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), + color[i]); + } + } else { + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g); + if (conf != ~0) { + inst->dst.file = QFILE_TLBU; + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, conf); + } + + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a); + } + break; + } + } + } +} + +static void +emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w) +{ + for (int i = 0; i < 2; i++) { + struct qreg coord = c->outputs[c->output_position_index + i]; + coord = vir_FMUL(c, coord, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, + 0)); + coord = vir_FMUL(c, coord, rcp_w); + vir_FTOIN_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + coord); + } + +} + +static void +emit_zs_write(struct v3d_compile *c, struct qreg rcp_w) +{ + struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); + struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); + + vir_FADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + vir_FMUL(c, vir_FMUL(c, + c->outputs[c->output_position_index + 2], + zscale), + rcp_w), + zoffset); +} + +static void +emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w) +{ + vir_VPM_WRITE(c, rcp_w); +} + +static void +emit_point_size_write(struct v3d_compile *c) +{ + struct qreg point_size; + + if (c->output_point_size_index != -1) + point_size = c->outputs[c->output_point_size_index]; + else + point_size = vir_uniform_f(c, 1.0); + + /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, + * BCM21553). + */ + point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); + + vir_VPM_WRITE(c, point_size); +} + +static void +emit_vpm_write_setup(struct v3d_compile *c) +{ + uint32_t packed; + struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header, + + .horiz = true, + .laned = false, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = 0, + }; + + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); +} + +static void +emit_vert_end(struct v3d_compile *c) +{ + struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->outputs[c->output_position_index + 3]); + + emit_vpm_write_setup(c); + + if (c->vs_key->is_coord) { + for (int i = 0; i < 4; i++) + vir_VPM_WRITE(c, c->outputs[c->output_position_index + i]); + emit_scaled_viewport_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) { + emit_point_size_write(c); + /* emit_rcp_wc_write(c, rcp_w); */ + } + /* XXX: Z-only rendering */ + if (0) + emit_zs_write(c, rcp_w); + } else { + emit_scaled_viewport_write(c, rcp_w); + emit_zs_write(c, rcp_w); + emit_rcp_wc_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) + emit_point_size_write(c); + } + + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; + int j; + + for (j = 0; j < c->num_outputs; j++) { + struct v3d_varying_slot output = c->output_slots[j]; + + if (!memcmp(&input, &output, sizeof(input))) { + vir_VPM_WRITE(c, c->outputs[j]); + break; + } + } + /* Emit padding if we didn't find a declared VS output for + * this FS input. + */ + if (j == c->num_outputs) + vir_VPM_WRITE(c, vir_uniform_f(c, 0.0)); + } +} + +void +v3d_optimize_nir(struct nir_shader *s) +{ + bool progress; + + do { + progress = false; + + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_lower_alu_to_scalar); + NIR_PASS(progress, s, nir_lower_phis_to_scalar); + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); + } while (progress); +} + +static int +driver_location_compare(const void *in_a, const void *in_b) +{ + const nir_variable *const *a = in_a; + const nir_variable *const *b = in_b; + + return (*a)->data.driver_location - (*b)->data.driver_location; +} + +static struct qreg +ntq_emit_vpm_read(struct v3d_compile *c, + uint32_t *num_components_queued, + uint32_t *remaining, + uint32_t vpm_index) +{ + struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); + + if (*num_components_queued != 0) { + (*num_components_queued)--; + c->num_inputs++; + return vir_MOV(c, vpm); + } + + uint32_t num_components = MIN2(*remaining, 32); + + struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, + + .horiz = true, + .laned = false, + /* If the field is 0, that means a read count of 32. */ + .num = num_components & 31, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = c->num_inputs, + }; + + uint32_t packed; + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); + + *num_components_queued = num_components - 1; + *remaining -= num_components; + c->num_inputs++; + + return vir_MOV(c, vpm); +} + +static void +ntq_setup_inputs(struct v3d_compile *c) +{ + unsigned num_entries = 0; + unsigned num_components = 0; + nir_foreach_variable(var, &c->s->inputs) { + num_entries++; + num_components += glsl_get_components(var->type); + } + + nir_variable *vars[num_entries]; + + unsigned i = 0; + nir_foreach_variable(var, &c->s->inputs) + vars[i++] = var; + + /* Sort the variables so that we emit the input setup in + * driver_location order. This is required for VPM reads, whose data + * is fetched into the VPM in driver_location (TGSI register index) + * order. + */ + qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + + uint32_t vpm_components_queued = 0; + if (c->s->info.stage == MESA_SHADER_VERTEX) { + bool uses_iid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID); + bool uses_vid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID); + + num_components += uses_iid; + num_components += uses_vid; + + if (uses_iid) { + c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + + if (uses_vid) { + c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + } + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location; + + assert(array_len == 1); + (void)array_len; + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + 1) * 4); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + if (var->data.location == VARYING_SLOT_POS) { + emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - + VARYING_SLOT_VAR0))))) { + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; + } else { + emit_fragment_input(c, loc, var); + } + } else { + int var_components = glsl_get_components(var->type); + + for (int i = 0; i < var_components; i++) { + c->inputs[loc * 4 + i] = + ntq_emit_vpm_read(c, + &vpm_components_queued, + &num_components, + loc * 4 + i); + + } + c->vattr_sizes[loc] = var_components; + } + } + + if (c->s->info.stage == MESA_SHADER_VERTEX) { + assert(vpm_components_queued == 0); + assert(num_components == 0); + } +} + +static void +ntq_setup_outputs(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->outputs) { + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location * 4; + + assert(array_len == 1); + (void)array_len; + + for (int i = 0; i < 4; i++) + add_output(c, loc + i, var->data.location, i); + + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + c->output_color_var[0] = var; + c->output_color_var[1] = var; + c->output_color_var[2] = var; + c->output_color_var[3] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; + case FRAG_RESULT_DEPTH: + c->output_position_index = loc; + break; + case FRAG_RESULT_SAMPLE_MASK: + c->output_sample_mask_index = loc; + break; + } + } else { + switch (var->data.location) { + case VARYING_SLOT_POS: + c->output_position_index = loc; + break; + case VARYING_SLOT_PSIZ: + c->output_point_size_index = loc; + break; + } + } + } +} + +static void +ntq_setup_uniforms(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->uniforms) { + uint32_t vec4_count = glsl_count_attribute_slots(var->type, + false); + unsigned vec4_size = 4 * sizeof(float); + + declare_uniform_range(c, var->data.driver_location * vec4_size, + vec4_count * vec4_size); + + } +} + +/** + * Sets up the mapping from nir_register to struct qreg *. + * + * Each nir_register gets a struct qreg per 32-bit component being stored. + */ +static void +ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_register, nir_reg, node, list) { + unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + array_len * + nir_reg->num_components); + + _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); + + for (int i = 0; i < array_len * nir_reg->num_components; i++) + qregs[i] = vir_get_temp(c); + } +} + +static void +ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, instr->value.u32[i]); + + _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); +} + +static void +ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* VIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers(). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, 0); +} + +static void +ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + nir_const_value *const_offset; + unsigned offset; + + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + assert(instr->num_components == 1); + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_ubo: + for (int i = 0; i < instr->num_components; i++) { + int ubo = nir_src_as_const_value(instr->src[0])->u32[0]; + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo), + vir_ADD(c, + ntq_get_src(c, instr->src[1], 0), + vir_uniform_ui(c, i * 4))); + + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); + } + break; + + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_user_clip_plane: + for (int i = 0; i < instr->num_components; i++) { + ntq_store_dest(c, &instr->dest, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); + } + break; + + case nir_intrinsic_load_alpha_ref_float: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); + break; + + case nir_intrinsic_load_sample_mask_in: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); + break; + + case nir_intrinsic_load_front_face: + /* The register contains 0 (front) or 1 (back), and we need to + * turn it into a NIR bool where true means front. + */ + ntq_store_dest(c, &instr->dest, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); + break; + + case nir_intrinsic_load_instance_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + break; + + case nir_intrinsic_load_vertex_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_input: + const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "v3d doesn't support indirect inputs"); + for (int i = 0; i < instr->num_components; i++) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + int comp = nir_intrinsic_component(instr) + i; + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, c->inputs[offset * 4 + comp])); + } + break; + + case nir_intrinsic_store_output: + const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset && "v3d doesn't support indirect outputs"); + offset = ((nir_intrinsic_base(instr) + + const_offset->u32[0]) * 4 + + nir_intrinsic_component(instr)); + + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } + c->num_outputs = MAX2(c->num_outputs, + offset + instr->num_components); + break; + + case nir_intrinsic_discard: + if (c->execute.file != QFILE_NULL) { + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_uniform_ui(c, 0)), + V3D_QPU_COND_IFA); + } else { + vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_uniform_ui(c, 0)); + } + break; + + case nir_intrinsic_discard_if: { + /* true (~0) if we're discarding */ + struct qreg cond = ntq_get_src(c, instr->src[0], 0); + + if (c->execute.file != QFILE_NULL) { + /* execute == 0 means the channel is active. Invert + * the condition so that we can use zero as "executing + * and discarding." + */ + vir_PF(c, vir_AND(c, c->execute, vir_NOT(c, cond)), + V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_uniform_ui(c, 0)), + V3D_QPU_COND_IFA); + } else { + vir_PF(c, cond, V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_uniform_ui(c, 0)), + V3D_QPU_COND_IFNA); + } + + break; + } + + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } +} + +/* Clears (activates) the execute flags for any channels whose jump target + * matches this block. + */ +static void +ntq_activate_execute_for_block(struct v3d_compile *c) +{ + vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)), + V3D_QPU_PF_PUSHZ); + + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); +} + +static void +ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt) +{ + nir_block *nir_else_block = nir_if_first_else_block(if_stmt); + bool empty_else_block = + (nir_else_block == nir_if_last_else_block(if_stmt) && + exec_list_is_empty(&nir_else_block->instr_list)); + + struct qblock *then_block = vir_new_block(c); + struct qblock *after_block = vir_new_block(c); + struct qblock *else_block; + if (empty_else_block) + else_block = after_block; + else + else_block = vir_new_block(c); + + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + /* Set A for executing (execute == 0) and jumping (if->condition == + * 0) channels, and then update execute flags for those to point to + * the ELSE block. + */ + vir_PF(c, vir_OR(c, + c->execute, + ntq_get_src(c, if_stmt->condition, 0)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, + c->execute, + vir_uniform_ui(c, else_block->index)); + + /* Jump to ELSE if nothing is active for THEN, otherwise fall + * through. + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + vir_link_blocks(c->cur_block, then_block); + + /* Process the THEN block. */ + vir_set_emit_block(c, then_block); + ntq_emit_cf_list(c, &if_stmt->then_list); + + if (!empty_else_block) { + /* Handle the end of the THEN block. First, all currently + * active channels update their execute flags to point to + * ENDIF + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, after_block->index)); + + /* If everything points at ENDIF, then jump there immediately. */ + vir_PF(c, vir_SUB(c, c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + vir_link_blocks(c->cur_block, else_block); + + vir_set_emit_block(c, else_block); + ntq_activate_execute_for_block(c); + ntq_emit_cf_list(c, &if_stmt->else_list); + } + + vir_link_blocks(c->cur_block, after_block); + + vir_set_emit_block(c, after_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); +} + +static void +ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_break_block->index)); + break; + + case nir_jump_continue: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)); + break; + + case nir_jump_return: + unreachable("All returns shouold be lowered\n"); + } +} + +static void +ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + ntq_emit_alu(c, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_load_const: + ntq_emit_load_const(c, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + + case nir_instr_type_tex: + ntq_emit_tex(c, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + ntq_emit_jump(c, nir_instr_as_jump(instr)); + break; + + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntq_emit_block(struct v3d_compile *c, nir_block *block) +{ + nir_foreach_instr(instr, block) { + ntq_emit_instr(c, instr); + } +} + +static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) +{ + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + struct qblock *save_loop_cont_block = c->loop_cont_block; + struct qblock *save_loop_break_block = c->loop_break_block; + + c->loop_cont_block = vir_new_block(c); + c->loop_break_block = vir_new_block(c); + + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_set_emit_block(c, c->loop_cont_block); + ntq_activate_execute_for_block(c); + + ntq_emit_cf_list(c, &loop->body); + + /* Re-enable any previous continues now, so our ANYA check below + * works. + * + * XXX: Use the .ORZ flags update, instead. + */ + vir_PF(c, vir_SUB(c, + c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); + + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_link_blocks(c->cur_block, c->loop_break_block); + + vir_set_emit_block(c, c->loop_break_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); + + c->loop_break_block = save_loop_break_block; + c->loop_cont_block = save_loop_cont_block; +} + +static void +ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) +{ + fprintf(stderr, "FUNCTIONS not handled.\n"); + abort(); +} + +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntq_emit_block(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntq_emit_if(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + ntq_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + ntq_emit_function(c, nir_cf_node_as_function(node)); + break; + + default: + fprintf(stderr, "Unknown NIR node type\n"); + abort(); + } + } +} + +static void +ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) +{ + ntq_setup_registers(c, &impl->registers); + ntq_emit_cf_list(c, &impl->body); +} + +static void +nir_to_vir(struct v3d_compile *c) +{ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + + if (c->fs_key->is_points) { + c->point_x = emit_fragment_varying(c, NULL, 0); + c->point_y = emit_fragment_varying(c, NULL, 0); + } else if (c->fs_key->is_lines) { + c->line_x = emit_fragment_varying(c, NULL, 0); + } + } + + ntq_setup_inputs(c); + ntq_setup_outputs(c); + ntq_setup_uniforms(c); + ntq_setup_registers(c, &c->s->registers); + + /* Find the main function and emit the body. */ + nir_foreach_function(function, c->s) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + ntq_emit_impl(c, function->impl); + } +} + +const nir_shader_compiler_options v3d_nir_options = { + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_pack_snorm_4x8 = true, + .lower_ffma = true, + .lower_flrp32 = true, + .lower_fpow = true, + .lower_fsat = true, + .lower_fsqrt = true, + .lower_negate = true, + .native_integers = true, +}; + + +#if 0 +static int +count_nir_instrs(nir_shader *nir) +{ + int count = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) + count++; + } + } + return count; +} +#endif + +void +v3d_nir_to_vir(struct v3d_compile *c) +{ + if (V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + fprintf(stderr, "%s prog %d/%d NIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + nir_print_shader(c->s, stderr); + } + + nir_to_vir(c); + + switch (c->s->info.stage) { + case MESA_SHADER_FRAGMENT: + emit_frag_end(c); + break; + case MESA_SHADER_VERTEX: + emit_vert_end(c); + break; + default: + unreachable("bad stage"); + } + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + vir_optimize(c); + vir_lower_uniforms(c); + + /* XXX: vir_schedule_instructions(c); */ + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->info.stage))) { + fprintf(stderr, "%s prog %d/%d VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + v3d_vir_to_qpu(c); +} |