| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:53:47 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:53:47 +0000 |
| commit | b44518130b33cadb5c1d619e9e936ae0e0dbf7cb (patch) | |
| tree | 6069eb03c39fbc79808a7d94f857118cce75cbe3 /lib/mesa/src/intel/compiler/brw_fs.cpp | |
| parent | 32aeb3c41fedbbd7b11aacfec48e8f699d16bff0 (diff) | |
Merge Mesa 23.1.9
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r-- | lib/mesa/src/intel/compiler/brw_fs.cpp | 1026 |
1 file changed, 583 insertions, 443 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp index db4806b50..21f7071c4 100644 --- a/lib/mesa/src/intel/compiler/brw_fs.cpp +++ b/lib/mesa/src/intel/compiler/brw_fs.cpp @@ -38,11 +38,14 @@ #include "brw_dead_control_flow.h" #include "brw_private.h" #include "dev/intel_debug.h" +#include "dev/intel_wa.h" #include "compiler/glsl_types.h" #include "compiler/nir/nir_builder.h" #include "program/prog_parameter.h" #include "util/u_math.h" +#include <memory> + using namespace brw; static unsigned get_lowered_simd_width(const struct brw_compiler *compiler, @@ -246,7 +249,6 @@ fs_inst::is_control_source(unsigned arg) const { switch (opcode) { case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return arg == 0; @@ -304,9 +306,6 @@ fs_inst::is_payload(unsigned arg) const case SHADER_OPCODE_BARRIER: return arg == 0; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return arg == 1; - case SHADER_OPCODE_SEND: return arg == 2 || arg == 3; @@ -642,10 +641,19 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg) bool fs_inst::is_partial_write() const { - return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || - (this->exec_size * type_sz(this->dst.type)) < 32 || - !this->dst.is_contiguous() || - this->dst.offset % REG_SIZE != 0); + if (this->predicate && !this->predicate_trivial && + this->opcode != BRW_OPCODE_SEL) + return true; + + if (this->dst.offset % REG_SIZE != 0) + return true; + + /* SEND instructions always write whole registers */ + if (this->opcode == SHADER_OPCODE_SEND) + return false; + + return this->exec_size * type_sz(this->dst.type) < 32 || + !this->dst.is_contiguous(); } unsigned @@ -745,13 +753,13 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: - assert(src[2].file == IMM); + assert(src[A64_LOGICAL_ARG].file == IMM); return 1; case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { /* data to write */ - const unsigned comps = src[2].ud / exec_size; + assert(src[A64_LOGICAL_ARG].file == IMM); + if (i == A64_LOGICAL_SRC) { /* data to write */ + const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size; assert(comps > 0); return comps; } else { @@ -773,41 +781,13 @@ fs_inst::components_read(unsigned i) const } case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: - assert(src[2].file == IMM); - return i == 1 ? src[2].ud : 1; + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1; case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { - /* Data source */ - const unsigned op = src[2].ud; - switch (op) { - case BRW_AOP_INC: - case BRW_AOP_DEC: - case BRW_AOP_PREDEC: - return 0; - case BRW_AOP_CMPWR: - return 2; - default: - return 1; - } - } else { - return 1; - } - - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { - /* Data source */ - const unsigned op = src[2].ud; - return op == BRW_AOP_FCMPWR ? 
2 : 1; - } else { - return 1; - } + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? + lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1; case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: @@ -837,31 +817,14 @@ fs_inst::components_read(unsigned i) const if (i == SURFACE_LOGICAL_SRC_ADDRESS) return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; /* Surface operation source. */ - else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR) - return 2; - else if (i == SURFACE_LOGICAL_SRC_DATA && - (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC)) - return 0; + else if (i == SURFACE_LOGICAL_SRC_DATA) + return lsc_op_num_data_values(op); else return 1; } case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return (i == 0 ? 2 : 1); - case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: { - assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && - src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); - const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; - /* Surface coordinates. */ - if (i == SURFACE_LOGICAL_SRC_ADDRESS) - return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; - /* Surface operation source. */ - else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR) - return 2; - else - return 1; - } - case SHADER_OPCODE_URB_WRITE_LOGICAL: if (i == URB_LOGICAL_SRC_DATA) return mlen - 1 - @@ -909,12 +872,6 @@ fs_inst::size_read(int arg) const return 1; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - /* The payload is actually stored in src1 */ - if (arg == 1) - return mlen * REG_SIZE; - break; - case FS_OPCODE_LINTERP: if (arg == 1) return 16; @@ -922,7 +879,7 @@ fs_inst::size_read(int arg) const case SHADER_OPCODE_LOAD_PAYLOAD: if (arg < this->header_size) - return REG_SIZE; + return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8); break; case CS_OPCODE_CS_TERMINATE: @@ -1120,7 +1077,7 @@ fs_visitor::vgrf(const glsl_type *const type) brw_type_for_base_type(type)); } -fs_reg::fs_reg(enum brw_reg_file file, int nr) +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr) { init(); this->file = file; @@ -1129,7 +1086,7 @@ fs_reg::fs_reg(enum brw_reg_file file, int nr) this->stride = (file == UNIFORM ? 
0 : 1); } -fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type) +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type) { init(); this->file = file; @@ -1275,7 +1232,7 @@ fs_visitor::emit_samplepos_setup() const fs_builder abld = bld.annotate("compute sample position"); fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2); - if (!wm_prog_data->persample_dispatch) { + if (wm_prog_data->persample_dispatch == BRW_NEVER) { /* From ARB_sample_shading specification: * "When rendering to a non-multisample buffer, or if multisample * rasterization is disabled, gl_SamplePosition will always be @@ -1310,6 +1267,16 @@ fs_visitor::emit_samplepos_setup() abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f)); } + if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) { + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH); + for (unsigned i = 0; i < 2; i++) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(offset(pos, abld, i), offset(pos, abld, i), + brw_imm_f(0.5f))); + } + } + return pos; } @@ -1318,12 +1285,13 @@ fs_visitor::emit_sampleid_setup() { assert(stage == MESA_SHADER_FRAGMENT); ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); assert(devinfo->ver >= 6); const fs_builder abld = bld.annotate("compute sample id"); fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD); - assert(key->multisample_fbo); + assert(key->multisample_fbo != BRW_NEVER); if (devinfo->ver >= 8) { /* Sample ID comes in as 4-bit numbers in g1.0: @@ -1413,6 +1381,13 @@ fs_visitor::emit_sampleid_setup() abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2); } + if (key->multisample_fbo == BRW_SOMETIMES) { + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO); + set_predicate(BRW_PREDICATE_NORMAL, + abld.SEL(sample_id, sample_id, brw_imm_ud(0))); + } + return sample_id; } @@ -1423,39 +1398,44 @@ fs_visitor::emit_samplemaskin_setup() struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); assert(devinfo->ver >= 6); - fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D); - /* The HW doesn't provide us with expected values. */ - assert(!wm_prog_data->per_coarse_pixel_dispatch); + assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS); fs_reg coverage_mask = fetch_payload_reg(bld, fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D); - if (wm_prog_data->persample_dispatch) { - /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, - * and a mask representing which sample is being processed by the - * current shader invocation. - * - * From the OES_sample_variables specification: - * "When per-sample shading is active due to the use of a fragment input - * qualified by "sample" or due to the use of the gl_SampleID or - * gl_SamplePosition variables, only the bit for the current sample is - * set in gl_SampleMaskIn." - */ - const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); + if (wm_prog_data->persample_dispatch == BRW_NEVER) + return coverage_mask; + + /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, + * and a mask representing which sample is being processed by the + * current shader invocation. + * + * From the OES_sample_variables specification: + * "When per-sample shading is active due to the use of a fragment input + * qualified by "sample" or due to the use of the gl_SampleID or + * gl_SamplePosition variables, only the bit for the current sample is + * set in gl_SampleMaskIn." 
+ */ + const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); - if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) - nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); + if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); + + fs_reg one = vgrf(glsl_type::int_type); + fs_reg enabled_mask = vgrf(glsl_type::int_type); + abld.MOV(one, brw_imm_d(1)); + abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D); + abld.AND(mask, enabled_mask, coverage_mask); + + if (wm_prog_data->persample_dispatch == BRW_ALWAYS) + return mask; + + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH); + set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask)); - fs_reg one = vgrf(glsl_type::int_type); - fs_reg enabled_mask = vgrf(glsl_type::int_type); - abld.MOV(one, brw_imm_d(1)); - abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); - abld.AND(mask, enabled_mask, coverage_mask); - } else { - /* In per-pixel mode, the coverage mask is sufficient. */ - mask = coverage_mask; - } return mask; } @@ -1464,37 +1444,44 @@ fs_visitor::emit_shading_rate_setup() { assert(devinfo->ver >= 11); - const fs_builder abld = bld.annotate("compute fragment shading rate"); - fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD); - struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(bld.shader->stage_prog_data); /* Coarse pixel shading size fields overlap with other fields of not in * coarse pixel dispatch mode, so report 0 when that's not the case. */ - if (wm_prog_data->per_coarse_pixel_dispatch) { - /* The shading rates provided in the shader are the actual 2D shading - * rate while the SPIR-V built-in is the enum value that has the shading - * rate encoded as a bitfield. Fortunately, the bitfield value is just - * the shading rate divided by two and shifted. - */ + if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER) + return brw_imm_ud(0); + + const fs_builder abld = bld.annotate("compute fragment shading rate"); - /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ - fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); - /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ - fs_reg actual_y = byte_offset(actual_x, 1); + /* The shading rates provided in the shader are the actual 2D shading + * rate while the SPIR-V built-in is the enum value that has the shading + * rate encoded as a bitfield. Fortunately, the bitfield value is just + * the shading rate divided by two and shifted. 
+ */ - fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD); - fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD); + /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ + fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); + /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ + fs_reg actual_y = byte_offset(actual_x, 1); - abld.SHR(int_rate_y, actual_y, brw_imm_ud(1)); - abld.SHR(int_rate_x, actual_x, brw_imm_ud(1)); - abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2)); - abld.OR(rate, int_rate_x, int_rate_y); - } else { - abld.MOV(rate, brw_imm_ud(0)); - } + fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD); + + abld.SHR(int_rate_y, actual_y, brw_imm_ud(1)); + abld.SHR(int_rate_x, actual_x, brw_imm_ud(1)); + abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2)); + + fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD); + abld.OR(rate, int_rate_x, int_rate_y); + + if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) + return rate; + + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_COARSE_RT_WRITES); + set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0))); return rate; } @@ -2137,12 +2124,12 @@ fs_visitor::split_virtual_grfs() */ compact_virtual_grfs(); - int num_vars = this->alloc.count; + unsigned num_vars = this->alloc.count; /* Count the total number of registers */ - int reg_count = 0; - int vgrf_to_reg[num_vars]; - for (int i = 0; i < num_vars; i++) { + unsigned reg_count = 0; + unsigned vgrf_to_reg[num_vars]; + for (unsigned i = 0; i < num_vars; i++) { vgrf_to_reg[i] = reg_count; reg_count += alloc.sizes[i]; } @@ -2159,14 +2146,14 @@ fs_visitor::split_virtual_grfs() /* Mark all used registers as fully splittable */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->dst.file == VGRF) { - int reg = vgrf_to_reg[inst->dst.nr]; + unsigned reg = vgrf_to_reg[inst->dst.nr]; for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) split_points[reg + j] = true; } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - int reg = vgrf_to_reg[inst->src[i].nr]; + unsigned reg = vgrf_to_reg[inst->src[i].nr]; for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) split_points[reg + j] = true; } @@ -2181,13 +2168,13 @@ fs_visitor::split_virtual_grfs() } if (inst->dst.file == VGRF) { - int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; for (unsigned j = 1; j < regs_written(inst); j++) split_points[reg + j] = false; } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; for (unsigned j = 1; j < regs_read(inst, i); j++) split_points[reg + j] = false; } @@ -2198,19 +2185,19 @@ fs_visitor::split_virtual_grfs() bool *vgrf_has_split = new bool[num_vars]; memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split)); - int *new_virtual_grf = new int[reg_count]; - int *new_reg_offset = new int[reg_count]; + unsigned *new_virtual_grf = new unsigned[reg_count]; + unsigned *new_reg_offset = new unsigned[reg_count]; - int reg = 0; + unsigned reg = 0; bool has_splits = false; - for (int i = 0; i < num_vars; i++) { + for (unsigned i = 0; i < num_vars; i++) { /* The first one should always be 0 as a quick sanity 
check. */ assert(split_points[reg] == false); /* j = 0 case */ new_reg_offset[reg] = 0; reg++; - int offset = 1; + unsigned offset = 1; /* j > 0 case */ for (unsigned j = 1; j < alloc.sizes[i]; j++) { @@ -2221,8 +2208,8 @@ fs_visitor::split_virtual_grfs() has_splits = true; vgrf_has_split[i] = true; assert(offset <= MAX_VGRF_SIZE); - int grf = alloc.allocate(offset); - for (int k = reg - offset; k < reg; k++) + unsigned grf = alloc.allocate(offset); + for (unsigned k = reg - offset; k < reg; k++) new_virtual_grf[k] = grf; offset = 0; } @@ -2234,7 +2221,7 @@ fs_visitor::split_virtual_grfs() /* The last one gets the original register number */ assert(offset <= MAX_VGRF_SIZE); alloc.sizes[i] = offset; - for (int k = reg - offset; k < reg; k++) + for (unsigned k = reg - offset; k < reg; k++) new_virtual_grf[k] = i; } assert(reg == reg_count); @@ -2268,7 +2255,7 @@ fs_visitor::split_virtual_grfs() } else { reg = vgrf_to_reg[inst->dst.nr]; assert(new_reg_offset[reg] == 0); - assert(new_virtual_grf[reg] == (int)inst->dst.nr); + assert(new_virtual_grf[reg] == inst->dst.nr); } continue; } @@ -2279,14 +2266,13 @@ fs_visitor::split_virtual_grfs() inst->dst.nr = new_virtual_grf[reg]; inst->dst.offset = new_reg_offset[reg] * REG_SIZE + inst->dst.offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < - alloc.sizes[new_virtual_grf[reg]]); + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } else { assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE); - assert(new_virtual_grf[reg] == (int)inst->dst.nr); + assert(new_virtual_grf[reg] == inst->dst.nr); } } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file != VGRF) continue; @@ -2295,11 +2281,10 @@ fs_visitor::split_virtual_grfs() inst->src[i].nr = new_virtual_grf[reg]; inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + inst->src[i].offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < - alloc.sizes[new_virtual_grf[reg]]); + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } else { assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE); - assert(new_virtual_grf[reg] == (int)inst->src[i].nr); + assert(new_virtual_grf[reg] == inst->src[i].nr); } } } @@ -2507,8 +2492,14 @@ fs_visitor::lower_constant_loads() const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); const unsigned base = pull_index * 4; - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); /* Rewrite the instruction to use the temporary VGRF. 
*/ inst->src[i].file = VGRF; @@ -2626,6 +2617,7 @@ fs_visitor::opt_algebraic() /* a * 1.0 = a */ if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; progress = true; break; @@ -2634,6 +2626,7 @@ fs_visitor::opt_algebraic() /* a * -1.0 = -a */ if (inst->src[1].is_negative_one()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[0].negate = !inst->src[0].negate; inst->src[1] = reg_undef; progress = true; @@ -2648,6 +2641,7 @@ fs_visitor::opt_algebraic() if (brw_reg_type_is_integer(inst->src[1].type) && inst->src[1].is_zero()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; progress = true; break; @@ -2656,6 +2650,7 @@ fs_visitor::opt_algebraic() if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[0].f += inst->src[1].f; inst->src[1] = reg_undef; progress = true; @@ -2671,9 +2666,11 @@ fs_visitor::opt_algebraic() */ if (inst->src[0].negate) { inst->opcode = BRW_OPCODE_NOT; + inst->sources = 1; inst->src[0].negate = false; } else { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; } inst->src[1] = reg_undef; progress = true; @@ -2720,6 +2717,7 @@ fs_visitor::opt_algebraic() } if (inst->src[0].equals(inst->src[1])) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->predicate = BRW_PREDICATE_NONE; inst->predicate_inverse = false; @@ -2732,6 +2730,7 @@ fs_visitor::opt_algebraic() case BRW_REGISTER_TYPE_F: if (inst->src[1].f >= 1.0f) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; progress = true; @@ -2747,6 +2746,7 @@ fs_visitor::opt_algebraic() case BRW_REGISTER_TYPE_F: if (inst->src[1].f <= 0.0f) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; progress = true; @@ -2767,11 +2767,13 @@ fs_visitor::opt_algebraic() break; if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; inst->src[1] = inst->src[2]; inst->src[2] = reg_undef; progress = true; } else if (inst->src[2].is_one()) { inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; inst->src[2] = reg_undef; progress = true; } @@ -2850,9 +2852,9 @@ fs_visitor::opt_zero_samples() /* Gfx4 infers the texturing opcode based on the message length so we can't * change it. Gfx12.5 has restrictions on the number of coordinate * parameters that have to be provided for some texture types - * (Wa_14013363432). + * (Wa_14012688258). */ - if (devinfo->ver < 5 || devinfo->verx10 == 125) + if (devinfo->ver < 5 || intel_needs_workaround(devinfo, 14012688258)) return false; bool progress = false; @@ -3725,106 +3727,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds() invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } -/** - * Turns the generic expression-style uniform pull constant load instruction - * into a hardware-specific series of instructions for loading a pull - * constant. - * - * The expression style allows the CSE pass before this to optimize out - * repeated loads from the same offset, and gives the pre-register-allocation - * scheduling full flexibility, while the conversion to native instructions - * allows the post-register-allocation scheduler the best information - * possible. 
- * - * Note that execution masking for setting up pull constant loads is special: - * the channels that need to be written are unrelated to the current execution - * mask, since a later instruction will use one of the result channels as a - * source operand for all 8 or 16 of its channels. - */ -void -fs_visitor::lower_uniform_pull_constant_loads() -{ - foreach_block_and_inst (block, fs_inst, inst, cfg) { - if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) - continue; - - const fs_reg& surface = inst->src[0]; - const fs_reg& offset_B = inst->src[1]; - assert(offset_B.file == IMM); - - if (devinfo->has_lsc) { - const fs_builder ubld = - fs_builder(this, block, inst).group(8, 0).exec_all(); - - const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(payload, offset_B); - - inst->sfid = GFX12_SFID_UGM; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - 1 /* simd_size */, - LSC_ADDR_SURFTYPE_BTI, - LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - inst->size_written / 4, - true /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); - - fs_reg ex_desc; - if (surface.file == IMM) { - ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); - } else { - /* We only need the first component for the payload so we can use - * one of the other components for the extended descriptor - */ - ex_desc = component(payload, 1); - ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); - } - - /* Update the original instruction. */ - inst->opcode = SHADER_OPCODE_SEND; - inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - inst->ex_mlen = 0; - inst->header_size = 0; - inst->send_has_side_effects = false; - inst->send_is_volatile = true; - inst->exec_size = 1; - - /* Finally, the payload */ - inst->resize_sources(3); - inst->src[0] = brw_imm_ud(0); /* desc */ - inst->src[1] = ex_desc; - inst->src[2] = payload; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else if (devinfo->ver >= 7) { - const fs_builder ubld = fs_builder(this, block, inst).exec_all(); - const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); - - ubld.group(8, 0).MOV(payload, - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - ubld.group(1, 0).MOV(component(payload, 2), - brw_imm_ud(offset_B.ud / 16)); - - inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7; - inst->src[1] = payload; - inst->header_size = 1; - inst->mlen = 1; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else { - /* Before register allocation, we didn't tell the scheduler about the - * MRF we use. We know it's safe to use this MRF because nothing - * else does except for register spill/unspill, which generates and - * uses its MRF within a single IR instruction. - */ - inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; - inst->mlen = 1; - } - } -} - bool fs_visitor::lower_load_payload() { @@ -3933,15 +3835,147 @@ fs_visitor::lower_load_payload() return progress; } +/** + * Factor an unsigned 32-bit integer. + * + * Attempts to factor \c x into two values that are at most 0xFFFF. If no + * such factorization is possible, either because the value is too large or is + * prime, both \c result_a and \c result_b will be zero. + */ +static void +factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b) +{ + /* This is necessary to prevent various opportunities for division by zero + * below. + */ + assert(x > 0xffff); + + /* This represents the actual expected constraints on the input. 
Namely, + * both the upper and lower words should be > 1. + */ + assert(x >= 0x00020002); + + *result_a = 0; + *result_b = 0; + + /* The value is too large to factor with the constraints. */ + if (x > (0xffffu * 0xffffu)) + return; + + /* A non-prime number will have the form p*q*d where p is some prime + * number, q > 1, and 1 <= d <= q. To meet the constraints of this + * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p). + * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally, + * floor(x / (0xffff * p)) <= d <= floor(0xffff / p). + * + * The observation is finding the largest possible value of p reduces the + * possible range of d. After selecting p, all values of d in this range + * are tested until a factorization is found. The size of the range of + * possible values of d sets an upper bound on the run time of the + * function. + */ + static const uint16_t primes[256] = { + 2, 3, 5, 7, 11, 13, 17, 19, + 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */ + 137, 139, 149, 151, 157, 163, 167, 173, + 179, 181, 191, 193, 197, 199, 211, 223, + 227, 229, 233, 239, 241, 251, 257, 263, + 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */ + 313, 317, 331, 337, 347, 349, 353, 359, + 367, 373, 379, 383, 389, 397, 401, 409, + 419, 421, 431, 433, 439, 443, 449, 457, + 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */ + 509, 521, 523, 541, 547, 557, 563, 569, + 571, 577, 587, 593, 599, 601, 607, 613, + 617, 619, 631, 641, 643, 647, 653, 659, + 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */ + 727, 733, 739, 743, 751, 757, 761, 769, + 773, 787, 797, 809, 811, 821, 823, 827, + 829, 839, 853, 857, 859, 863, 877, 881, + 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */ + 947, 953, 967, 971, 977, 983, 991, 997, + 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, + 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, + 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */ + 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, + 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, + 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */ + 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, + 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, + 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, + 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */ + }; + + unsigned p; + unsigned x_div_p; + + for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) { + p = primes[i]; + x_div_p = x / p; + + if ((x_div_p * p) == x) + break; + } + + /* A prime factor was not found. */ + if (x_div_p * p != x) + return; + + /* Terminate early if d=1 is a solution. */ + if (x_div_p < 0x10000) { + *result_a = x_div_p; + *result_b = p; + return; + } + + /* Pick the maximum possible value for 'd'. It's important that the loop + * below execute while d <= max_d because max_d is a valid value. Having + * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be + * incorrectly reported as not being factorable. The problem would occur + * with any value that is a factor of two primes in the table and one prime + * not in the table. + */ + const unsigned max_d = 0xffff / p; + + /* Pick an initial value of 'd' that (combined with rejecting too large + * values above) guarantees that 'q' will always be small enough. + * DIV_ROUND_UP is used to prevent 'd' from being zero. 
+ */ + for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) { + unsigned q = x_div_p / d; + + if ((q * d) == x_div_p) { + assert(p * d * q == x); + assert((p * d) < 0x10000); + + *result_a = q; + *result_b = p * d; + break; + } + + /* Since every value of 'd' is tried, as soon as 'd' is larger + * than 'q', we're just re-testing combinations that have + * already been tested. + */ + if (d > q) + break; + } +} + void fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) { const fs_builder ibld(this, block, inst); - const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + /* It is correct to use inst->src[1].d in both end of the comparison. + * Using .ud in the UINT16_MAX comparison would cause any negative value to + * fail the check. + */ if (inst->src[1].file == IMM && - (( ud && inst->src[1].ud <= UINT16_MAX) || - (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) { + (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) { /* The MUL instruction isn't commutative. On Gen <= 6, only the low * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of * src1 are used. @@ -3949,6 +3983,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) * If multiplying by an immediate value that fits in 16-bits, do a * single MUL instruction with that value in the proper location. */ + const bool ud = (inst->src[1].d >= 0); if (devinfo->ver < 7) { fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); ibld.MOV(imm, inst->src[1]); @@ -4029,6 +4064,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) high.stride = inst->dst.stride; high.offset = inst->dst.offset % REG_SIZE; + bool do_addition = true; if (devinfo->ver >= 7) { /* From Wa_1604601757: * @@ -4047,10 +4083,37 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) lower_src_modifiers(this, block, inst, 1); if (inst->src[1].file == IMM) { - ibld.MUL(low, inst->src[0], - brw_imm_uw(inst->src[1].ud & 0xffff)); - ibld.MUL(high, inst->src[0], - brw_imm_uw(inst->src[1].ud >> 16)); + unsigned a; + unsigned b; + + /* If the immeditate value can be factored into two values, A and + * B, that each fit in 16-bits, the multiplication result can + * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B). + * This saves an operation (the addition) and a temporary register + * (high). + * + * Skip the optimization if either the high word or the low word + * is 0 or 1. In these conditions, at least one of the + * multiplications generated by the straightforward method will be + * eliminated anyway. 
+ */ + if (inst->src[1].ud > 0x0001ffff && + (inst->src[1].ud & 0xffff) > 1) { + factor_uint32(inst->src[1].ud, &a, &b); + + if (a != 0) { + ibld.MUL(low, inst->src[0], brw_imm_uw(a)); + ibld.MUL(low, low, brw_imm_uw(b)); + do_addition = false; + } + } + + if (do_addition) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } } else { ibld.MUL(low, inst->src[0], subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); @@ -4067,9 +4130,11 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) inst->src[1]); } - ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(high, BRW_REGISTER_TYPE_UW, 0)); + if (do_addition) { + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + } if (needs_mov || inst->conditional_mod) set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); @@ -5052,7 +5117,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler, return 8; case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: @@ -5074,11 +5138,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler, return inst->exec_size; case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8; case SHADER_OPCODE_URB_READ_LOGICAL: @@ -6197,7 +6256,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) return false; } -/* Wa_14017989577 +/* Wa_14015360517 * * The first instruction of any kernel should have non-zero emask. * Make sure this happens by introducing a dummy mov instruction. 
@@ -6205,7 +6264,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) void fs_visitor::emit_dummy_mov_instruction() { - if (devinfo->verx10 < 120) + if (!intel_needs_workaround(devinfo, 14015360517)) return; struct backend_instruction *first_inst = @@ -6240,7 +6299,7 @@ fs_visitor::emit_dummy_memory_fence_before_eot() bool progress = false; bool has_ugm_write_or_atomic = false; - if (!intel_device_info_is_dg2(devinfo)) + if (!intel_needs_workaround(devinfo, 22013689345)) return; foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { @@ -6388,15 +6447,18 @@ fs_visitor::fixup_nomask_control_flow() */ const bool save_flag = flag_liveout & flag_mask(flag, dispatch_width / 8); - const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type); + const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type); - if (save_flag) + if (save_flag) { + ubld.group(8, 0).UNDEF(tmp); ubld.group(1, 0).MOV(tmp, flag); + } ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); set_predicate(pred, inst); inst->flag_subreg = 0; + inst->predicate_trivial = true; if (save_flag) ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); @@ -6419,6 +6481,18 @@ fs_visitor::fixup_nomask_control_flow() return progress; } +uint32_t +fs_visitor::compute_max_register_pressure() +{ + const register_pressure &rp = regpressure_analysis.require(); + uint32_t ip = 0, max_pressure = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); + ip++; + } + return max_pressure; +} + void fs_visitor::allocate_registers(bool allow_spilling) { @@ -6438,6 +6512,11 @@ fs_visitor::allocate_registers(bool allow_spilling) "lifo" }; + compact_virtual_grfs(); + + if (needs_register_pressure) + shader_stats.max_register_pressure = compute_max_register_pressure(); + bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); /* Before we schedule anything, stash off the instruction order as an array @@ -6587,7 +6666,7 @@ fs_visitor::run_vs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6713,7 +6792,7 @@ fs_visitor::run_tcs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6745,7 +6824,7 @@ fs_visitor::run_tes() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6793,7 +6872,7 @@ fs_visitor::run_gs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6896,7 +6975,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -6936,7 +7015,7 @@ fs_visitor::run_cs(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -6968,7 +7047,7 @@ fs_visitor::run_bs(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); 
allocate_registers(allow_spilling); @@ -7001,7 +7080,7 @@ fs_visitor::run_task(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -7034,7 +7113,7 @@ fs_visitor::run_mesh(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -7045,7 +7124,10 @@ fs_visitor::run_mesh(bool allow_spilling) static bool is_used_in_not_interp_frag_coord(nir_ssa_def *def) { - nir_foreach_use(src, def) { + nir_foreach_use_including_if(src, def) { + if (src->is_if) + return true; + if (src->parent_instr->type != nir_instr_type_intrinsic) return true; @@ -7054,9 +7136,6 @@ is_used_in_not_interp_frag_coord(nir_ssa_def *def) return true; } - nir_foreach_if_use(src, def) - return true; - return false; } @@ -7088,6 +7167,8 @@ brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: break; default: continue; @@ -7182,11 +7263,12 @@ brw_nir_move_interpolation_to_top(nir_shader *nir) continue; nir_block *top = nir_start_block(f->impl); - exec_node *cursor_node = NULL; + nir_cursor cursor = nir_before_instr(nir_block_first_instr(top)); + bool impl_progress = false; - nir_foreach_block(block, f->impl) { - if (block == top) - continue; + for (nir_block *block = nir_block_cf_tree_next(top); + block != NULL; + block = nir_block_cf_tree_next(block)) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) @@ -7212,28 +7294,25 @@ brw_nir_move_interpolation_to_top(nir_shader *nir) for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { if (move[i]->block != top) { - move[i]->block = top; - exec_node_remove(&move[i]->node); - if (cursor_node) { - exec_node_insert_after(cursor_node, &move[i]->node); - } else { - exec_list_push_head(&top->instr_list, &move[i]->node); - } - cursor_node = &move[i]->node; - progress = true; + nir_instr_move(cursor, move[i]); + impl_progress = true; } } } } - nir_metadata_preserve(f->impl, nir_metadata_block_index | - nir_metadata_dominance); + + progress = progress || impl_progress; + + nir_metadata_preserve(f->impl, impl_progress ? 
(nir_metadata_block_index | + nir_metadata_dominance) + : nir_metadata_all); } return progress; } static void -brw_nir_populate_wm_prog_data(const nir_shader *shader, +brw_nir_populate_wm_prog_data(nir_shader *shader, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, @@ -7252,10 +7331,28 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - prog_data->persample_dispatch = - key->multisample_fbo && - (key->persample_interp || - shader->info.fs.uses_sample_shading); + prog_data->sample_shading = + shader->info.fs.uses_sample_shading || + shader->info.outputs_read; + + assert(key->multisample_fbo != BRW_NEVER || + key->persample_interp == BRW_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = BRW_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, + key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + assert(prog_data->alpha_to_coverage != BRW_SOMETIMES || + prog_data->persample_dispatch == BRW_SOMETIMES); if (devinfo->ver >= 6) { prog_data->uses_sample_mask = @@ -7270,11 +7367,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, * per-sample dispatch. If we need gl_SamplePosition and we don't have * persample dispatch, we hard-code it to 0.5. */ - prog_data->uses_pos_offset = prog_data->persample_dispatch && - (BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + prog_data->read_pos_offset_input = + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER); + + if (prog_data->read_pos_offset_input) + prog_data->uses_pos_offset = prog_data->persample_dispatch; + else + prog_data->uses_pos_offset = BRW_NEVER; } prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; @@ -7285,19 +7387,72 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->barycentric_interp_modes = brw_compute_barycentric_interp_modes(devinfo, shader); + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. 
+ */ + if (prog_data->persample_dispatch == BRW_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + prog_data->uses_nonperspective_interp_modes |= (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0; - /* You can't be coarse and per-sample */ - assert(!key->coarse_pixel || !key->persample_interp); - prog_data->per_coarse_pixel_dispatch = - key->coarse_pixel && - !shader->info.fs.uses_sample_shading && - !prog_data->uses_omask && - !prog_data->uses_sample_mask && - (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) && - !prog_data->computed_stencil; + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & BRW_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. + */ + assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); + + prog_data->coarse_pixel_dispatch = + brw_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + prog_data->uses_omask || + prog_data->sample_shading || + prog_data->uses_sample_mask || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil) { + prog_data->coarse_pixel_dispatch = BRW_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (brw_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = BRW_NEVER; /* We choose to always enable VMask prior to XeHP, as it would cause * us to lose out on the eliminate_find_live_channel() optimization. 
@@ -7305,16 +7460,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->uses_vmask = devinfo->verx10 < 125 || shader->info.fs.needs_quad_helper_invocations || shader->info.fs.needs_all_helper_invocations || - prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_NEVER; prog_data->uses_src_w = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); prog_data->uses_src_depth = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - !prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_ALWAYS; prog_data->uses_depth_w_coefficients = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_NEVER; calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); brw_compute_flat_inputs(prog_data, shader); @@ -7361,35 +7516,35 @@ brw_compile_fs(const struct brw_compiler *compiler, * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in * hardware, regardless of the state setting for this feature." */ - if (devinfo->ver > 6 && key->alpha_to_coverage) { + if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) { /* Run constant fold optimization in order to get the correct source * offset to determine render target 0 store instruction in * emit_alpha_to_coverage pass. */ - NIR_PASS_V(nir, nir_opt_constant_folding); - NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage); + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); } - NIR_PASS_V(nir, brw_nir_move_interpolation_to_top); + NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); brw_postprocess_nir(nir, compiler, true, debug_enabled, key->base.robust_buffer_access); brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, params->mue_map); - fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; + std::unique_ptr<fs_visitor> v8, v16, v32; cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; float throughput = 0; bool has_spilled = false; - v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 8, - debug_enabled); + v8 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 8, + params->stats != NULL, + debug_enabled); if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg); - delete v8; return NULL; - } else if (!INTEL_DEBUG(DEBUG_NO8)) { + } else if (INTEL_SIMD(FS, 8)) { simd8_cfg = v8->cfg; prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs; prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); @@ -7403,7 +7558,7 @@ brw_compile_fs(const struct brw_compiler *compiler, * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 */ if (devinfo->ver == 8 && prog_data->dual_src_blend && - !INTEL_DEBUG(DEBUG_NO8)) { + INTEL_SIMD(FS, 8)) { assert(!params->use_rep_send); v8->limit_dispatch_width(8, "gfx8 workaround: " "using SIMD8 when dual src blending.\n"); @@ -7423,12 +7578,13 @@ brw_compile_fs(const struct brw_compiler *compiler, if (!has_spilled && v8->max_dispatch_width >= 16 && - (!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) { + (INTEL_SIMD(FS, 16) || params->use_rep_send)) { /* Try a SIMD16 compile */ - v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 16, - debug_enabled); - v16->import_uniforms(v8); + v16 = 
std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 16, + params->stats != NULL, + debug_enabled); + v16->import_uniforms(v8.get()); if (!v16->run_fs(allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->log_data, "SIMD16 shader failed to compile: %s\n", @@ -7450,12 +7606,13 @@ brw_compile_fs(const struct brw_compiler *compiler, if (!has_spilled && v8->max_dispatch_width >= 32 && !params->use_rep_send && devinfo->ver >= 6 && !simd16_failed && - !INTEL_DEBUG(DEBUG_NO32)) { + INTEL_SIMD(FS, 32)) { /* Try a SIMD32 compile */ - v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 32, - debug_enabled); - v32->import_uniforms(v8); + v32 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 32, + params->stats != NULL, + debug_enabled); + v32->import_uniforms(v8.get()); if (!v32->run_fs(allow_spilling, false)) { brw_shader_perf_log(compiler, params->log_data, "SIMD32 shader failed to compile: %s\n", @@ -7520,12 +7677,14 @@ brw_compile_fs(const struct brw_compiler *compiler, } struct brw_compile_stats *stats = params->stats; + uint32_t max_dispatch_width = 0; if (simd8_cfg) { prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8, v8->shader_stats, v8->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8; } if (simd16_cfg) { @@ -7534,6 +7693,7 @@ brw_compile_fs(const struct brw_compiler *compiler, simd16_cfg, 16, v16->shader_stats, v16->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; + max_dispatch_width = 16; } if (simd32_cfg) { @@ -7542,14 +7702,13 @@ brw_compile_fs(const struct brw_compiler *compiler, simd32_cfg, 32, v32->shader_stats, v32->performance_analysis.require(), stats); stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = 32; } - g.add_const_data(nir->constant_data, nir->constant_data_size); - - delete v8; - delete v16; - delete v32; + for (struct brw_compile_stats *s = params->stats; s != NULL && s != stats; s++) + s->max_dispatch_width = max_dispatch_width; + g.add_const_data(nir->constant_data, nir->constant_data_size); return g.get_assembly(); } @@ -7707,15 +7866,17 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->local_size[2] = nir->info.workgroup_size[2]; } - const unsigned required_dispatch_width = - brw_required_dispatch_width(&nir->info); + brw_simd_selection_state simd_state{ + .mem_ctx = mem_ctx, + .devinfo = compiler->devinfo, + .prog_data = prog_data, + .required_width = brw_required_dispatch_width(&nir->info), + }; - fs_visitor *v[3] = {0}; - const char *error[3] = {0}; + std::unique_ptr<fs_visitor> v[3]; for (unsigned simd = 0; simd < 3; simd++) { - if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, prog_data, - required_dispatch_width, &error[simd])) + if (!brw_simd_should_compile(simd_state, simd)) continue; const unsigned dispatch_width = 8u << simd; @@ -7733,24 +7894,23 @@ brw_compile_cs(const struct brw_compiler *compiler, brw_postprocess_nir(shader, compiler, true, debug_enabled, key->base.robust_buffer_access); - v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, shader, dispatch_width, - debug_enabled); + v[simd] = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, shader, dispatch_width, + params->stats != NULL, + debug_enabled); - if (prog_data->prog_mask) { - unsigned first = ffs(prog_data->prog_mask) - 1; - v[simd]->import_uniforms(v[first]); - } + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); - const bool allow_spilling = !prog_data->prog_mask || - nir->info.workgroup_size_variable; + const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; if (v[simd]->run_cs(allow_spilling)) { cs_fill_push_const_info(compiler->devinfo, prog_data); - brw_simd_mark_compiled(simd, prog_data, v[simd]->spilled_any_registers); + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); } else { - error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); + simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); if (simd > 0) { brw_shader_perf_log(compiler, params->log_data, "SIMD%u shader failed to compile: %s\n", @@ -7759,21 +7919,20 @@ brw_compile_cs(const struct brw_compiler *compiler, } } - const int selected_simd = brw_simd_select(prog_data); + const int selected_simd = brw_simd_select(simd_state); if (selected_simd < 0) { params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n", - error[0], error[1], error[2]);; + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); return NULL; } assert(selected_simd < 3); - fs_visitor *selected = v[selected_simd]; + fs_visitor *selected = v[selected_simd].get(); if (!nir->info.workgroup_size_variable) prog_data->prog_mask = 1 << selected_simd; - const unsigned *ret = NULL; - fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base, selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE); if (unlikely(debug_enabled)) { @@ -7784,6 +7943,8 @@ brw_compile_cs(const struct brw_compiler *compiler, g.enable_debug(name); } + uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); + struct brw_compile_stats *stats = 
params->stats; for (unsigned simd = 0; simd < 3; simd++) { if (prog_data->prog_mask & (1u << simd)) { @@ -7791,19 +7952,16 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->prog_offset[simd] = g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, v[simd]->performance_analysis.require(), stats); + if (stats) + stats->max_dispatch_width = max_dispatch_width; stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8u << simd; } } g.add_const_data(nir->constant_data, nir->constant_data_size); - ret = g.get_assembly(); - - delete v[0]; - delete v[1]; - delete v[2]; - - return ret; + return g.get_assembly(); } struct brw_cs_dispatch_info @@ -7817,9 +7975,7 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, override_local_size ? override_local_size : prog_data->local_size; - const int simd = - override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) : - brw_simd_select(prog_data); + const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes); assert(simd >= 0 && simd < 3); info.group_size = sizes[0] * sizes[1] * sizes[2]; @@ -7857,81 +8013,65 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data, brw_postprocess_nir(shader, compiler, true, debug_enabled, key->base.robust_buffer_access); - fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL; - bool has_spilled = false; + brw_simd_selection_state simd_state{ + .mem_ctx = mem_ctx, + .devinfo = compiler->devinfo, + .prog_data = prog_data, - uint8_t simd_size = 0; - if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING || - shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_8) && - !INTEL_DEBUG(DEBUG_NO8)) { - v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, - 8, debug_enabled); - const bool allow_spilling = true; - if (!v8->run_bs(allow_spilling)) { - if (error_str) - *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); - delete v8; - return 0; - } else { - v = v8; - simd_size = 8; - if (v8->spilled_any_registers) - has_spilled = true; - } - } + /* Since divergence is a lot more likely in RT than compute, it makes + * sense to limit ourselves to SIMD8 for now. 
+ */ + .required_width = 8, + }; - if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING || - shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_16) && - !has_spilled && !INTEL_DEBUG(DEBUG_NO16)) { - v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, - 16, debug_enabled); - const bool allow_spilling = (v == NULL); - if (!v16->run_bs(allow_spilling)) { - brw_shader_perf_log(compiler, log_data, - "SIMD16 shader failed to compile: %s\n", - v16->fail_msg); - if (v == NULL) { - assert(v8 == NULL); - if (error_str) { - *error_str = ralloc_asprintf( - mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s", - v16->fail_msg); - } - delete v16; - return 0; - } + std::unique_ptr<fs_visitor> v[2]; + + for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + v[simd] = std::make_unique<fs_visitor>(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, shader, + dispatch_width, + stats != NULL, + debug_enabled); + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_bs(allow_spilling)) { + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); } else { - v = v16; - simd_size = 16; - if (v16->spilled_any_registers) - has_spilled = true; + simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, log_data, + "SIMD%u shader failed to compile: %s", + dispatch_width, v[simd]->fail_msg); + } } } - if (unlikely(v == NULL)) { - assert(INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16)); - if (error_str) { - *error_str = ralloc_strdup(mem_ctx, - "Cannot satisfy INTEL_DEBUG flags SIMD restrictions"); - } - return false; + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + *error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s and %s.", + simd_state.error[0], simd_state.error[1]); + return 0; } - assert(v); + assert(selected_simd < int(ARRAY_SIZE(v))); + fs_visitor *selected = v[selected_simd].get(); + assert(selected); + + const unsigned dispatch_width = selected->dispatch_width; - int offset = g->generate_code(v->cfg, simd_size, v->shader_stats, - v->performance_analysis.require(), stats); + int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), stats); if (prog_offset) *prog_offset = offset; else assert(offset == 0); - delete v8; - delete v16; - - return simd_size; + return dispatch_width; } uint64_t |