author     Jonathan Gray <jsg@cvs.openbsd.org>   2023-11-02 04:53:47 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>   2023-11-02 04:53:47 +0000
commit     b44518130b33cadb5c1d619e9e936ae0e0dbf7cb (patch)
tree       6069eb03c39fbc79808a7d94f857118cce75cbe3 /lib/mesa/src/intel/compiler/brw_fs.cpp
parent     32aeb3c41fedbbd7b11aacfec48e8f699d16bff0 (diff)
Merge Mesa 23.1.9
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r--   lib/mesa/src/intel/compiler/brw_fs.cpp   1026
1 file changed, 583 insertions, 443 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp
index db4806b50..21f7071c4 100644
--- a/lib/mesa/src/intel/compiler/brw_fs.cpp
+++ b/lib/mesa/src/intel/compiler/brw_fs.cpp
@@ -38,11 +38,14 @@
#include "brw_dead_control_flow.h"
#include "brw_private.h"
#include "dev/intel_debug.h"
+#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "program/prog_parameter.h"
#include "util/u_math.h"
+#include <memory>
+
using namespace brw;
static unsigned get_lowered_simd_width(const struct brw_compiler *compiler,
@@ -246,7 +249,6 @@ fs_inst::is_control_source(unsigned arg) const
{
switch (opcode) {
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
return arg == 0;
@@ -304,9 +306,6 @@ fs_inst::is_payload(unsigned arg) const
case SHADER_OPCODE_BARRIER:
return arg == 0;
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
- return arg == 1;
-
case SHADER_OPCODE_SEND:
return arg == 2 || arg == 3;
@@ -642,10 +641,19 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
bool
fs_inst::is_partial_write() const
{
- return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
- (this->exec_size * type_sz(this->dst.type)) < 32 ||
- !this->dst.is_contiguous() ||
- this->dst.offset % REG_SIZE != 0);
+ if (this->predicate && !this->predicate_trivial &&
+ this->opcode != BRW_OPCODE_SEL)
+ return true;
+
+ if (this->dst.offset % REG_SIZE != 0)
+ return true;
+
+ /* SEND instructions always write whole registers */
+ if (this->opcode == SHADER_OPCODE_SEND)
+ return false;
+
+ return this->exec_size * type_sz(this->dst.type) < 32 ||
+ !this->dst.is_contiguous();
}
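
A quick illustration of the reworked predicate (sketch only, not part of the
commit; REG_SIZE is 32 bytes on these GPUs):

   /* Illustrative cases for the new is_partial_write():
    *  - a SIMD8 MOV to a 16-bit (W/UW) destination writes 8 * 2 = 16 bytes,
    *    which is < 32, so it is a partial write even when unpredicated;
    *  - a SIMD16 MOV to a 32-bit (F/D) contiguous, register-aligned
    *    destination writes 64 bytes and is a full write;
    *  - SHADER_OPCODE_SEND with a register-aligned destination is never
    *    partial, since send messages always write whole registers.
    */
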
unsigned
@@ -745,13 +753,13 @@ fs_inst::components_read(unsigned i) const
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
- assert(src[2].file == IMM);
+ assert(src[A64_LOGICAL_ARG].file == IMM);
return 1;
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
- assert(src[2].file == IMM);
- if (i == 1) { /* data to write */
- const unsigned comps = src[2].ud / exec_size;
+ assert(src[A64_LOGICAL_ARG].file == IMM);
+ if (i == A64_LOGICAL_SRC) { /* data to write */
+ const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
assert(comps > 0);
return comps;
} else {
@@ -773,41 +781,13 @@ fs_inst::components_read(unsigned i) const
}
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
- assert(src[2].file == IMM);
- return i == 1 ? src[2].ud : 1;
+ assert(src[A64_LOGICAL_ARG].file == IMM);
+ return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
- assert(src[2].file == IMM);
- if (i == 1) {
- /* Data source */
- const unsigned op = src[2].ud;
- switch (op) {
- case BRW_AOP_INC:
- case BRW_AOP_DEC:
- case BRW_AOP_PREDEC:
- return 0;
- case BRW_AOP_CMPWR:
- return 2;
- default:
- return 1;
- }
- } else {
- return 1;
- }
-
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
- assert(src[2].file == IMM);
- if (i == 1) {
- /* Data source */
- const unsigned op = src[2].ud;
- return op == BRW_AOP_FCMPWR ? 2 : 1;
- } else {
- return 1;
- }
+ assert(src[A64_LOGICAL_ARG].file == IMM);
+ return i == A64_LOGICAL_SRC ?
+ lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
@@ -837,31 +817,14 @@ fs_inst::components_read(unsigned i) const
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
/* Surface operation source. */
- else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
- return 2;
- else if (i == SURFACE_LOGICAL_SRC_DATA &&
- (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
- return 0;
+ else if (i == SURFACE_LOGICAL_SRC_DATA)
+ return lsc_op_num_data_values(op);
else
return 1;
}
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
return (i == 0 ? 2 : 1);
- case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
- assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
- src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
- const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
- /* Surface coordinates. */
- if (i == SURFACE_LOGICAL_SRC_ADDRESS)
- return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
- /* Surface operation source. */
- else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
- return 2;
- else
- return 1;
- }
-
case SHADER_OPCODE_URB_WRITE_LOGICAL:
if (i == URB_LOGICAL_SRC_DATA)
return mlen - 1 -
@@ -909,12 +872,6 @@ fs_inst::size_read(int arg) const
return 1;
break;
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
- /* The payload is actually stored in src1 */
- if (arg == 1)
- return mlen * REG_SIZE;
- break;
-
case FS_OPCODE_LINTERP:
if (arg == 1)
return 16;
@@ -922,7 +879,7 @@ fs_inst::size_read(int arg) const
case SHADER_OPCODE_LOAD_PAYLOAD:
if (arg < this->header_size)
- return REG_SIZE;
+ return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8);
break;
case CS_OPCODE_CS_TERMINATE:
@@ -1120,7 +1077,7 @@ fs_visitor::vgrf(const glsl_type *const type)
brw_type_for_base_type(type));
}
-fs_reg::fs_reg(enum brw_reg_file file, int nr)
+fs_reg::fs_reg(enum brw_reg_file file, unsigned nr)
{
init();
this->file = file;
@@ -1129,7 +1086,7 @@ fs_reg::fs_reg(enum brw_reg_file file, int nr)
this->stride = (file == UNIFORM ? 0 : 1);
}
-fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
+fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type)
{
init();
this->file = file;
@@ -1275,7 +1232,7 @@ fs_visitor::emit_samplepos_setup()
const fs_builder abld = bld.annotate("compute sample position");
fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);
- if (!wm_prog_data->persample_dispatch) {
+ if (wm_prog_data->persample_dispatch == BRW_NEVER) {
/* From ARB_sample_shading specification:
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SamplePosition will always be
@@ -1310,6 +1267,16 @@ fs_visitor::emit_samplepos_setup()
abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
}
+ if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
+ check_dynamic_msaa_flag(abld, wm_prog_data,
+ BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
+ for (unsigned i = 0; i < 2; i++) {
+ set_predicate(BRW_PREDICATE_NORMAL,
+ bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
+ brw_imm_f(0.5f)));
+ }
+ }
+
return pos;
}
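
The BRW_NEVER / BRW_SOMETIMES / BRW_ALWAYS tri-state and the
check_dynamic_msaa_flag() + predicated SEL pattern used here recur in the
sample-id, sample-mask and shading-rate setup below; a condensed sketch
(illustrative, assuming the enum ordering BRW_NEVER < BRW_SOMETIMES <
BRW_ALWAYS):

   /* Sketch of the dynamic-MSAA pattern (not a verbatim excerpt):
    *   BRW_NEVER     -> emit only the non-MSAA value
    *   BRW_ALWAYS    -> emit only the per-sample value
    *   BRW_SOMETIMES -> emit the per-sample value, then
    *        check_dynamic_msaa_flag(abld, wm_prog_data, flag);
    *        set_predicate(BRW_PREDICATE_NORMAL,
    *                      bld.SEL(dst, per_sample_value, fallback));
    * so the choice is deferred to a run-time MSAA flag: the helper sets the
    * flag register tested by the predicated SEL.
    */
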
@@ -1318,12 +1285,13 @@ fs_visitor::emit_sampleid_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
assert(devinfo->ver >= 6);
const fs_builder abld = bld.annotate("compute sample id");
fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);
- assert(key->multisample_fbo);
+ assert(key->multisample_fbo != BRW_NEVER);
if (devinfo->ver >= 8) {
/* Sample ID comes in as 4-bit numbers in g1.0:
@@ -1413,6 +1381,13 @@ fs_visitor::emit_sampleid_setup()
abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
}
+ if (key->multisample_fbo == BRW_SOMETIMES) {
+ check_dynamic_msaa_flag(abld, wm_prog_data,
+ BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO);
+ set_predicate(BRW_PREDICATE_NORMAL,
+ abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
+ }
+
return sample_id;
}
@@ -1423,39 +1398,44 @@ fs_visitor::emit_samplemaskin_setup()
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
assert(devinfo->ver >= 6);
- fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
-
/* The HW doesn't provide us with expected values. */
- assert(!wm_prog_data->per_coarse_pixel_dispatch);
+ assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);
fs_reg coverage_mask =
fetch_payload_reg(bld, fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);
- if (wm_prog_data->persample_dispatch) {
- /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
- * and a mask representing which sample is being processed by the
- * current shader invocation.
- *
- * From the OES_sample_variables specification:
- * "When per-sample shading is active due to the use of a fragment input
- * qualified by "sample" or due to the use of the gl_SampleID or
- * gl_SamplePosition variables, only the bit for the current sample is
- * set in gl_SampleMaskIn."
- */
- const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
+ if (wm_prog_data->persample_dispatch == BRW_NEVER)
+ return coverage_mask;
+
+ /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
+ * and a mask representing which sample is being processed by the
+ * current shader invocation.
+ *
+ * From the OES_sample_variables specification:
+ * "When per-sample shading is active due to the use of a fragment input
+ * qualified by "sample" or due to the use of the gl_SampleID or
+ * gl_SamplePosition variables, only the bit for the current sample is
+ * set in gl_SampleMaskIn."
+ */
+ const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
- if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
- nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
+ if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+ nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
+
+ fs_reg one = vgrf(glsl_type::int_type);
+ fs_reg enabled_mask = vgrf(glsl_type::int_type);
+ abld.MOV(one, brw_imm_d(1));
+ abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
+ fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
+ abld.AND(mask, enabled_mask, coverage_mask);
+
+ if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
+ return mask;
+
+ check_dynamic_msaa_flag(abld, wm_prog_data,
+ BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
+ set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
- fs_reg one = vgrf(glsl_type::int_type);
- fs_reg enabled_mask = vgrf(glsl_type::int_type);
- abld.MOV(one, brw_imm_d(1));
- abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
- abld.AND(mask, enabled_mask, coverage_mask);
- } else {
- /* In per-pixel mode, the coverage mask is sufficient. */
- mask = coverage_mask;
- }
return mask;
}
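
A worked example of the per-sample mask computation above (illustrative
values, not from the commit):

   /* sample_id == 2, input coverage mask == 0xf (all four samples covered):
    *   enabled_mask = 1 << 2        = 0b0100
    *   mask         = 0b0100 & 0xf  = 0b0100
    * only the bit for the current sample survives, matching the quoted
    * OES_sample_variables text; in the BRW_SOMETIMES case the predicated SEL
    * falls back to the raw coverage mask when per-sample dispatch is off.
    */
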
@@ -1464,37 +1444,44 @@ fs_visitor::emit_shading_rate_setup()
{
assert(devinfo->ver >= 11);
- const fs_builder abld = bld.annotate("compute fragment shading rate");
- fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
-
struct brw_wm_prog_data *wm_prog_data =
brw_wm_prog_data(bld.shader->stage_prog_data);
/* Coarse pixel shading size fields overlap with other fields if not in
* coarse pixel dispatch mode, so report 0 when that's not the case.
*/
- if (wm_prog_data->per_coarse_pixel_dispatch) {
- /* The shading rates provided in the shader are the actual 2D shading
- * rate while the SPIR-V built-in is the enum value that has the shading
- * rate encoded as a bitfield. Fortunately, the bitfield value is just
- * the shading rate divided by two and shifted.
- */
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
+ return brw_imm_ud(0);
+
+ const fs_builder abld = bld.annotate("compute fragment shading rate");
- /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
- fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
- /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
- fs_reg actual_y = byte_offset(actual_x, 1);
+ /* The shading rates provided in the shader are the actual 2D shading
+ * rate while the SPIR-V built-in is the enum value that has the shading
+ * rate encoded as a bitfield. Fortunately, the bitfield value is just
+ * the shading rate divided by two and shifted.
+ */
- fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
- fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
+ fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
+ /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
+ fs_reg actual_y = byte_offset(actual_x, 1);
- abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
- abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
- abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
- abld.OR(rate, int_rate_x, int_rate_y);
- } else {
- abld.MOV(rate, brw_imm_ud(0));
- }
+ fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
+ abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
+ abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
+
+ fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
+ abld.OR(rate, int_rate_x, int_rate_y);
+
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
+ return rate;
+
+ check_dynamic_msaa_flag(abld, wm_prog_data,
+ BRW_WM_MSAA_FLAG_COARSE_RT_WRITES);
+ set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
return rate;
}
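
A worked example of the packing above (illustrative):

   /* ActualCoarsePixelShadingSize.X = 4, .Y = 2 (a 4x2 coarse pixel):
    *   int_rate_x = 4 >> 1 = 2;   int_rate_y = 2 >> 1 = 1;
    *   rate = (2 << 2) | 1 = 0b1001
    * i.e. the shading-rate bitfield with the "horizontal 4 pixels" and
    * "vertical 2 pixels" bits set, as expected by the SPIR-V built-in.
    */
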
@@ -2137,12 +2124,12 @@ fs_visitor::split_virtual_grfs()
*/
compact_virtual_grfs();
- int num_vars = this->alloc.count;
+ unsigned num_vars = this->alloc.count;
/* Count the total number of registers */
- int reg_count = 0;
- int vgrf_to_reg[num_vars];
- for (int i = 0; i < num_vars; i++) {
+ unsigned reg_count = 0;
+ unsigned vgrf_to_reg[num_vars];
+ for (unsigned i = 0; i < num_vars; i++) {
vgrf_to_reg[i] = reg_count;
reg_count += alloc.sizes[i];
}
@@ -2159,14 +2146,14 @@ fs_visitor::split_virtual_grfs()
/* Mark all used registers as fully splittable */
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (inst->dst.file == VGRF) {
- int reg = vgrf_to_reg[inst->dst.nr];
+ unsigned reg = vgrf_to_reg[inst->dst.nr];
for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
split_points[reg + j] = true;
}
- for (int i = 0; i < inst->sources; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
- int reg = vgrf_to_reg[inst->src[i].nr];
+ unsigned reg = vgrf_to_reg[inst->src[i].nr];
for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
split_points[reg + j] = true;
}
@@ -2181,13 +2168,13 @@ fs_visitor::split_virtual_grfs()
}
if (inst->dst.file == VGRF) {
- int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+ unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
for (unsigned j = 1; j < regs_written(inst); j++)
split_points[reg + j] = false;
}
- for (int i = 0; i < inst->sources; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
- int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+ unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
for (unsigned j = 1; j < regs_read(inst, i); j++)
split_points[reg + j] = false;
}
@@ -2198,19 +2185,19 @@ fs_visitor::split_virtual_grfs()
bool *vgrf_has_split = new bool[num_vars];
memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
- int *new_virtual_grf = new int[reg_count];
- int *new_reg_offset = new int[reg_count];
+ unsigned *new_virtual_grf = new unsigned[reg_count];
+ unsigned *new_reg_offset = new unsigned[reg_count];
- int reg = 0;
+ unsigned reg = 0;
bool has_splits = false;
- for (int i = 0; i < num_vars; i++) {
+ for (unsigned i = 0; i < num_vars; i++) {
/* The first one should always be 0 as a quick sanity check. */
assert(split_points[reg] == false);
/* j = 0 case */
new_reg_offset[reg] = 0;
reg++;
- int offset = 1;
+ unsigned offset = 1;
/* j > 0 case */
for (unsigned j = 1; j < alloc.sizes[i]; j++) {
@@ -2221,8 +2208,8 @@ fs_visitor::split_virtual_grfs()
has_splits = true;
vgrf_has_split[i] = true;
assert(offset <= MAX_VGRF_SIZE);
- int grf = alloc.allocate(offset);
- for (int k = reg - offset; k < reg; k++)
+ unsigned grf = alloc.allocate(offset);
+ for (unsigned k = reg - offset; k < reg; k++)
new_virtual_grf[k] = grf;
offset = 0;
}
@@ -2234,7 +2221,7 @@ fs_visitor::split_virtual_grfs()
/* The last one gets the original register number */
assert(offset <= MAX_VGRF_SIZE);
alloc.sizes[i] = offset;
- for (int k = reg - offset; k < reg; k++)
+ for (unsigned k = reg - offset; k < reg; k++)
new_virtual_grf[k] = i;
}
assert(reg == reg_count);
@@ -2268,7 +2255,7 @@ fs_visitor::split_virtual_grfs()
} else {
reg = vgrf_to_reg[inst->dst.nr];
assert(new_reg_offset[reg] == 0);
- assert(new_virtual_grf[reg] == (int)inst->dst.nr);
+ assert(new_virtual_grf[reg] == inst->dst.nr);
}
continue;
}
@@ -2279,14 +2266,13 @@ fs_visitor::split_virtual_grfs()
inst->dst.nr = new_virtual_grf[reg];
inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
inst->dst.offset % REG_SIZE;
- assert((unsigned)new_reg_offset[reg] <
- alloc.sizes[new_virtual_grf[reg]]);
+ assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
} else {
assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
- assert(new_virtual_grf[reg] == (int)inst->dst.nr);
+ assert(new_virtual_grf[reg] == inst->dst.nr);
}
}
- for (int i = 0; i < inst->sources; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file != VGRF)
continue;
@@ -2295,11 +2281,10 @@ fs_visitor::split_virtual_grfs()
inst->src[i].nr = new_virtual_grf[reg];
inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
inst->src[i].offset % REG_SIZE;
- assert((unsigned)new_reg_offset[reg] <
- alloc.sizes[new_virtual_grf[reg]]);
+ assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
} else {
assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
- assert(new_virtual_grf[reg] == (int)inst->src[i].nr);
+ assert(new_virtual_grf[reg] == inst->src[i].nr);
}
}
}
@@ -2507,8 +2492,14 @@ fs_visitor::lower_constant_loads()
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
const unsigned base = pull_index * 4;
- ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
+ fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
+ srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
+ srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
+ srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
+
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
+ srcs, PULL_UNIFORM_CONSTANT_SRCS);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
@@ -2626,6 +2617,7 @@ fs_visitor::opt_algebraic()
/* a * 1.0 = a */
if (inst->src[1].is_one()) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[1] = reg_undef;
progress = true;
break;
@@ -2634,6 +2626,7 @@ fs_visitor::opt_algebraic()
/* a * -1.0 = -a */
if (inst->src[1].is_negative_one()) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[0].negate = !inst->src[0].negate;
inst->src[1] = reg_undef;
progress = true;
@@ -2648,6 +2641,7 @@ fs_visitor::opt_algebraic()
if (brw_reg_type_is_integer(inst->src[1].type) &&
inst->src[1].is_zero()) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[1] = reg_undef;
progress = true;
break;
@@ -2656,6 +2650,7 @@ fs_visitor::opt_algebraic()
if (inst->src[0].file == IMM) {
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[0].f += inst->src[1].f;
inst->src[1] = reg_undef;
progress = true;
@@ -2671,9 +2666,11 @@ fs_visitor::opt_algebraic()
*/
if (inst->src[0].negate) {
inst->opcode = BRW_OPCODE_NOT;
+ inst->sources = 1;
inst->src[0].negate = false;
} else {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
}
inst->src[1] = reg_undef;
progress = true;
@@ -2720,6 +2717,7 @@ fs_visitor::opt_algebraic()
}
if (inst->src[0].equals(inst->src[1])) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[1] = reg_undef;
inst->predicate = BRW_PREDICATE_NONE;
inst->predicate_inverse = false;
@@ -2732,6 +2730,7 @@ fs_visitor::opt_algebraic()
case BRW_REGISTER_TYPE_F:
if (inst->src[1].f >= 1.0f) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[1] = reg_undef;
inst->conditional_mod = BRW_CONDITIONAL_NONE;
progress = true;
@@ -2747,6 +2746,7 @@ fs_visitor::opt_algebraic()
case BRW_REGISTER_TYPE_F:
if (inst->src[1].f <= 0.0f) {
inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
inst->src[1] = reg_undef;
inst->conditional_mod = BRW_CONDITIONAL_NONE;
progress = true;
@@ -2767,11 +2767,13 @@ fs_visitor::opt_algebraic()
break;
if (inst->src[1].is_one()) {
inst->opcode = BRW_OPCODE_ADD;
+ inst->sources = 2;
inst->src[1] = inst->src[2];
inst->src[2] = reg_undef;
progress = true;
} else if (inst->src[2].is_one()) {
inst->opcode = BRW_OPCODE_ADD;
+ inst->sources = 2;
inst->src[2] = reg_undef;
progress = true;
}
@@ -2850,9 +2852,9 @@ fs_visitor::opt_zero_samples()
/* Gfx4 infers the texturing opcode based on the message length so we can't
* change it. Gfx12.5 has restrictions on the number of coordinate
* parameters that have to be provided for some texture types
- * (Wa_14013363432).
+ * (Wa_14012688258).
*/
- if (devinfo->ver < 5 || devinfo->verx10 == 125)
+ if (devinfo->ver < 5 || intel_needs_workaround(devinfo, 14012688258))
return false;
bool progress = false;
@@ -3725,106 +3727,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds()
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
-/**
- * Turns the generic expression-style uniform pull constant load instruction
- * into a hardware-specific series of instructions for loading a pull
- * constant.
- *
- * The expression style allows the CSE pass before this to optimize out
- * repeated loads from the same offset, and gives the pre-register-allocation
- * scheduling full flexibility, while the conversion to native instructions
- * allows the post-register-allocation scheduler the best information
- * possible.
- *
- * Note that execution masking for setting up pull constant loads is special:
- * the channels that need to be written are unrelated to the current execution
- * mask, since a later instruction will use one of the result channels as a
- * source operand for all 8 or 16 of its channels.
- */
-void
-fs_visitor::lower_uniform_pull_constant_loads()
-{
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
- if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
- continue;
-
- const fs_reg& surface = inst->src[0];
- const fs_reg& offset_B = inst->src[1];
- assert(offset_B.file == IMM);
-
- if (devinfo->has_lsc) {
- const fs_builder ubld =
- fs_builder(this, block, inst).group(8, 0).exec_all();
-
- const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
- ubld.MOV(payload, offset_B);
-
- inst->sfid = GFX12_SFID_UGM;
- inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
- 1 /* simd_size */,
- LSC_ADDR_SURFTYPE_BTI,
- LSC_ADDR_SIZE_A32,
- 1 /* num_coordinates */,
- LSC_DATA_SIZE_D32,
- inst->size_written / 4,
- true /* transpose */,
- LSC_CACHE_LOAD_L1STATE_L3MOCS,
- true /* has_dest */);
-
- fs_reg ex_desc;
- if (surface.file == IMM) {
- ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
- } else {
- /* We only need the first component for the payload so we can use
- * one of the other components for the extended descriptor
- */
- ex_desc = component(payload, 1);
- ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
- }
-
- /* Update the original instruction. */
- inst->opcode = SHADER_OPCODE_SEND;
- inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
- inst->ex_mlen = 0;
- inst->header_size = 0;
- inst->send_has_side_effects = false;
- inst->send_is_volatile = true;
- inst->exec_size = 1;
-
- /* Finally, the payload */
- inst->resize_sources(3);
- inst->src[0] = brw_imm_ud(0); /* desc */
- inst->src[1] = ex_desc;
- inst->src[2] = payload;
-
- invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
- } else if (devinfo->ver >= 7) {
- const fs_builder ubld = fs_builder(this, block, inst).exec_all();
- const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
-
- ubld.group(8, 0).MOV(payload,
- retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
- ubld.group(1, 0).MOV(component(payload, 2),
- brw_imm_ud(offset_B.ud / 16));
-
- inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
- inst->src[1] = payload;
- inst->header_size = 1;
- inst->mlen = 1;
-
- invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
- } else {
- /* Before register allocation, we didn't tell the scheduler about the
- * MRF we use. We know it's safe to use this MRF because nothing
- * else does except for register spill/unspill, which generates and
- * uses its MRF within a single IR instruction.
- */
- inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
- inst->mlen = 1;
- }
- }
-}
-
bool
fs_visitor::lower_load_payload()
{
@@ -3933,15 +3835,147 @@ fs_visitor::lower_load_payload()
return progress;
}
+/**
+ * Factor an unsigned 32-bit integer.
+ *
+ * Attempts to factor \c x into two values that are at most 0xFFFF. If no
+ * such factorization is possible, either because the value is too large or is
+ * prime, both \c result_a and \c result_b will be zero.
+ */
+static void
+factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
+{
+ /* This is necessary to prevent various opportunities for division by zero
+ * below.
+ */
+ assert(x > 0xffff);
+
+ /* This represents the actual expected constraints on the input. Namely,
+ * both the upper and lower words should be > 1.
+ */
+ assert(x >= 0x00020002);
+
+ *result_a = 0;
+ *result_b = 0;
+
+ /* The value is too large to factor with the constraints. */
+ if (x > (0xffffu * 0xffffu))
+ return;
+
+ /* A non-prime number will have the form p*q*d where p is some prime
+ * number, q > 1, and 1 <= d <= q. To meet the constraints of this
+ * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
+ * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
+ * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
+ *
+ * The observation is finding the largest possible value of p reduces the
+ * possible range of d. After selecting p, all values of d in this range
+ * are tested until a factorization is found. The size of the range of
+ * possible values of d sets an upper bound on the run time of the
+ * function.
+ */
+ static const uint16_t primes[256] = {
+ 2, 3, 5, 7, 11, 13, 17, 19,
+ 23, 29, 31, 37, 41, 43, 47, 53,
+ 59, 61, 67, 71, 73, 79, 83, 89,
+ 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
+ 137, 139, 149, 151, 157, 163, 167, 173,
+ 179, 181, 191, 193, 197, 199, 211, 223,
+ 227, 229, 233, 239, 241, 251, 257, 263,
+ 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
+ 313, 317, 331, 337, 347, 349, 353, 359,
+ 367, 373, 379, 383, 389, 397, 401, 409,
+ 419, 421, 431, 433, 439, 443, 449, 457,
+ 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
+ 509, 521, 523, 541, 547, 557, 563, 569,
+ 571, 577, 587, 593, 599, 601, 607, 613,
+ 617, 619, 631, 641, 643, 647, 653, 659,
+ 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
+ 727, 733, 739, 743, 751, 757, 761, 769,
+ 773, 787, 797, 809, 811, 821, 823, 827,
+ 829, 839, 853, 857, 859, 863, 877, 881,
+ 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
+ 947, 953, 967, 971, 977, 983, 991, 997,
+ 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
+ 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
+ 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
+ 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
+ 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
+ 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+ 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
+ 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
+ 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
+ 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
+ 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
+ };
+
+ unsigned p;
+ unsigned x_div_p;
+
+ for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
+ p = primes[i];
+ x_div_p = x / p;
+
+ if ((x_div_p * p) == x)
+ break;
+ }
+
+ /* A prime factor was not found. */
+ if (x_div_p * p != x)
+ return;
+
+ /* Terminate early if d=1 is a solution. */
+ if (x_div_p < 0x10000) {
+ *result_a = x_div_p;
+ *result_b = p;
+ return;
+ }
+
+ /* Pick the maximum possible value for 'd'. It's important that the loop
+ * below execute while d <= max_d because max_d is a valid value. Having
+ * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
+ * incorrectly reported as not being factorable. The problem would occur
+ * with any value that is a factor of two primes in the table and one prime
+ * not in the table.
+ */
+ const unsigned max_d = 0xffff / p;
+
+ /* Pick an initial value of 'd' that (combined with rejecting too large
+ * values above) guarantees that 'q' will always be small enough.
+ * DIV_ROUND_UP is used to prevent 'd' from being zero.
+ */
+ for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
+ unsigned q = x_div_p / d;
+
+ if ((q * d) == x_div_p) {
+ assert(p * d * q == x);
+ assert((p * d) < 0x10000);
+
+ *result_a = q;
+ *result_b = p * d;
+ break;
+ }
+
+ /* Since every value of 'd' is tried, as soon as 'd' is larger
+ * than 'q', we're just re-testing combinations that have
+ * already been tested.
+ */
+ if (d > q)
+ break;
+ }
+}
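
A usage sketch for the helper (illustration only, not part of the commit),
reusing the value from the max_d comment above (0x063b0c83 == 1627 * 1367 * 47):

   unsigned a, b;
   factor_uint32(0x063b0c83, &a, &b);
   if (a != 0) {                                /* a factorization was found */
      assert(a * b == 0x063b0c83u);             /* e.g. 1627 * 64249 */
      assert(a <= 0xffff && b <= 0xffff);
      /* lower_mul_dword_inst() can then emit MUL low, src0, a followed by
       * MUL low, low, b instead of two MULs plus an ADD. */
   }
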
+
void
fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
{
const fs_builder ibld(this, block, inst);
- const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
+ /* It is correct to use inst->src[1].d at both ends of the comparison.
+ * Using .ud in the UINT16_MAX comparison would cause any negative value to
+ * fail the check.
+ */
if (inst->src[1].file == IMM &&
- (( ud && inst->src[1].ud <= UINT16_MAX) ||
- (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) {
+ (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
/* The MUL instruction isn't commutative. On Gen <= 6, only the low
* 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
* src1 are used.
@@ -3949,6 +3983,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
* If multiplying by an immediate value that fits in 16-bits, do a
* single MUL instruction with that value in the proper location.
*/
+ const bool ud = (inst->src[1].d >= 0);
if (devinfo->ver < 7) {
fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
ibld.MOV(imm, inst->src[1]);
@@ -4029,6 +4064,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
high.stride = inst->dst.stride;
high.offset = inst->dst.offset % REG_SIZE;
+ bool do_addition = true;
if (devinfo->ver >= 7) {
/* From Wa_1604601757:
*
@@ -4047,10 +4083,37 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
lower_src_modifiers(this, block, inst, 1);
if (inst->src[1].file == IMM) {
- ibld.MUL(low, inst->src[0],
- brw_imm_uw(inst->src[1].ud & 0xffff));
- ibld.MUL(high, inst->src[0],
- brw_imm_uw(inst->src[1].ud >> 16));
+ unsigned a;
+ unsigned b;
+
+ /* If the immediate value can be factored into two values, A and
+ * B, that each fit in 16-bits, the multiplication result can
+ * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B).
+ * This saves an operation (the addition) and a temporary register
+ * (high).
+ *
+ * Skip the optimization if either the high word or the low word
+ * is 0 or 1. In these conditions, at least one of the
+ * multiplications generated by the straightforward method will be
+ * eliminated anyway.
+ */
+ if (inst->src[1].ud > 0x0001ffff &&
+ (inst->src[1].ud & 0xffff) > 1) {
+ factor_uint32(inst->src[1].ud, &a, &b);
+
+ if (a != 0) {
+ ibld.MUL(low, inst->src[0], brw_imm_uw(a));
+ ibld.MUL(low, low, brw_imm_uw(b));
+ do_addition = false;
+ }
+ }
+
+ if (do_addition) {
+ ibld.MUL(low, inst->src[0],
+ brw_imm_uw(inst->src[1].ud & 0xffff));
+ ibld.MUL(high, inst->src[0],
+ brw_imm_uw(inst->src[1].ud >> 16));
+ }
} else {
ibld.MUL(low, inst->src[0],
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
@@ -4067,9 +4130,11 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
inst->src[1]);
}
- ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
- subscript(low, BRW_REGISTER_TYPE_UW, 1),
- subscript(high, BRW_REGISTER_TYPE_UW, 0));
+ if (do_addition) {
+ ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
+ subscript(low, BRW_REGISTER_TYPE_UW, 1),
+ subscript(high, BRW_REGISTER_TYPE_UW, 0));
+ }
if (needs_mov || inst->conditional_mod)
set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
@@ -5052,7 +5117,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler,
return 8;
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
- case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
@@ -5074,11 +5138,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler,
return inst->exec_size;
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
- case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
case SHADER_OPCODE_URB_READ_LOGICAL:
@@ -6197,7 +6256,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
return false;
}
-/* Wa_14017989577
+/* Wa_14015360517
*
* The first instruction of any kernel should have non-zero emask.
* Make sure this happens by introducing a dummy mov instruction.
@@ -6205,7 +6264,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
void
fs_visitor::emit_dummy_mov_instruction()
{
- if (devinfo->verx10 < 120)
+ if (!intel_needs_workaround(devinfo, 14015360517))
return;
struct backend_instruction *first_inst =
@@ -6240,7 +6299,7 @@ fs_visitor::emit_dummy_memory_fence_before_eot()
bool progress = false;
bool has_ugm_write_or_atomic = false;
- if (!intel_device_info_is_dg2(devinfo))
+ if (!intel_needs_workaround(devinfo, 22013689345))
return;
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
@@ -6388,15 +6447,18 @@ fs_visitor::fixup_nomask_control_flow()
*/
const bool save_flag = flag_liveout &
flag_mask(flag, dispatch_width / 8);
- const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
+ const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);
- if (save_flag)
+ if (save_flag) {
+ ubld.group(8, 0).UNDEF(tmp);
ubld.group(1, 0).MOV(tmp, flag);
+ }
ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
set_predicate(pred, inst);
inst->flag_subreg = 0;
+ inst->predicate_trivial = true;
if (save_flag)
ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
@@ -6419,6 +6481,18 @@ fs_visitor::fixup_nomask_control_flow()
return progress;
}
+uint32_t
+fs_visitor::compute_max_register_pressure()
+{
+ const register_pressure &rp = regpressure_analysis.require();
+ uint32_t ip = 0, max_pressure = 0;
+ foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+ max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
+ ip++;
+ }
+ return max_pressure;
+}
+
void
fs_visitor::allocate_registers(bool allow_spilling)
{
@@ -6438,6 +6512,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
"lifo"
};
+ compact_virtual_grfs();
+
+ if (needs_register_pressure)
+ shader_stats.max_register_pressure = compute_max_register_pressure();
+
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
/* Before we schedule anything, stash off the instruction order as an array
@@ -6587,7 +6666,7 @@ fs_visitor::run_vs()
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(true /* allow_spilling */);
@@ -6713,7 +6792,7 @@ fs_visitor::run_tcs()
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(true /* allow_spilling */);
@@ -6745,7 +6824,7 @@ fs_visitor::run_tes()
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(true /* allow_spilling */);
@@ -6793,7 +6872,7 @@ fs_visitor::run_gs()
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(true /* allow_spilling */);
@@ -6896,7 +6975,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(allow_spilling);
@@ -6936,7 +7015,7 @@ fs_visitor::run_cs(bool allow_spilling)
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(allow_spilling);
@@ -6968,7 +7047,7 @@ fs_visitor::run_bs(bool allow_spilling)
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(allow_spilling);
@@ -7001,7 +7080,7 @@ fs_visitor::run_task(bool allow_spilling)
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(allow_spilling);
@@ -7034,7 +7113,7 @@ fs_visitor::run_mesh(bool allow_spilling)
fixup_3src_null_dest();
emit_dummy_memory_fence_before_eot();
- /* Wa_14017989577 */
+ /* Wa_14015360517 */
emit_dummy_mov_instruction();
allocate_registers(allow_spilling);
@@ -7045,7 +7124,10 @@ fs_visitor::run_mesh(bool allow_spilling)
static bool
is_used_in_not_interp_frag_coord(nir_ssa_def *def)
{
- nir_foreach_use(src, def) {
+ nir_foreach_use_including_if(src, def) {
+ if (src->is_if)
+ return true;
+
if (src->parent_instr->type != nir_instr_type_intrinsic)
return true;
@@ -7054,9 +7136,6 @@ is_used_in_not_interp_frag_coord(nir_ssa_def *def)
return true;
}
- nir_foreach_if_use(src, def)
- return true;
-
return false;
}
@@ -7088,6 +7167,8 @@ brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
+ case nir_intrinsic_load_barycentric_at_sample:
+ case nir_intrinsic_load_barycentric_at_offset:
break;
default:
continue;
@@ -7182,11 +7263,12 @@ brw_nir_move_interpolation_to_top(nir_shader *nir)
continue;
nir_block *top = nir_start_block(f->impl);
- exec_node *cursor_node = NULL;
+ nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
+ bool impl_progress = false;
- nir_foreach_block(block, f->impl) {
- if (block == top)
- continue;
+ for (nir_block *block = nir_block_cf_tree_next(top);
+ block != NULL;
+ block = nir_block_cf_tree_next(block)) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
@@ -7212,28 +7294,25 @@ brw_nir_move_interpolation_to_top(nir_shader *nir)
for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
if (move[i]->block != top) {
- move[i]->block = top;
- exec_node_remove(&move[i]->node);
- if (cursor_node) {
- exec_node_insert_after(cursor_node, &move[i]->node);
- } else {
- exec_list_push_head(&top->instr_list, &move[i]->node);
- }
- cursor_node = &move[i]->node;
- progress = true;
+ nir_instr_move(cursor, move[i]);
+ impl_progress = true;
}
}
}
}
- nir_metadata_preserve(f->impl, nir_metadata_block_index |
- nir_metadata_dominance);
+
+ progress = progress || impl_progress;
+
+ nir_metadata_preserve(f->impl, impl_progress ? (nir_metadata_block_index |
+ nir_metadata_dominance)
+ : nir_metadata_all);
}
return progress;
}
static void
-brw_nir_populate_wm_prog_data(const nir_shader *shader,
+brw_nir_populate_wm_prog_data(nir_shader *shader,
const struct intel_device_info *devinfo,
const struct brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
@@ -7252,10 +7331,28 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
- prog_data->persample_dispatch =
- key->multisample_fbo &&
- (key->persample_interp ||
- shader->info.fs.uses_sample_shading);
+ prog_data->sample_shading =
+ shader->info.fs.uses_sample_shading ||
+ shader->info.outputs_read;
+
+ assert(key->multisample_fbo != BRW_NEVER ||
+ key->persample_interp == BRW_NEVER);
+
+ prog_data->persample_dispatch = key->persample_interp;
+ if (prog_data->sample_shading)
+ prog_data->persample_dispatch = BRW_ALWAYS;
+
+ /* We can only persample dispatch if we have a multisample FBO */
+ prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
+ key->multisample_fbo);
+
+ /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
+ * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
+ * to definitively tell whether alpha_to_coverage is on or off.
+ */
+ prog_data->alpha_to_coverage = key->alpha_to_coverage;
+ assert(prog_data->alpha_to_coverage != BRW_SOMETIMES ||
+ prog_data->persample_dispatch == BRW_SOMETIMES);
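
A small worked example of the tri-state arithmetic (sketch, assuming the
enum ordering BRW_NEVER < BRW_SOMETIMES < BRW_ALWAYS):

   /* A shader reading gl_SampleID sets sample_shading, so persample_dispatch
    * starts as BRW_ALWAYS.  If the FBO's sample count is only known at run
    * time (key->multisample_fbo == BRW_SOMETIMES), then
    *    persample_dispatch = MIN2(BRW_ALWAYS, BRW_SOMETIMES) = BRW_SOMETIMES
    * and the setup helpers earlier in the file emit the dynamic
    * check_dynamic_msaa_flag() predication instead of hard-coding the
    * per-sample path.
    */
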
if (devinfo->ver >= 6) {
prog_data->uses_sample_mask =
@@ -7270,11 +7367,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
* per-sample dispatch. If we need gl_SamplePosition and we don't have
* persample dispatch, we hard-code it to 0.5.
*/
- prog_data->uses_pos_offset = prog_data->persample_dispatch &&
- (BITSET_TEST(shader->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_POS) ||
- BITSET_TEST(shader->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
+ prog_data->read_pos_offset_input =
+ BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS) ||
+ BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS_OR_CENTER);
+
+ if (prog_data->read_pos_offset_input)
+ prog_data->uses_pos_offset = prog_data->persample_dispatch;
+ else
+ prog_data->uses_pos_offset = BRW_NEVER;
}
prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
@@ -7285,19 +7387,72 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(devinfo, shader);
+
+ /* From the BDW PRM documentation for 3DSTATE_WM:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+ * Sample or Non- perspective Sample barycentric coordinates."
+ *
+ * So cleanup any potentially set sample barycentric mode when not in per
+ * sample dispatch.
+ */
+ if (prog_data->persample_dispatch == BRW_NEVER) {
+ prog_data->barycentric_interp_modes &=
+ ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+ }
+
prog_data->uses_nonperspective_interp_modes |=
(prog_data->barycentric_interp_modes &
BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
- /* You can't be coarse and per-sample */
- assert(!key->coarse_pixel || !key->persample_interp);
- prog_data->per_coarse_pixel_dispatch =
- key->coarse_pixel &&
- !shader->info.fs.uses_sample_shading &&
- !prog_data->uses_omask &&
- !prog_data->uses_sample_mask &&
- (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
- !prog_data->computed_stencil;
+ /* The current VK_EXT_graphics_pipeline_library specification requires
+ * coarse to specified at compile time. But per sample interpolation can be
+ * dynamic. So we should never be in a situation where coarse &
+ * persample_interp are both respectively true & BRW_ALWAYS.
+ *
+ * Coarse will dynamically turned off when persample_interp is active.
+ */
+ assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
+
+ prog_data->coarse_pixel_dispatch =
+ brw_sometimes_invert(prog_data->persample_dispatch);
+ if (!key->coarse_pixel ||
+ prog_data->uses_omask ||
+ prog_data->sample_shading ||
+ prog_data->uses_sample_mask ||
+ (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+ prog_data->computed_stencil) {
+ prog_data->coarse_pixel_dispatch = BRW_NEVER;
+ }
+
+ /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
+ * Message Descriptor :
+ *
+ * "Message Type. Specifies the type of message being sent when
+ * pixel-rate evaluation is requested :
+ *
+ * Format = U2
+ * 0: Per Message Offset (eval_snapped with immediate offset)
+ * 1: Sample Position Offset (eval_sindex)
+ * 2: Centroid Position Offset (eval_centroid)
+ * 3: Per Slot Offset (eval_snapped with register offset)
+ *
+ * Message Type. Specifies the type of message being sent when
+ * coarse-rate evaluation is requested :
+ *
+ * Format = U2
+ * 0: Coarse to Pixel Mapping Message (internal message)
+ * 1: Reserved
+ * 2: Coarse Centroid Position (eval_centroid)
+ * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
+ *
+ * The Sample Position Offset is marked as reserved for coarse rate
+ * evaluation and leads to hangs if we try to use it. So disable coarse
+ * pixel shading if we have any intrinsic that will result in a pixel
+ * interpolater message at sample.
+ */
+ if (brw_nir_pulls_at_sample(shader))
+ prog_data->coarse_pixel_dispatch = BRW_NEVER;
/* We choose to always enable VMask prior to XeHP, as it would cause
* us to lose out on the eliminate_find_live_channel() optimization.
@@ -7305,16 +7460,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
prog_data->uses_vmask = devinfo->verx10 < 125 ||
shader->info.fs.needs_quad_helper_invocations ||
shader->info.fs.needs_all_helper_invocations ||
- prog_data->per_coarse_pixel_dispatch;
+ prog_data->coarse_pixel_dispatch != BRW_NEVER;
prog_data->uses_src_w =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
prog_data->uses_src_depth =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
- !prog_data->per_coarse_pixel_dispatch;
+ prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
prog_data->uses_depth_w_coefficients =
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
- prog_data->per_coarse_pixel_dispatch;
+ prog_data->coarse_pixel_dispatch != BRW_NEVER;
calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
brw_compute_flat_inputs(prog_data, shader);
@@ -7361,35 +7516,35 @@ brw_compile_fs(const struct brw_compiler *compiler,
* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
* hardware, regardless of the state setting for this feature."
*/
- if (devinfo->ver > 6 && key->alpha_to_coverage) {
+ if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) {
/* Run constant fold optimization in order to get the correct source
* offset to determine render target 0 store instruction in
* emit_alpha_to_coverage pass.
*/
- NIR_PASS_V(nir, nir_opt_constant_folding);
- NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage);
+ NIR_PASS(_, nir, nir_opt_constant_folding);
+ NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
}
- NIR_PASS_V(nir, brw_nir_move_interpolation_to_top);
+ NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
brw_postprocess_nir(nir, compiler, true, debug_enabled,
key->base.robust_buffer_access);
brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
params->mue_map);
- fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
+ std::unique_ptr<fs_visitor> v8, v16, v32;
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
float throughput = 0;
bool has_spilled = false;
- v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base, nir, 8,
- debug_enabled);
+ v8 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base,
+ &prog_data->base, nir, 8,
+ params->stats != NULL,
+ debug_enabled);
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
- delete v8;
return NULL;
- } else if (!INTEL_DEBUG(DEBUG_NO8)) {
+ } else if (INTEL_SIMD(FS, 8)) {
simd8_cfg = v8->cfg;
prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs;
prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
@@ -7403,7 +7558,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
* See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
*/
if (devinfo->ver == 8 && prog_data->dual_src_blend &&
- !INTEL_DEBUG(DEBUG_NO8)) {
+ INTEL_SIMD(FS, 8)) {
assert(!params->use_rep_send);
v8->limit_dispatch_width(8, "gfx8 workaround: "
"using SIMD8 when dual src blending.\n");
@@ -7423,12 +7578,13 @@ brw_compile_fs(const struct brw_compiler *compiler,
if (!has_spilled &&
v8->max_dispatch_width >= 16 &&
- (!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) {
+ (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
/* Try a SIMD16 compile */
- v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base, nir, 16,
- debug_enabled);
- v16->import_uniforms(v8);
+ v16 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base,
+ &prog_data->base, nir, 16,
+ params->stats != NULL,
+ debug_enabled);
+ v16->import_uniforms(v8.get());
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
brw_shader_perf_log(compiler, params->log_data,
"SIMD16 shader failed to compile: %s\n",
@@ -7450,12 +7606,13 @@ brw_compile_fs(const struct brw_compiler *compiler,
if (!has_spilled &&
v8->max_dispatch_width >= 32 && !params->use_rep_send &&
devinfo->ver >= 6 && !simd16_failed &&
- !INTEL_DEBUG(DEBUG_NO32)) {
+ INTEL_SIMD(FS, 32)) {
/* Try a SIMD32 compile */
- v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base, nir, 32,
- debug_enabled);
- v32->import_uniforms(v8);
+ v32 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base,
+ &prog_data->base, nir, 32,
+ params->stats != NULL,
+ debug_enabled);
+ v32->import_uniforms(v8.get());
if (!v32->run_fs(allow_spilling, false)) {
brw_shader_perf_log(compiler, params->log_data,
"SIMD32 shader failed to compile: %s\n",
@@ -7520,12 +7677,14 @@ brw_compile_fs(const struct brw_compiler *compiler,
}
struct brw_compile_stats *stats = params->stats;
+ uint32_t max_dispatch_width = 0;
if (simd8_cfg) {
prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8, v8->shader_stats,
v8->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
+ max_dispatch_width = 8;
}
if (simd16_cfg) {
@@ -7534,6 +7693,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
simd16_cfg, 16, v16->shader_stats,
v16->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
+ max_dispatch_width = 16;
}
if (simd32_cfg) {
@@ -7542,14 +7702,13 @@ brw_compile_fs(const struct brw_compiler *compiler,
simd32_cfg, 32, v32->shader_stats,
v32->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
+ max_dispatch_width = 32;
}
- g.add_const_data(nir->constant_data, nir->constant_data_size);
-
- delete v8;
- delete v16;
- delete v32;
+ for (struct brw_compile_stats *s = params->stats; s != NULL && s != stats; s++)
+ s->max_dispatch_width = max_dispatch_width;
+ g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}
@@ -7707,15 +7866,17 @@ brw_compile_cs(const struct brw_compiler *compiler,
prog_data->local_size[2] = nir->info.workgroup_size[2];
}
- const unsigned required_dispatch_width =
- brw_required_dispatch_width(&nir->info);
+ brw_simd_selection_state simd_state{
+ .mem_ctx = mem_ctx,
+ .devinfo = compiler->devinfo,
+ .prog_data = prog_data,
+ .required_width = brw_required_dispatch_width(&nir->info),
+ };
- fs_visitor *v[3] = {0};
- const char *error[3] = {0};
+ std::unique_ptr<fs_visitor> v[3];
for (unsigned simd = 0; simd < 3; simd++) {
- if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, prog_data,
- required_dispatch_width, &error[simd]))
+ if (!brw_simd_should_compile(simd_state, simd))
continue;
const unsigned dispatch_width = 8u << simd;
@@ -7733,24 +7894,23 @@ brw_compile_cs(const struct brw_compiler *compiler,
brw_postprocess_nir(shader, compiler, true, debug_enabled,
key->base.robust_buffer_access);
- v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base, shader, dispatch_width,
- debug_enabled);
+ v[simd] = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base,
+ &prog_data->base, shader, dispatch_width,
+ params->stats != NULL,
+ debug_enabled);
- if (prog_data->prog_mask) {
- unsigned first = ffs(prog_data->prog_mask) - 1;
- v[simd]->import_uniforms(v[first]);
- }
+ const int first = brw_simd_first_compiled(simd_state);
+ if (first >= 0)
+ v[simd]->import_uniforms(v[first].get());
- const bool allow_spilling = !prog_data->prog_mask ||
- nir->info.workgroup_size_variable;
+ const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
if (v[simd]->run_cs(allow_spilling)) {
cs_fill_push_const_info(compiler->devinfo, prog_data);
- brw_simd_mark_compiled(simd, prog_data, v[simd]->spilled_any_registers);
+ brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
} else {
- error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
+ simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
if (simd > 0) {
brw_shader_perf_log(compiler, params->log_data,
"SIMD%u shader failed to compile: %s\n",
@@ -7759,21 +7919,20 @@ brw_compile_cs(const struct brw_compiler *compiler,
}
}
- const int selected_simd = brw_simd_select(prog_data);
+ const int selected_simd = brw_simd_select(simd_state);
if (selected_simd < 0) {
params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
- error[0], error[1], error[2]);;
+ simd_state.error[0], simd_state.error[1],
+ simd_state.error[2]);
return NULL;
}
assert(selected_simd < 3);
- fs_visitor *selected = v[selected_simd];
+ fs_visitor *selected = v[selected_simd].get();
if (!nir->info.workgroup_size_variable)
prog_data->prog_mask = 1 << selected_simd;
- const unsigned *ret = NULL;
-
fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
if (unlikely(debug_enabled)) {
@@ -7784,6 +7943,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
g.enable_debug(name);
}
+ uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
+
struct brw_compile_stats *stats = params->stats;
for (unsigned simd = 0; simd < 3; simd++) {
if (prog_data->prog_mask & (1u << simd)) {
@@ -7791,19 +7952,16 @@ brw_compile_cs(const struct brw_compiler *compiler,
prog_data->prog_offset[simd] =
g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
v[simd]->performance_analysis.require(), stats);
+ if (stats)
+ stats->max_dispatch_width = max_dispatch_width;
stats = stats ? stats + 1 : NULL;
+ max_dispatch_width = 8u << simd;
}
}
g.add_const_data(nir->constant_data, nir->constant_data_size);
- ret = g.get_assembly();
-
- delete v[0];
- delete v[1];
- delete v[2];
-
- return ret;
+ return g.get_assembly();
}
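
The compute compile loop above (and the ray-tracing one below) now shares the
brw_simd_selection_state protocol; a condensed sketch using only the calls
that appear in this patch (not a verbatim excerpt):

   /* for (unsigned simd = 0; simd < 3; simd++) {        // SIMD8/16/32
    *    if (!brw_simd_should_compile(simd_state, simd))
    *       continue;
    *    ...compile at dispatch width (8u << simd)...
    *    if (success)
    *       brw_simd_mark_compiled(simd_state, simd, spilled_any_registers);
    *    else
    *       simd_state.error[simd] = ralloc_strdup(mem_ctx, fail_msg);
    * }
    * const int selected = brw_simd_select(simd_state);   // < 0 if none compiled
    */
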
struct brw_cs_dispatch_info
@@ -7817,9 +7975,7 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
override_local_size ? override_local_size :
prog_data->local_size;
- const int simd =
- override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
- brw_simd_select(prog_data);
+ const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
assert(simd >= 0 && simd < 3);
info.group_size = sizes[0] * sizes[1] * sizes[2];
@@ -7857,81 +8013,65 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data,
brw_postprocess_nir(shader, compiler, true, debug_enabled,
key->base.robust_buffer_access);
- fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;
- bool has_spilled = false;
+ brw_simd_selection_state simd_state{
+ .mem_ctx = mem_ctx,
+ .devinfo = compiler->devinfo,
+ .prog_data = prog_data,
- uint8_t simd_size = 0;
- if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING ||
- shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_8) &&
- !INTEL_DEBUG(DEBUG_NO8)) {
- v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
- &prog_data->base, shader,
- 8, debug_enabled);
- const bool allow_spilling = true;
- if (!v8->run_bs(allow_spilling)) {
- if (error_str)
- *error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
- delete v8;
- return 0;
- } else {
- v = v8;
- simd_size = 8;
- if (v8->spilled_any_registers)
- has_spilled = true;
- }
- }
+ /* Since divergence is a lot more likely in RT than compute, it makes
+ * sense to limit ourselves to SIMD8 for now.
+ */
+ .required_width = 8,
+ };
- if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING ||
- shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_16) &&
- !has_spilled && !INTEL_DEBUG(DEBUG_NO16)) {
- v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
- &prog_data->base, shader,
- 16, debug_enabled);
- const bool allow_spilling = (v == NULL);
- if (!v16->run_bs(allow_spilling)) {
- brw_shader_perf_log(compiler, log_data,
- "SIMD16 shader failed to compile: %s\n",
- v16->fail_msg);
- if (v == NULL) {
- assert(v8 == NULL);
- if (error_str) {
- *error_str = ralloc_asprintf(
- mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s",
- v16->fail_msg);
- }
- delete v16;
- return 0;
- }
+ std::unique_ptr<fs_visitor> v[2];
+
+ for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) {
+ if (!brw_simd_should_compile(simd_state, simd))
+ continue;
+
+ const unsigned dispatch_width = 8u << simd;
+
+ v[simd] = std::make_unique<fs_visitor>(compiler, log_data, mem_ctx, &key->base,
+ &prog_data->base, shader,
+ dispatch_width,
+ stats != NULL,
+ debug_enabled);
+
+ const bool allow_spilling = !brw_simd_any_compiled(simd_state);
+ if (v[simd]->run_bs(allow_spilling)) {
+ brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
} else {
- v = v16;
- simd_size = 16;
- if (v16->spilled_any_registers)
- has_spilled = true;
+ simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
+ if (simd > 0) {
+ brw_shader_perf_log(compiler, log_data,
+ "SIMD%u shader failed to compile: %s",
+ dispatch_width, v[simd]->fail_msg);
+ }
}
}
- if (unlikely(v == NULL)) {
- assert(INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16));
- if (error_str) {
- *error_str = ralloc_strdup(mem_ctx,
- "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
- }
- return false;
+ const int selected_simd = brw_simd_select(simd_state);
+ if (selected_simd < 0) {
+ *error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s and %s.",
+ simd_state.error[0], simd_state.error[1]);
+ return 0;
}
- assert(v);
+ assert(selected_simd < int(ARRAY_SIZE(v)));
+ fs_visitor *selected = v[selected_simd].get();
+ assert(selected);
+
+ const unsigned dispatch_width = selected->dispatch_width;
- int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,
- v->performance_analysis.require(), stats);
+ int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats,
+ selected->performance_analysis.require(), stats);
if (prog_offset)
*prog_offset = offset;
else
assert(offset == 0);
- delete v8;
- delete v16;
-
- return simd_size;
+ return dispatch_width;
}
uint64_t