path: root/lib/mesa/src/intel/compiler/brw_fs.cpp
author    Jonathan Gray <jsg@cvs.openbsd.org>    2022-09-02 05:47:02 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>    2022-09-02 05:47:02 +0000
commit    0dbbf1e0708df85a357d70e2708c0a11aeb5480e (patch)
tree      6656ff8eb8b15a2fc1c02888973caf618388cfd0 /lib/mesa/src/intel/compiler/brw_fs.cpp
parent    5f66494d31f735486b8222ecfa0a0c9046e92543 (diff)
Merge Mesa 22.1.7
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r--   lib/mesa/src/intel/compiler/brw_fs.cpp   1876
1 file changed, 912 insertions, 964 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp
index bb6e1e338..3172a79fe 100644
--- a/lib/mesa/src/intel/compiler/brw_fs.cpp
+++ b/lib/mesa/src/intel/compiler/brw_fs.cpp
@@ -36,6 +36,7 @@
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
+#include "brw_private.h"
#include "dev/intel_debug.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
@@ -220,7 +221,6 @@ fs_inst::is_send_from_grf() const
{
switch (opcode) {
case SHADER_OPCODE_SEND:
- case SHADER_OPCODE_SHADER_TIME_ADD:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@@ -309,7 +309,6 @@ fs_inst::is_payload(unsigned arg) const
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
- case SHADER_OPCODE_SHADER_TIME_ADD:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case SHADER_OPCODE_INTERLOCK:
@@ -590,83 +589,6 @@ fs_visitor::get_timestamp(const fs_builder &bld)
}
void
-fs_visitor::emit_shader_time_begin()
-{
- /* We want only the low 32 bits of the timestamp. Since it's running
- * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
- * which is plenty of time for our purposes. It is identical across the
- * EUs, but since it's tracking GPU core speed it will increment at a
- * varying rate as render P-states change.
- */
- shader_start_time = component(
- get_timestamp(bld.annotate("shader time start")), 0);
-}
-
-void
-fs_visitor::emit_shader_time_end()
-{
- /* Insert our code just before the final SEND with EOT. */
- exec_node *end = this->instructions.get_tail();
- assert(end && ((fs_inst *) end)->eot);
- const fs_builder ibld = bld.annotate("shader time end")
- .exec_all().at(NULL, end);
- const fs_reg timestamp = get_timestamp(ibld);
-
- /* We only use the low 32 bits of the timestamp - see
- * emit_shader_time_begin()).
- *
- * We could also check if render P-states have changed (or anything
- * else that might disrupt timing) by setting smear to 2 and checking if
- * that field is != 0.
- */
- const fs_reg shader_end_time = component(timestamp, 0);
-
- /* Check that there weren't any timestamp reset events (assuming these
- * were the only two timestamp reads that happened).
- */
- const fs_reg reset = component(timestamp, 2);
- set_condmod(BRW_CONDITIONAL_Z,
- ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
- ibld.IF(BRW_PREDICATE_NORMAL);
-
- fs_reg start = shader_start_time;
- start.negate = true;
- const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
- BRW_REGISTER_TYPE_UD),
- 0);
- const fs_builder cbld = ibld.group(1, 0);
- cbld.group(1, 0).ADD(diff, start, shader_end_time);
-
- /* If there were no instructions between the two timestamp gets, the diff
- * is 2 cycles. Remove that overhead, so I can forget about that when
- * trying to determine the time taken for single instructions.
- */
- cbld.ADD(diff, diff, brw_imm_ud(-2u));
- SHADER_TIME_ADD(cbld, 0, diff);
- SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
- ibld.emit(BRW_OPCODE_ELSE);
- SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
- ibld.emit(BRW_OPCODE_ENDIF);
-}
-
-void
-fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
- int shader_time_subindex,
- fs_reg value)
-{
- int index = shader_time_index * 3 + shader_time_subindex;
- struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
-
- fs_reg payload;
- if (dispatch_width == 8)
- payload = vgrf(glsl_type::uvec2_type);
- else
- payload = vgrf(glsl_type::uint_type);
-
- bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
-}
-
-void
fs_visitor::vfail(const char *format, va_list va)
{
char *msg;
@@ -777,6 +699,7 @@ fs_inst::components_read(unsigned i) const
case FS_OPCODE_TXB_LOGICAL:
case SHADER_OPCODE_TXF_CMS_LOGICAL:
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
case SHADER_OPCODE_TXF_UMS_LOGICAL:
case SHADER_OPCODE_TXF_MCS_LOGICAL:
case SHADER_OPCODE_LOD_LOGICAL:
@@ -796,9 +719,14 @@ fs_inst::components_read(unsigned i) const
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
return 2;
/* MCS */
- else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
- return 2;
- else
+ else if (i == TEX_LOGICAL_SRC_MCS) {
+ if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+ return 2;
+ else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
+ return 4;
+ else
+ return 1;
+ } else
return 1;
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
@@ -1130,6 +1058,7 @@ fs_inst::flags_written(const intel_device_info *devinfo) const
opcode == FS_OPCODE_FB_WRITE) {
return flag_mask(this, 1);
} else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
+ opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
return flag_mask(this, 32);
} else {
@@ -1227,7 +1156,6 @@ void
fs_visitor::import_uniforms(fs_visitor *v)
{
this->push_constant_loc = v->push_constant_loc;
- this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->subgroup_id = v->subgroup_id;
for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
@@ -1301,17 +1229,17 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
}
-fs_reg *
+fs_reg
fs_visitor::emit_frontfacing_interpolation()
{
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
+ fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D);
if (devinfo->ver >= 12) {
fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
bld.ASR(tmp, g1, brw_imm_d(15));
- bld.NOT(*reg, tmp);
+ bld.NOT(ff, tmp);
} else if (devinfo->ver >= 6) {
/* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
* a boolean result from this (~0/true or 0/false).
@@ -1327,7 +1255,7 @@ fs_visitor::emit_frontfacing_interpolation()
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
g0.negate = true;
- bld.ASR(*reg, g0, brw_imm_d(15));
+ bld.ASR(ff, g0, brw_imm_d(15));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
* a boolean result from this (1/true or 0/false).
@@ -1342,45 +1270,32 @@ fs_visitor::emit_frontfacing_interpolation()
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
g1_6.negate = true;
- bld.ASR(*reg, g1_6, brw_imm_d(31));
+ bld.ASR(ff, g1_6, brw_imm_d(31));
}
- return reg;
+ return ff;
}
-void
-fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
+fs_reg
+fs_visitor::emit_samplepos_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
- assert(dst.type == BRW_REGISTER_TYPE_F);
+ assert(devinfo->ver >= 6);
- if (wm_prog_data->persample_dispatch) {
- /* Convert int_sample_pos to floating point */
- bld.MOV(dst, int_sample_pos);
- /* Scale to the range [0, 1] */
- bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
- }
- else {
+ const fs_builder abld = bld.annotate("compute sample position");
+ fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);
+
+ if (!wm_prog_data->persample_dispatch) {
/* From ARB_sample_shading specification:
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SamplePosition will always be
* (0.5, 0.5).
*/
- bld.MOV(dst, brw_imm_f(0.5f));
+ bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
+ bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
+ return pos;
}
-}
-
-fs_reg *
-fs_visitor::emit_samplepos_setup()
-{
- assert(devinfo->ver >= 6);
-
- const fs_builder abld = bld.annotate("compute sample position");
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
- fs_reg pos = *reg;
- fs_reg int_sample_x = vgrf(glsl_type::int_type);
- fs_reg int_sample_y = vgrf(glsl_type::int_type);
/* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
* mode will be enabled.
@@ -1396,17 +1311,20 @@ fs_visitor::emit_samplepos_setup()
const fs_reg sample_pos_reg =
fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
- /* Compute gl_SamplePosition.x */
- abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
- compute_sample_position(offset(pos, abld, 0), int_sample_x);
+ for (unsigned i = 0; i < 2; i++) {
+ fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D);
+ abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i));
+ /* Convert int_sample_pos to floating point */
+ fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F);
+ abld.MOV(tmp_f, tmp_d);
+ /* Scale to the range [0, 1] */
+ abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
+ }
- /* Compute gl_SamplePosition.y */
- abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
- compute_sample_position(offset(pos, abld, 1), int_sample_y);
- return reg;
+ return pos;
}
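
The loop above reads each packed sub-byte sample coordinate, converts it to float, and scales by 1/16 to land in [0, 1]. A minimal standalone sketch of the same arithmetic (plain C++ with made-up payload bytes, not the Mesa builder API):

#include <cstdint>
#include <cstdio>

int main()
{
   /* The payload encodes each coordinate in 1/16th-pixel units, so a
    * value of 8 is the pixel center (0.5).  Byte values are invented
    * here purely for illustration.
    */
   const uint8_t packed_xy[2] = { 8, 12 };
   float pos[2];
   for (unsigned i = 0; i < 2; i++)
      pos[i] = (float)packed_xy[i] * (1.0f / 16.0f); /* same MUL as above */
   printf("gl_SamplePosition = (%.3f, %.3f)\n", pos[0], pos[1]); /* (0.500, 0.750) */
   return 0;
}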
-fs_reg *
+fs_reg
fs_visitor::emit_sampleid_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
@@ -1414,14 +1332,14 @@ fs_visitor::emit_sampleid_setup()
assert(devinfo->ver >= 6);
const fs_builder abld = bld.annotate("compute sample id");
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
+ fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);
if (!key->multisample_fbo) {
/* As per GL_ARB_sample_shading specification:
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SampleID will always be zero."
*/
- abld.MOV(*reg, brw_imm_d(0));
+ abld.MOV(sample_id, brw_imm_d(0));
} else if (devinfo->ver >= 8) {
/* Sample ID comes in as 4-bit numbers in g1.0:
*
@@ -1461,7 +1379,7 @@ fs_visitor::emit_sampleid_setup()
brw_imm_v(0x44440000));
}
- abld.AND(*reg, tmp, brw_imm_w(0xf));
+ abld.AND(sample_id, tmp, brw_imm_w(0xf));
} else {
const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
@@ -1507,20 +1425,20 @@ fs_visitor::emit_sampleid_setup()
/* This special instruction takes care of setting vstride=1,
* width=4, hstride=0 of t2 during an ADD instruction.
*/
- abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+ abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
}
- return reg;
+ return sample_id;
}
-fs_reg *
+fs_reg
fs_visitor::emit_samplemaskin_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
assert(devinfo->ver >= 6);
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+ fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
/* The HW doesn't provide us with expected values. */
assert(!wm_prog_data->per_coarse_pixel_dispatch);
@@ -1542,28 +1460,27 @@ fs_visitor::emit_samplemaskin_setup()
const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
- nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+ nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
fs_reg one = vgrf(glsl_type::int_type);
fs_reg enabled_mask = vgrf(glsl_type::int_type);
abld.MOV(one, brw_imm_d(1));
abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
- abld.AND(*reg, enabled_mask, coverage_mask);
+ abld.AND(mask, enabled_mask, coverage_mask);
} else {
/* In per-pixel mode, the coverage mask is sufficient. */
- *reg = coverage_mask;
+ mask = coverage_mask;
}
- return reg;
+ return mask;
}
-fs_reg *
+fs_reg
fs_visitor::emit_shading_rate_setup()
{
assert(devinfo->ver >= 11);
const fs_builder abld = bld.annotate("compute fragment shading rate");
-
- fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD));
+ fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
struct brw_wm_prog_data *wm_prog_data =
brw_wm_prog_data(bld.shader->stage_prog_data);
@@ -1589,12 +1506,12 @@ fs_visitor::emit_shading_rate_setup()
abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
- abld.OR(*reg, int_rate_x, int_rate_y);
+ abld.OR(rate, int_rate_x, int_rate_y);
} else {
- abld.MOV(*reg, brw_imm_ud(0));
+ abld.MOV(rate, brw_imm_ud(0));
}
- return reg;
+ return rate;
}
fs_reg
@@ -1674,9 +1591,9 @@ fs_visitor::assign_curb_setup()
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
uint64_t used = 0;
+ bool is_compute = gl_shader_stage_is_compute(stage);
- if (stage == MESA_SHADER_COMPUTE &&
- brw_cs_prog_data(prog_data)->uses_inline_data) {
+ if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
/* With COMPUTE_WALKER, we can push up to one register worth of data via
* the inline data parameter in the COMPUTE_WALKER command itself.
*
@@ -1684,7 +1601,7 @@ fs_visitor::assign_curb_setup()
*/
assert(devinfo->verx10 >= 125);
assert(uniform_push_length <= 1);
- } else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) {
+ } else if (is_compute && devinfo->verx10 >= 125) {
fs_builder ubld = bld.exec_all().group(8, 0).at(
cfg->first_block(), cfg->first_block()->start());
@@ -1794,7 +1711,6 @@ fs_visitor::assign_curb_setup()
uint64_t want_zero = used & stage_prog_data->zero_push_reg;
if (want_zero) {
- assert(!compiler->compact_params);
fs_builder ubld = bld.exec_all().group(8, 0).at(
cfg->first_block(), cfg->first_block()->start());
@@ -1843,6 +1759,10 @@ fs_visitor::assign_curb_setup()
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
+ /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
+ * skip per-primitive attributes here.
+ */
+
/* Make sure uint8_t is sufficient */
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
uint8_t index = 0;
@@ -1858,16 +1778,76 @@ static void
calculate_urb_setup(const struct intel_device_info *devinfo,
const struct brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
- const nir_shader *nir)
+ const nir_shader *nir,
+ const struct brw_mue_map *mue_map)
{
memset(prog_data->urb_setup, -1,
sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
int urb_next = 0;
+
+ const uint64_t inputs_read =
+ nir->info.inputs_read & ~nir->info.per_primitive_inputs;
+
/* Figure out where each of the incoming setup attributes lands. */
- if (devinfo->ver >= 6) {
- if (util_bitcount64(nir->info.inputs_read &
- BRW_FS_VARYING_INPUT_MASK) <= 16) {
+ if (mue_map) {
+ /* Per-Primitive Attributes are laid out by Hardware before the regular
+ * attributes, so order them like this to make it easier later to map
+ * setup into real HW registers.
+ */
+ if (nir->info.per_primitive_inputs) {
+ for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
+ if (nir->info.per_primitive_inputs & BITFIELD64_BIT(i)) {
+ prog_data->urb_setup[i] = urb_next++;
+ }
+ }
+
+ /* The actual setup attributes later must be aligned to a full GRF. */
+ urb_next = ALIGN(urb_next, 2);
+
+ prog_data->num_per_primitive_inputs = urb_next;
+ }
+
+ const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
+ VARYING_BIT_CLIP_DIST1;
+
+ uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
+
+ if (inputs_read & clip_dist_bits) {
+ assert(mue_map->per_vertex_header_size_dw > 8);
+ unique_fs_attrs &= ~clip_dist_bits;
+ }
+
+ /* In Mesh, CLIP_DIST slots are always at the beginning, because
+ * they come from MUE Vertex Header, not Per-Vertex Attributes.
+ */
+ if (inputs_read & clip_dist_bits) {
+ prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
+ prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
+ }
+
+ /* Per-Vertex attributes are laid out in order. Because we always link
+ * Mesh and Fragment shaders, the slots written and read by each of them
+ * will match. */
+ for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
+ if (unique_fs_attrs & BITFIELD64_BIT(i))
+ prog_data->urb_setup[i] = urb_next++;
+ }
+ } else if (devinfo->ver >= 6) {
+ uint64_t vue_header_bits =
+ VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
+
+ uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
+
+ /* VUE header fields all live in the same URB slot, so we pass them
+ * as a single FS input attribute. We want to only count them once.
+ */
+ if (inputs_read & vue_header_bits) {
+ unique_fs_attrs &= ~vue_header_bits;
+ unique_fs_attrs |= VARYING_BIT_PSIZ;
+ }
+
+ if (util_bitcount64(unique_fs_attrs) <= 16) {
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
* first 16 varying inputs, so we can put them wherever we want.
* Just put them in order.
@@ -1876,9 +1856,22 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
* fragment shader won't take up valuable register space, and (b) we
* won't have to recompile the fragment shader if it gets paired with
* a different vertex (or geometry) shader.
+ *
+ * VUE header fields share the same FS input attribute.
*/
+ if (inputs_read & vue_header_bits) {
+ if (inputs_read & VARYING_BIT_PSIZ)
+ prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
+ if (inputs_read & VARYING_BIT_LAYER)
+ prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
+ if (inputs_read & VARYING_BIT_VIEWPORT)
+ prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
+
+ urb_next++;
+ }
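
To make the sharing concrete: a fragment shader reading only gl_Layer and gl_ViewportIndex gets both mapped to the same setup slot, because they arrive in the single VUE header attribute. A rough standalone illustration of that counting (plain C++ with invented bit values, not the real brw enums):

#include <cstdint>
#include <cstdio>

int main()
{
   /* Invented stand-ins for VARYING_BIT_PSIZ/LAYER/VIEWPORT. */
   const uint64_t BIT_PSIZ = 1ull << 0;
   const uint64_t BIT_LAYER = 1ull << 1;
   const uint64_t BIT_VIEWPORT = 1ull << 2;
   const uint64_t vue_header_bits = BIT_PSIZ | BIT_LAYER | BIT_VIEWPORT;

   uint64_t inputs_read = BIT_LAYER | BIT_VIEWPORT; /* hypothetical shader */
   int urb_next = 0;

   /* All VUE header varyings share one FS input attribute. */
   if (inputs_read & vue_header_bits) {
      printf("layer -> slot %d, viewport -> slot %d\n", urb_next, urb_next);
      urb_next++;
   }
   printf("next free slot: %d\n", urb_next); /* 1, not 2 */
   return 0;
}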
+
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
- if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
+ if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
BITFIELD64_BIT(i)) {
prog_data->urb_setup[i] = urb_next++;
}
@@ -1900,7 +1893,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
nir->info.separate_shader, 1);
int first_slot =
- brw_compute_first_urb_slot_required(nir->info.inputs_read,
+ brw_compute_first_urb_slot_required(inputs_read,
&prev_stage_vue_map);
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
@@ -1908,7 +1901,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
slot++) {
int varying = prev_stage_vue_map.slot_to_varying[slot];
if (varying != BRW_VARYING_SLOT_PAD &&
- (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
+ (inputs_read & BRW_FS_VARYING_INPUT_MASK &
BITFIELD64_BIT(varying))) {
prog_data->urb_setup[varying] = slot - first_slot;
}
@@ -1941,12 +1934,12 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
*
* See compile_sf_prog() for more info.
*/
- if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+ if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
}
- prog_data->num_varying_inputs = urb_next;
- prog_data->inputs = nir->info.inputs_read;
+ prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
+ prog_data->inputs = inputs_read;
brw_compute_urb_setup_index(prog_data);
}
@@ -1988,6 +1981,12 @@ fs_visitor::assign_urb_setup()
/* Each attribute is 4 setup channels, each of which is half a reg. */
this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
+
+ /* Unlike regular attributes, per-primitive attributes have all 4 channels
+ * in the same slot, so each GRF can store two slots.
+ */
+ assert(prog_data->num_per_primitive_inputs % 2 == 0);
+ this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
}
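
As a quick sanity check of the payload sizing above (counts invented for illustration): with 5 regular varyings and 4 per-primitive slots, the first non-payload GRF advances by 5 * 2 + 4 / 2 = 12 registers. A tiny standalone sketch of that accounting:

#include <cassert>
#include <cstdio>

int main()
{
   unsigned first_non_payload_grf = 0;          /* pretend the payload starts at 0 */
   const unsigned num_varying_inputs = 5;       /* hypothetical counts */
   const unsigned num_per_primitive_inputs = 4;

   /* Each regular attribute is 4 setup channels, each half a GRF. */
   first_non_payload_grf += num_varying_inputs * 2;

   /* Per-primitive attributes keep all 4 channels in one slot, so two
    * slots share a single GRF (hence the even-count assertion).
    */
   assert(num_per_primitive_inputs % 2 == 0);
   first_non_payload_grf += num_per_primitive_inputs / 2;

   printf("first non-payload GRF = %u\n", first_non_payload_grf); /* 12 */
   return 0;
}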
void
@@ -2095,22 +2094,17 @@ fs_visitor::assign_gs_urb_setup()
/**
* Split large virtual GRFs into separate components if we can.
*
- * This is mostly duplicated with what brw_fs_vector_splitting does,
- * but that's really conservative because it's afraid of doing
- * splitting that doesn't result in real progress after the rest of
- * the optimization phases, which would cause infinite looping in
- * optimization. We can do it once here, safely. This also has the
- * opportunity to split interpolated values, or maybe even uniforms,
- * which we don't have at the IR level.
- *
- * We want to split, because virtual GRFs are what we register
- * allocate and spill (due to contiguousness requirements for some
- * instructions), and they're what we naturally generate in the
- * codegen process, but most virtual GRFs don't actually need to be
- * contiguous sets of GRFs. If we split, we'll end up with reduced
- * live intervals and better dead code elimination and coalescing.
+ * This pass aggressively splits VGRFs into chunks as small as possible,
+ * down to single registers if it can. If no VGRFs can be split, we return
+ * false so this pass can safely be used inside an optimization loop. We
+ * want to split, because virtual GRFs are what we register allocate and
+ * spill (due to contiguousness requirements for some instructions), and
+ * they're what we naturally generate in the codegen process, but most
+ * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
+ * split, we'll end up with reduced live intervals and better dead code
+ * elimination and coalescing.
*/
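
Returning false when nothing splits is what makes the pass safe to call from a fixed-point optimization loop. A hedged, generic sketch of that usage pattern (a stand-in lambda below takes the place of the real pass; the actual loop in brw_fs.cpp may be structured differently):

#include <cstdio>

int main()
{
   /* Stand-in for a pass like split_virtual_grfs(): reports progress a
    * few times, then converges (purely illustrative).
    */
   int remaining_splits = 3;
   auto split_virtual_grfs = [&]() { return remaining_splits-- > 0; };

   bool progress;
   unsigned iterations = 0;
   do {
      progress = false;
      progress |= split_virtual_grfs();
      /* ... other progress-reporting passes would be OR'ed in here ... */
      iterations++;
   } while (progress);

   printf("converged after %u iterations\n", iterations); /* 4 */
   return 0;
}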
-void
+bool
fs_visitor::split_virtual_grfs()
{
/* Compact the register file so we eliminate dead vgrfs. This
@@ -2181,10 +2175,15 @@ fs_visitor::split_virtual_grfs()
}
}
+ /* Bitset of which registers have been split */
+ bool *vgrf_has_split = new bool[num_vars];
+ memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
+
int *new_virtual_grf = new int[reg_count];
int *new_reg_offset = new int[reg_count];
int reg = 0;
+ bool has_splits = false;
for (int i = 0; i < num_vars; i++) {
/* The first one should always be 0 as a quick sanity check. */
assert(split_points[reg] == false);
@@ -2200,6 +2199,8 @@ fs_visitor::split_virtual_grfs()
* new virtual GRF for the previous offset many registers
*/
if (split_points[reg]) {
+ has_splits = true;
+ vgrf_has_split[i] = true;
assert(offset <= MAX_VGRF_SIZE);
int grf = alloc.allocate(offset);
for (int k = reg - offset; k < reg; k++)
@@ -2219,42 +2220,74 @@ fs_visitor::split_virtual_grfs()
}
assert(reg == reg_count);
+ bool progress;
+ if (!has_splits) {
+ progress = false;
+ goto cleanup;
+ }
+
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
if (inst->opcode == SHADER_OPCODE_UNDEF) {
- const fs_builder ibld(this, block, inst);
- assert(inst->size_written % REG_SIZE == 0);
- unsigned reg_offset = 0;
- while (reg_offset < inst->size_written / REG_SIZE) {
- reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
- ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
- reg_offset += alloc.sizes[new_virtual_grf[reg]];
+ assert(inst->dst.file == VGRF);
+ if (vgrf_has_split[inst->dst.nr]) {
+ const fs_builder ibld(this, block, inst);
+ assert(inst->size_written % REG_SIZE == 0);
+ unsigned reg_offset = 0;
+ while (reg_offset < inst->size_written / REG_SIZE) {
+ reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
+ ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
+ reg_offset += alloc.sizes[new_virtual_grf[reg]];
+ }
+ inst->remove(block);
+ } else {
+ reg = vgrf_to_reg[inst->dst.nr];
+ assert(new_reg_offset[reg] == 0);
+ assert(new_virtual_grf[reg] == (int)inst->dst.nr);
}
- inst->remove(block);
continue;
}
if (inst->dst.file == VGRF) {
reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
- inst->dst.nr = new_virtual_grf[reg];
- inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
- inst->dst.offset % REG_SIZE;
- assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+ if (vgrf_has_split[inst->dst.nr]) {
+ inst->dst.nr = new_virtual_grf[reg];
+ inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
+ inst->dst.offset % REG_SIZE;
+ assert((unsigned)new_reg_offset[reg] <
+ alloc.sizes[new_virtual_grf[reg]]);
+ } else {
+ assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
+ assert(new_virtual_grf[reg] == (int)inst->dst.nr);
+ }
}
for (int i = 0; i < inst->sources; i++) {
- if (inst->src[i].file == VGRF) {
- reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+ if (inst->src[i].file != VGRF)
+ continue;
+
+ reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+ if (vgrf_has_split[inst->src[i].nr]) {
inst->src[i].nr = new_virtual_grf[reg];
inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
inst->src[i].offset % REG_SIZE;
- assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+ assert((unsigned)new_reg_offset[reg] <
+ alloc.sizes[new_virtual_grf[reg]]);
+ } else {
+ assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
+ assert(new_virtual_grf[reg] == (int)inst->src[i].nr);
}
}
}
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
+ progress = true;
+
+cleanup:
delete[] split_points;
+ delete[] vgrf_has_split;
delete[] new_virtual_grf;
delete[] new_reg_offset;
+
+ return progress;
}
/**
@@ -2351,109 +2384,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo,
}
/**
- * Struct for handling complex alignments.
- *
- * A complex alignment is stored as multiplier and an offset. A value is
- * considered to be aligned if it is {offset} larger than a multiple of {mul}.
- * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
- * following:
- *
- * N | cplx_align_apply({8, 2}, N)
- * ----+-----------------------------
- * 4 | 6
- * 6 | 6
- * 8 | 14
- * 10 | 14
- * 12 | 14
- * 14 | 14
- * 16 | 22
- */
-struct cplx_align {
- unsigned mul:4;
- unsigned offset:4;
-};
-
-#define CPLX_ALIGN_MAX_MUL 8
-
-static void
-cplx_align_assert_sane(struct cplx_align a)
-{
- assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
- assert(a.offset < a.mul);
-}
-
-/**
- * Combines two alignments to produce a least multiple of sorts.
- *
- * The returned alignment is the smallest (in terms of multiplier) such that
- * anything aligned to both a and b will be aligned to the new alignment.
- * This function will assert-fail if a and b are not compatible, i.e. if the
- * offset parameters are such that no common alignment is possible.
- */
-static struct cplx_align
-cplx_align_combine(struct cplx_align a, struct cplx_align b)
-{
- cplx_align_assert_sane(a);
- cplx_align_assert_sane(b);
-
- /* Assert that the alignments agree. */
- assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
-
- return a.mul > b.mul ? a : b;
-}
-
-/**
- * Apply a complex alignment
- *
- * This function will return the smallest number greater than or equal to
- * offset that is aligned to align.
- */
-static unsigned
-cplx_align_apply(struct cplx_align align, unsigned offset)
-{
- return ALIGN(offset - align.offset, align.mul) + align.offset;
-}
-
-#define UNIFORM_SLOT_SIZE 4
-
-struct uniform_slot_info {
- /** True if the given uniform slot is live */
- unsigned is_live:1;
-
- /** True if this slot and the next slot must remain contiguous */
- unsigned contiguous:1;
-
- struct cplx_align align;
-};
-
-static void
-mark_uniform_slots_read(struct uniform_slot_info *slots,
- unsigned num_slots, unsigned alignment)
-{
- assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
- assert(alignment <= CPLX_ALIGN_MAX_MUL);
-
- /* We can't align a slot to anything less than the slot size */
- alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
-
- struct cplx_align align = {alignment, 0};
- cplx_align_assert_sane(align);
-
- for (unsigned i = 0; i < num_slots; i++) {
- slots[i].is_live = true;
- if (i < num_slots - 1)
- slots[i].contiguous = true;
-
- align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
- if (slots[i].align.mul == 0) {
- slots[i].align = align;
- } else {
- slots[i].align = cplx_align_combine(slots[i].align, align);
- }
- }
-}
-
-/**
* Assign UNIFORM file registers to either push constants or pull constants.
*
* We allow a fragment shader to have more than the specified minimum
@@ -2466,197 +2396,12 @@ void
fs_visitor::assign_constant_locations()
{
/* Only the first compile gets to decide on locations. */
- if (push_constant_loc) {
- assert(pull_constant_loc);
+ if (push_constant_loc)
return;
- }
-
- if (compiler->compact_params) {
- struct uniform_slot_info slots[uniforms + 1];
- memset(slots, 0, sizeof(slots));
-
- foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
- for (int i = 0 ; i < inst->sources; i++) {
- if (inst->src[i].file != UNIFORM)
- continue;
- /* NIR tightly packs things so the uniform number might not be
- * aligned (if we have a double right after a float, for
- * instance). This is fine because the process of re-arranging
- * them will ensure that things are properly aligned. The offset
- * into that uniform, however, must be aligned.
- *
- * In Vulkan, we have explicit offsets but everything is crammed
- * into a single "variable" so inst->src[i].nr will always be 0.
- * Everything will be properly aligned relative to that one base.
- */
- assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
-
- unsigned u = inst->src[i].nr +
- inst->src[i].offset / UNIFORM_SLOT_SIZE;
-
- if (u >= uniforms)
- continue;
-
- unsigned slots_read;
- if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
- slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
- } else {
- unsigned bytes_read = inst->components_read(i) *
- type_sz(inst->src[i].type);
- slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
- }
-
- assert(u + slots_read <= uniforms);
- mark_uniform_slots_read(&slots[u], slots_read,
- type_sz(inst->src[i].type));
- }
- }
-
- int subgroup_id_index = get_subgroup_id_param_index(devinfo,
- stage_prog_data);
-
- /* Only allow 16 registers (128 uniform components) as push constants.
- *
- * Just demote the end of the list. We could probably do better
- * here, demoting things that are rarely used in the program first.
- *
- * If changing this value, note the limitation about total_regs in
- * brw_curbe.c.
- */
- unsigned int max_push_components = 16 * 8;
- if (subgroup_id_index >= 0)
- max_push_components--; /* Save a slot for the thread ID */
-
- /* We push small arrays, but no bigger than 16 floats. This is big
- * enough for a vec4 but hopefully not large enough to push out other
- * stuff. We should probably use a better heuristic at some point.
- */
- const unsigned int max_chunk_size = 16;
-
- unsigned int num_push_constants = 0;
- unsigned int num_pull_constants = 0;
-
- push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
- /* Default to -1 meaning no location */
- memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
- memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
-
- int chunk_start = -1;
- struct cplx_align align;
- for (unsigned u = 0; u < uniforms; u++) {
- if (!slots[u].is_live) {
- assert(chunk_start == -1);
- continue;
- }
-
- /* Skip subgroup_id_index to put it in the last push register. */
- if (subgroup_id_index == (int)u)
- continue;
-
- if (chunk_start == -1) {
- chunk_start = u;
- align = slots[u].align;
- } else {
- /* Offset into the chunk */
- unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
-
- /* Shift the slot alignment down by the chunk offset so it is
- * comparable with the base chunk alignment.
- */
- struct cplx_align slot_align = slots[u].align;
- slot_align.offset =
- (slot_align.offset - chunk_offset) & (align.mul - 1);
-
- align = cplx_align_combine(align, slot_align);
- }
-
- /* Sanity check the alignment */
- cplx_align_assert_sane(align);
-
- if (slots[u].contiguous)
- continue;
-
- /* Adjust the alignment to be in terms of slots, not bytes */
- assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
- assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
- align.mul /= UNIFORM_SLOT_SIZE;
- align.offset /= UNIFORM_SLOT_SIZE;
-
- unsigned push_start_align = cplx_align_apply(align, num_push_constants);
- unsigned chunk_size = u - chunk_start + 1;
- if ((!compiler->supports_pull_constants && u < UBO_START) ||
- (chunk_size < max_chunk_size &&
- push_start_align + chunk_size <= max_push_components)) {
- /* Align up the number of push constants */
- num_push_constants = push_start_align;
- for (unsigned i = 0; i < chunk_size; i++)
- push_constant_loc[chunk_start + i] = num_push_constants++;
- } else {
- /* We need to pull this one */
- num_pull_constants = cplx_align_apply(align, num_pull_constants);
- for (unsigned i = 0; i < chunk_size; i++)
- pull_constant_loc[chunk_start + i] = num_pull_constants++;
- }
-
- /* Reset the chunk and start again */
- chunk_start = -1;
- }
-
- /* Add the CS local thread ID uniform at the end of the push constants */
- if (subgroup_id_index >= 0)
- push_constant_loc[subgroup_id_index] = num_push_constants++;
-
- /* As the uniforms are going to be reordered, stash the old array and
- * create two new arrays for push/pull params.
- */
- uint32_t *param = stage_prog_data->param;
- stage_prog_data->nr_params = num_push_constants;
- if (num_push_constants) {
- stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
- num_push_constants);
- } else {
- stage_prog_data->param = NULL;
- }
- assert(stage_prog_data->nr_pull_params == 0);
- assert(stage_prog_data->pull_param == NULL);
- if (num_pull_constants > 0) {
- stage_prog_data->nr_pull_params = num_pull_constants;
- stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
- num_pull_constants);
- }
-
- /* Up until now, the param[] array has been indexed by reg + offset
- * of UNIFORM registers. Move pull constants into pull_param[] and
- * condense param[] to only contain the uniforms we chose to push.
- *
- * NOTE: Because we are condensing the params[] array, we know that
- * push_constant_loc[i] <= i and we can do it in one smooth loop without
- * having to make a copy.
- */
- for (unsigned int i = 0; i < uniforms; i++) {
- uint32_t value = param[i];
- if (pull_constant_loc[i] != -1) {
- stage_prog_data->pull_param[pull_constant_loc[i]] = value;
- } else if (push_constant_loc[i] != -1) {
- stage_prog_data->param[push_constant_loc[i]] = value;
- }
- }
- ralloc_free(param);
- } else {
- /* If we don't want to compact anything, just set up dummy push/pull
- * arrays. All the rest of the compiler cares about are these arrays.
- */
- push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-
- for (unsigned u = 0; u < uniforms; u++)
- push_constant_loc[u] = u;
-
- memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
- }
+ push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ for (unsigned u = 0; u < uniforms; u++)
+ push_constant_loc[u] = u;
/* Now that we know how many regular uniforms we'll push, reduce the
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
@@ -2687,33 +2432,22 @@ fs_visitor::get_pull_locs(const fs_reg &src,
{
assert(src.file == UNIFORM);
- if (src.nr >= UBO_START) {
- const struct brw_ubo_range *range =
- &prog_data->ubo_ranges[src.nr - UBO_START];
-
- /* If this access is in our (reduced) range, use the push data. */
- if (src.offset / 32 < range->length)
- return false;
-
- *out_surf_index = prog_data->binding_table.ubo_start + range->block;
- *out_pull_index = (32 * range->start + src.offset) / 4;
+ if (src.nr < UBO_START)
+ return false;
- prog_data->has_ubo_pull = true;
- return true;
- }
+ const struct brw_ubo_range *range =
+ &prog_data->ubo_ranges[src.nr - UBO_START];
- const unsigned location = src.nr + src.offset / 4;
+ /* If this access is in our (reduced) range, use the push data. */
+ if (src.offset / 32 < range->length)
+ return false;
- if (location < uniforms && pull_constant_loc[location] != -1) {
- /* A regular uniform push constant */
- *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
- *out_pull_index = pull_constant_loc[location];
+ *out_surf_index = range->block;
+ *out_pull_index = (32 * range->start + src.offset) / 4;
- prog_data->has_ubo_pull = true;
- return true;
- }
+ prog_data->has_ubo_pull = true;
- return false;
+ return true;
}
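
To make the arithmetic concrete: the range start/length are counted in 32-byte push registers while the returned pull index is in dwords, as the divisions above suggest. A worked standalone example with made-up numbers:

#include <cstdio>

int main()
{
   /* Hypothetical UBO range: 3 32-byte blocks pushed, starting at the
    * 2nd block of the surface; the access is 112 bytes into the range.
    */
   const unsigned range_start = 2;   /* 32-byte units */
   const unsigned range_length = 3;  /* 32-byte units */
   const unsigned src_offset = 112;  /* bytes */

   if (src_offset / 32 < range_length) {
      printf("inside the pushed registers: use push data\n");
   } else {
      const unsigned pull_index = (32 * range_start + src_offset) / 4;
      printf("pull constant dword index = %u\n", pull_index); /* 44 */
   }
   return 0;
}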
/**
@@ -3831,7 +3565,7 @@ fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_ins
void
fs_visitor::insert_gfx4_send_dependency_workarounds()
{
- if (devinfo->ver != 4 || devinfo->is_g4x)
+ if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
return;
bool progress = false;
@@ -4039,11 +3773,9 @@ fs_visitor::lower_load_payload()
}
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+ dst.type = inst->src[i].type;
if (inst->src[i].file != BAD_FILE) {
- dst.type = inst->src[i].type;
ibld.MOV(dst, inst->src[i]);
- } else {
- dst.type = BRW_REGISTER_TYPE_UD;
}
dst = offset(dst, ibld, 1);
}
@@ -5169,6 +4901,49 @@ sampler_msg_type(const intel_device_info *devinfo,
}
}
+/**
+ * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
+ * the given requested_alignment_sz.
+ */
+static fs_inst *
+emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
+ const fs_reg *src, unsigned sources,
+ unsigned header_size,
+ unsigned requested_alignment_sz)
+{
+ unsigned length = 0;
+ unsigned num_srcs =
+ sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
+ fs_reg *src_comps = new fs_reg[num_srcs];
+
+ for (unsigned i = 0; i < header_size; i++)
+ src_comps[length++] = src[i];
+
+ for (unsigned i = header_size; i < sources; i++) {
+ unsigned src_sz =
+ retype(dst, src[i].type).component_size(bld.dispatch_width());
+ const enum brw_reg_type padding_payload_type =
+ brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
+ BRW_REGISTER_TYPE_UD);
+
+ src_comps[length++] = src[i];
+
+ /* Expand the real sources if a component of the requested payload type
+ * is larger than the real source component.
+ */
+ if (src_sz < requested_alignment_sz) {
+ for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
+ src_comps[length++] = retype(fs_reg(), padding_payload_type);
+ }
+ }
+ }
+
+ fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
+ delete[] src_comps;
+
+ return inst;
+}
+
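
The padding matters for the 16-bit SIMD8H/SIMD16H sampler payloads introduced later in this change: a half-width component fills only half a GRF, so a null padding source keeps every real source register-aligned. A rough standalone model of that source-count math (plain C++, numbers chosen to mimic SIMD8H):

#include <cstdio>

int main()
{
   const unsigned REG_SIZE = 32;           /* bytes per GRF */
   const unsigned dispatch_width = 8;      /* SIMD8H, hypothetical */
   const unsigned type_size = 2;           /* 16-bit payload components */
   const unsigned requested_alignment = REG_SIZE;

   /* One SIMD8 component of a 16-bit type covers only half a register... */
   const unsigned src_sz = dispatch_width * type_size; /* 16 bytes */

   /* ...so each real source gets (alignment / src_sz) - 1 = 1 padding
    * source appended, mirroring emit_load_payload_with_padding().
    */
   const unsigned padding_per_source =
      src_sz < requested_alignment ? requested_alignment / src_sz - 1 : 0;

   printf("padding sources per real source: %u\n", padding_per_source); /* 1 */
   return 0;
}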
static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
const fs_reg &coordinate,
@@ -5182,16 +4957,22 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
const fs_reg &surface_handle,
const fs_reg &sampler_handle,
const fs_reg &tg4_offset,
+ unsigned payload_type_bit_size,
unsigned coord_components,
unsigned grad_components)
{
const intel_device_info *devinfo = bld.shader->devinfo;
- const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
+ const enum brw_reg_type payload_type =
+ brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
+ const enum brw_reg_type payload_unsigned_type =
+ brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
+ const enum brw_reg_type payload_signed_type =
+ brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
unsigned reg_width = bld.dispatch_width() / 8;
unsigned header_size = 0, length = 0;
fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
- sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+ sources[i] = bld.vgrf(payload_type);
/* We must have exactly one of surface/sampler and surface/sampler_handle */
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
@@ -5330,23 +5111,23 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
coordinate_done = true;
break;
case SHADER_OPCODE_TXS:
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+ bld.MOV(retype(sources[length], payload_unsigned_type), lod);
length++;
break;
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
/* We need an LOD; just use 0 */
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+ bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
length++;
break;
case SHADER_OPCODE_TXF:
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
* On Gfx9 they are u, v, lod, r
*/
- bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
+ bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
if (devinfo->ver >= 9) {
if (coord_components >= 2) {
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
+ bld.MOV(retype(sources[length], payload_signed_type),
offset(coordinate, bld, 1));
} else {
sources[length] = brw_imm_d(0);
@@ -5357,12 +5138,12 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
if (devinfo->ver >= 9 && lod.is_zero()) {
op = SHADER_OPCODE_TXF_LZ;
} else {
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ bld.MOV(retype(sources[length], payload_signed_type), lod);
length++;
}
for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
- bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+ bld.MOV(retype(sources[length++], payload_signed_type),
offset(coordinate, bld, i));
coordinate_done = true;
@@ -5375,24 +5156,26 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
if (op == SHADER_OPCODE_TXF_UMS ||
op == SHADER_OPCODE_TXF_CMS ||
op == SHADER_OPCODE_TXF_CMS_W) {
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
- length++;
+ bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
}
+ /* Data from the multisample control surface. */
if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
- /* Data from the multisample control surface. */
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
- length++;
+ unsigned num_mcs_components = 1;
- /* On Gfx9+ we'll use ld2dms_w instead which has two registers for
- * the MCS data.
+ /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
+ * Shared Functions - 3D Sampler - Messages - Message Format:
+ *
+ * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
*/
- if (op == SHADER_OPCODE_TXF_CMS_W) {
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
- mcs.file == IMM ?
- mcs :
- offset(mcs, bld, 1));
- length++;
+ if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W)
+ num_mcs_components = 4;
+ else if (op == SHADER_OPCODE_TXF_CMS_W)
+ num_mcs_components = 2;
+
+ for (unsigned i = 0; i < num_mcs_components; ++i) {
+ bld.MOV(retype(sources[length++], payload_unsigned_type),
+ mcs.file == IMM ? mcs : offset(mcs, bld, i));
}
}
@@ -5400,7 +5183,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
* texture coordinates.
*/
for (unsigned i = 0; i < coord_components; i++)
- bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+ bld.MOV(retype(sources[length++], payload_signed_type),
offset(coordinate, bld, i));
coordinate_done = true;
@@ -5411,7 +5194,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
bld.MOV(sources[length++], offset(coordinate, bld, i));
for (unsigned i = 0; i < 2; i++) /* offu, offv */
- bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+ bld.MOV(retype(sources[length++], payload_signed_type),
offset(tg4_offset, bld, i));
if (coord_components == 3) /* r if present */
@@ -5426,27 +5209,53 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
/* Set up the coordinate (except for cases where it was done above) */
if (!coordinate_done) {
for (unsigned i = 0; i < coord_components; i++)
- bld.MOV(sources[length++], offset(coordinate, bld, i));
+ bld.MOV(retype(sources[length++], payload_type),
+ offset(coordinate, bld, i));
}
if (min_lod.file != BAD_FILE) {
/* Account for all of the missing coordinate sources */
- length += 4 - coord_components;
- if (op == SHADER_OPCODE_TXD)
- length += (3 - grad_components) * 2;
+ if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
+ /* On DG2 and newer platforms, sample_d can only be used with 1D and
+ * 2D surfaces, so the maximum number of gradient components is 2.
+ * In spite of this limitation, the Bspec lists a mysterious R
+ * component before the min_lod, so the maximum coordinate components
+ * is 3.
+ *
+ * Wa_1209978020
+ */
+ length += 3 - coord_components;
+ length += (2 - grad_components) * 2;
+ } else {
+ length += 4 - coord_components;
+ if (op == SHADER_OPCODE_TXD)
+ length += (3 - grad_components) * 2;
+ }
bld.MOV(sources[length++], min_lod);
}
- unsigned mlen;
- if (reg_width == 2)
- mlen = length * reg_width - header_size;
- else
- mlen = length * reg_width;
-
- const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
- BRW_REGISTER_TYPE_F);
- bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+ const fs_reg src_payload =
+ fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
+ BRW_REGISTER_TYPE_F);
+ /* In the case of a 16-bit payload, each component takes one full register
+ * in both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
+ * elements. In the SIMD8H case the hardware simply expects the components
+ * to be padded (i.e., aligned on a reg boundary).
+ */
+ fs_inst *load_payload_inst =
+ emit_load_payload_with_padding(bld, src_payload, sources, length,
+ header_size, REG_SIZE);
+ unsigned mlen = load_payload_inst->size_written / REG_SIZE;
+ unsigned simd_mode = 0;
+ if (payload_type_bit_size == 16) {
+ assert(devinfo->ver >= 11);
+ simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
+ GFX10_SAMPLER_SIMD_MODE_SIMD16H;
+ } else {
+ simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
+ BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
/* Generate the SEND. */
inst->opcode = SHADER_OPCODE_SEND;
@@ -5455,29 +5264,11 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
const unsigned msg_type =
sampler_msg_type(devinfo, op, inst->shadow_compare);
- const unsigned simd_mode =
- inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
- BRW_SAMPLER_SIMD_MODE_SIMD16;
-
- uint32_t base_binding_table_index;
- switch (op) {
- case SHADER_OPCODE_TG4:
- case SHADER_OPCODE_TG4_OFFSET:
- base_binding_table_index = prog_data->binding_table.gather_texture_start;
- break;
- case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
- base_binding_table_index = prog_data->binding_table.image_start;
- break;
- default:
- base_binding_table_index = prog_data->binding_table.texture_start;
- break;
- }
inst->sfid = BRW_SFID_SAMPLER;
if (surface.file == IMM &&
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
- inst->desc = brw_sampler_desc(devinfo,
- surface.ud + base_binding_table_index,
+ inst->desc = brw_sampler_desc(devinfo, surface.ud,
sampler.file == IMM ? sampler.ud % 16 : 0,
msg_type,
simd_mode,
@@ -5533,8 +5324,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
ubld.OR(desc, desc, surface);
}
}
- if (base_binding_table_index)
- ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
ubld.AND(desc, desc, brw_imm_ud(0xfff));
inst->src[0] = component(desc, 0);
@@ -5560,6 +5349,61 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}
+static unsigned
+get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
+ opcode op, const fs_reg *src)
+{
+ unsigned src_type_size = 0;
+
+ /* All sources need to have the same size, so find the first valid one
+ * and take its size.
+ */
+ for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+ if (src[i].file != BAD_FILE) {
+ src_type_size = brw_reg_type_to_size(src[i].type);
+ break;
+ }
+ }
+
+ assert(src_type_size == 2 || src_type_size == 4);
+
+#ifndef NDEBUG
+ /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
+ * compressed multisampled surfaces. There the payload contains MCS data
+ * which is already in 16-bits unlike the other parameters that need forced
+ * conversion.
+ */
+ if (devinfo->verx10 < 125 ||
+ (op != SHADER_OPCODE_TXF_CMS_W &&
+ op != SHADER_OPCODE_TXF_CMS)) {
+ for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+ assert(src[i].file == BAD_FILE ||
+ brw_reg_type_to_size(src[i].type) == src_type_size);
+ }
+ }
+#endif
+
+ if (devinfo->verx10 < 125)
+ return src_type_size * 8;
+
+ /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
+ * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
+ * Format [GFX12:HAS:1209977870] *
+ *
+ * ld2dms_w SIMD8H and SIMD16H Only
+ * ld_mcs SIMD8H and SIMD16H Only
+ * ld2dms REMOVEDBY(GEN:HAS:1406788836)
+ */
+
+ if (op == SHADER_OPCODE_TXF_CMS_W ||
+ op == SHADER_OPCODE_TXF_CMS ||
+ op == SHADER_OPCODE_TXF_UMS ||
+ op == SHADER_OPCODE_TXF_MCS)
+ src_type_size = 2;
+
+ return src_type_size * 8;
+}
+
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
@@ -5582,12 +5426,19 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
if (devinfo->ver >= 7) {
+ const unsigned msg_payload_type_bit_size =
+ get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
+
+ /* 16-bit payloads are available only on gfx11+ */
+ assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
+
lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
shadow_c, lod, lod2, min_lod,
sample_index,
mcs, surface, sampler,
surface_handle, sampler_handle,
tg4_offset,
+ msg_payload_type_bit_size,
coord_components, grad_components);
} else if (devinfo->ver >= 5) {
lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
@@ -5641,6 +5492,67 @@ emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
}
}
+void
+fs_visitor::emit_is_helper_invocation(fs_reg result)
+{
+ /* Unlike the regular gl_HelperInvocation, which is defined at dispatch,
+ * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes
+ * demoted invocations into consideration.
+ */
+ result.type = BRW_REGISTER_TYPE_UD;
+
+ bld.MOV(result, brw_imm_ud(0));
+
+ /* See sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
+ unsigned width = bld.dispatch_width();
+ for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
+ const fs_builder b = bld.group(MIN2(width, 16), i);
+
+ fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
+
+ /* The at() ensures that any code emitted to get the predicate happens
+ * before the mov right above. This is not an issue elsewhere because
+ * lowering code already set up the builder this way.
+ */
+ emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
+ mov->predicate_inverse = true;
+ }
+}
+
+/**
+ * Predicate the specified instruction on the vector mask.
+ */
+static void
+emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
+{
+ assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
+ bld.group() == inst->group &&
+ bld.dispatch_width() == inst->exec_size);
+
+ const fs_builder ubld = bld.exec_all().group(1, 0);
+
+ const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
+ const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
+ ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
+ const unsigned subreg = sample_mask_flag_subreg(v);
+
+ ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);
+
+ if (inst->predicate) {
+ assert(inst->predicate == BRW_PREDICATE_NORMAL);
+ assert(!inst->predicate_inverse);
+ assert(inst->flag_subreg == 0);
+ /* Combine the vector mask with the existing predicate by using a
+ * vertical predication mode.
+ */
+ inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
+ } else {
+ inst->flag_subreg = subreg;
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->predicate_inverse = false;
+ }
+}
+
static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
const fs_reg &surface, const fs_reg &surface_handle)
@@ -5914,11 +5826,11 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->sfid = sfid;
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
+ inst->resize_sources(4);
+
/* Finally, the payload */
inst->src[2] = payload;
inst->src[3] = payload2;
-
- inst->resize_sources(4);
}
static enum lsc_opcode
@@ -6141,11 +6053,11 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
+ inst->resize_sources(4);
+
/* Finally, the payload */
inst->src[2] = payload;
inst->src[3] = payload2;
-
- inst->resize_sources(4);
}
static void
@@ -6211,10 +6123,10 @@ lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
arg.ud, write);
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
+ inst->resize_sources(4);
+
inst->src[2] = header;
inst->src[3] = data;
-
- inst->resize_sources(4);
}
static fs_reg
@@ -6235,26 +6147,40 @@ emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
}
static void
+emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
+{
+ assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
+ const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
+
+ /* If we're a fragment shader, we have to predicate with the sample mask
+ * to avoid helper invocations in instructions with side effects, unless
+ * they are explicitly required.
+ *
+ * There are also special cases when we actually want to run on helpers
+ * (ray queries).
+ */
+ assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
+ if (enable_helpers)
+ emit_predicate_on_vector_mask(bld, inst);
+ else if (inst->has_side_effects())
+ emit_predicate_on_sample_mask(bld, inst);
+}
+
+static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
/* Get the logical send arguments. */
- const fs_reg &addr = inst->src[0];
- const fs_reg &src = inst->src[1];
+ const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
+ const fs_reg &src = inst->src[A64_LOGICAL_SRC];
const unsigned src_sz = type_sz(src.type);
const unsigned src_comps = inst->components_read(1);
- assert(inst->src[2].file == IMM);
- const unsigned arg = inst->src[2].ud;
+ assert(inst->src[A64_LOGICAL_ARG].file == IMM);
+ const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
const bool has_side_effects = inst->has_side_effects();
- /* If the surface message has side effects and we're a fragment shader, we
- * have to predicate with the sample mask to avoid helper invocations.
- */
- if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
- emit_predicate_on_sample_mask(bld, inst);
-
fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
BRW_REGISTER_TYPE_UD);
@@ -6286,7 +6212,7 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
lsc_bits_to_data_size(arg),
1 /* num_channels */,
false /* transpose */,
- LSC_CACHE_STORE_L1STATE_L3MOCS,
+ LSC_CACHE_LOAD_L1STATE_L3MOCS,
true /* has_dest */);
break;
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
@@ -6330,6 +6256,9 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
unreachable("Unknown A64 logical instruction");
}
+ if (bld.shader->stage == MESA_SHADER_FRAGMENT)
+ emit_fragment_mask(bld, inst);
+
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
@@ -6352,19 +6281,13 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
- const fs_reg &addr = inst->src[0];
- const fs_reg &src = inst->src[1];
+ const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
+ const fs_reg &src = inst->src[A64_LOGICAL_SRC];
const unsigned src_comps = inst->components_read(1);
- assert(inst->src[2].file == IMM);
- const unsigned arg = inst->src[2].ud;
+ assert(inst->src[A64_LOGICAL_ARG].file == IMM);
+ const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
const bool has_side_effects = inst->has_side_effects();
- /* If the surface message has side effects and we're a fragment shader, we
- * have to predicate with the sample mask to avoid helper invocations.
- */
- if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
- emit_predicate_on_sample_mask(bld, inst);
-
fs_reg payload, payload2;
unsigned mlen, ex_mlen = 0, header_size = 0;
if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
@@ -6488,6 +6411,9 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
unreachable("Unknown A64 logical instruction");
}
+ if (bld.shader->stage == MESA_SHADER_FRAGMENT)
+ emit_fragment_mask(bld, inst);
+
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = mlen;
@@ -6781,31 +6707,60 @@ static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
- const fs_reg &bvh_level = inst->src[0];
- assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
- const uint32_t trace_ray_control = inst->src[1].ud;
+ /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
+ * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q
+ * types on Gfx12.5, we need to tweak the stride to a value of 1 dword so
+ * that the MOV operates on 2 components rather than on the same component
+ * twice.
+ */
+ fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
+ globals_addr.stride = 1;
+ const fs_reg &bvh_level =
+ inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
+ inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
+ bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
+ inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
+ const fs_reg &trace_ray_control =
+ inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
+ inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
+ bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
+ inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
+ const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
+ assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
+ const bool synchronous = synchronous_src.ud;
const unsigned mlen = 1;
const fs_builder ubld = bld.exec_all().group(8, 0);
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(header, brw_imm_ud(0));
- ubld.group(2, 0).MOV(header,
- retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
- /* TODO: Bit 128 is ray_query */
+ ubld.group(2, 0).MOV(header, globals_addr);
+ if (synchronous)
+ ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
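/* Illustrative sketch (not part of this diff): why the globals address is
 * copied into the header above with a dword stride of 1, as noted in the
 * stride comment earlier. The 64-bit pointer is handled as two UD
 * components; with a stride of 0 both SIMD2 channels would see the same
 * dword, with a stride of 1 they see the low and high halves. The helper
 * name is hypothetical.
 */
#include <stdint.h>

static inline void
split_addr64_sketch(uint64_t addr, uint32_t dw[2])
{
   dw[0] = (uint32_t) addr;         /* low 32 bits, first UD component   */
   dw[1] = (uint32_t)(addr >> 32);  /* high 32 bits, second UD component */
}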
const unsigned ex_mlen = inst->exec_size / 8;
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
- const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
- if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
- bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
+ if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
+ trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
+ bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
+ (bvh_level.ud & 0x7)));
} else {
- bld.AND(payload, bvh_level, brw_imm_ud(0x7));
- if (trc_bits != 0)
- bld.OR(payload, payload, brw_imm_ud(trc_bits));
+ bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
+ bld.OR(payload, payload, bvh_level);
+ }
+
+ /* When doing synchronous traversal, the HW implicitly computes the
+ * stack_id using the following formula:
+ *
+ * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
+ *
+ * Only in the asynchronous case do we need to set the stack_id from the
+ * payload register.
+ */
+ if (!synchronous) {
+ bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
+ retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0x7ff));
}
- bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
- retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
- brw_imm_uw(0x7ff));
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
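/* Illustrative sketch (not part of this diff): the per-lane payload dword
 * assembled above, assuming SET_BITS(x, 9, 8) places x at bits 9:8. The
 * trace-ray control sits at bits 9:8, the BVH level at bits 2:0, and only
 * asynchronous traversal needs the 11-bit stack_id in the upper word.
 * Names are hypothetical.
 */
#include <stdint.h>

static inline uint32_t
pack_rt_payload_sketch(uint32_t bvh_level, uint32_t trace_ray_control,
                       uint32_t stack_id, bool synchronous)
{
   uint32_t dw = ((trace_ray_control & 0x3) << 8) | (bvh_level & 0x7);
   if (!synchronous)
      dw |= (stack_id & 0x7ff) << 16;   /* upper word, 11 significant bits */
   return dw;
}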
@@ -6880,6 +6835,7 @@ fs_visitor::lower_logical_sends()
break;
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
break;
@@ -7107,7 +7063,7 @@ get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,
for (unsigned i = 0; i < inst->sources; i++) {
/* IVB implements DF scalars as <0;2,1> regions. */
const bool is_scalar_exception = is_uniform(inst->src[i]) &&
- (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
+ (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
const bool is_packed_word_exception =
type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
@@ -7386,7 +7342,7 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
* should
* "Force BFI instructions to be executed always in SIMD8."
*/
- return MIN2(devinfo->is_haswell ? 8 : ~0u,
+ return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
get_fpu_lowered_simd_width(devinfo, inst));
case BRW_OPCODE_IF:
@@ -7403,7 +7359,7 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
/* Unary extended math instructions are limited to SIMD8 on Gfx4 and
* Gfx6. Extended Math Function is limited to SIMD8 with half-float.
*/
- if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))
+ if (devinfo->ver == 6 || devinfo->verx10 == 40)
return MIN2(8, inst->exec_size);
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
return MIN2(8, inst->exec_size);
@@ -7512,6 +7468,12 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
return get_sampler_lowered_simd_width(devinfo, inst);
+ /* On gfx12, parameters are fixed to 16-bit values and therefore they
+ * always fit regardless of the execution size.
+ */
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+ return MIN2(16, inst->exec_size);
+
case SHADER_OPCODE_TXD_LOGICAL:
/* TXD is unsupported in SIMD16 mode. */
return 8;
@@ -8485,9 +8447,6 @@ fs_visitor::optimize()
validate();
- split_virtual_grfs();
- validate();
-
#define OPT(pass, args...) ({ \
pass_num++; \
bool this_progress = pass(args); \
@@ -8518,6 +8477,8 @@ fs_visitor::optimize()
int iteration = 0;
int pass_num = 0;
+ OPT(split_virtual_grfs);
+
/* Before anything else, eliminate dead code. The results of some NIR
* instructions may effectively be calculated twice. Once when the
* instruction is encountered, and again when the user of that result is
@@ -8590,7 +8551,7 @@ fs_visitor::optimize()
OPT(opt_redundant_halt);
if (OPT(lower_load_payload)) {
- split_virtual_grfs();
+ OPT(split_virtual_grfs);
/* Lower 64 bit MOVs generated by payload lowering. */
if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)
@@ -8705,6 +8666,75 @@ fs_visitor::fixup_3src_null_dest()
DEPENDENCY_VARIABLES);
}
+static bool
+needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
+{
+ /* This workaround is about making sure that any instruction writing
+ * through UGM has completed before we hit EOT.
+ *
+ * The workaround talks about UGM writes or atomic messages, but what
+ * matters is anything that hasn't completed. Usually any SEND instruction
+ * that has a destination register will be read by something else, so we
+ * don't need to care about those: they will be synchronized by other parts
+ * of the shader or optimized away. What is left are instructions that
+ * don't have a destination register.
+ */
+ if (inst->sfid != GFX12_SFID_UGM)
+ return false;
+
+ return inst->dst.file == BAD_FILE;
+}
+
+/* Wa_22013689345
+ *
+ * We need to emit a UGM fence message before EOT if the shader has any UGM
+ * write or atomic message.
+ *
+ * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
+ * We probably need a better criterion in needs_dummy_fence().
+ */
+ */
+void
+fs_visitor::emit_dummy_memory_fence_before_eot()
+{
+ bool progress = false;
+ bool has_ugm_write_or_atomic = false;
+
+ if (!intel_device_info_is_dg2(devinfo))
+ return;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (!inst->eot) {
+ if (needs_dummy_fence(devinfo, inst))
+ has_ugm_write_or_atomic = true;
+ continue;
+ }
+
+ if (!has_ugm_write_or_atomic)
+ break;
+
+ const fs_builder ibld(this, block, inst);
+ const fs_builder ubld = ibld.exec_all().group(1, 0);
+
+ fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
+ dst, brw_vec8_grf(0, 0),
+ /* commit enable */ brw_imm_ud(1),
+ /* bti */ brw_imm_ud(0));
+ dummy_fence->sfid = GFX12_SFID_UGM;
+ dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
+ LSC_FLUSH_TYPE_NONE_6, false);
+ ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
+ progress = true;
+ /* TODO: remove this break if we ever have shader with multiple EOT. */
+ break;
+ }
+
+ if (progress) {
+ invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
+ DEPENDENCY_VARIABLES);
+ }
+}
+
/**
* Find the first instruction in the program that might start a region of
* divergent control flow due to a HALT jump. There is no
@@ -8856,23 +8886,55 @@ fs_visitor::allocate_registers(bool allow_spilling)
static const enum instruction_scheduler_mode pre_modes[] = {
SCHEDULE_PRE,
SCHEDULE_PRE_NON_LIFO,
+ SCHEDULE_NONE,
SCHEDULE_PRE_LIFO,
};
static const char *scheduler_mode_name[] = {
"top-down",
"non-lifo",
+ "none",
"lifo"
};
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
+ /* Before we schedule anything, stash off the instruction order as an array
+ * of fs_inst *. This way, we can reset it between scheduling passes to
+ * prevent dependencies between the different scheduling modes.
+ */
+ int num_insts = cfg->last_block()->end_ip + 1;
+ fs_inst **inst_arr = ralloc_array(mem_ctx, fs_inst *, num_insts);
+
+ int ip = 0;
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ assert(ip >= block->start_ip && ip <= block->end_ip);
+ inst_arr[ip++] = inst;
+ }
+ assert(ip == num_insts);
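/* Illustrative sketch (not part of this diff): the stash/restore idea used
 * here, shown with standard containers instead of exec_list. The real code
 * re-fills each basic block's instruction list from inst_arr before every
 * scheduling attempt after the first.
 */
#include <list>
#include <vector>

template <typename T>
static std::vector<T *> snapshot_order(const std::list<T *> &insts)
{
   return std::vector<T *>(insts.begin(), insts.end());
}

template <typename T>
static void restore_order(std::list<T *> &insts, const std::vector<T *> &snap)
{
   insts.assign(snap.begin(), snap.end());
}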
+
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
* performance but increasing likelihood of allocating.
*/
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
- schedule_instructions(pre_modes[i]);
+ if (i > 0) {
+ /* Unless we're the first pass, reset back to the original order */
+ ip = 0;
+ foreach_block (block, cfg) {
+ block->instructions.make_empty();
+
+ assert(ip == block->start_ip);
+ for (; ip <= block->end_ip; ip++)
+ block->instructions.push_tail(inst_arr[ip]);
+ }
+ assert(ip == num_insts);
+
+ invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+ }
+
+ if (pre_modes[i] != SCHEDULE_NONE)
+ schedule_instructions(pre_modes[i]);
this->shader_stats.scheduler_mode = scheduler_mode_name[i];
if (0) {
@@ -8881,23 +8943,6 @@ fs_visitor::allocate_registers(bool allow_spilling)
break;
}
- /* Scheduling may create additional opportunities for CMOD propagation,
- * so let's do it again. If CMOD propagation made any progress,
- * eliminate dead code one more time.
- */
- bool progress = false;
- const int iteration = 99;
- int pass_num = 0;
-
- if (OPT(opt_cmod_propagation)) {
- /* dead_code_eliminate "undoes" the fixing done by
- * fixup_3src_null_dest, so we have to do it again if
- * dead_code_eliminiate makes any progress.
- */
- if (OPT(dead_code_eliminate))
- fixup_3src_null_dest();
- }
-
bool can_spill = allow_spilling &&
(i == ARRAY_SIZE(pre_modes) - 1);
@@ -8943,8 +8988,8 @@ fs_visitor::allocate_registers(bool allow_spilling)
prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
prog_data->total_scratch);
- if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
- if (devinfo->is_haswell) {
+ if (gl_shader_stage_is_compute(stage)) {
+ if (devinfo->platform == INTEL_PLATFORM_HSW) {
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
* field documentation, Haswell supports a minimum of 2kB of
* scratch space for compute shaders, unlike every other stage
@@ -8984,9 +9029,6 @@ fs_visitor::run_vs()
setup_vs_payload();
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
emit_nir_code();
if (failed)
@@ -8994,9 +9036,6 @@ fs_visitor::run_vs()
emit_urb_writes();
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
calculate_cfg();
optimize();
@@ -9005,6 +9044,7 @@ fs_visitor::run_vs()
assign_vs_urb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9016,12 +9056,19 @@ fs_visitor::set_tcs_invocation_id()
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+ const bool dg2_plus =
+ devinfo->ver > 12 || intel_device_info_is_dg2(devinfo);
const unsigned instance_id_mask =
- devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
+ dg2_plus ? INTEL_MASK(7, 0) :
+ (devinfo->ver >= 11) ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
const unsigned instance_id_shift =
- devinfo->ver >= 11 ? 16 : 17;
+ dg2_plus ? 0 : (devinfo->ver >= 11) ? 16 : 17;
- /* Get instance number from g0.2 bits 22:16 or 23:17 */
+ /* Get instance number from g0.2 bits:
+ * * 7:0 on DG2+
+ * * 22:16 on gfx11+
+ * * 23:17 otherwise
+ */
fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
brw_imm_ud(instance_id_mask));
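/* Illustrative sketch (not part of this diff): the mask/shift selection above
 * as plain scalar code, assuming INTEL_MASK(hi, lo) builds a mask covering
 * bits hi..lo. The helper name is hypothetical.
 */
#include <stdint.h>

static inline uint32_t
tcs_instance_id_sketch(uint32_t g0_2, int ver, bool dg2_plus)
{
   const uint32_t mask  = dg2_plus  ? 0x000000ffu :   /* bits  7:0  */
                          ver >= 11 ? 0x007f0000u :   /* bits 22:16 */
                                      0x00fe0000u;    /* bits 23:17 */
   const unsigned shift = dg2_plus ? 0 : ver >= 11 ? 16 : 17;
   return (g0_2 & mask) >> shift;
}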
@@ -9075,9 +9122,6 @@ fs_visitor::run_tcs()
tcs_key->input_vertices;
}
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
/* Initialize gl_InvocationID */
set_tcs_invocation_id();
@@ -9112,9 +9156,6 @@ fs_visitor::run_tcs()
inst->mlen = 3;
inst->eot = true;
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
if (failed)
return false;
@@ -9126,6 +9167,7 @@ fs_visitor::run_tcs()
assign_tcs_urb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9139,9 +9181,6 @@ fs_visitor::run_tes()
/* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
payload.num_regs = 5;
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
emit_nir_code();
if (failed)
@@ -9149,9 +9188,6 @@ fs_visitor::run_tes()
emit_urb_writes();
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
calculate_cfg();
optimize();
@@ -9160,6 +9196,7 @@ fs_visitor::run_tes()
assign_tes_urb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9188,16 +9225,10 @@ fs_visitor::run_gs()
}
}
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
emit_nir_code();
emit_gs_thread_end();
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
if (failed)
return false;
@@ -9209,6 +9240,7 @@ fs_visitor::run_gs()
assign_gs_urb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(true /* allow_spilling */);
return !failed;
@@ -9260,9 +9292,6 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
assert(dispatch_width == 16);
emit_repclear_shader();
} else {
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
if (nir->info.inputs_read > 0 ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
@@ -9295,31 +9324,26 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
if (failed)
return false;
- if (wm_key->alpha_test_func)
+ if (wm_key->emit_alpha_test)
emit_alpha_test();
emit_fb_writes();
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
calculate_cfg();
optimize();
assign_curb_setup();
- if (devinfo->ver >= 9)
+ if (devinfo->ver == 9)
gfx9_ps_header_only_workaround(wm_prog_data);
assign_urb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
-
- if (failed)
- return false;
}
return !failed;
@@ -9328,14 +9352,11 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
bool
fs_visitor::run_cs(bool allow_spilling)
{
- assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
+ assert(gl_shader_stage_is_compute(stage));
setup_cs_payload();
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
- if (devinfo->is_haswell && prog_data->total_shared > 0) {
+ if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
const fs_builder abld = bld.exec_all().group(1, 0);
abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
@@ -9349,9 +9370,6 @@ fs_visitor::run_cs(bool allow_spilling)
emit_cs_terminate();
- if (shader_time_index >= 0)
- emit_shader_time_end();
-
calculate_cfg();
optimize();
@@ -9359,11 +9377,9 @@ fs_visitor::run_cs(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
- if (failed)
- return false;
-
return !failed;
}
@@ -9375,9 +9391,6 @@ fs_visitor::run_bs(bool allow_spilling)
/* R0: thread header, R1: stack IDs, R2: argument addresses */
payload.num_regs = 3;
- if (shader_time_index >= 0)
- emit_shader_time_begin();
-
emit_nir_code();
if (failed)
@@ -9386,8 +9399,52 @@ fs_visitor::run_bs(bool allow_spilling)
/* TODO(RT): Perhaps rename this? */
emit_cs_terminate();
- if (shader_time_index >= 0)
- emit_shader_time_end();
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+
+ fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
+ allocate_registers(allow_spilling);
+
+ return !failed;
+}
+
+bool
+fs_visitor::run_task(bool allow_spilling)
+{
+ assert(stage == MESA_SHADER_TASK);
+
+ /* Task Shader Payloads (SIMD8 and SIMD16)
+ *
+ * R0: Header
+ * R1: Local_ID.X[0-7 or 0-15]
+ * R2: Inline Parameter
+ *
+ * Task Shader Payloads (SIMD32)
+ *
+ * R0: Header
+ * R1: Local_ID.X[0-15]
+ * R2: Local_ID.X[16-31]
+ * R3: Inline Parameter
+ *
+ * Local_ID.X values are 16 bits.
+ *
+ * Inline parameter is optional but always present since we use it to pass
+ * the address to descriptors.
+ */
+ payload.num_regs = dispatch_width == 32 ? 4 : 3;
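/* Illustrative sketch (not part of this diff): where the 3-vs-4 register
 * count above comes from, assuming 32-byte GRFs and 16-bit Local_ID.X
 * values: one header GRF, enough GRFs to hold dispatch_width 16-bit IDs,
 * and one inline-parameter GRF.
 */
static inline unsigned
task_mesh_payload_regs_sketch(unsigned dispatch_width)
{
   const unsigned id_bytes = dispatch_width * 2;            /* 16-bit IDs   */
   const unsigned id_regs  = (id_bytes + 31) / 32;          /* 32-byte GRFs */
   return 1 /* header */ + id_regs + 1 /* inline param */;  /* 3 or 4       */
}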
+
+ emit_nir_code();
+
+ if (failed)
+ return false;
+
+ emit_urb_fence();
+
+ emit_cs_terminate();
calculate_cfg();
@@ -9396,11 +9453,56 @@ fs_visitor::run_bs(bool allow_spilling)
assign_curb_setup();
fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
allocate_registers(allow_spilling);
+ return !failed;
+}
+
+bool
+fs_visitor::run_mesh(bool allow_spilling)
+{
+ assert(stage == MESA_SHADER_MESH);
+
+ /* Mesh Shader Payloads (SIMD8 and SIMD16)
+ *
+ * R0: Header
+ * R1: Local_ID.X[0-7 or 0-15]
+ * R2: Inline Parameter
+ *
+ * Mesh Shader Payloads (SIMD32)
+ *
+ * R0: Header
+ * R1: Local_ID.X[0-15]
+ * R2: Local_ID.X[16-31]
+ * R3: Inline Parameter
+ *
+ * Local_ID.X values are 16 bits.
+ *
+ * Inline parameter is optional but always present since we use it to pass
+ * the address to descriptors.
+ */
+ payload.num_regs = dispatch_width == 32 ? 4 : 3;
+
+ emit_nir_code();
+
if (failed)
return false;
+ emit_urb_fence();
+
+ emit_cs_terminate();
+
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+
+ fixup_3src_null_dest();
+ emit_dummy_memory_fence_before_eot();
+ allocate_registers(allow_spilling);
+
return !failed;
}
@@ -9485,15 +9587,18 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
prog_data->flat_inputs = 0;
nir_foreach_shader_in_variable(var, shader) {
+ /* flat shading */
+ if (var->data.interpolation != INTERP_MODE_FLAT)
+ continue;
+
+ if (var->data.per_primitive)
+ continue;
+
unsigned slots = glsl_count_attribute_slots(var->type, false);
for (unsigned s = 0; s < slots; s++) {
int input_index = prog_data->urb_setup[var->data.location + s];
- if (input_index < 0)
- continue;
-
- /* flat shading */
- if (var->data.interpolation == INTERP_MODE_FLAT)
+ if (input_index >= 0)
prog_data->flat_inputs |= 1 << input_index;
}
}
@@ -9630,19 +9735,22 @@ brw_nir_demote_sample_qualifiers(nir_shader *nir)
NULL);
}
-void
+static void
brw_nir_populate_wm_prog_data(const nir_shader *shader,
const struct intel_device_info *devinfo,
const struct brw_wm_prog_key *key,
- struct brw_wm_prog_data *prog_data)
+ struct brw_wm_prog_data *prog_data,
+ const struct brw_mue_map *mue_map)
{
/* key->alpha_test_func means simulating alpha testing via discards,
* so the shader definitely kills pixels.
*/
prog_data->uses_kill = shader->info.fs.uses_discard ||
- key->alpha_test_func;
+ shader->info.fs.uses_demote ||
+ key->emit_alpha_test;
prog_data->uses_omask = !key->ignore_sample_mask_out &&
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+ prog_data->color_outputs_written = key->color_outputs_valid;
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
@@ -9669,7 +9777,10 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
* persample dispatch, we hard-code it to 0.5.
*/
prog_data->uses_pos_offset = prog_data->persample_dispatch &&
- BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
+ (BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS) ||
+ BITSET_TEST(shader->info.system_values_read,
+ SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
}
prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
@@ -9680,6 +9791,9 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(devinfo, shader);
+ prog_data->uses_nonperspective_interp_modes |=
+ (prog_data->barycentric_interp_modes &
+ BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
prog_data->per_coarse_pixel_dispatch =
key->coarse_pixel &&
@@ -9698,7 +9812,7 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
prog_data->per_coarse_pixel_dispatch;
- calculate_urb_setup(devinfo, key, prog_data, shader);
+ calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
brw_compute_flat_inputs(prog_data, shader);
}
@@ -9726,6 +9840,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_WM);
prog_data->base.stage = MESA_SHADER_FRAGMENT;
+ prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.total_scratch = 0;
const struct intel_device_info *devinfo = compiler->devinfo;
@@ -9757,7 +9872,8 @@ brw_compile_fs(const struct brw_compiler *compiler,
brw_postprocess_nir(nir, compiler, true, debug_enabled,
key->base.robust_buffer_access);
- brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
+ brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
+ params->mue_map);
fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
@@ -9766,7 +9882,6 @@ brw_compile_fs(const struct brw_compiler *compiler,
v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
&prog_data->base, nir, 8,
- params->shader_time ? params->shader_time_index8 : -1,
debug_enabled);
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
@@ -9801,13 +9916,15 @@ brw_compile_fs(const struct brw_compiler *compiler,
" pixel shading.\n");
}
+ if (nir->info.ray_queries > 0)
+ v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
+
if (!has_spilled &&
v8->max_dispatch_width >= 16 &&
(!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) {
/* Try a SIMD16 compile */
v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
&prog_data->base, nir, 16,
- params->shader_time ? params->shader_time_index16 : -1,
debug_enabled);
v16->import_uniforms(v8);
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
@@ -9835,7 +9952,6 @@ brw_compile_fs(const struct brw_compiler *compiler,
/* Try a SIMD32 compile */
v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
&prog_data->base, nir, 32,
- params->shader_time ? params->shader_time_index32 : -1,
debug_enabled);
v32->import_uniforms(v8);
if (!v32->run_fs(allow_spilling, false)) {
@@ -9955,22 +10071,28 @@ brw_compile_fs(const struct brw_compiler *compiler,
return g.get_assembly();
}
-fs_reg *
-fs_visitor::emit_cs_work_group_id_setup()
+fs_reg
+fs_visitor::emit_work_group_id_setup()
{
- assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
+ assert(gl_shader_stage_uses_workgroup(stage));
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
+ fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
- struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
- struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
+ bld.MOV(id, r0_1);
- bld.MOV(*reg, r0_1);
- bld.MOV(offset(*reg, bld, 1), r0_6);
- bld.MOV(offset(*reg, bld, 2), r0_7);
+ if (gl_shader_stage_is_compute(stage)) {
+ struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
+ struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
+ bld.MOV(offset(id, bld, 1), r0_6);
+ bld.MOV(offset(id, bld, 2), r0_7);
+ } else {
+ /* Task/Mesh have a single Workgroup ID dimension in the HW. */
+ bld.MOV(offset(id, bld, 1), brw_imm_ud(0));
+ bld.MOV(offset(id, bld, 2), brw_imm_ud(0));
+ }
- return reg;
+ return id;
}
unsigned
@@ -10071,36 +10193,13 @@ lower_simd(nir_builder *b, nir_instr *instr, void *options)
}
}
-static void
+void
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
{
nir_shader_lower_instructions(nir, filter_simd, lower_simd,
(void *)(uintptr_t)dispatch_width);
}
-static nir_shader *
-compile_cs_to_nir(const struct brw_compiler *compiler,
- void *mem_ctx,
- const struct brw_cs_prog_key *key,
- const nir_shader *src_shader,
- unsigned dispatch_width,
- bool debug_enabled)
-{
- nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
- brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
-
- NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
-
- /* Clean up after the local index and ID calculations. */
- NIR_PASS_V(shader, nir_opt_constant_folding);
- NIR_PASS_V(shader, nir_opt_dce);
-
- brw_postprocess_nir(shader, compiler, true, debug_enabled,
- key->base.robust_buffer_access);
-
- return shader;
-}
-
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
void *mem_ctx,
@@ -10109,193 +10208,93 @@ brw_compile_cs(const struct brw_compiler *compiler,
const nir_shader *nir = params->nir;
const struct brw_cs_prog_key *key = params->key;
struct brw_cs_prog_data *prog_data = params->prog_data;
- int shader_time_index = params->shader_time ? params->shader_time_index : -1;
const bool debug_enabled =
INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_CS);
prog_data->base.stage = MESA_SHADER_COMPUTE;
prog_data->base.total_shared = nir->info.shared_size;
+ prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.total_scratch = 0;
- /* Generate code for all the possible SIMD variants. */
- bool generate_all;
-
- unsigned min_dispatch_width;
- unsigned max_dispatch_width;
-
- if (nir->info.workgroup_size_variable) {
- generate_all = true;
- min_dispatch_width = 8;
- max_dispatch_width = 32;
- } else {
- generate_all = false;
+ if (!nir->info.workgroup_size_variable) {
prog_data->local_size[0] = nir->info.workgroup_size[0];
prog_data->local_size[1] = nir->info.workgroup_size[1];
prog_data->local_size[2] = nir->info.workgroup_size[2];
- unsigned local_workgroup_size = prog_data->local_size[0] *
- prog_data->local_size[1] *
- prog_data->local_size[2];
-
- /* Limit max_threads to 64 for the GPGPU_WALKER command */
- const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads;
- min_dispatch_width = util_next_power_of_two(
- MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
- assert(min_dispatch_width <= 32);
- max_dispatch_width = 32;
- }
-
- unsigned required_dispatch_width = 0;
- if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
- /* These enum values are expressly chosen to be equal to the subgroup
- * size that they require.
- */
- required_dispatch_width = (unsigned)key->base.subgroup_size_type;
}
- if (nir->info.cs.subgroup_size > 0) {
- assert(required_dispatch_width == 0 ||
- required_dispatch_width == nir->info.cs.subgroup_size);
- required_dispatch_width = nir->info.cs.subgroup_size;
- }
+ const unsigned required_dispatch_width =
+ brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
- if (required_dispatch_width > 0) {
- assert(required_dispatch_width == 8 ||
- required_dispatch_width == 16 ||
- required_dispatch_width == 32);
- if (required_dispatch_width < min_dispatch_width ||
- required_dispatch_width > max_dispatch_width) {
- params->error_str = ralloc_strdup(mem_ctx,
- "Cannot satisfy explicit subgroup size");
- return NULL;
- }
- min_dispatch_width = max_dispatch_width = required_dispatch_width;
- }
+ fs_visitor *v[3] = {0};
+ const char *error[3] = {0};
- assert(min_dispatch_width <= max_dispatch_width);
+ for (unsigned simd = 0; simd < 3; simd++) {
+ if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, prog_data,
+ required_dispatch_width, &error[simd]))
+ continue;
- fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
- fs_visitor *v = NULL;
-
- if (!INTEL_DEBUG(DEBUG_NO8) &&
- min_dispatch_width <= 8 && max_dispatch_width >= 8) {
- nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
- nir, 8, debug_enabled);
- v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base,
- nir8, 8, shader_time_index, debug_enabled);
- if (!v8->run_cs(true /* allow_spilling */)) {
- params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
- delete v8;
- return NULL;
- }
+ const unsigned dispatch_width = 8u << simd;
- /* We should always be able to do SIMD32 for compute shaders */
- assert(v8->max_dispatch_width >= 32);
+ nir_shader *shader = nir_shader_clone(mem_ctx, nir);
+ brw_nir_apply_key(shader, compiler, &key->base,
+ dispatch_width, true /* is_scalar */);
- v = v8;
- prog_data->prog_mask |= 1 << 0;
- if (v8->spilled_any_registers)
- prog_data->prog_spilled |= 1 << 0;
- cs_fill_push_const_info(compiler->devinfo, prog_data);
- }
+ NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
- if (!INTEL_DEBUG(DEBUG_NO16) &&
- (generate_all || !prog_data->prog_spilled) &&
- min_dispatch_width <= 16 && max_dispatch_width >= 16) {
- /* Try a SIMD16 compile */
- nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
- nir, 16, debug_enabled);
- v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base,
- nir16, 16, shader_time_index, debug_enabled);
- if (v8)
- v16->import_uniforms(v8);
+ /* Clean up after the local index and ID calculations. */
+ NIR_PASS_V(shader, nir_opt_constant_folding);
+ NIR_PASS_V(shader, nir_opt_dce);
- const bool allow_spilling = generate_all || v == NULL;
- if (!v16->run_cs(allow_spilling)) {
- brw_shader_perf_log(compiler, params->log_data,
- "SIMD16 shader failed to compile: %s\n",
- v16->fail_msg);
- if (!v) {
- assert(v8 == NULL);
- params->error_str = ralloc_asprintf(
- mem_ctx, "Not enough threads for SIMD8 and "
- "couldn't generate SIMD16: %s", v16->fail_msg);
- delete v16;
- return NULL;
- }
- } else {
- /* We should always be able to do SIMD32 for compute shaders */
- assert(v16->max_dispatch_width >= 32);
+ brw_postprocess_nir(shader, compiler, true, debug_enabled,
+ key->base.robust_buffer_access);
- v = v16;
- prog_data->prog_mask |= 1 << 1;
- if (v16->spilled_any_registers)
- prog_data->prog_spilled |= 1 << 1;
- cs_fill_push_const_info(compiler->devinfo, prog_data);
+ v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
+ &prog_data->base, shader, dispatch_width,
+ debug_enabled);
+
+ if (prog_data->prog_mask) {
+ unsigned first = ffs(prog_data->prog_mask) - 1;
+ v[simd]->import_uniforms(v[first]);
}
- }
- /* The SIMD32 is only enabled for cases it is needed unless forced.
- *
- * TODO: Use performance_analysis and drop this boolean.
- */
- const bool needs_32 = v == NULL ||
- INTEL_DEBUG(DEBUG_DO32) ||
- generate_all;
-
- if (!INTEL_DEBUG(DEBUG_NO32) &&
- (generate_all || !prog_data->prog_spilled) &&
- needs_32 &&
- min_dispatch_width <= 32 && max_dispatch_width >= 32) {
- /* Try a SIMD32 compile */
- nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
- nir, 32, debug_enabled);
- v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
- &prog_data->base,
- nir32, 32, shader_time_index, debug_enabled);
- if (v8)
- v32->import_uniforms(v8);
- else if (v16)
- v32->import_uniforms(v16);
-
- const bool allow_spilling = generate_all || v == NULL;
- if (!v32->run_cs(allow_spilling)) {
- brw_shader_perf_log(compiler, params->log_data,
- "SIMD32 shader failed to compile: %s\n",
- v32->fail_msg);
- if (!v) {
- assert(v8 == NULL);
- assert(v16 == NULL);
- params->error_str = ralloc_asprintf(
- mem_ctx, "Not enough threads for SIMD16 and "
- "couldn't generate SIMD32: %s", v32->fail_msg);
- delete v32;
- return NULL;
- }
- } else {
- v = v32;
- prog_data->prog_mask |= 1 << 2;
- if (v32->spilled_any_registers)
- prog_data->prog_spilled |= 1 << 2;
+ const bool allow_spilling = !prog_data->prog_mask ||
+ nir->info.workgroup_size_variable;
+
+ if (v[simd]->run_cs(allow_spilling)) {
+ /* We should always be able to do SIMD32 for compute shaders. */
+ assert(v[simd]->max_dispatch_width >= 32);
+
cs_fill_push_const_info(compiler->devinfo, prog_data);
+
+ brw_simd_mark_compiled(simd, prog_data, v[simd]->spilled_any_registers);
+ } else {
+ error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
+ if (simd > 0) {
+ brw_shader_perf_log(compiler, params->log_data,
+ "SIMD%u shader failed to compile: %s\n",
+ dispatch_width, v[simd]->fail_msg);
+ }
}
}
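/* Illustrative sketch (not part of this diff): the per-variant bookkeeping
 * that brw_simd_mark_compiled() and the prog_mask/prog_spilled bitfields
 * express, mirroring the removed open-coded version. Bit n corresponds to
 * the SIMD(8 << n) variant. Names are hypothetical.
 */
#include <stdint.h>

struct simd_bookkeeping_sketch {
   uint32_t prog_mask;     /* bit n set: SIMD(8 << n) was compiled      */
   uint32_t prog_spilled;  /* bit n set: that variant spilled registers */
};

static inline void
mark_compiled_sketch(struct simd_bookkeeping_sketch *s, unsigned simd,
                     bool spilled)
{
   s->prog_mask |= 1u << simd;
   if (spilled)
      s->prog_spilled |= 1u << simd;
}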
- if (unlikely(!v) && INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32)) {
- params->error_str =
- ralloc_strdup(mem_ctx,
- "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
+ const int selected_simd = brw_simd_select(prog_data);
+ if (selected_simd < 0) {
+ params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
+ error[0], error[1], error[2]);
return NULL;
}
- assert(v);
+ assert(selected_simd < 3);
+ fs_visitor *selected = v[selected_simd];
+
+ if (!nir->info.workgroup_size_variable)
+ prog_data->prog_mask = 1 << selected_simd;
const unsigned *ret = NULL;
fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
- v->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
+ selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
if (unlikely(debug_enabled)) {
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
nir->info.label ?
@@ -10305,84 +10304,27 @@ brw_compile_cs(const struct brw_compiler *compiler,
}
struct brw_compile_stats *stats = params->stats;
- if (generate_all) {
- if (prog_data->prog_mask & (1 << 0)) {
- assert(v8);
- prog_data->prog_offset[0] =
- g.generate_code(v8->cfg, 8, v8->shader_stats,
- v8->performance_analysis.require(), stats);
- stats = stats ? stats + 1 : NULL;
- }
-
- if (prog_data->prog_mask & (1 << 1)) {
- assert(v16);
- prog_data->prog_offset[1] =
- g.generate_code(v16->cfg, 16, v16->shader_stats,
- v16->performance_analysis.require(), stats);
- stats = stats ? stats + 1 : NULL;
- }
-
- if (prog_data->prog_mask & (1 << 2)) {
- assert(v32);
- prog_data->prog_offset[2] =
- g.generate_code(v32->cfg, 32, v32->shader_stats,
- v32->performance_analysis.require(), stats);
+ for (unsigned simd = 0; simd < 3; simd++) {
+ if (prog_data->prog_mask & (1u << simd)) {
+ assert(v[simd]);
+ prog_data->prog_offset[simd] =
+ g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
+ v[simd]->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
}
- } else {
- /* Only one dispatch width will be valid, and will be at offset 0,
- * which is already the default value of prog_offset_* fields.
- */
- prog_data->prog_mask = 1 << (v->dispatch_width / 16);
- g.generate_code(v->cfg, v->dispatch_width, v->shader_stats,
- v->performance_analysis.require(), stats);
}
g.add_const_data(nir->constant_data, nir->constant_data_size);
ret = g.get_assembly();
- delete v8;
- delete v16;
- delete v32;
+ delete v[0];
+ delete v[1];
+ delete v[2];
return ret;
}
-static unsigned
-brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
- const struct brw_cs_prog_data *cs_prog_data,
- unsigned group_size)
-{
- const unsigned mask = cs_prog_data->prog_mask;
- assert(mask != 0);
-
- static const unsigned simd8 = 1 << 0;
- static const unsigned simd16 = 1 << 1;
- static const unsigned simd32 = 1 << 2;
-
- if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32))
- return 32;
-
- const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
-
- if ((mask & simd8) && group_size <= 8 * max_threads) {
- /* Prefer SIMD16 if can do without spilling. Matches logic in
- * brw_compile_cs.
- */
- if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
- return 16;
- return 8;
- }
-
- if ((mask & simd16) && group_size <= 16 * max_threads)
- return 16;
-
- assert(mask & simd32);
- assert(group_size <= 32 * max_threads);
- return 32;
-}
-
struct brw_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
@@ -10394,9 +10336,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
override_local_size ? override_local_size :
prog_data->local_size;
+ const int simd =
+ override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
+ brw_simd_select(prog_data);
+ assert(simd >= 0 && simd < 3);
+
info.group_size = sizes[0] * sizes[1] * sizes[2];
- info.simd_size =
- brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
+ info.simd_size = 8u << simd;
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
const uint32_t remainder = info.group_size & (info.simd_size - 1);
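/* Illustrative sketch (not part of this diff): the dispatch math above for a
 * selected SIMD index. For example, simd == 1 and a 100-invocation workgroup
 * gives a SIMD16 dispatch of ceil(100 / 16) == 7 threads with a 4-lane
 * remainder in the last thread.
 */
#include <stdint.h>

static inline void
dispatch_math_sketch(int simd, uint32_t group_size,
                     uint32_t *simd_size, uint32_t *threads,
                     uint32_t *remainder)
{
   *simd_size = 8u << simd;                                  /* 8, 16 or 32 */
   *threads   = (group_size + *simd_size - 1) / *simd_size;  /* DIV_ROUND_UP */
   *remainder = group_size & (*simd_size - 1);
}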
@@ -10437,7 +10383,7 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data,
if (!INTEL_DEBUG(DEBUG_NO8)) {
v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
&prog_data->base, shader,
- 8, -1 /* shader time */, debug_enabled);
+ 8, debug_enabled);
const bool allow_spilling = true;
if (!v8->run_bs(allow_spilling)) {
if (error_str)
@@ -10455,7 +10401,7 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data,
if (!has_spilled && !INTEL_DEBUG(DEBUG_NO16)) {
v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
&prog_data->base, shader,
- 16, -1 /* shader time */, debug_enabled);
+ 16, debug_enabled);
const bool allow_spilling = (v == NULL);
if (!v16->run_bs(allow_spilling)) {
brw_shader_perf_log(compiler, log_data,
@@ -10517,23 +10463,23 @@ brw_bsr(const struct intel_device_info *devinfo,
}
const unsigned *
-brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
+brw_compile_bs(const struct brw_compiler *compiler,
void *mem_ctx,
- const struct brw_bs_prog_key *key,
- struct brw_bs_prog_data *prog_data,
- nir_shader *shader,
- unsigned num_resume_shaders,
- struct nir_shader **resume_shaders,
- struct brw_compile_stats *stats,
- char **error_str)
+ struct brw_compile_bs_params *params)
{
+ nir_shader *shader = params->nir;
+ struct brw_bs_prog_data *prog_data = params->prog_data;
+ unsigned num_resume_shaders = params->num_resume_shaders;
+ nir_shader **resume_shaders = params->resume_shaders;
const bool debug_enabled = INTEL_DEBUG(DEBUG_RT);
prog_data->base.stage = shader->info.stage;
+ prog_data->base.ray_queries = shader->info.ray_queries;
prog_data->base.total_scratch = 0;
+
prog_data->max_stack_size = 0;
- fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
+ fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
false, shader->info.stage);
if (unlikely(debug_enabled)) {
char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",
@@ -10545,8 +10491,9 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
}
prog_data->simd_size =
- compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
- shader, &g, stats, NULL, error_str);
+ compile_single_bs(compiler, params->log_data, mem_ctx,
+ params->key, prog_data,
+ shader, &g, params->stats, NULL, &params->error_str);
if (prog_data->simd_size == 0)
return NULL;
@@ -10564,8 +10511,9 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
/* TODO: Figure out shader stats etc. for resume shaders */
int offset = 0;
uint8_t simd_size =
- compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
- resume_shaders[i], &g, NULL, &offset, error_str);
+ compile_single_bs(compiler, params->log_data, mem_ctx, params->key,
+ prog_data, resume_shaders[i], &g, NULL, &offset,
+ &params->error_str);
if (simd_size == 0)
return NULL;
@@ -10623,7 +10571,7 @@ brw_fs_test_dispatch_packing(const fs_builder &bld)
unsigned
fs_visitor::workgroup_size() const
{
- assert(stage == MESA_SHADER_COMPUTE);
+ assert(gl_shader_stage_uses_workgroup(stage));
const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}