author     Jonathan Gray <jsg@cvs.openbsd.org>   2022-09-02 05:47:02 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>   2022-09-02 05:47:02 +0000
commit     0dbbf1e0708df85a357d70e2708c0a11aeb5480e (patch)
tree       6656ff8eb8b15a2fc1c02888973caf618388cfd0 /lib/mesa/src/intel/compiler/brw_fs.cpp
parent     5f66494d31f735486b8222ecfa0a0c9046e92543 (diff)
Merge Mesa 22.1.7
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r--  lib/mesa/src/intel/compiler/brw_fs.cpp | 1876
1 file changed, 912 insertions, 964 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp index bb6e1e338..3172a79fe 100644 --- a/lib/mesa/src/intel/compiler/brw_fs.cpp +++ b/lib/mesa/src/intel/compiler/brw_fs.cpp @@ -36,6 +36,7 @@ #include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" #include "brw_dead_control_flow.h" +#include "brw_private.h" #include "dev/intel_debug.h" #include "compiler/glsl_types.h" #include "compiler/nir/nir_builder.h" @@ -220,7 +221,6 @@ fs_inst::is_send_from_grf() const { switch (opcode) { case SHADER_OPCODE_SEND: - case SHADER_OPCODE_SHADER_TIME_ADD: case FS_OPCODE_INTERPOLATE_AT_SAMPLE: case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: @@ -309,7 +309,6 @@ fs_inst::is_payload(unsigned arg) const case VEC4_OPCODE_UNTYPED_SURFACE_READ: case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: - case SHADER_OPCODE_SHADER_TIME_ADD: case FS_OPCODE_INTERPOLATE_AT_SAMPLE: case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: case SHADER_OPCODE_INTERLOCK: @@ -590,83 +589,6 @@ fs_visitor::get_timestamp(const fs_builder &bld) } void -fs_visitor::emit_shader_time_begin() -{ - /* We want only the low 32 bits of the timestamp. Since it's running - * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, - * which is plenty of time for our purposes. It is identical across the - * EUs, but since it's tracking GPU core speed it will increment at a - * varying rate as render P-states change. - */ - shader_start_time = component( - get_timestamp(bld.annotate("shader time start")), 0); -} - -void -fs_visitor::emit_shader_time_end() -{ - /* Insert our code just before the final SEND with EOT. */ - exec_node *end = this->instructions.get_tail(); - assert(end && ((fs_inst *) end)->eot); - const fs_builder ibld = bld.annotate("shader time end") - .exec_all().at(NULL, end); - const fs_reg timestamp = get_timestamp(ibld); - - /* We only use the low 32 bits of the timestamp - see - * emit_shader_time_begin()). - * - * We could also check if render P-states have changed (or anything - * else that might disrupt timing) by setting smear to 2 and checking if - * that field is != 0. - */ - const fs_reg shader_end_time = component(timestamp, 0); - - /* Check that there weren't any timestamp reset events (assuming these - * were the only two timestamp reads that happened). - */ - const fs_reg reset = component(timestamp, 2); - set_condmod(BRW_CONDITIONAL_Z, - ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u))); - ibld.IF(BRW_PREDICATE_NORMAL); - - fs_reg start = shader_start_time; - start.negate = true; - const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1), - BRW_REGISTER_TYPE_UD), - 0); - const fs_builder cbld = ibld.group(1, 0); - cbld.group(1, 0).ADD(diff, start, shader_end_time); - - /* If there were no instructions between the two timestamp gets, the diff - * is 2 cycles. Remove that overhead, so I can forget about that when - * trying to determine the time taken for single instructions. 
- */ - cbld.ADD(diff, diff, brw_imm_ud(-2u)); - SHADER_TIME_ADD(cbld, 0, diff); - SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u)); - ibld.emit(BRW_OPCODE_ELSE); - SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u)); - ibld.emit(BRW_OPCODE_ENDIF); -} - -void -fs_visitor::SHADER_TIME_ADD(const fs_builder &bld, - int shader_time_subindex, - fs_reg value) -{ - int index = shader_time_index * 3 + shader_time_subindex; - struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE); - - fs_reg payload; - if (dispatch_width == 8) - payload = vgrf(glsl_type::uvec2_type); - else - payload = vgrf(glsl_type::uint_type); - - bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value); -} - -void fs_visitor::vfail(const char *format, va_list va) { char *msg; @@ -777,6 +699,7 @@ fs_inst::components_read(unsigned i) const case FS_OPCODE_TXB_LOGICAL: case SHADER_OPCODE_TXF_CMS_LOGICAL: case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: case SHADER_OPCODE_TXF_UMS_LOGICAL: case SHADER_OPCODE_TXF_MCS_LOGICAL: case SHADER_OPCODE_LOD_LOGICAL: @@ -796,9 +719,14 @@ fs_inst::components_read(unsigned i) const else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) return 2; /* MCS */ - else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) - return 2; - else + else if (i == TEX_LOGICAL_SRC_MCS) { + if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + return 2; + else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) + return 4; + else + return 1; + } else return 1; case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: @@ -1130,6 +1058,7 @@ fs_inst::flags_written(const intel_device_info *devinfo) const opcode == FS_OPCODE_FB_WRITE) { return flag_mask(this, 1); } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || + opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL || opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) { return flag_mask(this, 32); } else { @@ -1227,7 +1156,6 @@ void fs_visitor::import_uniforms(fs_visitor *v) { this->push_constant_loc = v->push_constant_loc; - this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; this->subgroup_id = v->subgroup_id; for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++) @@ -1301,17 +1229,17 @@ centroid_to_pixel(enum brw_barycentric_mode bary) return (enum brw_barycentric_mode) ((unsigned) bary - 1); } -fs_reg * +fs_reg fs_visitor::emit_frontfacing_interpolation() { - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); + fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D); if (devinfo->ver >= 12) { fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W); bld.ASR(tmp, g1, brw_imm_d(15)); - bld.NOT(*reg, tmp); + bld.NOT(ff, tmp); } else if (devinfo->ver >= 6) { /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create * a boolean result from this (~0/true or 0/false). @@ -1327,7 +1255,7 @@ fs_visitor::emit_frontfacing_interpolation() fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); g0.negate = true; - bld.ASR(*reg, g0, brw_imm_d(15)); + bld.ASR(ff, g0, brw_imm_d(15)); } else { /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create * a boolean result from this (1/true or 0/false). 
@@ -1342,45 +1270,32 @@ fs_visitor::emit_frontfacing_interpolation() fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); g1_6.negate = true; - bld.ASR(*reg, g1_6, brw_imm_d(31)); + bld.ASR(ff, g1_6, brw_imm_d(31)); } - return reg; + return ff; } -void -fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) +fs_reg +fs_visitor::emit_samplepos_setup() { assert(stage == MESA_SHADER_FRAGMENT); struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); - assert(dst.type == BRW_REGISTER_TYPE_F); + assert(devinfo->ver >= 6); - if (wm_prog_data->persample_dispatch) { - /* Convert int_sample_pos to floating point */ - bld.MOV(dst, int_sample_pos); - /* Scale to the range [0, 1] */ - bld.MUL(dst, dst, brw_imm_f(1 / 16.0f)); - } - else { + const fs_builder abld = bld.annotate("compute sample position"); + fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2); + + if (!wm_prog_data->persample_dispatch) { /* From ARB_sample_shading specification: * "When rendering to a non-multisample buffer, or if multisample * rasterization is disabled, gl_SamplePosition will always be * (0.5, 0.5). */ - bld.MOV(dst, brw_imm_f(0.5f)); + bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f)); + bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f)); + return pos; } -} - -fs_reg * -fs_visitor::emit_samplepos_setup() -{ - assert(devinfo->ver >= 6); - - const fs_builder abld = bld.annotate("compute sample position"); - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type)); - fs_reg pos = *reg; - fs_reg int_sample_x = vgrf(glsl_type::int_type); - fs_reg int_sample_y = vgrf(glsl_type::int_type); /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16 * mode will be enabled. @@ -1396,17 +1311,20 @@ fs_visitor::emit_samplepos_setup() const fs_reg sample_pos_reg = fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W); - /* Compute gl_SamplePosition.x */ - abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0)); - compute_sample_position(offset(pos, abld, 0), int_sample_x); + for (unsigned i = 0; i < 2; i++) { + fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D); + abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i)); + /* Convert int_sample_pos to floating point */ + fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F); + abld.MOV(tmp_f, tmp_d); + /* Scale to the range [0, 1] */ + abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f)); + } - /* Compute gl_SamplePosition.y */ - abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1)); - compute_sample_position(offset(pos, abld, 1), int_sample_y); - return reg; + return pos; } -fs_reg * +fs_reg fs_visitor::emit_sampleid_setup() { assert(stage == MESA_SHADER_FRAGMENT); @@ -1414,14 +1332,14 @@ fs_visitor::emit_sampleid_setup() assert(devinfo->ver >= 6); const fs_builder abld = bld.annotate("compute sample id"); - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type)); + fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD); if (!key->multisample_fbo) { /* As per GL_ARB_sample_shading specification: * "When rendering to a non-multisample buffer, or if multisample * rasterization is disabled, gl_SampleID will always be zero." 
*/ - abld.MOV(*reg, brw_imm_d(0)); + abld.MOV(sample_id, brw_imm_d(0)); } else if (devinfo->ver >= 8) { /* Sample ID comes in as 4-bit numbers in g1.0: * @@ -1461,7 +1379,7 @@ fs_visitor::emit_sampleid_setup() brw_imm_v(0x44440000)); } - abld.AND(*reg, tmp, brw_imm_w(0xf)); + abld.AND(sample_id, tmp, brw_imm_w(0xf)); } else { const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0); const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW); @@ -1507,20 +1425,20 @@ fs_visitor::emit_sampleid_setup() /* This special instruction takes care of setting vstride=1, * width=4, hstride=0 of t2 during an ADD instruction. */ - abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); + abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2); } - return reg; + return sample_id; } -fs_reg * +fs_reg fs_visitor::emit_samplemaskin_setup() { assert(stage == MESA_SHADER_FRAGMENT); struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); assert(devinfo->ver >= 6); - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D); /* The HW doesn't provide us with expected values. */ assert(!wm_prog_data->per_coarse_pixel_dispatch); @@ -1542,28 +1460,27 @@ fs_visitor::emit_samplemaskin_setup() const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) - nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); fs_reg one = vgrf(glsl_type::int_type); fs_reg enabled_mask = vgrf(glsl_type::int_type); abld.MOV(one, brw_imm_d(1)); abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); - abld.AND(*reg, enabled_mask, coverage_mask); + abld.AND(mask, enabled_mask, coverage_mask); } else { /* In per-pixel mode, the coverage mask is sufficient. */ - *reg = coverage_mask; + mask = coverage_mask; } - return reg; + return mask; } -fs_reg * +fs_reg fs_visitor::emit_shading_rate_setup() { assert(devinfo->ver >= 11); const fs_builder abld = bld.annotate("compute fragment shading rate"); - - fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD)); + fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD); struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(bld.shader->stage_prog_data); @@ -1589,12 +1506,12 @@ fs_visitor::emit_shading_rate_setup() abld.SHR(int_rate_y, actual_y, brw_imm_ud(1)); abld.SHR(int_rate_x, actual_x, brw_imm_ud(1)); abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2)); - abld.OR(*reg, int_rate_x, int_rate_y); + abld.OR(rate, int_rate_x, int_rate_y); } else { - abld.MOV(*reg, brw_imm_ud(0)); + abld.MOV(rate, brw_imm_ud(0)); } - return reg; + return rate; } fs_reg @@ -1674,9 +1591,9 @@ fs_visitor::assign_curb_setup() prog_data->curb_read_length = uniform_push_length + ubo_push_length; uint64_t used = 0; + bool is_compute = gl_shader_stage_is_compute(stage); - if (stage == MESA_SHADER_COMPUTE && - brw_cs_prog_data(prog_data)->uses_inline_data) { + if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) { /* With COMPUTE_WALKER, we can push up to one register worth of data via * the inline data parameter in the COMPUTE_WALKER command itself. 
* @@ -1684,7 +1601,7 @@ fs_visitor::assign_curb_setup() */ assert(devinfo->verx10 >= 125); assert(uniform_push_length <= 1); - } else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) { + } else if (is_compute && devinfo->verx10 >= 125) { fs_builder ubld = bld.exec_all().group(8, 0).at( cfg->first_block(), cfg->first_block()->start()); @@ -1794,7 +1711,6 @@ fs_visitor::assign_curb_setup() uint64_t want_zero = used & stage_prog_data->zero_push_reg; if (want_zero) { - assert(!compiler->compact_params); fs_builder ubld = bld.exec_all().group(8, 0).at( cfg->first_block(), cfg->first_block()->start()); @@ -1843,6 +1759,10 @@ fs_visitor::assign_curb_setup() void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data) { + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + /* Make sure uint8_t is sufficient */ STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff); uint8_t index = 0; @@ -1858,16 +1778,76 @@ static void calculate_urb_setup(const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - const nir_shader *nir) + const nir_shader *nir, + const struct brw_mue_map *mue_map) { memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); int urb_next = 0; + + const uint64_t inputs_read = + nir->info.inputs_read & ~nir->info.per_primitive_inputs; + /* Figure out where each of the incoming setup attributes lands. */ - if (devinfo->ver >= 6) { - if (util_bitcount64(nir->info.inputs_read & - BRW_FS_VARYING_INPUT_MASK) <= 16) { + if (mue_map) { + /* Per-Primitive Attributes are laid out by Hardware before the regular + * attributes, so order them like this to make easy later to map setup + * into real HW registers. + */ + if (nir->info.per_primitive_inputs) { + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (nir->info.per_primitive_inputs & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + + /* The actual setup attributes later must be aligned to a full GRF. */ + urb_next = ALIGN(urb_next, 2); + + prog_data->num_per_primitive_inputs = urb_next; + } + + const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | + VARYING_BIT_CLIP_DIST1; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + if (inputs_read & clip_dist_bits) { + assert(mue_map->per_vertex_header_size_dw > 8); + unique_fs_attrs &= ~clip_dist_bits; + } + + /* In Mesh, CLIP_DIST slots are always at the beginning, because + * they come from MUE Vertex Header, not Per-Vertex Attributes. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; + } + + /* Per-Vertex attributes are laid out ordered. Because we always link + * Mesh and Fragment shaders, the which slots are written and read by + * each of them will match. */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (unique_fs_attrs & BITFIELD64_BIT(i)) + prog_data->urb_setup[i] = urb_next++; + } + } else if (devinfo->ver >= 6) { + uint64_t vue_header_bits = + VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + /* VUE header fields all live in the same URB slot, so we pass them + * as a single FS input attribute. We want to only count them once. 
+ */ + if (inputs_read & vue_header_bits) { + unique_fs_attrs &= ~vue_header_bits; + unique_fs_attrs |= VARYING_BIT_PSIZ; + } + + if (util_bitcount64(unique_fs_attrs) <= 16) { /* The SF/SBE pipeline stage can do arbitrary rearrangement of the * first 16 varying inputs, so we can put them wherever we want. * Just put them in order. @@ -1876,9 +1856,22 @@ calculate_urb_setup(const struct intel_device_info *devinfo, * fragment shader won't take up valuable register space, and (b) we * won't have to recompile the fragment shader if it gets paired with * a different vertex (or geometry) shader. + * + * VUE header fields share the same FS input attribute. */ + if (inputs_read & vue_header_bits) { + if (inputs_read & VARYING_BIT_PSIZ) + prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next; + if (inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next; + if (inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next; + + urb_next++; + } + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { - if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & BITFIELD64_BIT(i)) { prog_data->urb_setup[i] = urb_next++; } @@ -1900,7 +1893,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo, nir->info.separate_shader, 1); int first_slot = - brw_compute_first_urb_slot_required(nir->info.inputs_read, + brw_compute_first_urb_slot_required(inputs_read, &prev_stage_vue_map); assert(prev_stage_vue_map.num_slots <= first_slot + 32); @@ -1908,7 +1901,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo, slot++) { int varying = prev_stage_vue_map.slot_to_varying[slot]; if (varying != BRW_VARYING_SLOT_PAD && - (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & + (inputs_read & BRW_FS_VARYING_INPUT_MASK & BITFIELD64_BIT(varying))) { prog_data->urb_setup[varying] = slot - first_slot; } @@ -1941,12 +1934,12 @@ calculate_urb_setup(const struct intel_device_info *devinfo, * * See compile_sf_prog() for more info. */ - if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; } - prog_data->num_varying_inputs = urb_next; - prog_data->inputs = nir->info.inputs_read; + prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->inputs = inputs_read; brw_compute_urb_setup_index(prog_data); } @@ -1988,6 +1981,12 @@ fs_visitor::assign_urb_setup() /* Each attribute is 4 setup channels, each of which is half a reg. */ this->first_non_payload_grf += prog_data->num_varying_inputs * 2; + + /* Unlike regular attributes, per-primitive attributes have all 4 channels + * in the same slot, so each GRF can store two slots. + */ + assert(prog_data->num_per_primitive_inputs % 2 == 0); + this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2; } void @@ -2095,22 +2094,17 @@ fs_visitor::assign_gs_urb_setup() /** * Split large virtual GRFs into separate components if we can. * - * This is mostly duplicated with what brw_fs_vector_splitting does, - * but that's really conservative because it's afraid of doing - * splitting that doesn't result in real progress after the rest of - * the optimization phases, which would cause infinite looping in - * optimization. We can do it once here, safely. This also has the - * opportunity to split interpolated values, or maybe even uniforms, - * which we don't have at the IR level. 
- * - * We want to split, because virtual GRFs are what we register - * allocate and spill (due to contiguousness requirements for some - * instructions), and they're what we naturally generate in the - * codegen process, but most virtual GRFs don't actually need to be - * contiguous sets of GRFs. If we split, we'll end up with reduced - * live intervals and better dead code elimination and coalescing. + * This pass aggressively splits VGRFs into as small a chunks as possible, + * down to single registers if it can. If no VGRFs can be split, we return + * false so this pass can safely be used inside an optimization loop. We + * want to split, because virtual GRFs are what we register allocate and + * spill (due to contiguousness requirements for some instructions), and + * they're what we naturally generate in the codegen process, but most + * virtual GRFs don't actually need to be contiguous sets of GRFs. If we + * split, we'll end up with reduced live intervals and better dead code + * elimination and coalescing. */ -void +bool fs_visitor::split_virtual_grfs() { /* Compact the register file so we eliminate dead vgrfs. This @@ -2181,10 +2175,15 @@ fs_visitor::split_virtual_grfs() } } + /* Bitset of which registers have been split */ + bool *vgrf_has_split = new bool[num_vars]; + memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split)); + int *new_virtual_grf = new int[reg_count]; int *new_reg_offset = new int[reg_count]; int reg = 0; + bool has_splits = false; for (int i = 0; i < num_vars; i++) { /* The first one should always be 0 as a quick sanity check. */ assert(split_points[reg] == false); @@ -2200,6 +2199,8 @@ fs_visitor::split_virtual_grfs() * new virtual GRF for the previous offset many registers */ if (split_points[reg]) { + has_splits = true; + vgrf_has_split[i] = true; assert(offset <= MAX_VGRF_SIZE); int grf = alloc.allocate(offset); for (int k = reg - offset; k < reg; k++) @@ -2219,42 +2220,74 @@ fs_visitor::split_virtual_grfs() } assert(reg == reg_count); + bool progress; + if (!has_splits) { + progress = false; + goto cleanup; + } + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { if (inst->opcode == SHADER_OPCODE_UNDEF) { - const fs_builder ibld(this, block, inst); - assert(inst->size_written % REG_SIZE == 0); - unsigned reg_offset = 0; - while (reg_offset < inst->size_written / REG_SIZE) { - reg = vgrf_to_reg[inst->dst.nr] + reg_offset; - ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type)); - reg_offset += alloc.sizes[new_virtual_grf[reg]]; + assert(inst->dst.file == VGRF); + if (vgrf_has_split[inst->dst.nr]) { + const fs_builder ibld(this, block, inst); + assert(inst->size_written % REG_SIZE == 0); + unsigned reg_offset = 0; + while (reg_offset < inst->size_written / REG_SIZE) { + reg = vgrf_to_reg[inst->dst.nr] + reg_offset; + ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type)); + reg_offset += alloc.sizes[new_virtual_grf[reg]]; + } + inst->remove(block); + } else { + reg = vgrf_to_reg[inst->dst.nr]; + assert(new_reg_offset[reg] == 0); + assert(new_virtual_grf[reg] == (int)inst->dst.nr); } - inst->remove(block); continue; } if (inst->dst.file == VGRF) { reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; - inst->dst.nr = new_virtual_grf[reg]; - inst->dst.offset = new_reg_offset[reg] * REG_SIZE + - inst->dst.offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + if (vgrf_has_split[inst->dst.nr]) { + inst->dst.nr = new_virtual_grf[reg]; + inst->dst.offset = new_reg_offset[reg] * 
REG_SIZE + + inst->dst.offset % REG_SIZE; + assert((unsigned)new_reg_offset[reg] < + alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE); + assert(new_virtual_grf[reg] == (int)inst->dst.nr); + } } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == VGRF) { - reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + if (inst->src[i].file != VGRF) + continue; + + reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + if (vgrf_has_split[inst->src[i].nr]) { inst->src[i].nr = new_virtual_grf[reg]; inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + inst->src[i].offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + assert((unsigned)new_reg_offset[reg] < + alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE); + assert(new_virtual_grf[reg] == (int)inst->src[i].nr); } } } invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); + progress = true; + +cleanup: delete[] split_points; + delete[] vgrf_has_split; delete[] new_virtual_grf; delete[] new_reg_offset; + + return progress; } /** @@ -2351,109 +2384,6 @@ get_subgroup_id_param_index(const intel_device_info *devinfo, } /** - * Struct for handling complex alignments. - * - * A complex alignment is stored as multiplier and an offset. A value is - * considered to be aligned if it is {offset} larger than a multiple of {mul}. - * For instance, with an alignment of {8, 2}, cplx_align_apply would do the - * following: - * - * N | cplx_align_apply({8, 2}, N) - * ----+----------------------------- - * 4 | 6 - * 6 | 6 - * 8 | 14 - * 10 | 14 - * 12 | 14 - * 14 | 14 - * 16 | 22 - */ -struct cplx_align { - unsigned mul:4; - unsigned offset:4; -}; - -#define CPLX_ALIGN_MAX_MUL 8 - -static void -cplx_align_assert_sane(struct cplx_align a) -{ - assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul)); - assert(a.offset < a.mul); -} - -/** - * Combines two alignments to produce a least multiple of sorts. - * - * The returned alignment is the smallest (in terms of multiplier) such that - * anything aligned to both a and b will be aligned to the new alignment. - * This function will assert-fail if a and b are not compatible, i.e. if the - * offset parameters are such that no common alignment is possible. - */ -static struct cplx_align -cplx_align_combine(struct cplx_align a, struct cplx_align b) -{ - cplx_align_assert_sane(a); - cplx_align_assert_sane(b); - - /* Assert that the alignments agree. */ - assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); - - return a.mul > b.mul ? a : b; -} - -/** - * Apply a complex alignment - * - * This function will return the smallest number greater than or equal to - * offset that is aligned to align. 
- */ -static unsigned -cplx_align_apply(struct cplx_align align, unsigned offset) -{ - return ALIGN(offset - align.offset, align.mul) + align.offset; -} - -#define UNIFORM_SLOT_SIZE 4 - -struct uniform_slot_info { - /** True if the given uniform slot is live */ - unsigned is_live:1; - - /** True if this slot and the next slot must remain contiguous */ - unsigned contiguous:1; - - struct cplx_align align; -}; - -static void -mark_uniform_slots_read(struct uniform_slot_info *slots, - unsigned num_slots, unsigned alignment) -{ - assert(alignment > 0 && util_is_power_of_two_nonzero(alignment)); - assert(alignment <= CPLX_ALIGN_MAX_MUL); - - /* We can't align a slot to anything less than the slot size */ - alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); - - struct cplx_align align = {alignment, 0}; - cplx_align_assert_sane(align); - - for (unsigned i = 0; i < num_slots; i++) { - slots[i].is_live = true; - if (i < num_slots - 1) - slots[i].contiguous = true; - - align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); - if (slots[i].align.mul == 0) { - slots[i].align = align; - } else { - slots[i].align = cplx_align_combine(slots[i].align, align); - } - } -} - -/** * Assign UNIFORM file registers to either push constants or pull constants. * * We allow a fragment shader to have more than the specified minimum @@ -2466,197 +2396,12 @@ void fs_visitor::assign_constant_locations() { /* Only the first compile gets to decide on locations. */ - if (push_constant_loc) { - assert(pull_constant_loc); + if (push_constant_loc) return; - } - - if (compiler->compact_params) { - struct uniform_slot_info slots[uniforms + 1]; - memset(slots, 0, sizeof(slots)); - - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - for (int i = 0 ; i < inst->sources; i++) { - if (inst->src[i].file != UNIFORM) - continue; - /* NIR tightly packs things so the uniform number might not be - * aligned (if we have a double right after a float, for - * instance). This is fine because the process of re-arranging - * them will ensure that things are properly aligned. The offset - * into that uniform, however, must be aligned. - * - * In Vulkan, we have explicit offsets but everything is crammed - * into a single "variable" so inst->src[i].nr will always be 0. - * Everything will be properly aligned relative to that one base. - */ - assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); - - unsigned u = inst->src[i].nr + - inst->src[i].offset / UNIFORM_SLOT_SIZE; - - if (u >= uniforms) - continue; - - unsigned slots_read; - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); - } else { - unsigned bytes_read = inst->components_read(i) * - type_sz(inst->src[i].type); - slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); - } - - assert(u + slots_read <= uniforms); - mark_uniform_slots_read(&slots[u], slots_read, - type_sz(inst->src[i].type)); - } - } - - int subgroup_id_index = get_subgroup_id_param_index(devinfo, - stage_prog_data); - - /* Only allow 16 registers (128 uniform components) as push constants. - * - * Just demote the end of the list. We could probably do better - * here, demoting things that are rarely used in the program first. - * - * If changing this value, note the limitation about total_regs in - * brw_curbe.c. - */ - unsigned int max_push_components = 16 * 8; - if (subgroup_id_index >= 0) - max_push_components--; /* Save a slot for the thread ID */ - - /* We push small arrays, but no bigger than 16 floats. 
This is big - * enough for a vec4 but hopefully not large enough to push out other - * stuff. We should probably use a better heuristic at some point. - */ - const unsigned int max_chunk_size = 16; - - unsigned int num_push_constants = 0; - unsigned int num_pull_constants = 0; - - push_constant_loc = ralloc_array(mem_ctx, int, uniforms); - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - - /* Default to -1 meaning no location */ - memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); - memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); - - int chunk_start = -1; - struct cplx_align align; - for (unsigned u = 0; u < uniforms; u++) { - if (!slots[u].is_live) { - assert(chunk_start == -1); - continue; - } - - /* Skip subgroup_id_index to put it in the last push register. */ - if (subgroup_id_index == (int)u) - continue; - - if (chunk_start == -1) { - chunk_start = u; - align = slots[u].align; - } else { - /* Offset into the chunk */ - unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; - - /* Shift the slot alignment down by the chunk offset so it is - * comparable with the base chunk alignment. - */ - struct cplx_align slot_align = slots[u].align; - slot_align.offset = - (slot_align.offset - chunk_offset) & (align.mul - 1); - - align = cplx_align_combine(align, slot_align); - } - - /* Sanity check the alignment */ - cplx_align_assert_sane(align); - - if (slots[u].contiguous) - continue; - - /* Adjust the alignment to be in terms of slots, not bytes */ - assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); - assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); - align.mul /= UNIFORM_SLOT_SIZE; - align.offset /= UNIFORM_SLOT_SIZE; - - unsigned push_start_align = cplx_align_apply(align, num_push_constants); - unsigned chunk_size = u - chunk_start + 1; - if ((!compiler->supports_pull_constants && u < UBO_START) || - (chunk_size < max_chunk_size && - push_start_align + chunk_size <= max_push_components)) { - /* Align up the number of push constants */ - num_push_constants = push_start_align; - for (unsigned i = 0; i < chunk_size; i++) - push_constant_loc[chunk_start + i] = num_push_constants++; - } else { - /* We need to pull this one */ - num_pull_constants = cplx_align_apply(align, num_pull_constants); - for (unsigned i = 0; i < chunk_size; i++) - pull_constant_loc[chunk_start + i] = num_pull_constants++; - } - - /* Reset the chunk and start again */ - chunk_start = -1; - } - - /* Add the CS local thread ID uniform at the end of the push constants */ - if (subgroup_id_index >= 0) - push_constant_loc[subgroup_id_index] = num_push_constants++; - - /* As the uniforms are going to be reordered, stash the old array and - * create two new arrays for push/pull params. - */ - uint32_t *param = stage_prog_data->param; - stage_prog_data->nr_params = num_push_constants; - if (num_push_constants) { - stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, - num_push_constants); - } else { - stage_prog_data->param = NULL; - } - assert(stage_prog_data->nr_pull_params == 0); - assert(stage_prog_data->pull_param == NULL); - if (num_pull_constants > 0) { - stage_prog_data->nr_pull_params = num_pull_constants; - stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, - num_pull_constants); - } - - /* Up until now, the param[] array has been indexed by reg + offset - * of UNIFORM registers. Move pull constants into pull_param[] and - * condense param[] to only contain the uniforms we chose to push. 
- * - * NOTE: Because we are condensing the params[] array, we know that - * push_constant_loc[i] <= i and we can do it in one smooth loop without - * having to make a copy. - */ - for (unsigned int i = 0; i < uniforms; i++) { - uint32_t value = param[i]; - if (pull_constant_loc[i] != -1) { - stage_prog_data->pull_param[pull_constant_loc[i]] = value; - } else if (push_constant_loc[i] != -1) { - stage_prog_data->param[push_constant_loc[i]] = value; - } - } - ralloc_free(param); - } else { - /* If we don't want to compact anything, just set up dummy push/pull - * arrays. All the rest of the compiler cares about are these arrays. - */ - push_constant_loc = ralloc_array(mem_ctx, int, uniforms); - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - - for (unsigned u = 0; u < uniforms; u++) - push_constant_loc[u] = u; - - memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); - } + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + for (unsigned u = 0; u < uniforms; u++) + push_constant_loc[u] = u; /* Now that we know how many regular uniforms we'll push, reduce the * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits. @@ -2687,33 +2432,22 @@ fs_visitor::get_pull_locs(const fs_reg &src, { assert(src.file == UNIFORM); - if (src.nr >= UBO_START) { - const struct brw_ubo_range *range = - &prog_data->ubo_ranges[src.nr - UBO_START]; - - /* If this access is in our (reduced) range, use the push data. */ - if (src.offset / 32 < range->length) - return false; - - *out_surf_index = prog_data->binding_table.ubo_start + range->block; - *out_pull_index = (32 * range->start + src.offset) / 4; + if (src.nr < UBO_START) + return false; - prog_data->has_ubo_pull = true; - return true; - } + const struct brw_ubo_range *range = + &prog_data->ubo_ranges[src.nr - UBO_START]; - const unsigned location = src.nr + src.offset / 4; + /* If this access is in our (reduced) range, use the push data. */ + if (src.offset / 32 < range->length) + return false; - if (location < uniforms && pull_constant_loc[location] != -1) { - /* A regular uniform push constant */ - *out_surf_index = stage_prog_data->binding_table.pull_constants_start; - *out_pull_index = pull_constant_loc[location]; + *out_surf_index = range->block; + *out_pull_index = (32 * range->start + src.offset) / 4; - prog_data->has_ubo_pull = true; - return true; - } + prog_data->has_ubo_pull = true; - return false; + return true; } /** @@ -3831,7 +3565,7 @@ fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_ins void fs_visitor::insert_gfx4_send_dependency_workarounds() { - if (devinfo->ver != 4 || devinfo->is_g4x) + if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X) return; bool progress = false; @@ -4039,11 +3773,9 @@ fs_visitor::lower_load_payload() } for (uint8_t i = inst->header_size; i < inst->sources; i++) { + dst.type = inst->src[i].type; if (inst->src[i].file != BAD_FILE) { - dst.type = inst->src[i].type; ibld.MOV(dst, inst->src[i]); - } else { - dst.type = BRW_REGISTER_TYPE_UD; } dst = offset(dst, ibld, 1); } @@ -5169,6 +4901,49 @@ sampler_msg_type(const intel_device_info *devinfo, } } +/** + * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to + * the given requested_alignment_sz. 
+ */ +static fs_inst * +emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst, + const fs_reg *src, unsigned sources, + unsigned header_size, + unsigned requested_alignment_sz) +{ + unsigned length = 0; + unsigned num_srcs = + sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width()); + fs_reg *src_comps = new fs_reg[num_srcs]; + + for (unsigned i = 0; i < header_size; i++) + src_comps[length++] = src[i]; + + for (unsigned i = header_size; i < sources; i++) { + unsigned src_sz = + retype(dst, src[i].type).component_size(bld.dispatch_width()); + const enum brw_reg_type padding_payload_type = + brw_reg_type_from_bit_size(type_sz(src[i].type) * 8, + BRW_REGISTER_TYPE_UD); + + src_comps[length++] = src[i]; + + /* Expand the real sources if component of requested payload type is + * larger than real source component. + */ + if (src_sz < requested_alignment_sz) { + for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) { + src_comps[length++] = retype(fs_reg(), padding_payload_type); + } + } + } + + fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size); + delete[] src_comps; + + return inst; +} + static void lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &coordinate, @@ -5182,16 +4957,22 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &surface_handle, const fs_reg &sampler_handle, const fs_reg &tg4_offset, + unsigned payload_type_bit_size, unsigned coord_components, unsigned grad_components) { const intel_device_info *devinfo = bld.shader->devinfo; - const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data; + const enum brw_reg_type payload_type = + brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F); + const enum brw_reg_type payload_unsigned_type = + brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD); + const enum brw_reg_type payload_signed_type = + brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D); unsigned reg_width = bld.dispatch_width() / 8; unsigned header_size = 0, length = 0; fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE]; for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) - sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F); + sources[i] = bld.vgrf(payload_type); /* We must have exactly one of surface/sampler and surface/sampler_handle */ assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); @@ -5330,23 +5111,23 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; case SHADER_OPCODE_TXS: - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod); + bld.MOV(retype(sources[length], payload_unsigned_type), lod); length++; break; case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: /* We need an LOD; just use 0 */ - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0)); length++; break; case SHADER_OPCODE_TXF: /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. 
* On Gfx9 they are u, v, lod, r */ - bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate); + bld.MOV(retype(sources[length++], payload_signed_type), coordinate); if (devinfo->ver >= 9) { if (coord_components >= 2) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), + bld.MOV(retype(sources[length], payload_signed_type), offset(coordinate, bld, 1)); } else { sources[length] = brw_imm_d(0); @@ -5357,12 +5138,12 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, if (devinfo->ver >= 9 && lod.is_zero()) { op = SHADER_OPCODE_TXF_LZ; } else { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); + bld.MOV(retype(sources[length], payload_signed_type), lod); length++; } for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++) - bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + bld.MOV(retype(sources[length++], payload_signed_type), offset(coordinate, bld, i)); coordinate_done = true; @@ -5375,24 +5156,26 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); - length++; + bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index); } + /* Data from the multisample control surface. */ if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) { - /* Data from the multisample control surface. */ - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); - length++; + unsigned num_mcs_components = 1; - /* On Gfx9+ we'll use ld2dms_w instead which has two registers for - * the MCS data. + /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - + * Shared Functions - 3D Sampler - Messages - Message Format: + * + * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r */ - if (op == SHADER_OPCODE_TXF_CMS_W) { - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), - mcs.file == IMM ? - mcs : - offset(mcs, bld, 1)); - length++; + if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W) + num_mcs_components = 4; + else if (op == SHADER_OPCODE_TXF_CMS_W) + num_mcs_components = 2; + + for (unsigned i = 0; i < num_mcs_components; ++i) { + bld.MOV(retype(sources[length++], payload_unsigned_type), + mcs.file == IMM ? mcs : offset(mcs, bld, i)); } } @@ -5400,7 +5183,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, * texture coordinates. 
*/ for (unsigned i = 0; i < coord_components; i++) - bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + bld.MOV(retype(sources[length++], payload_signed_type), offset(coordinate, bld, i)); coordinate_done = true; @@ -5411,7 +5194,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, bld.MOV(sources[length++], offset(coordinate, bld, i)); for (unsigned i = 0; i < 2; i++) /* offu, offv */ - bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), + bld.MOV(retype(sources[length++], payload_signed_type), offset(tg4_offset, bld, i)); if (coord_components == 3) /* r if present */ @@ -5426,27 +5209,53 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, /* Set up the coordinate (except for cases where it was done above) */ if (!coordinate_done) { for (unsigned i = 0; i < coord_components; i++) - bld.MOV(sources[length++], offset(coordinate, bld, i)); + bld.MOV(retype(sources[length++], payload_type), + offset(coordinate, bld, i)); } if (min_lod.file != BAD_FILE) { /* Account for all of the missing coordinate sources */ - length += 4 - coord_components; - if (op == SHADER_OPCODE_TXD) - length += (3 - grad_components) * 2; + if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) { + /* On DG2 and newer platforms, sample_d can only be used with 1D and + * 2D surfaces, so the maximum number of gradient components is 2. + * In spite of this limitation, the Bspec lists a mysterious R + * component before the min_lod, so the maximum coordinate components + * is 3. + * + * Wa_1209978020 + */ + length += 3 - coord_components; + length += (2 - grad_components) * 2; + } else { + length += 4 - coord_components; + if (op == SHADER_OPCODE_TXD) + length += (3 - grad_components) * 2; + } bld.MOV(sources[length++], min_lod); } - unsigned mlen; - if (reg_width == 2) - mlen = length * reg_width - header_size; - else - mlen = length * reg_width; - - const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen), - BRW_REGISTER_TYPE_F); - bld.LOAD_PAYLOAD(src_payload, sources, length, header_size); + const fs_reg src_payload = + fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width), + BRW_REGISTER_TYPE_F); + /* In case of 16-bit payload each component takes one full register in + * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16 + * elements. In SIMD8H case hardware simply expects the components to be + * padded (i.e., aligned on reg boundary). + */ + fs_inst *load_payload_inst = + emit_load_payload_with_padding(bld, src_payload, sources, length, + header_size, REG_SIZE); + unsigned mlen = load_payload_inst->size_written / REG_SIZE; + unsigned simd_mode = 0; + if (payload_type_bit_size == 16) { + assert(devinfo->ver >= 11); + simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : + GFX10_SAMPLER_SIMD_MODE_SIMD16H; + } else { + simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : + BRW_SAMPLER_SIMD_MODE_SIMD16; + } /* Generate the SEND. */ inst->opcode = SHADER_OPCODE_SEND; @@ -5455,29 +5264,11 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, const unsigned msg_type = sampler_msg_type(devinfo, op, inst->shadow_compare); - const unsigned simd_mode = - inst->exec_size <= 8 ? 
BRW_SAMPLER_SIMD_MODE_SIMD8 : - BRW_SAMPLER_SIMD_MODE_SIMD16; - - uint32_t base_binding_table_index; - switch (op) { - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - base_binding_table_index = prog_data->binding_table.gather_texture_start; - break; - case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: - base_binding_table_index = prog_data->binding_table.image_start; - break; - default: - base_binding_table_index = prog_data->binding_table.texture_start; - break; - } inst->sfid = BRW_SFID_SAMPLER; if (surface.file == IMM && (sampler.file == IMM || sampler_handle.file != BAD_FILE)) { - inst->desc = brw_sampler_desc(devinfo, - surface.ud + base_binding_table_index, + inst->desc = brw_sampler_desc(devinfo, surface.ud, sampler.file == IMM ? sampler.ud % 16 : 0, msg_type, simd_mode, @@ -5533,8 +5324,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, ubld.OR(desc, desc, surface); } } - if (base_binding_table_index) - ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index)); ubld.AND(desc, desc, brw_imm_ud(0xfff)); inst->src[0] = component(desc, 0); @@ -5560,6 +5349,61 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE); } +static unsigned +get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, + opcode op, const fs_reg *src) +{ + unsigned src_type_size = 0; + + /* All sources need to have the same size, therefore seek the first valid + * and take the size from there. + */ + for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { + if (src[i].file != BAD_FILE) { + src_type_size = brw_reg_type_to_size(src[i].type); + break; + } + } + + assert(src_type_size == 2 || src_type_size == 4); + +#ifndef NDEBUG + /* Make sure all sources agree. On gfx12 this doesn't hold when sampling + * compressed multisampled surfaces. There the payload contains MCS data + * which is already in 16-bits unlike the other parameters that need forced + * conversion. + */ + if (devinfo->verx10 < 125 || + (op != SHADER_OPCODE_TXF_CMS_W && + op != SHADER_OPCODE_TXF_CMS)) { + for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { + assert(src[i].file == BAD_FILE || + brw_reg_type_to_size(src[i].type) == src_type_size); + } + } +#endif + + if (devinfo->verx10 < 125) + return src_type_size * 8; + + /* Force conversion from 32-bit sources to 16-bit payload. 
From the XeHP Bspec: + * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message + * Format [GFX12:HAS:1209977870] * + * + * ld2dms_w SIMD8H and SIMD16H Only + * ld_mcs SIMD8H and SIMD16H Only + * ld2dms REMOVEDBY(GEN:HAS:1406788836) + */ + + if (op == SHADER_OPCODE_TXF_CMS_W || + op == SHADER_OPCODE_TXF_CMS || + op == SHADER_OPCODE_TXF_UMS || + op == SHADER_OPCODE_TXF_MCS) + src_type_size = 2; + + return src_type_size * 8; +} + static void lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) { @@ -5582,12 +5426,19 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; if (devinfo->ver >= 7) { + const unsigned msg_payload_type_bit_size = + get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src); + + /* 16-bit payloads are available only on gfx11+ */ + assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11); + lower_sampler_logical_send_gfx7(bld, inst, op, coordinate, shadow_c, lod, lod2, min_lod, sample_index, mcs, surface, sampler, surface_handle, sampler_handle, tg4_offset, + msg_payload_type_bit_size, coord_components, grad_components); } else if (devinfo->ver >= 5) { lower_sampler_logical_send_gfx5(bld, inst, op, coordinate, @@ -5641,6 +5492,67 @@ emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) } } +void +fs_visitor::emit_is_helper_invocation(fs_reg result) +{ + /* Unlike the regular gl_HelperInvocation, that is defined at dispatch, + * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into + * consideration demoted invocations. + */ + result.type = BRW_REGISTER_TYPE_UD; + + bld.MOV(result, brw_imm_ud(0)); + + /* See sample_mask_reg() for why we split SIMD32 into SIMD16 here. */ + unsigned width = bld.dispatch_width(); + for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) { + const fs_builder b = bld.group(MIN2(width, 16), i); + + fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0)); + + /* The at() ensures that any code emitted to get the predicate happens + * before the mov right above. This is not an issue elsewhere because + * lowering code already set up the builder this way. + */ + emit_predicate_on_sample_mask(b.at(NULL, mov), mov); + mov->predicate_inverse = true; + } +} + +/** + * Predicate the specified instruction on the vector mask. + */ +static void +emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->stage == MESA_SHADER_FRAGMENT && + bld.group() == inst->group && + bld.dispatch_width() == inst->exec_size); + + const fs_builder ubld = bld.exec_all().group(1, 0); + + const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader); + const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW); + ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3)); + const unsigned subreg = sample_mask_flag_subreg(v); + + ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask); + + if (inst->predicate) { + assert(inst->predicate == BRW_PREDICATE_NORMAL); + assert(!inst->predicate_inverse); + assert(inst->flag_subreg == 0); + /* Combine the vector mask with the existing predicate by using a + * vertical predication mode. 
+ */ + inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; + } else { + inst->flag_subreg = subreg; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + } +} + static void setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc, const fs_reg &surface, const fs_reg &surface_handle) @@ -5914,11 +5826,11 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->sfid = sfid; setup_surface_descriptors(bld, inst, desc, surface, surface_handle); + inst->resize_sources(4); + /* Finally, the payload */ inst->src[2] = payload; inst->src[3] = payload2; - - inst->resize_sources(4); } static enum lsc_opcode @@ -6141,11 +6053,11 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->send_has_side_effects = has_side_effects; inst->send_is_volatile = !has_side_effects; + inst->resize_sources(4); + /* Finally, the payload */ inst->src[2] = payload; inst->src[3] = payload2; - - inst->resize_sources(4); } static void @@ -6211,10 +6123,10 @@ lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst) arg.ud, write); setup_surface_descriptors(bld, inst, desc, surface, surface_handle); + inst->resize_sources(4); + inst->src[2] = header; inst->src[3] = data; - - inst->resize_sources(4); } static fs_reg @@ -6235,26 +6147,40 @@ emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr) } static void +emit_fragment_mask(const fs_builder &bld, fs_inst *inst) +{ + assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM); + const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud; + + /* If we're a fragment shader, we have to predicate with the sample mask to + * avoid helper invocations to avoid helper invocations in instructions + * with side effects, unless they are explicitly required. + * + * There are also special cases when we actually want to run on helpers + * (ray queries). + */ + assert(bld.shader->stage == MESA_SHADER_FRAGMENT); + if (enable_helpers) + emit_predicate_on_vector_mask(bld, inst); + else if (inst->has_side_effects()) + emit_predicate_on_sample_mask(bld, inst); +} + +static void lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; /* Get the logical send arguments. */ - const fs_reg &addr = inst->src[0]; - const fs_reg &src = inst->src[1]; + const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS]; + const fs_reg &src = inst->src[A64_LOGICAL_SRC]; const unsigned src_sz = type_sz(src.type); const unsigned src_comps = inst->components_read(1); - assert(inst->src[2].file == IMM); - const unsigned arg = inst->src[2].ud; + assert(inst->src[A64_LOGICAL_ARG].file == IMM); + const unsigned arg = inst->src[A64_LOGICAL_ARG].ud; const bool has_side_effects = inst->has_side_effects(); - /* If the surface message has side effects and we're a fragment shader, we - * have to predicate with the sample mask to avoid helper invocations. 
- */ - if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) - emit_predicate_on_sample_mask(bld, inst); - fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD); fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps), BRW_REGISTER_TYPE_UD); @@ -6286,7 +6212,7 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) lsc_bits_to_data_size(arg), 1 /* num_channels */, false /* transpose */, - LSC_CACHE_STORE_L1STATE_L3MOCS, + LSC_CACHE_LOAD_L1STATE_L3MOCS, true /* has_dest */); break; case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: @@ -6330,6 +6256,9 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst) unreachable("Unknown A64 logical instruction"); } + if (bld.shader->stage == MESA_SHADER_FRAGMENT) + emit_fragment_mask(bld, inst); + /* Update the original instruction. */ inst->opcode = SHADER_OPCODE_SEND; inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); @@ -6352,19 +6281,13 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; - const fs_reg &addr = inst->src[0]; - const fs_reg &src = inst->src[1]; + const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS]; + const fs_reg &src = inst->src[A64_LOGICAL_SRC]; const unsigned src_comps = inst->components_read(1); - assert(inst->src[2].file == IMM); - const unsigned arg = inst->src[2].ud; + assert(inst->src[A64_LOGICAL_ARG].file == IMM); + const unsigned arg = inst->src[A64_LOGICAL_ARG].ud; const bool has_side_effects = inst->has_side_effects(); - /* If the surface message has side effects and we're a fragment shader, we - * have to predicate with the sample mask to avoid helper invocations. - */ - if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) - emit_predicate_on_sample_mask(bld, inst); - fs_reg payload, payload2; unsigned mlen, ex_mlen = 0, header_size = 0; if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL || @@ -6488,6 +6411,9 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) unreachable("Unknown A64 logical instruction"); } + if (bld.shader->stage == MESA_SHADER_FRAGMENT) + emit_fragment_mask(bld, inst); + /* Update the original instruction. */ inst->opcode = SHADER_OPCODE_SEND; inst->mlen = mlen; @@ -6781,31 +6707,60 @@ static void lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst) { const intel_device_info *devinfo = bld.shader->devinfo; - const fs_reg &bvh_level = inst->src[0]; - assert(inst->src[1].file == BRW_IMMEDIATE_VALUE); - const uint32_t trace_ray_control = inst->src[1].ud; + /* The emit_uniformize() in brw_fs_nir.cpp will generate an horizontal + * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q + * types in on Gfx12.5, we need to tweak the stride with a value of 1 dword + * so that the MOV operates on 2 components rather than twice the same + * component. + */ + fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD); + globals_addr.stride = 1; + const fs_reg &bvh_level = + inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ? + inst->src[RT_LOGICAL_SRC_BVH_LEVEL] : + bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL], + inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL)); + const fs_reg &trace_ray_control = + inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ? 
+ inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
+ bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
+ inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
+ const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
+ assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
+ const bool synchronous = synchronous_src.ud;
const unsigned mlen = 1;
const fs_builder ubld = bld.exec_all().group(8, 0);
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(header, brw_imm_ud(0));
- ubld.group(2, 0).MOV(header,
- retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
- /* TODO: Bit 128 is ray_query */
+ ubld.group(2, 0).MOV(header, globals_addr);
+ if (synchronous)
+ ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
const unsigned ex_mlen = inst->exec_size / 8;
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
- const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
- if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
- bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
+ if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
+ trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
+ bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
+ (bvh_level.ud & 0x7)));
} else {
- bld.AND(payload, bvh_level, brw_imm_ud(0x7));
- if (trc_bits != 0)
- bld.OR(payload, payload, brw_imm_ud(trc_bits));
+ bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
+ bld.OR(payload, payload, bvh_level);
+ }
+
+ /* When doing synchronous traversal, the HW implicitly computes the
+ * stack_id using the following formula:
+ *
+ * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
+ *
+ * Only in the asynchronous case do we need to set the stack_id from the
+ * payload register.
+ */
+ if (!synchronous) {
+ bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
+ retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0x7ff));
}
- bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
- retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
- brw_imm_uw(0x7ff));
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
@@ -6880,6 +6835,7 @@ fs_visitor::lower_logical_sends()
break;
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
break;
@@ -7107,7 +7063,7 @@ get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,
for (unsigned i = 0; i < inst->sources; i++) {
/* IVB implements DF scalars as <0;2,1> regions. */
const bool is_scalar_exception = is_uniform(inst->src[i]) &&
- (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
+ (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
const bool is_packed_word_exception = type_sz(inst->dst.type) == 4 &&
inst->dst.stride == 1 && type_sz(inst->src[i].type) == 2 &&
inst->src[i].stride == 1;
@@ -7386,7 +7342,7 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
* should
* "Force BFI instructions to be executed always in SIMD8."
*/
- return MIN2(devinfo->is_haswell ? 8 : ~0u,
+ return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
get_fpu_lowered_simd_width(devinfo, inst));
case BRW_OPCODE_IF:
@@ -7403,7 +7359,7 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
/* Unary extended math instructions are limited to SIMD8 on Gfx4 and
* Gfx6. Extended Math Function is limited to SIMD8 with half-float.
*/
- if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))
+ if (devinfo->ver == 6 || devinfo->verx10 == 40)
return MIN2(8, inst->exec_size);
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
return MIN2(8, inst->exec_size);
@@ -7512,6 +7468,12 @@ get_lowered_simd_width(const struct intel_device_info *devinfo,
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
return get_sampler_lowered_simd_width(devinfo, inst);
+ /* On gfx12 parameters are fixed to 16-bit values and therefore they all
+ * fit regardless of the execution size.
+ */
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+ return MIN2(16, inst->exec_size);
+
case SHADER_OPCODE_TXD_LOGICAL:
/* TXD is unsupported in SIMD16 mode. */
return 8;
@@ -8485,9 +8447,6 @@ fs_visitor::optimize()
validate();
- split_virtual_grfs();
- validate();
-
#define OPT(pass, args...) ({ \
pass_num++; \
bool this_progress = pass(args); \
@@ -8518,6 +8477,8 @@ fs_visitor::optimize()
int iteration = 0;
int pass_num = 0;
+ OPT(split_virtual_grfs);
+
/* Before anything else, eliminate dead code. The results of some NIR
* instructions may effectively be calculated twice. Once when the
* instruction is encountered, and again when the user of that result is
@@ -8590,7 +8551,7 @@ fs_visitor::optimize()
OPT(opt_redundant_halt);
if (OPT(lower_load_payload)) {
- split_virtual_grfs();
+ OPT(split_virtual_grfs);
/* Lower 64 bit MOVs generated by payload lowering. */
if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)
@@ -8705,6 +8666,75 @@ fs_visitor::fixup_3src_null_dest()
DEPENDENCY_VARIABLES);
}
+static bool
+needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
+{
+ /* This workaround is about making sure that any instruction writing
+ * through UGM has completed before we hit EOT.
+ *
+ * The workaround talks about UGM writes or atomic messages, but what is
+ * important is anything that hasn't completed. Usually any SEND
+ * instruction that has a destination register will be read by something
+ * else so we don't need to care about those as they will be synchronized
+ * by other parts of the shader or optimized away. What is left are
+ * instructions that don't have a destination register.
+ */
+ if (inst->sfid != GFX12_SFID_UGM)
+ return false;
+
+ return inst->dst.file == BAD_FILE;
+}
+
+/* Wa_22013689345
+ *
+ * We need to emit a UGM fence message before EOT if the shader has any UGM
+ * write or atomic message.
+ *
+ * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
+ * We probably need a better criterion in needs_dummy_fence().
+ */
+void
+fs_visitor::emit_dummy_memory_fence_before_eot()
+{
+ bool progress = false;
+ bool has_ugm_write_or_atomic = false;
+
+ if (!intel_device_info_is_dg2(devinfo))
+ return;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (!inst->eot) {
+ if (needs_dummy_fence(devinfo, inst))
+ has_ugm_write_or_atomic = true;
+ continue;
+ }
+
+ if (!has_ugm_write_or_atomic)
+ break;
+
+ const fs_builder ibld(this, block, inst);
+ const fs_builder ubld = ibld.exec_all().group(1, 0);
+
+ fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
+ dst, brw_vec8_grf(0, 0),
+ /* commit enable */ brw_imm_ud(1),
+ /* bti */ brw_imm_ud(0));
+ dummy_fence->sfid = GFX12_SFID_UGM;
+ dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
+ LSC_FLUSH_TYPE_NONE_6, false);
+ ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
+ progress = true;
+ /* TODO: remove this break if we ever have a shader with multiple EOTs.
*/ + break; + } + + if (progress) { + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | + DEPENDENCY_VARIABLES); + } +} + /** * Find the first instruction in the program that might start a region of * divergent control flow due to a HALT jump. There is no @@ -8856,23 +8886,55 @@ fs_visitor::allocate_registers(bool allow_spilling) static const enum instruction_scheduler_mode pre_modes[] = { SCHEDULE_PRE, SCHEDULE_PRE_NON_LIFO, + SCHEDULE_NONE, SCHEDULE_PRE_LIFO, }; static const char *scheduler_mode_name[] = { "top-down", "non-lifo", + "none", "lifo" }; bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); + /* Before we schedule anything, stash off the instruction order as an array + * of fs_inst *. This way, we can reset it between scheduling passes to + * prevent dependencies between the different scheduling modes. + */ + int num_insts = cfg->last_block()->end_ip + 1; + fs_inst **inst_arr = ralloc_array(mem_ctx, fs_inst *, num_insts); + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assert(ip >= block->start_ip && ip <= block->end_ip); + inst_arr[ip++] = inst; + } + assert(ip == num_insts); + /* Try each scheduling heuristic to see if it can successfully register * allocate without spilling. They should be ordered by decreasing * performance but increasing likelihood of allocating. */ for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { - schedule_instructions(pre_modes[i]); + if (i > 0) { + /* Unless we're the first pass, reset back to the original order */ + ip = 0; + foreach_block (block, cfg) { + block->instructions.make_empty(); + + assert(ip == block->start_ip); + for (; ip <= block->end_ip; ip++) + block->instructions.push_tail(inst_arr[ip]); + } + assert(ip == num_insts); + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + if (pre_modes[i] != SCHEDULE_NONE) + schedule_instructions(pre_modes[i]); this->shader_stats.scheduler_mode = scheduler_mode_name[i]; if (0) { @@ -8881,23 +8943,6 @@ fs_visitor::allocate_registers(bool allow_spilling) break; } - /* Scheduling may create additional opportunities for CMOD propagation, - * so let's do it again. If CMOD propagation made any progress, - * eliminate dead code one more time. - */ - bool progress = false; - const int iteration = 99; - int pass_num = 0; - - if (OPT(opt_cmod_propagation)) { - /* dead_code_eliminate "undoes" the fixing done by - * fixup_3src_null_dest, so we have to do it again if - * dead_code_eliminiate makes any progress. 
- */ - if (OPT(dead_code_eliminate)) - fixup_3src_null_dest(); - } - bool can_spill = allow_spilling && (i == ARRAY_SIZE(pre_modes) - 1); @@ -8943,8 +8988,8 @@ fs_visitor::allocate_registers(bool allow_spilling) prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch), prog_data->total_scratch); - if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) { - if (devinfo->is_haswell) { + if (gl_shader_stage_is_compute(stage)) { + if (devinfo->platform == INTEL_PLATFORM_HSW) { /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" * field documentation, Haswell supports a minimum of 2kB of * scratch space for compute shaders, unlike every other stage @@ -8984,9 +9029,6 @@ fs_visitor::run_vs() setup_vs_payload(); - if (shader_time_index >= 0) - emit_shader_time_begin(); - emit_nir_code(); if (failed) @@ -8994,9 +9036,6 @@ fs_visitor::run_vs() emit_urb_writes(); - if (shader_time_index >= 0) - emit_shader_time_end(); - calculate_cfg(); optimize(); @@ -9005,6 +9044,7 @@ fs_visitor::run_vs() assign_vs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9016,12 +9056,19 @@ fs_visitor::set_tcs_invocation_id() struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + const bool dg2_plus = + devinfo->ver > 12 || intel_device_info_is_dg2(devinfo); const unsigned instance_id_mask = - devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); + dg2_plus ? INTEL_MASK(7, 0) : + (devinfo->ver >= 11) ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); const unsigned instance_id_shift = - devinfo->ver >= 11 ? 16 : 17; + dg2_plus ? 0 : (devinfo->ver >= 11) ? 16 : 17; - /* Get instance number from g0.2 bits 22:16 or 23:17 */ + /* Get instance number from g0.2 bits: + * * 7:0 on DG2+ + * * 22:16 on gfx11+ + * * 23:17 otherwise + */ fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), brw_imm_ud(instance_id_mask)); @@ -9075,9 +9122,6 @@ fs_visitor::run_tcs() tcs_key->input_vertices; } - if (shader_time_index >= 0) - emit_shader_time_begin(); - /* Initialize gl_InvocationID */ set_tcs_invocation_id(); @@ -9112,9 +9156,6 @@ fs_visitor::run_tcs() inst->mlen = 3; inst->eot = true; - if (shader_time_index >= 0) - emit_shader_time_end(); - if (failed) return false; @@ -9126,6 +9167,7 @@ fs_visitor::run_tcs() assign_tcs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9139,9 +9181,6 @@ fs_visitor::run_tes() /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */ payload.num_regs = 5; - if (shader_time_index >= 0) - emit_shader_time_begin(); - emit_nir_code(); if (failed) @@ -9149,9 +9188,6 @@ fs_visitor::run_tes() emit_urb_writes(); - if (shader_time_index >= 0) - emit_shader_time_end(); - calculate_cfg(); optimize(); @@ -9160,6 +9196,7 @@ fs_visitor::run_tes() assign_tes_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(true /* allow_spilling */); return !failed; @@ -9188,16 +9225,10 @@ fs_visitor::run_gs() } } - if (shader_time_index >= 0) - emit_shader_time_begin(); - emit_nir_code(); emit_gs_thread_end(); - if (shader_time_index >= 0) - emit_shader_time_end(); - if (failed) return false; @@ -9209,6 +9240,7 @@ fs_visitor::run_gs() assign_gs_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); 
allocate_registers(true /* allow_spilling */); return !failed; @@ -9260,9 +9292,6 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assert(dispatch_width == 16); emit_repclear_shader(); } else { - if (shader_time_index >= 0) - emit_shader_time_begin(); - if (nir->info.inputs_read > 0 || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) { @@ -9295,31 +9324,26 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) if (failed) return false; - if (wm_key->alpha_test_func) + if (wm_key->emit_alpha_test) emit_alpha_test(); emit_fb_writes(); - if (shader_time_index >= 0) - emit_shader_time_end(); - calculate_cfg(); optimize(); assign_curb_setup(); - if (devinfo->ver >= 9) + if (devinfo->ver == 9) gfx9_ps_header_only_workaround(wm_prog_data); assign_urb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); - - if (failed) - return false; } return !failed; @@ -9328,14 +9352,11 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) bool fs_visitor::run_cs(bool allow_spilling) { - assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); + assert(gl_shader_stage_is_compute(stage)); setup_cs_payload(); - if (shader_time_index >= 0) - emit_shader_time_begin(); - - if (devinfo->is_haswell && prog_data->total_shared > 0) { + if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) { /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ const fs_builder abld = bld.exec_all().group(1, 0); abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW), @@ -9349,9 +9370,6 @@ fs_visitor::run_cs(bool allow_spilling) emit_cs_terminate(); - if (shader_time_index >= 0) - emit_shader_time_end(); - calculate_cfg(); optimize(); @@ -9359,11 +9377,9 @@ fs_visitor::run_cs(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); - if (failed) - return false; - return !failed; } @@ -9375,9 +9391,6 @@ fs_visitor::run_bs(bool allow_spilling) /* R0: thread header, R1: stack IDs, R2: argument addresses */ payload.num_regs = 3; - if (shader_time_index >= 0) - emit_shader_time_begin(); - emit_nir_code(); if (failed) @@ -9386,8 +9399,52 @@ fs_visitor::run_bs(bool allow_spilling) /* TODO(RT): Perhaps rename this? */ emit_cs_terminate(); - if (shader_time_index >= 0) - emit_shader_time_end(); + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_task(bool allow_spilling) +{ + assert(stage == MESA_SHADER_TASK); + + /* Task Shader Payloads (SIMD8 and SIMD16) + * + * R0: Header + * R1: Local_ID.X[0-7 or 0-15] + * R2: Inline Parameter + * + * Task Shader Payloads (SIMD32) + * + * R0: Header + * R1: Local_ID.X[0-15] + * R2: Local_ID.X[16-31] + * R3: Inline Parameter + * + * Local_ID.X values are 16 bits. + * + * Inline parameter is optional but always present since we use it to pass + * the address to descriptors. + */ + payload.num_regs = dispatch_width == 32 ? 
4 : 3; + + emit_nir_code(); + + if (failed) + return false; + + emit_urb_fence(); + + emit_cs_terminate(); calculate_cfg(); @@ -9396,11 +9453,56 @@ fs_visitor::run_bs(bool allow_spilling) assign_curb_setup(); fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); allocate_registers(allow_spilling); + return !failed; +} + +bool +fs_visitor::run_mesh(bool allow_spilling) +{ + assert(stage == MESA_SHADER_MESH); + + /* Mesh Shader Payloads (SIMD8 and SIMD16) + * + * R0: Header + * R1: Local_ID.X[0-7 or 0-15] + * R2: Inline Parameter + * + * Mesh Shader Payloads (SIMD32) + * + * R0: Header + * R1: Local_ID.X[0-15] + * R2: Local_ID.X[16-31] + * R3: Inline Parameter + * + * Local_ID.X values are 16 bits. + * + * Inline parameter is optional but always present since we use it to pass + * the address to descriptors. + */ + payload.num_regs = dispatch_width == 32 ? 4 : 3; + + emit_nir_code(); + if (failed) return false; + emit_urb_fence(); + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + allocate_registers(allow_spilling); + return !failed; } @@ -9485,15 +9587,18 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, prog_data->flat_inputs = 0; nir_foreach_shader_in_variable(var, shader) { + /* flat shading */ + if (var->data.interpolation != INTERP_MODE_FLAT) + continue; + + if (var->data.per_primitive) + continue; + unsigned slots = glsl_count_attribute_slots(var->type, false); for (unsigned s = 0; s < slots; s++) { int input_index = prog_data->urb_setup[var->data.location + s]; - if (input_index < 0) - continue; - - /* flat shading */ - if (var->data.interpolation == INTERP_MODE_FLAT) + if (input_index >= 0) prog_data->flat_inputs |= 1 << input_index; } } @@ -9630,19 +9735,22 @@ brw_nir_demote_sample_qualifiers(nir_shader *nir) NULL); } -void +static void brw_nir_populate_wm_prog_data(const nir_shader *shader, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data) + struct brw_wm_prog_data *prog_data, + const struct brw_mue_map *mue_map) { /* key->alpha_test_func means simulating alpha testing via discards, * so the shader definitely kills pixels. */ prog_data->uses_kill = shader->info.fs.uses_discard || - key->alpha_test_func; + shader->info.fs.uses_demote || + key->emit_alpha_test; prog_data->uses_omask = !key->ignore_sample_mask_out && (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->color_outputs_written = key->color_outputs_valid; prog_data->computed_depth_mode = computed_depth_mode(shader); prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); @@ -9669,7 +9777,10 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, * persample dispatch, we hard-code it to 0.5. 
*/ prog_data->uses_pos_offset = prog_data->persample_dispatch && - BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS); + (BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); } prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; @@ -9680,6 +9791,9 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->barycentric_interp_modes = brw_compute_barycentric_interp_modes(devinfo, shader); + prog_data->uses_nonperspective_interp_modes |= + (prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0; prog_data->per_coarse_pixel_dispatch = key->coarse_pixel && @@ -9698,7 +9812,7 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && prog_data->per_coarse_pixel_dispatch; - calculate_urb_setup(devinfo, key, prog_data, shader); + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); brw_compute_flat_inputs(prog_data, shader); } @@ -9726,6 +9840,7 @@ brw_compile_fs(const struct brw_compiler *compiler, INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_WM); prog_data->base.stage = MESA_SHADER_FRAGMENT; + prog_data->base.ray_queries = nir->info.ray_queries; prog_data->base.total_scratch = 0; const struct intel_device_info *devinfo = compiler->devinfo; @@ -9757,7 +9872,8 @@ brw_compile_fs(const struct brw_compiler *compiler, brw_postprocess_nir(nir, compiler, true, debug_enabled, key->base.robust_buffer_access); - brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data); + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, + params->mue_map); fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; @@ -9766,7 +9882,6 @@ brw_compile_fs(const struct brw_compiler *compiler, v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, &prog_data->base, nir, 8, - params->shader_time ? params->shader_time_index8 : -1, debug_enabled); if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg); @@ -9801,13 +9916,15 @@ brw_compile_fs(const struct brw_compiler *compiler, " pixel shading.\n"); } + if (nir->info.ray_queries > 0) + v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n"); + if (!has_spilled && v8->max_dispatch_width >= 16 && (!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) { /* Try a SIMD16 compile */ v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, &prog_data->base, nir, 16, - params->shader_time ? params->shader_time_index16 : -1, debug_enabled); v16->import_uniforms(v8); if (!v16->run_fs(allow_spilling, params->use_rep_send)) { @@ -9835,7 +9952,6 @@ brw_compile_fs(const struct brw_compiler *compiler, /* Try a SIMD32 compile */ v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, &prog_data->base, nir, 32, - params->shader_time ? 
params->shader_time_index32 : -1, debug_enabled); v32->import_uniforms(v8); if (!v32->run_fs(allow_spilling, false)) { @@ -9955,22 +10071,28 @@ brw_compile_fs(const struct brw_compiler *compiler, return g.get_assembly(); } -fs_reg * -fs_visitor::emit_cs_work_group_id_setup() +fs_reg +fs_visitor::emit_work_group_id_setup() { - assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); + assert(gl_shader_stage_uses_workgroup(stage)); - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type)); + fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); - struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD)); - struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD)); + bld.MOV(id, r0_1); - bld.MOV(*reg, r0_1); - bld.MOV(offset(*reg, bld, 1), r0_6); - bld.MOV(offset(*reg, bld, 2), r0_7); + if (gl_shader_stage_is_compute(stage)) { + struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD)); + struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD)); + bld.MOV(offset(id, bld, 1), r0_6); + bld.MOV(offset(id, bld, 2), r0_7); + } else { + /* Task/Mesh have a single Workgroup ID dimension in the HW. */ + bld.MOV(offset(id, bld, 1), brw_imm_ud(0)); + bld.MOV(offset(id, bld, 2), brw_imm_ud(0)); + } - return reg; + return id; } unsigned @@ -10071,36 +10193,13 @@ lower_simd(nir_builder *b, nir_instr *instr, void *options) } } -static void +void brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width) { nir_shader_lower_instructions(nir, filter_simd, lower_simd, (void *)(uintptr_t)dispatch_width); } -static nir_shader * -compile_cs_to_nir(const struct brw_compiler *compiler, - void *mem_ctx, - const struct brw_cs_prog_key *key, - const nir_shader *src_shader, - unsigned dispatch_width, - bool debug_enabled) -{ - nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); - brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true); - - NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width); - - /* Clean up after the local index and ID calculations. */ - NIR_PASS_V(shader, nir_opt_constant_folding); - NIR_PASS_V(shader, nir_opt_dce); - - brw_postprocess_nir(shader, compiler, true, debug_enabled, - key->base.robust_buffer_access); - - return shader; -} - const unsigned * brw_compile_cs(const struct brw_compiler *compiler, void *mem_ctx, @@ -10109,193 +10208,93 @@ brw_compile_cs(const struct brw_compiler *compiler, const nir_shader *nir = params->nir; const struct brw_cs_prog_key *key = params->key; struct brw_cs_prog_data *prog_data = params->prog_data; - int shader_time_index = params->shader_time ? params->shader_time_index : -1; const bool debug_enabled = INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_CS); prog_data->base.stage = MESA_SHADER_COMPUTE; prog_data->base.total_shared = nir->info.shared_size; + prog_data->base.ray_queries = nir->info.ray_queries; prog_data->base.total_scratch = 0; - /* Generate code for all the possible SIMD variants. 
*/ - bool generate_all; - - unsigned min_dispatch_width; - unsigned max_dispatch_width; - - if (nir->info.workgroup_size_variable) { - generate_all = true; - min_dispatch_width = 8; - max_dispatch_width = 32; - } else { - generate_all = false; + if (!nir->info.workgroup_size_variable) { prog_data->local_size[0] = nir->info.workgroup_size[0]; prog_data->local_size[1] = nir->info.workgroup_size[1]; prog_data->local_size[2] = nir->info.workgroup_size[2]; - unsigned local_workgroup_size = prog_data->local_size[0] * - prog_data->local_size[1] * - prog_data->local_size[2]; - - /* Limit max_threads to 64 for the GPGPU_WALKER command */ - const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads; - min_dispatch_width = util_next_power_of_two( - MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads))); - assert(min_dispatch_width <= 32); - max_dispatch_width = 32; - } - - unsigned required_dispatch_width = 0; - if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) { - /* These enum values are expressly chosen to be equal to the subgroup - * size that they require. - */ - required_dispatch_width = (unsigned)key->base.subgroup_size_type; } - if (nir->info.cs.subgroup_size > 0) { - assert(required_dispatch_width == 0 || - required_dispatch_width == nir->info.cs.subgroup_size); - required_dispatch_width = nir->info.cs.subgroup_size; - } + const unsigned required_dispatch_width = + brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type); - if (required_dispatch_width > 0) { - assert(required_dispatch_width == 8 || - required_dispatch_width == 16 || - required_dispatch_width == 32); - if (required_dispatch_width < min_dispatch_width || - required_dispatch_width > max_dispatch_width) { - params->error_str = ralloc_strdup(mem_ctx, - "Cannot satisfy explicit subgroup size"); - return NULL; - } - min_dispatch_width = max_dispatch_width = required_dispatch_width; - } + fs_visitor *v[3] = {0}; + const char *error[3] = {0}; - assert(min_dispatch_width <= max_dispatch_width); + for (unsigned simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, prog_data, + required_dispatch_width, &error[simd])) + continue; - fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; - fs_visitor *v = NULL; - - if (!INTEL_DEBUG(DEBUG_NO8) && - min_dispatch_width <= 8 && max_dispatch_width >= 8) { - nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key, - nir, 8, debug_enabled); - v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, - nir8, 8, shader_time_index, debug_enabled); - if (!v8->run_cs(true /* allow_spilling */)) { - params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg); - delete v8; - return NULL; - } + const unsigned dispatch_width = 8u << simd; - /* We should always be able to do SIMD32 for compute shaders */ - assert(v8->max_dispatch_width >= 32); + nir_shader *shader = nir_shader_clone(mem_ctx, nir); + brw_nir_apply_key(shader, compiler, &key->base, + dispatch_width, true /* is_scalar */); - v = v8; - prog_data->prog_mask |= 1 << 0; - if (v8->spilled_any_registers) - prog_data->prog_spilled |= 1 << 0; - cs_fill_push_const_info(compiler->devinfo, prog_data); - } + NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width); - if (!INTEL_DEBUG(DEBUG_NO16) && - (generate_all || !prog_data->prog_spilled) && - min_dispatch_width <= 16 && max_dispatch_width >= 16) { - /* Try a SIMD16 compile */ - nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key, - nir, 16, debug_enabled); - v16 = new 
fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, - nir16, 16, shader_time_index, debug_enabled); - if (v8) - v16->import_uniforms(v8); + /* Clean up after the local index and ID calculations. */ + NIR_PASS_V(shader, nir_opt_constant_folding); + NIR_PASS_V(shader, nir_opt_dce); - const bool allow_spilling = generate_all || v == NULL; - if (!v16->run_cs(allow_spilling)) { - brw_shader_perf_log(compiler, params->log_data, - "SIMD16 shader failed to compile: %s\n", - v16->fail_msg); - if (!v) { - assert(v8 == NULL); - params->error_str = ralloc_asprintf( - mem_ctx, "Not enough threads for SIMD8 and " - "couldn't generate SIMD16: %s", v16->fail_msg); - delete v16; - return NULL; - } - } else { - /* We should always be able to do SIMD32 for compute shaders */ - assert(v16->max_dispatch_width >= 32); + brw_postprocess_nir(shader, compiler, true, debug_enabled, + key->base.robust_buffer_access); - v = v16; - prog_data->prog_mask |= 1 << 1; - if (v16->spilled_any_registers) - prog_data->prog_spilled |= 1 << 1; - cs_fill_push_const_info(compiler->devinfo, prog_data); + v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, shader, dispatch_width, + debug_enabled); + + if (prog_data->prog_mask) { + unsigned first = ffs(prog_data->prog_mask) - 1; + v[simd]->import_uniforms(v[first]); } - } - /* The SIMD32 is only enabled for cases it is needed unless forced. - * - * TODO: Use performance_analysis and drop this boolean. - */ - const bool needs_32 = v == NULL || - INTEL_DEBUG(DEBUG_DO32) || - generate_all; - - if (!INTEL_DEBUG(DEBUG_NO32) && - (generate_all || !prog_data->prog_spilled) && - needs_32 && - min_dispatch_width <= 32 && max_dispatch_width >= 32) { - /* Try a SIMD32 compile */ - nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key, - nir, 32, debug_enabled); - v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, - nir32, 32, shader_time_index, debug_enabled); - if (v8) - v32->import_uniforms(v8); - else if (v16) - v32->import_uniforms(v16); - - const bool allow_spilling = generate_all || v == NULL; - if (!v32->run_cs(allow_spilling)) { - brw_shader_perf_log(compiler, params->log_data, - "SIMD32 shader failed to compile: %s\n", - v32->fail_msg); - if (!v) { - assert(v8 == NULL); - assert(v16 == NULL); - params->error_str = ralloc_asprintf( - mem_ctx, "Not enough threads for SIMD16 and " - "couldn't generate SIMD32: %s", v32->fail_msg); - delete v32; - return NULL; - } - } else { - v = v32; - prog_data->prog_mask |= 1 << 2; - if (v32->spilled_any_registers) - prog_data->prog_spilled |= 1 << 2; + const bool allow_spilling = !prog_data->prog_mask || + nir->info.workgroup_size_variable; + + if (v[simd]->run_cs(allow_spilling)) { + /* We should always be able to do SIMD32 for compute shaders. 
*/ + assert(v[simd]->max_dispatch_width >= 32); + cs_fill_push_const_info(compiler->devinfo, prog_data); + + brw_simd_mark_compiled(simd, prog_data, v[simd]->spilled_any_registers); + } else { + error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->log_data, + "SIMD%u shader failed to compile: %s\n", + dispatch_width, v[simd]->fail_msg); + } } } - if (unlikely(!v) && INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32)) { - params->error_str = - ralloc_strdup(mem_ctx, - "Cannot satisfy INTEL_DEBUG flags SIMD restrictions"); + const int selected_simd = brw_simd_select(prog_data); + if (selected_simd < 0) { + params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n", + error[0], error[1], error[2]);; return NULL; } - assert(v); + assert(selected_simd < 3); + fs_visitor *selected = v[selected_simd]; + + if (!nir->info.workgroup_size_variable) + prog_data->prog_mask = 1 << selected_simd; const unsigned *ret = NULL; fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base, - v->runtime_check_aads_emit, MESA_SHADER_COMPUTE); + selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE); if (unlikely(debug_enabled)) { char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", nir->info.label ? @@ -10305,84 +10304,27 @@ brw_compile_cs(const struct brw_compiler *compiler, } struct brw_compile_stats *stats = params->stats; - if (generate_all) { - if (prog_data->prog_mask & (1 << 0)) { - assert(v8); - prog_data->prog_offset[0] = - g.generate_code(v8->cfg, 8, v8->shader_stats, - v8->performance_analysis.require(), stats); - stats = stats ? stats + 1 : NULL; - } - - if (prog_data->prog_mask & (1 << 1)) { - assert(v16); - prog_data->prog_offset[1] = - g.generate_code(v16->cfg, 16, v16->shader_stats, - v16->performance_analysis.require(), stats); - stats = stats ? stats + 1 : NULL; - } - - if (prog_data->prog_mask & (1 << 2)) { - assert(v32); - prog_data->prog_offset[2] = - g.generate_code(v32->cfg, 32, v32->shader_stats, - v32->performance_analysis.require(), stats); + for (unsigned simd = 0; simd < 3; simd++) { + if (prog_data->prog_mask & (1u << simd)) { + assert(v[simd]); + prog_data->prog_offset[simd] = + g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, + v[simd]->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; } - } else { - /* Only one dispatch width will be valid, and will be at offset 0, - * which is already the default value of prog_offset_* fields. - */ - prog_data->prog_mask = 1 << (v->dispatch_width / 16); - g.generate_code(v->cfg, v->dispatch_width, v->shader_stats, - v->performance_analysis.require(), stats); } g.add_const_data(nir->constant_data, nir->constant_data_size); ret = g.get_assembly(); - delete v8; - delete v16; - delete v32; + delete v[0]; + delete v[1]; + delete v[2]; return ret; } -static unsigned -brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo, - const struct brw_cs_prog_data *cs_prog_data, - unsigned group_size) -{ - const unsigned mask = cs_prog_data->prog_mask; - assert(mask != 0); - - static const unsigned simd8 = 1 << 0; - static const unsigned simd16 = 1 << 1; - static const unsigned simd32 = 1 << 2; - - if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32)) - return 32; - - const uint32_t max_threads = devinfo->max_cs_workgroup_threads; - - if ((mask & simd8) && group_size <= 8 * max_threads) { - /* Prefer SIMD16 if can do without spilling. Matches logic in - * brw_compile_cs. 
- */ - if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16)) - return 16; - return 8; - } - - if ((mask & simd16) && group_size <= 16 * max_threads) - return 16; - - assert(mask & simd32); - assert(group_size <= 32 * max_threads); - return 32; -} - struct brw_cs_dispatch_info brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, const struct brw_cs_prog_data *prog_data, @@ -10394,9 +10336,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, override_local_size ? override_local_size : prog_data->local_size; + const int simd = + override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) : + brw_simd_select(prog_data); + assert(simd >= 0 && simd < 3); + info.group_size = sizes[0] * sizes[1] * sizes[2]; - info.simd_size = - brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size); + info.simd_size = 8u << simd; info.threads = DIV_ROUND_UP(info.group_size, info.simd_size); const uint32_t remainder = info.group_size & (info.simd_size - 1); @@ -10437,7 +10383,7 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data, if (!INTEL_DEBUG(DEBUG_NO8)) { v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, shader, - 8, -1 /* shader time */, debug_enabled); + 8, debug_enabled); const bool allow_spilling = true; if (!v8->run_bs(allow_spilling)) { if (error_str) @@ -10455,7 +10401,7 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data, if (!has_spilled && !INTEL_DEBUG(DEBUG_NO16)) { v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, shader, - 16, -1 /* shader time */, debug_enabled); + 16, debug_enabled); const bool allow_spilling = (v == NULL); if (!v16->run_bs(allow_spilling)) { brw_shader_perf_log(compiler, log_data, @@ -10517,23 +10463,23 @@ brw_bsr(const struct intel_device_info *devinfo, } const unsigned * -brw_compile_bs(const struct brw_compiler *compiler, void *log_data, +brw_compile_bs(const struct brw_compiler *compiler, void *mem_ctx, - const struct brw_bs_prog_key *key, - struct brw_bs_prog_data *prog_data, - nir_shader *shader, - unsigned num_resume_shaders, - struct nir_shader **resume_shaders, - struct brw_compile_stats *stats, - char **error_str) + struct brw_compile_bs_params *params) { + nir_shader *shader = params->nir; + struct brw_bs_prog_data *prog_data = params->prog_data; + unsigned num_resume_shaders = params->num_resume_shaders; + nir_shader **resume_shaders = params->resume_shaders; const bool debug_enabled = INTEL_DEBUG(DEBUG_RT); prog_data->base.stage = shader->info.stage; + prog_data->base.ray_queries = shader->info.ray_queries; prog_data->base.total_scratch = 0; + prog_data->max_stack_size = 0; - fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, + fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base, false, shader->info.stage); if (unlikely(debug_enabled)) { char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s", @@ -10545,8 +10491,9 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, } prog_data->simd_size = - compile_single_bs(compiler, log_data, mem_ctx, key, prog_data, - shader, &g, stats, NULL, error_str); + compile_single_bs(compiler, params->log_data, mem_ctx, + params->key, prog_data, + shader, &g, params->stats, NULL, ¶ms->error_str); if (prog_data->simd_size == 0) return NULL; @@ -10564,8 +10511,9 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, /* TODO: Figure out shader stats etc. 
for resume shaders */ int offset = 0; uint8_t simd_size = - compile_single_bs(compiler, log_data, mem_ctx, key, prog_data, - resume_shaders[i], &g, NULL, &offset, error_str); + compile_single_bs(compiler, params->log_data, mem_ctx, params->key, + prog_data, resume_shaders[i], &g, NULL, &offset, + ¶ms->error_str); if (simd_size == 0) return NULL; @@ -10623,7 +10571,7 @@ brw_fs_test_dispatch_packing(const fs_builder &bld) unsigned fs_visitor::workgroup_size() const { - assert(stage == MESA_SHADER_COMPUTE); + assert(gl_shader_stage_uses_workgroup(stage)); const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data); return cs->local_size[0] * cs->local_size[1] * cs->local_size[2]; } |
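
Editor's note on the lower_trace_ray_logical_send() changes shown above: in the all-immediate path the payload dword is built as SET_BITS(trace_ray_control, 9, 8) | (bvh_level & 0x7), and in the asynchronous case the stack ID is masked with 0x7ff into the upper 16-bit word of the same dword. The following standalone C++ sketch only illustrates that bit layout; it is not Mesa code, and the function and parameter names are invented for clarity.

// Illustrative sketch (not Mesa code): packing of the first payload dword of
// the ray-tracing SEND, mirroring SET_BITS(trace_ray_control, 9, 8) |
// (bvh_level & 0x7) and the 0x7ff stack-id mask applied to the upper word in
// the asynchronous case.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t pack_rt_payload_dword0(uint32_t bvh_level,
                                       uint32_t trace_ray_control,
                                       bool synchronous,
                                       uint32_t stack_id)
{
   uint32_t dw = (bvh_level & 0x7) |               /* bits 2:0   */
                 ((trace_ray_control & 0x3) << 8); /* bits 9:8   */
   if (!synchronous)
      dw |= (stack_id & 0x7ff) << 16;              /* bits 26:16 */
   return dw;
}

int main()
{
   /* bvh_level = 0, trace_ray_control = 1, asynchronous with stack_id = 5 */
   assert(pack_rt_payload_dword0(0, 1, false, 5) == ((1u << 8) | (5u << 16)));
   printf("0x%08x\n", pack_rt_payload_dword0(0, 1, false, 5));
   return 0;
}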
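
Editor's note on brw_cs_get_dispatch_info() above: the selected SIMD index determines the dispatch width (8 << simd), the thread count is the ceiling division of the workgroup size by that width, and when the workgroup size is not a multiple of the SIMD width the remainder gives the number of live lanes in the final, partially filled thread. A minimal sketch of that arithmetic, with invented names and assuming a non-variable workgroup size:

// Illustrative sketch (not Mesa code) of the dispatch math visible in
// brw_cs_get_dispatch_info(): group size, SIMD width, thread count and the
// remainder used for the last thread's execution mask.
#include <cassert>
#include <cstdint>

struct dispatch_info {
   uint32_t group_size;
   uint32_t simd_size;
   uint32_t threads;
   uint32_t remainder; /* live lanes in the last thread, 0 if it is full */
};

static dispatch_info compute_dispatch(const uint32_t sizes[3], unsigned simd)
{
   dispatch_info info;
   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = (info.group_size + info.simd_size - 1) / info.simd_size;
   info.remainder = info.group_size & (info.simd_size - 1);
   return info;
}

int main()
{
   /* A 5x4x1 workgroup compiled as SIMD8: 20 invocations need 3 threads,
    * with 4 lanes live in the last one. */
   const uint32_t sizes[3] = {5, 4, 1};
   dispatch_info info = compute_dispatch(sizes, 0 /* SIMD8 */);
   assert(info.group_size == 20 && info.threads == 3 && info.remainder == 4);
   return 0;
}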