author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-09-22 02:09:17 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-09-22 02:09:17 +0000
commit    | 865c23c9c56f47f6cf8d73e8a6060a0c33a28b93 (patch)
tree      | aeed22bc39ce87dd6f09ff173c8273beaef65fe7 /lib/mesa/src/intel/compiler/brw_fs.cpp
parent    | 27e7bb02bd0f89f96d9e3b402b46c2c97ee4defe (diff)
Merge Mesa 20.0.8
With Mesa 20.1, even after the kernel change to do wbinvd on all cpus,
sthen@ reported that hard hangs still occurred on his Haswell system
with inteldrm.
Mark Kane also reported seeing hangs on Ivy Bridge on bugs@.
Some systems/workloads seem to be more prone to triggering this than
others, as I have not seen any hangs on Ivy Bridge, and the only hangs
I saw on Haswell when running piglit went away with the wbinvd change.
It seems something is wrong with drm memory attributes or coherency in
the kernel, and newer Mesa versions expect behaviour we don't have.
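For context, wbinvd is the x86 "write back and invalidate cache" instruction. It only flushes the cache hierarchy of the CPU that executes it, so a coherency workaround for GPU-visible pages has to broadcast it to every CPU. The sketch below is illustrative only, not the actual OpenBSD change; run_on_all_cpus() is an assumed cross-call helper (Linux provides a comparable wbinvd_on_all_cpus()).

```c
/*
 * Illustrative sketch only -- not the actual inteldrm/kernel code.
 * WBINVD is a privileged instruction that writes back and invalidates
 * the entire cache hierarchy of the executing CPU, so it must be run
 * on every CPU for GPU-visible memory to end up coherent.
 */
extern void run_on_all_cpus(void (*fn)(void *), void *arg); /* assumed cross-call/IPI helper */

static void
wbinvd_local(void *arg)
{
	(void)arg;
	__asm__ __volatile__("wbinvd" ::: "memory");
}

static void
wbinvd_all_cpus(void)
{
	run_on_all_cpus(wbinvd_local, NULL);
}
```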
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r-- | lib/mesa/src/intel/compiler/brw_fs.cpp | 509
1 file changed, 176 insertions, 333 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp index 5f5e3b21b..573b22857 100644 --- a/lib/mesa/src/intel/compiler/brw_fs.cpp +++ b/lib/mesa/src/intel/compiler/brw_fs.cpp @@ -208,7 +208,7 @@ fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) * dependencies, and to avoid having to deal with aligning its regs to 2. */ const fs_builder ubld = bld.annotate("send dependency resolve") - .quarter(0); + .half(0); ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); } @@ -1190,8 +1190,6 @@ fs_visitor::import_uniforms(fs_visitor *v) this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; this->subgroup_id = v->subgroup_id; - for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++) - this->group_size[i] = v->group_size[i]; } void @@ -1534,7 +1532,7 @@ fs_visitor::emit_discard_jump() * shader if all relevant channels have been discarded. */ fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); - discard_jump->flag_subreg = sample_mask_flag_subreg(this); + discard_jump->flag_subreg = 1; discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; discard_jump->predicate_inverse = true; @@ -1604,8 +1602,6 @@ fs_visitor::assign_curb_setup() prog_data->curb_read_length = uniform_push_length + ubo_push_length; - uint64_t used = 0; - /* Map the offsets in the UNIFORM file to fixed HW regs. */ foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { @@ -1627,9 +1623,6 @@ fs_visitor::assign_curb_setup() constant_nr = 0; } - assert(constant_nr / 8 < 64); - used |= BITFIELD64_BIT(constant_nr / 8); - struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + constant_nr / 8, constant_nr % 8); @@ -1644,44 +1637,6 @@ fs_visitor::assign_curb_setup() } } - uint64_t want_zero = used & stage_prog_data->zero_push_reg; - if (want_zero) { - assert(!compiler->compact_params); - fs_builder ubld = bld.exec_all().group(8, 0).at( - cfg->first_block(), cfg->first_block()->start()); - - /* push_reg_mask_param is in 32-bit units */ - unsigned mask_param = stage_prog_data->push_reg_mask_param; - struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8, - mask_param % 8); - - fs_reg b32; - for (unsigned i = 0; i < 64; i++) { - if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { - fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2); - ubld.SHL(horiz_offset(shifted, 8), - byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8), - brw_imm_v(0x01234567)); - ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); - - fs_builder ubld16 = ubld.group(16, 0); - b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D); - ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); - } - - if (want_zero & BITFIELD64_BIT(i)) { - assert(i < prog_data->curb_read_length); - struct brw_reg push_reg = - retype(brw_vec8_grf(payload.num_regs + i, 0), - BRW_REGISTER_TYPE_D); - - ubld.AND(push_reg, push_reg, component(b32, i % 16)); - } - } - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - } - /* This may be updated in assign_urb_setup or assign_vs_urb_setup. 
*/ this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length; } @@ -1744,7 +1699,7 @@ calculate_urb_setup(const struct gen_device_info *devinfo, struct brw_vue_map prev_stage_vue_map; brw_compute_vue_map(devinfo, &prev_stage_vue_map, key->input_slots_valid, - nir->info.separate_shader, 1); + nir->info.separate_shader); int first_slot = brw_compute_first_urb_slot_required(nir->info.inputs_read, @@ -2097,7 +2052,7 @@ fs_visitor::split_virtual_grfs() } } } - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); delete[] split_points; delete[] new_virtual_grf; @@ -2105,7 +2060,7 @@ fs_visitor::split_virtual_grfs() } /** - * Remove unused virtual GRFs and compact the vgrf_* arrays. + * Remove unused virtual GRFs and compact the virtual_grf_* arrays. * * During code generation, we create tons of temporary variables, many of * which get immediately killed and are never used again. Yet, in later @@ -2142,7 +2097,7 @@ fs_visitor::compact_virtual_grfs() } else { remap_table[i] = new_index; alloc.sizes[new_index] = alloc.sizes[i]; - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); ++new_index; } } @@ -2315,7 +2270,7 @@ fs_visitor::assign_constant_locations() } if (compiler->compact_params) { - struct uniform_slot_info slots[uniforms + 1]; + struct uniform_slot_info slots[uniforms]; memset(slots, 0, sizeof(slots)); foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { @@ -2605,7 +2560,7 @@ fs_visitor::lower_constant_loads() inst->remove(block); } } - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); } bool @@ -2883,11 +2838,6 @@ fs_visitor::opt_algebraic() } } } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | - DEPENDENCY_INSTRUCTION_DETAIL); - return progress; } @@ -2937,7 +2887,7 @@ fs_visitor::opt_zero_samples() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + invalidate_live_intervals(); return progress; } @@ -3034,7 +2984,7 @@ fs_visitor::opt_sampler_eot() * flag and submit a header together with the sampler message as required * by the hardware. */ - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return true; } @@ -3087,8 +3037,7 @@ fs_visitor::opt_register_renaming() } if (progress) { - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | - DEPENDENCY_VARIABLES); + invalidate_live_intervals(); for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) { @@ -3136,7 +3085,7 @@ fs_visitor::opt_redundant_discard_jumps() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -3167,7 +3116,7 @@ fs_visitor::compute_to_mrf() if (devinfo->gen >= 7) return false; - const fs_live_variables &live = live_analysis.require(); + calculate_live_intervals(); foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { int ip = next_ip; @@ -3185,7 +3134,7 @@ fs_visitor::compute_to_mrf() /* Can't compute-to-MRF this GRF if someone else was going to * read it later. */ - if (live.vgrf_end[inst->src[0].nr] > ip) + if (this->virtual_grf_end[inst->src[0].nr] > ip) continue; /* Found a move of a GRF to a MRF. 
Let's see if we can go rewrite the @@ -3330,7 +3279,7 @@ fs_visitor::compute_to_mrf() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -3387,9 +3336,6 @@ fs_visitor::eliminate_find_live_channel() } } - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); - return progress; } @@ -3536,7 +3482,7 @@ fs_visitor::remove_duplicate_mrf_writes() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -3585,7 +3531,7 @@ fs_visitor::remove_extra_rounding_modes() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -3766,7 +3712,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); } /** @@ -3806,7 +3752,7 @@ fs_visitor::lower_uniform_pull_constant_loads() inst->header_size = 1; inst->mlen = 1; - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); } else { /* Before register allocation, we didn't tell the scheduler about the * MRF we use. We know it's safe to use this MRF because nothing @@ -3885,9 +3831,9 @@ fs_visitor::lower_load_payload() } else { /* Platform doesn't have COMPR4. We have to fake it */ fs_reg mov_dst = retype(dst, inst->src[i].type); - ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); + ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); mov_dst.nr += 4; - ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); + ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); } } @@ -3924,7 +3870,7 @@ fs_visitor::lower_load_payload() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -4224,7 +4170,7 @@ fs_visitor::lower_integer_multiplication() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } @@ -4254,7 +4200,7 @@ fs_visitor::lower_minmax() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + invalidate_live_intervals(); return progress; } @@ -4343,34 +4289,11 @@ fs_visitor::lower_sub_sat() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } -/** - * Get the mask of SIMD channels enabled during dispatch and not yet disabled - * by discard. Due to the layout of the sample mask in the fragment shader - * thread payload, \p bld is required to have a dispatch_width() not greater - * than 16 for fragment shaders. - */ -static fs_reg -sample_mask_reg(const fs_builder &bld) -{ - const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader); - - if (v->stage != MESA_SHADER_FRAGMENT) { - return brw_imm_ud(0xffffffff); - } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) { - assert(bld.dispatch_width() <= 16); - return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16); - } else { - assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16); - return retype(brw_vec1_grf((bld.group() >= 16 ? 
2 : 1), 7), - BRW_REGISTER_TYPE_UW); - } -} - static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -4440,8 +4363,6 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const unsigned components = inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; - assert(inst->target != 0 || src0_alpha.file == BAD_FILE); - /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. */ @@ -4468,7 +4389,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, if (prog_data->uses_kill) { bld.exec_all().group(1, 0) .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), - sample_mask_reg(bld)); + brw_flag_reg(0, 1)); } assert(length == 0); @@ -4499,9 +4420,6 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD), }; ubld.LOAD_PAYLOAD(header, header_sources, 2, 0); - - /* Gen12 will require additional fix-ups if we ever hit this path. */ - assert(devinfo->gen < 12); } uint32_t g00_bits = 0; @@ -4509,7 +4427,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, /* Set "Source0 Alpha Present to RenderTarget" bit in message * header. */ - if (src0_alpha.file != BAD_FILE) + if (inst->target > 0 && prog_data->replicate_alpha) g00_bits |= 1 << 11; /* Set computes stencil to render target */ @@ -4530,9 +4448,10 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, } if (prog_data->uses_kill) { + assert(bld.group() < 16); ubld.group(1, 0).MOV(retype(component(header, 15), BRW_REGISTER_TYPE_UW), - sample_mask_reg(bld)); + brw_flag_reg(0, 1)); } assert(length == 0); @@ -4552,6 +4471,8 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, length++; } + bool src0_alpha_present = false; + if (src0_alpha.file != BAD_FILE) { for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { const fs_builder &ubld = bld.exec_all().group(8, i) @@ -4561,6 +4482,14 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, setup_color_payload(ubld, key, &sources[length], tmp, 1); length++; } + src0_alpha_present = true; + } else if (prog_data->replicate_alpha && inst->target != 0) { + /* Handle the case when fragment shader doesn't write to draw buffer + * zero. No need to call setup_color_payload() for src0_alpha because + * alpha value will be undefined. + */ + length += bld.dispatch_width() / 8; + src0_alpha_present = true; } if (sample_mask.file != BAD_FILE) { @@ -4642,7 +4571,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, /* Set the "Render Target Index" and "Src0 Alpha Present" fields * in the extended message descriptor, in lieu of using a header. 
*/ - ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15; + ex_desc = inst->target << 12 | src0_alpha_present << 15; if (key->nr_color_regions == 0) ex_desc |= 1 << 20; /* Null Render Target */ @@ -4687,7 +4616,6 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, static void lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) { - const gen_device_info *devinfo = bld.shader->devinfo; const fs_builder &ubld = bld.exec_all().group(8, 0); const unsigned length = 2; const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length); @@ -4702,19 +4630,6 @@ lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD) }; ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0); - - if (devinfo->gen >= 12) { - /* On Gen12 the Viewport and Render Target Array Index fields (AKA - * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render - * target message header format was updated accordingly -- However - * the updated format only works for the lower 16 channels in a - * SIMD32 thread, since the higher 16 channels want the subspan data - * from r2 instead of r1, so we need to copy over the contents of - * r1.1 in order to fix things up. - */ - ubld.group(1, 0).MOV(component(header, 9), - retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD)); - } } inst->resize_sources(1); @@ -5411,45 +5326,6 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) } } -/** - * Predicate the specified instruction on the sample mask. - */ -static void -emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) -{ - assert(bld.shader->stage == MESA_SHADER_FRAGMENT && - bld.group() == inst->group && - bld.dispatch_width() == inst->exec_size); - - const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader); - const fs_reg sample_mask = sample_mask_reg(bld); - const unsigned subreg = sample_mask_flag_subreg(v); - - if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) { - assert(sample_mask.file == ARF && - sample_mask.nr == brw_flag_subreg(subreg).nr && - sample_mask.subnr == brw_flag_subreg( - subreg + inst->group / 16).subnr); - } else { - bld.group(1, 0).exec_all() - .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask); - } - - if (inst->predicate) { - assert(inst->predicate == BRW_PREDICATE_NORMAL); - assert(!inst->predicate_inverse); - assert(inst->flag_subreg == 0); - /* Combine the sample mask with the existing predicate by using a - * vertical predication mode. - */ - inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; - } else { - inst->flag_subreg = subreg; - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = false; - } -} - static void lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5486,7 +5362,7 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); const bool has_side_effects = inst->has_side_effects(); - fs_reg sample_mask = has_side_effects ? sample_mask_reg(bld) : + fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : fs_reg(brw_imm_d(0xffff)); /* From the BDW PRM Volume 7, page 147: @@ -5582,8 +5458,27 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) * provided. 
*/ if ((header.file == BAD_FILE || !is_surface_access) && - sample_mask.file != BAD_FILE && sample_mask.file != IMM) - emit_predicate_on_sample_mask(bld, inst); + sample_mask.file != BAD_FILE && sample_mask.file != IMM) { + const fs_builder ubld = bld.group(1, 0).exec_all(); + if (inst->predicate) { + assert(inst->predicate == BRW_PREDICATE_NORMAL); + assert(!inst->predicate_inverse); + assert(inst->flag_subreg < 2); + /* Combine the sample mask with the existing predicate by using a + * vertical predication mode. + */ + inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; + ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2), + sample_mask.type), + sample_mask); + } else { + inst->flag_subreg = 2; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), + sample_mask); + } + } uint32_t sfid; switch (inst->opcode) { @@ -5754,8 +5649,16 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) /* If the surface message has side effects and we're a fragment shader, we * have to predicate with the sample mask to avoid helper invocations. */ - if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) - emit_predicate_on_sample_mask(bld, inst); + if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) { + inst->flag_subreg = 2; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + + fs_reg sample_mask = bld.sample_mask_reg(); + const fs_builder ubld = bld.group(1, 0).exec_all(); + ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), + sample_mask); + } fs_reg payload, payload2; unsigned mlen, ex_mlen = 0; @@ -6073,7 +5976,7 @@ fs_visitor::lower_logical_sends() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } @@ -6315,7 +6218,7 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, /* Only power-of-two execution sizes are representable in the instruction * control fields. 
*/ - return 1 << util_logbase2(max_width); + return 1 << _mesa_logbase2(max_width); } /** @@ -6958,7 +6861,7 @@ fs_visitor::lower_simd_width() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } @@ -7039,19 +6942,19 @@ fs_visitor::lower_barycentrics() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } void -fs_visitor::dump_instructions() const +fs_visitor::dump_instructions() { dump_instructions(NULL); } void -fs_visitor::dump_instructions(const char *name) const +fs_visitor::dump_instructions(const char *name) { FILE *file = stderr; if (name && geteuid() != 0) { @@ -7061,11 +6964,11 @@ fs_visitor::dump_instructions(const char *name) const } if (cfg) { - const register_pressure &rp = regpressure_analysis.require(); - unsigned ip = 0, max_pressure = 0; + calculate_register_pressure(); + int ip = 0, max_pressure = 0; foreach_block_and_inst(block, backend_instruction, inst, cfg) { - max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); - fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip); + max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]); + fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip); dump_instruction(inst, file); ip++; } @@ -7084,15 +6987,15 @@ fs_visitor::dump_instructions(const char *name) const } void -fs_visitor::dump_instruction(const backend_instruction *be_inst) const +fs_visitor::dump_instruction(backend_instruction *be_inst) { dump_instruction(be_inst, stderr); } void -fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const +fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) { - const fs_inst *inst = (const fs_inst *)be_inst; + fs_inst *inst = (fs_inst *)be_inst; if (inst->predicate) { fprintf(file, "(%cf%d.%d) ", @@ -7445,31 +7348,22 @@ fs_visitor::setup_cs_payload() payload.num_regs = 1; } -brw::register_pressure::register_pressure(const fs_visitor *v) +void +fs_visitor::calculate_register_pressure() { - const fs_live_variables &live = v->live_analysis.require(); - const unsigned num_instructions = v->cfg->num_blocks ? 
- v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0; + invalidate_live_intervals(); + calculate_live_intervals(); - regs_live_at_ip = new unsigned[num_instructions](); + unsigned num_instructions = 0; + foreach_block(block, cfg) + num_instructions += block->instructions.length(); - for (unsigned reg = 0; reg < v->alloc.count; reg++) { - for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++) - regs_live_at_ip[ip] += v->alloc.sizes[reg]; - } -} + regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions); -brw::register_pressure::~register_pressure() -{ - delete[] regs_live_at_ip; -} - -void -fs_visitor::invalidate_analysis(brw::analysis_dependency_class c) -{ - backend_shader::invalidate_analysis(c); - live_analysis.invalidate(c); - regpressure_analysis.invalidate(c); + for (unsigned reg = 0; reg < alloc.count; reg++) { + for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++) + regs_live_at_ip[ip] += alloc.sizes[reg]; + } } void @@ -7684,7 +7578,7 @@ fs_visitor::fixup_sends_duplicate_payload() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } @@ -7707,8 +7601,7 @@ fs_visitor::fixup_3src_null_dest() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | - DEPENDENCY_VARIABLES); + invalidate_live_intervals(); } /** @@ -7756,15 +7649,15 @@ fs_visitor::fixup_nomask_control_flow() unsigned depth = 0; bool progress = false; - const fs_live_variables &live_vars = live_analysis.require(); + calculate_live_intervals(); /* Scan the program backwards in order to be able to easily determine * whether the flag register is live at any point. */ foreach_block_reverse_safe(block, cfg) { - BITSET_WORD flag_liveout = live_vars.block_data[block->num] + BITSET_WORD flag_liveout = live_intervals->block_data[block->num] .flag_liveout[0]; - STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1); + STATIC_ASSERT(ARRAY_SIZE(live_intervals->block_data[0].flag_liveout) == 1); foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { if (!inst->predicate && inst->exec_size >= 8) @@ -7851,7 +7744,7 @@ fs_visitor::fixup_nomask_control_flow() } if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + invalidate_live_intervals(); return progress; } @@ -7889,24 +7782,6 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) break; } - /* Scheduling may create additional opportunities for CMOD propagation, - * so let's do it again. If CMOD propagation made any progress, - * elminate dead code one more time. - */ - bool progress = false; - const int iteration = 99; - int pass_num = 0; - - if (OPT(opt_cmod_propagation)) { - /* dead_code_eliminate "undoes" the fixing done by - * fixup_3src_null_dest, so we have to do it again if - * dead_code_eliminiate makes any progress. - */ - if (OPT(dead_code_eliminate)) - fixup_3src_null_dest(); - } - - /* We only allow spilling for the last schedule mode and only if the * allow_spilling parameter and dispatch width work out ok. */ @@ -8296,15 +8171,11 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) * Initialize it with the dispatched pixels. */ if (wm_prog_data->uses_kill) { - const unsigned lower_width = MIN2(dispatch_width, 16); - for (unsigned i = 0; i < dispatch_width / lower_width; i++) { - const fs_reg dispatch_mask = - devinfo->gen >= 6 ? brw_vec1_grf((i ? 
2 : 1), 7) : - brw_vec1_grf(0, 0); - bld.exec_all().group(1, 0) - .MOV(sample_mask_reg(bld.group(lower_width, i)), - retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); - } + const fs_reg dispatch_mask = + devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0); + bld.exec_all().group(1, 0) + .MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW), + retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); } emit_nir_code(); @@ -8647,7 +8518,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const struct gen_device_info *devinfo = compiler->devinfo; - const unsigned max_subgroup_size = compiler->devinfo->gen >= 6 ? 32 : 16; + + unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16; brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true); brw_nir_lower_fs_inputs(shader, devinfo, key); @@ -8705,24 +8577,20 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, calculate_urb_setup(devinfo, key, prog_data, shader); brw_compute_flat_inputs(prog_data, shader); - fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; - float throughput = 0; - v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 8, shader_time_index8); - if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + fs_visitor v8(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, shader, 8, + shader_time_index8); + if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) { if (error_str) - *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); + *error_str = ralloc_strdup(mem_ctx, v8.fail_msg); - delete v8; return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { - simd8_cfg = v8->cfg; - prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs; - prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); - const performance &perf = v8->performance_analysis.require(); - throughput = MAX2(throughput, perf.throughput); + simd8_cfg = v8.cfg; + prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used); } /* Limit dispatch width to simd8 with dual source blending on gen8. 
@@ -8731,52 +8599,45 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (devinfo->gen == 8 && prog_data->dual_src_blend && !(INTEL_DEBUG & DEBUG_NO8)) { assert(!use_rep_send); - v8->limit_dispatch_width(8, "gen8 workaround: " - "using SIMD8 when dual src blending.\n"); + v8.limit_dispatch_width(8, "gen8 workaround: " + "using SIMD8 when dual src blending.\n"); } - if (v8->max_dispatch_width >= 16 && + if (v8.max_dispatch_width >= 16 && likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { /* Try a SIMD16 compile */ - v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 16, shader_time_index16); - v16->import_uniforms(v8); - if (!v16->run_fs(allow_spilling, use_rep_send)) { + fs_visitor v16(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, shader, 16, + shader_time_index16); + v16.import_uniforms(&v8); + if (!v16.run_fs(allow_spilling, use_rep_send)) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", - v16->fail_msg); + v16.fail_msg); } else { - simd16_cfg = v16->cfg; - prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs; - prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used); - const performance &perf = v16->performance_analysis.require(); - throughput = MAX2(throughput, perf.throughput); + simd16_cfg = v16.cfg; + prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used); } } /* Currently, the compiler only supports SIMD32 on SNB+ */ - if (v8->max_dispatch_width >= 32 && !use_rep_send && - devinfo->gen >= 6 && simd16_cfg && - !(INTEL_DEBUG & DEBUG_NO32)) { + if (v8.max_dispatch_width >= 32 && !use_rep_send && + compiler->devinfo->gen >= 6 && + unlikely(INTEL_DEBUG & DEBUG_DO32)) { /* Try a SIMD32 compile */ - v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 32, shader_time_index32); - v32->import_uniforms(v8); - if (!v32->run_fs(allow_spilling, false)) { + fs_visitor v32(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, shader, 32, + shader_time_index32); + v32.import_uniforms(&v8); + if (!v32.run_fs(allow_spilling, false)) { compiler->shader_perf_log(log_data, "SIMD32 shader failed to compile: %s", - v32->fail_msg); + v32.fail_msg); } else { - const performance &perf = v32->performance_analysis.require(); - - if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) { - compiler->shader_perf_log(log_data, "SIMD32 shader inefficient\n"); - } else { - simd32_cfg = v32->cfg; - prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs; - prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used); - throughput = MAX2(throughput, perf.throughput); - } + simd32_cfg = v32.cfg; + prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; + prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); } } @@ -8817,25 +8678,19 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (prog_data->persample_dispatch) { /* Starting with SandyBridge (where we first get MSAA), the different * pixel dispatch combinations are grouped into classifications A - * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On most hardware + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware * generations, the only configurations supporting persample dispatch - * are those in which only one dispatch width is enabled. 
- * - * The Gen12 hardware spec has a similar dispatch grouping table, but - * the following conflicting restriction applies (from the page on - * "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader: - * - * "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also - * enabled." + * are are this in which only one dispatch width is enabled. */ if (simd32_cfg || simd16_cfg) simd8_cfg = NULL; - if (simd32_cfg && devinfo->gen < 12) + if (simd32_cfg) simd16_cfg = NULL; } fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, - v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT); + v8.shader_stats, v8.runtime_check_aads_emit, + MESA_SHADER_FRAGMENT); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", @@ -8846,31 +8701,22 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (simd8_cfg) { prog_data->dispatch_8 = true; - g.generate_code(simd8_cfg, 8, v8->shader_stats, - v8->performance_analysis.require(), stats); + g.generate_code(simd8_cfg, 8, stats); stats = stats ? stats + 1 : NULL; } if (simd16_cfg) { prog_data->dispatch_16 = true; - prog_data->prog_offset_16 = g.generate_code( - simd16_cfg, 16, v16->shader_stats, - v16->performance_analysis.require(), stats); + prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16, stats); stats = stats ? stats + 1 : NULL; } if (simd32_cfg) { prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code( - simd32_cfg, 32, v32->shader_stats, - v32->performance_analysis.require(), stats); + prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32, stats); stats = stats ? stats + 1 : NULL; } - delete v8; - delete v16; - delete v32; - return g.get_assembly(); } @@ -8892,16 +8738,6 @@ fs_visitor::emit_cs_work_group_id_setup() return reg; } -unsigned -brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, - unsigned threads) -{ - assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0); - assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0); - return cs_prog_data->push.per_thread.size * threads + - cs_prog_data->push.cross_thread.size; -} - static void fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) { @@ -8940,6 +8776,11 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); + unsigned total_dwords = + (cs_prog_data->push.per_thread.size * cs_prog_data->threads + + cs_prog_data->push.cross_thread.size) / 4; + fill_push_const_block_info(&cs_prog_data->push.total, total_dwords); + assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || cs_prog_data->push.per_thread.size == 0); assert(cs_prog_data->push.cross_thread.dwords + @@ -8947,6 +8788,15 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, prog_data->nr_params); } +static void +cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) +{ + cs_prog_data->simd_size = size; + unsigned group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; + cs_prog_data->threads = (group_size + size - 1) / size; +} + static nir_shader * compile_cs_to_nir(const struct brw_compiler *compiler, void *mem_ctx, @@ -8979,20 +8829,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, char **error_str) { prog_data->base.total_shared = src_shader->info.cs.shared_size; + prog_data->local_size[0] = 
src_shader->info.cs.local_size[0]; + prog_data->local_size[1] = src_shader->info.cs.local_size[1]; + prog_data->local_size[2] = src_shader->info.cs.local_size[2]; prog_data->slm_size = src_shader->num_shared; - - unsigned local_workgroup_size; - if (prog_data->uses_variable_group_size) { - prog_data->max_variable_local_size = - src_shader->info.cs.max_variable_local_size; - local_workgroup_size = src_shader->info.cs.max_variable_local_size; - } else { - prog_data->local_size[0] = src_shader->info.cs.local_size[0]; - prog_data->local_size[1] = src_shader->info.cs.local_size[1]; - prog_data->local_size[2] = src_shader->info.cs.local_size[2]; - local_workgroup_size = src_shader->info.cs.local_size[0] * - src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2]; - } + unsigned local_workgroup_size = + src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * + src_shader->info.cs.local_size[2]; /* Limit max_threads to 64 for the GPGPU_WALKER command */ const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads); @@ -9039,7 +8882,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, assert(v8->max_dispatch_width >= 32); v = v8; - prog_data->simd_size = 8; + cs_set_simd_size(prog_data, 8); cs_fill_push_const_info(compiler->devinfo, prog_data); } } @@ -9069,7 +8912,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, assert(v16->max_dispatch_width >= 32); v = v16; - prog_data->simd_size = 16; + cs_set_simd_size(prog_data, 16); cs_fill_push_const_info(compiler->devinfo, prog_data); } } @@ -9101,7 +8944,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } } else { v = v32; - prog_data->simd_size = 32; + cs_set_simd_size(prog_data, 32); cs_fill_push_const_info(compiler->devinfo, prog_data); } } @@ -9113,7 +8956,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, *error_str = ralloc_strdup(mem_ctx, fail_msg); } else { fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, - v->runtime_check_aads_emit, MESA_SHADER_COMPUTE); + v->shader_stats, v->runtime_check_aads_emit, + MESA_SHADER_COMPUTE); if (INTEL_DEBUG & DEBUG_CS) { char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", src_shader->info.label ? @@ -9122,8 +8966,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, g.enable_debug(name); } - g.generate_code(v->cfg, prog_data->simd_size, v->shader_stats, - v->performance_analysis.require(), stats); + g.generate_code(v->cfg, prog_data->simd_size, stats); ret = g.get_assembly(); } |