Diffstat (limited to 'lib/mesa/src')
15 files changed, 565 insertions, 462 deletions
diff --git a/lib/mesa/src/compiler/nir/nir_to_ssa.c b/lib/mesa/src/compiler/nir/nir_to_ssa.c
index 6accdd24b..44a505477 100644
--- a/lib/mesa/src/compiler/nir/nir_to_ssa.c
+++ b/lib/mesa/src/compiler/nir/nir_to_ssa.c
@@ -27,6 +27,7 @@
 #include "nir.h"
 #include <stdlib.h>
+#include <unistd.h>

 /*
  * Implements the classic to-SSA algorithm described by Cytron et. al. in
@@ -88,7 +89,7 @@ insert_phi_nodes(nir_function_impl *impl)
       w_start = w_end = 0;
       iter_count++;

-      nir_foreach_def(dest, reg) {
+      nir_foreach_def(reg, dest) {
         nir_instr *def = dest->reg.parent_instr;
         if (work[def->block->index] < iter_count)
            W[w_end++] = def->block;
@@ -159,8 +160,7 @@ static nir_ssa_def *get_ssa_src(nir_register *reg, rewrite_state *state)
        * to preserve the information that this source is undefined
        */
       nir_ssa_undef_instr *instr =
-         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components,
-                                    reg->bit_size);
+         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components);

       /*
        * We could just insert the undefined instruction before the instruction
@@ -219,9 +219,7 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
                           state->states[index].num_defs);

    list_del(&dest->reg.def_link);
-   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
-                     reg->bit_size, name);
-   ralloc_free(name);
+   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);

    /* push our SSA destination on the stack */
    state->states[index].index++;
@@ -273,9 +271,7 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
    instr->dest.write_mask = (1 << num_components) - 1;
    list_del(&instr->dest.dest.reg.def_link);
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
-                     reg->bit_size, name);
-   ralloc_free(name);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);

    if (nir_op_infos[instr->op].output_size == 0) {
       /*
@@ -381,7 +377,7 @@ rewrite_instr_forward(nir_instr *instr, rewrite_state *state)
 static void
 rewrite_phi_sources(nir_block *block, nir_block *pred, rewrite_state *state)
 {
-   nir_foreach_instr(instr, block) {
+   nir_foreach_instr(block, instr) {
       if (instr->type != nir_instr_type_phi)
         break;

@@ -389,7 +385,7 @@ rewrite_phi_sources(nir_block *block, nir_block *pred, rewrite_state *state)
       state->parent_instr = instr;

-      nir_foreach_phi_src(src, phi_instr) {
+      nir_foreach_phi_src(phi_instr, src) {
         if (src->pred == pred) {
            rewrite_use(&src->src, state);
            break;
@@ -434,7 +430,7 @@ rewrite_block(nir_block *block, rewrite_state *state)
     * what we want because those instructions (vector gather, conditional
     * select) will already be in SSA form.
     */
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       rewrite_instr_forward(instr, state);
    }

@@ -455,7 +451,7 @@ rewrite_block(nir_block *block, rewrite_state *state)
    for (unsigned i = 0; i < block->num_dom_children; i++)
       rewrite_block(block->dom_children[i], state);

-   nir_foreach_instr_reverse(instr, block) {
+   nir_foreach_instr_reverse(block, instr) {
       rewrite_instr_backwards(instr, state);
    }
 }
@@ -533,7 +529,7 @@ nir_convert_to_ssa_impl(nir_function_impl *impl)
 void
 nir_convert_to_ssa(nir_shader *shader)
 {
-   nir_foreach_function(function, shader) {
+   nir_foreach_function(shader, function) {
       if (function->impl)
         nir_convert_to_ssa_impl(function->impl);
    }
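Most of the churn in this file is the argument order of the nir_foreach_* iteration macros flipping from element-first to container-first. As a hedged illustration of why only call sites change, here is a generic intrusive-list sketch in that style; this is illustrative only, not NIR's actual macro (NIR's real macros walk an exec_list of embedded exec_node links rather than a bare next pointer):

/* Container-first foreach macro in the style of nir_foreach_instr(block, instr). */
struct node {
   struct node *next;
   int value;
};

#define foreach_node(list, it) \
   for (struct node *it = (list); it != NULL; it = it->next)

static int
sum_nodes(struct node *head)
{
   int total = 0;
   foreach_node(head, n)   /* container first, iterator name second */
      total += n->value;
   return total;
}

The macro body is identical either way; swapping the parameter order is purely a source-level convention, which is why the diff touches every caller but never the loop bodies.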
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
index 18145beb2..148920756 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -27,21 +27,62 @@
 #include "main/errors.h"
 #include "util/debug.h"

+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list args;
+
+   va_start(args, fmt);
+   GLuint msg_id = 0;
+   _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER,
+                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
+   va_end(args);
+}
+
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+
+   va_list args;
+   va_start(args, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      va_list args_copy;
+      va_copy(args_copy, args);
+      vfprintf(stderr, fmt, args_copy);
+      va_end(args_copy);
+   }
+
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE,
+                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
+   }
+   va_end(args);
+}
+
 #define COMMON_OPTIONS \
+   /* In order to help allow for better CSE at the NIR level we tell NIR to \
+    * split all ffma instructions during opt_algebraic and we then re-combine \
+    * them as a later step. \
+    */ \
+   .lower_ffma = true, \
    .lower_sub = true, \
    .lower_fdiv = true, \
    .lower_scmp = true, \
-   .lower_fmod32 = true, \
-   .lower_fmod64 = false, \
+   .lower_fmod = true, \
    .lower_bitfield_extract = true, \
    .lower_bitfield_insert = true, \
    .lower_uadd_carry = true, \
    .lower_usub_borrow = true, \
    .lower_fdiv = true, \
-   .lower_flrp64 = true, \
-   .native_integers = true, \
-   .use_interpolated_input_intrinsics = true, \
-   .vertex_id_zero_based = true
+   .native_integers = true

 static const struct nir_shader_compiler_options scalar_nir_options = {
    COMMON_OPTIONS,
@@ -66,26 +107,6 @@ static const struct nir_shader_compiler_options vector_nir_options = {
     */
    .fdot_replicates = true,

-   /* Prior to Gen6, there are no three source operations for SIMD4x2. */
-   .lower_flrp32 = true,
-
-   .lower_pack_snorm_2x16 = true,
-   .lower_pack_unorm_2x16 = true,
-   .lower_unpack_snorm_2x16 = true,
-   .lower_unpack_unorm_2x16 = true,
-   .lower_extract_byte = true,
-   .lower_extract_word = true,
-};
-
-static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
-   COMMON_OPTIONS,
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all the
-    * components of a vec4. We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    */
-   .fdot_replicates = true,
-
    .lower_pack_snorm_2x16 = true,
    .lower_pack_unorm_2x16 = true,
    .lower_unpack_snorm_2x16 = true,
@@ -95,25 +116,24 @@ static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
 };

 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 {
    struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);

    compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;

    brw_fs_alloc_reg_sets(compiler);
    brw_vec4_alloc_reg_set(compiler);

-   compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
-
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", true);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
    compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true);
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
    compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
    compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;

@@ -123,10 +143,12 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
       compiler->glsl_compiler_options[i].MaxIfDepth =
          devinfo->gen < 6 ? 16 : UINT_MAX;

+      compiler->glsl_compiler_options[i].EmitCondCodes = true;
+      compiler->glsl_compiler_options[i].EmitNoNoise = true;
       compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
       compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
       compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
-      compiler->glsl_compiler_options[i].LowerCombinedClipCullDistance = true;
+      compiler->glsl_compiler_options[i].LowerClipDistance = true;

       bool is_scalar = compiler->scalar_stage[i];

@@ -138,20 +160,14 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
       if (devinfo->gen < 7)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;

-      if (is_scalar) {
-         compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
-      } else {
-         compiler->glsl_compiler_options[i].NirOptions =
-            devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
-      }
+      compiler->glsl_compiler_options[i].NirOptions =
+         is_scalar ? &scalar_nir_options : &vector_nir_options;

       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
-      compiler->glsl_compiler_options[i].ClampBlockIndicesToArrayBounds = true;
    }

    compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
    compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;

    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
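The COMMON_OPTIONS comment added above says ffma is split during opt_algebraic so NIR's CSE can see shared multiplies, then re-combined later (that is what brw_nir_opt_peephole_ffma.c further down in this diff is for). A minimal sketch of the rationale, with hypothetical names and no Mesa code:

/* Written as two fused operations, ffma(a, b, c) and ffma(a, b, d) share
 * no subexpression that CSE can factor.  Split into mul+add, the common
 * a*b is exposed and computed once; a later pass may re-fuse the rest. */
static void
split_then_cse(float a, float b, float c, float d, float *x, float *y)
{
   float t = a * b;   /* shared multiply, visible to CSE after lowering */
   *x = t + c;
   *y = t + d;
}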
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
index 447d05b81..27a95a3c6 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -24,9 +24,8 @@
 #pragma once

 #include <stdio.h>
-#include "common/gen_device_info.h"
+#include "brw_device_info.h"
 #include "main/mtypes.h"
-#include "main/macros.h"

 #ifdef __cplusplus
 extern "C" {
@@ -38,7 +37,7 @@ struct brw_geometry_program;
 union gl_constant_value;

 struct brw_compiler {
-   const struct gen_device_info *devinfo;
+   const struct brw_device_info *devinfo;

    struct {
       struct ra_regs *regs;
@@ -86,19 +85,13 @@ struct brw_compiler {
        * appear in *classes.
        */
       int aligned_pairs_class;
-   } fs_reg_sets[3];
+   } fs_reg_sets[2];

    void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);

    bool scalar_stage[MESA_SHADER_STAGES];
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
-
-   /**
-    * Apply workarounds for SIN and COS output range problems.
-    * This can negatively impact performance.
-    */
-   bool precise_trig;
 };

@@ -160,13 +153,6 @@ struct brw_sampler_prog_key_data {
     * For Sandybridge, which shader w/a we need for gather quirks.
     */
    enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
-
-   /**
-    * Texture units that have a YUV image bound.
-    */
-   uint32_t y_u_v_image_mask;
-   uint32_t y_uv_image_mask;
-   uint32_t yx_xuxv_image_mask;
 };

@@ -220,8 +206,6 @@ struct brw_tcs_prog_key
    /** A bitfield of per-vertex outputs written. */
    uint64_t outputs_written;

-   bool quads_workaround;
-
    struct brw_sampler_prog_key_data tex;
 };

@@ -252,15 +236,17 @@ struct brw_wm_prog_key {
    uint8_t iz_lookup;
    bool stats_wm:1;
    bool flat_shade:1;
+   bool persample_shading:1;
+   bool persample_2x:1;
    unsigned nr_color_regions:5;
    bool replicate_alpha:1;
+   bool render_to_fbo:1;
    bool clamp_fragment_color:1;
-   bool persample_interp:1;
-   bool multisample_fbo:1;
+   bool compute_pos_offset:1;
+   bool compute_sample_id:1;
    unsigned line_aa:2;
    bool high_quality_derivatives:1;
    bool force_dual_color_blend:1;
-   bool coherent_fb_fetch:1;

    uint16_t drawable_height;
    uint64_t input_slots_valid;
@@ -338,7 +324,6 @@ struct brw_stage_prog_data {
       uint32_t abo_start;
       uint32_t image_start;
       uint32_t shader_time_start;
-      uint32_t plane_start[3];
       /** @} */
    } binding_table;

@@ -378,18 +363,15 @@ struct brw_wm_prog_data {

    GLuint num_varying_inputs;

-   uint8_t reg_blocks_0;
-   uint8_t reg_blocks_2;
-
-   uint8_t dispatch_grf_start_reg_2;
-   uint32_t prog_offset_2;
+   GLuint dispatch_grf_start_reg_16;
+   GLuint reg_blocks;
+   GLuint reg_blocks_16;

    struct {
       /** @{
        * surface indices the WM-specific surfaces
        */
       uint32_t render_target_start;
-      uint32_t render_target_read_start;
       /** @} */
    } binding_table;

@@ -397,18 +379,16 @@ struct brw_wm_prog_data {
    bool computed_stencil;

    bool early_fragment_tests;
-   bool dispatch_8;
-   bool dispatch_16;
+   bool no_8;
    bool dual_src_blend;
-   bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
    bool uses_src_depth;
    bool uses_src_w;
    bool uses_sample_mask;
-   bool has_side_effects;
    bool pulls_bary;
+   uint32_t prog_offset_16;

    /**
     * Mask of which interpolation modes are required by the fragment shader.
@@ -417,12 +397,6 @@ struct brw_wm_prog_data {
    uint32_t barycentric_interp_modes;

    /**
-    * Mask of which FS inputs are marked flat by the shader source. This is
-    * needed for setting up 3DSTATE_SF/SBE.
-    */
-   uint32_t flat_inputs;
-
-   /**
     * Map from gl_varying_slot to the position within the FS setup data
     * payload where the varying's attribute vertex deltas should be delivered.
     * For varying slots that are not used by the FS, the value is -1.
@@ -430,28 +404,15 @@ struct brw_wm_prog_data {
    int urb_setup[VARYING_SLOT_MAX];
 };

-struct brw_push_const_block {
-   unsigned dwords; /* Dword count, not reg aligned */
-   unsigned regs;
-   unsigned size; /* Bytes, register aligned */
-};
-
 struct brw_cs_prog_data {
    struct brw_stage_prog_data base;

    GLuint dispatch_grf_start_reg_16;
    unsigned local_size[3];
    unsigned simd_size;
-   unsigned threads;
    bool uses_barrier;
    bool uses_num_work_groups;
-   int thread_local_id_index;
-
-   struct {
-      struct brw_push_const_block cross_thread;
-      struct brw_push_const_block per_thread;
-      struct brw_push_const_block total;
-   } push;
+   unsigned local_invocation_id_regs;

    struct {
       /** @{
@@ -566,7 +527,7 @@ GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
    return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
 }

-void brw_compute_vue_map(const struct gen_device_info *devinfo,
+void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
                          GLbitfield64 slots_valid,
                          bool separate_shader);
@@ -620,8 +581,6 @@ struct brw_vue_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;

-   uint32_t cull_distance_mask;
-
    /* Used for calculating urb partitions. In the VS, this is the size of the
     * URB entry used for both input and output to the thread. In the GS, this
     * is the size of the URB entry used for output.
@@ -637,7 +596,6 @@ struct brw_vs_prog_data {
    GLbitfield64 inputs_read;

    unsigned nr_attributes;
-   unsigned nr_attribute_slots;

    bool uses_vertexid;
    bool uses_instanceid;
@@ -731,28 +689,11 @@ struct brw_gs_prog_data
    unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
 };

-#define DEFINE_PROG_DATA_DOWNCAST(stage) \
-static inline struct brw_##stage##_prog_data * \
-brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
-{ \
-   return (struct brw_##stage##_prog_data *) prog_data; \
-}
-DEFINE_PROG_DATA_DOWNCAST(vue)
-DEFINE_PROG_DATA_DOWNCAST(vs)
-DEFINE_PROG_DATA_DOWNCAST(tcs)
-DEFINE_PROG_DATA_DOWNCAST(tes)
-DEFINE_PROG_DATA_DOWNCAST(gs)
-DEFINE_PROG_DATA_DOWNCAST(wm)
-DEFINE_PROG_DATA_DOWNCAST(cs)
-DEFINE_PROG_DATA_DOWNCAST(ff_gs)
-DEFINE_PROG_DATA_DOWNCAST(clip)
-DEFINE_PROG_DATA_DOWNCAST(sf)
-#undef DEFINE_PROG_DATA_DOWNCAST

 /** @} */

 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);

 /**
  * Compile a vertex shader.
@@ -833,7 +774,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                struct gl_program *prog,
                int shader_time_index8,
                int shader_time_index16,
-               bool allow_spilling,
                bool use_rep_send,
                unsigned *final_assembly_size,
                char **error_str);
@@ -853,86 +793,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);

-static inline uint32_t
-encode_slm_size(unsigned gen, uint32_t bytes)
-{
-   uint32_t slm_size = 0;
-
-   /* Shared Local Memory is specified as powers of two, and encoded in
-    * INTERFACE_DESCRIPTOR_DATA with the following representations:
-    *
-    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
-    * -------------------------------------------------------------------
-    * Gen7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
-    * -------------------------------------------------------------------
-    * Gen9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
-    */
-   assert(bytes <= 64 * 1024);
-
-   if (bytes > 0) {
-      /* Shared Local Memory Size is specified as powers of two. */
-      slm_size = util_next_power_of_two(bytes);
-
-      if (gen >= 9) {
-         /* Use a minimum of 1kB; turn an exponent of 10 (1024 kB) into 1. */
-         slm_size = ffs(MAX2(slm_size, 1024)) - 10;
-      } else {
-         /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */
-         slm_size = MAX2(slm_size, 4096) / 4096;
-      }
-   }
-
-   return slm_size;
-}
-
 /**
- * Return true if the given shader stage is dispatched contiguously by the
- * relevant fixed function starting from channel 0 of the SIMD thread, which
- * implies that the dispatch mask of a thread can be assumed to have the form
- * '2^n - 1' for some n.
+ * Fill out local id payload for compute shader according to cs_prog_data.
  */
-static inline bool
-brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo,
-                              gl_shader_stage stage,
-                              const struct brw_stage_prog_data *prog_data)
-{
-   /* The code below makes assumptions about the hardware's thread dispatch
-    * behavior that could be proven wrong in future generations -- Make sure
-    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
-    * the NIR front-end before changing this assertion.
-    */
-   assert(devinfo->gen <= 9);
-
-   switch (stage) {
-   case MESA_SHADER_FRAGMENT: {
-      /* The PSD discards subspans coming in with no lit samples, which in the
-       * per-pixel shading case implies that each subspan will either be fully
-       * lit (due to the VMask being used to allow derivative computations),
-       * or not dispatched at all. In per-sample dispatch mode individual
-       * samples from the same subspan have a fixed relative location within
-       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
-       * general and we should return false.
-       */
-      const struct brw_wm_prog_data *wm_prog_data =
-         (const struct brw_wm_prog_data *)prog_data;
-      return !wm_prog_data->persample_dispatch;
-   }
-   case MESA_SHADER_COMPUTE:
-      /* Compute shaders will be spawned with either a fully enabled dispatch
-       * mask or with whatever bottom/right execution mask was given to the
-       * GPGPU walker command to be used along the workgroup edges -- In both
-       * cases the dispatch mask is required to be tightly packed for our
-       * invocation index calculations to work.
-       */
-      return true;
-   default:
-      /* Most remaining fixed functions are limited to use a packed dispatch
-       * mask due to the hardware representation of the dispatch mask as a
-       * single counter representing the number of enabled channels.
-       */
-      return true;
-   }
-}
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride);

 #ifdef __cplusplus
 } /* extern "C" */
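The encode_slm_size() helper removed above packs shared-local-memory sizes into two different hardware encodings, per the table in its comment. A self-contained re-implementation is easy to check against that table; here next_pow2() is a local stand-in for Mesa's util_next_power_of_two(), and ffs() is POSIX from <strings.h>:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>

static uint32_t
next_pow2(uint32_t x)
{
   uint32_t p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static uint32_t
encode_slm_size(unsigned gen, uint32_t bytes)
{
   uint32_t slm_size = 0;

   assert(bytes <= 64 * 1024);

   if (bytes > 0) {
      slm_size = next_pow2(bytes);
      if (gen >= 9) {
         /* Minimum 1kB; an exponent of 10 (1kB) becomes encoding 1. */
         if (slm_size < 1024)
            slm_size = 1024;
         slm_size = ffs(slm_size) - 10;
      } else {
         /* Minimum 4kB; pre-Gen9 encodes multiples of 4kB. */
         if (slm_size < 4096)
            slm_size = 4096;
         slm_size /= 4096;
      }
   }
   return slm_size;
}

int
main(void)
{
   /* 5000 bytes rounds up to 8kB: encoding 2 on Gen7-8, 4 on Gen9+,
    * matching the 8 kB column of the comment's table. */
   printf("gen7: %u, gen9: %u\n",
          encode_slm_size(7, 5000), encode_slm_size(9, 5000));
   return 0;
}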
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
index 0e736ed01..2de2ea1ba 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
@@ -55,31 +55,251 @@ cat(struct string *dest, const struct string src)
    } while(0)

 static bool
-src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+src0_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
 {
    return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
           brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }

 static bool
-src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+src1_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
 {
    return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
           brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }

-static bool
-src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
-{
-   return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE;
-}
+enum gen {
+   GEN4  = (1 << 0),
+   GEN45 = (1 << 1),
+   GEN5  = (1 << 2),
+   GEN6  = (1 << 3),
+   GEN7  = (1 << 4),
+   GEN75 = (1 << 5),
+   GEN8  = (1 << 6),
+   GEN9  = (1 << 7),
+   GEN_ALL = ~0
+};
+
+#define GEN_GE(gen) (~((gen) - 1) | gen)
+#define GEN_LE(gen) (((gen) - 1) | gen)
+
+struct inst_info {
+   enum gen gen;
+};
+
+static const struct inst_info inst_info[128] = {
+   [BRW_OPCODE_ILLEGAL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOV] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOVI] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_NOT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_OR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_XOR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHL] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_DIM / BRW_OPCODE_SMOV */
+   /* Reserved - 11 */
+   [BRW_OPCODE_ASR] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved - 13-15 */
+   [BRW_OPCODE_CMP] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CMPN] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CSEL] = {
+      .gen = GEN_GE(GEN8),
+   },
+   [BRW_OPCODE_F32TO16] = {
+      .gen = GEN7 | GEN75,
+   },
+   [BRW_OPCODE_F16TO32] = {
+      .gen = GEN7 | GEN75,
+   },
+   /* Reserved - 21-22 */
+   [BRW_OPCODE_BFREV] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFE] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI1] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI2] = {
+      .gen = GEN_GE(GEN7),
+   },
+   /* Reserved - 27-31 */
+   [BRW_OPCODE_JMPI] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_BRD */
+   [BRW_OPCODE_IF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_IFF] = { /* also BRW_OPCODE_BRC */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_ELSE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_ENDIF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DO] = { /* also BRW_OPCODE_CASE */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_WHILE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_BREAK] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CONTINUE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_HALT] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_CALLA */
+   /* BRW_OPCODE_MSAVE / BRW_OPCODE_CALL */
+   /* BRW_OPCODE_MREST / BRW_OPCODE_RET */
+   /* BRW_OPCODE_PUSH / BRW_OPCODE_FORK / BRW_OPCODE_GOTO */
+   /* BRW_OPCODE_POP */
+   [BRW_OPCODE_WAIT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDS] = {
+      .gen = GEN_GE(GEN9),
+   },
+   [BRW_OPCODE_SENDSC] = {
+      .gen = GEN_GE(GEN9),
+   },
+   /* Reserved 53-55 */
+   [BRW_OPCODE_MATH] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 57-63 */
+   [BRW_OPCODE_ADD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MUL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AVG] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FRC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDU] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDZ] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MAC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MACH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_LZD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FBH] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_FBL] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_CBIT] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_ADDC] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SUBB] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SAD2] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SADA2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 82-83 */
+   [BRW_OPCODE_DP4] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DPH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP3] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 88 */
+   [BRW_OPCODE_LINE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_PLN] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_MAD] = {
+      .gen = GEN_GE(GEN6),
+   },
+   [BRW_OPCODE_LRP] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 93-124 */
+   /* BRW_OPCODE_NENOP */
+   [BRW_OPCODE_NOP] = {
+      .gen = GEN_ALL,
+   },
+};

 static unsigned
-num_sources_from_inst(const struct gen_device_info *devinfo,
+num_sources_from_inst(const struct brw_device_info *devinfo,
                       const brw_inst *inst)
 {
-   const struct opcode_desc *desc =
-      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
    unsigned math_function;

    if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
@@ -94,10 +314,8 @@ num_sources_from_inst(const struct gen_device_info *devinfo,
          */
         return 0;
      }
-   } else if (desc) {
-      return desc->nsrc;
    } else {
-      return 0;
+      return opcode_descs[brw_inst_opcode(devinfo, inst)].nsrc;
    }

    switch (math_function) {
@@ -123,18 +341,34 @@
    }
 }

+static enum gen
+gen_from_devinfo(const struct brw_device_info *devinfo)
+{
+   switch (devinfo->gen) {
+   case 4: return devinfo->is_g4x ? GEN45 : GEN4;
+   case 5: return GEN5;
+   case 6: return GEN6;
+   case 7: return devinfo->is_haswell ? GEN75 : GEN7;
+   case 8: return GEN8;
+   case 9: return GEN9;
+   default:
+      unreachable("not reached");
+   }
+}
+
 static bool
-is_unsupported_inst(const struct gen_device_info *devinfo,
+is_unsupported_inst(const struct brw_device_info *devinfo,
                     const brw_inst *inst)
 {
-   return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL;
+   enum gen gen = gen_from_devinfo(devinfo);
+
+   return (inst_info[brw_inst_opcode(devinfo, inst)].gen & gen) == 0;
 }

 bool
 brw_validate_instructions(const struct brw_codegen *p, int start_offset,
                           struct annotation_info *annotation)
 {
-   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_device_info *devinfo = p->devinfo;
    const void *store = p->store + start_offset / 16;
    bool valid = true;

@@ -163,18 +397,6 @@ brw_validate_instructions(const struct brw_codegen *p, int start_offset,
       ERROR_IF(is_unsupported_inst(devinfo, inst),
                "Instruction not supported on this Gen");

-      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) {
-         ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) !=
-                  BRW_ADDRESS_DIRECT, "send must use direct addressing");
-
-         if (devinfo->gen >= 7) {
-            ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF");
-            ERROR_IF(brw_inst_eot(devinfo, inst) &&
-                     brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
-                     "send with EOT must use g112-g127");
-         }
-      }
-
       if (error_msg.str && annotation) {
          annotation_insert_error(annotation, src_offset, error_msg.str);
       }
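The one-bit-per-generation scheme introduced in brw_eu_validate.c makes a supported-generation set an ordinary bitmask, so GEN_GE()/GEN_LE() build "this gen and later/earlier" ranges with plain integer arithmetic. A standalone sketch (same enum and macros, minus the opcode table) mirrors the is_unsupported_inst() test:

#include <stdio.h>

enum gen {
   GEN4  = (1 << 0),
   GEN45 = (1 << 1),
   GEN5  = (1 << 2),
   GEN6  = (1 << 3),
   GEN7  = (1 << 4),
   GEN75 = (1 << 5),
   GEN8  = (1 << 6),
   GEN9  = (1 << 7),
};

#define GEN_GE(gen) (~((gen) - 1) | (gen))
#define GEN_LE(gen) (((gen) - 1) | (gen))

int
main(void)
{
   /* GEN_GE(GEN7) sets the GEN7 bit and every higher bit, so a check
    * like (inst_info[opcode].gen & gen) passes on Gen9... */
   printf("GEN9 in GEN_GE(GEN7): %s\n",
          (GEN_GE(GEN7) & GEN9) ? "yes" : "no");
   /* ...while pre-Gen7 generations fall outside the mask. */
   printf("GEN6 in GEN_GE(GEN7): %s\n",
          (GEN_GE(GEN7) & GEN6) ? "yes" : "no");
   return 0;
}

This works because the enum values are single bits in ascending generation order: subtracting 1 from a power of two yields all lower bits (GEN_LE), and complementing that yields the bit itself plus all higher bits (GEN_GE).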
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
index 676942c19..90edd023b 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
@@ -43,14 +43,14 @@ fs_visitor::validate()
 {
    foreach_block_and_inst (block, fs_inst, inst, cfg) {
       if (inst->dst.file == VGRF) {
-         fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
+         fsv_assert(inst->dst.reg_offset + inst->regs_written <=
                     alloc.sizes[inst->dst.nr]);
       }

       for (unsigned i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == VGRF) {
-            fsv_assert(inst->src[i].offset / REG_SIZE + regs_read(inst, i) <=
-                       alloc.sizes[inst->src[i].nr]);
+            fsv_assert(inst->src[i].reg_offset + inst->regs_read(i) <=
+                       (int)alloc.sizes[inst->src[i].nr]);
          }
       }
    }
 }
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
index 0bb766d70..9c65e540d 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
@@ -39,11 +39,12 @@ struct attr_wa_state {
 };

 static bool
-apply_attr_wa_block(nir_block *block, struct attr_wa_state *state)
+apply_attr_wa_block(nir_block *block, void *void_state)
 {
+   struct attr_wa_state *state = void_state;
    nir_builder *b = &state->builder;

-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_intrinsic)
          continue;

@@ -155,16 +156,14 @@ brw_nir_apply_attribute_workarounds(nir_shader *shader,
       .wa_flags = attrib_wa_flags,
    };

-   nir_foreach_function(func, shader) {
+   nir_foreach_function(shader, func) {
       if (!func->impl)
          continue;

       nir_builder_init(&state.builder, func->impl);
       state.impl_progress = false;

-      nir_foreach_block(block, func->impl) {
-         apply_attr_wa_block(block, &state);
-      }
+      nir_foreach_block(func->impl, apply_attr_wa_block, &state);

       if (state.impl_progress) {
          nir_metadata_preserve(func->impl, nir_metadata_block_index |
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 14a9a0fac..5ff2cba04 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -44,7 +44,7 @@ are_all_uses_fadd(nir_ssa_def *def)
    if (!list_empty(&def->if_uses))
       return false;

-   nir_foreach_use(use_src, def) {
+   nir_foreach_use(def, use_src) {
       nir_instr *use_instr = use_src->parent_instr;

       if (use_instr->type != nir_instr_type_alu)
@@ -84,17 +84,6 @@ get_mul_for_src(nir_alu_src *src, int num_components,
       return NULL;

    nir_alu_instr *alu = nir_instr_as_alu(instr);
-
-   /* We want to bail if any of the other ALU operations involved is labled
-    * exact. One reason for this is that, while the value that is changing is
-    * actually the result of the add and not the multiply, the intention of
-    * the user when they specify an exact multiply is that they want *that*
-    * value and what they don't care about is the add. Another reason is that
-    * SPIR-V explicitly requires this behaviour.
-    */
-   if (alu->exact)
-      return NULL;
-
    switch (alu->op) {
    case nir_op_imov:
    case nir_op_fmov:
@@ -113,7 +102,7 @@ get_mul_for_src(nir_alu_src *src, int num_components,
       break;

    case nir_op_fmul:
-      /* Only absorb a fmul into a ffma if the fmul is only used in fadd
+      /* Only absorb a fmul into a ffma if the fmul is is only used in fadd
        * operations. This prevents us from being too aggressive with our
        * fusing which can actually lead to more instructions.
        */
@@ -167,11 +156,11 @@ any_alu_src_is_a_constant(nir_alu_src srcs[])
 }

 static bool
-brw_nir_opt_peephole_ffma_block(nir_block *block, void *mem_ctx)
+brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 {
-   bool progress = false;
+   struct peephole_ffma_state *state = void_state;

-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
          continue;

@@ -179,9 +168,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (add->op != nir_op_fadd)
          continue;

-      assert(add->dest.dest.is_ssa);
-      if (add->exact)
-         continue;
+      /* TODO: Maybe bail if this expression is considered "precise"? */

       assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);

@@ -214,8 +201,6 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (mul == NULL)
          continue;

-      unsigned bit_size = add->dest.dest.ssa.bit_size;
-
       nir_ssa_def *mul_src[2];
       mul_src[0] = mul->src[0].src.ssa;
       mul_src[1] = mul->src[1].src.ssa;
@@ -231,10 +216,11 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (abs) {
          for (unsigned i = 0; i < 2; i++) {
-            nir_alu_instr *abs = nir_alu_instr_create(mem_ctx, nir_op_fabs);
+            nir_alu_instr *abs = nir_alu_instr_create(state->mem_ctx,
+                                                      nir_op_fabs);
             abs->src[0].src = nir_src_for_ssa(mul_src[i]);
             nir_ssa_dest_init(&abs->instr, &abs->dest.dest,
-                              mul_src[i]->num_components, bit_size, NULL);
+                              mul_src[i]->num_components, NULL);
             abs->dest.write_mask = (1 << mul_src[i]->num_components) - 1;
             nir_instr_insert_before(&add->instr, &abs->instr);
             mul_src[i] = &abs->dest.dest.ssa;
@@ -242,16 +228,17 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       }

       if (negate) {
-         nir_alu_instr *neg = nir_alu_instr_create(mem_ctx, nir_op_fneg);
+         nir_alu_instr *neg = nir_alu_instr_create(state->mem_ctx,
+                                                   nir_op_fneg);
          neg->src[0].src = nir_src_for_ssa(mul_src[0]);
          nir_ssa_dest_init(&neg->instr, &neg->dest.dest,
-                           mul_src[0]->num_components, bit_size, NULL);
+                           mul_src[0]->num_components, NULL);
          neg->dest.write_mask = (1 << mul_src[0]->num_components) - 1;
          nir_instr_insert_before(&add->instr, &neg->instr);
          mul_src[0] = &neg->dest.dest.ssa;
       }

-      nir_alu_instr *ffma = nir_alu_instr_create(mem_ctx, nir_op_ffma);
+      nir_alu_instr *ffma = nir_alu_instr_create(state->mem_ctx, nir_op_ffma);
       ffma->dest.saturate = add->dest.saturate;
       ffma->dest.write_mask = add->dest.write_mask;

@@ -266,7 +253,6 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
                         add->dest.dest.ssa.num_components,
-                        bit_size,
                         add->dest.dest.ssa.name);
       nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
                                nir_src_for_ssa(&ffma->dest.dest.ssa));
@@ -275,27 +261,28 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       assert(list_empty(&add->dest.dest.ssa.uses));
       nir_instr_remove(&add->instr);

-      progress = true;
+      state->progress = true;
    }

-   return progress;
+   return true;
 }

 static bool
 brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
 {
-   bool progress = false;
-   void *mem_ctx = ralloc_parent(impl);
+   struct peephole_ffma_state state;

-   nir_foreach_block(block, impl) {
-      progress |= brw_nir_opt_peephole_ffma_block(block, mem_ctx);
-   }
+   state.mem_ctx = ralloc_parent(impl);
+   state.impl = impl;
+   state.progress = false;
+
+   nir_foreach_block(impl, brw_nir_opt_peephole_ffma_block, &state);

-   if (progress)
+   if (state.progress)
       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);

-   return progress;
+   return state.progress;
 }

 bool
@@ -303,7 +290,7 @@ brw_nir_opt_peephole_ffma(nir_shader *shader)
 {
    bool progress = false;

-   nir_foreach_function(function, shader) {
+   nir_foreach_function(shader, function) {
       if (function->impl)
          progress |= brw_nir_opt_peephole_ffma_impl(function->impl);
    }
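One thing the restored TODO ("Maybe bail if this expression is considered 'precise'?") glosses over is that the fusing this pass performs is not value-preserving: an ffma rounds once where mul+add rounds twice. A small standalone demonstration of the numeric effect using C's fmaf() from <math.h> (link with -lm); this illustrates the rounding behavior only and is unrelated to Mesa's implementation:

#include <math.h>
#include <stdio.h>

int
main(void)
{
   float a = 1.0f + 0x1.0p-12f;
   /* fmaf computes a*a exactly before subtracting the rounded a*a,
    * so the single-rounding residual survives... */
   float fused = fmaf(a, a, -a * a);
   /* ...while the split form rounds both products identically: 0. */
   float split = a * a - a * a;
   printf("fused: %a, split: %a\n", fused, split);
   return 0;
}

The fused result is a small nonzero value (the rounding error of a*a), which is exactly the kind of divergence an application-marked "exact"/"precise" expression is supposed to be protected from — the check this hunk deletes on the newer side.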
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
index dab6e0377..3a8617e05 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
@@ -95,7 +95,7 @@ namespace brw {
       vec4_builder
       at_end() const
       {
-         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
+         return at(NULL, (exec_node *)&shader->instructions.tail);
       }

       /**
@@ -373,7 +373,6 @@ namespace brw {
       ALU1(CBIT)
       ALU2(CMPN)
       ALU3(CSEL)
-      ALU1(DIM)
       ALU2(DP2)
       ALU2(DP3)
       ALU2(DP4)
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
index c531fba03..0c8224f5f 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
@@ -68,10 +68,10 @@ opt_cmod_propagation_local(bblock_t *block)
       bool read_flag = false;

       foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
-         if (regions_overlap(inst->src[0], inst->size_read(0),
-                             scan_inst->dst, scan_inst->size_written)) {
+         if (inst->src[0].in_range(scan_inst->dst,
+                                   scan_inst->regs_written)) {
            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
-                scan_inst->dst.offset != inst->src[0].offset ||
+                scan_inst->dst.reg_offset != inst->src[0].reg_offset ||
                (scan_inst->dst.writemask != WRITEMASK_X &&
                 scan_inst->dst.writemask != WRITEMASK_XYZW) ||
                (scan_inst->dst.writemask == WRITEMASK_XYZW &&
@@ -115,18 +115,6 @@ opt_cmod_propagation_local(bblock_t *block)
               break;
            }

-            /* The conditional mod of the CMP/CMPN instructions behaves
-             * specially because the flag output is not calculated from the
-             * result of the instruction, but the other way around, which
-             * means that even if the condmod to propagate and the condmod
-             * from the CMP instruction are the same they will in general give
-             * different results because they are evaluated based on different
-             * inputs.
-             */
-            if (scan_inst->opcode == BRW_OPCODE_CMP ||
-                scan_inst->opcode == BRW_OPCODE_CMPN)
-               break;
-
            /* Otherwise, try propagating the conditional. */
            enum brw_conditional_mod cond =
               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
index 19c685fee..28002c56c 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
@@ -145,7 +145,7 @@ namespace brw {
          vec4_instruction *inst = bld.emit(op, dst, src_reg(payload),
                                            usurface, brw_imm_ud(arg));
          inst->mlen = sz;
-         inst->size_written = ret_sz * REG_SIZE;
+         inst->regs_written = ret_sz;
          inst->header_size = header_sz;
          inst->predicate = pred;

@@ -221,7 +221,7 @@ namespace brw {
                           emit_insert(bld, addr, dims, has_simd4x2),
                           has_simd4x2 ? 1 : dims,
                           emit_insert(bld, src_reg(srcs), size, has_simd4x2),
-                          has_simd4x2 && size ? 1 : size,
+                          has_simd4x2 ? 1 : size,
                           surface, op, rsize, pred);
       }
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 498fb7cfb..9b49b7df8 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -29,7 +29,6 @@
 #include "brw_nir.h"
 #include "brw_vec4_tcs.h"
-#include "brw_fs.h"

 namespace brw {

@@ -49,12 +48,62 @@ vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,

 void
+vec4_tcs_visitor::emit_nir_code()
+{
+   if (key->program_string_id != 0) {
+      /* We have a real application-supplied TCS, emit real code. */
+      vec4_visitor::emit_nir_code();
+   } else {
+      /* There is no TCS; automatically generate a passthrough shader
+       * that writes the API-specified default tessellation levels and
+       * copies VS outputs to TES inputs.
+       */
+      uniforms = 2;
+      uniform_size[0] = 1;
+      uniform_size[1] = 1;
+
+      uint64_t varyings = key->outputs_written;
+
+      src_reg vertex_offset(this, glsl_type::uint_type);
+      emit(MUL(dst_reg(vertex_offset), invocation_id,
+               brw_imm_ud(prog_data->vue_map.num_per_vertex_slots)));
+
+      while (varyings != 0) {
+         const int varying = ffsll(varyings) - 1;
+
+         unsigned in_offset = input_vue_map->varying_to_slot[varying];
+         unsigned out_offset = prog_data->vue_map.varying_to_slot[varying];
+         assert(out_offset >= 2);
+
+         dst_reg val(this, glsl_type::vec4_type);
+         emit_input_urb_read(val, invocation_id, in_offset, src_reg());
+         emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset,
+                        vertex_offset);
+
+         varyings &= ~BITFIELD64_BIT(varying);
+      }
+
+      /* Only write the tessellation factors from invocation 0.
+       * There's no point in making other threads do redundant work.
+       */
+      emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0),
+               BRW_CONDITIONAL_EQ));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 0, src_reg());
+      emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 1, src_reg());
+      emit(BRW_OPCODE_ENDIF);
+   }
+}
+
+void
 vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
 {
 }

 dst_reg *
-vec4_tcs_visitor::make_reg_for_system_value(int location)
+vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type)
 {
    return NULL;
 }
@@ -135,9 +184,7 @@ vec4_tcs_visitor::emit_thread_end()
     * we don't have stride in the vec4 world, nor UV immediates in
     * align16, so we need an opcode to get invocation_id<0,4,0>.
     */
-   set_condmod(BRW_CONDITIONAL_Z,
-               emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
-                    invocation_id));
+   emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
    emit(IF(BRW_PREDICATE_NORMAL));
    for (unsigned i = 0; i < key->input_vertices; i += 2) {
       /* If we have an odd number of input vertices, the last will be
@@ -166,7 +213,6 @@
 void
 vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
                                       const src_reg &vertex_index,
                                       unsigned base_offset,
-                                      unsigned first_component,
                                       const src_reg &indirect_offset)
 {
    vec4_instruction *inst;
@@ -192,16 +238,13 @@ vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
    if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
       emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
    } else {
-      src_reg src = src_reg(temp);
-      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-      emit(MOV(dst, src));
+      emit(MOV(dst, src_reg(temp)));
    }
 }

 void
 vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
                                        unsigned base_offset,
-                                       unsigned first_component,
                                        const src_reg &indirect_offset)
 {
    vec4_instruction *inst;
@@ -217,12 +260,6 @@ vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
    read->offset = base_offset;
    read->mlen = 1;
    read->base_mrf = -1;
-
-   if (first_component) {
-      src_reg src = src_reg(dst);
-      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-      emit(MOV(dst, src));
-   }
 }

 void
@@ -249,6 +286,53 @@ vec4_tcs_visitor::emit_urb_write(const src_reg &value,
    inst->base_mrf = -1;
 }

+static unsigned
+tesslevel_outer_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 4;
+   case GL_TRIANGLES:
+      return 3;
+   case GL_ISOLINES:
+      return 2;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+static unsigned
+tesslevel_inner_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 2;
+   case GL_TRIANGLES:
+      return 1;
+   case GL_ISOLINES:
+      return 0;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+/**
+ * Given a normal .xyzw writemask, convert it to a writemask for a vector
+ * that's stored backwards, i.e. .wzyx.
+ */
+static unsigned
+writemask_for_backwards_vector(unsigned mask)
+{
+   unsigned new_mask = 0;
+
+   for (int i = 0; i < 4; i++)
+      new_mask |= ((mask >> i) & 1) << (3 - i);
+
+   return new_mask;
+}
+
 void
 vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
@@ -271,14 +355,13 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
       src_reg vertex_index =
-         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
                       : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       dst.writemask = brw_writemask_for_size(instr->num_components);

-      emit_input_urb_read(dst, vertex_index, imm_offset,
-                          nir_intrinsic_component(instr), indirect_offset);
+      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
       break;
    }
    case nir_intrinsic_load_input:
@@ -287,7 +370,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_output:
    case nir_intrinsic_load_per_vertex_output: {
       src_reg indirect_offset = get_indirect_offset(instr);
-      unsigned imm_offset = instr->const_index[0];
+      unsigned imm_offset = instr->const_index[0];;

       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       dst.writemask = brw_writemask_for_size(instr->num_components);
@@ -302,15 +385,14 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          case GL_QUADS: {
             /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
             dst_reg tmp(this, glsl_type::vec4_type);
-            emit_output_urb_read(tmp, 0, 0, src_reg());
+            emit_output_urb_read(tmp, 0, src_reg());
             emit(MOV(writemask(dst, WRITEMASK_XY),
                      swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
             break;
          }
          case GL_TRIANGLES:
             /* DWord 4; use offset 1 but normal swizzle/writemask. */
-            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
-                                 src_reg());
+            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
             break;
          case GL_ISOLINES:
             /* All channels are undefined. */
@@ -342,11 +424,10 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          }

          dst_reg tmp(this, glsl_type::vec4_type);
-         emit_output_urb_read(tmp, 1, 0, src_reg());
+         emit_output_urb_read(tmp, 1, src_reg());
          emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
       } else {
-         emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
-                              indirect_offset);
+         emit_output_urb_read(dst, imm_offset, indirect_offset);
       }
       break;
    }
@@ -359,67 +440,53 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];

-      /* The passthrough shader writes the whole patch header as two vec4s;
-       * skip all the gl_TessLevelInner/Outer swizzling.
-       */
-      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
-         if (imm_offset == 0) {
-            value.type = BRW_REGISTER_TYPE_F;
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;

-            mask &=
-               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;
+         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

-            /* This is a write to gl_TessLevelInner[], which lives in the
-             * Patch URB header. The layout depends on the domain.
+         /* This is a write to gl_TessLevelInner[], which lives in the
+          * Patch URB header. The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+             * We use an XXYX swizzle to reverse put .xy in the .wz
+             * channels, and use a .zw writemask.
             */
-            switch (key->tes_primitive_mode) {
-            case GL_QUADS:
-               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
-                * We use an XXYX swizzle to reverse put .xy in the .wz
-                * channels, and use a .zw writemask.
-                */
-               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
-               mask = writemask_for_backwards_vector(mask);
-               break;
-            case GL_TRIANGLES:
-               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
-                * writemask to X and bump the URB offset by 1.
-                */
-               imm_offset = 1;
-               break;
-            case GL_ISOLINES:
-               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
-               return;
-            default:
-               unreachable("Bogus tessellation domain");
-            }
-         } else if (imm_offset == 1) {
-            value.type = BRW_REGISTER_TYPE_F;
-
-            mask &=
-               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
-
-            /* This is a write to gl_TessLevelOuter[] which lives in the
-             * Patch URB Header at DWords 4-7. However, it's reversed, so
-             * instead of .xyzw we have .wzyx.
+            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+            mask = writemask_for_backwards_vector(mask);
+            break;
+         case GL_TRIANGLES:
+            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+             * writemask to X and bump the URB offset by 1.
             */
-            if (key->tes_primitive_mode == GL_ISOLINES) {
-               /* Isolines .xy should be stored in .zw, in order. */
-               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
-               mask <<= 2;
-            } else {
-               /* Other domains are reversed; store .wzyx instead of .xyzw. */
-               swiz = BRW_SWIZZLE_WZYX;
-               mask = writemask_for_backwards_vector(mask);
-            }
+            imm_offset = 1;
+            break;
+         case GL_ISOLINES:
+            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
          }
-      }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;

-      unsigned first_component = nir_intrinsic_component(instr);
-      if (first_component) {
-         assert(swiz == BRW_SWIZZLE_XYZW);
-         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
-         mask = mask << first_component;
+         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelOuter[] which lives in the
+          * Patch URB Header at DWords 4-7. However, it's reversed, so
+          * instead of .xyzw we have .wzyx.
+          */
+         if (key->tes_primitive_mode == GL_ISOLINES) {
+            /* Isolines .xy should be stored in .zw, in order. */
+            swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+            mask <<= 2;
+         } else {
+            /* Other domains are reversed; store .wzyx instead of .xyzw. */
+            swiz = BRW_SWIZZLE_WZYX;
+            mask = writemask_for_backwards_vector(mask);
+         }
       }

       emit_urb_write(swizzle(value, swiz), mask,
@@ -451,36 +518,23 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                 unsigned *final_assembly_size,
                 char **error_str)
 {
-   const struct gen_device_info *devinfo = compiler->devinfo;
+   const struct brw_device_info *devinfo = compiler->devinfo;
    struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];

    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.outputs_written = key->outputs_written;
    nir->info.patch_outputs_written = key->patch_outputs_written;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);

-   struct brw_vue_map input_vue_map;
-   brw_compute_vue_map(devinfo, &input_vue_map,
-                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
-                       true);
+   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);

    brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                             nir->info.outputs_written,
                             nir->info.patch_outputs_written);

-   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
-   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
-   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
-   if (key->quads_workaround)
-      brw_nir_apply_tcs_quads_workaround(nir);
-
-   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
-
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
-
    /* Compute URB entry size. The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
@@ -507,6 +561,11 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    /* URB entry sizes are stored as a multiple of 64 bytes. */
    vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map,
+                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                       true);
+
    /* HS does not use the usual payload pushing from URB to GRFs,
     * because we don't have enough registers for a full-size payload, and
     * the hardware is broken on Haswell anyway.
@@ -520,50 +579,20 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &vue_prog_data->vue_map);
    }

-   if (is_scalar) {
-      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
-                   &prog_data->base.base, NULL, nir, 8,
-                   shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
-         if (error_str)
-            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
-
-      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
-                     &prog_data->base.base, v.promoted_constants, false,
-                     MESA_SHADER_TESS_CTRL);
-      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
-         g.enable_debug(ralloc_asprintf(mem_ctx,
-                                        "%s tessellation control shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, 8);
-
-      return g.get_assembly(final_assembly_size);
-   } else {
-      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
-                         nir, mem_ctx, shader_time_index, &input_vue_map);
-      if (!v.run()) {
-         if (error_str)
-            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
-         v.dump_instructions();
+   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                      nir, mem_ctx, shader_time_index, &input_vue_map);
+   if (!v.run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+      return NULL;
+   }

+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      v.dump_instructions();

-      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                        &prog_data->base, v.cfg,
-                                        final_assembly_size);
-   }
+   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                     &prog_data->base, v.cfg,
+                                     final_assembly_size);
 }
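writemask_for_backwards_vector(), added back in brw_vec4_tcs.cpp above, mirrors a 4-bit writemask because the tessellation factors sit .wzyx-reversed in the patch URB header. The function is small enough to sanity-check standalone; mask values here are the usual one-bit-per-channel .xyzw encoding:

#include <stdio.h>

static unsigned
writemask_for_backwards_vector(unsigned mask)
{
   unsigned new_mask = 0;

   for (int i = 0; i < 4; i++)
      new_mask |= ((mask >> i) & 1) << (3 - i);

   return new_mask;
}

int
main(void)
{
   /* .xy (0b0011) becomes .zw (0b1100); .x alone becomes .w. */
   printf("0x%x -> 0x%x\n", 0x3, writemask_for_backwards_vector(0x3));
   printf("0x%x -> 0x%x\n", 0x1, writemask_for_backwards_vector(0x1));
   return 0;
}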
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
index 030eb5e66..2c6801b2a 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
@@ -49,7 +49,9 @@ public:
                     const struct brw_vue_map *input_vue_map);

 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void emit_nir_code();
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void setup_payload();
    virtual void emit_prolog();
@@ -60,11 +62,9 @@ protected:
    void emit_input_urb_read(const dst_reg &dst,
                             const src_reg &vertex_index,
                             unsigned base_offset,
-                            unsigned first_component,
                             const src_reg &indirect_offset);
    void emit_output_urb_read(const dst_reg &dst,
                              unsigned base_offset,
-                             unsigned first_component,
                              const src_reg &indirect_offset);

    void emit_urb_write(const src_reg &value, unsigned writemask,
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
index 226dcb4f6..7ba494fbf 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
@@ -46,7 +46,7 @@ vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,

 dst_reg *
-vec4_tes_visitor::make_reg_for_system_value(int location)
+vec4_tes_visitor::make_reg_for_system_value(int location, const glsl_type *type)
 {
    return NULL;
 }
@@ -177,9 +177,7 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_per_vertex_input: {
       src_reg indirect_offset = get_indirect_offset(instr);
-      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       unsigned imm_offset = instr->const_index[0];
-      unsigned first_component = nir_intrinsic_component(instr);
       src_reg header = input_read_header;
       if (indirect_offset.file != BAD_FILE) {
@@ -192,10 +190,8 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
-            src_reg src = src_reg(ATTR, imm_offset, glsl_type::ivec4_type);
-            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-
-            emit(MOV(dst, src));
+            emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+                     src_reg(ATTR, imm_offset, glsl_type::ivec4_type)));

            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + 1, 2));
@@ -209,14 +205,12 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

-         src_reg src = src_reg(temp);
-         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-
         /* Copy to target. We might end up with some funky writemasks landing
          * in here, but we really don't want them in the above pseudo-ops.
          */
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit(MOV(dst, src));
+         emit(MOV(dst, src_reg(temp)));
         break;
      }
      default:
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
index 31a28f359..4b697aa59 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
@@ -47,7 +47,8 @@ public:
                     int shader_time_index);

 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
index 1323b6507..8d4a447a8 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
@@ -36,9 +36,10 @@ class cmod_propagation_test : public ::testing::Test {
 public:
    struct brw_compiler *compiler;
-   struct gen_device_info *devinfo;
+   struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
+   struct brw_vertex_program *vp;
    struct brw_vue_prog_data *prog_data;
    vec4_visitor *v;
 };
@@ -57,7 +58,8 @@ public:

 protected:
    /* Dummy implementation for pure virtual methods */
-   virtual dst_reg *make_reg_for_system_value(int location)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }
@@ -98,14 +100,18 @@ void cmod_propagation_test::SetUp()
 {
    ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
    compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
-   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
    prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
    compiler->devinfo = devinfo;

+   vp = ralloc(NULL, struct brw_vertex_program);
+
    nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);

    v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data);

+   _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
+
    devinfo->gen = 4;
 }
@@ -370,7 +376,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
    src_reg zero(brw_imm_f(0.0f));
    bld.ADD(offset(dest, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dest, src2)
-      ->size_written = 4 * REG_SIZE;
+      ->regs_written = 4;
    bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 2), zero,
            BRW_CONDITIONAL_GE);

    /* = Before = |