author     Jonathan Gray <jsg@cvs.openbsd.org>    2018-10-23 05:50:09 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2018-10-23 05:50:09 +0000
commit     0873e165a9260f88c3f4c9d0bbce99bc73c4d337 (patch)
tree       458734023a92c02a6f23d5f1e82d1ddb1c9ceffa /lib
parent     110e9058364942848eb29000891046b3f56749ee (diff)
Import Mesa 17.3.9
Diffstat (limited to 'lib')
-rw-r--r--  lib/mesa/src/amd/vulkan/radv_shader.c | 1400
1 file changed, 279 insertions, 1121 deletions
diff --git a/lib/mesa/src/amd/vulkan/radv_shader.c b/lib/mesa/src/amd/vulkan/radv_shader.c index d7e2bce06..83e2e675e 100644 --- a/lib/mesa/src/amd/vulkan/radv_shader.c +++ b/lib/mesa/src/amd/vulkan/radv_shader.c @@ -30,41 +30,28 @@ #include "radv_debug.h" #include "radv_private.h" #include "radv_shader.h" -#include "radv_shader_helper.h" -#include "radv_shader_args.h" #include "nir/nir.h" #include "nir/nir_builder.h" #include "spirv/nir_spirv.h" #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> -#include <llvm-c/Support.h> #include "sid.h" +#include "gfx9d.h" #include "ac_binary.h" #include "ac_llvm_util.h" #include "ac_nir_to_llvm.h" -#include "ac_rtld.h" #include "vk_format.h" #include "util/debug.h" #include "ac_exp_param.h" -#include "aco_interface.h" - -#include "util/string_buffer.h" - -static const struct nir_shader_compiler_options nir_options_llvm = { +static const struct nir_shader_compiler_options nir_options = { .vertex_id_zero_based = true, .lower_scmp = true, - .lower_flrp16 = true, .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_device_index_to_zero = true, .lower_fsat = true, .lower_fdiv = true, - .lower_fmod = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, .lower_sub = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, @@ -77,81 +64,9 @@ static const struct nir_shader_compiler_options nir_options_llvm = { .lower_extract_byte = true, .lower_extract_word = true, .lower_ffma = true, - .lower_fpow = true, - .lower_mul_2x32_64 = true, - .lower_rotate = true, - .max_unroll_iterations = 32, - .use_interpolated_input_intrinsics = true, - /* nir_lower_int64() isn't actually called for the LLVM backend, but - * this helps the loop unrolling heuristics. */ - .lower_int64_options = nir_lower_imul64 | - nir_lower_imul_high64 | - nir_lower_imul_2x32_64 | - nir_lower_divmod64 | - nir_lower_minmax64 | - nir_lower_iabs64, + .max_unroll_iterations = 32 }; -static const struct nir_shader_compiler_options nir_options_aco = { - .vertex_id_zero_based = true, - .lower_scmp = true, - .lower_flrp16 = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_device_index_to_zero = true, - .lower_fdiv = true, - .lower_fmod = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_snorm_4x8 = true, - .lower_pack_unorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_unpack_half_2x16 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_ffma = true, - .lower_fpow = true, - .lower_mul_2x32_64 = true, - .lower_rotate = true, - .max_unroll_iterations = 32, - .use_interpolated_input_intrinsics = true, - .lower_int64_options = nir_lower_imul64 | - nir_lower_imul_high64 | - nir_lower_imul_2x32_64 | - nir_lower_divmod64 | - nir_lower_logic64 | - nir_lower_minmax64 | - nir_lower_iabs64, -}; - -bool -radv_can_dump_shader(struct radv_device *device, - struct radv_shader_module *module, - bool is_gs_copy_shader) -{ - if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) - return false; - if (module) - return !module->nir || - (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS); - - return is_gs_copy_shader; -} - -bool -radv_can_dump_shader_stats(struct radv_device *device, - struct radv_shader_module *module) -{ - /* Only dump non-meta shader stats. 
*/ - return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && - module && !module->nir; -} - VkResult radv_CreateShaderModule( VkDevice _device, const VkShaderModuleCreateInfo* pCreateInfo, @@ -168,7 +83,7 @@ VkResult radv_CreateShaderModule( sizeof(*module) + pCreateInfo->codeSize, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (module == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); module->nir = NULL; module->size = pCreateInfo->codeSize; @@ -195,40 +110,56 @@ void radv_DestroyShaderModule( vk_free2(&device->alloc, pAllocator, module); } +bool +radv_lower_indirect_derefs(struct nir_shader *nir, + struct radv_physical_device *device) +{ + /* While it would be nice not to have this flag, we are constrained + * by the reality that LLVM 5.0 doesn't have working VGPR indexing + * on GFX9. + */ + bool llvm_has_working_vgpr_indexing = + device->rad_info.chip_class <= VI; + + /* TODO: Indirect indexing of GS inputs is unimplemented. + * + * TCS and TES load inputs directly from LDS or offchip memory, so + * indirect indexing is trivial. + */ + nir_variable_mode indirect_mask = 0; + if (nir->info.stage == MESA_SHADER_GEOMETRY || + (nir->info.stage != MESA_SHADER_TESS_CTRL && + nir->info.stage != MESA_SHADER_TESS_EVAL && + !llvm_has_working_vgpr_indexing)) { + indirect_mask |= nir_var_shader_in; + } + if (!llvm_has_working_vgpr_indexing && + nir->info.stage != MESA_SHADER_TESS_CTRL) + indirect_mask |= nir_var_shader_out; + + /* TODO: We shouldn't need to do this, however LLVM isn't currently + * smart enough to handle indirects without causing excess spilling + * causing the gpu to hang. + * + * See the following thread for more details of the problem: + * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html + */ + indirect_mask |= nir_var_local; + + return nir_lower_indirect_derefs(nir, indirect_mask); +} + void -radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, - bool allow_copies) +radv_optimize_nir(struct nir_shader *shader) { bool progress; - unsigned lower_flrp = - (shader->options->lower_flrp16 ? 16 : 0) | - (shader->options->lower_flrp32 ? 32 : 0) | - (shader->options->lower_flrp64 ? 64 : 0); do { progress = false; - NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp); - NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp); - NIR_PASS_V(shader, nir_lower_vars_to_ssa); - NIR_PASS_V(shader, nir_lower_pack); - - if (allow_copies) { - /* Only run this pass in the first call to - * radv_optimize_nir. Later calls assume that we've - * lowered away any copy_deref instructions and we - * don't want to introduce any more. 
- */ - NIR_PASS(progress, shader, nir_opt_find_array_copies); - } - - NIR_PASS(progress, shader, nir_opt_copy_prop_vars); - NIR_PASS(progress, shader, nir_opt_dead_write_vars); - NIR_PASS(progress, shader, nir_remove_dead_variables, - nir_var_function_temp | nir_var_shader_in | nir_var_shader_out); - - NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS_V(shader, nir_lower_64bit_pack); + NIR_PASS_V(shader, nir_lower_alu_to_scalar); NIR_PASS_V(shader, nir_lower_phis_to_scalar); NIR_PASS(progress, shader, nir_copy_prop); @@ -240,53 +171,18 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, NIR_PASS(progress, shader, nir_opt_remove_phis); NIR_PASS(progress, shader, nir_opt_dce); } - NIR_PASS(progress, shader, nir_opt_if, true); + NIR_PASS(progress, shader, nir_opt_if); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); - NIR_PASS(progress, shader, nir_opt_constant_folding); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8); NIR_PASS(progress, shader, nir_opt_algebraic); - - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - NIR_PASS(lower_flrp_progress, - shader, - nir_lower_flrp, - lower_flrp, - false /* always_precise */, - shader->options->lower_ffma); - if (lower_flrp_progress) { - NIR_PASS(progress, shader, - nir_opt_constant_folding); - progress = true; - } - - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - lower_flrp = 0; - } - + NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_undef); + NIR_PASS(progress, shader, nir_opt_conditional_discard); if (shader->options->max_unroll_iterations) { NIR_PASS(progress, shader, nir_opt_loop_unroll, 0); } - } while (progress && !optimize_conservatively); - - NIR_PASS(progress, shader, nir_opt_conditional_discard); - NIR_PASS(progress, shader, nir_opt_shrink_load); - NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo); -} - -static void -shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) -{ - assert(glsl_type_is_vector_or_scalar(type)); - - uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; - unsigned length = glsl_get_vector_elements(type); - *size = comp_size * length, - *align = comp_size; + } while (progress); } nir_shader * @@ -294,30 +190,31 @@ radv_shader_compile_to_nir(struct radv_device *device, struct radv_shader_module *module, const char *entrypoint_name, gl_shader_stage stage, - const VkSpecializationInfo *spec_info, - const VkPipelineCreateFlags flags, - const struct radv_pipeline_layout *layout, - bool use_aco, - unsigned subgroup_size, unsigned ballot_bit_size) + const VkSpecializationInfo *spec_info) { + if (strcmp(entrypoint_name, "main") != 0) { + radv_finishme("Multiple shaders per module not really supported"); + } + nir_shader *nir; - const nir_shader_compiler_options *nir_options = use_aco ? &nir_options_aco : - &nir_options_llvm; + nir_function *entry_point; if (module->nir) { /* Some things such as our meta clear/blit code will give us a NIR * shader directly. 
In that case, we just ignore the SPIR-V entirely * and just use the NIR shader */ nir = module->nir; - nir->options = nir_options; - nir_validate_shader(nir, "in internal shader"); + nir->options = &nir_options; + nir_validate_shader(nir); assert(exec_list_length(&nir->functions) == 1); + struct exec_node *node = exec_list_get_head(&nir->functions); + entry_point = exec_node_data(nir_function, node, node); } else { uint32_t *spirv = (uint32_t *) module->data; assert(module->size % 4 == 0); if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV) - radv_print_spirv(module->data, module->size, stderr); + radv_print_spirv(spirv, module->size, stderr); uint32_t num_spec_entries = 0; struct nir_spirv_specialization *spec_entries = NULL; @@ -330,85 +227,28 @@ radv_shader_compile_to_nir(struct radv_device *device, assert(data + entry.size <= spec_info->pData + spec_info->dataSize); spec_entries[i].id = spec_info->pMapEntries[i].constantID; - switch (entry.size) { - case 8: + if (spec_info->dataSize == 8) spec_entries[i].data64 = *(const uint64_t *)data; - break; - case 4: + else spec_entries[i].data32 = *(const uint32_t *)data; - break; - case 2: - spec_entries[i].data32 = *(const uint16_t *)data; - break; - case 1: - spec_entries[i].data32 = *(const uint8_t *)data; - break; - default: - assert(!"Invalid spec constant size"); - break; - } } } - const struct spirv_to_nir_options spirv_options = { - .lower_ubo_ssbo_access_to_offsets = true, - .caps = { - .amd_fragment_mask = true, - .amd_gcn_shader = true, - .amd_image_read_write_lod = true, - .amd_shader_ballot = device->physical_device->use_shader_ballot, - .amd_shader_explicit_vertex_parameter = true, - .amd_trinary_minmax = true, - .demote_to_helper_invocation = device->physical_device->use_aco, - .derivative_group = true, - .descriptor_array_dynamic_indexing = true, - .descriptor_array_non_uniform_indexing = true, - .descriptor_indexing = true, - .device_group = true, - .draw_parameters = true, - .float_controls = true, - .float16 = !device->physical_device->use_aco, - .float64 = true, - .geometry_streams = true, - .image_ms_array = true, - .image_read_without_format = true, - .image_write_without_format = true, - .int8 = !device->physical_device->use_aco, - .int16 = !device->physical_device->use_aco, - .int64 = true, - .int64_atomics = true, - .multiview = true, - .physical_storage_buffer_address = true, - .post_depth_coverage = true, - .runtime_descriptor_array = true, - .shader_clock = true, - .shader_viewport_index_layer = true, - .stencil_export = true, - .storage_8bit = !device->physical_device->use_aco, - .storage_16bit = !device->physical_device->use_aco, - .storage_image_ms = true, - .subgroup_arithmetic = true, - .subgroup_ballot = true, - .subgroup_basic = true, - .subgroup_quad = true, - .subgroup_shuffle = true, - .subgroup_vote = true, - .tessellation = true, - .transform_feedback = true, - .variable_pointers = true, - }, - .ubo_addr_format = nir_address_format_32bit_index_offset, - .ssbo_addr_format = nir_address_format_32bit_index_offset, - .phys_ssbo_addr_format = nir_address_format_64bit_global, - .push_const_addr_format = nir_address_format_logical, - .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = true, + const struct nir_spirv_supported_extensions supported_ext = { + .draw_parameters = true, + .float64 = true, + .image_read_without_format = true, + .image_write_without_format = true, + .tessellation = true, + .int64 = true, + .multiview = true, + .variable_pointers = true, }; - nir = 
spirv_to_nir(spirv, module->size / 4, - spec_entries, num_spec_entries, - stage, entrypoint_name, - &spirv_options, nir_options); + entry_point = spirv_to_nir(spirv, module->size / 4, + spec_entries, num_spec_entries, + stage, entrypoint_name, &supported_ext, &nir_options); + nir = entry_point->shader; assert(nir->info.stage == stage); - nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_shader(nir); free(spec_entries); @@ -416,214 +256,50 @@ radv_shader_compile_to_nir(struct radv_device *device, * inline functions. That way they get properly initialized at the top * of the function and not at the top of its caller. */ - NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local); NIR_PASS_V(nir, nir_lower_returns); NIR_PASS_V(nir, nir_inline_functions); - NIR_PASS_V(nir, nir_opt_deref); /* Pick off the single entrypoint that we want */ foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func->is_entrypoint) - func->name = ralloc_strdup(func, "main"); - else + if (func != entry_point) exec_node_remove(&func->node); } assert(exec_list_length(&nir->functions) == 1); + entry_point->name = ralloc_strdup(entry_point, "main"); - /* Make sure we lower constant initializers on output variables so that - * nir_remove_dead_variables below sees the corresponding stores - */ - NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_shader_out); + NIR_PASS_V(nir, nir_remove_dead_variables, + nir_var_shader_in | nir_var_shader_out | nir_var_system_value); /* Now that we've deleted all but the main function, we can go ahead and * lower the rest of the constant initializers. */ NIR_PASS_V(nir, nir_lower_constant_initializers, ~0); - - /* Split member structs. We do this before lower_io_to_temporaries so that - * it doesn't lower system values to temporaries by accident. 
- */ - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_split_per_member_structs); - - if (nir->info.stage == MESA_SHADER_FRAGMENT && use_aco) - NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); - if (nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS_V(nir, nir_lower_input_attachments, true); - - NIR_PASS_V(nir, nir_remove_dead_variables, - nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared); - - NIR_PASS_V(nir, nir_propagate_invariant); - NIR_PASS_V(nir, nir_lower_system_values); NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); - NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout); } /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; - nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - - if (nir->info.stage == MESA_SHADER_GEOMETRY && use_aco) - nir_lower_gs_intrinsics(nir, true); + nir_shader_gather_info(nir, entry_point->impl); static const nir_lower_tex_options tex_options = { .lower_txp = ~0, - .lower_tg4_offsets = true, }; nir_lower_tex(nir, &tex_options); nir_lower_vars_to_ssa(nir); - - if (nir->info.stage == MESA_SHADER_VERTEX || - nir->info.stage == MESA_SHADER_GEOMETRY || - nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), true, true); - } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) { - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), true, false); - } - - nir_split_var_copies(nir); - - nir_lower_global_vars_to_local(nir); - nir_remove_dead_variables(nir, nir_var_function_temp); - bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7; - nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) { - .subgroup_size = subgroup_size, - .ballot_bit_size = ballot_bit_size, - .lower_to_scalar = 1, - .lower_subgroup_masks = 1, - .lower_shuffle = 1, - .lower_shuffle_to_32bit = 1, - .lower_vote_eq_to_ballot = 1, - .lower_quad_broadcast_dynamic = 1, - .lower_quad_broadcast_dynamic_to_const = gfx7minus, - }); - - nir_lower_load_const_to_scalar(nir); - - if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) - radv_optimize_nir(nir, false, true); - - /* We call nir_lower_var_copies() after the first radv_optimize_nir() - * to remove any copies introduced by nir_opt_find_array_copies(). - */ nir_lower_var_copies(nir); - - /* Lower deref operations for compute shared memory. */ - if (nir->info.stage == MESA_SHADER_COMPUTE) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared, shared_var_info); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared, nir_address_format_32bit_offset); - } - - /* Lower large variables that are always constant with load_constant - * intrinsics, which get turned into PC-relative loads from a data - * section next to the shader. - */ - NIR_PASS_V(nir, nir_opt_large_constants, - glsl_get_natural_size_align_bytes, 16); - - /* Indirect lowering must be called after the radv_optimize_nir() loop - * has been called at least once. Otherwise indirect lowering can - * bloat the instruction count of the loop and cause it to be - * considered too large for unrolling. 
- */ - ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class); - radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT, false); + nir_lower_global_vars_to_local(nir); + nir_remove_dead_variables(nir, nir_var_local); + radv_lower_indirect_derefs(nir, device->physical_device); + radv_optimize_nir(nir); return nir; } -static int -type_size_vec4(const struct glsl_type *type, bool bindless) -{ - return glsl_count_attribute_slots(type, false); -} - -static nir_variable * -find_layer_in_var(nir_shader *nir) -{ - nir_foreach_variable(var, &nir->inputs) { - if (var->data.location == VARYING_SLOT_LAYER) { - return var; - } - } - - nir_variable *var = - nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id"); - var->data.location = VARYING_SLOT_LAYER; - var->data.interpolation = INTERP_MODE_FLAT; - return var; -} - -/* We use layered rendering to implement multiview, which means we need to map - * view_index to gl_Layer. The attachment lowering also uses needs to know the - * layer so that it can sample from the correct layer. The code generates a - * load from the layer_id sysval, but since we don't have a way to get at this - * information from the fragment shader, we also need to lower this to the - * gl_Layer varying. This pass lowers both to a varying load from the LAYER - * slot, before lowering io, so that nir_assign_var_locations() will give the - * LAYER varying the correct driver_location. - */ - -static bool -lower_view_index(nir_shader *nir) -{ - bool progress = false; - nir_function_impl *entry = nir_shader_get_entrypoint(nir); - nir_builder b; - nir_builder_init(&b, entry); - - nir_variable *layer = NULL; - nir_foreach_block(block, entry) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); - if (load->intrinsic != nir_intrinsic_load_view_index && - load->intrinsic != nir_intrinsic_load_layer_id) - continue; - - if (!layer) - layer = find_layer_in_var(nir); - - b.cursor = nir_before_instr(instr); - nir_ssa_def *def = nir_load_var(&b, layer); - nir_ssa_def_rewrite_uses(&load->dest.ssa, - nir_src_for_ssa(def)); - - nir_instr_remove(instr); - progress = true; - } - } - - return progress; -} - -void -radv_lower_fs_io(nir_shader *nir) -{ - NIR_PASS_V(nir, lower_view_index); - nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs, - MESA_SHADER_FRAGMENT); - - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); - - /* This pass needs actual constants */ - nir_opt_constant_folding(nir); - - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in); -} - - void * radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader) @@ -655,11 +331,7 @@ radv_alloc_shader_memory(struct radv_device *device, slab->size = 256 * 1024; slab->bo = device->ws->buffer_create(device->ws, slab->size, 256, - RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_INTERPROCESS_SHARING | - (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? - 0 : RADEON_FLAG_READ_ONLY), - RADV_BO_PRIORITY_SHADER); + RADEON_DOMAIN_VRAM, 0); slab->ptr = (char*)device->ws->buffer_map(slab->bo); list_inithead(&slab->shaders); @@ -683,564 +355,178 @@ radv_destroy_shader_slabs(struct radv_device *device) mtx_destroy(&device->shader_slab_mutex); } -/* For the UMR disassembler. 
*/ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 - -static unsigned -radv_get_shader_binary_size(size_t code_size) -{ - return code_size + DEBUGGER_NUM_MARKERS * 4; -} - -static void radv_postprocess_config(const struct radv_physical_device *pdevice, - const struct ac_shader_config *config_in, - const struct radv_shader_info *info, - gl_shader_stage stage, - struct ac_shader_config *config_out) +static void +radv_fill_shader_variant(struct radv_device *device, + struct radv_shader_variant *variant, + struct ac_shader_binary *binary, + gl_shader_stage stage) { - bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; + bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; unsigned vgpr_comp_cnt = 0; - unsigned num_input_vgprs = info->num_input_vgprs; - if (stage == MESA_SHADER_FRAGMENT) { - num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL); - } + if (scratch_enabled && !device->llvm_supports_spill) + radv_finishme("shader scratch support only available with LLVM 4.0"); - unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs); - /* +3 for scratch wave offset and VCC */ - unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3); - unsigned num_shared_vgprs = config_in->num_shared_vgprs; - /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */ - assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) - || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0)); - unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8; - - *config_out = *config_in; - config_out->num_vgprs = num_vgprs; - config_out->num_sgprs = num_sgprs; - config_out->num_shared_vgprs = num_shared_vgprs; - - config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) | - S_00B12C_SCRATCH_EN(scratch_enabled); - - if (!pdevice->use_ngg_streamout) { - config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | - S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | - S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | - S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) | - S_00B12C_SO_EN(!!info->so.num_outputs); - } - - config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / - (info->wave_size == 32 ? 8 : 4)) | - S_00B848_DX10_CLAMP(1) | - S_00B848_FLOAT_MODE(config_out->float_mode); + variant->code_size = binary->code_size; + variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | + S_00B12C_SCRATCH_EN(scratch_enabled); - if (pdevice->rad_info.chip_class >= GFX10) { - config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5); - } else { - config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8); - config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5); - } + variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) | + S_00B848_DX10_CLAMP(1) | + S_00B848_FLOAT_MODE(variant->config.float_mode); switch (stage) { case MESA_SHADER_TESS_EVAL: - if (info->is_ngg) { - config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1); - } else if (info->tes.as_es) { - assert(pdevice->rad_info.chip_class <= GFX8); - vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; - - config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); - } else { - bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; - vgpr_comp_cnt = enable_prim_id ? 
3 : 2; - - config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); - } - config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + vgpr_comp_cnt = 3; + variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); break; case MESA_SHADER_TESS_CTRL: - if (pdevice->rad_info.chip_class >= GFX9) { - /* We need at least 2 components for LS. - * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). - * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. - */ - if (pdevice->rad_info.chip_class >= GFX10) { - vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1; - } else { - vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; - } - } else { - config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); - } - config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | - S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + if (device->physical_device->rad_info.chip_class >= GFX9) + vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; + else + variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); break; case MESA_SHADER_VERTEX: - if (info->is_ngg) { - config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); - } else if (info->vs.as_ls) { - assert(pdevice->rad_info.chip_class <= GFX8); - /* We need at least 2 components for LS. - * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). - * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. - */ - vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; - } else if (info->vs.as_es) { - assert(pdevice->rad_info.chip_class <= GFX8); - /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ - vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0; - } else { - /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID) - * If PrimID is disabled. InstanceID / StepRate1 is loaded instead. - * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. - */ - if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { - vgpr_comp_cnt = 3; - } else if (info->vs.export_prim_id) { - vgpr_comp_cnt = 2; - } else if (info->vs.needs_instance_id) { - vgpr_comp_cnt = 1; - } else { - vgpr_comp_cnt = 0; - } - - config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); - } + case MESA_SHADER_GEOMETRY: + vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; break; case MESA_SHADER_FRAGMENT: - config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); - break; - case MESA_SHADER_GEOMETRY: - config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | - S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_COMPUTE: - config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | - S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); - config_out->rsrc2 |= - S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | - S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) | - S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) | - S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 2 : - info->cs.uses_thread_id[1] ? 
1 : 0) | - S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) | - S_00B84C_LDS_SIZE(config_in->lds_size); - config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks); - + variant->rsrc2 |= + S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_LDS_SIZE(variant->config.lds_size); break; default: unreachable("unsupported shader type"); break; } - if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg && - (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_GEOMETRY)) { - unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; - gl_shader_stage es_stage = stage; - if (stage == MESA_SHADER_GEOMETRY) - es_stage = info->gs.es_type; - - /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ - if (es_stage == MESA_SHADER_VERTEX) { - es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0; - } else if (es_stage == MESA_SHADER_TESS_EVAL) { - bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; - es_vgpr_comp_cnt = enable_prim_id ? 3 : 2; - } else - unreachable("Unexpected ES shader stage"); - - bool tes_triangles = stage == MESA_SHADER_TESS_EVAL && - info->tes.primitive_mode >= 4; /* GL_TRIANGLES */ - if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) { - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - } else if (info->uses_prim_id) { - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - } else if (info->gs.vertices_in >= 3 || tes_triangles) { - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - } else { - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - } - - config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | - S_00B228_WGP_MODE(1); - config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_LDS_SIZE(config_in->lds_size) | - S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL); - } else if (pdevice->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_GEOMETRY) { - unsigned es_type = info->gs.es_type; - unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; - - if (es_type == MESA_SHADER_VERTEX) { - /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ - if (info->vs.needs_instance_id) { - es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1; - } else { - es_vgpr_comp_cnt = 0; - } - } else if (es_type == MESA_SHADER_TESS_EVAL) { - es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; - } else { - unreachable("invalid shader ES type"); - } - - /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and - * VGPR[0:4] are always loaded. - */ - if (info->uses_invocation_id) { - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - } else if (info->uses_prim_id) { - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ - } else if (info->gs.vertices_in >= 3) { - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - } else { - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - } - - config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); - config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL); - } else if (pdevice->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_TESS_CTRL) { - config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); - } else { - config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); - } -} - -struct radv_shader_variant * -radv_shader_variant_create(struct radv_device *device, - const struct radv_shader_binary *binary, - bool keep_shader_info) -{ - struct ac_shader_config config = {0}; - struct ac_rtld_binary rtld_binary = {0}; - struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant)); - if (!variant) - return NULL; - - variant->ref_count = 1; - - if (binary->type == RADV_BINARY_TYPE_RTLD) { - struct ac_rtld_symbol lds_symbols[2]; - unsigned num_lds_symbols = 0; - const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data; - size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size; - - if (device->physical_device->rad_info.chip_class >= GFX9 && - (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) && - !binary->is_gs_copy_shader) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "esgs_ring"; - sym->size = binary->info.ngg_info.esgs_ring_size; - sym->align = 64 * 1024; - } - - if (binary->info.is_ngg && - binary->stage == MESA_SHADER_GEOMETRY) { - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_emit"; - sym->size = binary->info.ngg_info.ngg_emit_size * 4; - sym->align = 4; - } - - struct ac_rtld_open_info open_info = { - .info = &device->physical_device->rad_info, - .shader_type = binary->stage, - .wave_size = binary->info.wave_size, - .num_parts = 1, - .elf_ptrs = &elf_data, - .elf_sizes = &elf_size, - .num_shared_lds_symbols = num_lds_symbols, - .shared_lds_symbols = lds_symbols, - }; - - if (!ac_rtld_open(&rtld_binary, open_info)) { - free(variant); - return NULL; - } - - if (!ac_rtld_read_config(&rtld_binary, &config)) { - ac_rtld_close(&rtld_binary); - free(variant); - return NULL; - } - - /* Enable 64-bit and 16-bit denormals, because there is no performance - * cost. - * - * If denormals are enabled, all floating-point output modifiers are - * ignored. - * - * Don't enable denormals for 32-bit floats, because: - * - Floating-point output modifiers would be ignored by the hw. - * - Some opcodes don't support denormals, such as v_mad_f32. We would - * have to stop using those. - * - GFX6 & GFX7 would be very slow. - */ - config.float_mode |= V_00B028_FP_64_DENORMS; - - if (rtld_binary.lds_size > 0) { - unsigned alloc_granularity = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; - config.lds_size = align(rtld_binary.lds_size, alloc_granularity) / alloc_granularity; - } - - variant->code_size = rtld_binary.rx_size; - variant->exec_size = rtld_binary.exec_size; - } else { - assert(binary->type == RADV_BINARY_TYPE_LEGACY); - config = ((struct radv_shader_binary_legacy *)binary)->config; - variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); - variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size; - } - - variant->info = binary->info; - radv_postprocess_config(device->physical_device, &config, &binary->info, - binary->stage, &variant->config); - - if (radv_device_use_secure_compile(device->instance)) { - if (binary->type == RADV_BINARY_TYPE_RTLD) - ac_rtld_close(&rtld_binary); - - return variant; - } - - void *dest_ptr = radv_alloc_shader_memory(device, variant); - - if (binary->type == RADV_BINARY_TYPE_RTLD) { - struct radv_shader_binary_rtld* bin = (struct radv_shader_binary_rtld *)binary; - struct ac_rtld_upload_info info = { - .binary = &rtld_binary, - .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset, - .rx_ptr = dest_ptr, - }; - - if (!ac_rtld_upload(&info)) { - radv_shader_variant_destroy(device, variant); - ac_rtld_close(&rtld_binary); - return NULL; - } - - if (keep_shader_info || - (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) { - const char *disasm_data; - size_t disasm_size; - if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data, &disasm_size)) { - radv_shader_variant_destroy(device, variant); - ac_rtld_close(&rtld_binary); - return NULL; - } - - variant->ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL; - variant->disasm_string = malloc(disasm_size + 1); - memcpy(variant->disasm_string, disasm_data, disasm_size); - variant->disasm_string[disasm_size] = 0; - } - - ac_rtld_close(&rtld_binary); - } else { - struct radv_shader_binary_legacy* bin = (struct radv_shader_binary_legacy *)binary; - memcpy(dest_ptr, bin->data, bin->code_size); - - /* Add end-of-code markers for the UMR disassembler. */ - uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4; - for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) - ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; - - variant->ir_string = bin->ir_size ? strdup((const char*)(bin->data + bin->code_size)) : NULL; - variant->disasm_string = bin->disasm_size ? strdup((const char*)(bin->data + bin->code_size + bin->ir_size)) : NULL; - } - return variant; -} - -static char * -radv_dump_nir_shaders(struct nir_shader * const *shaders, - int shader_count) -{ - char *data = NULL; - char *ret = NULL; - size_t size = 0; - FILE *f = open_memstream(&data, &size); - if (f) { - for (int i = 0; i < shader_count; ++i) - nir_print_shader(shaders[i], f); - fclose(f); - } + if (device->physical_device->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_GEOMETRY) { + /* TODO: Figure out how many we actually need. 
*/ + variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(3); + variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(3) | + S_00B22C_OC_LDS_EN(1); + } else if (device->physical_device->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_TESS_CTRL) + variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); + else + variant->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); - ret = malloc(size + 1); - if (ret) { - memcpy(ret, data, size); - ret[size] = 0; - } - free(data); - return ret; + void *ptr = radv_alloc_shader_memory(device, variant); + memcpy(ptr, binary->code, binary->code_size); } static struct radv_shader_variant * -shader_variant_compile(struct radv_device *device, - struct radv_shader_module *module, - struct nir_shader * const *shaders, - int shader_count, - gl_shader_stage stage, - struct radv_shader_info *info, - struct radv_nir_compiler_options *options, - bool gs_copy_shader, - bool keep_shader_info, - bool use_aco, - struct radv_shader_binary **binary_out) +shader_variant_create(struct radv_device *device, + struct radv_shader_module *module, + struct nir_shader * const *shaders, + int shader_count, + gl_shader_stage stage, + struct ac_nir_compiler_options *options, + bool gs_copy_shader, + void **code_out, + unsigned *code_size_out) { enum radeon_family chip_family = device->physical_device->rad_info.family; - struct radv_shader_binary *binary = NULL; + bool dump_shaders = device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS; + enum ac_target_machine_options tm_options = 0; + struct radv_shader_variant *variant; + struct ac_shader_binary binary; + LLVMTargetMachineRef tm; + + variant = calloc(1, sizeof(struct radv_shader_variant)); + if (!variant) + return NULL; options->family = chip_family; options->chip_class = device->physical_device->rad_info.chip_class; - options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader); - options->dump_preoptir = options->dump_shader && - device->instance->debug_flags & RADV_DEBUG_PREOPTIR; - options->record_ir = keep_shader_info; - options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; - options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; - options->address32_hi = device->physical_device->rad_info.address32_hi; - options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug; - options->use_ngg_streamout = device->physical_device->use_ngg_streamout; - - struct radv_shader_args args = {}; - args.options = options; - args.shader_info = info; - args.is_gs_copy_shader = gs_copy_shader; - radv_declare_shader_args(&args, - gs_copy_shader ? MESA_SHADER_VERTEX - : shaders[shader_count - 1]->info.stage, - shader_count >= 2, - shader_count >= 2 ? 
shaders[shader_count - 2]->info.stage - : MESA_SHADER_VERTEX); - - if (!use_aco || options->dump_shader || options->record_ir) - ac_init_llvm_once(); - - if (use_aco) { - aco_compile_shader(shader_count, shaders, &binary, &args); - binary->info = *info; - } else { - enum ac_target_machine_options tm_options = 0; - struct ac_llvm_compiler ac_llvm; - bool thread_compiler; + if (options->supports_spill) tm_options |= AC_TM_SUPPORTS_SPILL; - if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) - tm_options |= AC_TM_SISCHED; - if (options->check_ir) - tm_options |= AC_TM_CHECK_IR; - if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) - tm_options |= AC_TM_NO_LOAD_STORE_OPT; - - thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); - radv_init_llvm_compiler(&ac_llvm, - thread_compiler, - chip_family, tm_options, - info->wave_size); - - if (gs_copy_shader) { - assert(shader_count == 1); - radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary, - &args); - } else { - radv_compile_nir_shader(&ac_llvm, &binary, &args, - shaders, shader_count); - } - - binary->info = *info; - radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); + if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) + tm_options |= AC_TM_SISCHED; + tm = ac_create_target_machine(chip_family, tm_options); + + if (gs_copy_shader) { + assert(shader_count == 1); + ac_create_gs_copy_shader(tm, *shaders, &binary, &variant->config, + &variant->info, options, dump_shaders); + } else { + ac_compile_nir_shader(tm, &binary, &variant->config, + &variant->info, shaders, shader_count, options, + dump_shaders); } - struct radv_shader_variant *variant = radv_shader_variant_create(device, binary, - keep_shader_info); - if (!variant) { - free(binary); - return NULL; - } - variant->aco_used = use_aco; + LLVMDisposeTargetMachine(tm); - if (options->dump_shader) { - fprintf(stderr, "disasm:\n%s\n", variant->disasm_string); - } + radv_fill_shader_variant(device, variant, &binary, stage); + if (code_out) { + *code_out = binary.code; + *code_size_out = binary.code_size; + } else + free(binary.code); + free(binary.config); + free(binary.rodata); + free(binary.global_symbol_offsets); + free(binary.relocs); + variant->ref_count = 1; - if (keep_shader_info) { - variant->nir_string = radv_dump_nir_shaders(shaders, shader_count); + if (device->trace_bo) { + variant->disasm_string = binary.disasm_string; if (!gs_copy_shader && !module->nir) { - variant->spirv = malloc(module->size); - if (!variant->spirv) { - free(variant); - free(binary); - return NULL; - } - - memcpy(variant->spirv, module->data, module->size); + variant->nir = *shaders; + variant->spirv = (uint32_t *)module->data; variant->spirv_size = module->size; } + } else { + free(binary.disasm_string); } - if (binary_out) - *binary_out = binary; - else - free(binary); - return variant; } struct radv_shader_variant * -radv_shader_variant_compile(struct radv_device *device, +radv_shader_variant_create(struct radv_device *device, struct radv_shader_module *module, struct nir_shader *const *shaders, int shader_count, struct radv_pipeline_layout *layout, - const struct radv_shader_variant_key *key, - struct radv_shader_info *info, - bool keep_shader_info, - bool use_aco, - struct radv_shader_binary **binary_out) + const struct ac_shader_variant_key *key, + void **code_out, + unsigned *code_size_out) { - struct radv_nir_compiler_options options = {0}; + struct ac_nir_compiler_options options = {0}; options.layout = layout; if (key) options.key = *key; - 
options.explicit_scratch_args = use_aco; - options.robust_buffer_access = device->robust_buffer_access; + options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH); + options.supports_spill = device->llvm_supports_spill; - return shader_variant_compile(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, info, - &options, false, keep_shader_info, use_aco, binary_out); + return shader_variant_create(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, + &options, false, code_out, code_size_out); } struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader, - struct radv_shader_info *info, - struct radv_shader_binary **binary_out, - bool keep_shader_info, - bool multiview, bool use_aco) + void **code_out, + unsigned *code_size_out, + bool multiview) { - struct radv_nir_compiler_options options = {0}; + struct ac_nir_compiler_options options = {0}; - options.explicit_scratch_args = use_aco; options.key.has_multiview_view_index = multiview; - return shader_variant_compile(device, NULL, &shader, 1, MESA_SHADER_VERTEX, - info, &options, true, keep_shader_info, use_aco, binary_out); + return shader_variant_create(device, NULL, &shader, 1, MESA_SHADER_VERTEX, + &options, true, code_out, code_size_out); } void @@ -1254,106 +540,104 @@ radv_shader_variant_destroy(struct radv_device *device, list_del(&variant->slab_list); mtx_unlock(&device->shader_slab_mutex); - free(variant->spirv); - free(variant->nir_string); + ralloc_free(variant->nir); free(variant->disasm_string); - free(variant->ir_string); free(variant); } -const char * -radv_get_shader_name(struct radv_shader_info *info, - gl_shader_stage stage) +uint32_t +radv_shader_stage_to_user_data_0(gl_shader_stage stage, enum chip_class chip_class, + bool has_gs, bool has_tess) { switch (stage) { + case MESA_SHADER_FRAGMENT: + return R_00B030_SPI_SHADER_USER_DATA_PS_0; case MESA_SHADER_VERTEX: - if (info->vs.as_ls) - return "Vertex Shader as LS"; - else if (info->vs.as_es) - return "Vertex Shader as ES"; - else if (info->is_ngg) - return "Vertex Shader as ESGS"; + if (chip_class >= GFX9) { + return has_tess ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : + has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : + R_00B130_SPI_SHADER_USER_DATA_VS_0; + } + if (has_tess) + return R_00B530_SPI_SHADER_USER_DATA_LS_0; else - return "Vertex Shader as VS"; + return has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B130_SPI_SHADER_USER_DATA_VS_0; + case MESA_SHADER_GEOMETRY: + return chip_class >= GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : + R_00B230_SPI_SHADER_USER_DATA_GS_0; + case MESA_SHADER_COMPUTE: + return R_00B900_COMPUTE_USER_DATA_0; case MESA_SHADER_TESS_CTRL: - return "Tessellation Control Shader"; + return chip_class >= GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : + R_00B430_SPI_SHADER_USER_DATA_HS_0; case MESA_SHADER_TESS_EVAL: - if (info->tes.as_es) - return "Tessellation Evaluation Shader as ES"; - else if (info->is_ngg) - return "Tessellation Evaluation Shader as ESGS"; + if (chip_class >= GFX9) { + return has_gs ? 
R_00B330_SPI_SHADER_USER_DATA_ES_0 : + R_00B130_SPI_SHADER_USER_DATA_VS_0; + } + if (has_gs) + return R_00B330_SPI_SHADER_USER_DATA_ES_0; else - return "Tessellation Evaluation Shader as VS"; - case MESA_SHADER_GEOMETRY: - return "Geometry Shader"; - case MESA_SHADER_FRAGMENT: - return "Pixel Shader"; - case MESA_SHADER_COMPUTE: - return "Compute Shader"; + return R_00B130_SPI_SHADER_USER_DATA_VS_0; default: - return "Unknown shader"; - }; + unreachable("unknown shader"); + } } -unsigned -radv_get_max_workgroup_size(enum chip_class chip_class, - gl_shader_stage stage, - const unsigned *sizes) +const char * +radv_get_shader_name(struct radv_shader_variant *var, gl_shader_stage stage) { switch (stage) { - case MESA_SHADER_TESS_CTRL: - return chip_class >= GFX7 ? 128 : 64; - case MESA_SHADER_GEOMETRY: - return chip_class >= GFX9 ? 128 : 64; - case MESA_SHADER_COMPUTE: - break; + case MESA_SHADER_VERTEX: return var->info.vs.as_ls ? "Vertex Shader as LS" : var->info.vs.as_es ? "Vertex Shader as ES" : "Vertex Shader as VS"; + case MESA_SHADER_GEOMETRY: return "Geometry Shader"; + case MESA_SHADER_FRAGMENT: return "Pixel Shader"; + case MESA_SHADER_COMPUTE: return "Compute Shader"; + case MESA_SHADER_TESS_CTRL: return "Tessellation Control Shader"; + case MESA_SHADER_TESS_EVAL: return var->info.tes.as_es ? "Tessellation Evaluation Shader as ES" : "Tessellation Evaluation Shader as VS"; default: - return 0; - } - - unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2]; - return max_workgroup_size; + return "Unknown shader"; + }; } -unsigned -radv_get_max_waves(struct radv_device *device, - struct radv_shader_variant *variant, - gl_shader_stage stage) +void +radv_shader_dump_stats(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage, + FILE *file) { - enum chip_class chip_class = device->physical_device->rad_info.chip_class; - unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; - uint8_t wave_size = variant->info.wave_size; - struct ac_shader_config *conf = &variant->config; + unsigned lds_increment = device->physical_device->rad_info.chip_class >= CIK ? 512 : 256; + struct ac_shader_config *conf; unsigned max_simd_waves; unsigned lds_per_wave = 0; - max_simd_waves = device->physical_device->rad_info.max_wave64_per_simd; + switch (device->physical_device->rad_info.family) { + /* These always have 8 waves: */ + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + max_simd_waves = 8; + break; + default: + max_simd_waves = 10; + } + + conf = &variant->config; if (stage == MESA_SHADER_FRAGMENT) { lds_per_wave = conf->lds_size * lds_increment + - align(variant->info.ps.num_interp * 48, + align(variant->info.fs.num_interp * 48, lds_increment); - } else if (stage == MESA_SHADER_COMPUTE) { - unsigned max_workgroup_size = - radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size); - lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, wave_size); } if (conf->num_sgprs) { - unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8); - max_simd_waves = - MIN2(max_simd_waves, - device->physical_device->rad_info.num_physical_sgprs_per_simd / - sgprs); + if (device->physical_device->rad_info.chip_class >= VI) + max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); + else + max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); } - if (conf->num_vgprs) { - unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 
8 : 4); - max_simd_waves = - MIN2(max_simd_waves, - RADV_NUM_PHYSICAL_VGPRS / vgprs); - } + if (conf->num_vgprs) + max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD * that PS can use. @@ -1361,153 +645,27 @@ radv_get_max_waves(struct radv_device *device, if (lds_per_wave) max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); - return max_simd_waves; -} - -static void -generate_shader_stats(struct radv_device *device, - struct radv_shader_variant *variant, - gl_shader_stage stage, - struct _mesa_string_buffer *buf) -{ - struct ac_shader_config *conf = &variant->config; - unsigned max_simd_waves = radv_get_max_waves(device, variant, stage); + fprintf(file, "\n%s:\n", radv_get_shader_name(variant, stage)); if (stage == MESA_SHADER_FRAGMENT) { - _mesa_string_buffer_printf(buf, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); - } - - _mesa_string_buffer_printf(buf, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "PrivMem VGPRS: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - "********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, - variant->info.private_mem_vgprs, variant->exec_size, - conf->lds_size, conf->scratch_bytes_per_wave, - max_simd_waves); -} - -void -radv_shader_dump_stats(struct radv_device *device, - struct radv_shader_variant *variant, - gl_shader_stage stage, - FILE *file) -{ - struct _mesa_string_buffer *buf = _mesa_string_buffer_create(NULL, 256); - - generate_shader_stats(device, variant, stage, buf); - - fprintf(file, "\n%s:\n", radv_get_shader_name(&variant->info, stage)); - fprintf(file, "%s", buf->buf); - - _mesa_string_buffer_destroy(buf); -} - -VkResult -radv_GetShaderInfoAMD(VkDevice _device, - VkPipeline _pipeline, - VkShaderStageFlagBits shaderStage, - VkShaderInfoTypeAMD infoType, - size_t* pInfoSize, - void* pInfo) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); - gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage); - struct radv_shader_variant *variant = pipeline->shaders[stage]; - struct _mesa_string_buffer *buf; - VkResult result = VK_SUCCESS; - - /* Spec doesn't indicate what to do if the stage is invalid, so just - * return no info for this. */ - if (!variant) - return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT); - - switch (infoType) { - case VK_SHADER_INFO_TYPE_STATISTICS_AMD: - if (!pInfo) { - *pInfoSize = sizeof(VkShaderStatisticsInfoAMD); - } else { - unsigned lds_multiplier = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; - struct ac_shader_config *conf = &variant->config; - - VkShaderStatisticsInfoAMD statistics = {}; - statistics.shaderStageMask = shaderStage; - statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS; - statistics.numPhysicalSgprs = device->physical_device->rad_info.num_physical_sgprs_per_simd; - statistics.numAvailableSgprs = statistics.numPhysicalSgprs; - - if (stage == MESA_SHADER_COMPUTE) { - unsigned *local_size = variant->info.cs.block_size; - unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2]; - - statistics.numAvailableVgprs = statistics.numPhysicalVgprs / - ceil((double)workgroup_size / statistics.numPhysicalVgprs); - - statistics.computeWorkGroupSize[0] = local_size[0]; - statistics.computeWorkGroupSize[1] = local_size[1]; - statistics.computeWorkGroupSize[2] = local_size[2]; - } else { - statistics.numAvailableVgprs = statistics.numPhysicalVgprs; - } - - statistics.resourceUsage.numUsedVgprs = conf->num_vgprs; - statistics.resourceUsage.numUsedSgprs = conf->num_sgprs; - statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768; - statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier; - statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave; - - size_t size = *pInfoSize; - *pInfoSize = sizeof(statistics); - - memcpy(pInfo, &statistics, MIN2(size, *pInfoSize)); - - if (size < *pInfoSize) - result = VK_INCOMPLETE; - } - - break; - case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: - buf = _mesa_string_buffer_create(NULL, 1024); - - _mesa_string_buffer_printf(buf, "%s:\n", radv_get_shader_name(&variant->info, stage)); - _mesa_string_buffer_printf(buf, "%s\n\n", variant->ir_string); - _mesa_string_buffer_printf(buf, "%s\n\n", variant->disasm_string); - generate_shader_stats(device, variant, stage, buf); - - /* Need to include the null terminator. */ - size_t length = buf->length + 1; - - if (!pInfo) { - *pInfoSize = length; - } else { - size_t size = *pInfoSize; - *pInfoSize = length; - - memcpy(pInfo, buf->buf, MIN2(size, length)); - - if (size < length) - result = VK_INCOMPLETE; - } - - _mesa_string_buffer_destroy(buf); - break; - default: - /* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */ - result = VK_ERROR_FEATURE_NOT_PRESENT; - break; + fprintf(file, "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); } - return result; + fprintf(file, "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, + conf->spilled_sgprs, conf->spilled_vgprs, variant->code_size, + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); } |
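
Note on the radv_lower_indirect_derefs() hunk this import adds: flattened as above, its per-stage mask logic is hard to scan, so here it is restated as a standalone sketch under the same assumptions the hunk's comments give (no working VGPR indexing in LLVM past VI, GS input indirects unimplemented, TCS/TES inputs read from LDS or off-chip memory). The enum names are simplified stand-ins for the NIR/Mesa types, not the driver's.

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for gl_shader_stage and nir_variable_mode. */
enum stage { VERTEX, TESS_CTRL, TESS_EVAL, GEOMETRY, FRAGMENT, COMPUTE };
enum { VAR_SHADER_IN = 1 << 0, VAR_SHADER_OUT = 1 << 1, VAR_LOCAL = 1 << 2 };

/* The 17.3.9 policy: LLVM only has working VGPR indexing up to VI, GS
 * input indirects are unimplemented, and TCS/TES read inputs from LDS
 * or off-chip memory so indirect indexing there is trivial. Locals are
 * always lowered to dodge the LLVM spilling hang referenced by the
 * mesa-dev link in the hunk. */
static unsigned
indirect_lowering_mask(enum stage stage, bool chip_is_vi_or_older)
{
	bool vgpr_indexing_works = chip_is_vi_or_older;
	unsigned mask = VAR_LOCAL;

	if (stage == GEOMETRY ||
	    (stage != TESS_CTRL && stage != TESS_EVAL && !vgpr_indexing_works))
		mask |= VAR_SHADER_IN;
	if (!vgpr_indexing_works && stage != TESS_CTRL)
		mask |= VAR_SHADER_OUT;
	return mask;
}

int main(void)
{
	/* A geometry shader on GFX9 gets inputs, outputs and locals lowered. */
	printf("GS mask on GFX9: 0x%x\n", indirect_lowering_mask(GEOMETRY, false));
	return 0;
}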
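Note on the specialization-constant hunk in radv_shader_compile_to_nir(): the removed lines read each constant at the size its own VkSpecializationMapEntry declares, while the imported 17.3.9 lines key 64- vs 32-bit off spec_info->dataSize as a whole. A sketch of the per-entry reading, with the Vulkan types stubbed out as plain structs:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for one slot of the spec_entries array built for spirv_to_nir(). */
struct spec_entry {
	uint32_t id;
	uint64_t data64;
	uint32_t data32;
};

/* Per-entry sizing as in the removed lines: each constant is read at the
 * width its own map entry declares, instead of inferring 64- vs 32-bit
 * from the size of the whole pData blob. */
static void
read_spec_constant(struct spec_entry *e, const void *data, size_t size)
{
	switch (size) {
	case 8: e->data64 = *(const uint64_t *)data; break;
	case 4: e->data32 = *(const uint32_t *)data; break;
	case 2: e->data32 = *(const uint16_t *)data; break;
	case 1: e->data32 = *(const uint8_t *)data; break;
	default: assert(!"Invalid spec constant size");
	}
}

int main(void)
{
	struct spec_entry e = {0};
	uint16_t v = 42;
	read_spec_constant(&e, &v, sizeof(v));
	printf("data32 = %u\n", e.data32);
	return 0;
}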
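Note on the radv_shader_dump_stats() hunk at the end of the diff: the 17.3.9 code estimates occupancy by clamping a hardware maximum against register and LDS pressure. A minimal sketch of that calculation, assuming only the limits visible in the hunk (10 waves/SIMD, 8 on Polaris; 512 SGPRs/SIMD pre-VI, 800 from VI; 256 VGPRs/SIMD; 16KB of LDS per SIMD usable by pixel shaders); struct and function names are stand-ins, not radv's.

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical stand-in for the ac_shader_config fields the dump uses. */
struct shader_config {
	unsigned num_sgprs;
	unsigned num_vgprs;
};

/* Waves-per-SIMD estimate as in 17.3.9 radv_shader_dump_stats(): start
 * at the hardware maximum, then clamp by SGPR, VGPR and (for fragment
 * shaders) LDS usage. */
static unsigned
max_simd_waves(const struct shader_config *conf, int is_polaris,
	       int is_vi_or_later, unsigned lds_per_wave)
{
	unsigned waves = is_polaris ? 8 : 10;

	if (conf->num_sgprs)
		waves = MIN2(waves, (is_vi_or_later ? 800 : 512) / conf->num_sgprs);
	if (conf->num_vgprs)
		waves = MIN2(waves, 256 / conf->num_vgprs);
	if (lds_per_wave)
		waves = MIN2(waves, 16384 / lds_per_wave);
	return waves;
}

int main(void)
{
	struct shader_config conf = { .num_sgprs = 40, .num_vgprs = 24 };
	printf("max waves/SIMD: %u\n", max_simd_waves(&conf, 0, 1, 0));
	return 0;
}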