author     Jonathan Gray <jsg@cvs.openbsd.org>    2021-03-11 05:14:49 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2021-03-11 05:14:49 +0000
commit     b87befe14c653861830ec0bfa9b8a27e503a2294 (patch)
tree       e8fa56db169d60b95a62ecef48e4bc53b6493588 /lib/mesa/src
parent     845976941bf8cd7bfdd6672d614ac73cd02286c3 (diff)
ac: unify denorm setting enforcement
From Marek Olsak
56cc10bd27b24d513de88bf7fa94a6c8f43e348f in mainline Mesa
Diffstat (limited to 'lib/mesa/src')

 -rw-r--r--  lib/mesa/src/amd/common/ac_binary.c                     |   13
 -rw-r--r--  lib/mesa/src/amd/vulkan/radv_shader.c                   | 1386
 -rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c  | 1232

 3 files changed, 1762 insertions, 869 deletions
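Note: the substance of this change is the float_mode fixup that ac_parse_shader_binary_config() now applies after parsing the shader config registers (see the ac_binary.c hunk below), so the denorm policy lives in the common config parser. A minimal standalone sketch of that policy follows; the V_00B028_* values are written out here as assumptions for illustration rather than taken from Mesa's generated register headers.

/* Sketch of the denorm policy applied to the parsed FLOAT_MODE field.
 * The field values are assumed for illustration; the real definitions
 * live in Mesa's generated sid/register headers. */
#include <stdint.h>

#define V_00B028_FP_ALL_DENORMS 0xF0u /* assumed: 16-, 32- and 64-bit denorm bits */
#define V_00B028_FP_64_DENORMS  0xC0u /* assumed: 16-bit and 64-bit denorm bits */

static uint32_t
apply_denorm_policy(uint32_t float_mode)
{
	/* Enable 64-bit and 16-bit denormals (no performance cost), but keep
	 * 32-bit denormals disabled: they disable output modifiers, break
	 * v_mad_f32, and would be very slow on GFX6/GFX7. */
	float_mode &= ~V_00B028_FP_ALL_DENORMS;
	float_mode |= V_00B028_FP_64_DENORMS;
	return float_mode;
}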
diff --git a/lib/mesa/src/amd/common/ac_binary.c b/lib/mesa/src/amd/common/ac_binary.c index 5f92a57d7..8761422bd 100644 --- a/lib/mesa/src/amd/common/ac_binary.c +++ b/lib/mesa/src/amd/common/ac_binary.c @@ -58,11 +58,13 @@ void ac_parse_shader_binary_config(const char *data, size_t nbytes, conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */ conf->float_mode = G_00B028_FLOAT_MODE(value); conf->rsrc1 = value; break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */ conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value); conf->rsrc2 = value; break; @@ -124,4 +126,15 @@ void ac_parse_shader_binary_config(const char *data, size_t nbytes, /* sgprs spills aren't spilling */ conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4; } + + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * Don't enable denormals for 32-bit floats, because: + * - denormals disable output modifiers + * - denormals break v_mad_f32 + * - GFX6 & GFX7 would be very slow + */ + conf->float_mode &= ~V_00B028_FP_ALL_DENORMS; + conf->float_mode |= V_00B028_FP_64_DENORMS; } diff --git a/lib/mesa/src/amd/vulkan/radv_shader.c b/lib/mesa/src/amd/vulkan/radv_shader.c index 83e2e675e..fb725284c 100644 --- a/lib/mesa/src/amd/vulkan/radv_shader.c +++ b/lib/mesa/src/amd/vulkan/radv_shader.c @@ -30,28 +30,41 @@ #include "radv_debug.h" #include "radv_private.h" #include "radv_shader.h" +#include "radv_shader_helper.h" +#include "radv_shader_args.h" #include "nir/nir.h" #include "nir/nir_builder.h" #include "spirv/nir_spirv.h" #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> +#include <llvm-c/Support.h> #include "sid.h" -#include "gfx9d.h" #include "ac_binary.h" #include "ac_llvm_util.h" #include "ac_nir_to_llvm.h" +#include "ac_rtld.h" #include "vk_format.h" #include "util/debug.h" #include "ac_exp_param.h" -static const struct nir_shader_compiler_options nir_options = { +#include "aco_interface.h" + +#include "util/string_buffer.h" + +static const struct nir_shader_compiler_options nir_options_llvm = { .vertex_id_zero_based = true, .lower_scmp = true, + .lower_flrp16 = true, .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, .lower_fsat = true, .lower_fdiv = true, + .lower_fmod = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, .lower_sub = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, @@ -64,9 +77,81 @@ static const struct nir_shader_compiler_options nir_options = { .lower_extract_byte = true, .lower_extract_word = true, .lower_ffma = true, - .max_unroll_iterations = 32 + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + /* nir_lower_int64() isn't actually called for the LLVM backend, but + * this helps the loop unrolling heuristics. 
*/ + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_minmax64 | + nir_lower_iabs64, }; +static const struct nir_shader_compiler_options nir_options_aco = { + .vertex_id_zero_based = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, + .lower_fdiv = true, + .lower_fmod = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_ffma = true, + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_logic64 | + nir_lower_minmax64 | + nir_lower_iabs64, +}; + +bool +radv_can_dump_shader(struct radv_device *device, + struct radv_shader_module *module, + bool is_gs_copy_shader) +{ + if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) + return false; + if (module) + return !module->nir || + (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS); + + return is_gs_copy_shader; +} + +bool +radv_can_dump_shader_stats(struct radv_device *device, + struct radv_shader_module *module) +{ + /* Only dump non-meta shader stats. */ + return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && + module && !module->nir; +} + VkResult radv_CreateShaderModule( VkDevice _device, const VkShaderModuleCreateInfo* pCreateInfo, @@ -83,7 +168,7 @@ VkResult radv_CreateShaderModule( sizeof(*module) + pCreateInfo->codeSize, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (module == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); module->nir = NULL; module->size = pCreateInfo->codeSize; @@ -110,56 +195,40 @@ void radv_DestroyShaderModule( vk_free2(&device->alloc, pAllocator, module); } -bool -radv_lower_indirect_derefs(struct nir_shader *nir, - struct radv_physical_device *device) -{ - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 5.0 doesn't have working VGPR indexing - * on GFX9. - */ - bool llvm_has_working_vgpr_indexing = - device->rad_info.chip_class <= VI; - - /* TODO: Indirect indexing of GS inputs is unimplemented. - * - * TCS and TES load inputs directly from LDS or offchip memory, so - * indirect indexing is trivial. - */ - nir_variable_mode indirect_mask = 0; - if (nir->info.stage == MESA_SHADER_GEOMETRY || - (nir->info.stage != MESA_SHADER_TESS_CTRL && - nir->info.stage != MESA_SHADER_TESS_EVAL && - !llvm_has_working_vgpr_indexing)) { - indirect_mask |= nir_var_shader_in; - } - if (!llvm_has_working_vgpr_indexing && - nir->info.stage != MESA_SHADER_TESS_CTRL) - indirect_mask |= nir_var_shader_out; - - /* TODO: We shouldn't need to do this, however LLVM isn't currently - * smart enough to handle indirects without causing excess spilling - * causing the gpu to hang. 
- * - * See the following thread for more details of the problem: - * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html - */ - indirect_mask |= nir_var_local; - - return nir_lower_indirect_derefs(nir, indirect_mask); -} - void -radv_optimize_nir(struct nir_shader *shader) +radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, + bool allow_copies) { bool progress; + unsigned lower_flrp = + (shader->options->lower_flrp16 ? 16 : 0) | + (shader->options->lower_flrp32 ? 32 : 0) | + (shader->options->lower_flrp64 ? 64 : 0); do { progress = false; + NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS_V(shader, nir_lower_vars_to_ssa); - NIR_PASS_V(shader, nir_lower_64bit_pack); - NIR_PASS_V(shader, nir_lower_alu_to_scalar); + NIR_PASS_V(shader, nir_lower_pack); + + if (allow_copies) { + /* Only run this pass in the first call to + * radv_optimize_nir. Later calls assume that we've + * lowered away any copy_deref instructions and we + * don't want to introduce any more. + */ + NIR_PASS(progress, shader, nir_opt_find_array_copies); + } + + NIR_PASS(progress, shader, nir_opt_copy_prop_vars); + NIR_PASS(progress, shader, nir_opt_dead_write_vars); + NIR_PASS(progress, shader, nir_remove_dead_variables, + nir_var_function_temp | nir_var_shader_in | nir_var_shader_out); + + NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(shader, nir_lower_phis_to_scalar); NIR_PASS(progress, shader, nir_copy_prop); @@ -171,18 +240,53 @@ radv_optimize_nir(struct nir_shader *shader) NIR_PASS(progress, shader, nir_opt_remove_phis); NIR_PASS(progress, shader, nir_opt_dce); } - NIR_PASS(progress, shader, nir_opt_if); + NIR_PASS(progress, shader, nir_opt_if, true); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8); - NIR_PASS(progress, shader, nir_opt_algebraic); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); NIR_PASS(progress, shader, nir_opt_constant_folding); + NIR_PASS(progress, shader, nir_opt_algebraic); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, + shader, + nir_lower_flrp, + lower_flrp, + false /* always_precise */, + shader->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, shader, + nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + lower_flrp = 0; + } + NIR_PASS(progress, shader, nir_opt_undef); - NIR_PASS(progress, shader, nir_opt_conditional_discard); if (shader->options->max_unroll_iterations) { NIR_PASS(progress, shader, nir_opt_loop_unroll, 0); } - } while (progress); + } while (progress && !optimize_conservatively); + + NIR_PASS(progress, shader, nir_opt_conditional_discard); + NIR_PASS(progress, shader, nir_opt_shrink_load); + NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo); +} + +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) ? 
4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; } nir_shader * @@ -190,31 +294,30 @@ radv_shader_compile_to_nir(struct radv_device *device, struct radv_shader_module *module, const char *entrypoint_name, gl_shader_stage stage, - const VkSpecializationInfo *spec_info) + const VkSpecializationInfo *spec_info, + const VkPipelineCreateFlags flags, + const struct radv_pipeline_layout *layout, + bool use_aco, + unsigned subgroup_size, unsigned ballot_bit_size) { - if (strcmp(entrypoint_name, "main") != 0) { - radv_finishme("Multiple shaders per module not really supported"); - } - nir_shader *nir; - nir_function *entry_point; + const nir_shader_compiler_options *nir_options = use_aco ? &nir_options_aco : + &nir_options_llvm; if (module->nir) { /* Some things such as our meta clear/blit code will give us a NIR * shader directly. In that case, we just ignore the SPIR-V entirely * and just use the NIR shader */ nir = module->nir; - nir->options = &nir_options; - nir_validate_shader(nir); + nir->options = nir_options; + nir_validate_shader(nir, "in internal shader"); assert(exec_list_length(&nir->functions) == 1); - struct exec_node *node = exec_list_get_head(&nir->functions); - entry_point = exec_node_data(nir_function, node, node); } else { uint32_t *spirv = (uint32_t *) module->data; assert(module->size % 4 == 0); if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV) - radv_print_spirv(spirv, module->size, stderr); + radv_print_spirv(module->data, module->size, stderr); uint32_t num_spec_entries = 0; struct nir_spirv_specialization *spec_entries = NULL; @@ -227,28 +330,85 @@ radv_shader_compile_to_nir(struct radv_device *device, assert(data + entry.size <= spec_info->pData + spec_info->dataSize); spec_entries[i].id = spec_info->pMapEntries[i].constantID; - if (spec_info->dataSize == 8) + switch (entry.size) { + case 8: spec_entries[i].data64 = *(const uint64_t *)data; - else + break; + case 4: spec_entries[i].data32 = *(const uint32_t *)data; + break; + case 2: + spec_entries[i].data32 = *(const uint16_t *)data; + break; + case 1: + spec_entries[i].data32 = *(const uint8_t *)data; + break; + default: + assert(!"Invalid spec constant size"); + break; + } } } - const struct nir_spirv_supported_extensions supported_ext = { - .draw_parameters = true, - .float64 = true, - .image_read_without_format = true, - .image_write_without_format = true, - .tessellation = true, - .int64 = true, - .multiview = true, - .variable_pointers = true, + const struct spirv_to_nir_options spirv_options = { + .lower_ubo_ssbo_access_to_offsets = true, + .caps = { + .amd_fragment_mask = true, + .amd_gcn_shader = true, + .amd_image_read_write_lod = true, + .amd_shader_ballot = device->physical_device->use_shader_ballot, + .amd_shader_explicit_vertex_parameter = true, + .amd_trinary_minmax = true, + .demote_to_helper_invocation = device->physical_device->use_aco, + .derivative_group = true, + .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, + .device_group = true, + .draw_parameters = true, + .float_controls = true, + .float16 = !device->physical_device->use_aco, + .float64 = true, + .geometry_streams = true, + .image_ms_array = true, + .image_read_without_format = true, + .image_write_without_format = true, + .int8 = !device->physical_device->use_aco, + .int16 = !device->physical_device->use_aco, + .int64 = true, + .int64_atomics = true, + .multiview = true, + 
.physical_storage_buffer_address = true, + .post_depth_coverage = true, + .runtime_descriptor_array = true, + .shader_clock = true, + .shader_viewport_index_layer = true, + .stencil_export = true, + .storage_8bit = !device->physical_device->use_aco, + .storage_16bit = !device->physical_device->use_aco, + .storage_image_ms = true, + .subgroup_arithmetic = true, + .subgroup_ballot = true, + .subgroup_basic = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + .tessellation = true, + .transform_feedback = true, + .variable_pointers = true, + }, + .ubo_addr_format = nir_address_format_32bit_index_offset, + .ssbo_addr_format = nir_address_format_32bit_index_offset, + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .push_const_addr_format = nir_address_format_logical, + .shared_addr_format = nir_address_format_32bit_offset, + .frag_coord_is_sysval = true, }; - entry_point = spirv_to_nir(spirv, module->size / 4, - spec_entries, num_spec_entries, - stage, entrypoint_name, &supported_ext, &nir_options); - nir = entry_point->shader; + nir = spirv_to_nir(spirv, module->size / 4, + spec_entries, num_spec_entries, + stage, entrypoint_name, + &spirv_options, nir_options); assert(nir->info.stage == stage); - nir_validate_shader(nir); + nir_validate_shader(nir, "after spirv_to_nir"); free(spec_entries); @@ -256,50 +416,214 @@ radv_shader_compile_to_nir(struct radv_device *device, * inline functions. That way they get properly initialized at the top * of the function and not at the top of its caller. */ - NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local); + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_function_temp); NIR_PASS_V(nir, nir_lower_returns); NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); /* Pick off the single entrypoint that we want */ foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func != entry_point) + if (func->is_entrypoint) + func->name = ralloc_strdup(func, "main"); + else exec_node_remove(&func->node); } assert(exec_list_length(&nir->functions) == 1); - entry_point->name = ralloc_strdup(entry_point, "main"); - NIR_PASS_V(nir, nir_remove_dead_variables, - nir_var_shader_in | nir_var_shader_out | nir_var_system_value); + /* Make sure we lower constant initializers on output variables so that + * nir_remove_dead_variables below sees the corresponding stores + */ + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_shader_out); /* Now that we've deleted all but the main function, we can go ahead and * lower the rest of the constant initializers. */ NIR_PASS_V(nir, nir_lower_constant_initializers, ~0); + + /* Split member structs. We do this before lower_io_to_temporaries so that + * it doesn't lower system values to temporaries by accident. 
+ */ + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + if (nir->info.stage == MESA_SHADER_FRAGMENT && use_aco) + NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); + if (nir->info.stage == MESA_SHADER_FRAGMENT) + NIR_PASS_V(nir, nir_lower_input_attachments, true); + + NIR_PASS_V(nir, nir_remove_dead_variables, + nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared); + + NIR_PASS_V(nir, nir_propagate_invariant); + NIR_PASS_V(nir, nir_lower_system_values); NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); + NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout); } /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; - nir_shader_gather_info(nir, entry_point->impl); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + if (nir->info.stage == MESA_SHADER_GEOMETRY && use_aco) + nir_lower_gs_intrinsics(nir, true); static const nir_lower_tex_options tex_options = { .lower_txp = ~0, + .lower_tg4_offsets = true, }; nir_lower_tex(nir, &tex_options); nir_lower_vars_to_ssa(nir); - nir_lower_var_copies(nir); + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_GEOMETRY || + nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, true); + } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + } + + nir_split_var_copies(nir); + nir_lower_global_vars_to_local(nir); - nir_remove_dead_variables(nir, nir_var_local); - radv_lower_indirect_derefs(nir, device->physical_device); - radv_optimize_nir(nir); + nir_remove_dead_variables(nir, nir_var_function_temp); + bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7; + nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) { + .subgroup_size = subgroup_size, + .ballot_bit_size = ballot_bit_size, + .lower_to_scalar = 1, + .lower_subgroup_masks = 1, + .lower_shuffle = 1, + .lower_shuffle_to_32bit = 1, + .lower_vote_eq_to_ballot = 1, + .lower_quad_broadcast_dynamic = 1, + .lower_quad_broadcast_dynamic_to_const = gfx7minus, + }); + + nir_lower_load_const_to_scalar(nir); + + if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) + radv_optimize_nir(nir, false, true); + + /* We call nir_lower_var_copies() after the first radv_optimize_nir() + * to remove any copies introduced by nir_opt_find_array_copies(). + */ + nir_lower_var_copies(nir); + + /* Lower deref operations for compute shared memory. */ + if (nir->info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_var_info); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + } + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + */ + NIR_PASS_V(nir, nir_opt_large_constants, + glsl_get_natural_size_align_bytes, 16); + + /* Indirect lowering must be called after the radv_optimize_nir() loop + * has been called at least once. Otherwise indirect lowering can + * bloat the instruction count of the loop and cause it to be + * considered too large for unrolling. 
+ */ + ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class); + radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT, false); return nir; } +static int +type_size_vec4(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +static nir_variable * +find_layer_in_var(nir_shader *nir) +{ + nir_foreach_variable(var, &nir->inputs) { + if (var->data.location == VARYING_SLOT_LAYER) { + return var; + } + } + + nir_variable *var = + nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id"); + var->data.location = VARYING_SLOT_LAYER; + var->data.interpolation = INTERP_MODE_FLAT; + return var; +} + +/* We use layered rendering to implement multiview, which means we need to map + * view_index to gl_Layer. The attachment lowering also uses needs to know the + * layer so that it can sample from the correct layer. The code generates a + * load from the layer_id sysval, but since we don't have a way to get at this + * information from the fragment shader, we also need to lower this to the + * gl_Layer varying. This pass lowers both to a varying load from the LAYER + * slot, before lowering io, so that nir_assign_var_locations() will give the + * LAYER varying the correct driver_location. + */ + +static bool +lower_view_index(nir_shader *nir) +{ + bool progress = false; + nir_function_impl *entry = nir_shader_get_entrypoint(nir); + nir_builder b; + nir_builder_init(&b, entry); + + nir_variable *layer = NULL; + nir_foreach_block(block, entry) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + if (load->intrinsic != nir_intrinsic_load_view_index && + load->intrinsic != nir_intrinsic_load_layer_id) + continue; + + if (!layer) + layer = find_layer_in_var(nir); + + b.cursor = nir_before_instr(instr); + nir_ssa_def *def = nir_load_var(&b, layer); + nir_ssa_def_rewrite_uses(&load->dest.ssa, + nir_src_for_ssa(def)); + + nir_instr_remove(instr); + progress = true; + } + } + + return progress; +} + +void +radv_lower_fs_io(nir_shader *nir) +{ + NIR_PASS_V(nir, lower_view_index); + nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs, + MESA_SHADER_FRAGMENT); + + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in); +} + + void * radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader) @@ -331,7 +655,11 @@ radv_alloc_shader_memory(struct radv_device *device, slab->size = 256 * 1024; slab->bo = device->ws->buffer_create(device->ws, slab->size, 256, - RADEON_DOMAIN_VRAM, 0); + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | + (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? + 0 : RADEON_FLAG_READ_ONLY), + RADV_BO_PRIORITY_SHADER); slab->ptr = (char*)device->ws->buffer_map(slab->bo); list_inithead(&slab->shaders); @@ -355,178 +683,550 @@ radv_destroy_shader_slabs(struct radv_device *device) mtx_destroy(&device->shader_slab_mutex); } -static void -radv_fill_shader_variant(struct radv_device *device, - struct radv_shader_variant *variant, - struct ac_shader_binary *binary, - gl_shader_stage stage) +/* For the UMR disassembler. 
*/ +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 + +static unsigned +radv_get_shader_binary_size(size_t code_size) { - bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; + return code_size + DEBUGGER_NUM_MARKERS * 4; +} + +static void radv_postprocess_config(const struct radv_physical_device *pdevice, + const struct ac_shader_config *config_in, + const struct radv_shader_info *info, + gl_shader_stage stage, + struct ac_shader_config *config_out) +{ + bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; unsigned vgpr_comp_cnt = 0; + unsigned num_input_vgprs = info->num_input_vgprs; - if (scratch_enabled && !device->llvm_supports_spill) - radv_finishme("shader scratch support only available with LLVM 4.0"); + if (stage == MESA_SHADER_FRAGMENT) { + num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL); + } - variant->code_size = binary->code_size; - variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | - S_00B12C_SCRATCH_EN(scratch_enabled); + unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs); + /* +3 for scratch wave offset and VCC */ + unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3); + unsigned num_shared_vgprs = config_in->num_shared_vgprs; + /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */ + assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) + || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0)); + unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8; + + *config_out = *config_in; + config_out->num_vgprs = num_vgprs; + config_out->num_sgprs = num_sgprs; + config_out->num_shared_vgprs = num_shared_vgprs; + + config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) | + S_00B12C_SCRATCH_EN(scratch_enabled); + + if (!pdevice->use_ngg_streamout) { + config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | + S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | + S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | + S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) | + S_00B12C_SO_EN(!!info->so.num_outputs); + } - variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) | - S_00B848_DX10_CLAMP(1) | - S_00B848_FLOAT_MODE(variant->config.float_mode); + config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / + (info->wave_size == 32 ? 8 : 4)) | + S_00B848_DX10_CLAMP(1) | + S_00B848_FLOAT_MODE(config_out->float_mode); + + if (pdevice->rad_info.chip_class >= GFX10) { + config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5); + } else { + config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8); + config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5); + } switch (stage) { case MESA_SHADER_TESS_EVAL: - vgpr_comp_cnt = 3; - variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); + if (info->is_ngg) { + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1); + } else if (info->tes.as_es) { + assert(pdevice->rad_info.chip_class <= GFX8); + vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; + + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } else { + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + vgpr_comp_cnt = enable_prim_id ? 
3 : 2; + + config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_TESS_CTRL: - if (device->physical_device->rad_info.chip_class >= GFX9) - vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; - else - variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); + if (pdevice->rad_info.chip_class >= GFX9) { + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + if (pdevice->rad_info.chip_class >= GFX10) { + vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1; + } else { + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; + } + } else { + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } + config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_VERTEX: - case MESA_SHADER_GEOMETRY: - vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; + if (info->is_ngg) { + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + } else if (info->vs.as_ls) { + assert(pdevice->rad_info.chip_class <= GFX8); + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; + } else if (info->vs.as_es) { + assert(pdevice->rad_info.chip_class <= GFX8); + /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ + vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0; + } else { + /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID) + * If PrimID is disabled. InstanceID / StepRate1 is loaded instead. + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { + vgpr_comp_cnt = 3; + } else if (info->vs.export_prim_id) { + vgpr_comp_cnt = 2; + } else if (info->vs.needs_instance_id) { + vgpr_comp_cnt = 1; + } else { + vgpr_comp_cnt = 0; + } + + config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + } break; case MESA_SHADER_FRAGMENT: + config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + break; + case MESA_SHADER_GEOMETRY: + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_COMPUTE: - variant->rsrc2 |= - S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_LDS_SIZE(variant->config.lds_size); + config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= + S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | + S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) | + S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) | + S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 
2 : + info->cs.uses_thread_id[1] ? 1 : 0) | + S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) | + S_00B84C_LDS_SIZE(config_in->lds_size); + config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + break; default: unreachable("unsupported shader type"); break; } - if (device->physical_device->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_GEOMETRY) { - /* TODO: Figure out how many we actually need. */ - variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(3); - variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(3) | - S_00B22C_OC_LDS_EN(1); - } else if (device->physical_device->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_TESS_CTRL) - variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); - else - variant->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); + if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg && + (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_GEOMETRY)) { + unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; + gl_shader_stage es_stage = stage; + if (stage == MESA_SHADER_GEOMETRY) + es_stage = info->gs.es_type; + + /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ + if (es_stage == MESA_SHADER_VERTEX) { + es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0; + } else if (es_stage == MESA_SHADER_TESS_EVAL) { + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + es_vgpr_comp_cnt = enable_prim_id ? 3 : 2; + } else + unreachable("Unexpected ES shader stage"); + + bool tes_triangles = stage == MESA_SHADER_TESS_EVAL && + info->tes.primitive_mode >= 4; /* GL_TRIANGLES */ + if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) { + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + } else if (info->uses_prim_id) { + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + } else if (info->gs.vertices_in >= 3 || tes_triangles) { + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + } else { + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + } + + config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | + S_00B228_WGP_MODE(1); + config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_LDS_SIZE(config_in->lds_size) | + S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL); + } else if (pdevice->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_GEOMETRY) { + unsigned es_type = info->gs.es_type; + unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; + + if (es_type == MESA_SHADER_VERTEX) { + /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ + if (info->vs.needs_instance_id) { + es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1; + } else { + es_vgpr_comp_cnt = 0; + } + } else if (es_type == MESA_SHADER_TESS_EVAL) { + es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; + } else { + unreachable("invalid shader ES type"); + } - void *ptr = radv_alloc_shader_memory(device, variant); - memcpy(ptr, binary->code, binary->code_size); + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (info->uses_invocation_id) { + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + } else if (info->uses_prim_id) { + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ + } else if (info->gs.vertices_in >= 3) { + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + } else { + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + } + + config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); + config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL); + } else if (pdevice->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_TESS_CTRL) { + config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); + } else { + config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); + } } -static struct radv_shader_variant * -shader_variant_create(struct radv_device *device, - struct radv_shader_module *module, - struct nir_shader * const *shaders, - int shader_count, - gl_shader_stage stage, - struct ac_nir_compiler_options *options, - bool gs_copy_shader, - void **code_out, - unsigned *code_size_out) +struct radv_shader_variant * +radv_shader_variant_create(struct radv_device *device, + const struct radv_shader_binary *binary, + bool keep_shader_info) { - enum radeon_family chip_family = device->physical_device->rad_info.family; - bool dump_shaders = device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS; - enum ac_target_machine_options tm_options = 0; - struct radv_shader_variant *variant; - struct ac_shader_binary binary; - LLVMTargetMachineRef tm; - - variant = calloc(1, sizeof(struct radv_shader_variant)); + struct ac_shader_config config = {0}; + struct ac_rtld_binary rtld_binary = {0}; + struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant)); if (!variant) return NULL; + variant->ref_count = 1; + + if (binary->type == RADV_BINARY_TYPE_RTLD) { + struct ac_rtld_symbol lds_symbols[2]; + unsigned num_lds_symbols = 0; + const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data; + size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size; + + if (device->physical_device->rad_info.chip_class >= GFX9 && + (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) && + !binary->is_gs_copy_shader) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. + */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = binary->info.ngg_info.esgs_ring_size; + sym->align = 64 * 1024; + } + + if (binary->info.is_ngg && + binary->stage == MESA_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = binary->info.ngg_info.ngg_emit_size * 4; + sym->align = 4; + } + + struct ac_rtld_open_info open_info = { + .info = &device->physical_device->rad_info, + .shader_type = binary->stage, + .wave_size = binary->info.wave_size, + .num_parts = 1, + .elf_ptrs = &elf_data, + .elf_sizes = &elf_size, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols, + }; + + if (!ac_rtld_open(&rtld_binary, open_info)) { + free(variant); + return NULL; + } + + if (!ac_rtld_read_config(&rtld_binary, &config)) { + ac_rtld_close(&rtld_binary); + free(variant); + return NULL; + } + + if (rtld_binary.lds_size > 0) { + unsigned alloc_granularity = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; + config.lds_size = align(rtld_binary.lds_size, alloc_granularity) / alloc_granularity; + } + + variant->code_size = rtld_binary.rx_size; + variant->exec_size = rtld_binary.exec_size; + } else { + assert(binary->type == RADV_BINARY_TYPE_LEGACY); + config = ((struct radv_shader_binary_legacy *)binary)->config; + variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); + variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size; + } + + variant->info = binary->info; + radv_postprocess_config(device->physical_device, &config, &binary->info, + binary->stage, &variant->config); + + if (radv_device_use_secure_compile(device->instance)) { + if (binary->type == RADV_BINARY_TYPE_RTLD) + ac_rtld_close(&rtld_binary); + + return variant; + } + + void *dest_ptr = radv_alloc_shader_memory(device, variant); + + if (binary->type == RADV_BINARY_TYPE_RTLD) { + struct radv_shader_binary_rtld* bin = (struct radv_shader_binary_rtld *)binary; + struct ac_rtld_upload_info info = { + .binary = &rtld_binary, + .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset, + .rx_ptr = dest_ptr, + }; + + if (!ac_rtld_upload(&info)) { + radv_shader_variant_destroy(device, variant); + ac_rtld_close(&rtld_binary); + return NULL; + } + + if (keep_shader_info || + (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) { + const char *disasm_data; + size_t disasm_size; + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data, &disasm_size)) { + radv_shader_variant_destroy(device, variant); + ac_rtld_close(&rtld_binary); + return NULL; + } + + variant->ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL; + variant->disasm_string = malloc(disasm_size + 1); + memcpy(variant->disasm_string, disasm_data, disasm_size); + variant->disasm_string[disasm_size] = 0; + } + + ac_rtld_close(&rtld_binary); + } else { + struct radv_shader_binary_legacy* bin = (struct radv_shader_binary_legacy *)binary; + memcpy(dest_ptr, bin->data, bin->code_size); + + /* Add end-of-code markers for the UMR disassembler. */ + uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4; + for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) + ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; + + variant->ir_string = bin->ir_size ? strdup((const char*)(bin->data + bin->code_size)) : NULL; + variant->disasm_string = bin->disasm_size ? 
strdup((const char*)(bin->data + bin->code_size + bin->ir_size)) : NULL; + } + return variant; +} + +static char * +radv_dump_nir_shaders(struct nir_shader * const *shaders, + int shader_count) +{ + char *data = NULL; + char *ret = NULL; + size_t size = 0; + FILE *f = open_memstream(&data, &size); + if (f) { + for (int i = 0; i < shader_count; ++i) + nir_print_shader(shaders[i], f); + fclose(f); + } + + ret = malloc(size + 1); + if (ret) { + memcpy(ret, data, size); + ret[size] = 0; + } + free(data); + return ret; +} + +static struct radv_shader_variant * +shader_variant_compile(struct radv_device *device, + struct radv_shader_module *module, + struct nir_shader * const *shaders, + int shader_count, + gl_shader_stage stage, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options, + bool gs_copy_shader, + bool keep_shader_info, + bool use_aco, + struct radv_shader_binary **binary_out) +{ + enum radeon_family chip_family = device->physical_device->rad_info.family; + struct radv_shader_binary *binary = NULL; + options->family = chip_family; options->chip_class = device->physical_device->rad_info.chip_class; + options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader); + options->dump_preoptir = options->dump_shader && + device->instance->debug_flags & RADV_DEBUG_PREOPTIR; + options->record_ir = keep_shader_info; + options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; + options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; + options->address32_hi = device->physical_device->rad_info.address32_hi; + options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug; + options->use_ngg_streamout = device->physical_device->use_ngg_streamout; + + struct radv_shader_args args = {}; + args.options = options; + args.shader_info = info; + args.is_gs_copy_shader = gs_copy_shader; + radv_declare_shader_args(&args, + gs_copy_shader ? MESA_SHADER_VERTEX + : shaders[shader_count - 1]->info.stage, + shader_count >= 2, + shader_count >= 2 ? 
shaders[shader_count - 2]->info.stage + : MESA_SHADER_VERTEX); + + if (!use_aco || options->dump_shader || options->record_ir) + ac_init_llvm_once(); + + if (use_aco) { + aco_compile_shader(shader_count, shaders, &binary, &args); + binary->info = *info; + } else { + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; - if (options->supports_spill) tm_options |= AC_TM_SUPPORTS_SPILL; - if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) - tm_options |= AC_TM_SISCHED; - tm = ac_create_target_machine(chip_family, tm_options); - - if (gs_copy_shader) { - assert(shader_count == 1); - ac_create_gs_copy_shader(tm, *shaders, &binary, &variant->config, - &variant->info, options, dump_shaders); - } else { - ac_compile_nir_shader(tm, &binary, &variant->config, - &variant->info, shaders, shader_count, options, - dump_shaders); + if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) + tm_options |= AC_TM_SISCHED; + if (options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + radv_init_llvm_compiler(&ac_llvm, + thread_compiler, + chip_family, tm_options, + info->wave_size); + + if (gs_copy_shader) { + assert(shader_count == 1); + radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary, + &args); + } else { + radv_compile_nir_shader(&ac_llvm, &binary, &args, + shaders, shader_count); + } + + binary->info = *info; + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); } - LLVMDisposeTargetMachine(tm); + struct radv_shader_variant *variant = radv_shader_variant_create(device, binary, + keep_shader_info); + if (!variant) { + free(binary); + return NULL; + } + variant->aco_used = use_aco; - radv_fill_shader_variant(device, variant, &binary, stage); + if (options->dump_shader) { + fprintf(stderr, "disasm:\n%s\n", variant->disasm_string); + } - if (code_out) { - *code_out = binary.code; - *code_size_out = binary.code_size; - } else - free(binary.code); - free(binary.config); - free(binary.rodata); - free(binary.global_symbol_offsets); - free(binary.relocs); - variant->ref_count = 1; - if (device->trace_bo) { - variant->disasm_string = binary.disasm_string; + if (keep_shader_info) { + variant->nir_string = radv_dump_nir_shaders(shaders, shader_count); if (!gs_copy_shader && !module->nir) { - variant->nir = *shaders; - variant->spirv = (uint32_t *)module->data; + variant->spirv = malloc(module->size); + if (!variant->spirv) { + free(variant); + free(binary); + return NULL; + } + + memcpy(variant->spirv, module->data, module->size); variant->spirv_size = module->size; } - } else { - free(binary.disasm_string); } + if (binary_out) + *binary_out = binary; + else + free(binary); + return variant; } struct radv_shader_variant * -radv_shader_variant_create(struct radv_device *device, +radv_shader_variant_compile(struct radv_device *device, struct radv_shader_module *module, struct nir_shader *const *shaders, int shader_count, struct radv_pipeline_layout *layout, - const struct ac_shader_variant_key *key, - void **code_out, - unsigned *code_size_out) + const struct radv_shader_variant_key *key, + struct radv_shader_info *info, + bool keep_shader_info, + bool use_aco, + struct radv_shader_binary **binary_out) { - struct ac_nir_compiler_options options = {0}; + struct radv_nir_compiler_options options = {0}; options.layout = layout; if (key) options.key = *key; - 
options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH); - options.supports_spill = device->llvm_supports_spill; + options.explicit_scratch_args = use_aco; + options.robust_buffer_access = device->robust_buffer_access; - return shader_variant_create(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, - &options, false, code_out, code_size_out); + return shader_variant_compile(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, info, + &options, false, keep_shader_info, use_aco, binary_out); } struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader, - void **code_out, - unsigned *code_size_out, - bool multiview) + struct radv_shader_info *info, + struct radv_shader_binary **binary_out, + bool keep_shader_info, + bool multiview, bool use_aco) { - struct ac_nir_compiler_options options = {0}; + struct radv_nir_compiler_options options = {0}; + options.explicit_scratch_args = use_aco; options.key.has_multiview_view_index = multiview; - return shader_variant_create(device, NULL, &shader, 1, MESA_SHADER_VERTEX, - &options, true, code_out, code_size_out); + return shader_variant_compile(device, NULL, &shader, 1, MESA_SHADER_VERTEX, + info, &options, true, keep_shader_info, use_aco, binary_out); } void @@ -540,104 +1240,106 @@ radv_shader_variant_destroy(struct radv_device *device, list_del(&variant->slab_list); mtx_unlock(&device->shader_slab_mutex); - ralloc_free(variant->nir); + free(variant->spirv); + free(variant->nir_string); free(variant->disasm_string); + free(variant->ir_string); free(variant); } -uint32_t -radv_shader_stage_to_user_data_0(gl_shader_stage stage, enum chip_class chip_class, - bool has_gs, bool has_tess) +const char * +radv_get_shader_name(struct radv_shader_info *info, + gl_shader_stage stage) { switch (stage) { - case MESA_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; case MESA_SHADER_VERTEX: - if (chip_class >= GFX9) { - return has_tess ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : - has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0; - } - if (has_tess) - return R_00B530_SPI_SHADER_USER_DATA_LS_0; + if (info->vs.as_ls) + return "Vertex Shader as LS"; + else if (info->vs.as_es) + return "Vertex Shader as ES"; + else if (info->is_ngg) + return "Vertex Shader as ESGS"; else - return has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B130_SPI_SHADER_USER_DATA_VS_0; - case MESA_SHADER_GEOMETRY: - return chip_class >= GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B230_SPI_SHADER_USER_DATA_GS_0; - case MESA_SHADER_COMPUTE: - return R_00B900_COMPUTE_USER_DATA_0; + return "Vertex Shader as VS"; case MESA_SHADER_TESS_CTRL: - return chip_class >= GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : - R_00B430_SPI_SHADER_USER_DATA_HS_0; + return "Tessellation Control Shader"; case MESA_SHADER_TESS_EVAL: - if (chip_class >= GFX9) { - return has_gs ? 
R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0; - } - if (has_gs) - return R_00B330_SPI_SHADER_USER_DATA_ES_0; + if (info->tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else if (info->is_ngg) + return "Tessellation Evaluation Shader as ESGS"; else - return R_00B130_SPI_SHADER_USER_DATA_VS_0; + return "Tessellation Evaluation Shader as VS"; + case MESA_SHADER_GEOMETRY: + return "Geometry Shader"; + case MESA_SHADER_FRAGMENT: + return "Pixel Shader"; + case MESA_SHADER_COMPUTE: + return "Compute Shader"; default: - unreachable("unknown shader"); - } + return "Unknown shader"; + }; } -const char * -radv_get_shader_name(struct radv_shader_variant *var, gl_shader_stage stage) +unsigned +radv_get_max_workgroup_size(enum chip_class chip_class, + gl_shader_stage stage, + const unsigned *sizes) { switch (stage) { - case MESA_SHADER_VERTEX: return var->info.vs.as_ls ? "Vertex Shader as LS" : var->info.vs.as_es ? "Vertex Shader as ES" : "Vertex Shader as VS"; - case MESA_SHADER_GEOMETRY: return "Geometry Shader"; - case MESA_SHADER_FRAGMENT: return "Pixel Shader"; - case MESA_SHADER_COMPUTE: return "Compute Shader"; - case MESA_SHADER_TESS_CTRL: return "Tessellation Control Shader"; - case MESA_SHADER_TESS_EVAL: return var->info.tes.as_es ? "Tessellation Evaluation Shader as ES" : "Tessellation Evaluation Shader as VS"; + case MESA_SHADER_TESS_CTRL: + return chip_class >= GFX7 ? 128 : 64; + case MESA_SHADER_GEOMETRY: + return chip_class >= GFX9 ? 128 : 64; + case MESA_SHADER_COMPUTE: + break; default: - return "Unknown shader"; - }; + return 0; + } + + unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2]; + return max_workgroup_size; } -void -radv_shader_dump_stats(struct radv_device *device, - struct radv_shader_variant *variant, - gl_shader_stage stage, - FILE *file) +unsigned +radv_get_max_waves(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage) { - unsigned lds_increment = device->physical_device->rad_info.chip_class >= CIK ? 512 : 256; - struct ac_shader_config *conf; + enum chip_class chip_class = device->physical_device->rad_info.chip_class; + unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; + uint8_t wave_size = variant->info.wave_size; + struct ac_shader_config *conf = &variant->config; unsigned max_simd_waves; unsigned lds_per_wave = 0; - switch (device->physical_device->rad_info.family) { - /* These always have 8 waves: */ - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - max_simd_waves = 8; - break; - default: - max_simd_waves = 10; - } - - conf = &variant->config; + max_simd_waves = device->physical_device->rad_info.max_wave64_per_simd; if (stage == MESA_SHADER_FRAGMENT) { lds_per_wave = conf->lds_size * lds_increment + - align(variant->info.fs.num_interp * 48, + align(variant->info.ps.num_interp * 48, lds_increment); + } else if (stage == MESA_SHADER_COMPUTE) { + unsigned max_workgroup_size = + radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, wave_size); } if (conf->num_sgprs) { - if (device->physical_device->rad_info.chip_class >= VI) - max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); - else - max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); + unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 
16 : 8); + max_simd_waves = + MIN2(max_simd_waves, + device->physical_device->rad_info.num_physical_sgprs_per_simd / + sgprs); } - if (conf->num_vgprs) - max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); + if (conf->num_vgprs) { + unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4); + max_simd_waves = + MIN2(max_simd_waves, + RADV_NUM_PHYSICAL_VGPRS / vgprs); + } /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD * that PS can use. @@ -645,27 +1347,153 @@ radv_shader_dump_stats(struct radv_device *device, if (lds_per_wave) max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); - fprintf(file, "\n%s:\n", radv_get_shader_name(variant, stage)); + return max_simd_waves; +} + +static void +generate_shader_stats(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage, + struct _mesa_string_buffer *buf) +{ + struct ac_shader_config *conf = &variant->config; + unsigned max_simd_waves = radv_get_max_waves(device, variant, stage); if (stage == MESA_SHADER_FRAGMENT) { - fprintf(file, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); + _mesa_string_buffer_printf(buf, "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } + + _mesa_string_buffer_printf(buf, "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "PrivMem VGPRS: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, + conf->spilled_sgprs, conf->spilled_vgprs, + variant->info.private_mem_vgprs, variant->exec_size, + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); +} + +void +radv_shader_dump_stats(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage, + FILE *file) +{ + struct _mesa_string_buffer *buf = _mesa_string_buffer_create(NULL, 256); + + generate_shader_stats(device, variant, stage, buf); + + fprintf(file, "\n%s:\n", radv_get_shader_name(&variant->info, stage)); + fprintf(file, "%s", buf->buf); + + _mesa_string_buffer_destroy(buf); +} + +VkResult +radv_GetShaderInfoAMD(VkDevice _device, + VkPipeline _pipeline, + VkShaderStageFlagBits shaderStage, + VkShaderInfoTypeAMD infoType, + size_t* pInfoSize, + void* pInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage); + struct radv_shader_variant *variant = pipeline->shaders[stage]; + struct _mesa_string_buffer *buf; + VkResult result = VK_SUCCESS; + + /* Spec doesn't indicate what to do if the stage is invalid, so just + * return no info for this. */ + if (!variant) + return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + + switch (infoType) { + case VK_SHADER_INFO_TYPE_STATISTICS_AMD: + if (!pInfo) { + *pInfoSize = sizeof(VkShaderStatisticsInfoAMD); + } else { + unsigned lds_multiplier = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; + struct ac_shader_config *conf = &variant->config; + + VkShaderStatisticsInfoAMD statistics = {}; + statistics.shaderStageMask = shaderStage; + statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS; + statistics.numPhysicalSgprs = device->physical_device->rad_info.num_physical_sgprs_per_simd; + statistics.numAvailableSgprs = statistics.numPhysicalSgprs; + + if (stage == MESA_SHADER_COMPUTE) { + unsigned *local_size = variant->info.cs.block_size; + unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2]; + + statistics.numAvailableVgprs = statistics.numPhysicalVgprs / + ceil((double)workgroup_size / statistics.numPhysicalVgprs); + + statistics.computeWorkGroupSize[0] = local_size[0]; + statistics.computeWorkGroupSize[1] = local_size[1]; + statistics.computeWorkGroupSize[2] = local_size[2]; + } else { + statistics.numAvailableVgprs = statistics.numPhysicalVgprs; + } + + statistics.resourceUsage.numUsedVgprs = conf->num_vgprs; + statistics.resourceUsage.numUsedSgprs = conf->num_sgprs; + statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768; + statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier; + statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave; + + size_t size = *pInfoSize; + *pInfoSize = sizeof(statistics); + + memcpy(pInfo, &statistics, MIN2(size, *pInfoSize)); + + if (size < *pInfoSize) + result = VK_INCOMPLETE; + } + + break; + case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: + buf = _mesa_string_buffer_create(NULL, 1024); + + _mesa_string_buffer_printf(buf, "%s:\n", radv_get_shader_name(&variant->info, stage)); + _mesa_string_buffer_printf(buf, "%s\n\n", variant->ir_string); + _mesa_string_buffer_printf(buf, "%s\n\n", variant->disasm_string); + generate_shader_stats(device, variant, stage, buf); + + /* Need to include the null terminator. */ + size_t length = buf->length + 1; + + if (!pInfo) { + *pInfoSize = length; + } else { + size_t size = *pInfoSize; + *pInfoSize = length; + + memcpy(pInfo, buf->buf, MIN2(size, length)); + + if (size < length) + result = VK_INCOMPLETE; + } + + _mesa_string_buffer_destroy(buf); + break; + default: + /* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */ + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; } - fprintf(file, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - "********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, variant->code_size, - conf->lds_size, conf->scratch_bytes_per_wave, - max_simd_waves); + return result; } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c index 63c9c033a..dca604afe 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,272 +22,298 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "ac_nir_to_llvm.h" -#include "ac_rtld.h" -#include "si_pipe.h" #include "si_shader_internal.h" +#include "si_pipe.h" +#include "ac_rtld.h" +#include "ac_nir_to_llvm.h" #include "sid.h" + #include "tgsi/tgsi_from_mesa.h" #include "util/u_memory.h" struct si_llvm_diagnostics { - struct pipe_debug_callback *debug; - unsigned retval; + struct pipe_debug_callback *debug; + unsigned retval; }; static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) { - struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; - LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); - const char *severity_str = NULL; - - switch (severity) { - case LLVMDSError: - severity_str = "error"; - break; - case LLVMDSWarning: - severity_str = "warning"; - break; - case LLVMDSRemark: - case LLVMDSNote: - default: - return; - } - - char *description = LLVMGetDiagInfoDescription(di); - - pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str, - description); - - if (severity == LLVMDSError) { - diag->retval = 1; - fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description); - } - - LLVMDisposeMessage(description); + struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + case LLVMDSNote: + default: + return; + } + + char *description = LLVMGetDiagInfoDescription(di); + + pipe_debug_message(diag->debug, SHADER_INFO, + "LLVM diagnostic (%s): %s", severity_str, description); + + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); + } + + LLVMDisposeMessage(description); } -bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, - struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, const char *name, bool less_optimized) +bool si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + const char *name, + bool less_optimized) { - unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - - if (si_can_dump_shader(sscreen, shader_type)) { - fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - - if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { - fprintf(stderr, "%s LLVM IR:\n\n", name); - ac_dump_module(ac->module); - fprintf(stderr, "\n"); - } - } - - if (sscreen->record_llvm_ir) { - char *ir = LLVMPrintModuleToString(ac->module); - binary->llvm_ir_string = strdup(ir); - LLVMDisposeMessage(ir); - } - - if (!si_replace_shader(count, binary)) { - struct ac_compiler_passes *passes = compiler->passes; - - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) - passes = compiler->low_opt_passes; - - struct si_llvm_diagnostics diag = {debug}; - LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); - - if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer, - &binary->elf_size)) - diag.retval = 1; - - if (diag.retval != 0) { - 
pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); - return false; - } - } - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sscreen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = ac->wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size})) - return false; - - bool ok = ac_rtld_read_config(&rtld, conf); - ac_rtld_close(&rtld); - return ok; + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); + + if (si_can_dump_shader(sscreen, shader_type)) { + fprintf(stderr, "radeonsi: Compiling shader %d\n", count); + + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); + ac_dump_module(ac->module); + fprintf(stderr, "\n"); + } + } + + if (sscreen->record_llvm_ir) { + char *ir = LLVMPrintModuleToString(ac->module); + binary->llvm_ir_string = strdup(ir); + LLVMDisposeMessage(ir); + } + + if (!si_replace_shader(count, binary)) { + struct ac_compiler_passes *passes = compiler->passes; + + if (ac->wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + + struct si_llvm_diagnostics diag = {debug}; + LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); + + if (!ac_compile_module_to_elf(passes, ac->module, + (char **)&binary->elf_buffer, + &binary->elf_size)) + diag.retval = 1; + + if (diag.retval != 0) { + pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); + return false; + } + } + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ + .info = &sscreen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = ac->wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size })) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + ac_rtld_close(&rtld); + return ok; } -void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, unsigned wave_size) +void si_llvm_context_init(struct si_shader_context *ctx, + struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + unsigned wave_size) { - memset(ctx, 0, sizeof(*ctx)); - ctx->screen = sscreen; - ctx->compiler = compiler; - - ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, - AC_FLOAT_MODE_DEFAULT_OPENGL, wave_size, 64); + memset(ctx, 0, sizeof(*ctx)); + ctx->screen = sscreen; + ctx->compiler = compiler; + + ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, + sscreen->info.family, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + wave_size, 64); } -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, - unsigned num_return_elems, unsigned max_workgroup_size) +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, + LLVMTypeRef *return_types, unsigned num_return_elems, + unsigned max_workgroup_size) { - LLVMTypeRef ret_type; - enum ac_llvm_calling_convention call_conv; - enum pipe_shader_type real_shader_type; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true); - else - ret_type = ctx->ac.voidt; - - real_shader_type = ctx->type; - - /* LS is merged into HS (TCS), and ES is merged into GS. 
*/ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) - real_shader_type = PIPE_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) - real_shader_type = PIPE_SHADER_GEOMETRY; - } - - switch (real_shader_type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - call_conv = AC_LLVM_AMDGPU_VS; - break; - case PIPE_SHADER_TESS_CTRL: - call_conv = AC_LLVM_AMDGPU_HS; - break; - case PIPE_SHADER_GEOMETRY: - call_conv = AC_LLVM_AMDGPU_GS; - break; - case PIPE_SHADER_FRAGMENT: - call_conv = AC_LLVM_AMDGPU_PS; - break; - case PIPE_SHADER_COMPUTE: - call_conv = AC_LLVM_AMDGPU_CS; - break; - default: - unreachable("Unhandle shader type"); - } - - /* Setup the function */ - ctx->return_type = ret_type; - ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module); - ctx->return_value = LLVMGetUndef(ctx->return_type); - - if (ctx->screen->info.address32_hi) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits", - ctx->screen->info.address32_hi); - } - - LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); - - ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + LLVMTypeRef ret_type; + enum ac_llvm_calling_convention call_conv; + enum pipe_shader_type real_shader_type; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx->ac.context, + return_types, + num_return_elems, true); + else + ret_type = ctx->ac.voidt; + + real_shader_type = ctx->type; + + /* LS is merged into HS (TCS), and ES is merged into GS. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (ctx->shader->key.as_ls) + real_shader_type = PIPE_SHADER_TESS_CTRL; + else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + real_shader_type = PIPE_SHADER_GEOMETRY; + } + + switch (real_shader_type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + call_conv = AC_LLVM_AMDGPU_VS; + break; + case PIPE_SHADER_TESS_CTRL: + call_conv = AC_LLVM_AMDGPU_HS; + break; + case PIPE_SHADER_GEOMETRY: + call_conv = AC_LLVM_AMDGPU_GS; + break; + case PIPE_SHADER_FRAGMENT: + call_conv = AC_LLVM_AMDGPU_PS; + break; + case PIPE_SHADER_COMPUTE: + call_conv = AC_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + /* Setup the function */ + ctx->return_type = ret_type; + ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, + ret_type, ctx->ac.module); + ctx->return_value = LLVMGetUndef(ctx->return_type); + + if (ctx->screen->info.address32_hi) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, + "amdgpu-32bit-address-high-bits", + ctx->screen->info.address32_hi); + } + + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, + "no-signed-zeros-fp-math", + "true"); + + ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); } void si_llvm_optimize_module(struct si_shader_context *ctx) { - /* Dump LLVM IR before any optimization passes */ - if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type)) - LLVMDumpModule(ctx->ac.module); - - /* Run the pass */ - LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); - LLVMDisposeBuilder(ctx->ac.builder); + /* Dump LLVM IR before any optimization passes */ + if (ctx->screen->debug_flags & DBG(PREOPT_IR) && + si_can_dump_shader(ctx->screen, ctx->type)) + LLVMDumpModule(ctx->ac.module); + + /* Run the pass */ + LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); + LLVMDisposeBuilder(ctx->ac.builder); } void si_llvm_dispose(struct 
si_shader_context *ctx) { - LLVMDisposeModule(ctx->ac.module); - LLVMContextDispose(ctx->ac.context); - ac_llvm_context_dispose(&ctx->ac); + LLVMDisposeModule(ctx->ac.module); + LLVMContextDispose(ctx->ac.context); + ac_llvm_context_dispose(&ctx->ac); } /** * Load a dword from a constant buffer. */ -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, - LLVMValueRef offset) +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, + LLVMValueRef resource, LLVMValueRef offset) { - return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true); + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, + 0, 0, true, true); } void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) { - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->ac.builder); - else - LLVMBuildRet(ctx->ac.builder, ret); + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(ctx->ac.builder); + else + LLVMBuildRet(ctx->ac.builder, ret); } LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, ""); + return LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_get_arg(&ctx->ac, param), + return_index, ""); } LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef p = ac_get_arg(&ctx->ac, param); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = ac_get_arg(&ctx->ac, param); - return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, ""); + return LLVMBuildInsertValue(builder, ret, + ac_to_float(&ctx->ac, p), + return_index, ""); } LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); } LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) { - LLVMValueRef ptr[2], list; - bool merged_shader = si_is_merged_shader(ctx->shader); + LLVMValueRef ptr[2], list; + bool merged_shader = si_is_merged_shader(ctx->shader); - ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = - LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - return list; + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 
8 : 0) + SI_SGPR_RW_BUFFERS); + list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + return list; } -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, - LLVMValueRef val1, LLVMValueRef val2) +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef val1, + LLVMValueRef val2) { - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, val1), - ac_to_integer(&ctx->ac, val2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, val1), + ac_to_integer(&ctx->ac, val2), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); } void si_llvm_emit_barrier(struct si_shader_context *ctx) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - - ac_build_s_barrier(&ctx->ac); + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ctx->screen->info.chip_class == GFX6 && + ctx->type == PIPE_SHADER_TESS_CTRL) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + + ac_build_s_barrier(&ctx->ac); } /* Ensure that the esgs ring is declared. @@ -297,169 +323,187 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx) */ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) { - if (ctx->esgs_ring) - return; + if (ctx->esgs_ring) + return; - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "esgs_ring", AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset) +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset) { - LLVMValueRef args[] = { - ac_get_arg(&ctx->ac, param), - LLVMConstInt(ctx->ac.i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, - AC_FUNC_ATTR_CONVERGENT); + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, + "llvm.amdgcn.init.exec.from.input", + ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); } /** * Get the value of a shader input parameter and extract a bitfield. 
*/ -static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value, - unsigned rshift, unsigned bitwidth) +static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, + LLVMValueRef value, unsigned rshift, + unsigned bitwidth) { - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = ac_to_integer(&ctx->ac, value); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = ac_to_integer(&ctx->ac, value); - if (rshift) - value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), ""); + if (rshift) + value = LLVMBuildLShr(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, rshift, 0), ""); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), ""); - } + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, mask, 0), ""); + } - return value; + return value; } -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, - unsigned bitwidth) +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, + struct ac_arg param, unsigned rshift, + unsigned bitwidth) { - LLVMValueRef value = ac_get_arg(&ctx->ac, param); + LLVMValueRef value = ac_get_arg(&ctx->ac, param); - return unpack_llvm_param(ctx, value, rshift, bitwidth); + return unpack_llvm_param(ctx, value, rshift, bitwidth); } -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, + unsigned swizzle) { - if (swizzle > 0) - return ctx->ac.i32_0; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - return ac_get_arg(&ctx->ac, ctx->vs_prim_id); - case PIPE_SHADER_TESS_CTRL: - return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); - case PIPE_SHADER_GEOMETRY: - return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); - default: - assert(0); - return ctx->ac.i32_0; - } + if (swizzle > 0) + return ctx->ac.i32_0; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + return ac_get_arg(&ctx->ac, ctx->vs_prim_id); + case PIPE_SHADER_TESS_CTRL: + return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); + case PIPE_SHADER_GEOMETRY: + return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + default: + assert(0); + return ctx->ac.i32_0; + } } LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef values[3]; - LLVMValueRef result; - unsigned i; - unsigned *properties = ctx->shader->selector->info.properties; + LLVMValueRef values[3]; + LLVMValueRef result; + unsigned i; + unsigned *properties = ctx->shader->selector->info.properties; - if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { - unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]}; + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = { + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] + }; - for (i = 0; i < 3; ++i) - values[i] = 
LLVMConstInt(ctx->ac.i32, sizes[i], 0); + for (i = 0; i < 3; ++i) + values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0); - result = ac_build_gather_values(&ctx->ac, values, 3); - } else { - result = ac_get_arg(&ctx->ac, ctx->block_size); - } + result = ac_build_gather_values(&ctx->ac, values, 3); + } else { + result = ac_get_arg(&ctx->ac, ctx->block_size); + } - return result; + return result; } void si_llvm_declare_compute_memory(struct si_shader_context *ctx) { - struct si_shader_selector *sel = ctx->shader->selector; - unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; + struct si_shader_selector *sel = ctx->shader->selector; + unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef var; + LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef var; - assert(!ctx->ac.lds); + assert(!ctx->ac.lds); - var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size), - "compute_lds", AC_ADDR_SPACE_LDS); - LLVMSetAlignment(var, 64 * 1024); + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, + LLVMArrayType(ctx->ac.i8, lds_size), + "compute_lds", + AC_ADDR_SPACE_LDS); + LLVMSetAlignment(var, 64 * 1024); - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - unsigned colors_read = ctx->shader->selector->info.colors_read; - LLVMValueRef main_fn = ctx->main_fn; - - LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); - - unsigned offset = SI_PARAM_POS_FIXED_PT + 1; - - if (colors_read & 0x0f) { - unsigned mask = colors_read & 0x0f; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); - } - if (colors_read & 0xf0) { - unsigned mask = (colors_read & 0xf0) >> 4; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); - } - - ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; - } else if (nir->info.stage == MESA_SHADER_COMPUTE) { - if (nir->info.cs.user_data_components_amd) { - ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); - ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, - nir->info.cs.user_data_components_amd); - } - } - - ctx->abi.inputs = &ctx->inputs[0]; - ctx->abi.clamp_shadow_reference = true; - ctx->abi.robust_buffer_access = true; - - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { - assert(gl_shader_stage_is_compute(nir->info.stage)); - si_llvm_declare_compute_memory(ctx); - } - ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); - - return true; + if (nir->info.stage == MESA_SHADER_VERTEX) { + si_llvm_load_vs_inputs(ctx, nir); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + unsigned colors_read = + ctx->shader->selector->info.colors_read; + LLVMValueRef main_fn = ctx->main_fn; + + LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); + + unsigned offset = SI_PARAM_POS_FIXED_PT + 1; + + if (colors_read & 0x0f) { + unsigned mask = colors_read & 0x0f; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color0 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + if (colors_read & 0xf0) { + unsigned mask = (colors_read & 0xf0) >> 4; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color1 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + + ctx->abi.interp_at_sample_force_center = + ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + } else if (nir->info.stage == MESA_SHADER_COMPUTE) { + if (nir->info.cs.user_data_components_amd) { + ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); + ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, + nir->info.cs.user_data_components_amd); + } + } + + ctx->abi.inputs = &ctx->inputs[0]; + ctx->abi.clamp_shadow_reference = true; + ctx->abi.robust_buffer_access = true; + + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { + assert(gl_shader_stage_is_compute(nir->info.stage)); + si_llvm_declare_compute_memory(ctx); + } + ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); + + return true; } /** @@ -467,270 +511,278 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) * runs them in sequence to form a monolithic shader. */ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part) + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part) { - LLVMBuilderRef builder = ctx->ac.builder; - /* PS epilog has one arg per color component; gfx9 merged shader - * prologs need to forward 40 SGPRs. 
- */ - LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; - LLVMTypeRef function_type; - unsigned num_first_params; - unsigned num_out, initial_num_out; - ASSERTED unsigned num_out_sgpr; /* used in debug checks */ - ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ - unsigned num_sgprs, num_vgprs; - unsigned gprs; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - for (unsigned i = 0; i < num_parts; ++i) { - ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(parts[i], LLVMPrivateLinkage); - } - - /* The parameters of the wrapper function correspond to those of the - * first part in terms of SGPRs and VGPRs, but we use the types of the - * main part to get the right types. This is relevant for the - * dereferenceable attribute on descriptor table pointers. - */ - num_sgprs = 0; - num_vgprs = 0; - - function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_first_params = LLVMCountParamTypes(function_type); - - for (unsigned i = 0; i < num_first_params; ++i) { - LLVMValueRef param = LLVMGetParam(parts[0], i); - - if (ac_is_sgpr_param(param)) { - assert(num_vgprs == 0); - num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } else { - num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } - } - - gprs = 0; - while (gprs < num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); - LLVMTypeRef type = LLVMTypeOf(param); - unsigned size = ac_get_type_size(type) / 4; - - /* This is going to get casted anyways, so we don't have to - * have the exact same type. But we do have to preserve the - * pointer-ness so that LLVM knows about it. - */ - enum ac_arg_type arg_type = AC_ARG_INT; - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - if (LLVMGetVectorSize(type) == 4) - arg_type = AC_ARG_CONST_DESC_PTR; - else if (LLVMGetVectorSize(type) == 8) - arg_type = AC_ARG_CONST_IMAGE_PTR; - else - assert(0); - } else if (type == ctx->ac.f32) { - arg_type = AC_ARG_CONST_FLOAT_PTR; - } else { - assert(0); - } - } - - ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL); - - assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); - assert(gprs + size <= num_sgprs + num_vgprs && - (gprs >= num_sgprs || gprs + size <= num_sgprs)); - - gprs += size; - } - - /* Prepare the return type. */ - unsigned num_returns = 0; - LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; - - last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); - return_type = LLVMGetReturnType(last_func_type); - - switch (LLVMGetTypeKind(return_type)) { - case LLVMStructTypeKind: - num_returns = LLVMCountStructElementTypes(return_type); - assert(num_returns <= ARRAY_SIZE(returns)); - LLVMGetStructElementTypes(return_type, returns); - break; - case LLVMVoidTypeKind: - break; - default: - unreachable("unexpected type"); - } - - si_llvm_create_func(ctx, "wrapper", returns, num_returns, - si_get_max_workgroup_size(ctx->shader)); - - if (si_is_merged_shader(ctx->shader)) - ac_init_exec_full_mask(&ctx->ac); - - /* Record the arguments of the function as if they were an output of - * a previous part. - */ - num_out = 0; - num_out_sgpr = 0; - - for (unsigned i = 0; i < ctx->args.arg_count; ++i) { - LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); - LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? 
ctx->ac.i32 : ctx->ac.f32; - unsigned size = ac_get_type_size(param_type) / 4; - - if (size == 1) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); - param_type = ctx->ac.i32; - } - - if (param_type != out_type) - param = LLVMBuildBitCast(builder, param, out_type, ""); - out[num_out++] = param; - } else { - LLVMTypeRef vector_type = LLVMVectorType(out_type, size); - - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); - param_type = ctx->ac.i64; - } - - if (param_type != vector_type) - param = LLVMBuildBitCast(builder, param, vector_type, ""); - - for (unsigned j = 0; j < size; ++j) - out[num_out++] = - LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); - } - - if (ctx->args.args[i].file == AC_ARG_SGPR) - num_out_sgpr = num_out; - } - - memcpy(initial, out, sizeof(out)); - initial_num_out = num_out; - initial_num_out_sgpr = num_out_sgpr; - - /* Now chain the parts. */ - LLVMValueRef ret = NULL; - for (unsigned part = 0; part < num_parts; ++part) { - LLVMValueRef in[AC_MAX_ARGS]; - LLVMTypeRef ret_type; - unsigned out_idx = 0; - unsigned num_params = LLVMCountParams(parts[part]); - - /* Merged shaders are executed conditionally depending - * on the number of enabled threads passed in the input SGPRs. */ - if (si_is_multi_part_shader(ctx->shader) && part == 0) { - LLVMValueRef ena, count = initial[3]; - - count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); - } - - /* Derive arguments for the next part from outputs of the - * previous one. - */ - for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { - LLVMValueRef param; - LLVMTypeRef param_type; - bool is_sgpr; - unsigned param_size; - LLVMValueRef arg = NULL; - - param = LLVMGetParam(parts[part], param_idx); - param_type = LLVMTypeOf(param); - param_size = ac_get_type_size(param_type) / 4; - is_sgpr = ac_is_sgpr_param(param); - - if (is_sgpr) { - ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG); - } else if (out_idx < num_out_sgpr) { - /* Skip returned SGPRs the current part doesn't - * declare on the input. */ - out_idx = num_out_sgpr; - } - - assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); - - if (param_size == 1) - arg = out[out_idx]; - else - arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); - - if (LLVMTypeOf(arg) != param_type) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } else { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } - } else { - arg = LLVMBuildBitCast(builder, arg, param_type, ""); - } - } - - in[param_idx] = arg; - out_idx += param_size; - } - - ret = ac_build_call(&ctx->ac, parts[part], in, num_params); - - if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { - ac_build_endif(&ctx->ac, 6506); - - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. 
- * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - continue; - } - - /* Extract the returned GPRs. */ - ret_type = LLVMTypeOf(ret); - num_out = 0; - num_out_sgpr = 0; - - if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { - assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); - - unsigned ret_size = LLVMCountStructElementTypes(ret_type); - - for (unsigned i = 0; i < ret_size; ++i) { - LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); - - assert(num_out < ARRAY_SIZE(out)); - out[num_out++] = val; - - if (LLVMTypeOf(val) == ctx->ac.i32) { - assert(num_out_sgpr + 1 == num_out); - num_out_sgpr = num_out; - } - } - } - } - - /* Return the value from the last part. */ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(builder); - else - LLVMBuildRet(builder, ret); + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 40 SGPRs. + */ + LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; + LLVMTypeRef function_type; + unsigned num_first_params; + unsigned num_out, initial_num_out; + ASSERTED unsigned num_out_sgpr; /* used in debug checks */ + ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned gprs; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + for (unsigned i = 0; i < num_parts; ++i) { + ac_add_function_attr(ctx->ac.context, parts[i], -1, + AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_first_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_first_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } + } + + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; + + /* This is going to get casted anyways, so we don't have to + * have the exact same type. But we do have to preserve the + * pointer-ness so that LLVM knows about it. + */ + enum ac_arg_type arg_type = AC_ARG_INT; + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + if (LLVMGetVectorSize(type) == 4) + arg_type = AC_ARG_CONST_DESC_PTR; + else if (LLVMGetVectorSize(type) == 8) + arg_type = AC_ARG_CONST_IMAGE_PTR; + else + assert(0); + } else if (type == ctx->ac.f32) { + arg_type = AC_ARG_CONST_FLOAT_PTR; + } else { + assert(0); + } + } + + ac_add_arg(&ctx->args, gprs < num_sgprs ? 
AC_ARG_SGPR : AC_ARG_VGPR, + size, arg_type, NULL); + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + /* Prepare the return type. */ + unsigned num_returns = 0; + LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; + + last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); + return_type = LLVMGetReturnType(last_func_type); + + switch (LLVMGetTypeKind(return_type)) { + case LLVMStructTypeKind: + num_returns = LLVMCountStructElementTypes(return_type); + assert(num_returns <= ARRAY_SIZE(returns)); + LLVMGetStructElementTypes(return_type, returns); + break; + case LLVMVoidTypeKind: + break; + default: + unreachable("unexpected type"); + } + + si_llvm_create_func(ctx, "wrapper", returns, num_returns, + si_get_max_workgroup_size(ctx->shader)); + + if (si_is_merged_shader(ctx->shader)) + ac_init_exec_full_mask(&ctx->ac); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < ctx->args.arg_count; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; + unsigned size = ac_get_type_size(param_type) / 4; + + if (size == 1) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); + param_type = ctx->ac.i32; + } + + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); + param_type = ctx->ac.i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = LLVMBuildExtractElement( + builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); + } + + if (ctx->args.args[i].file == AC_ARG_SGPR) + num_out_sgpr = num_out; + } + + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + + /* Now chain the parts. */ + LLVMValueRef ret = NULL; + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[AC_MAX_ARGS]; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (si_is_multi_part_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, + LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } + + /* Derive arguments for the next part from outputs of the + * previous one. 
+ */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = ac_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { + ac_add_function_attr(ctx->ac.context, parts[part], + param_idx + 1, AC_FUNC_ATTR_INREG); + } else if (out_idx < num_out_sgpr) { + /* Skip returned SGPRs the current part doesn't + * declare on the input. */ + out_idx = num_out_sgpr; + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(param_type) == + AC_ADDR_SPACE_CONST_32BIT) { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + + if (si_is_multi_part_shader(ctx->shader) && + part + 1 == next_shader_first_part) { + ac_build_endif(&ctx->ac, 6506); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } + + /* Extract the returned GPRs. */ + ret_type = LLVMTypeOf(ret); + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = + LLVMBuildExtractValue(builder, ret, i, ""); + + assert(num_out < ARRAY_SIZE(out)); + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->ac.i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + /* Return the value from the last part. */ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(builder); + else + LLVMBuildRet(builder, ret); } |
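
Illustration only (not part of the patch): the max_simd_waves value that radv_get_max_waves() computes in the hunk above reduces to taking the minimum of three per-SIMD budgets (SGPRs, VGPRs and LDS). The sketch below shows that arithmetic in isolation; the constants are placeholders standing in for the per-chip values the driver reads from rad_info, chosen only to make the example self-contained.

	/* Standalone sketch of the occupancy bound; all constants are
	 * placeholders, not authoritative hardware values. */
	#define MIN2(a, b)	((a) < (b) ? (a) : (b))
	#define PHYS_VGPRS	1024	/* placeholder: VGPR file per SIMD */
	#define PHYS_SGPRS	800	/* placeholder: SGPR file per SIMD */
	#define LDS_PER_SIMD	16384	/* 64 KiB per CU divided over 4 SIMDs */

	static unsigned align_up(unsigned x, unsigned a)
	{
		return (x + a - 1) & ~(a - 1);	/* a must be a power of two */
	}

	static unsigned example_max_waves(unsigned sgprs, unsigned vgprs,
					  unsigned lds_per_wave, unsigned wave_size)
	{
		unsigned max_waves = 10;	/* placeholder: wave slots per SIMD */

		if (sgprs)
			max_waves = MIN2(max_waves,
					 PHYS_SGPRS / align_up(sgprs, 16));
		if (vgprs)
			max_waves = MIN2(max_waves,
					 PHYS_VGPRS / align_up(vgprs,
							       wave_size == 32 ? 8 : 4));
		if (lds_per_wave)
			max_waves = MIN2(max_waves, LDS_PER_SIMD / lds_per_wave);

		return max_waves;
	}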
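
The radv_GetShaderInfoAMD entry point added above follows the usual Vulkan size-query/fetch convention: a first call with pInfo == NULL only reports the required size, and a second call copies the data, returning VK_INCOMPLETE if the caller's buffer was too small. A minimal application-side sketch of that pattern, assuming a VkDevice and VkPipeline that were created with the VK_AMD_shader_info device extension enabled (the names here are illustrative, not part of the patch):

	#include <stdio.h>
	#include <vulkan/vulkan.h>

	static void dump_fs_stats(VkDevice device, VkPipeline pipeline)
	{
		PFN_vkGetShaderInfoAMD get_info = (PFN_vkGetShaderInfoAMD)
			vkGetDeviceProcAddr(device, "vkGetShaderInfoAMD");
		size_t size = 0;
		VkShaderStatisticsInfoAMD stats;

		if (!get_info)
			return;

		/* First call: pInfo == NULL, only the required size is written. */
		if (get_info(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
			     VK_SHADER_INFO_TYPE_STATISTICS_AMD, &size, NULL) != VK_SUCCESS ||
		    size < sizeof(stats))
			return;

		/* Second call: the statistics are copied into the caller's buffer. */
		size = sizeof(stats);
		if (get_info(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
			     VK_SHADER_INFO_TYPE_STATISTICS_AMD, &size, &stats) != VK_SUCCESS)
			return;

		printf("VGPRs used: %u of %u, SGPRs used: %u of %u, scratch: %u bytes/wave\n",
		       stats.resourceUsage.numUsedVgprs, stats.numPhysicalVgprs,
		       stats.resourceUsage.numUsedSgprs, stats.numPhysicalSgprs,
		       (unsigned)stats.resourceUsage.scratchMemUsageInBytes);
	}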