author     Jonathan Gray <jsg@cvs.openbsd.org>    2021-03-11 05:14:49 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2021-03-11 05:14:49 +0000
commit     b87befe14c653861830ec0bfa9b8a27e503a2294 (patch)
tree       e8fa56db169d60b95a62ecef48e4bc53b6493588 /lib/mesa/src
parent     845976941bf8cd7bfdd6672d614ac73cd02286c3 (diff)
ac: unify denorm setting enforcement
From Marek Olsak
56cc10bd27b24d513de88bf7fa94a6c8f43e348f in mainline Mesa
Diffstat (limited to 'lib/mesa/src')

 -rw-r--r--  lib/mesa/src/amd/common/ac_binary.c                     |   13
 -rw-r--r--  lib/mesa/src/amd/vulkan/radv_shader.c                   | 1386
 -rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c  | 1232

 3 files changed, 1762 insertions, 869 deletions
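Note: the substance of this change is the float_mode fixup that ac_parse_shader_binary_config() now applies after parsing the shader config registers (see the ac_binary.c hunk below), so the denorm policy lives in the common config parser. A minimal standalone sketch of that policy follows; the V_00B028_* values are written out here as assumptions for illustration rather than taken from Mesa's generated register headers.

/* Sketch of the denorm policy applied to the parsed FLOAT_MODE field.
 * The field values are assumed for illustration; the real definitions
 * live in Mesa's generated sid/register headers. */
#include <stdint.h>

#define V_00B028_FP_ALL_DENORMS 0xF0u /* assumed: 16-, 32- and 64-bit denorm bits */
#define V_00B028_FP_64_DENORMS  0xC0u /* assumed: 16-bit and 64-bit denorm bits */

static uint32_t
apply_denorm_policy(uint32_t float_mode)
{
	/* Enable 64-bit and 16-bit denormals (no performance cost), but keep
	 * 32-bit denormals disabled: they disable output modifiers, break
	 * v_mad_f32, and would be very slow on GFX6/GFX7. */
	float_mode &= ~V_00B028_FP_ALL_DENORMS;
	float_mode |= V_00B028_FP_64_DENORMS;
	return float_mode;
}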
diff --git a/lib/mesa/src/amd/common/ac_binary.c b/lib/mesa/src/amd/common/ac_binary.c index 5f92a57d7..8761422bd 100644 --- a/lib/mesa/src/amd/common/ac_binary.c +++ b/lib/mesa/src/amd/common/ac_binary.c @@ -58,11 +58,13 @@ void ac_parse_shader_binary_config(const char *data, size_t nbytes, conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */ conf->float_mode = G_00B028_FLOAT_MODE(value); conf->rsrc1 = value; break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */ conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value); conf->rsrc2 = value; break; @@ -124,4 +126,15 @@ void ac_parse_shader_binary_config(const char *data, size_t nbytes, /* sgprs spills aren't spilling */ conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4; } + + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * Don't enable denormals for 32-bit floats, because: + * - denormals disable output modifiers + * - denormals break v_mad_f32 + * - GFX6 & GFX7 would be very slow + */ + conf->float_mode &= ~V_00B028_FP_ALL_DENORMS; + conf->float_mode |= V_00B028_FP_64_DENORMS; } diff --git a/lib/mesa/src/amd/vulkan/radv_shader.c b/lib/mesa/src/amd/vulkan/radv_shader.c index 83e2e675e..fb725284c 100644 --- a/lib/mesa/src/amd/vulkan/radv_shader.c +++ b/lib/mesa/src/amd/vulkan/radv_shader.c @@ -30,28 +30,41 @@ #include "radv_debug.h" #include "radv_private.h" #include "radv_shader.h" +#include "radv_shader_helper.h" +#include "radv_shader_args.h" #include "nir/nir.h" #include "nir/nir_builder.h" #include "spirv/nir_spirv.h" #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> +#include <llvm-c/Support.h> #include "sid.h" -#include "gfx9d.h" #include "ac_binary.h" #include "ac_llvm_util.h" #include "ac_nir_to_llvm.h" +#include "ac_rtld.h" #include "vk_format.h" #include "util/debug.h" #include "ac_exp_param.h" -static const struct nir_shader_compiler_options nir_options = { +#include "aco_interface.h" + +#include "util/string_buffer.h" + +static const struct nir_shader_compiler_options nir_options_llvm = { .vertex_id_zero_based = true, .lower_scmp = true, + .lower_flrp16 = true, .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, .lower_fsat = true, .lower_fdiv = true, + .lower_fmod = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, .lower_sub = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, @@ -64,9 +77,81 @@ static const struct nir_shader_compiler_options nir_options = { .lower_extract_byte = true, .lower_extract_word = true, .lower_ffma = true, - .max_unroll_iterations = 32 + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + /* nir_lower_int64() isn't actually called for the LLVM backend, but + * this helps the loop unrolling heuristics. 
*/ + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_minmax64 | + nir_lower_iabs64, }; +static const struct nir_shader_compiler_options nir_options_aco = { + .vertex_id_zero_based = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, + .lower_fdiv = true, + .lower_fmod = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_ffma = true, + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_logic64 | + nir_lower_minmax64 | + nir_lower_iabs64, +}; + +bool +radv_can_dump_shader(struct radv_device *device, + struct radv_shader_module *module, + bool is_gs_copy_shader) +{ + if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) + return false; + if (module) + return !module->nir || + (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS); + + return is_gs_copy_shader; +} + +bool +radv_can_dump_shader_stats(struct radv_device *device, + struct radv_shader_module *module) +{ + /* Only dump non-meta shader stats. */ + return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && + module && !module->nir; +} + VkResult radv_CreateShaderModule( VkDevice _device, const VkShaderModuleCreateInfo* pCreateInfo, @@ -83,7 +168,7 @@ VkResult radv_CreateShaderModule( sizeof(*module) + pCreateInfo->codeSize, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (module == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); module->nir = NULL; module->size = pCreateInfo->codeSize; @@ -110,56 +195,40 @@ void radv_DestroyShaderModule( vk_free2(&device->alloc, pAllocator, module); } -bool -radv_lower_indirect_derefs(struct nir_shader *nir, - struct radv_physical_device *device) -{ - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 5.0 doesn't have working VGPR indexing - * on GFX9. - */ - bool llvm_has_working_vgpr_indexing = - device->rad_info.chip_class <= VI; - - /* TODO: Indirect indexing of GS inputs is unimplemented. - * - * TCS and TES load inputs directly from LDS or offchip memory, so - * indirect indexing is trivial. - */ - nir_variable_mode indirect_mask = 0; - if (nir->info.stage == MESA_SHADER_GEOMETRY || - (nir->info.stage != MESA_SHADER_TESS_CTRL && - nir->info.stage != MESA_SHADER_TESS_EVAL && - !llvm_has_working_vgpr_indexing)) { - indirect_mask |= nir_var_shader_in; - } - if (!llvm_has_working_vgpr_indexing && - nir->info.stage != MESA_SHADER_TESS_CTRL) - indirect_mask |= nir_var_shader_out; - - /* TODO: We shouldn't need to do this, however LLVM isn't currently - * smart enough to handle indirects without causing excess spilling - * causing the gpu to hang. 
- * - * See the following thread for more details of the problem: - * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html - */ - indirect_mask |= nir_var_local; - - return nir_lower_indirect_derefs(nir, indirect_mask); -} - void -radv_optimize_nir(struct nir_shader *shader) +radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, + bool allow_copies) { bool progress; + unsigned lower_flrp = + (shader->options->lower_flrp16 ? 16 : 0) | + (shader->options->lower_flrp32 ? 32 : 0) | + (shader->options->lower_flrp64 ? 64 : 0); do { progress = false; + NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp); + NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp); + NIR_PASS_V(shader, nir_lower_vars_to_ssa); - NIR_PASS_V(shader, nir_lower_64bit_pack); - NIR_PASS_V(shader, nir_lower_alu_to_scalar); + NIR_PASS_V(shader, nir_lower_pack); + + if (allow_copies) { + /* Only run this pass in the first call to + * radv_optimize_nir. Later calls assume that we've + * lowered away any copy_deref instructions and we + * don't want to introduce any more. + */ + NIR_PASS(progress, shader, nir_opt_find_array_copies); + } + + NIR_PASS(progress, shader, nir_opt_copy_prop_vars); + NIR_PASS(progress, shader, nir_opt_dead_write_vars); + NIR_PASS(progress, shader, nir_remove_dead_variables, + nir_var_function_temp | nir_var_shader_in | nir_var_shader_out); + + NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(shader, nir_lower_phis_to_scalar); NIR_PASS(progress, shader, nir_copy_prop); @@ -171,18 +240,53 @@ radv_optimize_nir(struct nir_shader *shader) NIR_PASS(progress, shader, nir_opt_remove_phis); NIR_PASS(progress, shader, nir_opt_dce); } - NIR_PASS(progress, shader, nir_opt_if); + NIR_PASS(progress, shader, nir_opt_if, true); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8); - NIR_PASS(progress, shader, nir_opt_algebraic); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); NIR_PASS(progress, shader, nir_opt_constant_folding); + NIR_PASS(progress, shader, nir_opt_algebraic); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, + shader, + nir_lower_flrp, + lower_flrp, + false /* always_precise */, + shader->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, shader, + nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + lower_flrp = 0; + } + NIR_PASS(progress, shader, nir_opt_undef); - NIR_PASS(progress, shader, nir_opt_conditional_discard); if (shader->options->max_unroll_iterations) { NIR_PASS(progress, shader, nir_opt_loop_unroll, 0); } - } while (progress); + } while (progress && !optimize_conservatively); + + NIR_PASS(progress, shader, nir_opt_conditional_discard); + NIR_PASS(progress, shader, nir_opt_shrink_load); + NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo); +} + +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) ? 
4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; } nir_shader * @@ -190,31 +294,30 @@ radv_shader_compile_to_nir(struct radv_device *device, struct radv_shader_module *module, const char *entrypoint_name, gl_shader_stage stage, - const VkSpecializationInfo *spec_info) + const VkSpecializationInfo *spec_info, + const VkPipelineCreateFlags flags, + const struct radv_pipeline_layout *layout, + bool use_aco, + unsigned subgroup_size, unsigned ballot_bit_size) { - if (strcmp(entrypoint_name, "main") != 0) { - radv_finishme("Multiple shaders per module not really supported"); - } - nir_shader *nir; - nir_function *entry_point; + const nir_shader_compiler_options *nir_options = use_aco ? &nir_options_aco : + &nir_options_llvm; if (module->nir) { /* Some things such as our meta clear/blit code will give us a NIR * shader directly. In that case, we just ignore the SPIR-V entirely * and just use the NIR shader */ nir = module->nir; - nir->options = &nir_options; - nir_validate_shader(nir); + nir->options = nir_options; + nir_validate_shader(nir, "in internal shader"); assert(exec_list_length(&nir->functions) == 1); - struct exec_node *node = exec_list_get_head(&nir->functions); - entry_point = exec_node_data(nir_function, node, node); } else { uint32_t *spirv = (uint32_t *) module->data; assert(module->size % 4 == 0); if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV) - radv_print_spirv(spirv, module->size, stderr); + radv_print_spirv(module->data, module->size, stderr); uint32_t num_spec_entries = 0; struct nir_spirv_specialization *spec_entries = NULL; @@ -227,28 +330,85 @@ radv_shader_compile_to_nir(struct radv_device *device, assert(data + entry.size <= spec_info->pData + spec_info->dataSize); spec_entries[i].id = spec_info->pMapEntries[i].constantID; - if (spec_info->dataSize == 8) + switch (entry.size) { + case 8: spec_entries[i].data64 = *(const uint64_t *)data; - else + break; + case 4: spec_entries[i].data32 = *(const uint32_t *)data; + break; + case 2: + spec_entries[i].data32 = *(const uint16_t *)data; + break; + case 1: + spec_entries[i].data32 = *(const uint8_t *)data; + break; + default: + assert(!"Invalid spec constant size"); + break; + } } } - const struct nir_spirv_supported_extensions supported_ext = { - .draw_parameters = true, - .float64 = true, - .image_read_without_format = true, - .image_write_without_format = true, - .tessellation = true, - .int64 = true, - .multiview = true, - .variable_pointers = true, + const struct spirv_to_nir_options spirv_options = { + .lower_ubo_ssbo_access_to_offsets = true, + .caps = { + .amd_fragment_mask = true, + .amd_gcn_shader = true, + .amd_image_read_write_lod = true, + .amd_shader_ballot = device->physical_device->use_shader_ballot, + .amd_shader_explicit_vertex_parameter = true, + .amd_trinary_minmax = true, + .demote_to_helper_invocation = device->physical_device->use_aco, + .derivative_group = true, + .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, + .device_group = true, + .draw_parameters = true, + .float_controls = true, + .float16 = !device->physical_device->use_aco, + .float64 = true, + .geometry_streams = true, + .image_ms_array = true, + .image_read_without_format = true, + .image_write_without_format = true, + .int8 = !device->physical_device->use_aco, + .int16 = !device->physical_device->use_aco, + .int64 = true, + .int64_atomics = true, + .multiview = true, + 
.physical_storage_buffer_address = true, + .post_depth_coverage = true, + .runtime_descriptor_array = true, + .shader_clock = true, + .shader_viewport_index_layer = true, + .stencil_export = true, + .storage_8bit = !device->physical_device->use_aco, + .storage_16bit = !device->physical_device->use_aco, + .storage_image_ms = true, + .subgroup_arithmetic = true, + .subgroup_ballot = true, + .subgroup_basic = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + .tessellation = true, + .transform_feedback = true, + .variable_pointers = true, + }, + .ubo_addr_format = nir_address_format_32bit_index_offset, + .ssbo_addr_format = nir_address_format_32bit_index_offset, + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .push_const_addr_format = nir_address_format_logical, + .shared_addr_format = nir_address_format_32bit_offset, + .frag_coord_is_sysval = true, }; - entry_point = spirv_to_nir(spirv, module->size / 4, - spec_entries, num_spec_entries, - stage, entrypoint_name, &supported_ext, &nir_options); - nir = entry_point->shader; + nir = spirv_to_nir(spirv, module->size / 4, + spec_entries, num_spec_entries, + stage, entrypoint_name, + &spirv_options, nir_options); assert(nir->info.stage == stage); - nir_validate_shader(nir); + nir_validate_shader(nir, "after spirv_to_nir"); free(spec_entries); @@ -256,50 +416,214 @@ radv_shader_compile_to_nir(struct radv_device *device, * inline functions. That way they get properly initialized at the top * of the function and not at the top of its caller. */ - NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local); + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_function_temp); NIR_PASS_V(nir, nir_lower_returns); NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); /* Pick off the single entrypoint that we want */ foreach_list_typed_safe(nir_function, func, node, &nir->functions) { - if (func != entry_point) + if (func->is_entrypoint) + func->name = ralloc_strdup(func, "main"); + else exec_node_remove(&func->node); } assert(exec_list_length(&nir->functions) == 1); - entry_point->name = ralloc_strdup(entry_point, "main"); - NIR_PASS_V(nir, nir_remove_dead_variables, - nir_var_shader_in | nir_var_shader_out | nir_var_system_value); + /* Make sure we lower constant initializers on output variables so that + * nir_remove_dead_variables below sees the corresponding stores + */ + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_shader_out); /* Now that we've deleted all but the main function, we can go ahead and * lower the rest of the constant initializers. */ NIR_PASS_V(nir, nir_lower_constant_initializers, ~0); + + /* Split member structs. We do this before lower_io_to_temporaries so that + * it doesn't lower system values to temporaries by accident. 
+ */ + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + if (nir->info.stage == MESA_SHADER_FRAGMENT && use_aco) + NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); + if (nir->info.stage == MESA_SHADER_FRAGMENT) + NIR_PASS_V(nir, nir_lower_input_attachments, true); + + NIR_PASS_V(nir, nir_remove_dead_variables, + nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared); + + NIR_PASS_V(nir, nir_propagate_invariant); + NIR_PASS_V(nir, nir_lower_system_values); NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); + NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout); } /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; - nir_shader_gather_info(nir, entry_point->impl); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + if (nir->info.stage == MESA_SHADER_GEOMETRY && use_aco) + nir_lower_gs_intrinsics(nir, true); static const nir_lower_tex_options tex_options = { .lower_txp = ~0, + .lower_tg4_offsets = true, }; nir_lower_tex(nir, &tex_options); nir_lower_vars_to_ssa(nir); - nir_lower_var_copies(nir); + + if (nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_GEOMETRY || + nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, true); + } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + } + + nir_split_var_copies(nir); + nir_lower_global_vars_to_local(nir); - nir_remove_dead_variables(nir, nir_var_local); - radv_lower_indirect_derefs(nir, device->physical_device); - radv_optimize_nir(nir); + nir_remove_dead_variables(nir, nir_var_function_temp); + bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7; + nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) { + .subgroup_size = subgroup_size, + .ballot_bit_size = ballot_bit_size, + .lower_to_scalar = 1, + .lower_subgroup_masks = 1, + .lower_shuffle = 1, + .lower_shuffle_to_32bit = 1, + .lower_vote_eq_to_ballot = 1, + .lower_quad_broadcast_dynamic = 1, + .lower_quad_broadcast_dynamic_to_const = gfx7minus, + }); + + nir_lower_load_const_to_scalar(nir); + + if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) + radv_optimize_nir(nir, false, true); + + /* We call nir_lower_var_copies() after the first radv_optimize_nir() + * to remove any copies introduced by nir_opt_find_array_copies(). + */ + nir_lower_var_copies(nir); + + /* Lower deref operations for compute shared memory. */ + if (nir->info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_var_info); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + } + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + */ + NIR_PASS_V(nir, nir_opt_large_constants, + glsl_get_natural_size_align_bytes, 16); + + /* Indirect lowering must be called after the radv_optimize_nir() loop + * has been called at least once. Otherwise indirect lowering can + * bloat the instruction count of the loop and cause it to be + * considered too large for unrolling. 
+ */ + ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class); + radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT, false); return nir; } +static int +type_size_vec4(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +static nir_variable * +find_layer_in_var(nir_shader *nir) +{ + nir_foreach_variable(var, &nir->inputs) { + if (var->data.location == VARYING_SLOT_LAYER) { + return var; + } + } + + nir_variable *var = + nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id"); + var->data.location = VARYING_SLOT_LAYER; + var->data.interpolation = INTERP_MODE_FLAT; + return var; +} + +/* We use layered rendering to implement multiview, which means we need to map + * view_index to gl_Layer. The attachment lowering also uses needs to know the + * layer so that it can sample from the correct layer. The code generates a + * load from the layer_id sysval, but since we don't have a way to get at this + * information from the fragment shader, we also need to lower this to the + * gl_Layer varying. This pass lowers both to a varying load from the LAYER + * slot, before lowering io, so that nir_assign_var_locations() will give the + * LAYER varying the correct driver_location. + */ + +static bool +lower_view_index(nir_shader *nir) +{ + bool progress = false; + nir_function_impl *entry = nir_shader_get_entrypoint(nir); + nir_builder b; + nir_builder_init(&b, entry); + + nir_variable *layer = NULL; + nir_foreach_block(block, entry) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + if (load->intrinsic != nir_intrinsic_load_view_index && + load->intrinsic != nir_intrinsic_load_layer_id) + continue; + + if (!layer) + layer = find_layer_in_var(nir); + + b.cursor = nir_before_instr(instr); + nir_ssa_def *def = nir_load_var(&b, layer); + nir_ssa_def_rewrite_uses(&load->dest.ssa, + nir_src_for_ssa(def)); + + nir_instr_remove(instr); + progress = true; + } + } + + return progress; +} + +void +radv_lower_fs_io(nir_shader *nir) +{ + NIR_PASS_V(nir, lower_view_index); + nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs, + MESA_SHADER_FRAGMENT); + + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in); +} + + void * radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader) @@ -331,7 +655,11 @@ radv_alloc_shader_memory(struct radv_device *device, slab->size = 256 * 1024; slab->bo = device->ws->buffer_create(device->ws, slab->size, 256, - RADEON_DOMAIN_VRAM, 0); + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | + (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? + 0 : RADEON_FLAG_READ_ONLY), + RADV_BO_PRIORITY_SHADER); slab->ptr = (char*)device->ws->buffer_map(slab->bo); list_inithead(&slab->shaders); @@ -355,178 +683,550 @@ radv_destroy_shader_slabs(struct radv_device *device) mtx_destroy(&device->shader_slab_mutex); } -static void -radv_fill_shader_variant(struct radv_device *device, - struct radv_shader_variant *variant, - struct ac_shader_binary *binary, - gl_shader_stage stage) +/* For the UMR disassembler. 
*/ +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 + +static unsigned +radv_get_shader_binary_size(size_t code_size) { - bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; + return code_size + DEBUGGER_NUM_MARKERS * 4; +} + +static void radv_postprocess_config(const struct radv_physical_device *pdevice, + const struct ac_shader_config *config_in, + const struct radv_shader_info *info, + gl_shader_stage stage, + struct ac_shader_config *config_out) +{ + bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; unsigned vgpr_comp_cnt = 0; + unsigned num_input_vgprs = info->num_input_vgprs; - if (scratch_enabled && !device->llvm_supports_spill) - radv_finishme("shader scratch support only available with LLVM 4.0"); + if (stage == MESA_SHADER_FRAGMENT) { + num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL); + } - variant->code_size = binary->code_size; - variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | - S_00B12C_SCRATCH_EN(scratch_enabled); + unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs); + /* +3 for scratch wave offset and VCC */ + unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3); + unsigned num_shared_vgprs = config_in->num_shared_vgprs; + /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */ + assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) + || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0)); + unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8; + + *config_out = *config_in; + config_out->num_vgprs = num_vgprs; + config_out->num_sgprs = num_sgprs; + config_out->num_shared_vgprs = num_shared_vgprs; + + config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) | + S_00B12C_SCRATCH_EN(scratch_enabled); + + if (!pdevice->use_ngg_streamout) { + config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | + S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | + S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | + S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) | + S_00B12C_SO_EN(!!info->so.num_outputs); + } - variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) | - S_00B848_DX10_CLAMP(1) | - S_00B848_FLOAT_MODE(variant->config.float_mode); + config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / + (info->wave_size == 32 ? 8 : 4)) | + S_00B848_DX10_CLAMP(1) | + S_00B848_FLOAT_MODE(config_out->float_mode); + + if (pdevice->rad_info.chip_class >= GFX10) { + config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5); + } else { + config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8); + config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5); + } switch (stage) { case MESA_SHADER_TESS_EVAL: - vgpr_comp_cnt = 3; - variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); + if (info->is_ngg) { + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1); + } else if (info->tes.as_es) { + assert(pdevice->rad_info.chip_class <= GFX8); + vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; + + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } else { + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + vgpr_comp_cnt = enable_prim_id ? 
3 : 2; + + config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_TESS_CTRL: - if (device->physical_device->rad_info.chip_class >= GFX9) - vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; - else - variant->rsrc2 |= S_00B12C_OC_LDS_EN(1); + if (pdevice->rad_info.chip_class >= GFX9) { + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + if (pdevice->rad_info.chip_class >= GFX10) { + vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1; + } else { + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; + } + } else { + config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); + } + config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_VERTEX: - case MESA_SHADER_GEOMETRY: - vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; + if (info->is_ngg) { + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + } else if (info->vs.as_ls) { + assert(pdevice->rad_info.chip_class <= GFX8); + /* We need at least 2 components for LS. + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; + } else if (info->vs.as_es) { + assert(pdevice->rad_info.chip_class <= GFX8); + /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ + vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0; + } else { + /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID) + * If PrimID is disabled. InstanceID / StepRate1 is loaded instead. + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. + */ + if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { + vgpr_comp_cnt = 3; + } else if (info->vs.export_prim_id) { + vgpr_comp_cnt = 2; + } else if (info->vs.needs_instance_id) { + vgpr_comp_cnt = 1; + } else { + vgpr_comp_cnt = 0; + } + + config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + } break; case MESA_SHADER_FRAGMENT: + config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + break; + case MESA_SHADER_GEOMETRY: + config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_COMPUTE: - variant->rsrc2 |= - S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_LDS_SIZE(variant->config.lds_size); + config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | + S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= + S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | + S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) | + S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) | + S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 
2 : + info->cs.uses_thread_id[1] ? 1 : 0) | + S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) | + S_00B84C_LDS_SIZE(config_in->lds_size); + config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + break; default: unreachable("unsupported shader type"); break; } - if (device->physical_device->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_GEOMETRY) { - /* TODO: Figure out how many we actually need. */ - variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(3); - variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(3) | - S_00B22C_OC_LDS_EN(1); - } else if (device->physical_device->rad_info.chip_class >= GFX9 && - stage == MESA_SHADER_TESS_CTRL) - variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); - else - variant->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); + if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg && + (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_GEOMETRY)) { + unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; + gl_shader_stage es_stage = stage; + if (stage == MESA_SHADER_GEOMETRY) + es_stage = info->gs.es_type; + + /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ + if (es_stage == MESA_SHADER_VERTEX) { + es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0; + } else if (es_stage == MESA_SHADER_TESS_EVAL) { + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + es_vgpr_comp_cnt = enable_prim_id ? 3 : 2; + } else + unreachable("Unexpected ES shader stage"); + + bool tes_triangles = stage == MESA_SHADER_TESS_EVAL && + info->tes.primitive_mode >= 4; /* GL_TRIANGLES */ + if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) { + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + } else if (info->uses_prim_id) { + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + } else if (info->gs.vertices_in >= 3 || tes_triangles) { + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + } else { + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + } + + config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | + S_00B228_WGP_MODE(1); + config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_LDS_SIZE(config_in->lds_size) | + S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL); + } else if (pdevice->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_GEOMETRY) { + unsigned es_type = info->gs.es_type; + unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt; + + if (es_type == MESA_SHADER_VERTEX) { + /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ + if (info->vs.needs_instance_id) { + es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1; + } else { + es_vgpr_comp_cnt = 0; + } + } else if (es_type == MESA_SHADER_TESS_EVAL) { + es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; + } else { + unreachable("invalid shader ES type"); + } - void *ptr = radv_alloc_shader_memory(device, variant); - memcpy(ptr, binary->code, binary->code_size); + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (info->uses_invocation_id) { + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + } else if (info->uses_prim_id) { + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ + } else if (info->gs.vertices_in >= 3) { + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + } else { + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + } + + config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); + config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL); + } else if (pdevice->rad_info.chip_class >= GFX9 && + stage == MESA_SHADER_TESS_CTRL) { + config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt); + } else { + config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt); + } } -static struct radv_shader_variant * -shader_variant_create(struct radv_device *device, - struct radv_shader_module *module, - struct nir_shader * const *shaders, - int shader_count, - gl_shader_stage stage, - struct ac_nir_compiler_options *options, - bool gs_copy_shader, - void **code_out, - unsigned *code_size_out) +struct radv_shader_variant * +radv_shader_variant_create(struct radv_device *device, + const struct radv_shader_binary *binary, + bool keep_shader_info) { - enum radeon_family chip_family = device->physical_device->rad_info.family; - bool dump_shaders = device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS; - enum ac_target_machine_options tm_options = 0; - struct radv_shader_variant *variant; - struct ac_shader_binary binary; - LLVMTargetMachineRef tm; - - variant = calloc(1, sizeof(struct radv_shader_variant)); + struct ac_shader_config config = {0}; + struct ac_rtld_binary rtld_binary = {0}; + struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant)); if (!variant) return NULL; + variant->ref_count = 1; + + if (binary->type == RADV_BINARY_TYPE_RTLD) { + struct ac_rtld_symbol lds_symbols[2]; + unsigned num_lds_symbols = 0; + const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data; + size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size; + + if (device->physical_device->rad_info.chip_class >= GFX9 && + (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) && + !binary->is_gs_copy_shader) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. + */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = binary->info.ngg_info.esgs_ring_size; + sym->align = 64 * 1024; + } + + if (binary->info.is_ngg && + binary->stage == MESA_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = binary->info.ngg_info.ngg_emit_size * 4; + sym->align = 4; + } + + struct ac_rtld_open_info open_info = { + .info = &device->physical_device->rad_info, + .shader_type = binary->stage, + .wave_size = binary->info.wave_size, + .num_parts = 1, + .elf_ptrs = &elf_data, + .elf_sizes = &elf_size, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols, + }; + + if (!ac_rtld_open(&rtld_binary, open_info)) { + free(variant); + return NULL; + } + + if (!ac_rtld_read_config(&rtld_binary, &config)) { + ac_rtld_close(&rtld_binary); + free(variant); + return NULL; + } + + if (rtld_binary.lds_size > 0) { + unsigned alloc_granularity = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; + config.lds_size = align(rtld_binary.lds_size, alloc_granularity) / alloc_granularity; + } + + variant->code_size = rtld_binary.rx_size; + variant->exec_size = rtld_binary.exec_size; + } else { + assert(binary->type == RADV_BINARY_TYPE_LEGACY); + config = ((struct radv_shader_binary_legacy *)binary)->config; + variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); + variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size; + } + + variant->info = binary->info; + radv_postprocess_config(device->physical_device, &config, &binary->info, + binary->stage, &variant->config); + + if (radv_device_use_secure_compile(device->instance)) { + if (binary->type == RADV_BINARY_TYPE_RTLD) + ac_rtld_close(&rtld_binary); + + return variant; + } + + void *dest_ptr = radv_alloc_shader_memory(device, variant); + + if (binary->type == RADV_BINARY_TYPE_RTLD) { + struct radv_shader_binary_rtld* bin = (struct radv_shader_binary_rtld *)binary; + struct ac_rtld_upload_info info = { + .binary = &rtld_binary, + .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset, + .rx_ptr = dest_ptr, + }; + + if (!ac_rtld_upload(&info)) { + radv_shader_variant_destroy(device, variant); + ac_rtld_close(&rtld_binary); + return NULL; + } + + if (keep_shader_info || + (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) { + const char *disasm_data; + size_t disasm_size; + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data, &disasm_size)) { + radv_shader_variant_destroy(device, variant); + ac_rtld_close(&rtld_binary); + return NULL; + } + + variant->ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL; + variant->disasm_string = malloc(disasm_size + 1); + memcpy(variant->disasm_string, disasm_data, disasm_size); + variant->disasm_string[disasm_size] = 0; + } + + ac_rtld_close(&rtld_binary); + } else { + struct radv_shader_binary_legacy* bin = (struct radv_shader_binary_legacy *)binary; + memcpy(dest_ptr, bin->data, bin->code_size); + + /* Add end-of-code markers for the UMR disassembler. */ + uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4; + for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) + ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; + + variant->ir_string = bin->ir_size ? strdup((const char*)(bin->data + bin->code_size)) : NULL; + variant->disasm_string = bin->disasm_size ? 
strdup((const char*)(bin->data + bin->code_size + bin->ir_size)) : NULL; + } + return variant; +} + +static char * +radv_dump_nir_shaders(struct nir_shader * const *shaders, + int shader_count) +{ + char *data = NULL; + char *ret = NULL; + size_t size = 0; + FILE *f = open_memstream(&data, &size); + if (f) { + for (int i = 0; i < shader_count; ++i) + nir_print_shader(shaders[i], f); + fclose(f); + } + + ret = malloc(size + 1); + if (ret) { + memcpy(ret, data, size); + ret[size] = 0; + } + free(data); + return ret; +} + +static struct radv_shader_variant * +shader_variant_compile(struct radv_device *device, + struct radv_shader_module *module, + struct nir_shader * const *shaders, + int shader_count, + gl_shader_stage stage, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options, + bool gs_copy_shader, + bool keep_shader_info, + bool use_aco, + struct radv_shader_binary **binary_out) +{ + enum radeon_family chip_family = device->physical_device->rad_info.family; + struct radv_shader_binary *binary = NULL; + options->family = chip_family; options->chip_class = device->physical_device->rad_info.chip_class; + options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader); + options->dump_preoptir = options->dump_shader && + device->instance->debug_flags & RADV_DEBUG_PREOPTIR; + options->record_ir = keep_shader_info; + options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; + options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; + options->address32_hi = device->physical_device->rad_info.address32_hi; + options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug; + options->use_ngg_streamout = device->physical_device->use_ngg_streamout; + + struct radv_shader_args args = {}; + args.options = options; + args.shader_info = info; + args.is_gs_copy_shader = gs_copy_shader; + radv_declare_shader_args(&args, + gs_copy_shader ? MESA_SHADER_VERTEX + : shaders[shader_count - 1]->info.stage, + shader_count >= 2, + shader_count >= 2 ? 
shaders[shader_count - 2]->info.stage + : MESA_SHADER_VERTEX); + + if (!use_aco || options->dump_shader || options->record_ir) + ac_init_llvm_once(); + + if (use_aco) { + aco_compile_shader(shader_count, shaders, &binary, &args); + binary->info = *info; + } else { + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; - if (options->supports_spill) tm_options |= AC_TM_SUPPORTS_SPILL; - if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) - tm_options |= AC_TM_SISCHED; - tm = ac_create_target_machine(chip_family, tm_options); - - if (gs_copy_shader) { - assert(shader_count == 1); - ac_create_gs_copy_shader(tm, *shaders, &binary, &variant->config, - &variant->info, options, dump_shaders); - } else { - ac_compile_nir_shader(tm, &binary, &variant->config, - &variant->info, shaders, shader_count, options, - dump_shaders); + if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) + tm_options |= AC_TM_SISCHED; + if (options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + radv_init_llvm_compiler(&ac_llvm, + thread_compiler, + chip_family, tm_options, + info->wave_size); + + if (gs_copy_shader) { + assert(shader_count == 1); + radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary, + &args); + } else { + radv_compile_nir_shader(&ac_llvm, &binary, &args, + shaders, shader_count); + } + + binary->info = *info; + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); } - LLVMDisposeTargetMachine(tm); + struct radv_shader_variant *variant = radv_shader_variant_create(device, binary, + keep_shader_info); + if (!variant) { + free(binary); + return NULL; + } + variant->aco_used = use_aco; - radv_fill_shader_variant(device, variant, &binary, stage); + if (options->dump_shader) { + fprintf(stderr, "disasm:\n%s\n", variant->disasm_string); + } - if (code_out) { - *code_out = binary.code; - *code_size_out = binary.code_size; - } else - free(binary.code); - free(binary.config); - free(binary.rodata); - free(binary.global_symbol_offsets); - free(binary.relocs); - variant->ref_count = 1; - if (device->trace_bo) { - variant->disasm_string = binary.disasm_string; + if (keep_shader_info) { + variant->nir_string = radv_dump_nir_shaders(shaders, shader_count); if (!gs_copy_shader && !module->nir) { - variant->nir = *shaders; - variant->spirv = (uint32_t *)module->data; + variant->spirv = malloc(module->size); + if (!variant->spirv) { + free(variant); + free(binary); + return NULL; + } + + memcpy(variant->spirv, module->data, module->size); variant->spirv_size = module->size; } - } else { - free(binary.disasm_string); } + if (binary_out) + *binary_out = binary; + else + free(binary); + return variant; } struct radv_shader_variant * -radv_shader_variant_create(struct radv_device *device, +radv_shader_variant_compile(struct radv_device *device, struct radv_shader_module *module, struct nir_shader *const *shaders, int shader_count, struct radv_pipeline_layout *layout, - const struct ac_shader_variant_key *key, - void **code_out, - unsigned *code_size_out) + const struct radv_shader_variant_key *key, + struct radv_shader_info *info, + bool keep_shader_info, + bool use_aco, + struct radv_shader_binary **binary_out) { - struct ac_nir_compiler_options options = {0}; + struct radv_nir_compiler_options options = {0}; options.layout = layout; if (key) options.key = *key; - 
options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH); - options.supports_spill = device->llvm_supports_spill; + options.explicit_scratch_args = use_aco; + options.robust_buffer_access = device->robust_buffer_access; - return shader_variant_create(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, - &options, false, code_out, code_size_out); + return shader_variant_compile(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, info, + &options, false, keep_shader_info, use_aco, binary_out); } struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader, - void **code_out, - unsigned *code_size_out, - bool multiview) + struct radv_shader_info *info, + struct radv_shader_binary **binary_out, + bool keep_shader_info, + bool multiview, bool use_aco) { - struct ac_nir_compiler_options options = {0}; + struct radv_nir_compiler_options options = {0}; + options.explicit_scratch_args = use_aco; options.key.has_multiview_view_index = multiview; - return shader_variant_create(device, NULL, &shader, 1, MESA_SHADER_VERTEX, - &options, true, code_out, code_size_out); + return shader_variant_compile(device, NULL, &shader, 1, MESA_SHADER_VERTEX, + info, &options, true, keep_shader_info, use_aco, binary_out); } void @@ -540,104 +1240,106 @@ radv_shader_variant_destroy(struct radv_device *device, list_del(&variant->slab_list); mtx_unlock(&device->shader_slab_mutex); - ralloc_free(variant->nir); + free(variant->spirv); + free(variant->nir_string); free(variant->disasm_string); + free(variant->ir_string); free(variant); } -uint32_t -radv_shader_stage_to_user_data_0(gl_shader_stage stage, enum chip_class chip_class, - bool has_gs, bool has_tess) +const char * +radv_get_shader_name(struct radv_shader_info *info, + gl_shader_stage stage) { switch (stage) { - case MESA_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; case MESA_SHADER_VERTEX: - if (chip_class >= GFX9) { - return has_tess ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : - has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0; - } - if (has_tess) - return R_00B530_SPI_SHADER_USER_DATA_LS_0; + if (info->vs.as_ls) + return "Vertex Shader as LS"; + else if (info->vs.as_es) + return "Vertex Shader as ES"; + else if (info->is_ngg) + return "Vertex Shader as ESGS"; else - return has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B130_SPI_SHADER_USER_DATA_VS_0; - case MESA_SHADER_GEOMETRY: - return chip_class >= GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B230_SPI_SHADER_USER_DATA_GS_0; - case MESA_SHADER_COMPUTE: - return R_00B900_COMPUTE_USER_DATA_0; + return "Vertex Shader as VS"; case MESA_SHADER_TESS_CTRL: - return chip_class >= GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : - R_00B430_SPI_SHADER_USER_DATA_HS_0; + return "Tessellation Control Shader"; case MESA_SHADER_TESS_EVAL: - if (chip_class >= GFX9) { - return has_gs ? 
R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0; - } - if (has_gs) - return R_00B330_SPI_SHADER_USER_DATA_ES_0; + if (info->tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else if (info->is_ngg) + return "Tessellation Evaluation Shader as ESGS"; else - return R_00B130_SPI_SHADER_USER_DATA_VS_0; + return "Tessellation Evaluation Shader as VS"; + case MESA_SHADER_GEOMETRY: + return "Geometry Shader"; + case MESA_SHADER_FRAGMENT: + return "Pixel Shader"; + case MESA_SHADER_COMPUTE: + return "Compute Shader"; default: - unreachable("unknown shader"); - } + return "Unknown shader"; + }; } -const char * -radv_get_shader_name(struct radv_shader_variant *var, gl_shader_stage stage) +unsigned +radv_get_max_workgroup_size(enum chip_class chip_class, + gl_shader_stage stage, + const unsigned *sizes) { switch (stage) { - case MESA_SHADER_VERTEX: return var->info.vs.as_ls ? "Vertex Shader as LS" : var->info.vs.as_es ? "Vertex Shader as ES" : "Vertex Shader as VS"; - case MESA_SHADER_GEOMETRY: return "Geometry Shader"; - case MESA_SHADER_FRAGMENT: return "Pixel Shader"; - case MESA_SHADER_COMPUTE: return "Compute Shader"; - case MESA_SHADER_TESS_CTRL: return "Tessellation Control Shader"; - case MESA_SHADER_TESS_EVAL: return var->info.tes.as_es ? "Tessellation Evaluation Shader as ES" : "Tessellation Evaluation Shader as VS"; + case MESA_SHADER_TESS_CTRL: + return chip_class >= GFX7 ? 128 : 64; + case MESA_SHADER_GEOMETRY: + return chip_class >= GFX9 ? 128 : 64; + case MESA_SHADER_COMPUTE: + break; default: - return "Unknown shader"; - }; + return 0; + } + + unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2]; + return max_workgroup_size; } -void -radv_shader_dump_stats(struct radv_device *device, - struct radv_shader_variant *variant, - gl_shader_stage stage, - FILE *file) +unsigned +radv_get_max_waves(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage) { - unsigned lds_increment = device->physical_device->rad_info.chip_class >= CIK ? 512 : 256; - struct ac_shader_config *conf; + enum chip_class chip_class = device->physical_device->rad_info.chip_class; + unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; + uint8_t wave_size = variant->info.wave_size; + struct ac_shader_config *conf = &variant->config; unsigned max_simd_waves; unsigned lds_per_wave = 0; - switch (device->physical_device->rad_info.family) { - /* These always have 8 waves: */ - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - max_simd_waves = 8; - break; - default: - max_simd_waves = 10; - } - - conf = &variant->config; + max_simd_waves = device->physical_device->rad_info.max_wave64_per_simd; if (stage == MESA_SHADER_FRAGMENT) { lds_per_wave = conf->lds_size * lds_increment + - align(variant->info.fs.num_interp * 48, + align(variant->info.ps.num_interp * 48, lds_increment); + } else if (stage == MESA_SHADER_COMPUTE) { + unsigned max_workgroup_size = + radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, wave_size); } if (conf->num_sgprs) { - if (device->physical_device->rad_info.chip_class >= VI) - max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); - else - max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); + unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 
16 : 8); + max_simd_waves = + MIN2(max_simd_waves, + device->physical_device->rad_info.num_physical_sgprs_per_simd / + sgprs); } - if (conf->num_vgprs) - max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); + if (conf->num_vgprs) { + unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4); + max_simd_waves = + MIN2(max_simd_waves, + RADV_NUM_PHYSICAL_VGPRS / vgprs); + } /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD * that PS can use. @@ -645,27 +1347,153 @@ radv_shader_dump_stats(struct radv_device *device, if (lds_per_wave) max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); - fprintf(file, "\n%s:\n", radv_get_shader_name(variant, stage)); + return max_simd_waves; +} + +static void +generate_shader_stats(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage, + struct _mesa_string_buffer *buf) +{ + struct ac_shader_config *conf = &variant->config; + unsigned max_simd_waves = radv_get_max_waves(device, variant, stage); if (stage == MESA_SHADER_FRAGMENT) { - fprintf(file, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); + _mesa_string_buffer_printf(buf, "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } + + _mesa_string_buffer_printf(buf, "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "PrivMem VGPRS: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, + conf->spilled_sgprs, conf->spilled_vgprs, + variant->info.private_mem_vgprs, variant->exec_size, + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); +} + +void +radv_shader_dump_stats(struct radv_device *device, + struct radv_shader_variant *variant, + gl_shader_stage stage, + FILE *file) +{ + struct _mesa_string_buffer *buf = _mesa_string_buffer_create(NULL, 256); + + generate_shader_stats(device, variant, stage, buf); + + fprintf(file, "\n%s:\n", radv_get_shader_name(&variant->info, stage)); + fprintf(file, "%s", buf->buf); + + _mesa_string_buffer_destroy(buf); +} + +VkResult +radv_GetShaderInfoAMD(VkDevice _device, + VkPipeline _pipeline, + VkShaderStageFlagBits shaderStage, + VkShaderInfoTypeAMD infoType, + size_t* pInfoSize, + void* pInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage); + struct radv_shader_variant *variant = pipeline->shaders[stage]; + struct _mesa_string_buffer *buf; + VkResult result = VK_SUCCESS; + + /* Spec doesn't indicate what to do if the stage is invalid, so just + * return no info for this. */ + if (!variant) + return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + + switch (infoType) { + case VK_SHADER_INFO_TYPE_STATISTICS_AMD: + if (!pInfo) { + *pInfoSize = sizeof(VkShaderStatisticsInfoAMD); + } else { + unsigned lds_multiplier = device->physical_device->rad_info.chip_class >= GFX7 ? 
512 : 256; + struct ac_shader_config *conf = &variant->config; + + VkShaderStatisticsInfoAMD statistics = {}; + statistics.shaderStageMask = shaderStage; + statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS; + statistics.numPhysicalSgprs = device->physical_device->rad_info.num_physical_sgprs_per_simd; + statistics.numAvailableSgprs = statistics.numPhysicalSgprs; + + if (stage == MESA_SHADER_COMPUTE) { + unsigned *local_size = variant->info.cs.block_size; + unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2]; + + statistics.numAvailableVgprs = statistics.numPhysicalVgprs / + ceil((double)workgroup_size / statistics.numPhysicalVgprs); + + statistics.computeWorkGroupSize[0] = local_size[0]; + statistics.computeWorkGroupSize[1] = local_size[1]; + statistics.computeWorkGroupSize[2] = local_size[2]; + } else { + statistics.numAvailableVgprs = statistics.numPhysicalVgprs; + } + + statistics.resourceUsage.numUsedVgprs = conf->num_vgprs; + statistics.resourceUsage.numUsedSgprs = conf->num_sgprs; + statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768; + statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier; + statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave; + + size_t size = *pInfoSize; + *pInfoSize = sizeof(statistics); + + memcpy(pInfo, &statistics, MIN2(size, *pInfoSize)); + + if (size < *pInfoSize) + result = VK_INCOMPLETE; + } + + break; + case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: + buf = _mesa_string_buffer_create(NULL, 1024); + + _mesa_string_buffer_printf(buf, "%s:\n", radv_get_shader_name(&variant->info, stage)); + _mesa_string_buffer_printf(buf, "%s\n\n", variant->ir_string); + _mesa_string_buffer_printf(buf, "%s\n\n", variant->disasm_string); + generate_shader_stats(device, variant, stage, buf); + + /* Need to include the null terminator. */ + size_t length = buf->length + 1; + + if (!pInfo) { + *pInfoSize = length; + } else { + size_t size = *pInfoSize; + *pInfoSize = length; + + memcpy(pInfo, buf->buf, MIN2(size, length)); + + if (size < length) + result = VK_INCOMPLETE; + } + + _mesa_string_buffer_destroy(buf); + break; + default: + /* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */ + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; } - fprintf(file, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - "********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, variant->code_size, - conf->lds_size, conf->scratch_bytes_per_wave, - max_simd_waves); + return result; } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c index 63c9c033a..dca604afe 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,272 +22,298 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "ac_nir_to_llvm.h" -#include "ac_rtld.h" -#include "si_pipe.h" #include "si_shader_internal.h" +#include "si_pipe.h" +#include "ac_rtld.h" +#include "ac_nir_to_llvm.h" #include "sid.h" + #include "tgsi/tgsi_from_mesa.h" #include "util/u_memory.h" struct si_llvm_diagnostics { - struct pipe_debug_callback *debug; - unsigned retval; + struct pipe_debug_callback *debug; + unsigned retval; }; static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) { - struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; - LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); - const char *severity_str = NULL; - - switch (severity) { - case LLVMDSError: - severity_str = "error"; - break; - case LLVMDSWarning: - severity_str = "warning"; - break; - case LLVMDSRemark: - case LLVMDSNote: - default: - return; - } - - char *description = LLVMGetDiagInfoDescription(di); - - pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str, - description); - - if (severity == LLVMDSError) { - diag->retval = 1; - fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description); - } - - LLVMDisposeMessage(description); + struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + case LLVMDSNote: + default: + return; + } + + char *description = LLVMGetDiagInfoDescription(di); + + pipe_debug_message(diag->debug, SHADER_INFO, + "LLVM diagnostic (%s): %s", severity_str, description); + + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); + } + + LLVMDisposeMessage(description); } -bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, - struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, const char *name, bool less_optimized) +bool si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + const char *name, + bool less_optimized) { - unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - - if (si_can_dump_shader(sscreen, shader_type)) { - fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - - if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { - fprintf(stderr, "%s LLVM IR:\n\n", name); - ac_dump_module(ac->module); - fprintf(stderr, "\n"); - } - } - - if (sscreen->record_llvm_ir) { - char *ir = LLVMPrintModuleToString(ac->module); - binary->llvm_ir_string = strdup(ir); - LLVMDisposeMessage(ir); - } - - if (!si_replace_shader(count, binary)) { - struct ac_compiler_passes *passes = compiler->passes; - - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) - passes = compiler->low_opt_passes; - - struct si_llvm_diagnostics diag = {debug}; - LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); - - if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer, - &binary->elf_size)) - diag.retval = 1; - - if (diag.retval != 0) { - 
pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); - return false; - } - } - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sscreen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = ac->wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size})) - return false; - - bool ok = ac_rtld_read_config(&rtld, conf); - ac_rtld_close(&rtld); - return ok; + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); + + if (si_can_dump_shader(sscreen, shader_type)) { + fprintf(stderr, "radeonsi: Compiling shader %d\n", count); + + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); + ac_dump_module(ac->module); + fprintf(stderr, "\n"); + } + } + + if (sscreen->record_llvm_ir) { + char *ir = LLVMPrintModuleToString(ac->module); + binary->llvm_ir_string = strdup(ir); + LLVMDisposeMessage(ir); + } + + if (!si_replace_shader(count, binary)) { + struct ac_compiler_passes *passes = compiler->passes; + + if (ac->wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + + struct si_llvm_diagnostics diag = {debug}; + LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); + + if (!ac_compile_module_to_elf(passes, ac->module, + (char **)&binary->elf_buffer, + &binary->elf_size)) + diag.retval = 1; + + if (diag.retval != 0) { + pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); + return false; + } + } + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ + .info = &sscreen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = ac->wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size })) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + ac_rtld_close(&rtld); + return ok; } -void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, unsigned wave_size) +void si_llvm_context_init(struct si_shader_context *ctx, + struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + unsigned wave_size) { - memset(ctx, 0, sizeof(*ctx)); - ctx->screen = sscreen; - ctx->compiler = compiler; - - ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, - AC_FLOAT_MODE_DEFAULT_OPENGL, wave_size, 64); + memset(ctx, 0, sizeof(*ctx)); + ctx->screen = sscreen; + ctx->compiler = compiler; + + ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, + sscreen->info.family, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + wave_size, 64); } -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, - unsigned num_return_elems, unsigned max_workgroup_size) +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, + LLVMTypeRef *return_types, unsigned num_return_elems, + unsigned max_workgroup_size) { - LLVMTypeRef ret_type; - enum ac_llvm_calling_convention call_conv; - enum pipe_shader_type real_shader_type; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true); - else - ret_type = ctx->ac.voidt; - - real_shader_type = ctx->type; - - /* LS is merged into HS (TCS), and ES is merged into GS. 
*/ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) - real_shader_type = PIPE_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) - real_shader_type = PIPE_SHADER_GEOMETRY; - } - - switch (real_shader_type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - call_conv = AC_LLVM_AMDGPU_VS; - break; - case PIPE_SHADER_TESS_CTRL: - call_conv = AC_LLVM_AMDGPU_HS; - break; - case PIPE_SHADER_GEOMETRY: - call_conv = AC_LLVM_AMDGPU_GS; - break; - case PIPE_SHADER_FRAGMENT: - call_conv = AC_LLVM_AMDGPU_PS; - break; - case PIPE_SHADER_COMPUTE: - call_conv = AC_LLVM_AMDGPU_CS; - break; - default: - unreachable("Unhandle shader type"); - } - - /* Setup the function */ - ctx->return_type = ret_type; - ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module); - ctx->return_value = LLVMGetUndef(ctx->return_type); - - if (ctx->screen->info.address32_hi) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits", - ctx->screen->info.address32_hi); - } - - LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); - - ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + LLVMTypeRef ret_type; + enum ac_llvm_calling_convention call_conv; + enum pipe_shader_type real_shader_type; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx->ac.context, + return_types, + num_return_elems, true); + else + ret_type = ctx->ac.voidt; + + real_shader_type = ctx->type; + + /* LS is merged into HS (TCS), and ES is merged into GS. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (ctx->shader->key.as_ls) + real_shader_type = PIPE_SHADER_TESS_CTRL; + else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + real_shader_type = PIPE_SHADER_GEOMETRY; + } + + switch (real_shader_type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + call_conv = AC_LLVM_AMDGPU_VS; + break; + case PIPE_SHADER_TESS_CTRL: + call_conv = AC_LLVM_AMDGPU_HS; + break; + case PIPE_SHADER_GEOMETRY: + call_conv = AC_LLVM_AMDGPU_GS; + break; + case PIPE_SHADER_FRAGMENT: + call_conv = AC_LLVM_AMDGPU_PS; + break; + case PIPE_SHADER_COMPUTE: + call_conv = AC_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + /* Setup the function */ + ctx->return_type = ret_type; + ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, + ret_type, ctx->ac.module); + ctx->return_value = LLVMGetUndef(ctx->return_type); + + if (ctx->screen->info.address32_hi) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, + "amdgpu-32bit-address-high-bits", + ctx->screen->info.address32_hi); + } + + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, + "no-signed-zeros-fp-math", + "true"); + + ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); } void si_llvm_optimize_module(struct si_shader_context *ctx) { - /* Dump LLVM IR before any optimization passes */ - if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type)) - LLVMDumpModule(ctx->ac.module); - - /* Run the pass */ - LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); - LLVMDisposeBuilder(ctx->ac.builder); + /* Dump LLVM IR before any optimization passes */ + if (ctx->screen->debug_flags & DBG(PREOPT_IR) && + si_can_dump_shader(ctx->screen, ctx->type)) + LLVMDumpModule(ctx->ac.module); + + /* Run the pass */ + LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); + LLVMDisposeBuilder(ctx->ac.builder); } void si_llvm_dispose(struct 
si_shader_context *ctx) { - LLVMDisposeModule(ctx->ac.module); - LLVMContextDispose(ctx->ac.context); - ac_llvm_context_dispose(&ctx->ac); + LLVMDisposeModule(ctx->ac.module); + LLVMContextDispose(ctx->ac.context); + ac_llvm_context_dispose(&ctx->ac); } /** * Load a dword from a constant buffer. */ -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, - LLVMValueRef offset) +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, + LLVMValueRef resource, LLVMValueRef offset) { - return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true); + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, + 0, 0, true, true); } void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) { - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->ac.builder); - else - LLVMBuildRet(ctx->ac.builder, ret); + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(ctx->ac.builder); + else + LLVMBuildRet(ctx->ac.builder, ret); } LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, ""); + return LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_get_arg(&ctx->ac, param), + return_index, ""); } LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef p = ac_get_arg(&ctx->ac, param); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = ac_get_arg(&ctx->ac, param); - return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, ""); + return LLVMBuildInsertValue(builder, ret, + ac_to_float(&ctx->ac, p), + return_index, ""); } LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); } LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) { - LLVMValueRef ptr[2], list; - bool merged_shader = si_is_merged_shader(ctx->shader); + LLVMValueRef ptr[2], list; + bool merged_shader = si_is_merged_shader(ctx->shader); - ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = - LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - return list; + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 
8 : 0) + SI_SGPR_RW_BUFFERS); + list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + return list; } -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, - LLVMValueRef val1, LLVMValueRef val2) +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef val1, + LLVMValueRef val2) { - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, val1), - ac_to_integer(&ctx->ac, val2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, val1), + ac_to_integer(&ctx->ac, val2), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); } void si_llvm_emit_barrier(struct si_shader_context *ctx) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - - ac_build_s_barrier(&ctx->ac); + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ctx->screen->info.chip_class == GFX6 && + ctx->type == PIPE_SHADER_TESS_CTRL) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + + ac_build_s_barrier(&ctx->ac); } /* Ensure that the esgs ring is declared. @@ -297,169 +323,187 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx) */ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) { - if (ctx->esgs_ring) - return; + if (ctx->esgs_ring) + return; - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "esgs_ring", AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset) +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset) { - LLVMValueRef args[] = { - ac_get_arg(&ctx->ac, param), - LLVMConstInt(ctx->ac.i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, - AC_FUNC_ATTR_CONVERGENT); + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, + "llvm.amdgcn.init.exec.from.input", + ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); } /** * Get the value of a shader input parameter and extract a bitfield. 
*/ -static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value, - unsigned rshift, unsigned bitwidth) +static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, + LLVMValueRef value, unsigned rshift, + unsigned bitwidth) { - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = ac_to_integer(&ctx->ac, value); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = ac_to_integer(&ctx->ac, value); - if (rshift) - value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), ""); + if (rshift) + value = LLVMBuildLShr(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, rshift, 0), ""); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), ""); - } + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, mask, 0), ""); + } - return value; + return value; } -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, - unsigned bitwidth) +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, + struct ac_arg param, unsigned rshift, + unsigned bitwidth) { - LLVMValueRef value = ac_get_arg(&ctx->ac, param); + LLVMValueRef value = ac_get_arg(&ctx->ac, param); - return unpack_llvm_param(ctx, value, rshift, bitwidth); + return unpack_llvm_param(ctx, value, rshift, bitwidth); } -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, + unsigned swizzle) { - if (swizzle > 0) - return ctx->ac.i32_0; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - return ac_get_arg(&ctx->ac, ctx->vs_prim_id); - case PIPE_SHADER_TESS_CTRL: - return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); - case PIPE_SHADER_GEOMETRY: - return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); - default: - assert(0); - return ctx->ac.i32_0; - } + if (swizzle > 0) + return ctx->ac.i32_0; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + return ac_get_arg(&ctx->ac, ctx->vs_prim_id); + case PIPE_SHADER_TESS_CTRL: + return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); + case PIPE_SHADER_GEOMETRY: + return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + default: + assert(0); + return ctx->ac.i32_0; + } } LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef values[3]; - LLVMValueRef result; - unsigned i; - unsigned *properties = ctx->shader->selector->info.properties; + LLVMValueRef values[3]; + LLVMValueRef result; + unsigned i; + unsigned *properties = ctx->shader->selector->info.properties; - if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { - unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]}; + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = { + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] + }; - for (i = 0; i < 3; ++i) - values[i] = 
LLVMConstInt(ctx->ac.i32, sizes[i], 0); + for (i = 0; i < 3; ++i) + values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0); - result = ac_build_gather_values(&ctx->ac, values, 3); - } else { - result = ac_get_arg(&ctx->ac, ctx->block_size); - } + result = ac_build_gather_values(&ctx->ac, values, 3); + } else { + result = ac_get_arg(&ctx->ac, ctx->block_size); + } - return result; + return result; } void si_llvm_declare_compute_memory(struct si_shader_context *ctx) { - struct si_shader_selector *sel = ctx->shader->selector; - unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; + struct si_shader_selector *sel = ctx->shader->selector; + unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef var; + LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef var; - assert(!ctx->ac.lds); + assert(!ctx->ac.lds); - var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size), - "compute_lds", AC_ADDR_SPACE_LDS); - LLVMSetAlignment(var, 64 * 1024); + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, + LLVMArrayType(ctx->ac.i8, lds_size), + "compute_lds", + AC_ADDR_SPACE_LDS); + LLVMSetAlignment(var, 64 * 1024); - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - unsigned colors_read = ctx->shader->selector->info.colors_read; - LLVMValueRef main_fn = ctx->main_fn; - - LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); - - unsigned offset = SI_PARAM_POS_FIXED_PT + 1; - - if (colors_read & 0x0f) { - unsigned mask = colors_read & 0x0f; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); - } - if (colors_read & 0xf0) { - unsigned mask = (colors_read & 0xf0) >> 4; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); - } - - ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; - } else if (nir->info.stage == MESA_SHADER_COMPUTE) { - if (nir->info.cs.user_data_components_amd) { - ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); - ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, - nir->info.cs.user_data_components_amd); - } - } - - ctx->abi.inputs = &ctx->inputs[0]; - ctx->abi.clamp_shadow_reference = true; - ctx->abi.robust_buffer_access = true; - - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { - assert(gl_shader_stage_is_compute(nir->info.stage)); - si_llvm_declare_compute_memory(ctx); - } - ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); - - return true; + if (nir->info.stage == MESA_SHADER_VERTEX) { + si_llvm_load_vs_inputs(ctx, nir); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + unsigned colors_read = + ctx->shader->selector->info.colors_read; + LLVMValueRef main_fn = ctx->main_fn; + + LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); + + unsigned offset = SI_PARAM_POS_FIXED_PT + 1; + + if (colors_read & 0x0f) { + unsigned mask = colors_read & 0x0f; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color0 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + if (colors_read & 0xf0) { + unsigned mask = (colors_read & 0xf0) >> 4; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color1 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + + ctx->abi.interp_at_sample_force_center = + ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + } else if (nir->info.stage == MESA_SHADER_COMPUTE) { + if (nir->info.cs.user_data_components_amd) { + ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); + ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, + nir->info.cs.user_data_components_amd); + } + } + + ctx->abi.inputs = &ctx->inputs[0]; + ctx->abi.clamp_shadow_reference = true; + ctx->abi.robust_buffer_access = true; + + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { + assert(gl_shader_stage_is_compute(nir->info.stage)); + si_llvm_declare_compute_memory(ctx); + } + ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); + + return true; } /** @@ -467,270 +511,278 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) * runs them in sequence to form a monolithic shader. */ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part) + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part) { - LLVMBuilderRef builder = ctx->ac.builder; - /* PS epilog has one arg per color component; gfx9 merged shader - * prologs need to forward 40 SGPRs. 
- */ - LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; - LLVMTypeRef function_type; - unsigned num_first_params; - unsigned num_out, initial_num_out; - ASSERTED unsigned num_out_sgpr; /* used in debug checks */ - ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ - unsigned num_sgprs, num_vgprs; - unsigned gprs; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - for (unsigned i = 0; i < num_parts; ++i) { - ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(parts[i], LLVMPrivateLinkage); - } - - /* The parameters of the wrapper function correspond to those of the - * first part in terms of SGPRs and VGPRs, but we use the types of the - * main part to get the right types. This is relevant for the - * dereferenceable attribute on descriptor table pointers. - */ - num_sgprs = 0; - num_vgprs = 0; - - function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_first_params = LLVMCountParamTypes(function_type); - - for (unsigned i = 0; i < num_first_params; ++i) { - LLVMValueRef param = LLVMGetParam(parts[0], i); - - if (ac_is_sgpr_param(param)) { - assert(num_vgprs == 0); - num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } else { - num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } - } - - gprs = 0; - while (gprs < num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); - LLVMTypeRef type = LLVMTypeOf(param); - unsigned size = ac_get_type_size(type) / 4; - - /* This is going to get casted anyways, so we don't have to - * have the exact same type. But we do have to preserve the - * pointer-ness so that LLVM knows about it. - */ - enum ac_arg_type arg_type = AC_ARG_INT; - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - if (LLVMGetVectorSize(type) == 4) - arg_type = AC_ARG_CONST_DESC_PTR; - else if (LLVMGetVectorSize(type) == 8) - arg_type = AC_ARG_CONST_IMAGE_PTR; - else - assert(0); - } else if (type == ctx->ac.f32) { - arg_type = AC_ARG_CONST_FLOAT_PTR; - } else { - assert(0); - } - } - - ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL); - - assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); - assert(gprs + size <= num_sgprs + num_vgprs && - (gprs >= num_sgprs || gprs + size <= num_sgprs)); - - gprs += size; - } - - /* Prepare the return type. */ - unsigned num_returns = 0; - LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; - - last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); - return_type = LLVMGetReturnType(last_func_type); - - switch (LLVMGetTypeKind(return_type)) { - case LLVMStructTypeKind: - num_returns = LLVMCountStructElementTypes(return_type); - assert(num_returns <= ARRAY_SIZE(returns)); - LLVMGetStructElementTypes(return_type, returns); - break; - case LLVMVoidTypeKind: - break; - default: - unreachable("unexpected type"); - } - - si_llvm_create_func(ctx, "wrapper", returns, num_returns, - si_get_max_workgroup_size(ctx->shader)); - - if (si_is_merged_shader(ctx->shader)) - ac_init_exec_full_mask(&ctx->ac); - - /* Record the arguments of the function as if they were an output of - * a previous part. - */ - num_out = 0; - num_out_sgpr = 0; - - for (unsigned i = 0; i < ctx->args.arg_count; ++i) { - LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); - LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? 
ctx->ac.i32 : ctx->ac.f32; - unsigned size = ac_get_type_size(param_type) / 4; - - if (size == 1) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); - param_type = ctx->ac.i32; - } - - if (param_type != out_type) - param = LLVMBuildBitCast(builder, param, out_type, ""); - out[num_out++] = param; - } else { - LLVMTypeRef vector_type = LLVMVectorType(out_type, size); - - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); - param_type = ctx->ac.i64; - } - - if (param_type != vector_type) - param = LLVMBuildBitCast(builder, param, vector_type, ""); - - for (unsigned j = 0; j < size; ++j) - out[num_out++] = - LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); - } - - if (ctx->args.args[i].file == AC_ARG_SGPR) - num_out_sgpr = num_out; - } - - memcpy(initial, out, sizeof(out)); - initial_num_out = num_out; - initial_num_out_sgpr = num_out_sgpr; - - /* Now chain the parts. */ - LLVMValueRef ret = NULL; - for (unsigned part = 0; part < num_parts; ++part) { - LLVMValueRef in[AC_MAX_ARGS]; - LLVMTypeRef ret_type; - unsigned out_idx = 0; - unsigned num_params = LLVMCountParams(parts[part]); - - /* Merged shaders are executed conditionally depending - * on the number of enabled threads passed in the input SGPRs. */ - if (si_is_multi_part_shader(ctx->shader) && part == 0) { - LLVMValueRef ena, count = initial[3]; - - count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); - } - - /* Derive arguments for the next part from outputs of the - * previous one. - */ - for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { - LLVMValueRef param; - LLVMTypeRef param_type; - bool is_sgpr; - unsigned param_size; - LLVMValueRef arg = NULL; - - param = LLVMGetParam(parts[part], param_idx); - param_type = LLVMTypeOf(param); - param_size = ac_get_type_size(param_type) / 4; - is_sgpr = ac_is_sgpr_param(param); - - if (is_sgpr) { - ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG); - } else if (out_idx < num_out_sgpr) { - /* Skip returned SGPRs the current part doesn't - * declare on the input. */ - out_idx = num_out_sgpr; - } - - assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); - - if (param_size == 1) - arg = out[out_idx]; - else - arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); - - if (LLVMTypeOf(arg) != param_type) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } else { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } - } else { - arg = LLVMBuildBitCast(builder, arg, param_type, ""); - } - } - - in[param_idx] = arg; - out_idx += param_size; - } - - ret = ac_build_call(&ctx->ac, parts[part], in, num_params); - - if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { - ac_build_endif(&ctx->ac, 6506); - - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. 
- * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - continue; - } - - /* Extract the returned GPRs. */ - ret_type = LLVMTypeOf(ret); - num_out = 0; - num_out_sgpr = 0; - - if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { - assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); - - unsigned ret_size = LLVMCountStructElementTypes(ret_type); - - for (unsigned i = 0; i < ret_size; ++i) { - LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); - - assert(num_out < ARRAY_SIZE(out)); - out[num_out++] = val; - - if (LLVMTypeOf(val) == ctx->ac.i32) { - assert(num_out_sgpr + 1 == num_out); - num_out_sgpr = num_out; - } - } - } - } - - /* Return the value from the last part. */ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(builder); - else - LLVMBuildRet(builder, ret); + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 40 SGPRs. + */ + LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; + LLVMTypeRef function_type; + unsigned num_first_params; + unsigned num_out, initial_num_out; + ASSERTED unsigned num_out_sgpr; /* used in debug checks */ + ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned gprs; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + for (unsigned i = 0; i < num_parts; ++i) { + ac_add_function_attr(ctx->ac.context, parts[i], -1, + AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_first_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_first_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } + } + + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; + + /* This is going to get casted anyways, so we don't have to + * have the exact same type. But we do have to preserve the + * pointer-ness so that LLVM knows about it. + */ + enum ac_arg_type arg_type = AC_ARG_INT; + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + if (LLVMGetVectorSize(type) == 4) + arg_type = AC_ARG_CONST_DESC_PTR; + else if (LLVMGetVectorSize(type) == 8) + arg_type = AC_ARG_CONST_IMAGE_PTR; + else + assert(0); + } else if (type == ctx->ac.f32) { + arg_type = AC_ARG_CONST_FLOAT_PTR; + } else { + assert(0); + } + } + + ac_add_arg(&ctx->args, gprs < num_sgprs ? 
AC_ARG_SGPR : AC_ARG_VGPR, + size, arg_type, NULL); + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + /* Prepare the return type. */ + unsigned num_returns = 0; + LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; + + last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); + return_type = LLVMGetReturnType(last_func_type); + + switch (LLVMGetTypeKind(return_type)) { + case LLVMStructTypeKind: + num_returns = LLVMCountStructElementTypes(return_type); + assert(num_returns <= ARRAY_SIZE(returns)); + LLVMGetStructElementTypes(return_type, returns); + break; + case LLVMVoidTypeKind: + break; + default: + unreachable("unexpected type"); + } + + si_llvm_create_func(ctx, "wrapper", returns, num_returns, + si_get_max_workgroup_size(ctx->shader)); + + if (si_is_merged_shader(ctx->shader)) + ac_init_exec_full_mask(&ctx->ac); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < ctx->args.arg_count; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; + unsigned size = ac_get_type_size(param_type) / 4; + + if (size == 1) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); + param_type = ctx->ac.i32; + } + + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); + param_type = ctx->ac.i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = LLVMBuildExtractElement( + builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); + } + + if (ctx->args.args[i].file == AC_ARG_SGPR) + num_out_sgpr = num_out; + } + + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + + /* Now chain the parts. */ + LLVMValueRef ret = NULL; + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[AC_MAX_ARGS]; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (si_is_multi_part_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, + LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } + + /* Derive arguments for the next part from outputs of the + * previous one. 
+ */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = ac_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { + ac_add_function_attr(ctx->ac.context, parts[part], + param_idx + 1, AC_FUNC_ATTR_INREG); + } else if (out_idx < num_out_sgpr) { + /* Skip returned SGPRs the current part doesn't + * declare on the input. */ + out_idx = num_out_sgpr; + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(param_type) == + AC_ADDR_SPACE_CONST_32BIT) { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + + if (si_is_multi_part_shader(ctx->shader) && + part + 1 == next_shader_first_part) { + ac_build_endif(&ctx->ac, 6506); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } + + /* Extract the returned GPRs. */ + ret_type = LLVMTypeOf(ret); + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = + LLVMBuildExtractValue(builder, ret, i, ""); + + assert(num_out < ARRAY_SIZE(out)); + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->ac.i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + /* Return the value from the last part. */ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(builder); + else + LLVMBuildRet(builder, ret); } |
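
Illustration only (not part of the patch): the max_simd_waves value that radv_get_max_waves() computes in the hunk above reduces to taking the minimum of three per-SIMD budgets (SGPRs, VGPRs and LDS). The sketch below shows that arithmetic in isolation; the constants are placeholders standing in for the per-chip values the driver reads from rad_info, chosen only to make the example self-contained.

	/* Standalone sketch of the occupancy bound; all constants are
	 * placeholders, not authoritative hardware values. */
	#define MIN2(a, b)	((a) < (b) ? (a) : (b))
	#define PHYS_VGPRS	1024	/* placeholder: VGPR file per SIMD */
	#define PHYS_SGPRS	800	/* placeholder: SGPR file per SIMD */
	#define LDS_PER_SIMD	16384	/* 64 KiB per CU divided over 4 SIMDs */

	static unsigned align_up(unsigned x, unsigned a)
	{
		return (x + a - 1) & ~(a - 1);	/* a must be a power of two */
	}

	static unsigned example_max_waves(unsigned sgprs, unsigned vgprs,
					  unsigned lds_per_wave, unsigned wave_size)
	{
		unsigned max_waves = 10;	/* placeholder: wave slots per SIMD */

		if (sgprs)
			max_waves = MIN2(max_waves,
					 PHYS_SGPRS / align_up(sgprs, 16));
		if (vgprs)
			max_waves = MIN2(max_waves,
					 PHYS_VGPRS / align_up(vgprs,
							       wave_size == 32 ? 8 : 4));
		if (lds_per_wave)
			max_waves = MIN2(max_waves, LDS_PER_SIMD / lds_per_wave);

		return max_waves;
	}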
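
The radv_GetShaderInfoAMD entry point added above follows the usual Vulkan size-query/fetch convention: a first call with pInfo == NULL only reports the required size, and a second call copies the data, returning VK_INCOMPLETE if the caller's buffer was too small. A minimal application-side sketch of that pattern, assuming a VkDevice and VkPipeline that were created with the VK_AMD_shader_info device extension enabled (the names here are illustrative, not part of the patch):

	#include <stdio.h>
	#include <vulkan/vulkan.h>

	static void dump_fs_stats(VkDevice device, VkPipeline pipeline)
	{
		PFN_vkGetShaderInfoAMD get_info = (PFN_vkGetShaderInfoAMD)
			vkGetDeviceProcAddr(device, "vkGetShaderInfoAMD");
		size_t size = 0;
		VkShaderStatisticsInfoAMD stats;

		if (!get_info)
			return;

		/* First call: pInfo == NULL, only the required size is written. */
		if (get_info(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
			     VK_SHADER_INFO_TYPE_STATISTICS_AMD, &size, NULL) != VK_SUCCESS ||
		    size < sizeof(stats))
			return;

		/* Second call: the statistics are copied into the caller's buffer. */
		size = sizeof(stats);
		if (get_info(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
			     VK_SHADER_INFO_TYPE_STATISTICS_AMD, &size, &stats) != VK_SUCCESS)
			return;

		printf("VGPRs used: %u of %u, SGPRs used: %u of %u, scratch: %u bytes/wave\n",
		       stats.resourceUsage.numUsedVgprs, stats.numPhysicalVgprs,
		       stats.resourceUsage.numUsedSgprs, stats.numPhysicalSgprs,
		       (unsigned)stats.resourceUsage.scratchMemUsageInBytes);
	}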