author     Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
commit     1d35364040c0ffa99133522fa5ab3bd6131d8bf7 (patch)
tree       0ea3d9ca4ad10692c6477168b67e98cb50ea6bd3 /lib/mesa/src/gallium/drivers/radeonsi
parent     b24b5b9049e889ee4eb39b565bcc8d48bd45ab48 (diff)
Merge Mesa 21.3.7
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeonsi')
24 files changed, 2306 insertions, 4872 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk deleted file mode 100644 index e402da639..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk +++ /dev/null @@ -1,95 +0,0 @@ -# Mesa 3-D graphics library -# -# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com> -# Copyright (C) 2010-2011 LunarG Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -LOCAL_PATH := $(call my-dir) - -# get C_SOURCES and GENERATED_SOURCES -include $(LOCAL_PATH)/Makefile.sources - -include $(CLEAR_VARS) - -LOCAL_SRC_FILES := $(C_SOURCES) - -LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions - -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/src/amd/common \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir - -LOCAL_STATIC_LIBRARIES := libmesa_amd_common - -LOCAL_SHARED_LIBRARIES := libdrm_radeon -LOCAL_MODULE := libmesa_pipe_radeonsi - -intermediates := $(call local-generated-sources-dir) - -# We need to get NIR's generated headers. 
-LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES)) - -GEN_DRIINFO_INPUTS := \ - $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ - $(LOCAL_PATH)/driinfo_radeonsi.h - -MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py - -$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) - -GEN10_FORMAT_TABLE_INPUTS := \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \ - $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json - -GEN10_FORMAT_TABLE_DEP := \ - $(MESA_TOP)/src/amd/registers/regdb.py - -GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py - -$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) - -LOCAL_C_INCLUDES += $(intermediates)/radeonsi - -LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) - -$(call mesa-build-with-llvm) - -include $(GALLIUM_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - -ifneq ($(HAVE_GALLIUM_RADEONSI),) -GALLIUM_TARGET_DRIVERS += radeonsi -$(eval GALLIUM_LIBS += \ - $(LOCAL_MODULE) \ - $(LOCAL_STATIC_LIBRARIES) \ - libmesa_winsys_radeon \ - libmesa_winsys_amdgpu) -$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) -endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources b/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources deleted file mode 100644 index 55ef80856..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources +++ /dev/null @@ -1,75 +0,0 @@ -C_SOURCES := \ - driinfo_radeonsi.h \ - gfx10_query.c \ - gfx10_shader_ngg.c \ - si_blit.c \ - si_buffer.c \ - si_build_pm4.h \ - si_clear.c \ - si_compute.c \ - si_compute_prim_discard.c \ - si_compute.h \ - si_compute_blit.c \ - si_cp_dma.c \ - si_cp_reg_shadowing.c \ - si_debug.c \ - si_descriptors.c \ - si_fence.c \ - si_get.c \ - si_gfx_cs.c \ - si_gpu_load.c \ - si_pipe.c \ - si_pipe.h \ - si_pm4.c \ - si_pm4.h \ - si_perfcounter.c \ - si_public.h \ - si_query.c \ - si_query.h \ - si_shader.c \ - si_shader.h \ - si_shader_internal.h \ - si_shader_llvm.c \ - si_shader_llvm_gs.c \ - si_shader_llvm_ps.c \ - si_shader_llvm_resources.c \ - si_shader_llvm_tess.c \ - si_shader_llvm_vs.c \ - si_shader_nir.c \ - si_shaderlib_nir.c \ - si_shaderlib_tgsi.c \ - si_sqtt.c \ - si_state.c \ - si_state_binning.c \ - si_state_draw.cpp \ - si_state_msaa.c \ - si_state_shaders.c \ - si_state_streamout.c \ - si_state_viewport.c \ - si_state.h \ - si_test_blit.c \ - si_test_dma_perf.c \ - si_texture.c \ - si_uvd.c \ - ../radeon/radeon_uvd.c \ - ../radeon/radeon_uvd.h \ - ../radeon/radeon_vcn_dec_jpeg.c \ - ../radeon/radeon_vcn_dec.c \ - ../radeon/radeon_vcn_dec.h \ - ../radeon/radeon_vcn_av1_default.h \ - ../radeon/radeon_vcn_enc_1_2.c \ - ../radeon/radeon_vcn_enc_2_0.c \ - ../radeon/radeon_vcn_enc_3_0.c \ - ../radeon/radeon_vcn_enc.c \ - ../radeon/radeon_vcn_enc.h \ - ../radeon/radeon_uvd_enc_1_1.c \ - ../radeon/radeon_uvd_enc.c \ - ../radeon/radeon_uvd_enc.h \ - ../radeon/radeon_vce_40_2_2.c \ - ../radeon/radeon_vce_50.c \ - ../radeon/radeon_vce_52.c \ - ../radeon/radeon_vce.c \ - ../radeon/radeon_vce.h \ - ../radeon/radeon_video.c \ - 
../radeon/radeon_video.h \ - ../radeon/radeon_winsys.h diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt deleted file mode 100644 index e69de29bb..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt +++ /dev/null diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt deleted file mode 100644 index 69d00870a..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Note: skips lists for CI are just a list of lines that, when -# non-zero-length and not starting with '#', will regex match to -# delete lines from the test list. Be careful. - -# Skip the perf/stress tests to keep runtime manageable -dEQP-GLES[0-9]*.performance.* -dEQP-GLES[0-9]*.stress.* - -# These are really slow on tiling architectures (including llvmpipe). -dEQP-GLES[0-9]*.functional.flush_finish.* - diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt deleted file mode 100644 index e69de29bb..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt +++ /dev/null diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c index 653dfc343..5653ff233 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c @@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx) /* Restore shader pointers because the VS blit shader changed all * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } @@ -393,11 +395,12 @@ static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex, si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */); } -static void si_decompress_sampler_depth_textures(struct si_context *sctx, +static bool si_decompress_sampler_depth_textures(struct si_context *sctx, struct si_samplers *textures) { unsigned i; unsigned mask = textures->needs_depth_decompress_mask; + bool need_flush = false; while (mask) { struct pipe_sampler_view *view; @@ -416,7 +419,14 @@ static void si_decompress_sampler_depth_textures(struct si_context *sctx, si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? 
PIPE_MASK_S : PIPE_MASK_Z, view->u.tex.first_level, view->u.tex.last_level, 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + + if (tex->need_flush_after_depth_decompression) { + need_flush = true; + tex->need_flush_after_depth_decompression = false; + } } + + return need_flush; } static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex, @@ -755,6 +765,7 @@ static void si_decompress_resident_images(struct si_context *sctx) void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) { unsigned compressed_colortex_counter, mask; + bool need_flush = false; if (sctx->blitter_running) return; @@ -772,7 +783,7 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) unsigned i = u_bit_scan(&mask); if (sctx->samplers[i].needs_depth_decompress_mask) { - si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); + need_flush |= si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); } if (sctx->samplers[i].needs_color_decompress_mask) { si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); @@ -782,6 +793,16 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) } } + if (sctx->chip_class == GFX10_3 && need_flush) { + /* This fixes a corruption with the following sequence: + * - fast clear depth + * - decompress depth + * - draw + * (see https://gitlab.freedesktop.org/drm/amd/-/issues/1810#note_1170171) + */ + sctx->b.flush(&sctx->b, NULL, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW); + } + if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { if (sctx->uses_bindless_samplers) si_decompress_resident_textures(sctx); @@ -1027,7 +1048,7 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst /* Copy. */ si_blitter_begin(sctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, - src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false); + src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false, false); si_blitter_end(sctx); pipe_surface_reference(&dst_view, NULL); @@ -1203,11 +1224,48 @@ resolve_to_temp: static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) { struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *sdst = (struct si_texture *)info->dst.resource; if (do_hardware_msaa_resolve(ctx, info)) { return; } + if (info->is_dri_blit_image && sdst->surface.is_linear && + sctx->chip_class >= GFX7 && sdst->surface.flags & RADEON_SURF_IMPORTED) { + struct si_texture *ssrc = (struct si_texture *)info->src.resource; + /* Use SDMA or async compute when copying to a DRI_PRIME imported linear surface. */ + bool async_copy = info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.z == 0 && + info->src.box.x == 0 && info->src.box.y == 0 && info->src.box.z == 0 && + info->dst.level == 0 && info->src.level == 0 && + info->src.box.width == info->dst.resource->width0 && + info->src.box.height == info->dst.resource->height0 && + info->src.box.depth == 1 && util_can_blit_via_copy_region(info, true); + /* Try SDMA first... */ + /* TODO: figure out why SDMA copies are slow on GFX10_3 */ + if (async_copy && sctx->chip_class < GFX10_3 && si_sdma_copy_image(sctx, sdst, ssrc)) + return; + + /* ... and use async compute as the fallback. 
*/ + if (async_copy) { + struct si_screen *sscreen = sctx->screen; + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (!sscreen->async_compute_context) + si_init_aux_async_compute_ctx(sscreen); + + if (sscreen->async_compute_context) { + si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context, + info->dst.resource, 0, info->src.resource, 0, 0, 0, 0, + &info->src.box, false, 0); + si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL); + simple_mtx_unlock(&sscreen->async_compute_context_lock); + return; + } + + simple_mtx_unlock(&sscreen->async_compute_context_lock); + } + } + if (unlikely(sctx->thread_trace_enabled)) sctx->sqtt_next_event = EventCmdCopyImage; @@ -1276,52 +1334,16 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re struct si_texture *tex = (struct si_texture *)res; assert(res->target != PIPE_BUFFER); - assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); - - /* st/dri calls flush twice per frame (not a bug), this prevents double - * decompression. */ - if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) - return; if (!tex->is_depth && (tex->cmask_buffer || vi_dcc_enabled(tex, 0))) { si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0), - tex->dcc_separate_buffer != NULL, false); + false, false); if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { si_retile_dcc(sctx, tex); tex->displayable_dcc_dirty = false; } } - - /* Always do the analysis even if DCC is disabled at the moment. */ - if (tex->dcc_gather_statistics) { - bool separate_dcc_dirty = tex->separate_dcc_dirty; - - /* If the color buffer hasn't been unbound and fast clear hasn't - * been used, separate_dcc_dirty is false, but there may have been - * new rendering. Check if the color buffer is bound and assume - * it's dirty. - * - * Note that DRI2 never unbinds window colorbuffers, which means - * the DCC pipeline statistics query would never be re-set and would - * keep adding new results until all free memory is exhausted if we - * didn't do this. - */ - if (!separate_dcc_dirty) { - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->texture == res) { - separate_dcc_dirty = true; - break; - } - } - } - - if (separate_dcc_dirty) { - tex->separate_dcc_dirty = false; - vi_separate_dcc_process_and_reset_stats(ctx, tex); - } - } } void si_flush_implicit_resources(struct si_context *sctx) diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c index 48ec79ac5..0ae232db2 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c @@ -107,7 +107,7 @@ static void code_object_to_config(const amd_kernel_code_t *code_object, } /* Asynchronous compute shader compilation. 
*/ -static void si_create_compute_state_async(void *job, int thread_index) +static void si_create_compute_state_async(void *job, void *gdata, int thread_index) { struct si_compute *program = (struct si_compute *)job; struct si_shader_selector *sel = &program->sel; @@ -367,11 +367,14 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) { radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); + radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI, + S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + + radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); if (sctx->chip_class == GFX6) { /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID @@ -381,25 +384,25 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf * TODO: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); + radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); } } if (sctx->chip_class >= GFX7) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); /* Disable profiling on compute queues. */ if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) { - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); - radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); + radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); } /* Set the pointer to border colors. 
*/ @@ -407,9 +410,9 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf if (sctx->border_color_buffer) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2, false); - radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ + radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false); + radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ + radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ } } @@ -418,17 +421,19 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf */ if (sctx->chip_class >= GFX9 && (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) { - radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY, + radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY, sctx->chip_class >= GFX10 ? 0x20 : 0); } if (sctx->chip_class >= GFX10) { - radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0); - radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0); - radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0); - radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0); - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 5); + radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */ + radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */ + radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */ + radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */ + radeon_emit(0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */ + + radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); } radeon_end(); } @@ -533,13 +538,11 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute RADEON_PRIO_SHADER_BINARY); radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8); - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, config->rsrc1); - radeon_emit(cs, config->rsrc2); + radeon_set_sh_reg_seq(R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(config->rsrc1); + radeon_emit(config->rsrc2); COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " @@ -549,7 +552,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute sctx->max_seen_compute_scratch_bytes_per_wave = MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); radeon_end(); @@ -592,11 +595,11 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, } radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); - radeon_emit(cs, scratch_dword0); - radeon_emit(cs, scratch_dword1); - radeon_emit(cs, scratch_dword2); - radeon_emit(cs, scratch_dword3); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); + radeon_emit(scratch_dword0); + radeon_emit(scratch_dword1); + radeon_emit(scratch_dword2); + radeon_emit(scratch_dword3); radeon_end(); } @@ -656,9 +659,9 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ dispatch_va 
= dispatch_buf->gpu_address + dispatch_offset; - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, dispatch_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(dispatch_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); si_resource_reference(&dispatch_buf, NULL); user_sgpr += 2; @@ -666,16 +669,16 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, kernel_args_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(kernel_args_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); user_sgpr += 2; } for (i = 0; i < 3 && user_sgpr < 16; i++) { if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); - radeon_emit(cs, info->grid[i]); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); + radeon_emit(info->grid[i]); user_sgpr += 1; } } @@ -740,21 +743,21 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } radeon_begin_again(cs); } else { - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); + radeon_set_sh_reg_seq(grid_size_reg, 3); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); } } if (sel->info.uses_variable_block_size) { - radeon_set_sh_reg(cs, block_size_reg, + radeon_set_sh_reg(block_size_reg, info->block[0] | (info->block[1] << 10) | (info->block[2] << 20)); } if (sel->info.base.cs.user_data_components_amd) { - radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd); - radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); + radeon_set_sh_reg_seq(cs_user_data_reg, sel->info.base.cs.user_data_components_amd); + radeon_emit_array(sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); } radeon_end(); } @@ -780,7 +783,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_begin(cs); radeon_set_sh_reg( - cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + R_00B854_COMPUTE_RESOURCE_LIMITS, ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, sctx->cs_max_waves_per_sh, threadgroups_per_cu)); @@ -793,7 +796,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ const uint *last_block = info->last_block; bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3); if (partial_block_en) { unsigned partial[3]; @@ -803,18 +806,18 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ partial[1] = last_block[1] ? last_block[1] : info->block[1]; partial[2] = last_block[2] ? 
last_block[2] : info->block[2]; - radeon_emit( - cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0])); - radeon_emit( - cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1])); - radeon_emit( - cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]) | + S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]) | + S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]) | + S_00B824_NUM_THREAD_PARTIAL(partial[2])); dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); } else { - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2])); } if (info->indirect) { @@ -823,25 +826,25 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, 1); - radeon_emit(cs, base_va); - radeon_emit(cs, base_va >> 32); + radeon_emit(PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(1); + radeon_emit(base_va); + radeon_emit(base_va >> 32); - radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->indirect_offset); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->indirect_offset); + radeon_emit(dispatch_initiator); } else { - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); + radeon_emit(dispatch_initiator); } if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); } radeon_end(); } @@ -857,6 +860,8 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx) * * buffer object and texture stores performed by shaders are not * automatically synchronized + * + * TODO: Bindless textures are not handled, and thus are not synchronized. 
*/ struct si_shader_info *info = &sctx->cs_shader_state.program->sel.info; struct si_samplers *samplers = &sctx->samplers[PIPE_SHADER_COMPUTE]; @@ -890,18 +895,12 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx) static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) { struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; struct si_compute *program = sctx->cs_shader_state.program; const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc); int i; - /* HW bug workaround when CS threadgroups > 256 threads and async - * compute isn't used, i.e. only one compute job can run at a time. - * If async compute is possible, the threadgroup size must be limited - * to 256 threads on all queues to avoid the bug. - * Only GFX6 and certain GFX7 chips are affected. - */ - bool cs_regalloc_hang = - (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) && - info->block[0] * info->block[1] * info->block[2] > 256; + bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug && + info->block[0] * info->block[1] * info->block[2] > 256; if (cs_regalloc_hang) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c deleted file mode 100644 index 373fd4ffa..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ /dev/null @@ -1,1580 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" -#include "si_build_pm4.h" -#include "ac_llvm_cull.h" - -#include "util/u_prim.h" -#include "util/u_suballoc.h" -#include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" - -/* Based on: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - -/* This file implements primitive culling using asynchronous compute. - * It's written to be GL conformant. - * - * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it - * in a compute shader. 
The shader processes 1 primitive/thread by invoking - * the VS for each vertex to get the positions, decomposes strips and fans - * into triangles (if needed), eliminates primitive restart (if needed), - * does (W<0) culling, face culling, view XY culling, zero-area and - * small-primitive culling, and generates a new index buffer that doesn't - * contain culled primitives. - * - * The index buffer is generated using the Ordered Count feature of GDS, - * which is an atomic counter that is incremented in the wavefront launch - * order, so that the original primitive order is preserved. - * - * Another GDS ordered counter is used to eliminate primitive restart indices. - * If a restart index lands on an even thread ID, the compute shader has to flip - * the primitive orientation of the whole following triangle strip. The primitive - * orientation has to be correct after strip and fan decomposition for two-sided - * shading to behave correctly. The decomposition also needs to be aware of - * which vertex is the provoking vertex for flat shading to behave correctly. - * - * IB = a GPU command buffer - * - * Both the compute and gfx IBs run in parallel sort of like CE and DE. - * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND - * doesn't continue if its word isn't 0x80000000. Once compute shaders are - * finished culling, the last wave will write the final primitive count from - * GDS directly into the count word of the draw packet in the gfx IB, and - * a CS_DONE event will signal the REWIND packet to continue. It's really - * a direct draw with command buffer patching from the compute queue. - * - * The compute IB doesn't have to start when its corresponding gfx IB starts, - * but can start sooner. The compute IB is signaled to start after the last - * execution barrier in the *previous* gfx IB. This is handled as follows. - * The kernel GPU scheduler starts the compute IB after the previous gfx IB has - * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that - * represents the barrier in the previous gfx IB. - * - * Features: - * - Triangle strips and fans are decomposed into an indexed triangle list. - * The decomposition differs based on the provoking vertex state. - * - Instanced draws are converted into non-instanced draws for 16-bit indices. - * (InstanceID is stored in the high bits of VertexID and unpacked by VS) - * - Primitive restart is fully supported with triangle strips, including - * correct primitive orientation across multiple waves. (restart indices - * reset primitive orientation) - * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling). - * - Back face culling, incl. culling zero-area / degenerate primitives. - * - View XY culling. - * - View Z culling (disabled due to limited impact with perspective projection). - * - Small primitive culling for all MSAA modes and all quant modes. - * - * The following are not implemented: - * - ClipVertex/ClipDistance/CullDistance-based culling. - * - Scissor culling. - * - HiZ culling. - * - * Limitations (and unimplemented features that may be possible to implement): - * - Only triangles, triangle strips, and triangle fans are supported. - * - Primitive restart is only supported with triangle strips. - * - Instancing and primitive restart can't be used together. - * - Instancing is only supported with 16-bit indices and instance count <= 2^16. - * - The instance divisor buffer is unavailable, so all divisors must be - * either 0 or 1. 
- * - Multidraws where the vertex shader reads gl_DrawID are unsupported. - * - No support for tessellation and geometry shaders. - * (patch elimination where tess factors are 0 would be possible to implement) - * - The vertex shader must not contain memory stores. - * - All VS resources must not have a write usage in the command buffer. - * (TODO: all shader buffers currently set the write usage) - * - Bindless textures and images must not occur in the vertex shader. - * - * User data SGPR layout: - * INDEX_BUFFERS: pointer to constants - * 0..3: input index buffer - typed buffer view - * 4..7: output index buffer - typed buffer view - * 8..11: viewport state - scale.xy, translate.xy - * VERTEX_COUNTER: counter address or first primitive ID - * - If unordered memory counter: address of "count" in the draw packet - * and is incremented atomically by the shader. - * - If unordered GDS counter: address of "count" in GDS starting from 0, - * must be initialized to 0 before the dispatch. - * - If ordered GDS counter: the primitive ID that should reset the vertex - * counter to 0 in GDS - * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex - * count to memory if using GDS ordered append - * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if - * using GDS ordered append - * VS.VERTEX_BUFFERS: same value as VS - * VS.CONST_AND_SHADER_BUFFERS: same value as VS - * VS.SAMPLERS_AND_IMAGES: same value as VS - * VS.BASE_VERTEX: same value as VS - * VS.START_INSTANCE: same value as VS - * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives - * per instance for instancing. - * NUM_PRIMS_UDIV_TERMS: - * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing. - * - Bits [5:31]: The number of primitives per instance for computing the remainder. - * PRIMITIVE_RESTART_INDEX - * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number. - * - * - * The code contains 3 codepaths: - * - Unordered memory counter (for debugging, random primitive order, no primitive restart) - * - Unordered GDS counter (for debugging, random primitive order, no primitive restart) - * - Ordered GDS counter (it preserves the primitive order) - * - * How to test primitive restart (the most complicated part because it needs - * to get the primitive orientation right): - * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave - * primitive orientation flips with small draw calls, which is what most tests use. - * You can also enable draw call splitting into draw calls with just 2 primitives. - */ - -/* At least 256 is needed for the fastest wave launch rate from compute queues - * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ -/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ -#define CULL_Z 0 -/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ - -/* Grouping compute dispatches for small draw calls: How many primitives from multiple - * draw calls to process by compute before signaling the gfx IB. 
This reduces the number - * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) -/* Draw call splitting at the packet level. This allows signaling the gfx IB - * for big draw calls sooner, but doesn't allow context flushes between packets. - * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH -/* If there is not enough ring buffer space for the current IB, split draw calls into - * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH - -/* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ - SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ - UINT_MAX & ~(THREADGROUP_SIZE - 1)) - -#define REWIND_SIGNAL_BIT 0x80000000 -/* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 - -void si_initialize_prim_discard_tunables(struct si_context *sctx) -{ - sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sctx->chip_class == GFX6 || /* SI support is not implemented */ - !sctx->screen->info.has_gds_ordered_append || - sctx->screen->debug_flags & DBG(NO_PD) || - /* If aux_context == NULL, we are initializing aux_context right now. */ - !sctx->screen->aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || - sctx->screen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && - sctx->screen->info.is_pro_graphics && - (sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_HAWAII || - sctx->family == CHIP_TONGA || - sctx->family == CHIP_FIJI || - sctx->family == CHIP_POLARIS10 || - sctx->family == CHIP_POLARIS11 || - sctx->family == CHIP_VEGA10 || - sctx->family == CHIP_VEGA20))) { - sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) - sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. - */ - if (sctx->screen->info.vram_size <= 2 * GB) - sctx->index_ring_size_per_ib = 64 * MB; - else if (sctx->screen->info.vram_size <= 4 * GB) - sctx->index_ring_size_per_ib = 128 * MB; - else - sctx->index_ring_size_per_ib = 256 * MB; - } -} - -/* Opcode can be "add" or "swap". 
*/ -static LLVMValueRef -si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, - bool release, bool done) -{ - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->i32_0, /* scope */ - ctx->i1false, /* volatile */ - LLVMConstInt(ctx->i32, ordered_count_index, 0), - LLVMConstInt(ctx->i1, release, 0), - LLVMConstInt(ctx->i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); -} - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) -{ - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); -} - -struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; -}; - -/* Enter a section that only executes on thread 0. */ -static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, - LLVMValueRef thread_id) -{ - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, - ctx->i32_0, ""), 12601); -} - -/* Exit a section that only executes on thread 0 and broadcast the result - * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, - LLVMValueRef *result) -{ - struct si_shader_context *ctx = section->ctx; - - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - - ac_build_endif(&ctx->ac, 12601); - - /* Broadcast the result from thread 0 to all threads. */ - *result = ac_build_readlane(&ctx->ac, - LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); -} - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) -{ - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - LLVMTypeRef const_desc_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_desc_type = ctx->f32; - else - const_desc_type = ctx->v4i32; - - struct si_function_info fninfo; - si_init_function_info(&fninfo); - - LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; - LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; - LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; - LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; - LLVMValueRef last_wave_prim_id, vertex_count_addr; - - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &index_buffers_and_constants); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &vb_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), - &const_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), - &sampler_desc); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); - add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); - - /* Block ID and thread ID inputs. */ - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); - - /* Create the compute shader function. */ - unsigned old_type = ctx->type; - ctx->type = PIPE_SHADER_COMPUTE; - si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); - ctx->type = old_type; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", - GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. 
*/ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = const_desc; - vs_params[num_vs_params++] = sampler_desc; - vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, - S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = base_vertex; - vs_params[num_vs_params++] = start_instance; - vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ - vs_params[num_vs_params++] = vb_desc; - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = - ac_build_imad(&ctx->ac, block_id, - LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->i32_0; - - if (key->opt.cs_instancing) { - /* Unpack num_prims_udiv_terms. */ - LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, - num_prims_udiv_multiplier, - post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, - prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, - LLVMConstInt(ctx->i32, 3, 0), - LLVMConstInt(ctx->i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, - LLVMConstInt(ctx->i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. 
- */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); - index[2] = ctx->i32_0; - } else { - index[0] = ctx->i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, - index[i], ctx->i32_0, 1, - 0, true); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 6, 0), ""); - ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = - LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->i1true; - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->i1false; - - /* Handle primitive restart. */ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. - */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, - LLVMConstInt(ctx->i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); - vertex_counter = LLVMBuildAnd(builder, vertex_counter, - LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - restart_index, ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, - not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. 
- */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = - ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, ""); - - /* This flips the orientatino based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, - gds_prim_restart_continue, ""), ""); - current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, - last_strip_start, ctx->i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. - * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. - * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildOr(builder, is_first_wave, - current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", - ordered_wave_id, last_strip_start, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", - ordered_wave_id, ctx->i32_0, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = LLVMBuildSelect(builder, is_first_wave, - ctx->i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. 
- * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, - strip_start, ctx->i32_0, ""); - first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, - first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = - LLVMBuildXor(builder, first_is_odd, - LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); - - /* Determine the primitive orientation. - * Only swap the vertices that are not the provoking vertex. We need to keep - * the provoking vertex in place. - */ - if (key->opt.cs_provoking_vertex_first) { - LLVMValueRef index1 = index[1]; - LLVMValueRef index2 = index[2]; - index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); - index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); - } else { - LLVMValueRef index0 = index[0]; - LLVMValueRef index1 = index[1]; - index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); - index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); - } - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. */ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, - vp_scale, vp_translate, smallprim_precision, - &options); - - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = - ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, - &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. 
*/
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
- num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
- num_prims_accepted, 0,
- true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(&section, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- last_wave_prim_id, ""), 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - vertex_count_addr, - LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->i32, 4, 0), - LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_MTYPE(3 /* uncached */), 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, - ctx->i32_0, 0, ac_glc | ac_slc, false); - } else { - LLVMBuildStore(builder, count, - si_expand_32bit_pointer(ctx, vertex_count_addr)); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, - LLVMConstInt(ctx->i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. */ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, - vindex, ctx->i32_0, 3, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); -} - -/* Return false if the shader isn't ready. */ -static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. 
*/ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key.opt.cs_need_correct_orientation = - rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = - sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = - sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; - sctx->cs_prim_discard_state.current = NULL; - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, - &compiler_state, &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; -} - -static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) -{ - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : - VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, - RADEON_DOMAIN_GDS, 0); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, - 1, RADEON_DOMAIN_OA, 0); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - - sctx->prim_discard_compute_cs = - ws->cs_add_parallel_compute_ib(sctx->gfx_cs, - num_oa_counters > 0); - if (!sctx->prim_discard_compute_cs) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, - 2 * 1024 * 1024); - if (!sctx->index_ring) - return false; - } - return true; -} - -static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) -{ - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; -} - -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) -{ - /* If the compute shader compilation isn't finished, this returns false. */ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = info->count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && - num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - /* Split draws. */ - struct pipe_draw_info split_draw = *info; - split_draw.primitive_restart = primitive_restart; - - unsigned base_start = split_draw.start; - - if (prim == PIPE_PRIM_TRIANGLES) { - unsigned vert_count_per_subdraw = split_prims_draw_level * 3; - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. 
*/ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - unsigned vert_count_per_subdraw = split_prims_draw_level; - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - - if (start == 0 && - primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } else { - assert(0); - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 20; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; -} - -void si_compute_signal_gfx(struct si_context *sctx) -{ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? 
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : - EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, - sctx->compute_rewind_va | - ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; -} - -/* Dispatch a primitive discard compute shader. */ -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) -{ - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, - sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - * - * TODO: update for GFX10 - */ - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, - COPY_DATA_GDS, NULL, 4, - COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | - S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. */ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | - S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | - S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. 
*/ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, - RADEON_USAGE_READ, RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + - sctx->last_ib_barrier_buf_offset, 1, 1, - WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, - sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, - si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, - (void**)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | - S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : - index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : - V_008F0C_BUF_DATA_FORMAT_32); - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | - S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - - /* Viewport state. - * This is needed by the small primitive culling, because it's done - * in screen space. - */ - float scale[2], translate[2]; - - scale[0] = sctx->viewports.states[0].scale[0]; - scale[1] = sctx->viewports.states[0].scale[1]; - translate[0] = sctx->viewports.states[0].translate[0]; - translate[1] = sctx->viewports.states[0].translate[1]; - - /* The viewport shouldn't flip the X axis for the small prim culling to work. */ - assert(-scale[0] + translate[0] <= scale[0] + translate[0]); - - /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. - * This is because the viewport transformation inverts the clip space - * bounding box, so min becomes max, which breaks small primitive - * culling. 
- */ - if (sctx->viewports.y_inverted) { - scale[1] = -scale[1]; - translate[1] = -translate[1]; - } - - /* Scale the framebuffer up, so that samples become pixels and small - * primitive culling is the same for all sample counts. - * This only works with the standard DX sample positions, because - * the samples are evenly spaced on both X and Y axes. - */ - unsigned num_samples = sctx->framebuffer.nr_samples; - assert(num_samples >= 1); - - for (unsigned i = 0; i < 2; i++) { - scale[i] *= num_samples; - translate[i] *= num_samples; - } - - desc[8] = fui(scale[0]); - desc[9] = fui(scale[1]); - desc[10] = fui(translate[0]); - desc[11] = fui(translate[1]); - - /* Better subpixel precision increases the efficiency of small - * primitive culling. */ - unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; - float small_prim_cull_precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - small_prim_cull_precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - small_prim_cull_precision = num_samples / 1024.0; - else - small_prim_cull_precision = num_samples / 256.0; - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. */ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - } - } - - /* Set shader registers. 
*/ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | - S_00B848_DX10_CLAMP(1)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, - WAVES_PER_TG, - MAX_WAVES_PER_SH, - THREADGROUPS_PER_CU)); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - - si_cp_wait_mem(sctx, gfx_cs, - sctx->compute_rewind_va | - (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, - WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - - /* Continue with the compute IB. 
*/ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && - info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, - VERTEX_COUNTER_GDS_MODE == 0 ? count_va : - VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : - start_prim | - gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | - (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(small_prim_cull_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. */ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, - EOP_DATA_SEL_GDS, - NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), - SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. 
*/ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer(sctx, cs, NULL, offset, - GDS_SIZE_UNORDERED - offset, - 0, - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC | - SI_CPDMA_SKIP_SYNC_BEFORE, - SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } -} diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c index b7aece564..ca2230620 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -100,22 +100,22 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui radeon_begin(cs); if (sctx->chip_class >= GFX7) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); } else { header |= S_411_SRC_ADDR_HI(src_va >> 32); - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(header); /* SRC_ADDR_HI [15:0] + flags. */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(command); } /* CP DMA is executed in ME, but index buffers are read by PFP. @@ -124,8 +124,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui * should precede it. */ if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(0); } radeon_end(); } @@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { + if (coher == SI_COHERENCY_SHADER) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } /** @@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... 
*/ - if (dst && src && (dst != src || dst_offset != src_offset)) { + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, @@ -423,13 +419,13 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(address); /* SRC_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(address); /* DST_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); radeon_end(); } @@ -495,11 +491,11 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned uint64_t va = buf->gpu_address + offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); - radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit_array(cs, (const uint32_t *)data, size / 4); + radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); + radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit_array((const uint32_t *)data, size / 4); radeon_end(); } @@ -519,11 +515,11 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned uint64_t src_va = (src ? 
src->gpu_address : 0ull) + src_offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, src_va); - radeon_emit(cs, src_va >> 32); - radeon_emit(cs, dst_va); - radeon_emit(cs, dst_va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); + radeon_emit(src_va); + radeon_emit(src_va >> 32); + radeon_emit(dst_va); + radeon_emit(dst_va >> 32); radeon_end(); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c index bcc8baa93..540206c15 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c @@ -344,7 +344,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -390,13 +389,18 @@ static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begi fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); } +void si_print_current_ib(struct si_context *sctx, FILE *f) +{ + si_parse_current_ib(f, &sctx->gfx_cs, 0, sctx->gfx_cs.prev_dw + sctx->gfx_cs.current.cdw, + NULL, 0, "GFX", sctx->chip_class); +} + static void si_log_chunk_type_cs_print(void *data, FILE *f) { struct si_log_chunk_cs *chunk = data; struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -404,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) */ uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -429,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs.priv); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0, - "Compute IB", ctx->chip_class, NULL, NULL); - } else { - si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", - ctx->chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing. 
Time: "); util_dump_ns(f, scs->time_flush); @@ -462,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; - unsigned compute_cur = 0; - - if (ctx->prim_discard_compute_cs.priv) - compute_cur = - ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw; - if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -481,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c index 60daaeb07..f02855743 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c @@ -231,15 +231,6 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_reso priority = si_get_sampler_view_priority(&tex->buffer); radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem); - - if (resource->target == PIPE_BUFFER) - return; - - /* Add separate DCC. */ - if (tex->dcc_separate_buffer) { - radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage, - RADEON_PRIO_SEPARATE_META, check_mem); - } } static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers) @@ -296,7 +287,8 @@ static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, ui void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, const struct legacy_surf_level *base_level_info, unsigned base_level, unsigned first_level, unsigned block_width, - bool is_stencil, uint16_t access, uint32_t *state) + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ + bool is_stencil, uint16_t access, uint32_t * restrict state) { uint64_t va, meta_va = 0; @@ -318,7 +310,6 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture } state[0] = va >> 8; - state[1] &= C_008F14_BASE_ADDRESS_HI; state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); /* Only macrotiled modes can set tile swizzle. @@ -328,11 +319,8 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[0] |= tex->surface.tile_swizzle; if (sscreen->info.chip_class >= GFX8) { - state[6] &= C_008F28_COMPRESSION_EN; - if (!(access & SI_IMAGE_ACCESS_DCC_OFF) && vi_dcc_enabled(tex, first_level)) { - meta_va = - (!tex->dcc_separate_buffer ? 
tex->buffer.gpu_address : 0) + tex->surface.meta_offset; + meta_va = tex->buffer.gpu_address + tex->surface.meta_offset; if (sscreen->info.chip_class == GFX8) { meta_va += tex->surface.u.legacy.color.dcc_level[base_level].dcc_offset; @@ -355,17 +343,12 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[7] = meta_va >> 8; if (sscreen->info.chip_class >= GFX10) { - state[3] &= C_00A00C_SW_MODE; - if (is_stencil) { state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode); } else { state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.swizzle_mode); } - state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED & - C_00A018_WRITE_COMPRESS_ENABLE; - if (meta_va) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, @@ -377,14 +360,21 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8) | - S_00A018_WRITE_COMPRESS_ENABLE((access & SI_IMAGE_ACCESS_DCC_WRITE) != 0); + /* DCC image stores require the following settings: + * - INDEPENDENT_64B_BLOCKS = 0 + * - INDEPENDENT_128B_BLOCKS = 1 + * - MAX_COMPRESSED_BLOCK_SIZE = 128B + * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used) + * + * The same limitations apply to SDMA compressed stores because + * SDMA uses the same DCC codec. + */ + S_00A018_WRITE_COMPRESS_ENABLE(ac_surface_supports_dcc_image_stores(sscreen->info.chip_class, &tex->surface) && + (access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)); } state[7] = meta_va >> 16; } else if (sscreen->info.chip_class == GFX9) { - state[3] &= C_008F1C_SW_MODE; - state[4] &= C_008F20_PITCH; - if (is_stencil) { state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode); state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.zs.stencil_epitch); @@ -423,9 +413,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture unsigned pitch = base_level_info->nblk_x * block_width; unsigned index = si_tile_mode_index(tex, base_level, is_stencil); - state[3] &= C_008F1C_TILING_INDEX; state[3] |= S_008F1C_TILING_INDEX(index); - state[4] &= C_008F20_PITCH; state[4] |= S_008F20_PITCH(pitch - 1); } @@ -451,13 +439,23 @@ static void si_set_sampler_state_desc(struct si_sampler_state *sstate, } static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview, - struct si_sampler_state *sstate, uint32_t *desc) + struct si_sampler_state *sstate, + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ + uint32_t * restrict desc) { struct pipe_sampler_view *view = &sview->base; struct si_texture *tex = (struct si_texture *)view->texture; - bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; - if (unlikely(!is_buffer && sview->dcc_incompatible)) { + assert(tex); /* views with texture == NULL aren't supported */ + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + memcpy(desc, sview->state, 8 * 4); + memcpy(desc + 8, null_texture_descriptor, 4 * 4); /* Disable FMASK. 
*/ + si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); + return; + } + + if (unlikely(sview->dcc_incompatible)) { if (vi_dcc_enabled(tex, view->u.tex.first_level)) if (!si_texture_disable_dcc(sctx, tex)) si_decompress_dcc(sctx, tex); @@ -465,27 +463,21 @@ static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_ sview->dcc_incompatible = false; } - assert(tex); /* views with texture == NULL aren't supported */ - memcpy(desc, sview->state, 8 * 4); + bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; - if (is_buffer) { - si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); - } else { - bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, - sview->base.u.tex.first_level, sview->block_width, - is_separate_stencil, 0, desc); - } + memcpy(desc, sview->state, 8 * 4); + si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, + sview->base.u.tex.first_level, sview->block_width, + is_separate_stencil, 0, desc); - if (!is_buffer && tex->surface.fmask_size) { + if (tex->surface.fmask_size) { memcpy(desc + 8, sview->fmask_state, 8 * 4); } else { /* Disable FMASK and bind sampler state in [12:15]. */ memcpy(desc + 8, null_texture_descriptor, 4 * 4); if (sstate) - si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12); + si_set_sampler_state_desc(sstate, sview, tex, desc + 12); } } @@ -508,65 +500,106 @@ static bool depth_needs_decompression(struct si_texture *tex) return tex->db_compatible; } -static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot, - struct pipe_sampler_view *view, bool disallow_early_out) +static void si_reset_sampler_view_slot(struct si_samplers *samplers, unsigned slot, + uint32_t * restrict desc) +{ + pipe_sampler_view_reference(&samplers->views[slot], NULL); + memcpy(desc, null_texture_descriptor, 8 * 4); + /* Only clear the lower dwords of FMASK. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + /* Re-set the sampler state if we are transitioning from FMASK. */ + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); +} + +static void si_set_sampler_views(struct si_context *sctx, unsigned shader, + unsigned start_slot, unsigned count, + unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views, + bool disallow_early_out) { struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_sampler_view *sview = (struct si_sampler_view *)view; struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); - unsigned desc_slot = si_get_sampler_slot(slot); - uint32_t *desc = descs->list + desc_slot * 16; + uint32_t unbound_mask = 0; - if (samplers->views[slot] == view && !disallow_early_out) - return; + if (views) { + for (unsigned i = 0; i < count; i++) { + unsigned slot = start_slot + i; + struct si_sampler_view *sview = (struct si_sampler_view *)views[i]; + unsigned desc_slot = si_get_sampler_slot(slot); + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. 
*/ + uint32_t *restrict desc = descs->list + desc_slot * 16; + + if (samplers->views[slot] == &sview->base && !disallow_early_out) { + if (take_ownership) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } + continue; + } - if (view) { - struct si_texture *tex = (struct si_texture *)view->texture; + if (sview) { + struct si_texture *tex = (struct si_texture *)sview->base.texture; + + si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (depth_needs_decompression(tex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } - si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + if (take_ownership) { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + samplers->views[slot] = &sview->base; + } else { + pipe_sampler_view_reference(&samplers->views[slot], &sview->base); + } + samplers->enabled_mask |= 1u << slot; - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } else { - if (depth_needs_decompression(tex)) { - samplers->needs_depth_decompress_mask |= 1u << slot; + /* Since this can flush, it must be done after enabled_mask is + * updated. */ + si_sampler_view_add_buffer(sctx, &tex->buffer.b.b, RADEON_USAGE_READ, + sview->is_stencil_sampler, true); } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); + si_reset_sampler_view_slot(samplers, slot, desc); + unbound_mask |= 1u << slot; } - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - if (vi_dcc_enabled(tex, view->u.tex.first_level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; } - - pipe_sampler_view_reference(&samplers->views[slot], view); - samplers->enabled_mask |= 1u << slot; - - /* Since this can flush, it must be done after enabled_mask is - * updated. */ - si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler, - true); } else { - pipe_sampler_view_reference(&samplers->views[slot], NULL); - memcpy(desc, null_texture_descriptor, 8 * 4); - /* Only clear the lower dwords of FMASK. */ - memcpy(desc + 8, null_texture_descriptor, 4 * 4); - /* Re-set the sampler state if we are transitioning from FMASK. 
*/ - if (samplers->sampler_states[slot]) - si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); + unbind_num_trailing_slots += count; + count = 0; + } - samplers->enabled_mask &= ~(1u << slot); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); + for (unsigned i = 0; i < unbind_num_trailing_slots; i++) { + unsigned slot = start_slot + count + i; + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t * restrict desc = descs->list + desc_slot * 16; + + if (samplers->views[slot]) + si_reset_sampler_view_slot(samplers, slot, desc); } + unbound_mask |= BITFIELD_RANGE(start_slot + count, unbind_num_trailing_slots); + samplers->enabled_mask &= ~unbound_mask; + samplers->needs_depth_decompress_mask &= ~unbound_mask; + samplers->needs_color_decompress_mask &= ~unbound_mask; + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } @@ -582,28 +615,18 @@ static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsi sctx->shader_needs_decompress_mask &= ~shader_bit; } -static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, - unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) +static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start, unsigned count, + unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct si_context *sctx = (struct si_context *)ctx; - int i; if ((!count && !unbind_num_trailing_slots) || shader >= SI_NUM_SHADERS) return; - if (views) { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, views[i], false); - } else { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - } - - for (; i < count + unbind_num_trailing_slots; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - + si_set_sampler_views(sctx, shader, start, count, unbind_num_trailing_slots, + take_ownership, views, false); si_update_shader_needs_decompress_mask(sctx, shader); } @@ -710,7 +733,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i res = si_resource(view->resource); - if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (res->b.b.target == PIPE_BUFFER) { if (view->access & PIPE_IMAGE_ACCESS_WRITE) si_mark_image_range_valid(view); @@ -725,12 +748,15 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i bool uses_dcc = vi_dcc_enabled(tex, level); unsigned access = view->access; + if (uses_dcc && screen->always_allow_dcc_stores) + access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE; + assert(!tex->is_depth); assert(fmask_desc || tex->surface.fmask_offset == 0); if (uses_dcc && !skip_decompress && !(access & SI_IMAGE_ACCESS_DCC_OFF) && - ((!(access & SI_IMAGE_ACCESS_DCC_WRITE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || + ((!(access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { /* If DCC can't be disabled, at least decompress it. 
* The decompression is relatively cheap if the surface @@ -766,7 +792,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc); si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level, util_format_get_blockwidth(view->format), - false, view->access, desc); + false, access, desc); } } @@ -790,7 +816,7 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne if (&images->views[slot] != view) util_copy_image_view(&images->views[slot], view); - if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (res->b.b.target == PIPE_BUFFER) { images->needs_color_decompress_mask &= ~(1 << slot); images->display_dcc_store_mask &= ~(1u << slot); res->bind_history |= PIPE_BIND_SHADER_IMAGE; @@ -804,10 +830,15 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne images->needs_color_decompress_mask &= ~(1 << slot); } - if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) + if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) { images->display_dcc_store_mask |= 1u << slot; - else + + /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). */ + if (shader != PIPE_SHADER_COMPUTE) + tex->displayable_dcc_dirty = true; + } else { images->display_dcc_store_mask &= ~(1u << slot); + } if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) ctx->need_check_render_feedback = true; @@ -992,7 +1023,8 @@ static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_ty /* BUFFER RESOURCES */ -static void si_init_buffer_resources(struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_context *sctx, + struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, short shader_userdata_rel_index, enum radeon_bo_priority priority, @@ -1004,6 +1036,22 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers, buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); + + /* Initialize buffer descriptors, so that we don't have to do it at bind time. 
*/ + for (unsigned i = 0; i < num_buffers; i++) { + uint32_t *desc = descs->list + i * 4; + + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + } } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -1145,7 +1193,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res } } else { if (take_ownership) { - pipe_resource_reference(&buffer, NULL); buffer = input->buffer; } else { pipe_resource_reference(&buffer, input->buffer); @@ -1160,16 +1207,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); desc[2] = input->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } buffers->buffers[slot] = buffer; buffers->offsets[slot] = buffer_offset; @@ -1177,14 +1214,27 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res buffers->priority_constbuf, true); buffers->enabled_mask |= 1llu << slot; } else { - /* Clear the descriptor. */ - memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 3); buffers->enabled_mask &= ~(1llu << slot); } sctx->descriptors_dirty |= 1u << descriptors_idx; } +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader) +{ + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (sctx->shaders[shader].key.opt.inline_uniforms) { + sctx->shaders[shader].key.opt.inline_uniforms = false; + memset(sctx->shaders[shader].key.opt.inlined_uniform_values, 0, + sizeof(sctx->shaders[shader].key.opt.inlined_uniform_values)); + sctx->do_update_shaders = true; + } +} + static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, uint slot, bool take_ownership, const struct pipe_constant_buffer *input) @@ -1204,10 +1254,8 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; } - if (slot == 0) { - /* Invalidate current inlinable uniforms. 
*/ - sctx->inlinable_uniforms_valid_mask &= ~(1 << shader); - } + if (slot == 0) + si_invalidate_inlinable_uniforms(sctx, shader); } slot = si_get_constbuf_slot(slot); @@ -1222,9 +1270,24 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); - sctx->inlinable_uniforms_valid_mask |= 1 << shader; - sctx->do_update_shaders = true; + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (!sctx->shaders[shader].key.opt.inline_uniforms) { + /* It's the first time we set the constants. Always update shaders. */ + sctx->shaders[shader].key.opt.inline_uniforms = true; + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); + sctx->do_update_shaders = true; + return; + } + + /* We have already set inlinable constants for this shader. Update the shader only if + * the constants are being changed so as not to update shaders needlessly. + */ + if (memcmp(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4)) { + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); + sctx->do_update_shaders = true; + } } void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, @@ -1248,7 +1311,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou if (!sbuffer || !sbuffer->buffer) { pipe_resource_reference(&buffers->buffers[slot], NULL); - memset(desc, 0, sizeof(uint32_t) * 4); + /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */ + memset(desc, 0, sizeof(uint32_t) * 3); buffers->enabled_mask &= ~(1llu << slot); buffers->writable_mask &= ~(1llu << slot); sctx->descriptors_dirty |= 1u << descriptors_idx; @@ -1261,16 +1325,6 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); desc[2] = sbuffer->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); buffers->offsets[slot] = sbuffer->buffer_offset; @@ -1417,7 +1471,7 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1879,7 +1933,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx) if (!view || !view->texture || view->texture->target == PIPE_BUFFER) continue; - si_set_sampler_view(sctx, shader, i, samplers->views[i], true); + si_set_sampler_views(sctx, shader, i, 1, 0, false, &samplers->views[i], true); } si_update_shader_needs_decompress_mask(sctx, shader); @@ -1897,11 +1951,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad 
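si_set_inlinable_constants above now keeps the inlined values inside the shader key and requests a shader update only when they actually change, instead of unconditionally forcing one. The pattern on its own, with invented names rather than the driver's structures:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define MAX_INLINE_UNIFORMS 32

struct inline_key {
   bool     inline_uniforms;              /* have any values been set yet? */
   uint32_t values[MAX_INLINE_UNIFORMS];
};

/* Returns true if a new shader variant has to be selected/compiled. */
static bool set_inlinable_constants(struct inline_key *key,
                                    const uint32_t *values, unsigned count)
{
   if (!key->inline_uniforms) {
      /* First time: always take the values and request an update. */
      key->inline_uniforms = true;
      memcpy(key->values, values, count * sizeof(uint32_t));
      return true;
   }

   /* Later calls: request an update only when something changed. */
   if (memcmp(key->values, values, count * sizeof(uint32_t)) != 0) {
      memcpy(key->values, values, count * sizeof(uint32_t));
      return true;
   }
   return false;
}

In the patch the "needs update" result is expressed by setting sctx->do_update_shaders, and the compute stage returns early.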
u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); if (shader == PIPE_SHADER_VERTEX) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; } si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); @@ -1909,12 +1965,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad void si_shader_pointers_mark_dirty(struct si_context *sctx) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; @@ -1963,6 +2021,36 @@ void si_shader_change_notify(struct si_context *sctx) sctx->shader.gs.cso ? GS_ON : GS_OFF, sctx->ngg ? NGG_ON : NGG_OFF, PIPE_SHADER_TESS_EVAL)); + + /* Update as_* flags in shader keys. Ignore disabled shader stages. + * as_ls = VS before TCS + * as_es = VS before GS or TES before GS + * as_ngg = NGG enabled for the last geometry stage. + * If GS sets as_ngg, the previous stage must set as_ngg too. 
+ */ + if (sctx->shader.tes.cso) { + sctx->shader.vs.key.as_ls = 1; + sctx->shader.vs.key.as_es = 0; + sctx->shader.vs.key.as_ngg = 0; + + if (sctx->shader.gs.cso) { + sctx->shader.tes.key.as_es = 1; + sctx->shader.tes.key.as_ngg = sctx->ngg; + sctx->shader.gs.key.as_ngg = sctx->ngg; + } else { + sctx->shader.tes.key.as_es = 0; + sctx->shader.tes.key.as_ngg = sctx->ngg; + } + } else if (sctx->shader.gs.cso) { + sctx->shader.vs.key.as_ls = 0; + sctx->shader.vs.key.as_es = 1; + sctx->shader.vs.key.as_ngg = sctx->ngg; + sctx->shader.gs.key.as_ngg = sctx->ngg; + } else { + sctx->shader.vs.key.as_ls = 0; + sctx->shader.vs.key.as_es = 0; + sctx->shader.vs.key.as_ngg = sctx->ngg; + } } #define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \ @@ -1977,9 +2065,9 @@ void si_shader_change_notify(struct si_context *sctx) struct si_descriptors *descs = &sctx->descriptors[start]; \ unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \ \ - radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \ + radeon_set_sh_reg_seq(sh_offset, count); \ for (int i = 0; i < count; i++) \ - radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \ + radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \ } \ } \ } while (0) @@ -2070,12 +2158,12 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) { struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_shaderbufs_sgpr_index * 4, num_shaderbufs * 4); for (unsigned i = 0; i < num_shaderbufs; i++) - radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4); + radeon_emit_array(&desc->list[si_get_shaderbuf_slot(i) * 4], 4); sctx->compute_shaderbuf_sgprs_dirty = false; } @@ -2085,7 +2173,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_images && sctx->compute_image_sgprs_dirty) { struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_images_sgpr_index * 4, shader->cs_images_num_sgprs); @@ -2099,7 +2187,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) num_sgprs = 4; } - radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs); + radeon_emit_array(&desc->list[desc_offset], num_sgprs); } sctx->compute_image_sgprs_dirty = false; @@ -2123,8 +2211,7 @@ static void si_init_bindless_descriptors(struct si_context *sctx, struct si_desc sctx->num_bindless_descriptors = 1; /* Track which bindless slots are used (or not). */ - util_idalloc_init(&sctx->bindless_used_slots); - util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + util_idalloc_init(&sctx->bindless_used_slots, num_elements); /* Reserve slot 0 because it's an invalid handle for bindless. 
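Spelling out the as_* rules above for the usual stage combinations (a restatement of the code just shown, where "ngg" stands for 1 when NGG is enabled for the context and 0 otherwise; unbound stages are omitted):

  enabled stages          vs.as_ls  vs.as_es  vs.as_ngg  tes.as_es  tes.as_ngg  gs.as_ngg
  VS only                    0         0        ngg          -          -           -
  VS + GS                    0         1        ngg          -          -          ngg
  VS + TCS + TES             1         0         0           0         ngg          -
  VS + TCS + TES + GS        1         0         0           1         ngg         ngg

Note that a VS acting as LS never sets as_ngg in this scheme; NGG only applies from the tessellation-evaluation or geometry stage onwards.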
*/ desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); @@ -2526,7 +2613,7 @@ void si_init_all_descriptors(struct si_context *sctx) rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; } desc = si_const_and_shader_buffer_descriptors(sctx, i); - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots, + si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i], desc, num_buffer_slots, rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER, RADEON_PRIO_CONST_BUFFER); desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); @@ -2556,7 +2643,7 @@ void si_init_all_descriptors(struct si_context *sctx) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } - si_init_buffer_resources(&sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL], + si_init_buffer_resources(sctx, &sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL], SI_NUM_INTERNAL_BINDINGS, SI_SGPR_INTERNAL_BINDINGS, /* The second priority is used by * const buffers in RW buffer slots. */ @@ -2577,7 +2664,7 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; sctx->b.set_inlinable_constants = si_set_inlinable_constants; sctx->b.set_shader_buffers = si_set_shader_buffers; - sctx->b.set_sampler_views = si_set_sampler_views; + sctx->b.set_sampler_views = si_pipe_set_sampler_views; sctx->b.create_texture_handle = si_create_texture_handle; sctx->b.delete_texture_handle = si_delete_texture_handle; sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c index fc965cd7a..0bee2f7d0 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -26,141 +26,17 @@ #include "si_query.h" #include "util/u_memory.h" -enum si_pc_block_flags -{ - /* This block is part of the shader engine */ - SI_PC_BLOCK_SE = (1 << 0), - - /* Expose per-instance groups instead of summing all instances (within - * an SE). */ - SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), - - /* Expose per-SE groups instead of summing instances across SEs. */ - SI_PC_BLOCK_SE_GROUPS = (1 << 2), - - /* Shader block */ - SI_PC_BLOCK_SHADER = (1 << 3), - - /* Non-shader block with perfcounters windowed by shaders. */ - SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), -}; - -enum si_pc_reg_layout -{ - /* All secondary selector dwords follow as one block after the primary - * selector dwords for the counters that have secondary selectors. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER1_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT1 - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - */ - SI_PC_MULTI_BLOCK = 0, - - /* Each secondary selector dword follows immediately after the - * corresponding primary. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT - * PERFCOUNTER1_SELECT1 - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - */ - SI_PC_MULTI_ALTERNATE = 1, - - /* All secondary selector dwords follow as one block after all primary - * selector dwords. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER1_SELECT - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT1 - */ - SI_PC_MULTI_TAIL = 2, - - /* Free-form arrangement of selector registers. */ - SI_PC_MULTI_CUSTOM = 3, - - SI_PC_MULTI_MASK = 3, - - /* Registers are laid out in decreasing rather than increasing order. 
*/ - SI_PC_REG_REVERSE = 4, - - SI_PC_FAKE = 8, -}; - -struct si_pc_block_base { - const char *name; - unsigned num_counters; - unsigned flags; - - unsigned select_or; - unsigned select0; - unsigned counter0_lo; - unsigned *select; - unsigned *counters; - unsigned num_multi; - unsigned num_prelude; - unsigned layout; -}; - -struct si_pc_block_gfxdescr { - struct si_pc_block_base *b; - unsigned selectors; - unsigned instances; -}; - -struct si_pc_block { - const struct si_pc_block_gfxdescr *b; - unsigned num_instances; - - unsigned num_groups; - char *group_names; - unsigned group_name_stride; - - char *selector_names; - unsigned selector_name_stride; -}; - -/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of - * performance counter group IDs. - */ -static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS", - "_PS", "_LS", "_HS", "_CS"}; - -static const unsigned si_pc_shader_type_bits[] = { - 0x7f, - S_036780_ES_EN(1), - S_036780_GS_EN(1), - S_036780_VS_EN(1), - S_036780_PS_EN(1), - S_036780_LS_EN(1), - S_036780_HS_EN(1), - S_036780_CS_EN(1), -}; - -/* Max counters per HW block */ -#define SI_QUERY_MAX_COUNTERS 16 - -#define SI_PC_SHADERS_WINDOWING (1u << 31) +#include "ac_perfcounter.h" struct si_query_group { struct si_query_group *next; - struct si_pc_block *block; + struct ac_pc_block *block; unsigned sub_gid; /* only used during init */ unsigned result_base; /* only used during init */ int se; int instance; unsigned num_counters; - unsigned selectors[SI_QUERY_MAX_COUNTERS]; + unsigned selectors[AC_QUERY_MAX_COUNTERS]; }; struct si_query_counter { @@ -182,525 +58,6 @@ struct si_query_pc { struct si_query_group *groups; }; -static struct si_pc_block_base cik_CB = { - .name = "CB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037000_CB_PERFCOUNTER_FILTER, - .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, - .num_multi = 1, - .num_prelude = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static unsigned cik_CPC_select[] = { - R_036024_CPC_PERFCOUNTER0_SELECT, - R_036010_CPC_PERFCOUNTER0_SELECT1, - R_03600C_CPC_PERFCOUNTER1_SELECT, -}; -static struct si_pc_block_base cik_CPC = { - .name = "CPC", - .num_counters = 2, - - .select = cik_CPC_select, - .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_CPF = { - .name = "CPF", - .num_counters = 2, - - .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, - .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_CPG = { - .name = "CPG", - .num_counters = 2, - - .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_DB = { - .name = "DB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037100_DB_PERFCOUNTER0_SELECT, - .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, - .num_multi = 3, // really only 2, but there's a gap between registers - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_GDS = { - .name = "GDS", - .num_counters = 4, - - .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, - .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static unsigned cik_GRBM_counters[] = { - 
R_034100_GRBM_PERFCOUNTER0_LO, - R_03410C_GRBM_PERFCOUNTER1_LO, -}; -static struct si_pc_block_base cik_GRBM = { - .name = "GRBM", - .num_counters = 2, - - .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, - .counters = cik_GRBM_counters, -}; - -static struct si_pc_block_base cik_GRBMSE = { - .name = "GRBMSE", - .num_counters = 4, - - .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, - .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, -}; - -static struct si_pc_block_base cik_IA = { - .name = "IA", - .num_counters = 4, - - .select0 = R_036210_IA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_PA_SC = { - .name = "PA_SC", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -/* According to docs, PA_SU counters are only 48 bits wide. */ -static struct si_pc_block_base cik_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_SPI = { - .name = "SPI", - .num_counters = 6, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, - .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_BLOCK, -}; - -static struct si_pc_block_base cik_SQ = { - .name = "SQ", - .num_counters = 16, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, - - .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, - .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15), - .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, -}; - -static struct si_pc_block_base cik_SX = { - .name = "SX", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036900_SX_PERFCOUNTER0_SELECT, - .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_TA = { - .name = "TA", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TD = { - .name = "TD", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCA = { - .name = "TCA", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCC = { - .name = "TCC", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCP = { - .name = "TCP", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, - .counter0_lo = 
R_034D00_TCP_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_VGT = { - .name = "VGT", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, - .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_WD = { - .name = "WD", - .num_counters = 4, - - .select0 = R_036200_WD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034200_WD_PERFCOUNTER0_LO, -}; - -static struct si_pc_block_base cik_MC = { - .name = "MC", - .num_counters = 4, - - .layout = SI_PC_FAKE, -}; - -static struct si_pc_block_base cik_SRBM = { - .name = "SRBM", - .num_counters = 2, - - .layout = SI_PC_FAKE, -}; - -static struct si_pc_block_base gfx10_CHA = { - .name = "CHA", - .num_counters = 4, - - .select0 = R_037780_CHA_PERFCOUNTER0_SELECT, - .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_CHCG = { - .name = "CHCG", - .num_counters = 4, - - .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_CHC = { - .name = "CHC", - .num_counters = 4, - - .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GCR = { - .name = "GCR", - .num_counters = 2, - - .select0 = R_037580_GCR_PERFCOUNTER0_SELECT, - .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GE = { - .name = "GE", - .num_counters = 12, - - .select0 = R_036200_GE_PERFCOUNTER0_SELECT, - .counter0_lo = R_034200_GE_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL1A = { - .name = "GL1A", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT, - .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL1C = { - .name = "GL1C", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL2A = { - .name = "GL2A", - .num_counters = 4, - - .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL2C = { - .name = "GL2C", - .num_counters = 4, - - .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static unsigned gfx10_PA_PH_select[] = { - R_037600_PA_PH_PERFCOUNTER0_SELECT, - R_037604_PA_PH_PERFCOUNTER0_SELECT1, - R_037608_PA_PH_PERFCOUNTER1_SELECT, - R_037640_PA_PH_PERFCOUNTER1_SELECT1, - R_03760C_PA_PH_PERFCOUNTER2_SELECT, - R_037644_PA_PH_PERFCOUNTER2_SELECT1, - R_037610_PA_PH_PERFCOUNTER3_SELECT, - R_037648_PA_PH_PERFCOUNTER3_SELECT1, - R_037614_PA_PH_PERFCOUNTER4_SELECT, - R_037618_PA_PH_PERFCOUNTER5_SELECT, - R_03761C_PA_PH_PERFCOUNTER6_SELECT, - R_037620_PA_PH_PERFCOUNTER7_SELECT, -}; -static 
struct si_pc_block_base gfx10_PA_PH = { - .name = "PA_PH", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select = gfx10_PA_PH_select, - .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_CUSTOM, -}; - -static struct si_pc_block_base gfx10_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_RLC = { - .name = "RLC", - .num_counters = 2, - - .select0 = R_037304_RLC_PERFCOUNTER0_SELECT, - .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO, - .num_multi = 0, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_RMI = { - .name = "RMI", - /* Actually 4, but the 2nd counter is missing the secondary selector while - * the 3rd counter has it, which complicates the register layout. */ - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037400_RMI_PERFCOUNTER0_SELECT, - .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_UTCL1 = { - .name = "UTCL1", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT, - .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO, - .num_multi = 0, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -/* Both the number of instances and selectors varies between chips of the same - * class. We only differentiate by class here and simply expose the maximum - * number over all chips in a class. - * - * Unfortunately, GPUPerfStudio uses the order of performance counter groups - * blindly once it believes it has identified the hardware, so the order of - * blocks here matters. 
- */ -static struct si_pc_block_gfxdescr groups_CIK[] = { - {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, - {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32}, - {&cik_TA, 111}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55}, {&cik_TCP, 154}, - {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19}, - {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22}, - -}; - -static struct si_pc_block_gfxdescr groups_VI[] = { - {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, - {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34}, - {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55}, {&cik_TCP, 180}, - {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27}, - {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24}, - -}; - -static struct si_pc_block_gfxdescr groups_gfx9[] = { - {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16}, - {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208}, - {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57}, {&cik_TCP, 85}, - {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59}, - {&cik_CPC, 35}, -}; - -static struct si_pc_block_gfxdescr groups_gfx10[] = { - {&cik_CB, 461}, - {&gfx10_CHA, 45}, - {&gfx10_CHCG, 35}, - {&gfx10_CHC, 35}, - {&cik_CPC, 47}, - {&cik_CPF, 40}, - {&cik_CPG, 82}, - {&cik_DB, 370}, - {&gfx10_GCR, 94}, - {&cik_GDS, 123}, - {&gfx10_GE, 315}, - {&gfx10_GL1A, 36}, - {&gfx10_GL1C, 64}, - {&gfx10_GL2A, 91}, - {&gfx10_GL2C, 235}, - {&cik_GRBM, 47}, - {&cik_GRBMSE, 19}, - {&gfx10_PA_PH, 960}, - {&cik_PA_SC, 552}, - {&gfx10_PA_SU, 266}, - {&gfx10_RLC, 7}, - {&gfx10_RMI, 258}, - {&cik_SPI, 329}, - {&cik_SQ, 509}, - {&cik_SX, 225}, - {&cik_TA, 226}, - {&cik_TCP, 77}, - {&cik_TD, 61}, - {&gfx10_UTCL1, 15}, -}; - -static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) -{ - return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || - (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); -} - -static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) -{ - return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || - (block->num_instances > 1 && pc->separate_instance); -} - -static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index, - unsigned *base_gid, unsigned *sub_index) -{ - struct si_pc_block *block = pc->blocks; - unsigned bid; - - *base_gid = 0; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - unsigned total = block->num_groups * block->b->selectors; - - if (index < total) { - *sub_index = index; - return block; - } - - index -= total; - *base_gid += block->num_groups; - } - - return NULL; -} - -static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index) -{ - unsigned bid; - struct si_pc_block *block = pc->blocks; - - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - if (*index < block->num_groups) - return block; - *index -= block->num_groups; - } - - return NULL; -} - static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -724,7 +81,7 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) } radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + 
radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value); radeon_end(); } @@ -733,105 +90,37 @@ static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false); - radeon_emit(cs, shaders & 0x7f); - radeon_emit(cs, 0xffffffff); + radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false); + radeon_emit(shaders & 0x7f); + radeon_emit(0xffffffff); radeon_end(); } -static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count, +static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count, unsigned *selectors) { - struct si_pc_block_base *regs = block->b->b; + struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned idx; - unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; - unsigned dw; assert(count <= regs->num_counters); - if (regs->layout & SI_PC_FAKE) + /* Fake counters. */ + if (!regs->select0) return; radeon_begin(cs); - if (layout_multi == SI_PC_MULTI_BLOCK) { - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - dw = count + regs->num_prelude; - if (count >= regs->num_multi) - dw += regs->num_multi; - radeon_set_uconfig_reg_seq(cs, regs->select0, dw, false); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - if (count < regs->num_multi) { - unsigned select1 = regs->select0 + 4 * regs->num_multi; - radeon_set_uconfig_reg_seq(cs, select1, count, false); - } - - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) { + radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false); + radeon_emit(selectors[idx] | regs->select_or); + } - if (count > regs->num_multi) { - for (idx = regs->num_multi; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - } - } else if (layout_multi == SI_PC_MULTI_TAIL) { - unsigned select1, select1_count; - - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude, false); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - select1 = regs->select0 + 4 * regs->num_counters; - select1_count = MIN2(count, regs->num_multi); - radeon_set_uconfig_reg_seq(cs, select1, select1_count, false); - for (idx = 0; idx < select1_count; ++idx) - radeon_emit(cs, 0); - } else if (layout_multi == SI_PC_MULTI_CUSTOM) { - unsigned *reg = regs->select; - for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_set_uconfig_reg(cs, *reg++, 0); - } - } else { - assert(layout_multi == SI_PC_MULTI_ALTERNATE); - - unsigned reg_base = regs->select0; - unsigned reg_count = count + MIN2(count, regs->num_multi); - reg_count += regs->num_prelude; - - if (!(regs->layout & SI_PC_REG_REVERSE)) { - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, false); - - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_emit(cs, 0); - } - } else { - reg_base -= (reg_count - 1) * 4; - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, 
false); - - for (idx = count; idx > 0; --idx) { - if (idx <= regs->num_multi) - radeon_emit(cs, 0); - radeon_emit(cs, selectors[idx - 1] | regs->select_or); - } - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - } + for (idx = 0; idx < regs->num_spm_counters; idx++) { + radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false); + radeon_emit(0); } + radeon_end(); } @@ -843,11 +132,11 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer COPY_DATA_IMM, NULL, 1); radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); radeon_end(); } @@ -863,20 +152,20 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); radeon_set_uconfig_reg( - cs, R_036020_CP_PERFMON_CNTL, + R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); radeon_end(); } -static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count, +static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count, uint64_t va) { - struct si_pc_block_base *regs = block->b->b; + struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned idx; unsigned reg = regs->counter0_lo; @@ -884,33 +173,31 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, radeon_begin(cs); - if (!(regs->layout & SI_PC_FAKE)) { - if (regs->layout & SI_PC_REG_REVERSE) - reg_delta = -reg_delta; - + if (regs->select0) { for (idx = 0; idx < count; ++idx) { if (regs->counters) reg = regs->counters[idx]; - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_COUNT_SEL); /* 64 bits */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(reg >> 2); + radeon_emit(0); /* unused */ + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); reg += reg_delta; } } else { + /* Fake counters. 
*/ for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); - radeon_emit(cs, 0); /* immediate */ - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); + radeon_emit(0); /* immediate */ + radeon_emit(0); + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); } } @@ -938,10 +225,10 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b radeon_begin(&sctx->gfx_cs); if (sctx->chip_class >= GFX10) { - radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit)); } else if (sctx->chip_class >= GFX8) { - radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit)); } radeon_end(); @@ -966,7 +253,7 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery) si_inhibit_clockgating(sctx, &sctx->gfx_cs, true); for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; if (group->se != current_se || group->instance != current_instance) { current_se = group->se; @@ -997,11 +284,11 @@ static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery si_pc_emit_stop(sctx, query->buffer.buf, va); for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; unsigned se = group->se >= 0 ? 
group->se : 0; unsigned se_end = se + 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0)) se_end = sctx->screen->info.max_se; do { @@ -1102,8 +389,9 @@ static const struct si_query_ops batch_query_ops = { }; static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query, - struct si_pc_block *block, unsigned sub_gid) + struct ac_pc_block *block, unsigned sub_gid) { + struct si_perfcounters *pc = screen->perfcounters; struct si_query_group *group = query->groups; while (group) { @@ -1119,20 +407,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s group->block = block; group->sub_gid = sub_gid; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + if (block->b->b->flags & AC_PC_BLOCK_SHADER) { unsigned sub_gids = block->num_instances; unsigned shader_id; unsigned shaders; unsigned query_shaders; - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) + if (ac_pc_block_has_per_se_groups(&pc->base, block)) sub_gids = sub_gids * screen->info.max_se; shader_id = sub_gid / sub_gids; sub_gid = sub_gid % sub_gids; - shaders = si_pc_shader_type_bits[shader_id]; + shaders = ac_pc_shader_type_bits[shader_id]; - query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; + query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING; if (query_shaders && query_shaders != shaders) { fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); FREE(group); @@ -1141,20 +429,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s query->shaders = shaders; } - if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { + if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { // A non-zero value in query->shaders ensures that the shader // masking is reset unless the user explicitly requests one. 
- query->shaders = SI_PC_SHADERS_WINDOWING; + query->shaders = AC_PC_SHADERS_WINDOWING; } - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { + if (ac_pc_block_has_per_se_groups(&pc->base, block)) { group->se = sub_gid / block->num_instances; sub_gid = sub_gid % block->num_instances; } else { group->se = -1; } - if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { + if (ac_pc_block_has_per_instance_groups(&pc->base, block)) { group->instance = sub_gid; } else { group->instance = -1; @@ -1171,7 +459,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ { struct si_screen *screen = (struct si_screen *)ctx->screen; struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; struct si_query_group *group; struct si_query_pc *query; unsigned base_gid, sub_gid, sub_index; @@ -1196,7 +484,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ goto error; block = - lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); if (!block) goto error; @@ -1221,11 +509,11 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ i = 0; for (group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; unsigned read_dw; unsigned instances = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0) instances = screen->info.max_se; if (group->instance < 0) instances *= block->num_instances; @@ -1240,7 +528,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ } if (query->shaders) { - if (query->shaders == SI_PC_SHADERS_WINDOWING) + if (query->shaders == AC_PC_SHADERS_WINDOWING) query->shaders = 0xffffffff; } @@ -1248,10 +536,10 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ query->counters = CALLOC(num_queries, sizeof(*query->counters)); for (i = 0; i < num_queries; ++i) { struct si_query_counter *counter = &query->counters[i]; - struct si_pc_block *block; + struct ac_pc_block *block; block = - lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); sub_gid = sub_index / block->b->selectors; sub_index = sub_index % block->b->selectors; @@ -1268,7 +556,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ counter->stride = group->num_counters; counter->qwords = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0) counter->qwords = screen->info.max_se; if (group->instance < 0) counter->qwords *= block->num_instances; @@ -1281,96 +569,11 @@ error: return NULL; } -static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block) -{ - bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); - bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); - unsigned i, j, k; - unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; - unsigned namelen; - char *groupname; - char *p; - - if (per_instance_groups) - groups_instance = block->num_instances; - if (per_se_groups) - groups_se = 
screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); - - namelen = strlen(block->b->b->name); - block->group_name_stride = namelen + 1; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->group_name_stride += 3; - if (per_se_groups) { - assert(groups_se <= 10); - block->group_name_stride += 1; - - if (per_instance_groups) - block->group_name_stride += 1; - } - if (per_instance_groups) { - assert(groups_instance <= 100); - block->group_name_stride += 2; - } - - block->group_names = MALLOC(block->num_groups * block->group_name_stride); - if (!block->group_names) - return false; - - groupname = block->group_names; - for (i = 0; i < groups_shader; ++i) { - const char *shader_suffix = si_pc_shader_type_suffixes[i]; - unsigned shaderlen = strlen(shader_suffix); - for (j = 0; j < groups_se; ++j) { - for (k = 0; k < groups_instance; ++k) { - strcpy(groupname, block->b->b->name); - p = groupname + namelen; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - strcpy(p, shader_suffix); - p += shaderlen; - } - - if (per_se_groups) { - p += sprintf(p, "%d", j); - if (per_instance_groups) - *p++ = '_'; - } - - if (per_instance_groups) - p += sprintf(p, "%d", k); - - groupname += block->group_name_stride; - } - } - } - - assert(block->b->selectors <= 1000); - block->selector_name_stride = block->group_name_stride + 4; - block->selector_names = - MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride); - if (!block->selector_names) - return false; - - groupname = block->group_names; - p = block->selector_names; - for (i = 0; i < block->num_groups; ++i) { - for (j = 0; j < block->b->selectors; ++j) { - sprintf(p, "%s_%03d", groupname, j); - p += block->selector_name_stride; - } - groupname += block->group_name_stride; - } - - return true; -} - int si_get_perfcounter_info(struct si_screen *screen, unsigned index, struct pipe_driver_query_info *info) { struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; unsigned base_gid, sub; if (!pc) @@ -1379,19 +582,19 @@ int si_get_perfcounter_info(struct si_screen *screen, unsigned index, if (!info) { unsigned bid, num_queries = 0; - for (bid = 0; bid < pc->num_blocks; ++bid) { - num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups; + for (bid = 0; bid < pc->base.num_blocks; ++bid) { + num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups; } return num_queries; } - block = lookup_counter(pc, index, &base_gid, &sub); + block = ac_lookup_counter(&pc->base, index, &base_gid, &sub); if (!block) return 0; if (!block->selector_names) { - if (!si_init_block_names(screen, block)) + if (!ac_init_block_names(&screen->info, &pc->base, block)) return 0; } info->name = block->selector_names + sub * block->selector_name_stride; @@ -1410,20 +613,20 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, struct pipe_driver_query_group_info *info) { struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; if (!pc) return 0; if (!info) - return pc->num_groups; + return pc->base.num_groups; - block = lookup_group(pc, &index); + block = ac_lookup_group(&pc->base, &index); if (!block) return 0; if (!block->group_names) { - if (!si_init_block_names(screen, block)) + if (!ac_init_block_names(&screen->info, &pc->base, block)) return 0; } info->name = block->group_names + index * block->group_name_stride; @@ -1435,100 
+638,31 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, void si_destroy_perfcounters(struct si_screen *screen) { struct si_perfcounters *pc = screen->perfcounters; - unsigned i; if (!pc) return; - for (i = 0; i < pc->num_blocks; ++i) { - FREE(pc->blocks[i].group_names); - FREE(pc->blocks[i].selector_names); - } - FREE(pc->blocks); + ac_destroy_perfcounters(&pc->base); FREE(pc); screen->perfcounters = NULL; } void si_init_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc; - const struct si_pc_block_gfxdescr *blocks; - unsigned num_blocks; - unsigned i; - - switch (screen->info.chip_class) { - case GFX7: - blocks = groups_CIK; - num_blocks = ARRAY_SIZE(groups_CIK); - break; - case GFX8: - blocks = groups_VI; - num_blocks = ARRAY_SIZE(groups_VI); - break; - case GFX9: - blocks = groups_gfx9; - num_blocks = ARRAY_SIZE(groups_gfx9); - break; - case GFX10: - case GFX10_3: - blocks = groups_gfx10; - num_blocks = ARRAY_SIZE(groups_gfx10); - break; - case GFX6: - default: - return; /* not implemented */ - } - - screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); - if (!pc) - return; + bool separate_se, separate_instance; - pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); - pc->num_instance_cs_dwords = 3; - - pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); - pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - - pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); - if (!pc->blocks) - goto error; - pc->num_blocks = num_blocks; - - for (i = 0; i < num_blocks; ++i) { - struct si_pc_block *block = &pc->blocks[i]; - block->b = &blocks[i]; - block->num_instances = MAX2(1, block->b->instances); - - if (!strcmp(block->b->b->name, "CB") || - !strcmp(block->b->b->name, "DB") || - !strcmp(block->b->b->name, "RMI")) - block->num_instances = screen->info.max_se; - else if (!strcmp(block->b->b->name, "TCC")) - block->num_instances = screen->info.max_tcc_blocks; - else if (!strcmp(block->b->b->name, "IA")) - block->num_instances = MAX2(1, screen->info.max_se / 2); - else if (!strcmp(block->b->b->name, "TA") || - !strcmp(block->b->b->name, "TCP") || - !strcmp(block->b->b->name, "TD")) { - block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa); - } + separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - if (si_pc_block_has_per_instance_groups(pc, block)) { - block->num_groups = block->num_instances; - } else { - block->num_groups = 1; - } + screen->perfcounters = CALLOC_STRUCT(si_perfcounters); + if (!screen->perfcounters) + return; - if (si_pc_block_has_per_se_groups(pc, block)) - block->num_groups *= screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); + screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); + screen->perfcounters->num_instance_cs_dwords = 3; - pc->num_groups += block->num_groups; + if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance, + &screen->perfcounters->base)) { + si_destroy_perfcounters(screen); } - - return; - -error: - si_destroy_perfcounters(screen); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c index 6196f2158..b812f170c 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c @@ -35,6 +35,7 @@ 
#include "sid.h" #include "ac_shadowed_regs.h" #include "util/disk_cache.h" +#include "util/u_cpu_detect.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_suballoc.h" @@ -80,29 +81,25 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"compute", DBG(COMPUTE), "Print compute info"}, {"vm", DBG(VM), "Print virtual addresses when creating resources"}, {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, + {"ib", DBG(IB), "Print command buffers."}, /* Driver options: */ {"nowc", DBG(NO_WC), "Disable GTT write combining"}, {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, {"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."}, + {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"}, /* 3D engine options: */ {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."}, {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, - {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."}, {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, - {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, - {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, - {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, - {"nodfsm", DBG(NO_DFSM), "Disable DFSM."}, {"dpbb", DBG(DPBB), "Enable DPBB."}, - {"dfsm", DBG(DFSM), "Enable DFSM."}, {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"}, {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"}, {"notiling", DBG(NO_TILING), "Disable tiling"}, @@ -110,9 +107,11 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"nodisplaydcc", DBG(NO_DISPLAY_DCC), "Disable display DCC"}, {"nodcc", DBG(NO_DCC), "Disable DCC."}, {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."}, - {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"}, + {"nodccstore", DBG(NO_DCC_STORE), "Disable DCC stores"}, + {"dccstore", DBG(DCC_STORE), "Enable DCC stores"}, {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"}, {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"}, + {"nodma", DBG(NO_DMA), "Disable SDMA-copy for DRI_PRIME"}, {"tmz", DBG(TMZ), "Force allocation of scanout/depth/stencil buffer as encrypted"}, {"sqtt", DBG(SQTT), "Enable SQTT"}, @@ -142,7 +141,6 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil enum ac_target_machine_options tm_options = (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | - (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | (create_low_opt_compiler ? 
AC_TM_CREATE_LOW_OPT : 0); @@ -150,12 +148,24 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); compiler->passes = ac_create_llvm_passes(compiler->tm); - if (compiler->tm_wave32) - compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); if (compiler->low_opt_tm) compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } +void si_init_aux_async_compute_ctx(struct si_screen *sscreen) +{ + assert(!sscreen->async_compute_context); + sscreen->async_compute_context = si_create_context( + &sscreen->b, + SI_CONTEXT_FLAG_AUX | + (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + PIPE_CONTEXT_COMPUTE_ONLY); + + /* Limit the numbers of waves allocated for this context. */ + if (sscreen->async_compute_context) + ((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2; +} + static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { ac_destroy_llvm_compiler(compiler); @@ -255,8 +265,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); if (sctx->cs_dcc_decompress) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress); - if (sctx->cs_dcc_retile) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) { + if (sctx->cs_dcc_retile[i]) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]); + } if (sctx->no_velems_state) sctx->b.delete_vertex_elements_state(&sctx->b, sctx->no_velems_state); @@ -284,17 +296,6 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->blitter) util_blitter_destroy(sctx->blitter); - /* Release DCC stats. */ - for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - assert(!sctx->dcc_stats[i].query_active); - - for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) - if (sctx->dcc_stats[i].ps_stats[j]) - sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]); - - si_texture_reference(&sctx->dcc_stats[i].tex, NULL); - } - if (sctx->query_result_shader) sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); if (sctx->sh_query_result_shader) @@ -303,6 +304,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->ws->cs_destroy(&sctx->gfx_cs); if (sctx->ctx) sctx->ws->ctx_destroy(sctx->ctx); + if (sctx->sdma_cs) { + sctx->ws->cs_destroy(sctx->sdma_cs); + free(sctx->sdma_cs); + } if (sctx->dirty_implicit_resources) _mesa_hash_table_destroy(sctx->dirty_implicit_resources, @@ -321,12 +326,8 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); si_resource_reference(&sctx->shadowed_regs, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL); @@ -503,7 +504,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign /* Initialize private allocators. 
*/ u_suballocator_init(&sctx->allocator_zeroed_memory, &sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false); + SI_RESOURCE_FLAG_CLEAR | SI_RESOURCE_FLAG_32BIT, false); sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0); if (!sctx->cached_gtt_allocator) @@ -552,6 +553,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign } sctx->ngg = sscreen->use_ngg; + si_shader_change_notify(sctx); /* Initialize context functions used by graphics and compute. */ if (sctx->chip_class >= GFX10) @@ -588,6 +590,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_state_functions(sctx); si_init_streamout_functions(sctx); si_init_viewport_functions(sctx); + si_init_spi_map_functions(sctx); sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) @@ -607,27 +610,46 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter); sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; - si_init_draw_functions(sctx); - - si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } else { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; + switch (sctx->chip_class) { + case GFX6: + si_init_draw_functions_GFX6(sctx); + break; + case GFX7: + si_init_draw_functions_GFX7(sctx); + break; + case GFX8: + si_init_draw_functions_GFX8(sctx); + break; + case GFX9: + si_init_draw_functions_GFX9(sctx); + break; + case GFX10: + si_init_draw_functions_GFX10(sctx); + break; + case GFX10_3: + si_init_draw_functions_GFX10_3(sctx); + break; + default: + unreachable("unhandled chip class"); + } } sctx->sample_mask = 0xffff; /* Initialize multimedia functions. */ - if (sscreen->info.has_hw_decode) { + if (sscreen->info.has_video_hw.uvd_decode || sscreen->info.has_video_hw.vcn_decode || + sscreen->info.has_video_hw.jpeg_decode || sscreen->info.has_video_hw.vce_encode || + sscreen->info.has_video_hw.uvd_encode || sscreen->info.has_video_hw.vcn_encode) { sctx->b.create_video_codec = si_uvd_create_decoder; sctx->b.create_video_buffer = si_video_buffer_create; + if (screen->resource_create_with_modifiers) + sctx->b.create_video_buffer_with_modifiers = si_video_buffer_create_with_modifiers; } else { sctx->b.create_video_codec = vl_create_decoder; sctx->b.create_video_buffer = vl_video_buffer_create; } - if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, @@ -707,11 +729,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign if (!sctx->dirty_implicit_resources) goto fail; - sctx->sample_pos_buffer = - pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions)); - pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions), - &sctx->sample_positions); - /* The remainder of this function initializes the gfx CS and must be last. */ assert(sctx->gfx_cs.current.cdw == 0); @@ -719,6 +736,23 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_cp_reg_shadowing(sctx); } + /* Set immutable fields of shader keys. 
*/ + if (sctx->chip_class >= GFX9) { + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * This also enables jumping over the VS prolog for HS-only waves. + * + * When the LS VGPR fix is needed, monolithic shaders can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + sctx->shader.tcs.key.opt.prefer_mono = 1; + + /* This enables jumping over the VS prolog for GS-only waves. */ + sctx->shader.gs.key.opt.prefer_mono = 1; + } + si_begin_new_gfx_cs(sctx, true); assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size); @@ -763,6 +797,13 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); } simple_mtx_unlock(&sscreen->aux_context_lock); + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (status != PIPE_NO_RESET && sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + sscreen->async_compute_context = NULL; + } + simple_mtx_unlock(&sscreen->async_compute_context_lock); } sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw; @@ -773,12 +814,23 @@ fail: return NULL; } +static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource, + unsigned usage) +{ + struct radeon_winsys *ws = ((struct si_screen *)screen)->ws; + + return !ws->buffer_wait(ws, si_resource(resource)->buf, 0, + /* If mapping for write, we need to wait for all reads and writes. + * If mapping for read, we only need to wait for writes. + */ + usage & PIPE_MAP_WRITE ? RADEON_USAGE_READWRITE : RADEON_USAGE_WRITE); +} + static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv, unsigned flags) { struct si_screen *sscreen = (struct si_screen *)screen; struct pipe_context *ctx; - uint64_t total_ram; if (sscreen->debug_flags & DBG(CHECK_VM)) flags |= PIPE_CONTEXT_DEBUG; @@ -806,14 +858,19 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v /* Use asynchronous flushes only on amdgpu, since the radeon * implementation for fence_server_sync is incomplete. */ - struct pipe_context * tc = threaded_context_create( - ctx, &sscreen->pool_transfers, si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - &((struct si_context *)ctx)->tc); - - if (tc && tc != ctx && os_get_total_physical_memory(&total_ram)) { - ((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 4; - } + struct pipe_context *tc = + threaded_context_create(ctx, &sscreen->pool_transfers, + si_replace_buffer_storage, + &(struct threaded_context_options){ + .create_fence = sscreen->info.is_amdgpu ? 
+ si_create_fence : NULL, + .is_resource_busy = si_is_resource_busy, + .driver_calls_flush_notify = true, + }, + &((struct si_context *)ctx)->tc); + + if (tc && tc != ctx) + threaded_context_init_bytes_mapped_limit((struct threaded_context *)tc, 4); return tc; } @@ -853,6 +910,11 @@ static void si_destroy_screen(struct pipe_screen *pscreen) sscreen->aux_context->destroy(sscreen->aux_context); } + simple_mtx_destroy(&sscreen->async_compute_context_lock); + if (sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + } + util_queue_destroy(&sscreen->shader_compiler_queue); util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); @@ -887,6 +949,9 @@ static void si_destroy_screen(struct pipe_screen *pscreen) disk_cache_destroy(sscreen->disk_shader_cache); util_live_shader_cache_deinit(&sscreen->live_shader_cache); + util_idalloc_mt_fini(&sscreen->buffer_ids); + util_vertex_state_cache_deinit(&sscreen->vertex_state_cache); + sscreen->ws->destroy(sscreen->ws); FREE(sscreen); } @@ -1017,22 +1082,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->options.enable_sam, sscreen->options.disable_sam); - /* Older LLVM have buggy v_pk_* instructions. */ - if (!sscreen->info.has_packed_math_16bit || LLVM_VERSION_MAJOR < 11) - sscreen->options.fp16 = false; - - if (sscreen->info.chip_class == GFX10_3 && LLVM_VERSION_MAJOR < 11) { - fprintf(stderr, "radeonsi: GFX 10.3 requires LLVM 11 or higher\n"); - FREE(sscreen); - return NULL; - } - - if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { - fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); - FREE(sscreen); - return NULL; - } - if (sscreen->info.chip_class >= GFX9) { sscreen->se_tile_repeat = 32 * sscreen->info.max_se; } else { @@ -1054,6 +1103,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, return NULL; } + util_idalloc_mt_init_tc(&sscreen->buffer_ids); /* Set functions first. */ sscreen->b.context_create = si_pipe_create_context; @@ -1072,8 +1122,12 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, /* Set these flags in debug_flags early, so that the shader cache takes * them into account. + * + * Enable FS_CORRECT_DERIVS_AFTER_KILL by default if LLVM is >= 13. This makes + * nir_opt_move_discards_to_top more effective. */ - if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard")) + if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard") || + LLVM_VERSION_MAJOR >= 13) sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); if (sscreen->debug_flags & DBG(INFO)) @@ -1093,6 +1147,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain); (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); si_init_gs_info(sscreen); @@ -1107,7 +1162,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, si_disk_cache_create(sscreen); /* Determine the number of shader compiler threads. 
*/ - hw_threads = sysconf(_SC_NPROCESSORS_ONLN); + const struct util_cpu_caps_t *caps = util_get_cpu_caps(); + hw_threads = caps->nr_cpus; if (hw_threads >= 12) { num_comp_hi_threads = hw_threads * 3 / 4; @@ -1131,7 +1187,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!util_queue_init( &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { + UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen); glsl_type_singleton_decref(); @@ -1141,7 +1197,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64, num_comp_lo_threads, UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { + UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen); glsl_type_singleton_decref(); @@ -1151,11 +1207,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; + sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && @@ -1221,12 +1273,14 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add") || driQueryOptionb(config->options, "allow_draw_out_of_order"); + sscreen->allow_draw_out_of_order = driQueryOptionb(config->options, "allow_draw_out_of_order"); sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) && sscreen->info.chip_class >= GFX10 && (sscreen->info.family != CHIP_NAVI14 || sscreen->info.is_pro_graphics); sscreen->use_ngg_culling = sscreen->use_ngg && + sscreen->info.max_render_backends >= 2 && !((sscreen->debug_flags & DBG(NO_NGG_CULLING)) || LLVM_VERSION_MAJOR <= 11 /* hangs on 11, see #4874 */); sscreen->use_ngg_streamout = false; @@ -1239,30 +1293,19 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true; } - /* Only enable primitive binning on APUs by default. */ - if (sscreen->info.chip_class >= GFX10) { - sscreen->dpbb_allowed = true; - /* DFSM is not supported on GFX 10.3 and not beneficial on Navi1x. */ - } else if (sscreen->info.chip_class == GFX9) { - sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram; - /* DFSM reduces the Raven2 draw prim rate by ~43%. Disable it. */ - sscreen->dfsm_allowed = false; - } - - /* Process DPBB enable flags. */ - if (sscreen->debug_flags & DBG(DPBB)) { - sscreen->dpbb_allowed = true; - if (sscreen->debug_flags & DBG(DFSM)) - sscreen->dfsm_allowed = true; - } + /* DCC stores have 50% performance of uncompressed stores and sometimes + * even less than that. It's risky to enable on dGPUs. 
+ */ + sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) && + ((sscreen->info.chip_class >= GFX10_3 && + !sscreen->info.has_dedicated_vram) || + sscreen->debug_flags & DBG(DCC_STORE)); - /* Process DPBB disable flags. */ - if (sscreen->debug_flags & DBG(NO_DPBB)) { - sscreen->dpbb_allowed = false; - sscreen->dfsm_allowed = false; - } else if (sscreen->debug_flags & DBG(NO_DFSM)) { - sscreen->dfsm_allowed = false; - } + sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) && + (sscreen->info.chip_class >= GFX10 || + /* Only enable primitive binning on gfx9 APUs by default. */ + (sscreen->info.chip_class == GFX9 && !sscreen->info.has_dedicated_vram) || + sscreen->debug_flags & DBG(DPBB)); if (sscreen->dpbb_allowed) { if (sscreen->info.has_dedicated_vram) { @@ -1289,11 +1332,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->pbb_persistent_states_per_bin <= 32); } - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. - */ - sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; - (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; @@ -1331,6 +1369,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } } + sscreen->ngg_subgroup_size = 128; sscreen->ge_wave_size = 64; sscreen->ps_wave_size = 64; sscreen->compute_wave_size = 64; @@ -1406,6 +1445,9 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf drmVersionPtr version = drmGetVersion(fd); struct radeon_winsys *rw = NULL; + driParseConfigFiles(config->options, config->options_info, 0, "radeonsi", + NULL, NULL, NULL, 0, NULL, 0); + switch (version->version_major) { case 2: rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h index c9f64a144..2408346c3 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h @@ -31,6 +31,7 @@ #include "util/u_idalloc.h" #include "util/u_suballoc.h" #include "util/u_threaded_context.h" +#include "util/u_vertex_state_cache.h" #include "ac_sqtt.h" #ifdef __cplusplus @@ -44,7 +45,6 @@ extern "C" { #endif #define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -55,7 +55,7 @@ extern "C" { #define SI_DRAW_ID_UNKNOWN ((unsigned)INT_MIN) #define SI_RESTART_INDEX_UNKNOWN ((unsigned)INT_MIN) #define SI_INSTANCE_COUNT_UNKNOWN ((unsigned)INT_MIN) -#define SI_NUM_SMOOTH_AA_SAMPLES 8 +#define SI_NUM_SMOOTH_AA_SAMPLES 4 #define SI_MAX_POINT_SIZE 2048 #define SI_GS_PER_ES 128 /* Alignment for optimal CP DMA performance. */ @@ -64,7 +64,8 @@ extern "C" { /* Tunables for compute-based clear_buffer and copy_buffer: */ #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 #define SI_COMPUTE_COPY_DW_PER_THREAD 4 -#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM +/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */ +#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU /* Pipeline & streamout query controls. 
*/ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) @@ -137,6 +138,7 @@ extern "C" { (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) #define SI_RESOURCE_FLAG_UNCACHED (PIPE_RESOURCE_FLAG_DRV_PRIV << 12) #define SI_RESOURCE_FLAG_DRIVER_INTERNAL (PIPE_RESOURCE_FLAG_DRV_PRIV << 13) +#define SI_RESOURCE_AUX_PLANE (PIPE_RESOURCE_FLAG_DRV_PRIV << 14) enum si_has_gs { GS_OFF, @@ -153,11 +155,6 @@ enum si_has_ngg { NGG_ON, }; -enum si_has_prim_discard_cs { - PRIM_DISCARD_CS_OFF, - PRIM_DISCARD_CS_ON, -}; - enum si_clear_code { DCC_CLEAR_COLOR_0000 = 0x00000000, @@ -168,9 +165,8 @@ enum si_clear_code DCC_UNCOMPRESSED = 0xFFFFFFFF, }; -#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) -#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) -#define SI_IMAGE_ACCESS_DCC_WRITE (1 << 9) +#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) +#define SI_IMAGE_ACCESS_ALLOW_DCC_STORE (1 << 9) /* Debug flags. */ enum @@ -208,12 +204,14 @@ enum DBG_COMPUTE, DBG_VM, DBG_CACHE_STATS, + DBG_IB, /* Driver options: */ DBG_NO_WC, DBG_CHECK_VM, DBG_RESERVE_VMID, DBG_SHADOW_REGS, + DBG_NO_FAST_DISPLAY_LIST, /* 3D engine options: */ DBG_NO_GFX, @@ -221,16 +219,10 @@ enum DBG_ALWAYS_NGG_CULLING_ALL, DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, - DBG_NO_FAST_LAUNCH, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, - DBG_NO_DFSM, DBG_DPBB, - DBG_DFSM, DBG_NO_HYPERZ, DBG_NO_2D_TILING, DBG_NO_TILING, @@ -238,9 +230,11 @@ enum DBG_NO_DISPLAY_DCC, DBG_NO_DCC, DBG_NO_DCC_CLEAR, - DBG_NO_DCC_FB, + DBG_NO_DCC_STORE, + DBG_DCC_STORE, DBG_NO_DCC_MSAA, DBG_NO_FMASK, + DBG_NO_DMA, DBG_TMZ, DBG_SQTT, @@ -293,16 +287,14 @@ struct si_resource { struct pb_buffer *buf; uint64_t gpu_address; /* Memory usage if the buffer placement is optimal. */ - uint32_t vram_usage_kb; - uint32_t gart_usage_kb; + uint32_t memory_usage_kb; /* Resource properties. */ uint64_t bo_size; - unsigned bo_alignment; - enum radeon_bo_domain domains; - enum radeon_bo_flag flags; + uint8_t bo_alignment_log2; + enum radeon_bo_domain domains:8; + enum radeon_bo_flag flags:16; unsigned bind_history; - int max_forced_staging_uploads; /* The buffer range which is initialized (with a write transfer, * streamout, DMA, or as a random access target). The rest of @@ -331,13 +323,12 @@ struct si_resource { bool image_handle_allocated; /* Whether the resource has been exported via resource_get_handle. */ - unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ + uint8_t external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct si_transfer { struct threaded_transfer b; struct si_resource *staging; - unsigned offset; }; struct si_texture { @@ -368,7 +359,8 @@ struct si_texture { /* Depth buffer compression and fast clear. 
*/ float depth_clear_value[RADEON_SURF_MAX_LEVELS]; uint8_t stencil_clear_value[RADEON_SURF_MAX_LEVELS]; - uint16_t depth_cleared_level_mask; /* if it was cleared at least once */ + uint16_t depth_cleared_level_mask_once; /* if it was cleared at least once */ + uint16_t depth_cleared_level_mask; /* track if it's cleared (can be false negative) */ uint16_t stencil_cleared_level_mask; /* if it was cleared at least once */ uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ @@ -382,40 +374,36 @@ struct si_texture { bool db_compatible : 1; bool can_sample_z : 1; bool can_sample_s : 1; + bool need_flush_after_depth_decompression: 1; /* We need to track DCC dirtiness, because st/dri usually calls * flush_resource twice per frame (not a bug) and we don't wanna - * decompress DCC twice. Also, the dirty tracking must be done even - * if DCC isn't used, because it's required by the DCC usage analysis - * for a possible future enablement. + * decompress DCC twice. */ - bool separate_dcc_dirty : 1; bool displayable_dcc_dirty : 1; - /* Statistics gathering for the DCC enablement heuristic. */ - bool dcc_gather_statistics : 1; /* Counter that should be non-zero if the texture is bound to a * framebuffer. */ unsigned framebuffers_bound; - /* Whether the texture is a displayable back buffer and needs DCC - * decompression, which is expensive. Therefore, it's enabled only - * if statistics suggest that it will pay off and it's allocated - * separately. It can't be bound as a sampler by apps. Limited to - * target == 2D and last_level == 0. If enabled, dcc_offset contains - * the absolute GPUVM address, not the relative one. - */ - struct si_resource *dcc_separate_buffer; - /* When DCC is temporarily disabled, the separate buffer is here. */ - struct si_resource *last_dcc_separate_buffer; - /* Estimate of how much this color buffer is written to in units of - * full-screen draws: ps_invocations / (width * height) - * Shader kills, late Z, and blending with trivial discards make it - * inaccurate (we need to count CB updates, not PS invocations). - */ - unsigned ps_draw_ratio; - /* The number of clears since the last DCC usage analysis. */ - unsigned num_slow_clears; +}; + +/* State trackers create separate textures in a next-chain for extra planes + * even if those are planes created purely for modifiers. Because the linking + * of the chain happens outside of the driver, and NULL is interpreted as + * failure, let's create some dummy texture structs. We could use these + * later to use the offsets for linking if we really wanted to. + * + * For now just create a dummy struct and completely ignore it. + * + * Potentially in the future we could store stride/offset and use it during + * creation, though we might want to change how linking is done first. 
+ */ +struct si_auxiliary_texture { + struct threaded_resource b; + struct pb_buffer *buffer; + uint32_t offset; + uint32_t stride; }; struct si_surface { @@ -533,7 +521,7 @@ struct si_screen { unsigned width, unsigned height, unsigned depth, uint32_t *state, uint32_t *fmask_state); - unsigned num_vbos_in_user_sgprs; + unsigned max_memory_usage_kb; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; @@ -551,13 +539,13 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool commutative_blend_add; + bool allow_draw_out_of_order; bool dpbb_allowed; - bool dfsm_allowed; - bool llvm_has_working_vgpr_indexing; bool use_ngg; bool use_ngg_culling; bool use_ngg_streamout; bool allow_dcc_msaa_clear_to_reg_for_bpp[5]; /* indexed by log2(Bpp) */ + bool always_allow_dcc_stores; struct { #define OPT_BOOL(name, dflt, description) bool name : 1; @@ -578,6 +566,10 @@ struct si_screen { struct pipe_context *aux_context; simple_mtx_t aux_context_lock; + /* Async compute context for DRI_PRIME copies. */ + struct pipe_context *async_compute_context; + simple_mtx_t async_compute_context_lock; + /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -671,6 +663,10 @@ struct si_screen { unsigned compute_wave_size; unsigned ps_wave_size; unsigned ge_wave_size; + unsigned ngg_subgroup_size; + + struct util_idalloc_mt buffer_ids; + struct util_vertex_state_cache vertex_state_cache; }; struct si_sampler_view { @@ -809,6 +805,8 @@ struct si_streamout { struct si_shader_ctx_state { struct si_shader_selector *cso; struct si_shader *current; + /* The shader variant key representing the current state. */ + struct si_shader_key key; }; #define SI_NUM_VGT_PARAM_KEY_BITS 12 @@ -846,35 +844,6 @@ union si_vgt_param_key { uint16_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. 
- */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_texture_handle { unsigned desc_slot; bool desc_dirty; @@ -897,7 +866,6 @@ struct si_saved_cs { unsigned trace_id; unsigned gfx_last_dw; - unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -907,11 +875,24 @@ struct si_small_prim_cull_info { float small_prim_precision; }; +struct si_vertex_state { + struct pipe_vertex_state b; + struct si_vertex_elements velems; + uint32_t descriptors[4 * SI_MAX_ATTRIBS]; +}; + typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe, const struct pipe_draw_info *info, + unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draws, + const struct pipe_draw_start_count_bias *draws, unsigned num_draws); +typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); struct si_context { struct pipe_context b; /* base class */ @@ -922,6 +903,7 @@ struct si_context { struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */ + struct radeon_cmdbuf *sdma_cs; struct pipe_fence_handle *last_gfx_fence; struct si_resource *eop_bug_scratch; struct si_resource *eop_bug_scratch_tmz; @@ -962,7 +944,7 @@ struct si_context { void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; void *cs_dcc_decompress; - void *cs_dcc_retile; + void *cs_dcc_retile[32]; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; struct pipe_debug_callback debug; @@ -990,33 +972,11 @@ struct si_context { unsigned last_num_draw_calls; unsigned flags; /* flush flags */ /* Current unaccounted memory usage. */ - uint32_t vram_kb; - uint32_t gtt_kb; + uint32_t memory_usage_kb; - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; + /* NGG streamout. */ struct pb_buffer *gds; struct pb_buffer *gds_oa; - struct radeon_cmdbuf prim_discard_compute_cs; - unsigned compute_gds_offset; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - bool preserve_prim_restart_gds_at_flush; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; /* Atoms (direct states). 
*/ union si_state_atoms atoms; @@ -1065,28 +1025,27 @@ struct si_context { /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */ struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; - struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ + uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; - unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; - bool flatshade; + bool uses_nontrivial_vs_prolog; + bool force_trivial_vs_prolog; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; bool vs_uses_base_instance; bool vs_uses_draw_id; + uint8_t patch_vertices; /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; unsigned descriptors_dirty; unsigned shader_pointers_dirty; unsigned shader_needs_decompress_mask; - unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS]; struct si_buffer_resources internal_bindings; struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; @@ -1141,11 +1100,7 @@ struct si_context { bool allow_flat_shading : 1; /* Emitted draw state. */ - bool gs_tri_strip_adj_fix : 1; - bool ls_vgpr_fix : 1; - bool prim_discard_cs_instancing : 1; bool ngg : 1; - bool same_patch_vertices : 1; uint8_t ngg_culling; unsigned last_index_size; int last_base_vertex; @@ -1256,9 +1211,6 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ unsigned context_roll; /* Queries. */ @@ -1281,25 +1233,6 @@ struct si_context { bool force_cb_shader_coherent; - /* Statistics gathering for the DCC enablement heuristic. It can't be - * in si_texture because si_texture can be shared by multiple - * contexts. This is for back buffers only. We shouldn't get too many - * of those. - * - * X11 DRI3 rotates among a finite set of back buffers. They should - * all fit in this array. If they don't, separate DCC might never be - * enabled by DCC stat gathering. - */ - struct { - struct si_texture *tex; - /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ - struct pipe_query *ps_stats[3]; - /* If all slots are used and another slot is needed, - * the least recently used slot is evicted based on this. 
*/ - int64_t last_use_timestamp; - bool query_active; - } dcc_stats[5]; - struct si_tracked_regs tracked_regs; /* Resources that need to be flushed, but will not get an explicit @@ -1308,7 +1241,12 @@ struct si_context { */ struct hash_table *dirty_implicit_resources; - pipe_draw_vbo_func draw_vbo[NUM_GFX_VERSIONS - GFX6][2][2][2][2]; + pipe_draw_vbo_func draw_vbo[2][2][2]; + pipe_draw_vertex_state_func draw_vertex_state[2][2][2]; + /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ + pipe_draw_vbo_func real_draw_vbo; + pipe_draw_vertex_state_func real_draw_vertex_state; + void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ struct ac_thread_trace_data *thread_trace; @@ -1346,6 +1284,9 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex); void si_flush_implicit_resources(struct si_context *sctx); +/* si_nir_optim.c */ +bool si_nir_is_output_const_if_tex_is_const(nir_shader *shader, float *in, float *out, int *texunit); + /* si_buffer.c */ bool si_cs_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, enum radeon_bo_usage usage); @@ -1359,7 +1300,8 @@ struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, uns struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, unsigned usage, unsigned size, unsigned alignment); void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src); + struct pipe_resource *src, unsigned num_rebinds, + uint32_t rebind_mask, uint32_t delete_buffer_id); void si_init_screen_buffer_functions(struct si_screen *sscreen); void si_init_buffer_functions(struct si_context *sctx); @@ -1474,6 +1416,7 @@ void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); +void si_print_current_ib(struct si_context *sctx, FILE *f); /* si_fence.c */ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, @@ -1491,16 +1434,23 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, /* si_get.c */ void si_init_screen_get_functions(struct si_screen *sscreen); +bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src); + /* si_gfx_cs.c */ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); -void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws); +void si_trace_emit(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); +/* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement + * optimizations without affecting the normal draw_vbo functions perf. 
+ */ +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); @@ -1511,33 +1461,9 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); -/* si_compute_prim_discard.c */ -enum si_prim_discard_outcome -{ - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, - SI_PRIM_DISCARD_MULTI_DRAW_SPLIT, -}; - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draws, - unsigned num_draws, bool primitive_restart, - unsigned total_count); -void si_compute_signal_gfx(struct si_context *sctx); -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned count, unsigned index_size, - unsigned base_vertex, uint64_t input_indexbuf_va, - unsigned input_indexbuf_max_elements); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); - /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); +void si_init_aux_async_compute_ctx(struct si_screen *sscreen); /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); @@ -1587,6 +1513,10 @@ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl); +struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe, + const struct pipe_video_buffer *tmpl, + const uint64_t *modifiers, + unsigned int modifiers_count); /* si_viewport.c */ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out); @@ -1613,10 +1543,6 @@ struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, const struct pipe_surface *templ, unsigned width0, unsigned height0, unsigned width, unsigned height); unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap); -void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex); bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex); void si_init_screen_texture_functions(struct si_screen *sscreen); void si_init_context_texture_functions(struct si_context *sctx); @@ -1647,6 +1573,9 @@ bool si_init_thread_trace(struct si_context *sctx); void si_destroy_thread_trace(struct si_context *sctx); void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs); +/* si_state_shaders.c */ +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key); + /* * common helpers */ @@ -1698,15 +1627,14 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx, * Also reserve space for stopping queries 
at the end of IB, because * the number of active queries is unlimited in theory. */ - return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 9; + return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10; } static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) { if (r) { /* Add memory usage for need_gfx_cs_space */ - sctx->vram_kb += si_resource(r)->vram_usage_kb; - sctx->gtt_kb += si_resource(r)->gart_usage_kb; + sctx->memory_usage_kb += si_resource(r)->memory_usage_kb; } } @@ -1866,7 +1794,19 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil)) return false; - return tex->is_depth && tex->surface.meta_offset && level < tex->surface.num_meta_levels; + if (!tex->is_depth || !tex->surface.meta_offset) + return false; + + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + if (sscreen->info.chip_class >= GFX8) { + return level < tex->surface.num_meta_levels; + } else { + /* GFX6-7 don't have TC-compatible HTILE, which means they have to run + * a decompression pass for every mipmap level before texturing, so compress + * only one level to reduce the number of decompression passes to a minimum. + */ + return level == 0; + } } static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, @@ -1908,6 +1848,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx) ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_TRIANGLE_MODES \ + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \ + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \ + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \ + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)) + static inline bool util_prim_is_lines(unsigned prim) { return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; @@ -1920,11 +1866,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim) static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & - ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0; +} + +static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim) +{ + return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0; } /** @@ -1935,17 +1882,27 @@ static inline bool util_rast_prim_is_triangles(unsigned prim) * \param gtt GTT memory size not added to the buffer list yet */ static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs, - uint32_t vram_kb, uint32_t gtt_kb) + uint32_t kb) +{ + return kb + cs->used_vram_kb + cs->used_gart_kb < screen->max_memory_usage_kb; +} + +static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws) { - vram_kb += cs->used_vram_kb; - gtt_kb += cs->used_gart_kb; + struct radeon_cmdbuf *cs = &ctx->gfx_cs; + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_buffer) and one counter in the pipe + * driver for those that haven't been added 
yet. + */ + uint32_t kb = ctx->memory_usage_kb; + ctx->memory_usage_kb = 0; - /* Anything that goes above the VRAM size should go to GTT. */ - if (vram_kb > screen->info.vram_size_kb) - gtt_kb += vram_kb - screen->info.vram_size_kb; + if (radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, kb) && + ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws), false)) + return; - /* Now we just need to check if we have enough GTT (the limit is 75% of max). */ - return gtt_kb < screen->info.gart_size_kb / 4 * 3; + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } /** @@ -1989,30 +1946,20 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc bool check_mem) { if (check_mem && - !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->vram_kb + bo->vram_usage_kb, - sctx->gtt_kb + bo->gart_usage_kb)) + !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->memory_usage_kb + bo->memory_usage_kb)) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority); } -static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) -{ - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; -} - static inline unsigned si_get_wave_size(struct si_screen *sscreen, - gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch, bool prim_discard_cs) + gl_shader_stage stage, bool ngg, bool es) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; else if (stage == MESA_SHADER_FRAGMENT) return sscreen->ps_wave_size; - else if (gs_fast_launch) - return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ - else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (stage == MESA_SHADER_VERTEX && es && !ngg) || + else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ return 64; @@ -2024,19 +1971,30 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, shader->key.as_ngg, - shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader->key.opt.vs_as_prim_discard_cs); + shader->key.as_es); } static inline void si_select_draw_vbo(struct si_context *sctx) { - sctx->b.draw_vbo = sctx->draw_vbo[sctx->chip_class - GFX6] - [!!sctx->shader.tes.cso] - [!!sctx->shader.gs.cso] - [sctx->ngg] - [si_compute_prim_discard_enabled(sctx)]; - assert(sctx->b.draw_vbo); + pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; + pipe_draw_vertex_state_func draw_vertex_state = + sctx->draw_vertex_state[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; + assert(draw_vbo); + assert(draw_vertex_state); + + if (unlikely(sctx->real_draw_vbo)) { + assert(sctx->real_draw_vertex_state); + sctx->real_draw_vbo = draw_vbo; + sctx->real_draw_vertex_state = draw_vertex_state; + } else { + assert(!sctx->real_draw_vertex_state); + sctx->b.draw_vbo = draw_vbo; + sctx->b.draw_vertex_state = draw_vertex_state; + } } /* Return the number of samples that the rasterizer uses. 
*/ @@ -2053,6 +2011,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) return 1; } +static unsigned ALWAYS_INLINE +si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class) +{ + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + return chip_class >= GFX9 ? 5 : 1; +} + +static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen) +{ + return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c index 22b6e3ad5..ae4affa1b 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c @@ -117,13 +117,13 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (state->shader) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo, + if (state->is_shader) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } radeon_begin(cs); - radeon_emit_array(cs, state->pm4, state->ndw); + radeon_emit_array(state->pm4, state->ndw); radeon_end(); if (state->atom.emit) @@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs) for (unsigned i = 0; i < SI_NUM_STATES; i++) { struct si_pm4_state *state = sctx->emitted.array[i]; - if (state && state->shader) { + if (state && state->is_shader) { sctx->emitted.array[i] = NULL; sctx->dirty_states |= 1 << i; } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h index 06909ff1a..03f79e0ba 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h @@ -54,7 +54,7 @@ struct si_pm4_state { uint32_t pm4[SI_PM4_MAX_DW]; /* For shader states only */ - struct si_shader *shader; + bool is_shader; struct si_atom atom; }; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c index 121feb6fb..546f9da11 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c @@ -218,10 +218,10 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) } /* Compile a variable block size using the maximum variable size. 
*/ - if (shader->selector->info.base.cs.local_size_variable) + if (shader->selector->info.base.workgroup_size_variable) return SI_MAX_VARIABLE_THREADS_PER_BLOCK; - uint16_t *local_size = shader->selector->info.base.cs.local_size; + uint16_t *local_size = shader->selector->info.base.workgroup_size; unsigned max_work_group_size = (uint32_t)local_size[0] * (uint32_t)local_size[1] * (uint32_t)local_size[2]; @@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VGPRs */ declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - } break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -553,11 +547,11 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); if (ctx->stage == MESA_SHADER_VERTEX) { declare_vs_input_vgprs(ctx, &num_prolog_vgprs); @@ -658,7 +652,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) SI_PARAM_LINEAR_CENTER); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], @@ -793,9 +787,6 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; sym->size = shader->gs_info.esgs_ring_size * 4; @@ -835,7 +826,9 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh { struct ac_rtld_binary rtld; si_shader_binary_open(screen, shader, &rtld); - return rtld.exec_size; + uint64_t size = rtld.exec_size; + ac_rtld_close(&rtld); + return size; } static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) @@ -865,8 +858,8 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create( &sscreen->b, - (sscreen->info.cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL, + (sscreen->info.cpdma_prefetch_writes_memory ? 
0 : SI_RESOURCE_FLAG_READ_ONLY) | + SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); if (!shader->bo) return false; @@ -1071,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader) return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; else if (shader->key.as_ngg) return "Vertex Shader as ESGS"; else @@ -1153,8 +1144,6 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, prolog->instance_divisor_is_fetched); - fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix, - prolog->unpack_instance_id_from_vertex_id); fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); @@ -1186,17 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " as_ngg = %u\n", key->as_ngg); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing); - fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation); - fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); - fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z); - fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space); break; case MESA_SHADER_TESS_CTRL: @@ -1297,8 +1275,8 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel, /* VGPR initialization fixup for Vega10 and Raven is always done in the * VS prolog. 
*/ return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || - prolog_key->unpack_instance_id_from_vertex_id || - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + /* The 2nd VS prolog loads input VGPRs from LDS */ + (key->opt.ngg_culling && !ngg_cull_shader); } /** @@ -1323,16 +1301,9 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.as_ls = shader_out->key.as_ls; key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; - - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - key->vs_prolog.gs_fast_launch_index_size_packed = - SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); - } + + if (!ngg_cull_shader && shader_out->key.opt.ngg_culling) + key->vs_prolog.load_vgprs_after_culling = 1; if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; @@ -1346,8 +1317,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ /* Only one of these combinations can be set. as_ngg can be set with as_es. */ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= - 1); + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); /* Enable loading the InstanceID VGPR. */ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); @@ -1453,8 +1423,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi si_dump_streamout(&sel->so); } - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); + /* Initialize vs_output_ps_input_cntl to default. */ + for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) + shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; + shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; shader->info.uses_instanceid = sel->info.uses_instanceid; @@ -1465,9 +1437,44 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; - /* Validate SGPR and VGPR usage for compute to detect compiler bugs. - * LLVM 3.9svn has this bug. - */ + /* Compute vs_output_ps_input_cntl. */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.as_ls && !shader->key.as_es) { + ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) + vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + + /* VS and TES should also set primitive ID output if it's used. */ + unsigned num_outputs_with_prim_id = sel->info.num_outputs + + shader->key.mono.u.vs_export_prim_id; + + for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned offset = vs_output_param_offset[i]; + unsigned ps_input_cntl; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. 
*/ + ps_input_cntl = S_028644_OFFSET(offset); + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; + } + } + + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ if (sel->info.stage == MESA_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; unsigned max_vgprs = @@ -1559,11 +1566,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, shader.key.as_ls = key->vs_prolog.as_ls; shader.key.as_es = key->vs_prolog.as_es; shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.ngg_culling = - (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; break; case MESA_SHADER_TESS_CTRL: assert(!prolog); @@ -1586,9 +1588,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, struct si_shader_context ctx; si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, stage, - shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader.key.opt.vs_as_prim_discard_cs)); + shader.key.as_ngg, shader.key.as_es)); ctx.shader = &shader; ctx.stage = stage; @@ -2026,8 +2026,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); + memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, + sizeof(mainp->info.vs_output_ps_input_cntl)); shader->info.uses_instanceid = mainp->info.uses_instanceid; shader->info.nr_pos_exports = mainp->info.nr_pos_exports; shader->info.nr_param_exports = mainp->info.nr_param_exports; @@ -2115,9 +2115,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || (sel->info.stage == MESA_SHADER_VERTEX && /* Used to export PrimitiveID from the correct vertex. */ - (shader->key.mono.u.vs_export_prim_id || - /* Used to generate triangle strip vertex IDs for all threads. */ - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))); + shader->key.mono.u.vs_export_prim_id)); shader->uses_vs_state_outprim = sscreen->use_ngg && /* Only used by streamout in vertex shaders. 
*/ diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h index ab11a1852..d6dbb13ed 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h @@ -138,6 +138,7 @@ #include "util/u_inlines.h" #include "util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "si_pm4.h" #include <stdio.h> @@ -158,6 +159,12 @@ struct si_context; #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) +#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) +#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 +/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ +#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 + /* SGPR user data indices */ enum { @@ -272,14 +279,10 @@ enum SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ -#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -323,6 +326,16 @@ enum si_color_output_type { SI_TYPE_UINT16, }; +union si_input_info { + struct { + ubyte semantic; + ubyte interpolate; + ubyte fp16_lo_hi_valid; + ubyte usage_mask; + }; + uint32_t _unused; /* this just forces 4-byte alignment */ +}; + struct si_shader_info { shader_info base; @@ -330,12 +343,8 @@ struct si_shader_info { ubyte num_inputs; ubyte num_outputs; - ubyte input_semantic[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS]; + union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; @@ -402,6 +411,13 @@ struct si_shader_info { * fragment shader invocations if flat shading. */ bool allow_flat_shading; + + /* Optimization: if the texture bound to this texunit has been cleared to 1, + * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the + * value is 0xff (undetermined) and can be later changed to 0 (= false) or + * texunit + 1. 
+ */ + uint8_t writes_1_if_tex_is_1; }; /* A shader selector is a gallium CSO and contains shader variants and @@ -439,7 +455,6 @@ struct si_shader_selector { ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; - bool prim_discard_cs_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -447,7 +462,6 @@ struct si_shader_selector { ubyte cs_num_images_in_user_sgprs; ubyte num_vs_inputs; ubyte num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ ubyte clipdist_mask; ubyte culldist_mask; @@ -521,7 +535,6 @@ struct si_vs_prolog_bits { uint16_t instance_divisor_is_one; /* bitmask of inputs */ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ unsigned ls_vgpr_fix : 1; - unsigned unpack_instance_id_from_vertex_id : 1; }; /* Common TCS bits between the shader key and the epilog key. */ @@ -571,10 +584,7 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned as_prim_discard_cs : 1; - unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ - unsigned gs_fast_launch_index_size_packed : 2; + unsigned load_vgprs_after_culling : 1; /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic : 1; } vs_prolog; @@ -633,9 +643,10 @@ struct si_shader_key { /* These three are initially set according to the NEXT_SHADER property, * or guessed if the property doesn't seem correct. */ - unsigned as_es : 1; /* export shader, which precedes GS */ - unsigned as_ls : 1; /* local shader, which precedes TCS */ - unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */ + unsigned as_es : 1; /* whether it's a shader before GS */ + unsigned as_ls : 1; /* whether it's VS before TCS */ + unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled, + also set for the stage right before GS */ /* Flags for monolithic compilation only. */ struct { @@ -666,7 +677,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * @@ -676,19 +687,6 @@ struct si_shader_key { */ unsigned prefer_mono : 1; - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs : 1; - unsigned cs_prim_type : 4; - unsigned cs_indexed : 1; - unsigned cs_instancing : 1; - unsigned cs_primitive_restart : 1; - unsigned cs_provoking_vertex_first : 1; - unsigned cs_need_correct_orientation : 1; - unsigned cs_cull_front : 1; - unsigned cs_cull_back : 1; - unsigned cs_cull_z : 1; - unsigned cs_halfz_clip_space : 1; - /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; @@ -707,6 +705,7 @@ struct si_shader_key { /* GCN-specific shader info. */ struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; signed char face_vgpr_index; @@ -736,7 +735,35 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; +#define SI_NUM_VGT_STAGES_KEY_BITS 5 +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) + +/* The VGT_SHADER_STAGES key used to index the table of precomputed values. 
+ * Some fields are set by state-change calls, most are set by draw_vbo. + */ +union si_vgt_stages_key { + struct { +#if UTIL_ARCH_LITTLE_ENDIAN + uint8_t tess : 1; + uint8_t gs : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg : 1; /* gfx10+ */ + uint8_t streamout : 1; /* only used with NGG */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; +#else /* UTIL_ARCH_BIG_ENDIAN */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; + uint8_t streamout : 1; + uint8_t ngg : 1; + uint8_t ngg_passthrough : 1; + uint8_t gs : 1; + uint8_t tess : 1; +#endif + } u; + uint8_t index; +}; + struct si_shader { + struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; @@ -748,7 +775,6 @@ struct si_shader { struct si_shader_part *prolog2; struct si_shader_part *epilog; - struct si_pm4_state *pm4; struct si_resource *bo; struct si_resource *scratch_bo; struct si_shader_key key; @@ -803,6 +829,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -819,6 +847,9 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; + union si_vgt_stages_key vgt_stages; } ngg; struct { @@ -839,6 +870,7 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned num_interp; } ps; } ctx_reg; @@ -884,17 +916,18 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); void si_nir_late_opts(nir_shader *nir); -void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); +char *si_finalize_nir(struct pipe_screen *screen, void *nirptr); /* si_state_shaders.c */ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, struct gfx9_gs_info *out); +bool gfx10_is_ngg_passthrough(struct si_shader *shader); /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) + const struct si_shader_key *key) { if (key->as_ls) return &sel->main_shader_part_ls; @@ -907,15 +940,6 @@ static inline struct si_shader **si_get_main_shader_part(struct si_shader_select return &sel->main_shader_part; } -static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader) -{ - struct si_shader_selector *sel = shader->selector; - - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); -} - static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { return selector ? selector->info.uses_bindless_samplers : false; @@ -926,6 +950,22 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel return selector ? 
selector->info.uses_bindless_images : false; } +static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) +{ + if (shader->selector->info.stage == MESA_SHADER_VERTEX && + !shader->selector->info.base.vs.blit_sgprs_amd && + !(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES)) + return true; + + return false; +} + +static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) +{ + return gfx10_edgeflags_have_effect(shader) && + shader->selector->info.writes_edgeflag; +} + #ifdef __cplusplus } #endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h index 46d8e69b9..3970125f5 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -30,8 +30,6 @@ struct pipe_debug_callback; -#define RADEON_LLVM_MAX_INPUTS 32 * 4 - /* Ideally pass the sample mask input to the PS epilog as v14, which * is its usual location, so that the shader doesn't have to add v_mov. */ @@ -60,8 +58,6 @@ struct si_shader_context { struct ac_shader_args args; struct ac_shader_abi abi; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - LLVMBasicBlockRef merged_wrap_if_entry_block; int merged_wrap_if_label; @@ -134,10 +130,6 @@ struct si_shader_context { /* API TES */ struct ac_arg tes_offchip_addr; - /* API GS */ - struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ /* PS */ struct ac_arg pos_fixed_pt; /* CS */ @@ -194,9 +186,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader); void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi); void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); @@ -242,7 +233,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi); void si_preload_esgs_ring(struct si_shader_context *ctx); void si_preload_gs_rings(struct si_shader_context *ctx); void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); @@ -250,7 +241,7 @@ void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi); void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context 
*ctx); void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); @@ -266,7 +257,6 @@ void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ -void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, LLVMValueRef const *so_write_offsets, struct pipe_stream_output *stream_out, @@ -275,7 +265,7 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_outp unsigned noutput, unsigned stream); void si_llvm_build_vs_exports(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput); -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi); void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c index 8420162ca..1a1dd07a5 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_nir_to_llvm.h" #include "ac_rtld.h" #include "si_pipe.h" @@ -93,9 +94,7 @@ bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, if (!si_replace_shader(count, binary)) { struct ac_compiler_passes *passes = compiler->passes; - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) + if (less_optimized && compiler->low_opt_passes) passes = compiler->low_opt_passes; struct si_llvm_diagnostics diag = {debug}; @@ -190,6 +189,7 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy } ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + ac_llvm_set_target_features(ctx->main_fn, &ctx->ac); } void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader) @@ -220,7 +220,7 @@ void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shade if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + if (USE_LDS_SYMBOLS) { /* The LSHS size is not known until draw time, so we append it * at the end of whatever LDS use there may be in the rest of * the shader (currently none, unless LLVM decides to do its @@ -412,7 +412,7 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - assert(ctx->shader->selector->info.base.cs.local_size_variable && + assert(ctx->shader->selector->info.base.workgroup_size_variable && ctx->shader->selector->info.uses_variable_block_size); LLVMValueRef chan[3] = { @@ -442,9 +442,7 @@ static void si_llvm_declare_compute_memory(struct si_shader_context *ctx) static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + if (nir->info.stage == MESA_SHADER_FRAGMENT) { unsigned colors_read = ctx->shader->selector->info.colors_read; 
LLVMValueRef main_fn = ctx->main_fn; @@ -491,7 +489,6 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader * si_llvm_declare_compute_memory(ctx); } - ctx->abi.inputs = &ctx->inputs[0]; ctx->abi.clamp_shadow_reference = true; ctx->abi.robust_buffer_access = true; ctx->abi.convert_undef_to_zero = true; @@ -808,9 +805,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); - /* Return the value from the last part. It's non-void only for the prim - * discard compute shader. - */ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) LLVMBuildRetVoid(builder); else @@ -902,12 +896,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* Unconditionally declare scratch space base for streamout and * vertex compaction. Whether space is actually allocated is * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. @@ -1091,7 +1081,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) { LLVMValueRef parts[4]; unsigned num_parts = 0; - bool has_prolog = false; + bool first_is_prolog = false; LLVMValueRef main_fn = ctx.main_fn; if (ngg_cull_main_fn) { @@ -1102,7 +1092,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; - has_prolog = true; + first_is_prolog = true; } parts[num_parts++] = ngg_cull_main_fn; } @@ -1114,21 +1104,31 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; - has_prolog = true; + if (num_parts == 1) + first_is_prolog = true; } parts[num_parts++] = main_fn; - si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); + si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { - LLVMValueRef parts[2]; + LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; + + /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. 
*/ + union si_shader_part_key prolog_key; + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5; + prolog_key.vs_prolog.as_ngg = 1; + prolog_key.vs_prolog.load_vgprs_after_culling = 1; + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + prolog = ctx.main_fn; parts[0] = ngg_cull_main_fn; - parts[1] = ctx.main_fn; + parts[1] = prolog; + parts[2] = main_fn; - si_build_wrapper_function(&ctx, parts, 2, 0, 0, false); + si_build_wrapper_function(&ctx, parts, 3, 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c index 18d8bca3c..450ee8348 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c @@ -24,11 +24,13 @@ #include "si_build_pm4.h" #include "si_query.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/fast_idiv_by_const.h" #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" #include "util/u_dual_blend.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -92,8 +94,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) sctx->last_cb_target_mask = cb_target_mask; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); radeon_end(); } @@ -445,6 +447,14 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); blend->logicop_enable = logicop_enable; + blend->allows_noop_optimization = + state->rt[0].rgb_func == PIPE_BLEND_ADD && + state->rt[0].alpha_func == PIPE_BLEND_ADD && + state->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_DST_COLOR && + state->rt[0].alpha_src_factor == PIPE_BLENDFACTOR_DST_COLOR && + state->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_ZERO && + state->rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_ZERO && + mode == V_028808_CB_NORMAL; unsigned num_shader_outputs = state->max_rt + 1; /* estimate */ if (blend->dual_src_blend) @@ -627,6 +637,79 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } +static bool si_check_blend_dst_sampler_noop(struct si_context *sctx) +{ + if (sctx->framebuffer.state.nr_cbufs == 1) { + struct si_shader_selector *sel = sctx->shader.ps.cso; + bool free_nir; + if (unlikely(sel->info.writes_1_if_tex_is_1 == 0xff)) { + struct nir_shader *nir = si_get_nir_shader(sel, NULL, &free_nir); + + /* Determine if this fragment shader always writes vec4(1) if a specific texture + * is all 1s. 
+ */ + float in[4] = { 1.0, 1.0, 1.0, 1.0 }; + float out[4]; + int texunit; + if (si_nir_is_output_const_if_tex_is_const(nir, in, out, &texunit) && + !memcmp(in, out, 4 * sizeof(float))) { + sel->info.writes_1_if_tex_is_1 = 1 + texunit; + } else { + sel->info.writes_1_if_tex_is_1 = 0; + } + + if (free_nir) + ralloc_free(nir); + } + + if (sel->info.writes_1_if_tex_is_1 && + sel->info.writes_1_if_tex_is_1 != 0xff) { + /* Now check if the texture is cleared to 1 */ + int unit = sctx->shader.ps.cso->info.writes_1_if_tex_is_1 - 1; + struct si_samplers *samp = &sctx->samplers[PIPE_SHADER_FRAGMENT]; + if ((1u << unit) & samp->enabled_mask) { + struct si_texture* tex = (struct si_texture*) samp->views[unit]->texture; + if (tex->is_depth && + tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) && + tex->depth_clear_value[0] == 1) { + return false; + } + /* TODO: handle color textures */ + } + } + } + + return true; +} + +static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); +} + +static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + static void si_bind_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -649,8 +732,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -664,6 +751,15 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->commutative_4bit != blend->commutative_4bit || old_blend->logicop_enable != blend->logicop_enable))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (likely(!radeon_uses_secure_bos(sctx->ws))) { + if (unlikely(blend->allows_noop_optimization)) { + si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop, + si_draw_vstate_blend_dst_sampler_noop); + } else { + si_install_draw_wrapper(sctx, NULL, NULL); + } + } } static void si_delete_blend_state(struct pipe_context *ctx, void *state) @@ -691,8 +787,8 @@ static void si_emit_blend_color(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t *)sctx->blend_color.color, 4); + 
radeon_set_context_reg_seq(R_028414_CB_BLEND_RED, 4); + radeon_emit_array((uint32_t *)sctx->blend_color.color, 4); radeon_end(); } @@ -725,8 +821,8 @@ static void si_emit_clip_state(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); - radeon_emit_array(cs, (uint32_t *)sctx->clip_state.ucp, 6 * 4); + radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4); radeon_end(); } @@ -741,7 +837,6 @@ static void si_emit_clip_regs(struct si_context *sctx) unsigned clipdist_mask = vs_sel->clipdist_mask; unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; unsigned culldist_mask = vs_sel->culldist_mask; - unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask; /* Clip distances on points have no effect, so need to be implemented * as cull distances. This applies for the clipvertex case as well. @@ -752,23 +847,14 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) | - S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && + unsigned pa_cl_cntl = S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && !sctx->screen->options.vrs2x2) | S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | clipdist_mask | (culldist_mask << 8); radeon_begin(&sctx->gfx_cs); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_cntl | vs->pa_cl_vs_out_cntl); radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); radeon_end_update_context_roll(sctx); @@ -834,15 +920,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast return NULL; } - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; @@ -862,9 +939,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->flatshade_first = state->flatshade_first; rs->sprite_coord_enable = state->sprite_coord_enable; rs->rasterizer_discard = state->rasterizer_discard; - rs->polygon_mode_enabled = - (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back == 
PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK)); @@ -882,24 +956,30 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); if (rs->rasterizer_discard) { - rs->ngg_cull_flags = SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED | + SI_NGG_CULL_FRONT_FACE | + SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; } else { - /* Polygon mode can't use view and small primitive culling, - * because it draws points or lines where the culling depends - * on the point or line width. - */ - if (!rs->polygon_mode_enabled) { - rs->ngg_cull_flags |= SI_NGG_CULL_VIEW_SMALLPRIMS; - rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_VIEW_SMALLPRIMS; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED; + rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; + + bool cull_front, cull_back; + + if (!state->front_ccw) { + cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + cull_front = !!(state->cull_face & PIPE_FACE_BACK); } - if (rs->cull_front) { + if (cull_front) { rs->ngg_cull_flags |= SI_NGG_CULL_FRONT_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_BACK_FACE; } - if (rs->cull_back) { + if (cull_back) { rs->ngg_cull_flags |= SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_FRONT_FACE; } @@ -942,7 +1022,10 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028A48_VPORT_SCISSOR_ENABLE(1) | S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + bool polygon_mode_enabled = + (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | @@ -951,11 +1034,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLY_MODE(polygon_mode_enabled) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? rs->polygon_mode_enabled : 0)); + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? 
polygon_mode_enabled : 0)); if (!rs->uses_poly_offset) return rs; @@ -991,11 +1074,12 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } } + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); } return rs; @@ -1044,6 +1128,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade) + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || @@ -1053,8 +1141,19 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } + + if (old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->flatshade != rs->flatshade) + si_update_vrs_flat_shading(sctx); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1079,14 +1178,15 @@ static void si_emit_stencil_ref(struct si_context *sctx) struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); + radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); radeon_end(); } @@ -1270,8 +1370,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) 
{ + si_ps_key_update_dsa(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || @@ -1446,8 +1550,8 @@ static void si_emit_db_render_state(struct si_context *sctx) /* * format translation */ -static uint32_t si_translate_colorformat(enum chip_class chip_class, - enum pipe_format format) +uint32_t si_translate_colorformat(enum chip_class chip_class, + enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); if (!desc) @@ -2234,6 +2338,13 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER); } + if (usage & PIPE_BIND_INDEX_BUFFER) { + if (format == PIPE_FORMAT_R8_UINT || + format == PIPE_FORMAT_R16_UINT || + format == PIPE_FORMAT_R32_UINT) + retval |= PIPE_BIND_INDEX_BUFFER; + } + if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) && !(usage & PIPE_BIND_DEPTH_STENCIL)) retval |= PIPE_BIND_LINEAR; @@ -2585,8 +2696,6 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) tex->dirty_level_mask |= 1 << surf->u.tex.level; tex->fmask_is_identity = false; } - if (tex->dcc_gather_statistics) - tex->separate_dcc_dirty = true; } } @@ -2658,15 +2767,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_update_fb_dirtiness_after_rendering(sctx); - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; - if (tex->dcc_gather_statistics) - vi_separate_dcc_stop_query(sctx, tex); - } - /* Disable DCC if the formats are incompatible. */ for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2823,12 +2923,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, p_atomic_inc(&tex->framebuffers_bound); - if (tex->dcc_gather_statistics) { - /* Dirty tracking must be enabled for DCC usage analysis. */ - sctx->framebuffer.compressed_cb_mask |= 1 << i; - vi_separate_dcc_start_query(sctx, tex); - } - /* Update the minimum but don't keep 0. */ if (!sctx->framebuffer.min_bytes_per_pixel || tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) @@ -2889,6 +2983,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (!sctx->sample_pos_buffer) { + sctx->sample_pos_buffer = pipe_buffer_create_with_data(&sctx->b, 0, PIPE_USAGE_DEFAULT, + sizeof(sctx->sample_positions), + &sctx->sample_positions); + } constbuf.buffer = sctx->sample_pos_buffer; /* Set sample locations as fragment shader constants. 
*/ @@ -2922,6 +3021,10 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -2953,7 +3056,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb = (struct si_surface *)state->cbufs[i]; if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); continue; } @@ -2969,11 +3072,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx) RADEON_PRIO_SEPARATE_META); } - if (tex->dcc_separate_buffer) - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->dcc_separate_buffer, - RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC, - RADEON_PRIO_SEPARATE_META); - /* Compute mutable surface parameters. */ cb_color_base = tex->buffer.gpu_address >> 8; cb_color_fmask = 0; @@ -3013,9 +3111,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (!is_msaa_resolve_dst) cb_color_info |= S_028C70_DCC_ENABLE(1); - cb_dcc_base = - ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.meta_offset) >> - 8; + cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; dcc_tile_swizzle &= ((1 << tex->surface.meta_alignment_log2) - 1) >> 8; @@ -3039,30 +3135,30 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028EE0_CMASK_PIPE_ALIGNED(1) | S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(0); /* hole */ + radeon_emit(0); /* hole */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(0); /* hole */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(0); /* hole */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + 
+ radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + radeon_set_context_reg(R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); } else if (sctx->chip_class == GFX9) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, @@ -3084,24 +3180,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028C74_RB_ALIGNED(meta.rb_aligned) | S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(R_0287A0_CB_MRT0_EPITCH + i * 4, S_0287A0_EPITCH(tex->surface.u.gfx9.epitch)); } else { /* Compute mutable surface parameters (GFX6-GFX8). 
*/ @@ -3145,29 +3241,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, sctx->chip_class >= GFX8 ? 14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); + radeon_emit(cb_dcc_base); } } for (; i < 8; i++) if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); /* ZS buffer. 
*/ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { @@ -3203,49 +3299,47 @@ static void si_emit_framebuffer_state(struct si_context *sctx) unsigned level = zb->base.u.tex.level; if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, - S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, - 
S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10); + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2); + radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ } else { /* GFX6-GFX8 */ /* Set fields dependent on tc_compatile_htile. */ @@ -3263,46 +3357,46 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } } - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info | /* DB_DEPTH_INFO */ + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */ S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile)); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */ } - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ + radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ + 
radeon_emit(fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + radeon_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); } else if (sctx->framebuffer.dirty_zsbuf) { if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 2); else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2); - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } /* Framebuffer dimensions. */ /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + radeon_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + if (sctx->screen->dpbb_allowed) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } radeon_end(); @@ -3508,14 +3602,15 @@ static void si_emit_msaa_config(struct si_context *sctx) } } - /* Required by OpenGL line rasterization. + /* The DX10 diamond test is optional in GL and decreases line rasterization + * performance, so don't use it. * * TODO: We should also enable perpendicular endcaps for AA lines, * but that requires implementing line stippling in the pixel * shader. SC can only do line stippling with axis-aligned * endcaps. */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_line_cntl = 0; unsigned sc_aa_config = 0; if (coverage_samples > 1) { @@ -3559,17 +3654,7 @@ static void si_emit_msaa_config(struct si_context *sctx) /* R_028A4C_PA_SC_MODE_CNTL_1 */ radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - - if (radeon_packets_added()) { - sctx->context_roll = true; - - /* GFX9: Flush DFSM when the AA mode changes. 
*/ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - } - radeon_end(); + radeon_end_update_context_roll(sctx); } void si_update_ps_iter_samples(struct si_context *sctx) @@ -3591,6 +3676,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) return; sctx->ps_iter_samples = min_samples; + + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; si_update_ps_iter_samples(sctx); @@ -3753,8 +3841,8 @@ static void gfx10_make_texture_descriptor( } if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); - img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; + assert(img_format == V_008F0C_GFX10_FORMAT_32_FLOAT); + img_format = V_008F0C_GFX10_FORMAT_32_FLOAT_CLAMP; } } else { util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); @@ -3818,43 +3906,43 @@ static void gfx10_make_texture_descriptor( #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) switch (FMASK(res->nr_samples, res->nr_storage_samples)) { case FMASK(2, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F1; break; case FMASK(2, 2): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F2; break; case FMASK(4, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F1; break; case FMASK(4, 2): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F2; break; case FMASK(4, 4): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F4; break; case FMASK(8, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S8_F1; break; case FMASK(8, 2): - format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; + format = V_008F0C_GFX10_FORMAT_FMASK16_S8_F2; break; case FMASK(8, 4): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F4; break; case FMASK(8, 8): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F8; break; case FMASK(16, 1): - format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; + format = V_008F0C_GFX10_FORMAT_FMASK16_S16_F1; break; case FMASK(16, 2): - format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; + format = V_008F0C_GFX10_FORMAT_FMASK32_S16_F2; break; case FMASK(16, 4): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; + format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F4; break; case FMASK(16, 8): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; + format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F8; break; default: unreachable("invalid nr_samples"); @@ -4223,7 +4311,7 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx unsigned force_level) { struct si_context *sctx = (struct si_context *)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_sampler_view *view = CALLOC_STRUCT_CL(si_sampler_view); struct si_texture *tex = (struct si_texture *)texture; unsigned base_level, first_level, last_level; unsigned char state_swizzle[4]; @@ -4357,7 +4445,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sample struct si_sampler_view *view = (struct si_sampler_view *)state; pipe_resource_reference(&state->texture, NULL); - FREE(view); + FREE_CL(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) @@ 
-4404,9 +4492,13 @@ static uint32_t si_translate_border_color(struct si_context *sctx, if (i >= SI_MAX_BORDER_COLORS) { /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. " - "Please file a bug.\n"); + static bool printed; + if (!printed) { + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "This is a hardware limitation.\n"); + printed = true; + } return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); } @@ -4552,9 +4644,9 @@ static void si_emit_sample_mask(struct si_context *sctx) (mask & 1 && sctx->blitter_running)); radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + radeon_set_context_reg_seq(R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(mask | (mask << 16)); + radeon_emit(mask | (mask << 16)); radeon_end(); } @@ -4606,8 +4698,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->count = count; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen); unsigned alloc_count = - count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0; + count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0; v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { @@ -4623,8 +4716,6 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, unsigned instance_divisor = elements[i].instance_divisor; if (instance_divisor) { - v->uses_instance_divisors = true; - if (instance_divisor == 1) { v->instance_divisor_is_one |= 1u << i; } else { @@ -4820,22 +4911,23 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_buffer_user_sgprs_dirty = false; } - if (old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || + if (old->instance_divisor_is_one != v->instance_divisor_is_one || + old->instance_divisor_is_fetched != v->instance_divisor_is_fetched || (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || + sizeof(v->vertex_buffer_index[0]) * MAX2(old->count, v->count))) || /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are * functions of fix_fetch and the src_offset alignment. * If they change and fix_fetch doesn't, it must be due to different * src_offset alignment, which is reflected in fix_fetch_opencode. */ old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)) + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * + MAX2(old->count, v->count))) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } if (v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; @@ -4931,8 +5023,82 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * be the case in well-behaved applications anyway. 
*/ if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) + (unaligned | orig_unaligned) & updated_mask)) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } +} + +static struct pipe_vertex_state * +si_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state); + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask, + &state->b); + + /* Initialize the vertex element state in state->element. + * Do it by creating a vertex element state object and copying it there. + */ + struct si_context ctx = {}; + ctx.b.screen = screen; + struct si_vertex_elements *velems = si_create_vertex_elements(&ctx.b, num_elements, elements); + state->velems = *velems; + si_delete_vertex_element(&ctx.b, velems); + + assert(!state->velems.instance_divisor_is_one); + assert(!state->velems.instance_divisor_is_fetched); + assert(!state->velems.fix_fetch_always); + assert(buffer->stride % 4 == 0); + assert(buffer->buffer_offset % 4 == 0); + assert(!buffer->is_user_buffer); + for (unsigned i = 0; i < num_elements; i++) { + assert(elements[i].src_offset % 4 == 0); + assert(!elements[i].dual_slot); + } + + for (unsigned i = 0; i < num_elements; i++) { + si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i, + &state->descriptors[i * 4]); + } + + return &state->b; +} + +static void si_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + +static struct pipe_vertex_state * +si_pipe_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, &sscreen->vertex_state_cache); +} + +static void si_pipe_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state); } /* @@ -4957,6 +5123,13 @@ static void si_set_tess_state(struct pipe_context *ctx, const float default_oute si_set_internal_const_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); } +static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct si_context *sctx = (struct si_context *)ctx; + + sctx->patch_vertices = patch_vertices; +} + static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; @@ -5086,6 +5259,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.texture_barrier = si_texture_barrier; sctx->b.set_min_samples = si_set_min_samples; sctx->b.set_tess_state = si_set_tess_state; + sctx->b.set_patch_vertices = si_set_patch_vertices; sctx->b.set_active_query_state = si_set_active_query_state; } @@ -5093,12 +5267,17 @@ void si_init_state_functions(struct si_context *sctx) void 
si_init_screen_state_functions(struct si_screen *sscreen) { sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.create_vertex_state = si_pipe_create_vertex_state; + sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy; if (sscreen->info.chip_class >= GFX10) { sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; } else { sscreen->make_texture_descriptor = si_make_texture_descriptor; } + + util_vertex_state_cache_init(&sscreen->vertex_state_cache, + si_create_vertex_state, si_vertex_state_destroy); } static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) @@ -5226,6 +5405,12 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_028034_BR_X(16384) | S_028034_BR_Y(16384)); } + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL, + S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF) | + S_028038_POPS_DRAIN_PS_ON_OVERLAP(1)); + } + unsigned cu_mask_ps = 0xffffffff; /* It's wasteful to enable all CUs for PS if shader arrays have a different @@ -5239,63 +5424,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa); if (sctx->chip_class >= GFX7) { - /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa; - unsigned late_alloc_wave64 = 0; /* The limit is per SA. */ - unsigned cu_mask_vs = 0xffff; - unsigned cu_mask_gs = 0xffff; - - if (sctx->chip_class >= GFX10) { - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - */ - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else { - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock. - * Others: CU1 must be disabled to prevent a hw deadlock. - * - * The deadlock is caused by late alloc, which usually increases - * performance. - */ - cu_mask_vs &= sctx->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) : - ~BITFIELD_RANGE(1, 1); - - /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ - if (sscreen->use_ngg && sctx->family != CHIP_NAVI14) - cu_mask_gs = cu_mask_vs; - } - } else { - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SA. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_wave64 = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - } - - if (late_alloc_wave64 > 2) - cu_mask_vs = 0xfffe; /* 1 CU disabled */ - } - - /* VS can't execute on one CU if the limit is > 2. 
*/ - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F)); } @@ -5316,6 +5444,21 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); } + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else if (sscreen->info.chip_class == GFX9) { + si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, + S_00B414_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, + S_00B214_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else { + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + } + if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); @@ -5354,6 +5497,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); + + si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | + S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); } if (sctx->chip_class >= GFX9) { diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h index ea31a2afd..a6daa158b 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h @@ -65,6 +65,7 @@ struct si_state_blend { bool alpha_to_one : 1; bool dual_src_blend : 1; bool logicop_enable : 1; + bool allows_noop_optimization : 1; }; struct si_state_rasterizer { @@ -95,11 +96,6 @@ struct si_state_rasterizer { unsigned rasterizer_discard : 1; unsigned scissor_enable : 1; unsigned clip_halfz : 1; - unsigned cull_front : 1; - unsigned cull_back : 1; - unsigned depth_clamp_any : 1; - unsigned provoking_vertex_first : 1; - unsigned polygon_mode_enabled : 1; unsigned polygon_mode_is_lines : 1; unsigned polygon_mode_is_points : 1; }; @@ -173,7 +169,6 @@ struct si_vertex_elements { uint16_t vb_alignment_check_mask; uint8_t count; - bool uses_instance_divisors; uint16_t first_vb_use_mask; /* Vertex buffer descriptor list size aligned for optimal prefetch. 
*/ @@ -188,13 +183,13 @@ union si_state { struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; + struct si_shader *ls; + struct si_shader *hs; + struct si_shader *es; + struct si_shader *gs; struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; + struct si_shader *vs; + struct si_shader *ps; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; @@ -254,12 +249,6 @@ struct si_shader_data { uint32_t sh_base[SI_NUM_SHADERS]; }; -#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ - (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \ - S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ - S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | \ - S_02881C_USE_VTX_VRS_RATE(1)) - /* The list of registers whose emitted values are remembered by si_context. */ enum si_tracked_reg { @@ -285,12 +274,11 @@ enum si_tracked_reg SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ + SI_TRACKED_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, SI_TRACKED_PA_SC_BINNER_CNTL_0, - SI_TRACKED_DB_DFSM_CONTROL, + SI_TRACKED_DB_VRS_OVERRIDE_CNTL, SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ @@ -347,7 +335,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; @@ -490,8 +481,10 @@ struct si_buffer_resources { void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, const struct legacy_surf_level *base_level_info, unsigned base_level, unsigned first_level, unsigned block_width, - bool is_stencil, uint16_t access, uint32_t *state); + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. 
*/ + bool is_stencil, uint16_t access, uint32_t * restrict state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader); void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, struct pipe_constant_buffer *cbuf); void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, @@ -527,6 +520,7 @@ struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, uns void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab); void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf); /* si_state.c */ +uint32_t si_translate_colorformat(enum chip_class chip_class, enum pipe_format format); void si_init_state_compute_functions(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx); void si_init_screen_state_functions(struct si_screen *sscreen); @@ -567,7 +561,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_shader_mem_ordered(struct si_shader *shader); -bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); @@ -578,18 +571,40 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, util_queue_execute_func execute); void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); -unsigned si_get_input_prim(const struct si_shader_selector *gs); +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state); +void si_vs_key_update_inputs(struct si_context *sctx); +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key); +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); +void si_update_ps_kill_enable(struct si_context *sctx); +void si_update_vrs_flat_shading(struct si_context *sctx); +unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key); bool si_update_ngg(struct si_context *sctx); - -/* si_state_draw.c */ -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); -void si_trace_emit(struct si_context *sctx); -void si_init_draw_functions(struct si_context *sctx); +void si_ps_key_update_framebuffer(struct si_context *sctx); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx); +void si_ps_key_update_blend_rasterizer(struct si_context *sctx); +void si_ps_key_update_rasterizer(struct si_context *sctx); +void si_ps_key_update_dsa(struct si_context *sctx); +void si_ps_key_update_sample_shading(struct si_context *sctx); 
+void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); +void si_init_tess_factor_ring(struct si_context *sctx); +bool si_update_gs_ring_buffers(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes); + +/* si_state_draw.cpp */ +void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems, + struct pipe_vertex_buffer *vb, unsigned element_index, + uint32_t *out); +void si_init_draw_functions_GFX6(struct si_context *sctx); +void si_init_draw_functions_GFX7(struct si_context *sctx); +void si_init_draw_functions_GFX8(struct si_context *sctx); +void si_init_draw_functions_GFX9(struct si_context *sctx); +void si_init_draw_functions_GFX10(struct si_context *sctx); +void si_init_draw_functions_GFX10_3(struct si_context *sctx); +void si_init_spi_map_functions(struct si_context *sctx); /* si_state_msaa.c */ void si_init_msaa_functions(struct si_context *sctx); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c index e5e5f1a65..921bd5446 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, shader_variant_flags |= 1 << 0; if (sel->nir) shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32) + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32) shader_variant_flags |= 1 << 2; if (sel->info.stage == MESA_SHADER_FRAGMENT && /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ @@ -78,11 +78,14 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, sel->info.base.fs.uses_discard && sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) shader_variant_flags |= 1 << 3; - if (sel->info.stage == MESA_SHADER_VERTEX) { - /* This varies depending on whether compute-based culling is enabled. */ - assert(sel->screen->num_vbos_in_user_sgprs <= 7); - shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4; - } + /* use_ngg_culling disables NGG passthrough for non-culling shaders to reduce context + * rolls, which can be changed with AMD_DEBUG=nonggc or AMD_DEBUG=nggc. 
+ */ + if (sel->screen->use_ngg_culling) + shader_variant_flags |= 1 << 4; + + /* bit gap */ + if (sel->screen->options.no_infinite_interp) shader_variant_flags |= 1 << 7; if (sel->screen->options.clamp_div_by_zero) @@ -370,7 +373,7 @@ bool si_shader_mem_ordered(struct si_shader *shader) } static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, - struct si_pm4_state *pm4) + struct si_shader *shader) { const struct si_shader_info *info = &tes->info; unsigned tes_prim_mode = info->base.tess.primitive_mode; @@ -427,10 +430,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad } else distribution_mode = V_028B6C_NO_DIST; - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); + shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -444,18 +446,16 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad * VS as ES | ES -> GS -> VS | 30 * TES as VS | LS -> HS -> VS | 14 or 30 * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30 - * - * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, - struct si_shader *shader, struct si_pm4_state *pm4) + struct si_shader *shader) { if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) return; /* VS as VS, or VS as ES: */ if ((sel->info.stage == MESA_SHADER_VERTEX && - (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + (!shader->key.as_ls && !shader->is_gs_copy_shader)) || /* TES as VS, or TES as ES: */ sel->info.stage == MESA_SHADER_TESS_EVAL) { unsigned vtx_reuse_depth = 30; @@ -464,25 +464,15 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_sh sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD) vtx_reuse_depth = 14; - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if (shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + si_pm4_clear_state(&shader->pm4); + shader->pm4.is_shader = true; + return &shader->pm4; } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, @@ -509,22 +499,30 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha assert(shader->selector->info.stage == MESA_SHADER_VERTEX || (shader->previous_stage_sel && shader->previous_stage_sel->info.stage == MESA_SHADER_VERTEX)); - /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). - * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) - * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). 
- * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or - * InstanceID) + /* GFX6-9 LS (VertexID, RelAutoIndex, InstanceID / StepRate0, InstanceID) + * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID) + * GFX10 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID) + * GFX10 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID) */ bool is_ls = shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->key.as_ls; + unsigned max = 0; - if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) - return 3; - else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) - return 2; - else if (is_ls || shader->info.uses_instanceid) - return 1; - else - return 0; + if (shader->info.uses_instanceid) { + if (sscreen->info.chip_class >= GFX10) + max = MAX2(max, 3); + else if (is_ls) + max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + else + max = MAX2(max, 1); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + } + + if (legacy_vs_prim_id) + max = MAX2(max, 2); /* VSPrimID */ + + if (is_ls) + max = MAX2(max, 1); /* RelAutoIndex */ + + return max; } static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) @@ -540,7 +538,6 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | @@ -565,10 +562,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.chip_class >= GFX9) { if (sscreen->info.chip_class >= GFX10) { si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); } else { si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); } unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); @@ -582,7 +577,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); } else { si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); - si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, + S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8)); shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) | S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); @@ -607,7 +603,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; + struct si_shader *shader = sctx->queued.named.es; if (!shader) return; @@ -656,7 +652,8 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 
1 : 0; si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | @@ -667,9 +664,9 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, @@ -767,7 +764,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -822,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); } radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. */ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -868,6 +879,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); + /* Copy over fields from the GS copy shader to make them easily accessible from GS. 
*/ + shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl; + va = shader->bo->gpu_address; if (sscreen->info.chip_class >= GFX9) { @@ -902,10 +916,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.chip_class >= GFX10) { si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); } else { si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); } uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | @@ -929,10 +941,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -943,12 +955,16 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); - si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, + S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | @@ -960,28 +976,25 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } } -static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) +bool gfx10_is_ngg_passthrough(struct si_shader *shader) { - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - radeon_begin(cs); - - if (sctx->chip_class == GFX10) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } + struct si_shader_selector *sel = shader->selector; - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - radeon_end(); + /* Never use NGG passthrough if culling is possible even when it's not used by this shader, + * so that we don't get context rolls when enabling and disabling NGG passthrough. 
+ */ + if (sel->screen->use_ngg_culling) + return false; - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + /* The definition of NGG passthrough is: + * - user GS is turned off (no amplification, no GS instancing, and no culling) + * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1) + * - vertex indices are packed into 1 VGPR + * - Dimgrey and later chips can optionally skip the gs_alloc_req message + * + * NGG passthrough still allows the use of LDS. + */ + return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.opt.ngg_culling; } /* Common tail code for NGG primitive shaders. */ @@ -1012,18 +1025,24 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, shader->ctx_reg.ngg.pa_cl_ngg_cntl); - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); radeon_end_update_context_roll(sctx); - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); + /* These don't cause a context roll. */ + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.ngg.ge_pc_alloc); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1032,7 +1051,7 @@ static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1046,7 +1065,7 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1060,7 +1079,7 @@ static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1075,7 +1094,7 @@ static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) gfx10_emit_shader_ngg_tail(sctx, shader); } -unsigned si_get_input_prim(const struct si_shader_selector *gs) +unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key) { if (gs->info.stage == MESA_SHADER_GEOMETRY) return gs->info.base.gs.input_primitive; @@ -1088,22 +1107,26 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs) return PIPE_PRIM_TRIANGLES; } - /* TODO: Set this correctly if the primitive type is set in the shader key. 
*/ + if (key->opt.ngg_culling & SI_NGG_CULL_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ } static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, const struct si_shader *shader, bool ngg) { - bool writes_psize = sel->info.writes_psize; - - if (shader) - writes_psize &= !shader->key.opt.kill_pointsize; - + /* Clip distances can be killed, but cull distances can't. */ + unsigned clipcull_mask = (sel->clipdist_mask & ~shader->key.opt.kill_clip_distances) | + sel->culldist_mask; + bool writes_psize = sel->info.writes_psize && !shader->key.opt.kill_pointsize; bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) || sel->screen->options.vrs2x2 || sel->info.writes_layer || sel->info.writes_viewport_index; - return S_02881C_USE_VTX_POINT_SIZE(writes_psize) | + + return S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipcull_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipcull_mask & 0xF0) != 0) | + S_02881C_USE_VTX_POINT_SIZE(writes_psize) | S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) | S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | @@ -1132,7 +1155,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_info->base.vs.window_space_position : 0; bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); - unsigned input_prim = si_get_input_prim(gs_sel); + unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); bool break_wave_at_eoi = false; struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -1174,7 +1197,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader * for the GL_LINE polygon mode to skip rendering lines on inner edges. */ if (gs_info->uses_invocationid || - (gs_stage == MESA_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) + (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader))) gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) || (gs_stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) @@ -1185,9 +1208,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ unsigned wave_size = si_get_shader_wave_size(shader); + unsigned late_alloc_wave64, cu_mask; + + ac_compute_late_alloc(&sscreen->info, true, shader->key.opt.ngg_culling, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); si_pm4_set_reg( pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | @@ -1205,32 +1232,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - /* Determine LATE_ALLOC_GS. */ - unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa; - unsigned late_alloc_wave64; /* The limit is per SA. */ - - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - * - * Don't use late alloc for NGG on Navi14 due to a hw bug. 
- */ - if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) - late_alloc_wave64 = 0; - else if (shader->key.opt.ngg_culling) - late_alloc_wave64 = num_cu_per_sh * 10; - else - late_alloc_wave64 = num_cu_per_sh * 4; - - /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */ - if (sscreen->info.chip_class == GFX10) - late_alloc_wave64 = MIN2(late_alloc_wave64, 64); - - /* Max number that fits into the register field. */ - late_alloc_wave64 = MIN2(late_alloc_wave64, 127); - - si_pm4_set_reg( - pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = @@ -1261,7 +1266,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader } if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); + si_set_tesseval_regs(sscreen, es_sel, shader); shader->ctx_reg.ngg.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | @@ -1275,59 +1280,55 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); - /* Always output hw-generated edge flags and pass them via the prim + /* Output hw-generated edge flags if needed and pass them via the prim * export to prevent drawing lines on internal edges of decomposed - * primitives (such as quads) with polygon mode = lines. Only VS needs - * this. + * primitives (such as quads) with polygon mode = lines. */ shader->ctx_reg.ngg.pa_cl_ngg_cntl = - S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_stage == MESA_SHADER_VERTEX) | + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gfx10_edgeflags_have_effect(shader)) | /* Reuse for NGG. */ S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true); /* Oversubscribe PC. This improves performance when there are too many varyings. */ - float oversub_pc_factor = 0.25; + unsigned oversub_pc_factor = 1; if (shader->key.opt.ngg_culling) { /* Be more aggressive with NGG culling. */ if (shader->info.nr_param_exports > 4) - oversub_pc_factor = 1; + oversub_pc_factor = 4; else if (shader->info.nr_param_exports > 2) - oversub_pc_factor = 0.75; + oversub_pc_factor = 3; else - oversub_pc_factor = 0.5; + oversub_pc_factor = 2; } - unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; - shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + unsigned oversub_pc_lines = + late_alloc_wave64 ? 
(sscreen->info.pc_lines / 4) * oversub_pc_factor : 0; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); - } else { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + /* On gfx10, the GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256 + * if we have enough LDS. + * + * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0. + */ + if ((sscreen->info.chip_class == GFX10) && + (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256 && + shader->ngg.hw_max_esverts > 5) { + /* This could be based on the input primitive type. 5 is the worst case + * for primitive types with adjacency. */ - if ((sscreen->info.chip_class == GFX10) && - (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); - } - } + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); } if (window_space) { @@ -1338,11 +1339,15 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); } + + shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; + shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs; + shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); } static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; + struct si_shader *shader = sctx->queued.named.vs; if (!shader) return; @@ -1385,16 +1390,15 @@ static void si_emit_shader_vs(struct si_context *sctx) S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); } - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + if (sctx->chip_class >= GFX10) { + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.vs.ge_pc_alloc); + radeon_end(); + } } /** @@ -1485,14 +1489,26 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, : V_02870C_SPI_SHADER_NONE) | S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE); - shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + unsigned late_alloc_wave64, cu_mask; + ac_compute_late_alloc(&sscreen->info, false, false, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); + + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) | S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false); oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0; + if (sscreen->info.chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); + } + si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); - si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, + S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | @@ -1530,9 +1546,9 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } static unsigned si_get_ps_num_interp(struct si_shader *ps) @@ -1567,7 +1583,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; + struct si_shader *shader = sctx->queued.named.ps; if (!shader) return; @@ -1695,10 +1711,13 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.ps.spi_ps_input_ena = input_ena; shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + unsigned num_interp = si_get_ps_num_interp(shader); + /* Set interpolation controls. 
*/ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + shader->ctx_reg.ps.num_interp = num_interp; shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; shader->ctx_reg.ps.spi_shader_z_format = @@ -1708,7 +1727,8 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); - si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, + S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) | @@ -1764,31 +1784,41 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader } } -static unsigned si_get_alpha_test_func(struct si_context *sctx) +static void si_clear_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - /* Alpha-test should be disabled if colorbuffer 0 is integer. */ - return sctx->queued.named.dsa->alpha_func; + prolog_key->instance_divisor_is_one = 0; + prolog_key->instance_divisor_is_fetched = 0; + key->mono.vs_fetch_opencode = 0; + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); } -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) +void si_vs_key_update_inputs(struct si_context *sctx) { - if (vs->info.base.vs.blit_sgprs_amd) + struct si_shader_selector *vs = sctx->shader.vs.cso; + struct si_vertex_elements *elts = sctx->vertex_elements; + struct si_shader_key *key = &sctx->shader.vs.key; + + if (!vs) return; - struct si_vertex_elements *elts = sctx->vertex_elements; + if (vs->info.base.vs.blit_sgprs_amd) { + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); + key->opt.prefer_mono = 0; + sctx->uses_nontrivial_vs_prolog = false; + return; + } - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; - prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing; + bool uses_nontrivial_vs_prolog = false; - /* Prefer a monolithic shader to allow scheduling divisions around - * VBO loads. 
*/ - if (prolog_key->instance_divisor_is_fetched) - key->opt.prefer_mono = 1; + if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched) + uses_nontrivial_vs_prolog = true; + + key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; + key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + key->opt.prefer_mono = elts->instance_divisor_is_fetched; - unsigned count = MIN2(vs->info.num_inputs, elts->count); - unsigned count_mask = (1 << count) - 1; + unsigned count_mask = (1 << vs->info.num_inputs) - 1; unsigned fix = elts->fix_fetch_always & count_mask; unsigned opencode = elts->fix_fetch_opencode & count_mask; @@ -1807,19 +1837,49 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto } } + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); + while (fix) { unsigned i = u_bit_scan(&fix); - key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + uint8_t fix_fetch = elts->fix_fetch[i]; + + key->mono.vs_fix_fetch[i].bits = fix_fetch; + if (fix_fetch) + uses_nontrivial_vs_prolog = true; } key->mono.vs_fetch_opencode = opencode; + if (opencode) + uses_nontrivial_vs_prolog = true; + + sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog; + + /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores + * the current vertex buffers and vertex elements. + * + * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog, + * so that we know whether the VS prolog should be updated when we switch from + * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state. + * This should happen rarely because the VS prolog should be trivial in most + * cases. + */ + if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog) + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); } -static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key) +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - struct si_shader_selector *ps = sctx->shader.ps.cso; + prolog_key->instance_divisor_is_one = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_fetched; - key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + key->mono.vs_fetch_opencode = sctx->shader.vs.key.mono.vs_fetch_opencode; + memcpy(key->mono.vs_fix_fetch, sctx->shader.vs.key.mono.vs_fix_fetch, + sizeof(key->mono.vs_fix_fetch)); +} + +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) +{ + struct si_shader_selector *ps = sctx->shader.ps.cso; /* Find out if PS is disabled. */ bool ps_disabled = true; @@ -1827,273 +1887,314 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad bool ps_modifies_zs = ps->info.base.fs.uses_discard || ps->info.writes_z || ps->info.writes_stencil || ps->info.writes_samplemask || sctx->queued.named.blend->alpha_to_coverage || - si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; + sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS; unsigned ps_colormask = si_get_total_colormask(sctx); ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } - /* Find out which VS outputs aren't used by the PS. 
*/ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; +} - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(VARYING_SLOT_POS, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_PSIZ, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_CLIP_VERTEX, true))); +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } + key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; - uint64_t linked = outputs_written & inputs_read; + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; key->opt.kill_outputs = ~linked & outputs_written; if (vs->info.stage != MESA_SHADER_GEOMETRY) { key->opt.ngg_culling = sctx->ngg_culling; - - if (sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; + key->mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; + } else { + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; } - /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */ - if (sctx->chip_class >= GFX10 && - vs->info.writes_psize && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - !sctx->queued.named.rasterizer->polygon_mode_is_points) - key->opt.kill_pointsize = 1; + key->opt.kill_pointsize = vs->info.writes_psize && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + !sctx->queued.named.rasterizer->polygon_mode_is_points; } -/* Compute the key for the hw shader variant */ -static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, - union si_vgt_stages_key stages_key, - struct si_shader_key *key) +static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) { - struct si_context *sctx = (struct si_context *)ctx; + key->opt.kill_clip_distances = 0; + key->opt.kill_outputs = 0; + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; + key->opt.kill_pointsize = 0; +} + +void si_ps_key_update_framebuffer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; - memset(key, 0, sizeof(*key)); + if (!sel) + return; - unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; - if (num_inlinable_uniforms && - sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { - key->opt.inline_uniforms = true; - memcpy(key->opt.inlined_uniform_values, - sctx->inlinable_uniforms[sel->pipe_shader_type], - num_inlinable_uniforms * 4); + if (sel->info.color0_writes_all_cbufs && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + else + key->part.ps.epilog.last_cbuf = 0; + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. 
*/ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } else { + key->mono.u.ps.fbfetch_msaa = 0; + key->mono.u.ps.fbfetch_is_1D = 0; + key->mono.u.ps.fbfetch_layered = 0; } +} - switch (sel->info.stage) { - case MESA_SHADER_VERTEX: - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; - if (sctx->shader.tes.cso) - key->as_ls = 1; - else if (sctx->shader.gs.cso) { - key->as_es = 1; - key->as_ngg = stages_key.u.ngg; - } else { - key->as_ngg = stages_key.u.ngg; - si_shader_selector_key_hw_vs(sctx, sel, key); - } - break; - case MESA_SHADER_TESS_CTRL: - if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog); - key->part.tcs.ls = sctx->shader.vs.cso; + if (!sel) + return; - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; + /* Select the shader color format based on whether + * blending or alpha are needed. + */ + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * This also enables jumping over the VS prolog for HS-only waves. - */ - key->opt.prefer_mono = 1; - key->opt.same_patch_vertices = sctx->same_patch_vertices; - } + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. 
+ */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - key->part.tcs.epilog.prim_mode = - sctx->shader.tes.cso->info.base.tess.primitive_mode; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = sctx->shader.tes.cso->info.reads_tess_factors; + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; - break; - case MESA_SHADER_TESS_EVAL: - key->as_ngg = stages_key.u.ngg; + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } - if (sctx->shader.gs.cso) - key->as_es = 1; - else { - si_shader_selector_key_hw_vs(sctx, sel, key); - } - break; - case MESA_SHADER_GEOMETRY: - if (sctx->chip_class >= GFX9) { - if (sctx->shader.tes.cso) { - key->part.gs.es = sctx->shader.tes.cso; - } else { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.gs.vs_prolog); - key->part.gs.es = sctx->shader.vs.cso; - } + /* Eliminate shader code computing output values that are unused. + * This enables dead code elimination between shader parts. + * Check if any output is eliminated. + */ + if (sel->colors_written_4bit & + ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) + key->opt.prefer_mono = 1; + else + key->opt.prefer_mono = 0; +} - key->as_ngg = stages_key.u.ngg; +void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - /* Only NGG can eliminate GS outputs, because the code is shared with VS. */ - if (stages_key.u.ngg) - si_shader_selector_key_hw_vs(sctx, sel, key); + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; +} - /* This enables jumping over the VS prolog for GS-only waves. */ - key->opt.prefer_mono = 1; - } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; - break; - case MESA_SHADER_FRAGMENT: { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; +void si_ps_key_update_rasterizer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (sel->info.color0_writes_all_cbufs && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + if (!sel) + return; - /* Select the shader color format based on whether - * blending or alpha are needed. 
- */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; +} - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. - */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; +void si_ps_key_update_dsa(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } + key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } +static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - /* Eliminate shader code computing output values that are unused. - * This enables dead code elimination between shader parts. - * Check if any output is eliminated. 
- */ - if (sel->colors_written_4bit & - ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) - key->opt.prefer_mono = 1; + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; +} - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; +void si_ps_key_update_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; + if (!sel) + return; - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = - ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + else + key->part.ps.prolog.samplemask_log_ps_iter = 0; +} - if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); - } +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - bool uses_persp_center = sel->info.uses_persp_center || - (!rs->flatshade && sel->info.uses_persp_center_color); - bool uses_persp_centroid = sel->info.uses_persp_centroid || - (!rs->flatshade && sel->info.uses_persp_centroid_color); - bool uses_persp_sample = sel->info.uses_persp_sample || - (!rs->flatshade && sel->info.uses_persp_sample_color); - - if (rs->force_persample_interp && rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - uses_persp_center || uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - uses_persp_center && uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute more than 1 pair - * of (i,j), which is the optimization here. 
*/ - key->part.ps.prolog.force_persp_center_interp = uses_persp_center + - uses_persp_centroid + - uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_interp_at_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; + if (!sel) + return; + + bool uses_persp_center = sel->info.uses_persp_center || + (!rs->flatshade && sel->info.uses_persp_center_color); + bool uses_persp_centroid = sel->info.uses_persp_centroid || + (!rs->flatshade && sel->info.uses_persp_centroid_color); + bool uses_persp_sample = sel->info.uses_persp_sample || + (!rs->flatshade && sel->info.uses_persp_sample_color); + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + uses_persp_center || uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = + uses_persp_center && uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. 
*/ + key->part.ps.prolog.force_persp_center_interp = uses_persp_center + + uses_persp_centroid + + uses_persp_sample > 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > 1; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; + } +} + +/* Compute the key for the hw shader variant */ +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, + struct si_shader_key *key) +{ + struct si_context *sctx = (struct si_context *)ctx; + + switch (sel->info.stage) { + case MESA_SHADER_VERTEX: + if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_TESS_CTRL: + if (sctx->chip_class >= GFX9) { + si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog); + key->part.tcs.ls = sctx->shader.vs.cso; } + break; + case MESA_SHADER_TESS_EVAL: + if (!sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_GEOMETRY: + if (sctx->chip_class >= GFX9) { + if (sctx->shader.tes.cso) { + si_clear_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->shader.tes.cso; + } else { + si_get_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->shader.vs.cso; + } - key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. */ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = - sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = - tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; + /* Only NGG can eliminate GS outputs, because the code is shared with VS. 
*/ + if (sctx->ngg) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); } break; - } + case MESA_SHADER_FRAGMENT: + si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); + break; default: assert(0); } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); } static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) @@ -2138,7 +2239,7 @@ static void si_build_shader_variant(struct si_shader *shader, int thread_index, si_shader_init_pm4_state(sscreen, shader); } -static void si_build_shader_variant_low_priority(void *job, int thread_index) +static void si_build_shader_variant_low_priority(void *job, void *gdata, int thread_index) { struct si_shader *shader = (struct si_shader *)job; @@ -2151,7 +2252,7 @@ static const struct si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) + const struct si_shader_key *key) { struct si_shader **mainp = si_get_main_shader_part(sel, key); @@ -2182,6 +2283,16 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad return true; } +/* A helper to copy *key to *local_key and return local_key. */ +static const struct si_shader_key * +use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_key) +{ + if (key != local_key) + memcpy(local_key, key, sizeof(*key)); + + return local_key; +} + /** * Select a shader variant according to the shader key. * @@ -2189,14 +2300,26 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad * the compilation isn't finished, don't select any * shader and return an error. */ -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none) +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none) { + struct si_screen *sscreen = sctx->screen; struct si_shader_selector *sel = state->cso; struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; + /* si_shader_select_with_key must not modify 'key' because it would affect future shaders. + * If we need to modify it for this specific shader (eg: to disable optimizations), we + * use a copy. + */ + struct si_shader_key local_key; + + if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) { + /* Disable shader variant optimizations. */ + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); + } again: /* Check if we don't need to change anything. @@ -2209,7 +2332,8 @@ again: if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto current_not_ready; } @@ -2248,9 +2372,10 @@ current_not_ready: key->opt.inlined_uniform_values, MAX_INLINABLE_UNIFORMS * 4) != 0) { if (variant_count++ > max_inline_uniforms_variants) { + key = use_local_key_copy(key, &local_key); /* Too many variants. Disable inlining for this shader. 
*/ - key->opt.inline_uniforms = 0; - memset(key->opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); + local_key.opt.inline_uniforms = 0; + memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); simple_mtx_unlock(&sel->mutex); goto again; } @@ -2267,7 +2392,9 @@ current_not_ready: if (iter->is_optimized) { if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto again; } @@ -2292,9 +2419,14 @@ current_not_ready: util_queue_fence_init(&shader->ready); + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + shader->selector = sel; shader->key = *key; - shader->compiler_ctx_state = *compiler_state; + shader->compiler_ctx_state.compiler = &sctx->compiler; + shader->compiler_ctx_state.debug = sctx->debug; + shader->compiler_ctx_state.is_debug_context = sctx->is_debug; /* If this is a merged shader, get the first shader's selector. */ if (sscreen->info.chip_class >= GFX9) { @@ -2313,10 +2445,8 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. */ - if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + if (!is_pure_monolithic) { bool ok = true; /* Make sure the main shader part is present. This is needed @@ -2342,12 +2472,13 @@ current_not_ready: } simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, + &shader1_key); simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { - ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, key); } if (!ok) { @@ -2370,8 +2501,7 @@ current_not_ready: shader->is_monolithic = is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - /* The prim discard CS is always optimized. */ - shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ @@ -2391,7 +2521,8 @@ current_not_ready: } /* Use the default (unoptimized) shader for now. */ - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); simple_mtx_unlock(&sel->mutex); if (sscreen->options.sync_compile) @@ -2426,15 +2557,12 @@ current_not_ready: return shader->compilation_failed ? 
-1 : 0; } -static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - union si_vgt_stages_key stages_key, - struct si_compiler_ctx_state *compiler_state) +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_key key; - si_shader_selector_key(ctx, state->cso, stages_key, &key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false); + si_shader_selector_key(ctx, state->cso, &state->key); + return si_shader_select_with_key(sctx, state, &state->key, -1, false); } static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, @@ -2477,7 +2605,7 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, boo * si_shader_selector initialization. Since it can be done asynchronously, * there is no way to report compile failures to applications. */ -static void si_init_shader_selector_async(void *job, int thread_index) +static void si_init_shader_selector_async(void *job, void *gdata, int thread_index) { struct si_shader_selector *sel = (struct si_shader_selector *)job; struct si_screen *sscreen = sel->screen; @@ -2492,6 +2620,19 @@ static void si_init_shader_selector_async(void *job, int thread_index) if (!compiler->passes) si_init_compiler(sscreen, compiler); + /* The GS copy shader is always pre-compiled. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + /* Serialize NIR to save memory. Monolithic shader variants * have to deserialize NIR before compilation. */ @@ -2576,14 +2717,16 @@ static void si_init_shader_selector_async(void *job, int thread_index) unsigned i; for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; + unsigned semantic = sel->info.output_semantic[i]; + unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic]; - if (offset <= AC_EXP_PARAM_OFFSET_31) + /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */ + if (G_028644_OFFSET(ps_input_cntl) != 0x20) continue; - unsigned semantic = sel->info.output_semantic[i]; unsigned id; + /* Remove the output from the mask. */ if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_POS && semantic != VARYING_SLOT_PSIZ && @@ -2596,19 +2739,6 @@ static void si_init_shader_selector_async(void *job, int thread_index) } } - /* The GS copy shader is always pre-compiled. */ - if (sel->info.stage == MESA_SHADER_GEOMETRY && - (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - /* Free NIR. We only keep serialized NIR after this point. 
*/ if (sel->nir) { ralloc_free(sel->nir); @@ -2724,18 +2854,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd ? sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && !sel->info.base.vs.blit_sgprs_amd; - sel->prim_discard_cs_allowed = - sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && !sel->info.base.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.base.vs.window_space_position && !sel->so.num_outputs; - if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_CTRL || sel->info.stage == MESA_SHADER_TESS_EVAL || @@ -2756,8 +2881,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_EDGE) { sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); - sel->outputs_written_before_ps |= 1ull - << si_shader_io_get_unique_index(semantic, true); + + /* Ignore outputs that are not passed from VS to PS. */ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } } } } @@ -2824,7 +2955,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, case MESA_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_inputs; i++) { - unsigned semantic = sel->info.input_semantic[i]; + unsigned semantic = sel->info.input[i].semantic; if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_PNTC) { @@ -2837,9 +2968,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->colors_written_4bit |= 0xf << (4 * i); for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic[i] == VARYING_SLOT_COL0) + if (sel->info.input[i].semantic == VARYING_SLOT_COL0) sel->color_attr_index[0] = i; - else if (sel->info.input_semantic[i] == VARYING_SLOT_COL1) + else if (sel->info.input[i].semantic == VARYING_SLOT_COL1) sel->color_attr_index[1] = i; } break; @@ -2868,25 +2999,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sscreen->info.chip_class == GFX10_3 || (sscreen->info.chip_class == GFX10 && sscreen->info.is_pro_graphics)) { - /* Rough estimates. 
*/ - switch (sctx->family) { - case CHIP_NAVI10: - case CHIP_NAVI12: - case CHIP_SIENNA_CICHLID: - sel->ngg_cull_vert_threshold = 511; - break; - case CHIP_NAVI14: - case CHIP_NAVY_FLOUNDER: - case CHIP_DIMGREY_CAVEFISH: - case CHIP_VANGOGH: - sel->ngg_cull_vert_threshold = 255; - break; - default: - assert(!sscreen->use_ngg_culling); - } + sel->ngg_cull_vert_threshold = 128; } } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { - if (sel->rast_prim == PIPE_PRIM_TRIANGLES && + if (sel->rast_prim != PIPE_PRIM_POINTS && (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) || sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS) || sscreen->info.chip_class == GFX10_3)) @@ -2894,10 +3010,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } } - /* PA_CL_VS_OUT_CNTL */ - if (sctx->chip_class <= GFX9) - sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false); - sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : u_bit_consecutive(0, sel->info.base.clip_distance_array_size); sel->culldist_mask = u_bit_consecutive(0, sel->info.base.cull_distance_array_size) << @@ -3005,11 +3117,10 @@ static void si_update_clip_regs(struct si_context *sctx, struct si_shader_select (!old_hw_vs || (old_hw_vs->info.stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) != (next_hw_vs->info.stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) || - old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant || !next_hw_vs_variant || - old_hw_vs_variant->key.opt.kill_clip_distances != next_hw_vs_variant->key.opt.kill_clip_distances)) + old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl)) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); } @@ -3053,9 +3164,10 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha si_shader_uses_bindless_images(sctx->shader.tcs.cso) || si_shader_uses_bindless_images(sctx->shader.tes.cso); - /* Invalidate inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << type); + if (type == PIPE_SHADER_VERTEX || type == PIPE_SHADER_TESS_EVAL || type == PIPE_SHADER_GEOMETRY) + sctx->ngg_culling = 0; /* this will be enabled on the first draw if needed */ + si_invalidate_inlinable_uniforms(sctx, type); sctx->do_update_shaders = true; } @@ -3073,6 +3185,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->shader.vs.current = sel ? sel->first_variant : NULL; sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; + sctx->fixed_func_tcs_shader.key.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0; if (si_update_ngg(sctx)) si_shader_change_notify(sctx); @@ -3084,6 +3197,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, si_get_vs(sctx)->current); si_update_rasterized_prim(sctx); + si_vs_key_update_inputs(sctx); } static void si_update_tess_uses_prim_id(struct si_context *sctx) @@ -3118,7 +3232,7 @@ bool si_update_ngg(struct si_context *sctx) * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring * pointers are set. 
*/ - if ((sctx->chip_class == GFX10 || sctx->family == CHIP_SIENNA_CICHLID) && !new_ngg) { + if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) { sctx->flags |= SI_CONTEXT_VGT_FLUSH; if (sctx->chip_class == GFX10) { /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */ @@ -3179,6 +3293,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->shader.tcs.cso = sel; sctx->shader.tcs.current = sel ? sel->first_variant : NULL; + sctx->shader.tcs.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sel ? sel->info.tessfactors_are_def_in_all_invocs : 0; si_update_tess_uses_prim_id(sctx); si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); @@ -3203,6 +3319,14 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; si_update_tess_uses_prim_id(sctx); + sctx->shader.tcs.key.part.tcs.epilog.prim_mode = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.prim_mode = + sel ? sel->info.base.tess.primitive_mode : 0; + + sctx->shader.tcs.key.part.tcs.epilog.tes_reads_tess_factors = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.tes_reads_tess_factors = + sel ? sel->info.reads_tess_factors : 0; + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); si_select_draw_vbo(sctx); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ @@ -3219,6 +3343,41 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) si_update_rasterized_prim(sctx); } +void si_update_ps_kill_enable(struct si_context *sctx) +{ + if (!sctx->shader.ps.cso) + return; + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } +} + +void si_update_vrs_flat_shading(struct si_context *sctx) +{ + if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } +} + static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -3247,6 +3406,17 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } si_update_ps_colorbuf0_slot(sctx); + + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); + si_update_vrs_flat_shading(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) @@ -3257,55 +3427,55 @@ static void 
si_delete_shader(struct si_context *sctx, struct si_shader *shader) util_queue_fence_destroy(&shader->ready); - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. - */ - switch (shader->selector->info.stage) { - case MESA_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ls)); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_TESS_CTRL: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(hs)); - break; - case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - else - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - break; - case MESA_SHADER_FRAGMENT: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ps)); - break; - default:; + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. 
+ */ + int state_index = -1; + + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.as_ls) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(ls); + } else if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_TESS_CTRL: + state_index = SI_STATE_IDX(hs); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); } + break; + case MESA_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + state_index = SI_STATE_IDX(vs); + else + state_index = SI_STATE_IDX(gs); + break; + case MESA_SHADER_FRAGMENT: + state_index = SI_STATE_IDX(ps); + break; + default:; } si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); si_shader_destroy(shader); - free(shader); + si_pm4_free_state(sctx, &shader->pm4, state_index); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) @@ -3354,128 +3524,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) si_shader_selector_reference(sctx, &sel, NULL); } -static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, - unsigned semantic, enum glsl_interp_mode interpolate, - ubyte fp16_lo_hi_mask) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - unsigned offset, ps_input_cntl = 0; - - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && sctx->flatshade) || - semantic == VARYING_SLOT_PRIMITIVE_ID) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - sctx->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; - if (vs_slot >= 0) { - offset = vs->info.vs_output_param_offset[vs_slot]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - if (offset == AC_EXP_PARAM_UNDEFINED) { - /* This can happen with depth-only rendering. */ - offset = 0; - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - } - - ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); - } - - if (fp16_lo_hi_mask && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | - S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ - S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); - } - } else { - /* VS output not found. */ - if (semantic == VARYING_SLOT_PRIMITIVE_ID) { - /* PrimID is written after the last output when HW VS is used. 
*/ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. GL is undefined */ - if (semantic == VARYING_SLOT_COL0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); - } - } - - return ps_input_cntl; -} - -static void si_emit_spi_map(struct si_context *sctx) -{ - struct si_shader *ps = sctx->shader.ps.current; - struct si_shader *vs; - struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - /* With legacy GS, only the GS copy shader contains information about param exports. */ - if (sctx->shader.gs.cso && !sctx->ngg) - vs = sctx->shader.gs.cso->gs_copy_shader; - else - vs = si_get_vs(sctx)->current; - - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - - for (i = 0; i < psinfo->num_inputs; i++) { - unsigned semantic = psinfo->input_semantic[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - ubyte fp16_lo_hi_mask = psinfo->input_fp16_lo_hi_valid[i]; - - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate, - fp16_lo_hi_mask); - } - - if (ps->key.part.ps.prolog.color_two_side) { - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - unsigned semantic = VARYING_SLOT_BFC0 + i; - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, - psinfo->color_interpolate[i], - false); - } - } - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - radeon_end_update_context_roll(sctx); -} - /** * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. */ @@ -3505,17 +3553,17 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) radeon_begin(cs); /* This is required before VGT_FLUSH. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); radeon_end(); } /* Initialize state related to ESGS / GSVS ring buffers */ -static bool si_update_gs_ring_buffers(struct si_context *sctx) +bool si_update_gs_ring_buffers(struct si_context *sctx) { struct si_shader_selector *es = sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; @@ -3610,11 +3658,11 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) /* Set the GS registers. 
*/ if (sctx->esgs_ring) { assert(sctx->chip_class <= GFX8); - radeon_set_uconfig_reg(cs, R_030900_VGT_ESGS_RING_SIZE, + radeon_set_uconfig_reg(R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); } if (sctx->gsvs_ring) { - radeon_set_uconfig_reg(cs, R_030904_VGT_GSVS_RING_SIZE, + radeon_set_uconfig_reg(R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); } radeon_end(); @@ -3718,11 +3766,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *s return 1; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) -{ - return shader ? shader->config.scratch_bytes_per_wave : 0; -} - static struct si_shader *si_get_tcs_current(struct si_context *sctx) { if (!sctx->shader.tes.cso) @@ -3745,19 +3788,19 @@ static bool si_update_scratch_relocs(struct si_context *sctx) if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); r = si_update_scratch_buffer(sctx, sctx->shader.gs.current); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); r = si_update_scratch_buffer(sctx, tcs); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); + si_pm4_bind_state(sctx, hs, tcs); /* VS can be bound as LS, ES, or VS. */ r = si_update_scratch_buffer(sctx, sctx->shader.vs.current); @@ -3765,13 +3808,13 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.vs.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); } /* TES can be bound as ES or VS. */ @@ -3780,17 +3823,17 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.tes.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); else if (sctx->shader.tes.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); } return true; } -static bool si_update_spi_tmpring_size(struct si_context *sctx) +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. * There are 2 cases to handle: @@ -3805,17 +3848,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) * Otherwise, the number of waves that can use scratch is * SPI_TMPRING_SIZE.WAVES. 
*/ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.ps.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.gs.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.vs.current)); - - if (sctx->shader.tes.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.tes.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; @@ -3834,7 +3866,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) if (!sctx->scratch_buffer) return false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b); } @@ -3855,7 +3886,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) return true; } -static void si_init_tess_factor_ring(struct si_context *sctx) +void si_init_tess_factor_ring(struct si_context *sctx) { assert(!sctx->tess_rings); assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); @@ -3893,17 +3924,17 @@ static void si_init_tess_factor_ring(struct si_context *sctx) /* Set tessellation registers. */ radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, + radeon_set_uconfig_reg(R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); - radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + radeon_set_uconfig_reg(R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); if (sctx->chip_class >= GFX10) { - radeon_set_uconfig_reg(cs, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI_UMD, S_030984_BASE_HI(factor_va >> 40)); } else if (sctx->chip_class == GFX9) { - radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI, + radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(factor_va >> 40)); } - radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, + radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->vgt_hs_offchip_param); radeon_end(); return; @@ -3955,8 +3986,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } -static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key) { struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); uint32_t stages = 0; @@ -3977,7 +4007,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, } if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) | S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough && @@ -3988,9 +4018,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, if (screen->info.chip_class >= GFX9) stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); - if (screen->info.chip_class >= GFX10 && - /* GS fast launch hangs with Wave64, so always use Wave32. 
*/
-       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
+   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);
@@ -4000,293 +4028,12 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
    return pm4;
 }
 
-static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key)
-{
-   struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
-
-   if (unlikely(!*pm4))
-      *pm4 = si_build_vgt_shader_config(sctx->screen, key);
-   si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
-}
-
-bool si_update_shaders(struct si_context *sctx)
-{
-   struct pipe_context *ctx = (struct pipe_context *)sctx;
-   struct si_compiler_ctx_state compiler_state;
-   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-   struct si_shader *old_vs = si_get_vs(sctx)->current;
-   unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0;
-   struct si_shader *old_ps = sctx->shader.ps.current;
-   union si_vgt_stages_key key;
-   unsigned old_spi_shader_col_format =
-      old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
-   int r;
-
-   if (!sctx->compiler.passes)
-      si_init_compiler(sctx->screen, &sctx->compiler);
-
-   compiler_state.compiler = &sctx->compiler;
-   compiler_state.debug = sctx->debug;
-   compiler_state.is_debug_context = sctx->is_debug;
-
-   key.index = 0;
-
-   if (sctx->shader.tes.cso)
-      key.u.tess = 1;
-   if (sctx->shader.gs.cso)
-      key.u.gs = 1;
-
-   if (sctx->ngg) {
-      key.u.ngg = 1;
-      key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
-   }
-
-   /* Update TCS and TES. */
-   if (sctx->shader.tes.cso) {
-      if (!sctx->tess_rings) {
-         si_init_tess_factor_ring(sctx);
-         if (!sctx->tess_rings)
-            return false;
-      }
-
-      if (sctx->shader.tcs.cso) {
-         r = si_shader_select(ctx, &sctx->shader.tcs, key, &compiler_state);
-         if (r)
-            return false;
-         si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4);
-      } else {
-         if (!sctx->fixed_func_tcs_shader.cso) {
-            sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx);
-            if (!sctx->fixed_func_tcs_shader.cso)
-               return false;
-         }
-
-         r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state);
-         if (r)
-            return false;
-         si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4);
-      }
-
-      if (!sctx->shader.gs.cso || sctx->chip_class <= GFX8) {
-         r = si_shader_select(ctx, &sctx->shader.tes, key, &compiler_state);
-         if (r)
-            return false;
-
-         if (sctx->shader.gs.cso) {
-            /* TES as ES */
-            assert(sctx->chip_class <= GFX8);
-            si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4);
-         } else if (key.u.ngg) {
-            si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4);
-         } else {
-            si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4);
-         }
-      }
-   } else {
-      if (sctx->chip_class <= GFX8)
-         si_pm4_bind_state(sctx, ls, NULL);
-      si_pm4_bind_state(sctx, hs, NULL);
-   }
-
-   /* Update GS. */
-   if (sctx->shader.gs.cso) {
-      r = si_shader_select(ctx, &sctx->shader.gs, key, &compiler_state);
-      if (r)
-         return false;
-      si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4);
-      if (!key.u.ngg) {
-         si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4);
-
-         if (!si_update_gs_ring_buffers(sctx))
-            return false;
-      } else {
-         si_pm4_bind_state(sctx, vs, NULL);
-      }
-   } else {
-      if (!key.u.ngg) {
-         si_pm4_bind_state(sctx, gs, NULL);
-         if (sctx->chip_class <= GFX8)
-            si_pm4_bind_state(sctx, es, NULL);
-      }
-   }
-
-   /* Update VS. */
-   if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
-      r = si_shader_select(ctx, &sctx->shader.vs, key, &compiler_state);
-      if (r)
-         return false;
-
-      if (!key.u.tess && !key.u.gs) {
-         if (key.u.ngg) {
-            si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4);
-            si_pm4_bind_state(sctx, vs, NULL);
-         } else {
-            si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4);
-         }
-      } else if (sctx->shader.tes.cso) {
-         si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4);
-      } else {
-         assert(sctx->shader.gs.cso);
-         si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4);
-      }
-   }
-
-   /* This must be done after the shader variant is selected. */
-   if (sctx->ngg) {
-      struct si_shader *vs = si_get_vs(sctx)->current;
-
-      key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
-      key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
-   }
-
-   sctx->vs_uses_base_instance =
-      sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance :
-      sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance :
-      sctx->shader.gs.current->uses_base_instance;
-
-   si_update_vgt_shader_config(sctx, key);
-
-   if (old_kill_clip_distances != si_get_vs(sctx)->current->key.opt.kill_clip_distances)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
-   if (sctx->shader.ps.cso) {
-      unsigned db_shader_control;
-
-      r = si_shader_select(ctx, &sctx->shader.ps, key, &compiler_state);
-      if (r)
-         return false;
-      si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4);
-
-      db_shader_control = sctx->shader.ps.cso->db_shader_control |
-                          S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
-
-      if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
-          (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
-          sctx->sprite_coord_enable != rs->sprite_coord_enable ||
-          sctx->flatshade != rs->flatshade) {
-         sctx->sprite_coord_enable = rs->sprite_coord_enable;
-         sctx->flatshade = rs->flatshade;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
-      }
-
-      if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) &&
-          (!old_ps || old_spi_shader_col_format !=
-                         sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format))
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
-      if (sctx->ps_db_shader_control != db_shader_control) {
-         sctx->ps_db_shader_control = db_shader_control;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-         if (sctx->screen->dpbb_allowed)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-      }
-
-      if (sctx->smoothing_enabled !=
-          sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) {
-         sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
-         /* NGG cull state uses smoothing_enabled. */
-         if (sctx->screen->use_ngg_culling)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
-
-         if (sctx->chip_class == GFX6)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-         if (sctx->framebuffer.nr_samples <= 1)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
-      }
-
-      if (sctx->chip_class >= GFX10_3) {
-         struct si_shader_info *info = &sctx->shader.ps.cso->info;
-         bool allow_flat_shading = info->allow_flat_shading;
-
-         if (allow_flat_shading &&
-             (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable ||
-              (!rs->flatshade && info->uses_interp_color)))
-            allow_flat_shading = false;
-
-         if (sctx->allow_flat_shading != allow_flat_shading) {
-            sctx->allow_flat_shading = allow_flat_shading;
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-         }
-      }
-   }
-
-   if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
-      /* Pretend the bound shaders form a vk pipeline */
-      uint32_t pipeline_code_hash = 0;
-      uint64_t base_address = ~0;
-
-      for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
-         struct si_shader *shader = sctx->shaders[i].current;
-         if (sctx->shaders[i].cso && shader) {
-            pipeline_code_hash = _mesa_hash_data_with_seed(
-               shader->binary.elf_buffer,
-               shader->binary.elf_size,
-               pipeline_code_hash);
-            base_address = MIN2(base_address,
-                                shader->bo->gpu_address);
-         }
-      }
-
-      struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-      if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
-         si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
-      }
-
-      si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
-   }
-
-   if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) ||
-       si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) ||
-       si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) {
-      if (!si_update_spi_tmpring_size(sctx))
-         return false;
-   }
-
-   if (sctx->chip_class >= GFX7) {
-      if (si_pm4_state_enabled_and_changed(sctx, ls))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
-      else if (!sctx->queued.named.ls)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, hs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
-      else if (!sctx->queued.named.hs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, es))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
-      else if (!sctx->queued.named.es)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
-
-      if (si_pm4_state_enabled_and_changed(sctx, gs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
-      else if (!sctx->queued.named.gs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, vs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
-      else if (!sctx->queued.named.vs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, ps))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-      else if (!sctx->queued.named.ps)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
-   }
-
-   sctx->do_update_shaders = false;
-   return true;
-}
-
 static void si_emit_scratch_state(struct si_context *sctx)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
 
    radeon_begin(cs);
-   radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
+   radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
    radeon_end();
 
    if (sctx->scratch_buffer) {
@@ -4303,7 +4050,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen)
 
 void si_init_shader_functions(struct si_context *sctx)
 {
-   sctx->atoms.s.spi_map.emit = si_emit_spi_map;
    sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
 
    sctx->b.create_vs_state = si_create_shader;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
index b6656fdc8..e70987d66 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
@@ -46,7 +46,8 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
    int modifiers_count = 0;
    uint64_t mod = DRM_FORMAT_MOD_LINEAR;
 
-   /* TODO: get tiling working */
+   /* To get tiled buffers, users need to explicitly provide a list of
+    * modifiers. */
    vidbuf.bind |= PIPE_BIND_LINEAR;
 
    if (pipe->screen->resource_create_with_modifiers) {
@@ -58,6 +59,33 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                              modifiers_count);
 }
 
+struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe,
+                                                                const struct pipe_video_buffer *tmpl,
+                                                                const uint64_t *modifiers,
+                                                                unsigned int modifiers_count)
+{
+   uint64_t *allowed_modifiers;
+   unsigned int allowed_modifiers_count, i;
+
+   /* Filter out DCC modifiers, because we don't support them for video
+    * for now. */
+   allowed_modifiers = calloc(modifiers_count, sizeof(uint64_t));
+   if (!allowed_modifiers)
+      return NULL;
+
+   allowed_modifiers_count = 0;
+   for (i = 0; i < modifiers_count; i++) {
+      if (ac_modifier_has_dcc(modifiers[i]))
+         continue;
+      allowed_modifiers[allowed_modifiers_count++] = modifiers[i];
+   }
+
+   struct pipe_video_buffer *buf =
+      vl_video_buffer_create_as_resource(pipe, tmpl, allowed_modifiers, allowed_modifiers_count);
+   free(allowed_modifiers);
+   return buf;
+}
+
 /* set the decoding target buffer offsets */
 static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {