path: root/lib/mesa/src/gallium/drivers/radeonsi
author     Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
commit     1d35364040c0ffa99133522fa5ab3bd6131d8bf7 (patch)
tree       0ea3d9ca4ad10692c6477168b67e98cb50ea6bd3 /lib/mesa/src/gallium/drivers/radeonsi
parent     b24b5b9049e889ee4eb39b565bcc8d48bd45ab48 (diff)
Merge Mesa 21.3.7
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeonsi')
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/Android.mk  95
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources  75
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt  0
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt  11
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt  0
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_blit.c  106
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_compute.c  163
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c  1580
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c  74
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_debug.c  37
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c  403
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c  1044
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c  250
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h  396
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c  8
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h  2
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader.c  128
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader.h  134
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h  20
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c  54
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state.c  767
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state.h  83
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c  1718
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c  30
24 files changed, 2306 insertions, 4872 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk
deleted file mode 100644
index e402da639..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk
+++ /dev/null
@@ -1,95 +0,0 @@
-# Mesa 3-D graphics library
-#
-# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
-# Copyright (C) 2010-2011 LunarG Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-LOCAL_PATH := $(call my-dir)
-
-# get C_SOURCES and GENERATED_SOURCES
-include $(LOCAL_PATH)/Makefile.sources
-
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := $(C_SOURCES)
-
-LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions
-
-LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-
-LOCAL_C_INCLUDES := \
- $(MESA_TOP)/src/amd/common \
- $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \
- $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir
-
-LOCAL_STATIC_LIBRARIES := libmesa_amd_common
-
-LOCAL_SHARED_LIBRARIES := libdrm_radeon
-LOCAL_MODULE := libmesa_pipe_radeonsi
-
-intermediates := $(call local-generated-sources-dir)
-
-# We need to get NIR's generated headers.
-LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
-LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES))
-
-GEN_DRIINFO_INPUTS := \
- $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \
- $(LOCAL_PATH)/driinfo_radeonsi.h
-
-MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py
-
-$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS)
- @mkdir -p $(dir $@)
- @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
- $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false)
-
-GEN10_FORMAT_TABLE_INPUTS := \
- $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \
- $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json
-
-GEN10_FORMAT_TABLE_DEP := \
- $(MESA_TOP)/src/amd/registers/regdb.py
-
-GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py
-
-$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP)
- @mkdir -p $(dir $@)
- @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
- $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false)
-
-LOCAL_C_INCLUDES += $(intermediates)/radeonsi
-
-LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
-
-$(call mesa-build-with-llvm)
-
-include $(GALLIUM_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)
-
-ifneq ($(HAVE_GALLIUM_RADEONSI),)
-GALLIUM_TARGET_DRIVERS += radeonsi
-$(eval GALLIUM_LIBS += \
- $(LOCAL_MODULE) \
- $(LOCAL_STATIC_LIBRARIES) \
- libmesa_winsys_radeon \
- libmesa_winsys_amdgpu)
-$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
-endif
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources b/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources
deleted file mode 100644
index 55ef80856..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources
+++ /dev/null
@@ -1,75 +0,0 @@
-C_SOURCES := \
- driinfo_radeonsi.h \
- gfx10_query.c \
- gfx10_shader_ngg.c \
- si_blit.c \
- si_buffer.c \
- si_build_pm4.h \
- si_clear.c \
- si_compute.c \
- si_compute_prim_discard.c \
- si_compute.h \
- si_compute_blit.c \
- si_cp_dma.c \
- si_cp_reg_shadowing.c \
- si_debug.c \
- si_descriptors.c \
- si_fence.c \
- si_get.c \
- si_gfx_cs.c \
- si_gpu_load.c \
- si_pipe.c \
- si_pipe.h \
- si_pm4.c \
- si_pm4.h \
- si_perfcounter.c \
- si_public.h \
- si_query.c \
- si_query.h \
- si_shader.c \
- si_shader.h \
- si_shader_internal.h \
- si_shader_llvm.c \
- si_shader_llvm_gs.c \
- si_shader_llvm_ps.c \
- si_shader_llvm_resources.c \
- si_shader_llvm_tess.c \
- si_shader_llvm_vs.c \
- si_shader_nir.c \
- si_shaderlib_nir.c \
- si_shaderlib_tgsi.c \
- si_sqtt.c \
- si_state.c \
- si_state_binning.c \
- si_state_draw.cpp \
- si_state_msaa.c \
- si_state_shaders.c \
- si_state_streamout.c \
- si_state_viewport.c \
- si_state.h \
- si_test_blit.c \
- si_test_dma_perf.c \
- si_texture.c \
- si_uvd.c \
- ../radeon/radeon_uvd.c \
- ../radeon/radeon_uvd.h \
- ../radeon/radeon_vcn_dec_jpeg.c \
- ../radeon/radeon_vcn_dec.c \
- ../radeon/radeon_vcn_dec.h \
- ../radeon/radeon_vcn_av1_default.h \
- ../radeon/radeon_vcn_enc_1_2.c \
- ../radeon/radeon_vcn_enc_2_0.c \
- ../radeon/radeon_vcn_enc_3_0.c \
- ../radeon/radeon_vcn_enc.c \
- ../radeon/radeon_vcn_enc.h \
- ../radeon/radeon_uvd_enc_1_1.c \
- ../radeon/radeon_uvd_enc.c \
- ../radeon/radeon_uvd_enc.h \
- ../radeon/radeon_vce_40_2_2.c \
- ../radeon/radeon_vce_50.c \
- ../radeon/radeon_vce_52.c \
- ../radeon/radeon_vce.c \
- ../radeon/radeon_vce.h \
- ../radeon/radeon_video.c \
- ../radeon/radeon_video.h \
- ../radeon/radeon_winsys.h
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt
deleted file mode 100644
index e69de29bb..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt
+++ /dev/null
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt
deleted file mode 100644
index 69d00870a..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# Note: skips lists for CI are just a list of lines that, when
-# non-zero-length and not starting with '#', will regex match to
-# delete lines from the test list. Be careful.
-
-# Skip the perf/stress tests to keep runtime manageable
-dEQP-GLES[0-9]*.performance.*
-dEQP-GLES[0-9]*.stress.*
-
-# These are really slow on tiling architectures (including llvmpipe).
-dEQP-GLES[0-9]*.functional.flush_finish.*
-
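As the deleted comment above notes, a skips file is applied by treating every non-empty line that does not start with '#' as a regular expression and deleting matching entries from the test caselist. A minimal illustration of that filtering (hypothetical Python, not the actual CI tooling) could look like:

    import re

    def apply_skips(caselist_lines, skips_lines):
        # Compile every non-empty skips line that does not start with '#'.
        patterns = [re.compile(l.strip()) for l in skips_lines
                    if l.strip() and not l.strip().startswith('#')]
        # Keep only the test names that no skip pattern matches.
        return [case for case in caselist_lines
                if not any(p.search(case) for p in patterns)]

    tests = ["dEQP-GLES2.performance.texture.upload",
             "dEQP-GLES2.functional.shaders.basic"]
    skips = ["# comment", "dEQP-GLES[0-9]*.performance.*"]
    print(apply_skips(tests, skips))  # ['dEQP-GLES2.functional.shaders.basic']
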
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt
deleted file mode 100644
index e69de29bb..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt
+++ /dev/null
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c
index 653dfc343..5653ff233 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c
@@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx)
/* Restore shader pointers because the VS blit shader changed all
* non-global VS user SGPRs. */
sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
+
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
sctx->num_vertex_elements >
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
@@ -393,11 +395,12 @@ static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex,
si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */);
}
-static void si_decompress_sampler_depth_textures(struct si_context *sctx,
+static bool si_decompress_sampler_depth_textures(struct si_context *sctx,
struct si_samplers *textures)
{
unsigned i;
unsigned mask = textures->needs_depth_decompress_mask;
+ bool need_flush = false;
while (mask) {
struct pipe_sampler_view *view;
@@ -416,7 +419,14 @@ static void si_decompress_sampler_depth_textures(struct si_context *sctx,
si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
view->u.tex.first_level, view->u.tex.last_level, 0,
util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
+
+ if (tex->need_flush_after_depth_decompression) {
+ need_flush = true;
+ tex->need_flush_after_depth_decompression = false;
+ }
}
+
+ return need_flush;
}
static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex,
@@ -755,6 +765,7 @@ static void si_decompress_resident_images(struct si_context *sctx)
void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
{
unsigned compressed_colortex_counter, mask;
+ bool need_flush = false;
if (sctx->blitter_running)
return;
@@ -772,7 +783,7 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
unsigned i = u_bit_scan(&mask);
if (sctx->samplers[i].needs_depth_decompress_mask) {
- si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
+ need_flush |= si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
}
if (sctx->samplers[i].needs_color_decompress_mask) {
si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
@@ -782,6 +793,16 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
}
}
+ if (sctx->chip_class == GFX10_3 && need_flush) {
+ /* This fixes a corruption with the following sequence:
+ * - fast clear depth
+ * - decompress depth
+ * - draw
+ * (see https://gitlab.freedesktop.org/drm/amd/-/issues/1810#note_1170171)
+ */
+ sctx->b.flush(&sctx->b, NULL, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW);
+ }
+
if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
if (sctx->uses_bindless_samplers)
si_decompress_resident_textures(sctx);
@@ -1027,7 +1048,7 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst
/* Copy. */
si_blitter_begin(sctx, SI_COPY);
util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0,
- src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false);
+ src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false, false);
si_blitter_end(sctx);
pipe_surface_reference(&dst_view, NULL);
@@ -1203,11 +1224,48 @@ resolve_to_temp:
static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *sdst = (struct si_texture *)info->dst.resource;
if (do_hardware_msaa_resolve(ctx, info)) {
return;
}
+ if (info->is_dri_blit_image && sdst->surface.is_linear &&
+ sctx->chip_class >= GFX7 && sdst->surface.flags & RADEON_SURF_IMPORTED) {
+ struct si_texture *ssrc = (struct si_texture *)info->src.resource;
+ /* Use SDMA or async compute when copying to a DRI_PRIME imported linear surface. */
+ bool async_copy = info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.z == 0 &&
+ info->src.box.x == 0 && info->src.box.y == 0 && info->src.box.z == 0 &&
+ info->dst.level == 0 && info->src.level == 0 &&
+ info->src.box.width == info->dst.resource->width0 &&
+ info->src.box.height == info->dst.resource->height0 &&
+ info->src.box.depth == 1 && util_can_blit_via_copy_region(info, true);
+ /* Try SDMA first... */
+ /* TODO: figure out why SDMA copies are slow on GFX10_3 */
+ if (async_copy && sctx->chip_class < GFX10_3 && si_sdma_copy_image(sctx, sdst, ssrc))
+ return;
+
+ /* ... and use async compute as the fallback. */
+ if (async_copy) {
+ struct si_screen *sscreen = sctx->screen;
+
+ simple_mtx_lock(&sscreen->async_compute_context_lock);
+ if (!sscreen->async_compute_context)
+ si_init_aux_async_compute_ctx(sscreen);
+
+ if (sscreen->async_compute_context) {
+ si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context,
+ info->dst.resource, 0, info->src.resource, 0, 0, 0, 0,
+ &info->src.box, false, 0);
+ si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL);
+ simple_mtx_unlock(&sscreen->async_compute_context_lock);
+ return;
+ }
+
+ simple_mtx_unlock(&sscreen->async_compute_context_lock);
+ }
+ }
+
if (unlikely(sctx->thread_trace_enabled))
sctx->sqtt_next_event = EventCmdCopyImage;
@@ -1276,52 +1334,16 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re
struct si_texture *tex = (struct si_texture *)res;
assert(res->target != PIPE_BUFFER);
- assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
-
- /* st/dri calls flush twice per frame (not a bug), this prevents double
- * decompression. */
- if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
- return;
if (!tex->is_depth && (tex->cmask_buffer || vi_dcc_enabled(tex, 0))) {
si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0),
- tex->dcc_separate_buffer != NULL, false);
+ false, false);
if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) {
si_retile_dcc(sctx, tex);
tex->displayable_dcc_dirty = false;
}
}
-
- /* Always do the analysis even if DCC is disabled at the moment. */
- if (tex->dcc_gather_statistics) {
- bool separate_dcc_dirty = tex->separate_dcc_dirty;
-
- /* If the color buffer hasn't been unbound and fast clear hasn't
- * been used, separate_dcc_dirty is false, but there may have been
- * new rendering. Check if the color buffer is bound and assume
- * it's dirty.
- *
- * Note that DRI2 never unbinds window colorbuffers, which means
- * the DCC pipeline statistics query would never be re-set and would
- * keep adding new results until all free memory is exhausted if we
- * didn't do this.
- */
- if (!separate_dcc_dirty) {
- for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (sctx->framebuffer.state.cbufs[i] &&
- sctx->framebuffer.state.cbufs[i]->texture == res) {
- separate_dcc_dirty = true;
- break;
- }
- }
- }
-
- if (separate_dcc_dirty) {
- tex->separate_dcc_dirty = false;
- vi_separate_dcc_process_and_reset_stats(ctx, tex);
- }
- }
}
void si_flush_implicit_resources(struct si_context *sctx)
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
index 48ec79ac5..0ae232db2 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
@@ -107,7 +107,7 @@ static void code_object_to_config(const amd_kernel_code_t *code_object,
}
/* Asynchronous compute shader compilation. */
-static void si_create_compute_state_async(void *job, int thread_index)
+static void si_create_compute_state_async(void *job, void *gdata, int thread_index)
{
struct si_compute *program = (struct si_compute *)job;
struct si_shader_selector *sel = &program->sel;
@@ -367,11 +367,14 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
radeon_begin(cs);
- radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
+ radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI,
+ S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
+
+ radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
* renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
if (sctx->chip_class == GFX6) {
/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
@@ -381,25 +384,25 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
* TODO: This should be:
* (number of compute units) * 4 * (waves per simd) - 1
*/
- radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
+ radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
uint64_t bc_va = sctx->border_color_buffer->gpu_address;
- radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+ radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
}
}
if (sctx->chip_class >= GFX7) {
/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
- radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
+ radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
/* Disable profiling on compute queues. */
if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) {
- radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
- radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
+ radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
+ radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
}
/* Set the pointer to border colors. */
@@ -407,9 +410,9 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
if (sctx->border_color_buffer) {
uint64_t bc_va = sctx->border_color_buffer->gpu_address;
- radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2, false);
- radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */
- radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
+ radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false);
+ radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */
+ radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
}
}
@@ -418,17 +421,19 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
*/
if (sctx->chip_class >= GFX9 &&
(cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) {
- radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY,
+ radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY,
sctx->chip_class >= GFX10 ? 0x20 : 0);
}
if (sctx->chip_class >= GFX10) {
- radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0);
- radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0);
- radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0);
- radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
- radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
- radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+ radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 5);
+ radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */
+ radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */
+ radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */
+ radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */
+ radeon_emit(0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */
+
+ radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
}
radeon_end();
}
@@ -533,13 +538,11 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
RADEON_PRIO_SHADER_BINARY);
radeon_begin(cs);
- radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
- radeon_emit(cs, shader_va >> 8);
- radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+ radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
- radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
- radeon_emit(cs, config->rsrc1);
- radeon_emit(cs, config->rsrc2);
+ radeon_set_sh_reg_seq(R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(config->rsrc1);
+ radeon_emit(config->rsrc2);
COMPUTE_DBG(sctx->screen,
"COMPUTE_PGM_RSRC1: 0x%08x "
@@ -549,7 +552,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
sctx->max_seen_compute_scratch_bytes_per_wave =
MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);
- radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(sctx->scratch_waves) |
S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
radeon_end();
@@ -592,11 +595,11 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
}
radeon_begin(cs);
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
- radeon_emit(cs, scratch_dword0);
- radeon_emit(cs, scratch_dword1);
- radeon_emit(cs, scratch_dword2);
- radeon_emit(cs, scratch_dword3);
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
+ radeon_emit(scratch_dword0);
+ radeon_emit(scratch_dword1);
+ radeon_emit(scratch_dword2);
+ radeon_emit(scratch_dword3);
radeon_end();
}
@@ -656,9 +659,9 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_
dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
- radeon_emit(cs, dispatch_va);
- radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+ radeon_emit(dispatch_va);
+ radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));
si_resource_reference(&dispatch_buf, NULL);
user_sgpr += 2;
@@ -666,16 +669,16 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_
if (AMD_HSA_BITS_GET(code_object->code_properties,
AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
- radeon_emit(cs, kernel_args_va);
- radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+ radeon_emit(kernel_args_va);
+ radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
user_sgpr += 2;
}
for (i = 0; i < 3 && user_sgpr < 16; i++) {
if (code_object->code_properties & workgroup_count_masks[i]) {
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
- radeon_emit(cs, info->grid[i]);
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
+ radeon_emit(info->grid[i]);
user_sgpr += 1;
}
}
@@ -740,21 +743,21 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
}
radeon_begin_again(cs);
} else {
- radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
- radeon_emit(cs, info->grid[0]);
- radeon_emit(cs, info->grid[1]);
- radeon_emit(cs, info->grid[2]);
+ radeon_set_sh_reg_seq(grid_size_reg, 3);
+ radeon_emit(info->grid[0]);
+ radeon_emit(info->grid[1]);
+ radeon_emit(info->grid[2]);
}
}
if (sel->info.uses_variable_block_size) {
- radeon_set_sh_reg(cs, block_size_reg,
+ radeon_set_sh_reg(block_size_reg,
info->block[0] | (info->block[1] << 10) | (info->block[2] << 20));
}
if (sel->info.base.cs.user_data_components_amd) {
- radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd);
- radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd);
+ radeon_set_sh_reg_seq(cs_user_data_reg, sel->info.base.cs.user_data_components_amd);
+ radeon_emit_array(sctx->cs_user_data, sel->info.base.cs.user_data_components_amd);
}
radeon_end();
}
@@ -780,7 +783,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
radeon_begin(cs);
radeon_set_sh_reg(
- cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+ R_00B854_COMPUTE_RESOURCE_LIMITS,
ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup,
sctx->cs_max_waves_per_sh, threadgroups_per_cu));
@@ -793,7 +796,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
const uint *last_block = info->last_block;
bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
- radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+ radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3);
if (partial_block_en) {
unsigned partial[3];
@@ -803,18 +806,18 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
partial[1] = last_block[1] ? last_block[1] : info->block[1];
partial[2] = last_block[2] ? last_block[2] : info->block[2];
- radeon_emit(
- cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
- radeon_emit(
- cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1]));
- radeon_emit(
- cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+ radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]) |
+ S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+ radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]) |
+ S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+ radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]) |
+ S_00B824_NUM_THREAD_PARTIAL(partial[2]));
dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
} else {
- radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+ radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]));
+ radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]));
+ radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]));
}
if (info->indirect) {
@@ -823,25 +826,25 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ,
RADEON_PRIO_DRAW_INDIRECT);
- radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, 1);
- radeon_emit(cs, base_va);
- radeon_emit(cs, base_va >> 32);
+ radeon_emit(PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(1);
+ radeon_emit(base_va);
+ radeon_emit(base_va >> 32);
- radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, info->indirect_offset);
- radeon_emit(cs, dispatch_initiator);
+ radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(info->indirect_offset);
+ radeon_emit(dispatch_initiator);
} else {
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, info->grid[0]);
- radeon_emit(cs, info->grid[1]);
- radeon_emit(cs, info->grid[2]);
- radeon_emit(cs, dispatch_initiator);
+ radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(info->grid[0]);
+ radeon_emit(info->grid[1]);
+ radeon_emit(info->grid[2]);
+ radeon_emit(dispatch_initiator);
}
if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
}
radeon_end();
}
@@ -857,6 +860,8 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx)
*
* buffer object and texture stores performed by shaders are not
* automatically synchronized
+ *
+ * TODO: Bindless textures are not handled, and thus are not synchronized.
*/
struct si_shader_info *info = &sctx->cs_shader_state.program->sel.info;
struct si_samplers *samplers = &sctx->samplers[PIPE_SHADER_COMPUTE];
@@ -890,18 +895,12 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx)
static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
+ struct si_screen *sscreen = sctx->screen;
struct si_compute *program = sctx->cs_shader_state.program;
const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
int i;
- /* HW bug workaround when CS threadgroups > 256 threads and async
- * compute isn't used, i.e. only one compute job can run at a time.
- * If async compute is possible, the threadgroup size must be limited
- * to 256 threads on all queues to avoid the bug.
- * Only GFX6 and certain GFX7 chips are affected.
- */
- bool cs_regalloc_hang =
- (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) &&
- info->block[0] * info->block[1] * info->block[2] > 256;
+ bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
+ info->block[0] * info->block[1] * info->block[2] > 256;
if (cs_regalloc_hang)
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
deleted file mode 100644
index 373fd4ffa..000000000
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ /dev/null
@@ -1,1580 +0,0 @@
-/*
- * Copyright 2019 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "si_pipe.h"
-#include "si_shader_internal.h"
-#include "sid.h"
-#include "si_build_pm4.h"
-#include "ac_llvm_cull.h"
-
-#include "util/u_prim.h"
-#include "util/u_suballoc.h"
-#include "util/u_upload_mgr.h"
-#include "util/fast_idiv_by_const.h"
-
-/* Based on:
- * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
- */
-
-/* This file implements primitive culling using asynchronous compute.
- * It's written to be GL conformant.
- *
- * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
- * in a compute shader. The shader processes 1 primitive/thread by invoking
- * the VS for each vertex to get the positions, decomposes strips and fans
- * into triangles (if needed), eliminates primitive restart (if needed),
- * does (W<0) culling, face culling, view XY culling, zero-area and
- * small-primitive culling, and generates a new index buffer that doesn't
- * contain culled primitives.
- *
- * The index buffer is generated using the Ordered Count feature of GDS,
- * which is an atomic counter that is incremented in the wavefront launch
- * order, so that the original primitive order is preserved.
- *
- * Another GDS ordered counter is used to eliminate primitive restart indices.
- * If a restart index lands on an even thread ID, the compute shader has to flip
- * the primitive orientation of the whole following triangle strip. The primitive
- * orientation has to be correct after strip and fan decomposition for two-sided
- * shading to behave correctly. The decomposition also needs to be aware of
- * which vertex is the provoking vertex for flat shading to behave correctly.
- *
- * IB = a GPU command buffer
- *
- * Both the compute and gfx IBs run in parallel sort of like CE and DE.
- * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
- * doesn't continue if its word isn't 0x80000000. Once compute shaders are
- * finished culling, the last wave will write the final primitive count from
- * GDS directly into the count word of the draw packet in the gfx IB, and
- * a CS_DONE event will signal the REWIND packet to continue. It's really
- * a direct draw with command buffer patching from the compute queue.
- *
- * The compute IB doesn't have to start when its corresponding gfx IB starts,
- * but can start sooner. The compute IB is signaled to start after the last
- * execution barrier in the *previous* gfx IB. This is handled as follows.
- * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
- * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
- * represents the barrier in the previous gfx IB.
- *
- * Features:
- * - Triangle strips and fans are decomposed into an indexed triangle list.
- * The decomposition differs based on the provoking vertex state.
- * - Instanced draws are converted into non-instanced draws for 16-bit indices.
- * (InstanceID is stored in the high bits of VertexID and unpacked by VS)
- * - Primitive restart is fully supported with triangle strips, including
- * correct primitive orientation across multiple waves. (restart indices
- * reset primitive orientation)
- * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
- * - Back face culling, incl. culling zero-area / degenerate primitives.
- * - View XY culling.
- * - View Z culling (disabled due to limited impact with perspective projection).
- * - Small primitive culling for all MSAA modes and all quant modes.
- *
- * The following are not implemented:
- * - ClipVertex/ClipDistance/CullDistance-based culling.
- * - Scissor culling.
- * - HiZ culling.
- *
- * Limitations (and unimplemented features that may be possible to implement):
- * - Only triangles, triangle strips, and triangle fans are supported.
- * - Primitive restart is only supported with triangle strips.
- * - Instancing and primitive restart can't be used together.
- * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
- * - The instance divisor buffer is unavailable, so all divisors must be
- * either 0 or 1.
- * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
- * - No support for tessellation and geometry shaders.
- * (patch elimination where tess factors are 0 would be possible to implement)
- * - The vertex shader must not contain memory stores.
- * - All VS resources must not have a write usage in the command buffer.
- * (TODO: all shader buffers currently set the write usage)
- * - Bindless textures and images must not occur in the vertex shader.
- *
- * User data SGPR layout:
- * INDEX_BUFFERS: pointer to constants
- * 0..3: input index buffer - typed buffer view
- * 4..7: output index buffer - typed buffer view
- * 8..11: viewport state - scale.xy, translate.xy
- * VERTEX_COUNTER: counter address or first primitive ID
- * - If unordered memory counter: address of "count" in the draw packet
- * and is incremented atomically by the shader.
- * - If unordered GDS counter: address of "count" in GDS starting from 0,
- * must be initialized to 0 before the dispatch.
- * - If ordered GDS counter: the primitive ID that should reset the vertex
- * counter to 0 in GDS
- * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
- * count to memory if using GDS ordered append
- * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
- * using GDS ordered append
- * VS.VERTEX_BUFFERS: same value as VS
- * VS.CONST_AND_SHADER_BUFFERS: same value as VS
- * VS.SAMPLERS_AND_IMAGES: same value as VS
- * VS.BASE_VERTEX: same value as VS
- * VS.START_INSTANCE: same value as VS
- * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
- * per instance for instancing.
- * NUM_PRIMS_UDIV_TERMS:
- * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
- * - Bits [5:31]: The number of primitives per instance for computing the remainder.
- * PRIMITIVE_RESTART_INDEX
- * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
- *
- *
- * The code contains 3 codepaths:
- * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
- * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
- * - Ordered GDS counter (it preserves the primitive order)
- *
- * How to test primitive restart (the most complicated part because it needs
- * to get the primitive orientation right):
- * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
- * primitive orientation flips with small draw calls, which is what most tests use.
- * You can also enable draw call splitting into draw calls with just 2 primitives.
- */
-
-/* At least 256 is needed for the fastest wave launch rate from compute queues
- * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
-#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
-#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
-#define MAX_WAVES_PER_SH 0 /* no limit */
-#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
-/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
-#define CULL_Z 0
-/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
-#define VERTEX_COUNTER_GDS_MODE 2
-#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
-
-/* Grouping compute dispatches for small draw calls: How many primitives from multiple
- * draw calls to process by compute before signaling the gfx IB. This reduces the number
- * of EOP events + REWIND packets, because they decrease performance. */
-#define PRIMS_PER_BATCH (512 * 1024)
-/* Draw call splitting at the packet level. This allows signaling the gfx IB
- * for big draw calls sooner, but doesn't allow context flushes between packets.
- * Primitive restart is supported. Only implemented for ordered append. */
-#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
-/* If there is not enough ring buffer space for the current IB, split draw calls into
- * this number of primitives, so that we can flush the context and get free ring space. */
-#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
-
-/* Derived values. */
-#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
-#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \
- SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
- UINT_MAX & ~(THREADGROUP_SIZE - 1))
-
-#define REWIND_SIGNAL_BIT 0x80000000
-/* For emulating the rewind packet on CI. */
-#define FORCE_REWIND_EMULATION 0
-
-void si_initialize_prim_discard_tunables(struct si_context *sctx)
-{
- sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
-
- if (sctx->chip_class == GFX6 || /* SI support is not implemented */
- !sctx->screen->info.has_gds_ordered_append ||
- sctx->screen->debug_flags & DBG(NO_PD) ||
- /* If aux_context == NULL, we are initializing aux_context right now. */
- !sctx->screen->aux_context)
- return;
-
- /* TODO: enable this after the GDS kernel memory management is fixed */
- bool enable_on_pro_graphics_by_default = false;
-
- if (sctx->screen->debug_flags & DBG(ALWAYS_PD) ||
- sctx->screen->debug_flags & DBG(PD) ||
- (enable_on_pro_graphics_by_default &&
- sctx->screen->info.is_pro_graphics &&
- (sctx->family == CHIP_BONAIRE ||
- sctx->family == CHIP_HAWAII ||
- sctx->family == CHIP_TONGA ||
- sctx->family == CHIP_FIJI ||
- sctx->family == CHIP_POLARIS10 ||
- sctx->family == CHIP_POLARIS11 ||
- sctx->family == CHIP_VEGA10 ||
- sctx->family == CHIP_VEGA20))) {
- sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
-
- if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
- sctx->prim_discard_vertex_count_threshold = 0; /* always enable */
-
- const uint32_t MB = 1024 * 1024;
- const uint64_t GB = 1024 * 1024 * 1024;
-
- /* The total size is double this per context.
- * Greater numbers allow bigger gfx IBs.
- */
- if (sctx->screen->info.vram_size <= 2 * GB)
- sctx->index_ring_size_per_ib = 64 * MB;
- else if (sctx->screen->info.vram_size <= 4 * GB)
- sctx->index_ring_size_per_ib = 128 * MB;
- else
- sctx->index_ring_size_per_ib = 256 * MB;
- }
-}
-
-/* Opcode can be "add" or "swap". */
-static LLVMValueRef
-si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
- LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
- bool release, bool done)
-{
- LLVMValueRef args[] = {
- LLVMBuildIntToPtr(ctx->ac.builder, m0,
- LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""),
- value,
- LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
- ctx->i32_0, /* scope */
- ctx->i1false, /* volatile */
- LLVMConstInt(ctx->i32, ordered_count_index, 0),
- LLVMConstInt(ctx->i1, release, 0),
- LLVMConstInt(ctx->i1, done, 0),
- };
-
- char intrinsic[64];
- snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
- return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0);
-}
-
-static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
-{
- uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
- ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, "");
- ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), "");
- return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
- LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), "");
-}
-
-struct si_thread0_section {
- struct si_shader_context *ctx;
- LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
- LLVMValueRef saved_exec;
-};
-
-/* Enter a section that only executes on thread 0. */
-static void si_enter_thread0_section(struct si_shader_context *ctx,
- struct si_thread0_section *section,
- LLVMValueRef thread_id)
-{
- section->ctx = ctx;
- section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0");
-
- /* This IF has 4 instructions:
- * v_and_b32_e32 v, 63, v ; get the thread ID
- * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
- * s_and_saveexec_b64 s, vcc
- * s_cbranch_execz BB0_4
- *
- * It could just be s_and_saveexec_b64 s, 1.
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
- ctx->i32_0, ""), 12601);
-}
-
-/* Exit a section that only executes on thread 0 and broadcast the result
- * to all threads. */
-static void si_exit_thread0_section(struct si_thread0_section *section,
- LLVMValueRef *result)
-{
- struct si_shader_context *ctx = section->ctx;
-
- LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
-
- ac_build_endif(&ctx->ac, 12601);
-
- /* Broadcast the result from thread 0 to all threads. */
- *result = ac_build_readlane(&ctx->ac,
- LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
-}
-
-void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
-{
- struct si_shader_key *key = &ctx->shader->key;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef vs = ctx->main_fn;
-
- /* Always inline the VS function. */
- ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
- LLVMSetLinkage(vs, LLVMPrivateLinkage);
-
- LLVMTypeRef const_desc_type;
- if (ctx->shader->selector->info.const_buffers_declared == 1 &&
- ctx->shader->selector->info.shader_buffers_declared == 0)
- const_desc_type = ctx->f32;
- else
- const_desc_type = ctx->v4i32;
-
- struct si_function_info fninfo;
- si_init_function_info(&fninfo);
-
- LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc;
- LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id;
- LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision;
- LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc;
- LLVMValueRef last_wave_prim_id, vertex_count_addr;
-
- add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
- &index_buffers_and_constants);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr);
- add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
- &vb_desc);
- add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type),
- &const_desc);
- add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32),
- &sampler_desc);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index);
- add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision);
-
- /* Block ID and thread ID inputs. */
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id);
- if (VERTEX_COUNTER_GDS_MODE == 2)
- add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id);
- add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id);
-
- /* Create the compute shader function. */
- unsigned old_type = ctx->type;
- ctx->type = PIPE_SHADER_COMPUTE;
- si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE);
- ctx->type = old_type;
-
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
- GDS_SIZE_UNORDERED);
- }
-
- /* Assemble parameters for VS. */
- LLVMValueRef vs_params[16];
- unsigned num_vs_params = 0;
- unsigned param_vertex_id, param_instance_id;
-
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
- vs_params[num_vs_params++] = const_desc;
- vs_params[num_vs_params++] = sampler_desc;
- vs_params[num_vs_params++] = LLVMConstInt(ctx->i32,
- S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
- vs_params[num_vs_params++] = base_vertex;
- vs_params[num_vs_params++] = start_instance;
- vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */
- vs_params[num_vs_params++] = vb_desc;
-
- vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
- vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
- vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */
- vs_params[num_vs_params++] = ctx->i32_0; /* unused */
-
- assert(num_vs_params <= ARRAY_SIZE(vs_params));
- assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
-
- /* Load descriptors. (load 8 dwords at once) */
- LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
-
- tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
- ac_array_in_const32_addr_space(ctx->v8i32), "");
- tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0);
-
- for (unsigned i = 0; i < 8; i++)
- desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
-
- input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
- output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
-
- /* Compute PrimID and InstanceID. */
- LLVMValueRef global_thread_id =
- ac_build_imad(&ctx->ac, block_id,
- LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id);
- LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
- LLVMValueRef instance_id = ctx->i32_0;
-
- if (key->opt.cs_instancing) {
- /* Unpack num_prims_udiv_terms. */
- LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->i32, 0x1f, 0), "");
- LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->i32, 5, 0), "");
- /* Divide the total prim_id by the number of prims per instance. */
- instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
- num_prims_udiv_multiplier,
- post_shift);
- /* Compute the remainder. */
- prim_id = LLVMBuildSub(builder, prim_id,
- LLVMBuildMul(builder, instance_id,
- prims_per_instance, ""), "");
- }
-
- /* Generate indices (like a non-indexed draw call). */
- LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)};
- unsigned vertices_per_prim = 3;
-
- switch (key->opt.cs_prim_type) {
- case PIPE_PRIM_TRIANGLES:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_imad(&ctx->ac, prim_id,
- LLVMConstInt(ctx->i32, 3, 0),
- LLVMConstInt(ctx->i32, i, 0));
- }
- break;
- case PIPE_PRIM_TRIANGLE_STRIP:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = LLVMBuildAdd(builder, prim_id,
- LLVMConstInt(ctx->i32, i, 0), "");
- }
- break;
- case PIPE_PRIM_TRIANGLE_FAN:
- /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
- * and rasterizer as a normal triangle, so we need to put the provoking
- * vertex into the correct index variable and preserve orientation at the same time.
- * gl_VertexID is preserved, because it's equal to the index.
- */
- if (key->opt.cs_provoking_vertex_first) {
- index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
- index[2] = ctx->i32_0;
- } else {
- index[0] = ctx->i32_0;
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
- index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
- }
- break;
- default:
- unreachable("unexpected primitive type");
- }
-
- /* Fetch indices. */
- if (key->opt.cs_indexed) {
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
- index[i], ctx->i32_0, 1,
- 0, true);
- index[i] = ac_to_integer(&ctx->ac, index[i]);
- }
- }
-
- /* Extract the ordered wave ID. */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
- LLVMConstInt(ctx->i32, 6, 0), "");
- ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
- LLVMConstInt(ctx->i32, 0xfff, 0), "");
- }
- LLVMValueRef thread_id =
- LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), "");
-
- /* Every other triangle in a strip has a reversed vertex order, so we
- * need to swap vertices of odd primitives to get the correct primitive
- * orientation when converting triangle strips to triangles. Primitive
- * restart complicates it, because a strip can start anywhere.
- */
- LLVMValueRef prim_restart_accepted = ctx->i1true;
-
- if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
- /* Without primitive restart, odd primitives have reversed orientation.
- * Only primitive restart can flip it with respect to the first vertex
- * of the draw call.
- */
- LLVMValueRef first_is_odd = ctx->i1false;
-
- /* Handle primitive restart. */
- if (key->opt.cs_primitive_restart) {
- /* Get the GDS primitive restart continue flag and clear
- * the flag in vertex_counter. This flag is used when the draw
- * call was split and we need to load the primitive orientation
- * flag from GDS for the first wave too.
- */
- LLVMValueRef gds_prim_restart_continue =
- LLVMBuildLShr(builder, vertex_counter,
- LLVMConstInt(ctx->i32, 31, 0), "");
- gds_prim_restart_continue =
- LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, "");
- vertex_counter = LLVMBuildAnd(builder, vertex_counter,
- LLVMConstInt(ctx->i32, 0x7fffffff, 0), "");
-
- LLVMValueRef index0_is_reset;
-
- for (unsigned i = 0; i < 3; i++) {
- LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
- restart_index, "");
- if (i == 0)
- index0_is_reset = LLVMBuildNot(builder, not_reset, "");
- prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
- not_reset, "");
- }
-
- /* If the previous waves flip the primitive orientation
- * of the current triangle strip, it will be stored in GDS.
- *
- * Sometimes the correct orientation is not needed, in which case
- * we don't need to execute this.
- */
- if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
- /* If there are reset indices in this wave, get the thread index
- * where the most recent strip starts relative to each thread.
- */
- LLVMValueRef preceding_threads_mask =
- LLVMBuildSub(builder,
- LLVMBuildShl(builder, ctx->ac.i64_1,
- LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
- ctx->ac.i64_1, "");
-
- LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
- LLVMValueRef preceding_reset_threadmask =
- LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
- LLVMValueRef strip_start =
- ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
- strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");
-
- * This flips the orientation based on reset indices within this wave only. */
- first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");
-
- LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
- LLVMValueRef is_first_wave, current_wave_resets_index;
-
- /* Get the thread index where the last strip starts in this wave.
- *
- * If the last strip doesn't start in this wave, the thread index
- * will be 0.
- *
- * If the last strip starts in the next wave, the thread index will
- * be 64.
- */
- last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
- last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");
-
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
-
- /* This must be done in the thread 0 section, because
- * we expect PrimID to be 0 for the whole first wave
- * in this expression.
- *
- * NOTE: This will need to be different if we wanna support
- * instancing with primitive restart.
- */
- is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
- is_first_wave = LLVMBuildAnd(builder, is_first_wave,
- LLVMBuildNot(builder,
- gds_prim_restart_continue, ""), "");
- current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
- last_strip_start, ctx->i32_0, "");
-
- ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");
-
- /* Save the last strip start primitive index in GDS and read
- * the value that previous waves stored.
- *
- * if (is_first_wave || current_wave_resets_strip)
- * // Read the value that previous waves stored and store a new one.
- * first_is_odd = ds.ordered.swap(last_strip_start);
- * else
- * // Just read the value that previous waves stored.
- * first_is_odd = ds.ordered.add(0);
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildOr(builder, is_first_wave,
- current_wave_resets_index, ""), 12602);
- {
- /* The GDS address is always 0 with ordered append. */
- tmp = si_build_ds_ordered_op(ctx, "swap",
- ordered_wave_id, last_strip_start,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_else(&ctx->ac, 12603);
- {
- /* Just read the value from GDS. */
- tmp = si_build_ds_ordered_op(ctx, "add",
- ordered_wave_id, ctx->i32_0,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_endif(&ctx->ac, 12602);
-
- prev_wave_state = LLVMBuildLoad(builder, ret, "");
- /* Ignore the return value if this is the first wave. */
- prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
- ctx->i32_0, prev_wave_state, "");
- si_exit_thread0_section(&section, &prev_wave_state);
- prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");
-
- /* If the strip start appears to be on thread 0 for the current primitive
- * (meaning the reset index is not present in this wave and might have
- * appeared in previous waves), use the value from GDS to determine
- * primitive orientation.
- *
- * If the strip start is in this wave for the current primitive, use
- * the value from the current wave to determine primitive orientation.
- */
- LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
- strip_start, ctx->i32_0, "");
- first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
- first_is_odd, "");
- }
- }
- /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
- LLVMValueRef prim_is_odd =
- LLVMBuildXor(builder, first_is_odd,
- LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), "");
-
- /* Determine the primitive orientation.
- * Only swap the vertices that are not the provoking vertex. We need to keep
- * the provoking vertex in place.
- */
- if (key->opt.cs_provoking_vertex_first) {
- LLVMValueRef index1 = index[1];
- LLVMValueRef index2 = index[2];
- index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, "");
- index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, "");
- } else {
- LLVMValueRef index0 = index[0];
- LLVMValueRef index1 = index[1];
- index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, "");
- index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, "");
- }
- }
-
- /* Execute the vertex shader for each vertex to get vertex positions. */
- LLVMValueRef pos[3][4];
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- vs_params[param_vertex_id] = index[i];
- vs_params[param_instance_id] = instance_id;
-
- LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
- for (unsigned chan = 0; chan < 4; chan++)
- pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
- }
-
- /* Divide XYZ by W. */
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- for (unsigned chan = 0; chan < 3; chan++)
- pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
- }
-
- /* Load the viewport state. */
- LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
- LLVMConstInt(ctx->i32, 2, 0));
- vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
- vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
- vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
- vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
- vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
- /* Do culling. */
- struct ac_cull_options options = {};
- options.cull_front = key->opt.cs_cull_front;
- options.cull_back = key->opt.cs_cull_back;
- options.cull_view_xy = true;
- options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_small_prims = true;
- options.cull_zero_area = true;
- options.cull_w = true;
- options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
-
- LLVMValueRef accepted =
- ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
- vp_scale, vp_translate, smallprim_precision,
- &options);
-
- LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
-
- /* Count the number of active threads by doing bitcount(accepted). */
- LLVMValueRef num_prims_accepted =
- ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64,
- &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
- num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, "");
-
- LLVMValueRef start;
-
- /* Execute atomic_add on the vertex count. */
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted); // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
- num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
- num_prims_accepted, 0,
- true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(&section, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- last_wave_prim_id, ""), 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */
- if (ctx->screen->info.chip_class <= GFX8) {
- LLVMValueRef desc[] = {
- vertex_count_addr,
- LLVMConstInt(ctx->i32,
- S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
- LLVMConstInt(ctx->i32, 4, 0),
- LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
- S_008F0C_MTYPE(3 /* uncached */), 0),
- };
- LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
- ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
- ctx->i32_0, 0, ac_glc | ac_slc, false);
- } else {
- LLVMBuildStore(builder, count,
- si_expand_32bit_pointer(ctx, vertex_count_addr));
- }
- ac_build_endif(&ctx->ac, 12606);
- } else {
- /* For unordered modes that increment a vertex count instead of
- * primitive count, convert it into the primitive index.
- */
- start = LLVMBuildUDiv(builder, start,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- }
-
- /* Now we need to store the indices of accepted primitives into
- * the output index buffer.
- */
- ac_build_ifcc(&ctx->ac, accepted, 16607);
- {
- /* Get the number of bits set before the index of this thread. */
- LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
-
- /* We have lowered instancing. Pack the instance ID into vertex ID. */
- if (key->opt.cs_instancing) {
- instance_id = LLVMBuildShl(builder, instance_id,
- LLVMConstInt(ctx->i32, 16, 0), "");
-
- for (unsigned i = 0; i < vertices_per_prim; i++)
- index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
- }
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- /* vertex_counter contains the first primitive ID
- * for this dispatch. If the draw call was split into
- * multiple subdraws, the first primitive ID is > 0
- * for subsequent subdraws. Each subdraw uses a different
- * portion of the output index buffer. Offset the store
- * vindex by the first primitive ID to get the correct
- * store address for the subdraw.
- */
- start = LLVMBuildAdd(builder, start, vertex_counter, "");
- }
-
- /* Write indices for accepted primitives. */
- LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
- LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
-
- if (!ac_has_vec3_support(ctx->ac.chip_class, true))
- vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
-
- ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
- vindex, ctx->i32_0, 3,
- ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
- }
- ac_build_endif(&ctx->ac, 16607);
-
- LLVMBuildRetVoid(builder);
-}
-
-/* Return false if the shader isn't ready. */
-static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart)
-{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_shader_key key;
-
- /* Primitive restart needs ordered counters. */
- assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
- assert(!primitive_restart || info->instance_count == 1);
-
- memset(&key, 0, sizeof(key));
- si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
- assert(!key.part.vs.prolog.instance_divisor_is_fetched);
-
- key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
- key.opt.vs_as_prim_discard_cs = 1;
- key.opt.cs_prim_type = info->mode;
- key.opt.cs_indexed = info->index_size != 0;
- key.opt.cs_instancing = info->instance_count > 1;
- key.opt.cs_primitive_restart = primitive_restart;
- key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
-
- /* Primitive restart with triangle strips needs to preserve primitive
- * orientation for cases where front and back primitive orientation matters.
- */
- if (primitive_restart) {
- struct si_shader_selector *ps = sctx->ps_shader.cso;
-
- key.opt.cs_need_correct_orientation =
- rs->cull_front != rs->cull_back ||
- ps->info.uses_frontface ||
- (rs->two_side && ps->info.colors_read);
- }
-
- if (rs->rasterizer_discard) {
- /* Just for performance testing and analysis of trivial bottlenecks.
- * This should result in a very short compute shader. */
- key.opt.cs_cull_front = 1;
- key.opt.cs_cull_back = 1;
- } else {
- key.opt.cs_cull_front =
- sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
- key.opt.cs_cull_back =
- sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
- }
-
- if (!rs->depth_clamp_any && CULL_Z) {
- key.opt.cs_cull_z = 1;
- key.opt.cs_halfz_clip_space = rs->clip_halfz;
- }
-
- sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
- sctx->cs_prim_discard_state.current = NULL;
-
- struct si_compiler_ctx_state compiler_state;
- compiler_state.compiler = &sctx->compiler;
- compiler_state.debug = sctx->debug;
- compiler_state.is_debug_context = sctx->is_debug;
-
- return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
- &compiler_state, &key, -1, true) == 0 &&
- /* Disallow compute shaders using the scratch buffer. */
- sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
-}
-
-static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
-{
- if (sctx->index_ring)
- return true;
-
- if (!sctx->prim_discard_compute_cs) {
- struct radeon_winsys *ws = sctx->ws;
- unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
- VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
- unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
-
- if (gds_size) {
- sctx->gds = ws->buffer_create(ws, gds_size, 4,
- RADEON_DOMAIN_GDS, 0);
- if (!sctx->gds)
- return false;
-
- ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
- RADEON_USAGE_READWRITE, 0, 0);
- }
- if (num_oa_counters) {
- assert(gds_size);
- sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
- 1, RADEON_DOMAIN_OA, 0);
- if (!sctx->gds_oa)
- return false;
-
- ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
- RADEON_USAGE_READWRITE, 0, 0);
- }
-
- sctx->prim_discard_compute_cs =
- ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
- num_oa_counters > 0);
- if (!sctx->prim_discard_compute_cs)
- return false;
- }
-
- if (!sctx->index_ring) {
- sctx->index_ring =
- si_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- sctx->index_ring_size_per_ib * 2,
- 2 * 1024 * 1024);
- if (!sctx->index_ring)
- return false;
- }
- return true;
-}
-
-static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
-{
- return sctx->index_ring_offset +
- align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
- sctx->index_ring_size_per_ib;
-}
-
-enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart)
-{
- /* If the compute shader compilation isn't finished, this returns false. */
- if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
- return SI_PRIM_DISCARD_DISABLED;
-
- if (!si_initialize_prim_discard_cmdbuf(sctx))
- return SI_PRIM_DISCARD_DISABLED;
-
- struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
- unsigned prim = info->mode;
- unsigned count = info->count;
- unsigned instance_count = info->instance_count;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
- unsigned num_prims = num_prims_per_instance * instance_count;
- unsigned out_indexbuf_size = num_prims * 12;
- bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
- const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
-
- /* Split draws at the draw call level if the ring is full. This makes
- * better use of the ring space.
- */
- if (ring_full &&
- num_prims > split_prims_draw_level &&
- instance_count == 1 && /* TODO: support splitting instanced draws */
- (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
- /* Split draws. */
- struct pipe_draw_info split_draw = *info;
- split_draw.primitive_restart = primitive_restart;
-
- unsigned base_start = split_draw.start;
-
- if (prim == PIPE_PRIM_TRIANGLES) {
- unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
- assert(vert_count_per_subdraw < count);
-
- for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
- split_draw.start = base_start + start;
- split_draw.count = MIN2(count - start, vert_count_per_subdraw);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw);
- }
- } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
- /* No primitive pair can be split, because strips reverse orientation
- * for odd primitives. */
- STATIC_ASSERT(split_prims_draw_level % 2 == 0);
-
- unsigned vert_count_per_subdraw = split_prims_draw_level;
-
- for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
- split_draw.start = base_start + start;
- split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw);
-
- if (start == 0 &&
- primitive_restart &&
- sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
- sctx->preserve_prim_restart_gds_at_flush = true;
- }
- sctx->preserve_prim_restart_gds_at_flush = false;
- } else {
- assert(0);
- }
-
- return SI_PRIM_DISCARD_DRAW_SPLIT;
- }
-
- /* Just quit if the draw call doesn't fit into the ring and can't be split. */
- if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
- if (SI_PRIM_DISCARD_DEBUG)
- puts("PD failed: draw call too big, can't be split");
- return SI_PRIM_DISCARD_DISABLED;
- }
-
- unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
- unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
- 24 * (num_subdraws - 1) + /* subdraws */
- 20; /* leave some space at the end */
- unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
- need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
- else
- need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
-
- if (ring_full ||
- (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
- !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
- /* If the current IB is empty but the size is too small, add a NOP
- * packet to force a flush and get a bigger IB.
- */
- if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
- gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
- }
-
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- }
-
- /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
- assert(compute_has_space);
- assert(si_check_ring_space(sctx, out_indexbuf_size));
- return SI_PRIM_DISCARD_ENABLED;
-}
-
-void si_compute_signal_gfx(struct si_context *sctx)
-{
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- unsigned writeback_L2_flags = 0;
-
- /* The writeback L2 flags vary with each chip generation. */
- /* CI needs to flush vertex indices to memory. */
- if (sctx->chip_class <= GFX7)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
- else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
-
- if (!sctx->compute_num_prims_in_batch)
- return;
-
- assert(sctx->compute_rewind_va);
-
- /* After the queued dispatches are done and vertex counts are written to
- * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
- * the dispatches to finish, it only adds the CS_DONE event into the event
- * queue.
- */
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
- EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- NULL,
- sctx->compute_rewind_va |
- ((uint64_t)sctx->screen->info.address32_hi << 32),
- REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
- SI_NOT_QUERY);
-
- sctx->compute_rewind_va = 0;
- sctx->compute_num_prims_in_batch = 0;
-}
-
-/* Dispatch a primitive discard compute shader. */
-void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned index_size,
- unsigned base_vertex,
- uint64_t input_indexbuf_va,
- unsigned input_indexbuf_num_elements)
-{
- struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
- if (!num_prims_per_instance)
- return;
-
- unsigned num_prims = num_prims_per_instance * info->instance_count;
- unsigned vertices_per_prim, output_indexbuf_format;
-
- switch (info->mode) {
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- vertices_per_prim = 3;
- output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
- break;
- default:
- unreachable("unsupported primitive type");
- return;
- }
-
- unsigned out_indexbuf_offset;
- uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
- bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
-
- /* Initialize the compute IB if it's empty. */
- if (!sctx->prim_discard_compute_ib_initialized) {
- /* 1) State initialization. */
- sctx->compute_gds_offset = 0;
- sctx->compute_ib_last_shader = NULL;
-
- if (sctx->last_ib_barrier_fence) {
- assert(!sctx->last_ib_barrier_buf);
- sctx->ws->cs_add_fence_dependency(gfx_cs,
- sctx->last_ib_barrier_fence,
- RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
- }
-
- /* 2) IB initialization. */
-
- /* This needs to be done at the beginning of IBs due to possible
- * TTM buffer moves in the kernel.
- *
- * TODO: update for GFX10
- */
- si_emit_surface_sync(sctx, cs,
- S_0085F0_TC_ACTION_ENA(1) |
- S_0085F0_TCL1_ACTION_ENA(1) |
- S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
- S_0085F0_SH_ICACHE_ACTION_ENA(1) |
- S_0085F0_SH_KCACHE_ACTION_ENA(1));
-
- /* Restore the GDS prim restart counter if needed. */
- if (sctx->preserve_prim_restart_gds_at_flush) {
- si_cp_copy_data(sctx, cs,
- COPY_DATA_GDS, NULL, 4,
- COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
- }
-
- si_emit_initial_compute_regs(sctx, cs);
-
- radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(sctx->scratch_waves) |
- S_00B860_WAVESIZE(0)); /* no scratch */
-
- /* Only 1D grids are launched. */
- radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
- S_00B820_NUM_THREAD_PARTIAL(1));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
- S_00B824_NUM_THREAD_PARTIAL(1));
-
- radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
-
- /* Disable ordered alloc for OA resources. */
- for (unsigned i = 0; i < 2; i++) {
- radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
- radeon_emit(cs, S_031074_INDEX(i));
- radeon_emit(cs, 0);
- radeon_emit(cs, S_03107C_ENABLE(0));
- }
-
- if (sctx->last_ib_barrier_buf) {
- assert(!sctx->last_ib_barrier_fence);
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
- RADEON_USAGE_READ, RADEON_PRIO_FENCE);
- si_cp_wait_mem(sctx, cs,
- sctx->last_ib_barrier_buf->gpu_address +
- sctx->last_ib_barrier_buf_offset, 1, 1,
- WAIT_REG_MEM_EQUAL);
- }
-
- sctx->prim_discard_compute_ib_initialized = true;
- }
-
- /* Allocate the output index buffer. */
- output_indexbuf_size = align(output_indexbuf_size,
- sctx->screen->info.tcc_cache_line_size);
- assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
- out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
- sctx->index_ring_offset += output_indexbuf_size;
-
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SHADER_RW_BUFFER);
- uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
-
- /* Prepare index buffer descriptors. */
- struct si_resource *indexbuf_desc = NULL;
- unsigned indexbuf_desc_offset;
- unsigned desc_size = 12 * 4;
- uint32_t *desc;
-
- u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
- si_optimal_tcc_alignment(sctx, desc_size),
- &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
- (void**)&desc);
- radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
-
- /* Input index buffer. */
- desc[0] = input_indexbuf_va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
- S_008F04_STRIDE(index_size);
- desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
- index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
- V_008F0C_BUF_DATA_FORMAT_32);
-
- /* Output index buffer. */
- desc[4] = out_indexbuf_va;
- desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
- S_008F04_STRIDE(vertices_per_prim * 4);
- desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
- desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(output_indexbuf_format);
-
- /* Viewport state.
- * This is needed by the small primitive culling, because it's done
- * in screen space.
- */
- float scale[2], translate[2];
-
- scale[0] = sctx->viewports.states[0].scale[0];
- scale[1] = sctx->viewports.states[0].scale[1];
- translate[0] = sctx->viewports.states[0].translate[0];
- translate[1] = sctx->viewports.states[0].translate[1];
-
- /* The viewport shouldn't flip the X axis for the small prim culling to work. */
- assert(-scale[0] + translate[0] <= scale[0] + translate[0]);
-
- /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
- * This is because the viewport transformation inverts the clip space
- * bounding box, so min becomes max, which breaks small primitive
- * culling.
- */
- if (sctx->viewports.y_inverted) {
- scale[1] = -scale[1];
- translate[1] = -translate[1];
- }
-
- /* Scale the framebuffer up, so that samples become pixels and small
- * primitive culling is the same for all sample counts.
- * This only works with the standard DX sample positions, because
- * the samples are evenly spaced on both X and Y axes.
- */
- unsigned num_samples = sctx->framebuffer.nr_samples;
- assert(num_samples >= 1);
-
- for (unsigned i = 0; i < 2; i++) {
- scale[i] *= num_samples;
- translate[i] *= num_samples;
- }
-
- desc[8] = fui(scale[0]);
- desc[9] = fui(scale[1]);
- desc[10] = fui(translate[0]);
- desc[11] = fui(translate[1]);
-
- /* Better subpixel precision increases the efficiency of small
- * primitive culling. */
- unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
- float small_prim_cull_precision;
-
- if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
- small_prim_cull_precision = num_samples / 4096.0;
- else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
- small_prim_cull_precision = num_samples / 1024.0;
- else
- small_prim_cull_precision = num_samples / 256.0;
-
- /* Set user data SGPRs. */
- /* This can't be greater than 14 if we want the fastest launch rate. */
- unsigned user_sgprs = 13;
-
- uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
- unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
- unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
- uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
- uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
- uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
- sctx->vb_descriptors_buffer->gpu_address +
- sctx->vb_descriptors_offset : 0;
- unsigned gds_offset, gds_size;
- struct si_fast_udiv_info32 num_prims_udiv = {};
-
- if (info->instance_count > 1)
- num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
-
- /* Limitations on how these two are packed in the user SGPR. */
- assert(num_prims_udiv.post_shift < 32);
- assert(num_prims_per_instance < 1 << 27);
-
- si_resource_reference(&indexbuf_desc, NULL);
-
- bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
-
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- gds_offset = sctx->compute_gds_offset;
- gds_size = primitive_restart ? 8 : 4;
- sctx->compute_gds_offset += gds_size;
-
- /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
- * The remainder of the GDS will be cleared after the dispatch packet
- * in parallel with compute shaders.
- */
- if (first_dispatch) {
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
- radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
- radeon_emit(cs, gds_offset);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0); /* value to write */
- if (gds_size == 8)
- radeon_emit(cs, 0);
- }
- }
-
- /* Set shader registers. */
- struct si_shader *shader = sctx->cs_prim_discard_state.current;
-
- if (shader != sctx->compute_ib_last_shader) {
- radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
- RADEON_PRIO_SHADER_BINARY);
- uint64_t shader_va = shader->bo->gpu_address;
-
- assert(shader->config.scratch_bytes_per_wave == 0);
- assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
-
- radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
- radeon_emit(cs, shader_va >> 8);
- radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
- radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
- radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B848_FLOAT_MODE(shader->config.float_mode) |
- S_00B848_DX10_CLAMP(1));
- radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
- S_00B84C_USER_SGPR(user_sgprs) |
- S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
- S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
- S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
- S_00B84C_LDS_SIZE(shader->config.lds_size));
-
- radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
- ac_get_compute_resource_limits(&sctx->screen->info,
- WAVES_PER_TG,
- MAX_WAVES_PER_SH,
- THREADGROUPS_PER_CU));
- sctx->compute_ib_last_shader = shader;
- }
-
- STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
-
- /* Big draw calls are split into smaller dispatches and draw packets. */
- for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
- unsigned num_subdraw_prims;
-
- if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
- num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
- else
- num_subdraw_prims = num_prims - start_prim;
-
- /* Small dispatches are executed back to back until a specific primitive
- * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
- * to start drawing the batch. This batching adds latency to the gfx IB,
- * but CS_DONE and REWIND are too slow.
- */
- if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
- si_compute_signal_gfx(sctx);
-
- if (sctx->compute_num_prims_in_batch == 0) {
- assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
- sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
-
- si_cp_wait_mem(sctx, gfx_cs,
- sctx->compute_rewind_va |
- (uint64_t)sctx->screen->info.address32_hi << 32,
- REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
- WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
-
- /* Use INDIRECT_BUFFER to chain to a different buffer
- * to discard the CP prefetch cache.
- */
- sctx->ws->cs_check_space(gfx_cs, 0, true);
- } else {
- radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
- radeon_emit(gfx_cs, 0);
- }
- }
-
- sctx->compute_num_prims_in_batch += num_subdraw_prims;
-
- uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
- uint64_t index_va = out_indexbuf_va + start_prim * 12;
-
- /* Emit the draw packet into the gfx IB. */
- radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
- radeon_emit(gfx_cs, num_prims * vertices_per_prim);
- radeon_emit(gfx_cs, index_va);
- radeon_emit(gfx_cs, index_va >> 32);
- radeon_emit(gfx_cs, 0);
- radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
-
- /* Continue with the compute IB. */
- if (start_prim == 0) {
- uint32_t gds_prim_restart_continue_bit = 0;
-
- if (sctx->preserve_prim_restart_gds_at_flush) {
- assert(primitive_restart &&
- info->mode == PIPE_PRIM_TRIANGLE_STRIP);
- assert(start_prim < 1 << 31);
- gds_prim_restart_continue_bit = 1 << 31;
- }
-
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
- radeon_emit(cs, index_buffers_va);
- radeon_emit(cs,
- VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
- VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
- start_prim |
- gds_prim_restart_continue_bit);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- radeon_emit(cs, vb_desc_va);
- radeon_emit(cs, vs_const_desc_va);
- radeon_emit(cs, vs_sampler_desc_va);
- radeon_emit(cs, base_vertex);
- radeon_emit(cs, info->start_instance);
- radeon_emit(cs, num_prims_udiv.multiplier);
- radeon_emit(cs, num_prims_udiv.post_shift |
- (num_prims_per_instance << 5));
- radeon_emit(cs, info->restart_index);
- /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
- radeon_emit(cs, fui(small_prim_cull_precision));
- } else {
- assert(VERTEX_COUNTER_GDS_MODE == 2);
- /* Only update the SGPRs that changed. */
- radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
- radeon_emit(cs, start_prim);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- }
-
- /* Set grid dimensions. */
- unsigned start_block = start_prim / THREADGROUP_SIZE;
- unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
- unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
-
- radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
- radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
- S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
- S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
-
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
- PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
- radeon_emit(cs, 1);
- radeon_emit(cs, 1);
- radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
- S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
- S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
- S_00B800_ORDER_MODE(0 /* launch in order */));
-
- /* This is only for unordered append. Ordered append writes this from
- * the shader.
- *
- * Note that EOP and EOS events are super slow, so emulating the event
- * in a shader is an important optimization.
- */
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- EOP_INT_SEL_NONE,
- EOP_DATA_SEL_GDS,
- NULL,
- count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
- EOP_DATA_GDS(gds_offset / 4, 1),
- SI_NOT_QUERY);
-
- /* Now that compute shaders are running, clear the remainder of GDS. */
- if (first_dispatch) {
- unsigned offset = gds_offset + gds_size;
- si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
- GDS_SIZE_UNORDERED - offset,
- 0,
- SI_CPDMA_SKIP_CHECK_CS_SPACE |
- SI_CPDMA_SKIP_GFX_SYNC |
- SI_CPDMA_SKIP_SYNC_BEFORE,
- SI_COHERENCY_NONE, L2_BYPASS);
- }
- }
- first_dispatch = false;
-
- assert(cs->current.cdw <= cs->current.max_dw);
- assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
- }
-}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c
index b7aece564..ca2230620 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -100,22 +100,22 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
radeon_begin(cs);
if (sctx->chip_class >= GFX7) {
- radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
- radeon_emit(cs, header);
- radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
- radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
- radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
- radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, command);
+ radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
+ radeon_emit(header);
+ radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */
+ radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */
+ radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */
+ radeon_emit(command);
} else {
header |= S_411_SRC_ADDR_HI(src_va >> 32);
- radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
- radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
- radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */
- radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
- radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, command);
+ radeon_emit(PKT3(PKT3_CP_DMA, 4, 0));
+ radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(header); /* SRC_ADDR_HI [15:0] + flags. */
+ radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */
+ radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
+ radeon_emit(command);
}
/* CP DMA is executed in ME, but index buffers are read by PFP.
@@ -124,8 +124,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
* should precede it.
*/
if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
+ radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(0);
}
radeon_end();
}
@@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
sdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
- if (coher == SI_COHERENCY_SHADER) {
+ if (coher == SI_COHERENCY_SHADER)
sctx->num_cp_dma_calls++;
- si_prim_discard_signal_next_compute_ib_start(sctx);
- }
}
/**
@@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
si_resource(dst)->TC_L2_dirty = true;
/* If it's not a prefetch or GDS copy... */
- if (dst && src && (dst != src || dst_offset != src_offset)) {
+ if (dst && src && (dst != src || dst_offset != src_offset))
sctx->num_cp_dma_calls++;
- si_prim_discard_signal_next_compute_ib_start(sctx);
- }
}
void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
@@ -423,13 +419,13 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
- radeon_emit(cs, header);
- radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */
- radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */
- radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */
- radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, command);
+ radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
+ radeon_emit(header);
+ radeon_emit(address); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */
+ radeon_emit(address); /* DST_ADDR_LO [31:0] */
+ radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */
+ radeon_emit(command);
radeon_end();
}
@@ -495,11 +491,11 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned
uint64_t va = buf->gpu_address + offset;
radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
- radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- radeon_emit_array(cs, (const uint32_t *)data, size / 4);
+ radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
+ radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
+ radeon_emit(va);
+ radeon_emit(va >> 32);
+ radeon_emit_array((const uint32_t *)data, size / 4);
radeon_end();
}
@@ -519,11 +515,11 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned
uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
- radeon_emit(cs, src_va);
- radeon_emit(cs, src_va >> 32);
- radeon_emit(cs, dst_va);
- radeon_emit(cs, dst_va >> 32);
+ radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
+ radeon_emit(src_va);
+ radeon_emit(src_va >> 32);
+ radeon_emit(dst_va);
+ radeon_emit(dst_va >> 32);
radeon_end();
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c
index bcc8baa93..540206c15 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c
@@ -344,7 +344,6 @@ struct si_log_chunk_cs {
struct si_saved_cs *cs;
bool dump_bo_list;
unsigned gfx_begin, gfx_end;
- unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
@@ -390,13 +389,18 @@ static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begi
fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end);
}
+void si_print_current_ib(struct si_context *sctx, FILE *f)
+{
+ si_parse_current_ib(f, &sctx->gfx_cs, 0, sctx->gfx_cs.prev_dw + sctx->gfx_cs.current.cdw,
+ NULL, 0, "GFX", sctx->chip_class);
+}
+
static void si_log_chunk_type_cs_print(void *data, FILE *f)
{
struct si_log_chunk_cs *chunk = data;
struct si_context *ctx = chunk->ctx;
struct si_saved_cs *scs = chunk->cs;
int last_trace_id = -1;
- int last_compute_trace_id = -1;
/* We are expecting that the ddebug pipe has already
* waited for the context, so this buffer should be idle.
@@ -404,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
*/
uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL,
PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ);
- if (map) {
+ if (map)
last_trace_id = map[0];
- last_compute_trace_id = map[1];
- }
if (chunk->gfx_end != chunk->gfx_begin) {
if (chunk->gfx_begin == 0) {
@@ -429,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
}
}
- if (chunk->compute_end != chunk->compute_begin) {
- assert(ctx->prim_discard_compute_cs.priv);
-
- if (scs->flushed) {
- ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
- chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
- "Compute IB", ctx->chip_class, NULL, NULL);
- } else {
- si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin,
- chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
- ctx->chip_class);
- }
- }
-
if (chunk->dump_bo_list) {
fprintf(f, "Flushing. Time: ");
util_dump_ns(f, scs->time_flush);
@@ -462,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
struct si_saved_cs *scs = ctx->current_saved_cs;
unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw;
- unsigned compute_cur = 0;
-
- if (ctx->prim_discard_compute_cs.priv)
- compute_cur =
- ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw;
- if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
+ if (!dump_bo_list && gfx_cur == scs->gfx_last_dw)
return;
struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@@ -481,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
chunk->gfx_end = gfx_cur;
scs->gfx_last_dw = gfx_cur;
- chunk->compute_begin = scs->compute_last_dw;
- chunk->compute_end = compute_cur;
- scs->compute_last_dw = compute_cur;
-
u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c
index 60daaeb07..f02855743 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -231,15 +231,6 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_reso
priority = si_get_sampler_view_priority(&tex->buffer);
radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem);
-
- if (resource->target == PIPE_BUFFER)
- return;
-
- /* Add separate DCC. */
- if (tex->dcc_separate_buffer) {
- radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage,
- RADEON_PRIO_SEPARATE_META, check_mem);
- }
}
static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers)
@@ -296,7 +287,8 @@ static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, ui
void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
const struct legacy_surf_level *base_level_info,
unsigned base_level, unsigned first_level, unsigned block_width,
- bool is_stencil, uint16_t access, uint32_t *state)
+ /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */
+ bool is_stencil, uint16_t access, uint32_t * restrict state)
{
uint64_t va, meta_va = 0;
@@ -318,7 +310,6 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
}
state[0] = va >> 8;
- state[1] &= C_008F14_BASE_ADDRESS_HI;
state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
/* Only macrotiled modes can set tile swizzle.
@@ -328,11 +319,8 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
state[0] |= tex->surface.tile_swizzle;
if (sscreen->info.chip_class >= GFX8) {
- state[6] &= C_008F28_COMPRESSION_EN;
-
if (!(access & SI_IMAGE_ACCESS_DCC_OFF) && vi_dcc_enabled(tex, first_level)) {
- meta_va =
- (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.meta_offset;
+ meta_va = tex->buffer.gpu_address + tex->surface.meta_offset;
if (sscreen->info.chip_class == GFX8) {
meta_va += tex->surface.u.legacy.color.dcc_level[base_level].dcc_offset;
@@ -355,17 +343,12 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
state[7] = meta_va >> 8;
if (sscreen->info.chip_class >= GFX10) {
- state[3] &= C_00A00C_SW_MODE;
-
if (is_stencil) {
state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode);
} else {
state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.swizzle_mode);
}
- state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED &
- C_00A018_WRITE_COMPRESS_ENABLE;
-
if (meta_va) {
struct gfx9_surf_meta_flags meta = {
.rb_aligned = 1,
@@ -377,14 +360,21 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) |
S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8) |
- S_00A018_WRITE_COMPRESS_ENABLE((access & SI_IMAGE_ACCESS_DCC_WRITE) != 0);
+ /* DCC image stores require the following settings:
+ * - INDEPENDENT_64B_BLOCKS = 0
+ * - INDEPENDENT_128B_BLOCKS = 1
+ * - MAX_COMPRESSED_BLOCK_SIZE = 128B
+ * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used)
+ *
+ * The same limitations apply to SDMA compressed stores because
+ * SDMA uses the same DCC codec.
+ */
+ S_00A018_WRITE_COMPRESS_ENABLE(ac_surface_supports_dcc_image_stores(sscreen->info.chip_class, &tex->surface) &&
+ (access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE));
}
state[7] = meta_va >> 16;
} else if (sscreen->info.chip_class == GFX9) {
- state[3] &= C_008F1C_SW_MODE;
- state[4] &= C_008F20_PITCH;
-
if (is_stencil) {
state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode);
state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.zs.stencil_epitch);
@@ -423,9 +413,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture
unsigned pitch = base_level_info->nblk_x * block_width;
unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
- state[3] &= C_008F1C_TILING_INDEX;
state[3] |= S_008F1C_TILING_INDEX(index);
- state[4] &= C_008F20_PITCH;
state[4] |= S_008F20_PITCH(pitch - 1);
}
@@ -451,13 +439,23 @@ static void si_set_sampler_state_desc(struct si_sampler_state *sstate,
}
static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview,
- struct si_sampler_state *sstate, uint32_t *desc)
+ struct si_sampler_state *sstate,
+ /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */
+ uint32_t * restrict desc)
{
struct pipe_sampler_view *view = &sview->base;
struct si_texture *tex = (struct si_texture *)view->texture;
- bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
- if (unlikely(!is_buffer && sview->dcc_incompatible)) {
+ assert(tex); /* views with texture == NULL aren't supported */
+
+ if (tex->buffer.b.b.target == PIPE_BUFFER) {
+ memcpy(desc, sview->state, 8 * 4);
+ memcpy(desc + 8, null_texture_descriptor, 4 * 4); /* Disable FMASK. */
+ si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4);
+ return;
+ }
+
+ if (unlikely(sview->dcc_incompatible)) {
if (vi_dcc_enabled(tex, view->u.tex.first_level))
if (!si_texture_disable_dcc(sctx, tex))
si_decompress_dcc(sctx, tex);
@@ -465,27 +463,21 @@ static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_
sview->dcc_incompatible = false;
}
- assert(tex); /* views with texture == NULL aren't supported */
- memcpy(desc, sview->state, 8 * 4);
+ bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler;
- if (is_buffer) {
- si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4);
- } else {
- bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler;
-
- si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level,
- sview->base.u.tex.first_level, sview->block_width,
- is_separate_stencil, 0, desc);
- }
+ memcpy(desc, sview->state, 8 * 4);
+ si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level,
+ sview->base.u.tex.first_level, sview->block_width,
+ is_separate_stencil, 0, desc);
- if (!is_buffer && tex->surface.fmask_size) {
+ if (tex->surface.fmask_size) {
memcpy(desc + 8, sview->fmask_state, 8 * 4);
} else {
/* Disable FMASK and bind sampler state in [12:15]. */
memcpy(desc + 8, null_texture_descriptor, 4 * 4);
if (sstate)
- si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12);
+ si_set_sampler_state_desc(sstate, sview, tex, desc + 12);
}
}
@@ -508,65 +500,106 @@ static bool depth_needs_decompression(struct si_texture *tex)
return tex->db_compatible;
}
-static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot,
- struct pipe_sampler_view *view, bool disallow_early_out)
+static void si_reset_sampler_view_slot(struct si_samplers *samplers, unsigned slot,
+ uint32_t * restrict desc)
+{
+ pipe_sampler_view_reference(&samplers->views[slot], NULL);
+ memcpy(desc, null_texture_descriptor, 8 * 4);
+ /* Only clear the lower dwords of FMASK. */
+ memcpy(desc + 8, null_texture_descriptor, 4 * 4);
+ /* Re-set the sampler state if we are transitioning from FMASK. */
+ if (samplers->sampler_states[slot])
+ si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12);
+}
+
+static void si_set_sampler_views(struct si_context *sctx, unsigned shader,
+ unsigned start_slot, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ bool take_ownership, struct pipe_sampler_view **views,
+ bool disallow_early_out)
{
struct si_samplers *samplers = &sctx->samplers[shader];
- struct si_sampler_view *sview = (struct si_sampler_view *)view;
struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
- unsigned desc_slot = si_get_sampler_slot(slot);
- uint32_t *desc = descs->list + desc_slot * 16;
+ uint32_t unbound_mask = 0;
- if (samplers->views[slot] == view && !disallow_early_out)
- return;
+ if (views) {
+ for (unsigned i = 0; i < count; i++) {
+ unsigned slot = start_slot + i;
+ struct si_sampler_view *sview = (struct si_sampler_view *)views[i];
+ unsigned desc_slot = si_get_sampler_slot(slot);
+ /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */
+ uint32_t *restrict desc = descs->list + desc_slot * 16;
+
+ if (samplers->views[slot] == &sview->base && !disallow_early_out) {
+ if (take_ownership) {
+ struct pipe_sampler_view *view = views[i];
+ pipe_sampler_view_reference(&view, NULL);
+ }
+ continue;
+ }
- if (view) {
- struct si_texture *tex = (struct si_texture *)view->texture;
+ if (sview) {
+ struct si_texture *tex = (struct si_texture *)sview->base.texture;
+
+ si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc);
+
+ if (tex->buffer.b.b.target == PIPE_BUFFER) {
+ tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
+ samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ samplers->needs_color_decompress_mask &= ~(1u << slot);
+ } else {
+ if (depth_needs_decompression(tex)) {
+ samplers->needs_depth_decompress_mask |= 1u << slot;
+ } else {
+ samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ }
+ if (color_needs_decompression(tex)) {
+ samplers->needs_color_decompress_mask |= 1u << slot;
+ } else {
+ samplers->needs_color_decompress_mask &= ~(1u << slot);
+ }
+
+ if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) &&
+ p_atomic_read(&tex->framebuffers_bound))
+ sctx->need_check_render_feedback = true;
+ }
- si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc);
+ if (take_ownership) {
+ pipe_sampler_view_reference(&samplers->views[slot], NULL);
+ samplers->views[slot] = &sview->base;
+ } else {
+ pipe_sampler_view_reference(&samplers->views[slot], &sview->base);
+ }
+ samplers->enabled_mask |= 1u << slot;
- if (tex->buffer.b.b.target == PIPE_BUFFER) {
- tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
- samplers->needs_color_decompress_mask &= ~(1u << slot);
- } else {
- if (depth_needs_decompression(tex)) {
- samplers->needs_depth_decompress_mask |= 1u << slot;
+ /* Since this can flush, it must be done after enabled_mask is
+ * updated. */
+ si_sampler_view_add_buffer(sctx, &tex->buffer.b.b, RADEON_USAGE_READ,
+ sview->is_stencil_sampler, true);
} else {
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ si_reset_sampler_view_slot(samplers, slot, desc);
+ unbound_mask |= 1u << slot;
}
- if (color_needs_decompression(tex)) {
- samplers->needs_color_decompress_mask |= 1u << slot;
- } else {
- samplers->needs_color_decompress_mask &= ~(1u << slot);
- }
-
- if (vi_dcc_enabled(tex, view->u.tex.first_level) &&
- p_atomic_read(&tex->framebuffers_bound))
- sctx->need_check_render_feedback = true;
}
-
- pipe_sampler_view_reference(&samplers->views[slot], view);
- samplers->enabled_mask |= 1u << slot;
-
- /* Since this can flush, it must be done after enabled_mask is
- * updated. */
- si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler,
- true);
} else {
- pipe_sampler_view_reference(&samplers->views[slot], NULL);
- memcpy(desc, null_texture_descriptor, 8 * 4);
- /* Only clear the lower dwords of FMASK. */
- memcpy(desc + 8, null_texture_descriptor, 4 * 4);
- /* Re-set the sampler state if we are transitioning from FMASK. */
- if (samplers->sampler_states[slot])
- si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12);
+ unbind_num_trailing_slots += count;
+ count = 0;
+ }
- samplers->enabled_mask &= ~(1u << slot);
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
- samplers->needs_color_decompress_mask &= ~(1u << slot);
+ for (unsigned i = 0; i < unbind_num_trailing_slots; i++) {
+ unsigned slot = start_slot + count + i;
+ unsigned desc_slot = si_get_sampler_slot(slot);
+ uint32_t * restrict desc = descs->list + desc_slot * 16;
+
+ if (samplers->views[slot])
+ si_reset_sampler_view_slot(samplers, slot, desc);
}
+ unbound_mask |= BITFIELD_RANGE(start_slot + count, unbind_num_trailing_slots);
+ samplers->enabled_mask &= ~unbound_mask;
+ samplers->needs_depth_decompress_mask &= ~unbound_mask;
+ samplers->needs_color_decompress_mask &= ~unbound_mask;
+
sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
}
@@ -582,28 +615,18 @@ static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsi
sctx->shader_needs_decompress_mask &= ~shader_bit;
}
-static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader,
- unsigned start, unsigned count,
- unsigned unbind_num_trailing_slots,
- struct pipe_sampler_view **views)
+static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader,
+ unsigned start, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ bool take_ownership, struct pipe_sampler_view **views)
{
struct si_context *sctx = (struct si_context *)ctx;
- int i;
if ((!count && !unbind_num_trailing_slots) || shader >= SI_NUM_SHADERS)
return;
- if (views) {
- for (i = 0; i < count; i++)
- si_set_sampler_view(sctx, shader, start + i, views[i], false);
- } else {
- for (i = 0; i < count; i++)
- si_set_sampler_view(sctx, shader, start + i, NULL, false);
- }
-
- for (; i < count + unbind_num_trailing_slots; i++)
- si_set_sampler_view(sctx, shader, start + i, NULL, false);
-
+ si_set_sampler_views(sctx, shader, start, count, unbind_num_trailing_slots,
+ take_ownership, views, false);
si_update_shader_needs_decompress_mask(sctx, shader);
}
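
Editor's note: si_pipe_set_sampler_views gains the take_ownership parameter from the gallium interface: when set, the driver keeps the caller's reference instead of adding its own. A toy refcounting sketch of the two paths (the struct and helpers are illustrative, not the gallium implementation):

#include <stdio.h>

struct view { int refcount; };

static void view_ref(struct view *v)   { v->refcount++; }
static void view_unref(struct view *v) { v->refcount--; }

/* Bind `src` into `*slot`. With take_ownership the caller's reference is
 * transferred to the slot; otherwise the slot takes its own reference. */
static void bind_view(struct view **slot, struct view *src, int take_ownership)
{
   if (*slot)
      view_unref(*slot);
   if (src && !take_ownership)
      view_ref(src);
   *slot = src;
}

int main(void)
{
   struct view v = { .refcount = 1 };   /* reference held by the caller */
   struct view *slot_a = NULL, *slot_b = NULL;

   bind_view(&slot_a, &v, 0);           /* shared: slot adds a reference -> 2 */
   bind_view(&slot_b, &v, 1);           /* ownership transfer: count stays 2 */
   printf("refcount = %d\n", v.refcount);
   return 0;
}
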
@@ -710,7 +733,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i
res = si_resource(view->resource);
- if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+ if (res->b.b.target == PIPE_BUFFER) {
if (view->access & PIPE_IMAGE_ACCESS_WRITE)
si_mark_image_range_valid(view);
@@ -725,12 +748,15 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i
bool uses_dcc = vi_dcc_enabled(tex, level);
unsigned access = view->access;
+ if (uses_dcc && screen->always_allow_dcc_stores)
+ access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE;
+
assert(!tex->is_depth);
assert(fmask_desc || tex->surface.fmask_offset == 0);
if (uses_dcc && !skip_decompress &&
!(access & SI_IMAGE_ACCESS_DCC_OFF) &&
- ((!(access & SI_IMAGE_ACCESS_DCC_WRITE) && (access & PIPE_IMAGE_ACCESS_WRITE)) ||
+ ((!(access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) && (access & PIPE_IMAGE_ACCESS_WRITE)) ||
!vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) {
/* If DCC can't be disabled, at least decompress it.
* The decompression is relatively cheap if the surface
@@ -766,7 +792,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i
view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc);
si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level,
util_format_get_blockwidth(view->format),
- false, view->access, desc);
+ false, access, desc);
}
}
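
Editor's note: with always_allow_dcc_stores the access mask gains SI_IMAGE_ACCESS_ALLOW_DCC_STORE, and the decompress-on-bind condition now tests that flag instead of SI_IMAGE_ACCESS_DCC_WRITE. A condensed sketch of the decision; the flag values are placeholders for illustration only:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values; the real driver defines these elsewhere. */
#define ACCESS_WRITE            (1u << 0)
#define ACCESS_DCC_OFF          (1u << 1)
#define ACCESS_ALLOW_DCC_STORE  (1u << 2)

/* Decompress DCC at image-bind time when the surface is DCC-compressed and
 * either the view format is incompatible or it will be written without
 * hardware support for compressed stores. */
static bool must_decompress(bool uses_dcc, bool formats_compatible, unsigned access)
{
   if (!uses_dcc || (access & ACCESS_DCC_OFF))
      return false;
   if (!formats_compatible)
      return true;
   return (access & ACCESS_WRITE) && !(access & ACCESS_ALLOW_DCC_STORE);
}

int main(void)
{
   printf("%d\n", must_decompress(true, true, ACCESS_WRITE));                          /* 1 */
   printf("%d\n", must_decompress(true, true, ACCESS_WRITE | ACCESS_ALLOW_DCC_STORE)); /* 0 */
   return 0;
}
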
@@ -790,7 +816,7 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne
if (&images->views[slot] != view)
util_copy_image_view(&images->views[slot], view);
- if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+ if (res->b.b.target == PIPE_BUFFER) {
images->needs_color_decompress_mask &= ~(1 << slot);
images->display_dcc_store_mask &= ~(1u << slot);
res->bind_history |= PIPE_BIND_SHADER_IMAGE;
@@ -804,10 +830,15 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne
images->needs_color_decompress_mask &= ~(1 << slot);
}
- if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE)
+ if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) {
images->display_dcc_store_mask |= 1u << slot;
- else
+
+ /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). */
+ if (shader != PIPE_SHADER_COMPUTE)
+ tex->displayable_dcc_dirty = true;
+ } else {
images->display_dcc_store_mask &= ~(1u << slot);
+ }
if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
ctx->need_check_render_feedback = true;
@@ -992,7 +1023,8 @@ static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_ty
/* BUFFER RESOURCES */
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+ struct si_buffer_resources *buffers,
struct si_descriptors *descs, unsigned num_buffers,
short shader_userdata_rel_index,
enum radeon_bo_priority priority,
@@ -1004,6 +1036,22 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));
si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
+
+ /* Initialize buffer descriptors, so that we don't have to do it at bind time. */
+ for (unsigned i = 0; i < num_buffers; i++) {
+ uint32_t *desc = descs->list + i * 4;
+
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ }
+ }
}
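
Editor's note: si_init_buffer_resources now bakes the constant dword 3 (swizzle and format bits) into every buffer descriptor up front, so binding only writes the address, stride and size, and unbinding clears just the first three dwords. A standalone sketch of that split; the field layout and the IMMUTABLE_DWORD3 value are placeholders, not the real V_008F0C encodings:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define NUM_SLOTS        4
#define DWORDS_PER_DESC  4

/* Placeholder for the immutable swizzle/format dword computed once per chip. */
#define IMMUTABLE_DWORD3 0xdeadbeefu

static void init_descriptors(uint32_t *list)
{
   memset(list, 0, NUM_SLOTS * DWORDS_PER_DESC * sizeof(uint32_t));
   for (unsigned i = 0; i < NUM_SLOTS; i++)
      list[i * DWORDS_PER_DESC + 3] = IMMUTABLE_DWORD3;   /* set once at init */
}

static void bind_buffer(uint32_t *desc, uint64_t va, uint32_t size)
{
   desc[0] = (uint32_t)va;          /* address low */
   desc[1] = (uint32_t)(va >> 32);  /* address high (stride bits omitted) */
   desc[2] = size;
   /* desc[3] is immutable and never touched here. */
}

static void unbind_buffer(uint32_t *desc)
{
   memset(desc, 0, 3 * sizeof(uint32_t));   /* keep the immutable dword */
}

int main(void)
{
   uint32_t list[NUM_SLOTS * DWORDS_PER_DESC];
   init_descriptors(list);
   bind_buffer(&list[1 * DWORDS_PER_DESC], 0x100200300ull, 4096);
   unbind_buffer(&list[1 * DWORDS_PER_DESC]);
   printf("dword3 after unbind: 0x%x\n", list[1 * DWORDS_PER_DESC + 3]);
   return 0;
}
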
static void si_release_buffer_resources(struct si_buffer_resources *buffers,
@@ -1145,7 +1193,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res
}
} else {
if (take_ownership) {
- pipe_resource_reference(&buffer, NULL);
buffer = input->buffer;
} else {
pipe_resource_reference(&buffer, input->buffer);
@@ -1160,16 +1207,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
desc[2] = input->buffer_size;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
buffers->buffers[slot] = buffer;
buffers->offsets[slot] = buffer_offset;
@@ -1177,14 +1214,27 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res
buffers->priority_constbuf, true);
buffers->enabled_mask |= 1llu << slot;
} else {
- /* Clear the descriptor. */
- memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4);
+ /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */
+ memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 3);
buffers->enabled_mask &= ~(1llu << slot);
}
sctx->descriptors_dirty |= 1u << descriptors_idx;
}
+void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader)
+{
+ if (shader == PIPE_SHADER_COMPUTE)
+ return;
+
+ if (sctx->shaders[shader].key.opt.inline_uniforms) {
+ sctx->shaders[shader].key.opt.inline_uniforms = false;
+ memset(sctx->shaders[shader].key.opt.inlined_uniform_values, 0,
+ sizeof(sctx->shaders[shader].key.opt.inlined_uniform_values));
+ sctx->do_update_shaders = true;
+ }
+}
+
static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader,
uint slot, bool take_ownership,
const struct pipe_constant_buffer *input)
@@ -1204,10 +1254,8 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad
si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
}
- if (slot == 0) {
- /* Invalidate current inlinable uniforms. */
- sctx->inlinable_uniforms_valid_mask &= ~(1 << shader);
- }
+ if (slot == 0)
+ si_invalidate_inlinable_uniforms(sctx, shader);
}
slot = si_get_constbuf_slot(slot);
@@ -1222,9 +1270,24 @@ static void si_set_inlinable_constants(struct pipe_context *ctx,
{
struct si_context *sctx = (struct si_context *)ctx;
- memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4);
- sctx->inlinable_uniforms_valid_mask |= 1 << shader;
- sctx->do_update_shaders = true;
+ if (shader == PIPE_SHADER_COMPUTE)
+ return;
+
+ if (!sctx->shaders[shader].key.opt.inline_uniforms) {
+ /* It's the first time we set the constants. Always update shaders. */
+ sctx->shaders[shader].key.opt.inline_uniforms = true;
+ memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4);
+ sctx->do_update_shaders = true;
+ return;
+ }
+
+ /* We have already set inlinable constants for this shader. Update the shader only if
+ * the constants are being changed so as not to update shaders needlessly.
+ */
+ if (memcmp(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4)) {
+ memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4);
+ sctx->do_update_shaders = true;
+ }
}
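
Editor's note: inlinable uniforms now live in the shader key. The first call always marks shaders for update, later calls compare the values first so unchanged constants do not trigger recompiles, and rebinding constant buffer 0 invalidates the whole key. A generic sketch of that "update only on change" pattern (struct and names are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MAX_INLINABLE 8

struct inline_key {
   bool     enabled;
   uint32_t values[MAX_INLINABLE];
};

/* Returns true when shaders must be re-specialized. */
static bool set_inlinable(struct inline_key *key, const uint32_t *values, unsigned n)
{
   if (!key->enabled) {
      key->enabled = true;
      memcpy(key->values, values, n * sizeof(uint32_t));
      return true;                               /* first time: always update */
   }
   if (memcmp(key->values, values, n * sizeof(uint32_t)) == 0)
      return false;                              /* nothing changed */
   memcpy(key->values, values, n * sizeof(uint32_t));
   return true;
}

static bool invalidate_inlinable(struct inline_key *key)
{
   if (!key->enabled)
      return false;
   memset(key, 0, sizeof(*key));                 /* also clears enabled */
   return true;
}

int main(void)
{
   struct inline_key key = {0};
   uint32_t a[2] = {1, 2}, c[2] = {3, 4};

   printf("%d\n", set_inlinable(&key, a, 2));    /* 1: first set */
   printf("%d\n", set_inlinable(&key, a, 2));    /* 0: same values */
   printf("%d\n", set_inlinable(&key, c, 2));    /* 1: values changed */
   printf("%d\n", invalidate_inlinable(&key));   /* 1: key cleared */
   return 0;
}
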
void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
@@ -1248,7 +1311,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou
if (!sbuffer || !sbuffer->buffer) {
pipe_resource_reference(&buffers->buffers[slot], NULL);
- memset(desc, 0, sizeof(uint32_t) * 4);
+ /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */
+ memset(desc, 0, sizeof(uint32_t) * 3);
buffers->enabled_mask &= ~(1llu << slot);
buffers->writable_mask &= ~(1llu << slot);
sctx->descriptors_dirty |= 1u << descriptors_idx;
@@ -1261,16 +1325,6 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
desc[2] = sbuffer->buffer_size;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
buffers->offsets[slot] = sbuffer->buffer_offset;
@@ -1417,7 +1471,7 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource
desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
} else {
desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
@@ -1879,7 +1933,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
if (!view || !view->texture || view->texture->target == PIPE_BUFFER)
continue;
- si_set_sampler_view(sctx, shader, i, samplers->views[i], true);
+ si_set_sampler_views(sctx, shader, i, 1, 0, false, &samplers->views[i], true);
}
si_update_shader_needs_decompress_mask(sctx, shader);
@@ -1897,11 +1951,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
if (shader == PIPE_SHADER_VERTEX) {
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
+
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
sctx->num_vertex_elements >
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
sctx->vertex_buffer_user_sgprs_dirty =
- sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
}
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
@@ -1909,12 +1965,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
void si_shader_pointers_mark_dirty(struct si_context *sctx)
{
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
+
sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
sctx->num_vertex_elements >
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
sctx->vertex_buffer_user_sgprs_dirty =
- sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
@@ -1963,6 +2021,36 @@ void si_shader_change_notify(struct si_context *sctx)
sctx->shader.gs.cso ? GS_ON : GS_OFF,
sctx->ngg ? NGG_ON : NGG_OFF,
PIPE_SHADER_TESS_EVAL));
+
+ /* Update as_* flags in shader keys. Ignore disabled shader stages.
+ * as_ls = VS before TCS
+ * as_es = VS before GS or TES before GS
+ * as_ngg = NGG enabled for the last geometry stage.
+ * If GS sets as_ngg, the previous stage must set as_ngg too.
+ */
+ if (sctx->shader.tes.cso) {
+ sctx->shader.vs.key.as_ls = 1;
+ sctx->shader.vs.key.as_es = 0;
+ sctx->shader.vs.key.as_ngg = 0;
+
+ if (sctx->shader.gs.cso) {
+ sctx->shader.tes.key.as_es = 1;
+ sctx->shader.tes.key.as_ngg = sctx->ngg;
+ sctx->shader.gs.key.as_ngg = sctx->ngg;
+ } else {
+ sctx->shader.tes.key.as_es = 0;
+ sctx->shader.tes.key.as_ngg = sctx->ngg;
+ }
+ } else if (sctx->shader.gs.cso) {
+ sctx->shader.vs.key.as_ls = 0;
+ sctx->shader.vs.key.as_es = 1;
+ sctx->shader.vs.key.as_ngg = sctx->ngg;
+ sctx->shader.gs.key.as_ngg = sctx->ngg;
+ } else {
+ sctx->shader.vs.key.as_ls = 0;
+ sctx->shader.vs.key.as_es = 0;
+ sctx->shader.vs.key.as_ngg = sctx->ngg;
+ }
}
#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \
@@ -1977,9 +2065,9 @@ void si_shader_change_notify(struct si_context *sctx)
struct si_descriptors *descs = &sctx->descriptors[start]; \
unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
\
- radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \
+ radeon_set_sh_reg_seq(sh_offset, count); \
for (int i = 0; i < count; i++) \
- radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \
+ radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \
} \
} \
} while (0)
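
Editor's note: throughout this merge the radeon_* emit helpers drop their explicit cs argument; radeon_begin() establishes the command stream in the enclosing scope and the emit macros pick it up implicitly. A toy sketch of that macro pattern over a plain dword buffer (this is not Mesa's radeon_cs.h, just the shape of it):

#include <stdint.h>
#include <stdio.h>

struct cmdbuf {
   uint32_t buf[64];
   unsigned cdw;
};

/* BEGIN introduces local cursors; EMIT/END use them without naming the buffer. */
#define cmd_begin(csptr) do { \
   struct cmdbuf *_cs = (csptr); \
   unsigned _cdw = _cs->cdw

#define cmd_emit(value) (_cs->buf[_cdw++] = (value))

#define cmd_end() \
   _cs->cdw = _cdw; \
   } while (0)

int main(void)
{
   struct cmdbuf cs = { .cdw = 0 };

   cmd_begin(&cs);
   cmd_emit(0x12345678u);
   cmd_emit(0x9abcdef0u);
   cmd_end();

   printf("emitted %u dwords, first = 0x%x\n", cs.cdw, cs.buf[0]);
   return 0;
}
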
@@ -2070,12 +2158,12 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) {
struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE);
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 +
shader->cs_shaderbufs_sgpr_index * 4,
num_shaderbufs * 4);
for (unsigned i = 0; i < num_shaderbufs; i++)
- radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4);
+ radeon_emit_array(&desc->list[si_get_shaderbuf_slot(i) * 4], 4);
sctx->compute_shaderbuf_sgprs_dirty = false;
}
@@ -2085,7 +2173,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
if (num_images && sctx->compute_image_sgprs_dirty) {
struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE);
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 +
shader->cs_images_sgpr_index * 4,
shader->cs_images_num_sgprs);
@@ -2099,7 +2187,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
num_sgprs = 4;
}
- radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs);
+ radeon_emit_array(&desc->list[desc_offset], num_sgprs);
}
sctx->compute_image_sgprs_dirty = false;
@@ -2123,8 +2211,7 @@ static void si_init_bindless_descriptors(struct si_context *sctx, struct si_desc
sctx->num_bindless_descriptors = 1;
/* Track which bindless slots are used (or not). */
- util_idalloc_init(&sctx->bindless_used_slots);
- util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
+ util_idalloc_init(&sctx->bindless_used_slots, num_elements);
/* Reserve slot 0 because it's an invalid handle for bindless. */
desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
@@ -2526,7 +2613,7 @@ void si_init_all_descriptors(struct si_context *sctx)
rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS;
}
desc = si_const_and_shader_buffer_descriptors(sctx, i);
- si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots,
+ si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i], desc, num_buffer_slots,
rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER,
RADEON_PRIO_CONST_BUFFER);
desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
@@ -2556,7 +2643,7 @@ void si_init_all_descriptors(struct si_context *sctx)
memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
}
- si_init_buffer_resources(&sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL],
+ si_init_buffer_resources(sctx, &sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL],
SI_NUM_INTERNAL_BINDINGS, SI_SGPR_INTERNAL_BINDINGS,
/* The second priority is used by
* const buffers in RW buffer slots. */
@@ -2577,7 +2664,7 @@ void si_init_all_descriptors(struct si_context *sctx)
sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
sctx->b.set_inlinable_constants = si_set_inlinable_constants;
sctx->b.set_shader_buffers = si_set_shader_buffers;
- sctx->b.set_sampler_views = si_set_sampler_views;
+ sctx->b.set_sampler_views = si_pipe_set_sampler_views;
sctx->b.create_texture_handle = si_create_texture_handle;
sctx->b.delete_texture_handle = si_delete_texture_handle;
sctx->b.make_texture_handle_resident = si_make_texture_handle_resident;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c
index fc965cd7a..0bee2f7d0 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -26,141 +26,17 @@
#include "si_query.h"
#include "util/u_memory.h"
-enum si_pc_block_flags
-{
- /* This block is part of the shader engine */
- SI_PC_BLOCK_SE = (1 << 0),
-
- /* Expose per-instance groups instead of summing all instances (within
- * an SE). */
- SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
-
- /* Expose per-SE groups instead of summing instances across SEs. */
- SI_PC_BLOCK_SE_GROUPS = (1 << 2),
-
- /* Shader block */
- SI_PC_BLOCK_SHADER = (1 << 3),
-
- /* Non-shader block with perfcounters windowed by shaders. */
- SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
-};
-
-enum si_pc_reg_layout
-{
- /* All secondary selector dwords follow as one block after the primary
- * selector dwords for the counters that have secondary selectors.
- *
- * Example:
- * PERFCOUNTER0_SELECT
- * PERFCOUNTER1_SELECT
- * PERFCOUNTER0_SELECT1
- * PERFCOUNTER1_SELECT1
- * PERFCOUNTER2_SELECT
- * PERFCOUNTER3_SELECT
- */
- SI_PC_MULTI_BLOCK = 0,
-
- /* Each secondary selector dword follows immediately after the
- * corresponding primary.
- *
- * Example:
- * PERFCOUNTER0_SELECT
- * PERFCOUNTER0_SELECT1
- * PERFCOUNTER1_SELECT
- * PERFCOUNTER1_SELECT1
- * PERFCOUNTER2_SELECT
- * PERFCOUNTER3_SELECT
- */
- SI_PC_MULTI_ALTERNATE = 1,
-
- /* All secondary selector dwords follow as one block after all primary
- * selector dwords.
- *
- * Example:
- * PERFCOUNTER0_SELECT
- * PERFCOUNTER1_SELECT
- * PERFCOUNTER2_SELECT
- * PERFCOUNTER3_SELECT
- * PERFCOUNTER0_SELECT1
- * PERFCOUNTER1_SELECT1
- */
- SI_PC_MULTI_TAIL = 2,
-
- /* Free-form arrangement of selector registers. */
- SI_PC_MULTI_CUSTOM = 3,
-
- SI_PC_MULTI_MASK = 3,
-
- /* Registers are laid out in decreasing rather than increasing order. */
- SI_PC_REG_REVERSE = 4,
-
- SI_PC_FAKE = 8,
-};
-
-struct si_pc_block_base {
- const char *name;
- unsigned num_counters;
- unsigned flags;
-
- unsigned select_or;
- unsigned select0;
- unsigned counter0_lo;
- unsigned *select;
- unsigned *counters;
- unsigned num_multi;
- unsigned num_prelude;
- unsigned layout;
-};
-
-struct si_pc_block_gfxdescr {
- struct si_pc_block_base *b;
- unsigned selectors;
- unsigned instances;
-};
-
-struct si_pc_block {
- const struct si_pc_block_gfxdescr *b;
- unsigned num_instances;
-
- unsigned num_groups;
- char *group_names;
- unsigned group_name_stride;
-
- char *selector_names;
- unsigned selector_name_stride;
-};
-
-/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
- * performance counter group IDs.
- */
-static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS",
- "_PS", "_LS", "_HS", "_CS"};
-
-static const unsigned si_pc_shader_type_bits[] = {
- 0x7f,
- S_036780_ES_EN(1),
- S_036780_GS_EN(1),
- S_036780_VS_EN(1),
- S_036780_PS_EN(1),
- S_036780_LS_EN(1),
- S_036780_HS_EN(1),
- S_036780_CS_EN(1),
-};
-
-/* Max counters per HW block */
-#define SI_QUERY_MAX_COUNTERS 16
-
-#define SI_PC_SHADERS_WINDOWING (1u << 31)
+#include "ac_perfcounter.h"
struct si_query_group {
struct si_query_group *next;
- struct si_pc_block *block;
+ struct ac_pc_block *block;
unsigned sub_gid; /* only used during init */
unsigned result_base; /* only used during init */
int se;
int instance;
unsigned num_counters;
- unsigned selectors[SI_QUERY_MAX_COUNTERS];
+ unsigned selectors[AC_QUERY_MAX_COUNTERS];
};
struct si_query_counter {
@@ -182,525 +58,6 @@ struct si_query_pc {
struct si_query_group *groups;
};
-static struct si_pc_block_base cik_CB = {
- .name = "CB",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_037000_CB_PERFCOUNTER_FILTER,
- .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
- .num_multi = 1,
- .num_prelude = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static unsigned cik_CPC_select[] = {
- R_036024_CPC_PERFCOUNTER0_SELECT,
- R_036010_CPC_PERFCOUNTER0_SELECT1,
- R_03600C_CPC_PERFCOUNTER1_SELECT,
-};
-static struct si_pc_block_base cik_CPC = {
- .name = "CPC",
- .num_counters = 2,
-
- .select = cik_CPC_select,
- .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
-};
-
-static struct si_pc_block_base cik_CPF = {
- .name = "CPF",
- .num_counters = 2,
-
- .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
-};
-
-static struct si_pc_block_base cik_CPG = {
- .name = "CPG",
- .num_counters = 2,
-
- .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
-};
-
-static struct si_pc_block_base cik_DB = {
- .name = "DB",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
- .num_multi = 3, // really only 2, but there's a gap between registers
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_GDS = {
- .name = "GDS",
- .num_counters = 4,
-
- .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
-};
-
-static unsigned cik_GRBM_counters[] = {
- R_034100_GRBM_PERFCOUNTER0_LO,
- R_03410C_GRBM_PERFCOUNTER1_LO,
-};
-static struct si_pc_block_base cik_GRBM = {
- .name = "GRBM",
- .num_counters = 2,
-
- .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
- .counters = cik_GRBM_counters,
-};
-
-static struct si_pc_block_base cik_GRBMSE = {
- .name = "GRBMSE",
- .num_counters = 4,
-
- .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
- .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
-};
-
-static struct si_pc_block_base cik_IA = {
- .name = "IA",
- .num_counters = 4,
-
- .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
-};
-
-static struct si_pc_block_base cik_PA_SC = {
- .name = "PA_SC",
- .num_counters = 8,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-/* According to docs, PA_SU counters are only 48 bits wide. */
-static struct si_pc_block_base cik_PA_SU = {
- .name = "PA_SU",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_SPI = {
- .name = "SPI",
- .num_counters = 6,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
- .num_multi = 4,
- .layout = SI_PC_MULTI_BLOCK,
-};
-
-static struct si_pc_block_base cik_SQ = {
- .name = "SQ",
- .num_counters = 16,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,
-
- .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
- .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
- .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
-};
-
-static struct si_pc_block_base cik_SX = {
- .name = "SX",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_TAIL,
-};
-
-static struct si_pc_block_base cik_TA = {
- .name = "TA",
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_TD = {
- .name = "TD",
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_TCA = {
- .name = "TCA",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_TCC = {
- .name = "TCC",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_TCP = {
- .name = "TCP",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base cik_VGT = {
- .name = "VGT",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
-};
-
-static struct si_pc_block_base cik_WD = {
- .name = "WD",
- .num_counters = 4,
-
- .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
-};
-
-static struct si_pc_block_base cik_MC = {
- .name = "MC",
- .num_counters = 4,
-
- .layout = SI_PC_FAKE,
-};
-
-static struct si_pc_block_base cik_SRBM = {
- .name = "SRBM",
- .num_counters = 2,
-
- .layout = SI_PC_FAKE,
-};
-
-static struct si_pc_block_base gfx10_CHA = {
- .name = "CHA",
- .num_counters = 4,
-
- .select0 = R_037780_CHA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_CHCG = {
- .name = "CHCG",
- .num_counters = 4,
-
- .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_CHC = {
- .name = "CHC",
- .num_counters = 4,
-
- .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GCR = {
- .name = "GCR",
- .num_counters = 2,
-
- .select0 = R_037580_GCR_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GE = {
- .name = "GE",
- .num_counters = 12,
-
- .select0 = R_036200_GE_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034200_GE_PERFCOUNTER0_LO,
- .num_multi = 4,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GL1A = {
- .name = "GL1A",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GL1C = {
- .name = "GL1C",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GL2A = {
- .name = "GL2A",
- .num_counters = 4,
-
- .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_GL2C = {
- .name = "GL2C",
- .num_counters = 4,
-
- .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static unsigned gfx10_PA_PH_select[] = {
- R_037600_PA_PH_PERFCOUNTER0_SELECT,
- R_037604_PA_PH_PERFCOUNTER0_SELECT1,
- R_037608_PA_PH_PERFCOUNTER1_SELECT,
- R_037640_PA_PH_PERFCOUNTER1_SELECT1,
- R_03760C_PA_PH_PERFCOUNTER2_SELECT,
- R_037644_PA_PH_PERFCOUNTER2_SELECT1,
- R_037610_PA_PH_PERFCOUNTER3_SELECT,
- R_037648_PA_PH_PERFCOUNTER3_SELECT1,
- R_037614_PA_PH_PERFCOUNTER4_SELECT,
- R_037618_PA_PH_PERFCOUNTER5_SELECT,
- R_03761C_PA_PH_PERFCOUNTER6_SELECT,
- R_037620_PA_PH_PERFCOUNTER7_SELECT,
-};
-static struct si_pc_block_base gfx10_PA_PH = {
- .name = "PA_PH",
- .num_counters = 8,
- .flags = SI_PC_BLOCK_SE,
-
- .select = gfx10_PA_PH_select,
- .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO,
- .num_multi = 4,
- .layout = SI_PC_MULTI_CUSTOM,
-};
-
-static struct si_pc_block_base gfx10_PA_SU = {
- .name = "PA_SU",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
- .num_multi = 4,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_RLC = {
- .name = "RLC",
- .num_counters = 2,
-
- .select0 = R_037304_RLC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO,
- .num_multi = 0,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_RMI = {
- .name = "RMI",
- /* Actually 4, but the 2nd counter is missing the secondary selector while
- * the 3rd counter has it, which complicates the register layout. */
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_037400_RMI_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-static struct si_pc_block_base gfx10_UTCL1 = {
- .name = "UTCL1",
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO,
- .num_multi = 0,
- .layout = SI_PC_MULTI_ALTERNATE,
-};
-
-/* Both the number of instances and selectors varies between chips of the same
- * class. We only differentiate by class here and simply expose the maximum
- * number over all chips in a class.
- *
- * Unfortunately, GPUPerfStudio uses the order of performance counter groups
- * blindly once it believes it has identified the hardware, so the order of
- * blocks here matters.
- */
-static struct si_pc_block_gfxdescr groups_CIK[] = {
- {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
- {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32},
- {&cik_TA, 111}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55}, {&cik_TCP, 154},
- {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19},
- {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22},
-
-};
-
-static struct si_pc_block_gfxdescr groups_VI[] = {
- {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
- {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34},
- {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55}, {&cik_TCP, 180},
- {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27},
- {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24},
-
-};
-
-static struct si_pc_block_gfxdescr groups_gfx9[] = {
- {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16},
- {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208},
- {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57}, {&cik_TCP, 85},
- {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59},
- {&cik_CPC, 35},
-};
-
-static struct si_pc_block_gfxdescr groups_gfx10[] = {
- {&cik_CB, 461},
- {&gfx10_CHA, 45},
- {&gfx10_CHCG, 35},
- {&gfx10_CHC, 35},
- {&cik_CPC, 47},
- {&cik_CPF, 40},
- {&cik_CPG, 82},
- {&cik_DB, 370},
- {&gfx10_GCR, 94},
- {&cik_GDS, 123},
- {&gfx10_GE, 315},
- {&gfx10_GL1A, 36},
- {&gfx10_GL1C, 64},
- {&gfx10_GL2A, 91},
- {&gfx10_GL2C, 235},
- {&cik_GRBM, 47},
- {&cik_GRBMSE, 19},
- {&gfx10_PA_PH, 960},
- {&cik_PA_SC, 552},
- {&gfx10_PA_SU, 266},
- {&gfx10_RLC, 7},
- {&gfx10_RMI, 258},
- {&cik_SPI, 329},
- {&cik_SQ, 509},
- {&cik_SX, 225},
- {&cik_TA, 226},
- {&cik_TCP, 77},
- {&cik_TD, 61},
- {&gfx10_UTCL1, 15},
-};
-
-static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
- const struct si_pc_block *block)
-{
- return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
- (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
-}
-
-static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
- const struct si_pc_block *block)
-{
- return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
- (block->num_instances > 1 && pc->separate_instance);
-}
-
-static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
- unsigned *base_gid, unsigned *sub_index)
-{
- struct si_pc_block *block = pc->blocks;
- unsigned bid;
-
- *base_gid = 0;
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- unsigned total = block->num_groups * block->b->selectors;
-
- if (index < total) {
- *sub_index = index;
- return block;
- }
-
- index -= total;
- *base_gid += block->num_groups;
- }
-
- return NULL;
-}
-
-static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
-{
- unsigned bid;
- struct si_pc_block *block = pc->blocks;
-
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- if (*index < block->num_groups)
- return block;
- *index -= block->num_groups;
- }
-
- return NULL;
-}
-
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@@ -724,7 +81,7 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
}
radeon_begin(cs);
- radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
+ radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
radeon_end();
}
@@ -733,105 +90,37 @@ static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
- radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
- radeon_emit(cs, shaders & 0x7f);
- radeon_emit(cs, 0xffffffff);
+ radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
+ radeon_emit(shaders & 0x7f);
+ radeon_emit(0xffffffff);
radeon_end();
}
-static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
unsigned *selectors)
{
- struct si_pc_block_base *regs = block->b->b;
+ struct ac_pc_block_base *regs = block->b->b;
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
unsigned idx;
- unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
- unsigned dw;
assert(count <= regs->num_counters);
- if (regs->layout & SI_PC_FAKE)
+ /* Fake counters. */
+ if (!regs->select0)
return;
radeon_begin(cs);
- if (layout_multi == SI_PC_MULTI_BLOCK) {
- assert(!(regs->layout & SI_PC_REG_REVERSE));
-
- dw = count + regs->num_prelude;
- if (count >= regs->num_multi)
- dw += regs->num_multi;
- radeon_set_uconfig_reg_seq(cs, regs->select0, dw, false);
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
-
- if (count < regs->num_multi) {
- unsigned select1 = regs->select0 + 4 * regs->num_multi;
- radeon_set_uconfig_reg_seq(cs, select1, count, false);
- }
-
- for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
- radeon_emit(cs, 0);
+ for (idx = 0; idx < count; ++idx) {
+ radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false);
+ radeon_emit(selectors[idx] | regs->select_or);
+ }
- if (count > regs->num_multi) {
- for (idx = regs->num_multi; idx < count; ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
- }
- } else if (layout_multi == SI_PC_MULTI_TAIL) {
- unsigned select1, select1_count;
-
- assert(!(regs->layout & SI_PC_REG_REVERSE));
-
- radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude, false);
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < count; ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
-
- select1 = regs->select0 + 4 * regs->num_counters;
- select1_count = MIN2(count, regs->num_multi);
- radeon_set_uconfig_reg_seq(cs, select1, select1_count, false);
- for (idx = 0; idx < select1_count; ++idx)
- radeon_emit(cs, 0);
- } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
- unsigned *reg = regs->select;
- for (idx = 0; idx < count; ++idx) {
- radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
- if (idx < regs->num_multi)
- radeon_set_uconfig_reg(cs, *reg++, 0);
- }
- } else {
- assert(layout_multi == SI_PC_MULTI_ALTERNATE);
-
- unsigned reg_base = regs->select0;
- unsigned reg_count = count + MIN2(count, regs->num_multi);
- reg_count += regs->num_prelude;
-
- if (!(regs->layout & SI_PC_REG_REVERSE)) {
- radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, false);
-
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < count; ++idx) {
- radeon_emit(cs, selectors[idx] | regs->select_or);
- if (idx < regs->num_multi)
- radeon_emit(cs, 0);
- }
- } else {
- reg_base -= (reg_count - 1) * 4;
- radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, false);
-
- for (idx = count; idx > 0; --idx) {
- if (idx <= regs->num_multi)
- radeon_emit(cs, 0);
- radeon_emit(cs, selectors[idx - 1] | regs->select_or);
- }
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- }
+ for (idx = 0; idx < regs->num_spm_counters; idx++) {
+ radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false);
+ radeon_emit(0);
}
+
radeon_end();
}
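
Editor's note: with the shared ac_perfcounter tables, each block exposes flat select0[]/select1[] register arrays, so the emit path above degenerates to one register write per selector plus zeroing the SPM companion registers, replacing the old layout-specific packing (SI_PC_MULTI_*). A simplified sketch of the loop shape; register offsets and counts are stand-ins, not real hardware addresses:

#include <stdint.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Stand-in register offsets; the real addresses come from the perfcounter tables. */
static const uint32_t select0[] = { 0x9000, 0x9004, 0x9008, 0x900c };
static const uint32_t select1[] = { 0x9100, 0x9104 };   /* SPM companion selectors */

static void set_reg(uint32_t reg, uint32_t value)
{
   printf("SET 0x%04x = 0x%08x\n", (unsigned)reg, (unsigned)value);
}

/* One register write per primary selector, then zero every secondary selector. */
static void emit_select(const unsigned *selectors, unsigned count, uint32_t select_or)
{
   for (unsigned i = 0; i < count && i < ARRAY_SIZE(select0); i++)
      set_reg(select0[i], selectors[i] | select_or);
   for (unsigned i = 0; i < ARRAY_SIZE(select1); i++)
      set_reg(select1[i], 0);
}

int main(void)
{
   unsigned sel[2] = { 5, 9 };
   emit_select(sel, 2, 0);
   return 0;
}
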
@@ -843,11 +132,11 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer
COPY_DATA_IMM, NULL, 1);
radeon_begin(cs);
- radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+ radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
- radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
+ radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
radeon_end();
}
@@ -863,20 +152,20 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer,
si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
radeon_set_uconfig_reg(
- cs, R_036020_CP_PERFMON_CNTL,
+ R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
radeon_end();
}
-static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
uint64_t va)
{
- struct si_pc_block_base *regs = block->b->b;
+ struct ac_pc_block_base *regs = block->b->b;
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
unsigned idx;
unsigned reg = regs->counter0_lo;
@@ -884,33 +173,31 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block,
radeon_begin(cs);
- if (!(regs->layout & SI_PC_FAKE)) {
- if (regs->layout & SI_PC_REG_REVERSE)
- reg_delta = -reg_delta;
-
+ if (regs->select0) {
for (idx = 0; idx < count; ++idx) {
if (regs->counters)
reg = regs->counters[idx];
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL); /* 64 bits */
- radeon_emit(cs, reg >> 2);
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ radeon_emit(reg >> 2);
+ radeon_emit(0); /* unused */
+ radeon_emit(va);
+ radeon_emit(va >> 32);
va += sizeof(uint64_t);
reg += reg_delta;
}
} else {
+ /* Fake counters. */
for (idx = 0; idx < count; ++idx) {
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
- COPY_DATA_COUNT_SEL);
- radeon_emit(cs, 0); /* immediate */
- radeon_emit(cs, 0);
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ COPY_DATA_COUNT_SEL);
+ radeon_emit(0); /* immediate */
+ radeon_emit(0);
+ radeon_emit(va);
+ radeon_emit(va >> 32);
va += sizeof(uint64_t);
}
}
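
Editor's note: the read path above copies each 64-bit counter either from the perf register block or, for "fake" blocks without select registers, as an immediate zero, advancing the destination VA by 8 bytes per counter so the result layout stays uniform. A schematic of the addressing, treating the packet emission as an opaque helper (offsets are stand-ins):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the COPY_DATA packet: copy one 64-bit value to GPU memory. */
static void copy_counter_to_mem(bool from_perf_reg, uint32_t reg, uint64_t dst_va)
{
   if (from_perf_reg)
      printf("COPY_DATA perf[0x%04x] -> 0x%llx\n", (unsigned)reg, (unsigned long long)dst_va);
   else
      printf("COPY_DATA imm 0       -> 0x%llx\n", (unsigned long long)dst_va);
}

/* Real blocks read counter0_lo plus a per-counter stride; fake blocks just
 * write zeroes. Either way the destination advances by one uint64_t. */
static void emit_read(bool has_select0, uint32_t counter0_lo, unsigned count, uint64_t va)
{
   for (unsigned i = 0; i < count; i++) {
      copy_counter_to_mem(has_select0, counter0_lo + i * 8, va);
      va += sizeof(uint64_t);
   }
}

int main(void)
{
   emit_read(true, 0x4a00, 2, 0x1000);   /* real block */
   emit_read(false, 0, 2, 0x2000);       /* fake block */
   return 0;
}
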
@@ -938,10 +225,10 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b
radeon_begin(&sctx->gfx_cs);
if (sctx->chip_class >= GFX10) {
- radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL,
+ radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
S_037390_PERFMON_CLOCK_STATE(inhibit));
} else if (sctx->chip_class >= GFX8) {
- radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
+ radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
S_0372FC_PERFMON_CLOCK_STATE(inhibit));
}
radeon_end();
@@ -966,7 +253,7 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);
for (struct si_query_group *group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
+ struct ac_pc_block *block = group->block;
if (group->se != current_se || group->instance != current_instance) {
current_se = group->se;
@@ -997,11 +284,11 @@ static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery
si_pc_emit_stop(sctx, query->buffer.buf, va);
for (struct si_query_group *group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
+ struct ac_pc_block *block = group->block;
unsigned se = group->se >= 0 ? group->se : 0;
unsigned se_end = se + 1;
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
+ if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
se_end = sctx->screen->info.max_se;
do {
@@ -1102,8 +389,9 @@ static const struct si_query_ops batch_query_ops = {
};
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
- struct si_pc_block *block, unsigned sub_gid)
+ struct ac_pc_block *block, unsigned sub_gid)
{
+ struct si_perfcounters *pc = screen->perfcounters;
struct si_query_group *group = query->groups;
while (group) {
@@ -1119,20 +407,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s
group->block = block;
group->sub_gid = sub_gid;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+ if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
unsigned sub_gids = block->num_instances;
unsigned shader_id;
unsigned shaders;
unsigned query_shaders;
- if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
+ if (ac_pc_block_has_per_se_groups(&pc->base, block))
sub_gids = sub_gids * screen->info.max_se;
shader_id = sub_gid / sub_gids;
sub_gid = sub_gid % sub_gids;
- shaders = si_pc_shader_type_bits[shader_id];
+ shaders = ac_pc_shader_type_bits[shader_id];
- query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
+ query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
if (query_shaders && query_shaders != shaders) {
fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
FREE(group);
@@ -1141,20 +429,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s
query->shaders = shaders;
}
- if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
+ if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
// A non-zero value in query->shaders ensures that the shader
// masking is reset unless the user explicitly requests one.
- query->shaders = SI_PC_SHADERS_WINDOWING;
+ query->shaders = AC_PC_SHADERS_WINDOWING;
}
- if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
+ if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
group->se = sub_gid / block->num_instances;
sub_gid = sub_gid % block->num_instances;
} else {
group->se = -1;
}
- if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
+ if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
group->instance = sub_gid;
} else {
group->instance = -1;
@@ -1171,7 +459,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
{
struct si_screen *screen = (struct si_screen *)ctx->screen;
struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
+ struct ac_pc_block *block;
struct si_query_group *group;
struct si_query_pc *query;
unsigned base_gid, sub_gid, sub_index;
@@ -1196,7 +484,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
goto error;
block =
- lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+ ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
if (!block)
goto error;
@@ -1221,11 +509,11 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
i = 0;
for (group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
+ struct ac_pc_block *block = group->block;
unsigned read_dw;
unsigned instances = 1;
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
instances = screen->info.max_se;
if (group->instance < 0)
instances *= block->num_instances;
@@ -1240,7 +528,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
}
if (query->shaders) {
- if (query->shaders == SI_PC_SHADERS_WINDOWING)
+ if (query->shaders == AC_PC_SHADERS_WINDOWING)
query->shaders = 0xffffffff;
}
@@ -1248,10 +536,10 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
query->counters = CALLOC(num_queries, sizeof(*query->counters));
for (i = 0; i < num_queries; ++i) {
struct si_query_counter *counter = &query->counters[i];
- struct si_pc_block *block;
+ struct ac_pc_block *block;
block =
- lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+ ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
sub_gid = sub_index / block->b->selectors;
sub_index = sub_index % block->b->selectors;
@@ -1268,7 +556,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_
counter->stride = group->num_counters;
counter->qwords = 1;
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
counter->qwords = screen->info.max_se;
if (group->instance < 0)
counter->qwords *= block->num_instances;
@@ -1281,96 +569,11 @@ error:
return NULL;
}
-static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
-{
- bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
- bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
- unsigned i, j, k;
- unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
- unsigned namelen;
- char *groupname;
- char *p;
-
- if (per_instance_groups)
- groups_instance = block->num_instances;
- if (per_se_groups)
- groups_se = screen->info.max_se;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
-
- namelen = strlen(block->b->b->name);
- block->group_name_stride = namelen + 1;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- block->group_name_stride += 3;
- if (per_se_groups) {
- assert(groups_se <= 10);
- block->group_name_stride += 1;
-
- if (per_instance_groups)
- block->group_name_stride += 1;
- }
- if (per_instance_groups) {
- assert(groups_instance <= 100);
- block->group_name_stride += 2;
- }
-
- block->group_names = MALLOC(block->num_groups * block->group_name_stride);
- if (!block->group_names)
- return false;
-
- groupname = block->group_names;
- for (i = 0; i < groups_shader; ++i) {
- const char *shader_suffix = si_pc_shader_type_suffixes[i];
- unsigned shaderlen = strlen(shader_suffix);
- for (j = 0; j < groups_se; ++j) {
- for (k = 0; k < groups_instance; ++k) {
- strcpy(groupname, block->b->b->name);
- p = groupname + namelen;
-
- if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
- strcpy(p, shader_suffix);
- p += shaderlen;
- }
-
- if (per_se_groups) {
- p += sprintf(p, "%d", j);
- if (per_instance_groups)
- *p++ = '_';
- }
-
- if (per_instance_groups)
- p += sprintf(p, "%d", k);
-
- groupname += block->group_name_stride;
- }
- }
- }
-
- assert(block->b->selectors <= 1000);
- block->selector_name_stride = block->group_name_stride + 4;
- block->selector_names =
- MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
- if (!block->selector_names)
- return false;
-
- groupname = block->group_names;
- p = block->selector_names;
- for (i = 0; i < block->num_groups; ++i) {
- for (j = 0; j < block->b->selectors; ++j) {
- sprintf(p, "%s_%03d", groupname, j);
- p += block->selector_name_stride;
- }
- groupname += block->group_name_stride;
- }
-
- return true;
-}
-
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
struct pipe_driver_query_info *info)
{
struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
+ struct ac_pc_block *block;
unsigned base_gid, sub;
if (!pc)
@@ -1379,19 +582,19 @@ int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
if (!info) {
unsigned bid, num_queries = 0;
- for (bid = 0; bid < pc->num_blocks; ++bid) {
- num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
+ for (bid = 0; bid < pc->base.num_blocks; ++bid) {
+ num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
}
return num_queries;
}
- block = lookup_counter(pc, index, &base_gid, &sub);
+ block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
if (!block)
return 0;
if (!block->selector_names) {
- if (!si_init_block_names(screen, block))
+ if (!ac_init_block_names(&screen->info, &pc->base, block))
return 0;
}
info->name = block->selector_names + sub * block->selector_name_stride;
@@ -1410,20 +613,20 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
struct pipe_driver_query_group_info *info)
{
struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
+ struct ac_pc_block *block;
if (!pc)
return 0;
if (!info)
- return pc->num_groups;
+ return pc->base.num_groups;
- block = lookup_group(pc, &index);
+ block = ac_lookup_group(&pc->base, &index);
if (!block)
return 0;
if (!block->group_names) {
- if (!si_init_block_names(screen, block))
+ if (!ac_init_block_names(&screen->info, &pc->base, block))
return 0;
}
info->name = block->group_names + index * block->group_name_stride;
@@ -1435,100 +638,31 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
void si_destroy_perfcounters(struct si_screen *screen)
{
struct si_perfcounters *pc = screen->perfcounters;
- unsigned i;
if (!pc)
return;
- for (i = 0; i < pc->num_blocks; ++i) {
- FREE(pc->blocks[i].group_names);
- FREE(pc->blocks[i].selector_names);
- }
- FREE(pc->blocks);
+ ac_destroy_perfcounters(&pc->base);
FREE(pc);
screen->perfcounters = NULL;
}
void si_init_perfcounters(struct si_screen *screen)
{
- struct si_perfcounters *pc;
- const struct si_pc_block_gfxdescr *blocks;
- unsigned num_blocks;
- unsigned i;
-
- switch (screen->info.chip_class) {
- case GFX7:
- blocks = groups_CIK;
- num_blocks = ARRAY_SIZE(groups_CIK);
- break;
- case GFX8:
- blocks = groups_VI;
- num_blocks = ARRAY_SIZE(groups_VI);
- break;
- case GFX9:
- blocks = groups_gfx9;
- num_blocks = ARRAY_SIZE(groups_gfx9);
- break;
- case GFX10:
- case GFX10_3:
- blocks = groups_gfx10;
- num_blocks = ARRAY_SIZE(groups_gfx10);
- break;
- case GFX6:
- default:
- return; /* not implemented */
- }
-
- screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
- if (!pc)
- return;
+ bool separate_se, separate_instance;
- pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
- pc->num_instance_cs_dwords = 3;
-
- pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
- pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
-
- pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
- if (!pc->blocks)
- goto error;
- pc->num_blocks = num_blocks;
-
- for (i = 0; i < num_blocks; ++i) {
- struct si_pc_block *block = &pc->blocks[i];
- block->b = &blocks[i];
- block->num_instances = MAX2(1, block->b->instances);
-
- if (!strcmp(block->b->b->name, "CB") ||
- !strcmp(block->b->b->name, "DB") ||
- !strcmp(block->b->b->name, "RMI"))
- block->num_instances = screen->info.max_se;
- else if (!strcmp(block->b->b->name, "TCC"))
- block->num_instances = screen->info.max_tcc_blocks;
- else if (!strcmp(block->b->b->name, "IA"))
- block->num_instances = MAX2(1, screen->info.max_se / 2);
- else if (!strcmp(block->b->b->name, "TA") ||
- !strcmp(block->b->b->name, "TCP") ||
- !strcmp(block->b->b->name, "TD")) {
- block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa);
- }
+ separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+ separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
- if (si_pc_block_has_per_instance_groups(pc, block)) {
- block->num_groups = block->num_instances;
- } else {
- block->num_groups = 1;
- }
+ screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
+ if (!screen->perfcounters)
+ return;
- if (si_pc_block_has_per_se_groups(pc, block))
- block->num_groups *= screen->info.max_se;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
+ screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
+ screen->perfcounters->num_instance_cs_dwords = 3;
- pc->num_groups += block->num_groups;
+ if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
+ &screen->perfcounters->base)) {
+ si_destroy_perfcounters(screen);
}
-
- return;
-
-error:
- si_destroy_perfcounters(screen);
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c
index 6196f2158..b812f170c 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c
@@ -35,6 +35,7 @@
#include "sid.h"
#include "ac_shadowed_regs.h"
#include "util/disk_cache.h"
+#include "util/u_cpu_detect.h"
#include "util/u_log.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
@@ -80,29 +81,25 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"compute", DBG(COMPUTE), "Print compute info"},
{"vm", DBG(VM), "Print virtual addresses when creating resources"},
{"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."},
+ {"ib", DBG(IB), "Print command buffers."},
/* Driver options: */
{"nowc", DBG(NO_WC), "Disable GTT write combining"},
{"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
{"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
{"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."},
+ {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"},
/* 3D engine options: */
{"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
{"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."},
- {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."},
{"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
{"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
{"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
- {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
- {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
- {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
{"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
{"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
{"nodpbb", DBG(NO_DPBB), "Disable DPBB."},
- {"nodfsm", DBG(NO_DFSM), "Disable DFSM."},
{"dpbb", DBG(DPBB), "Enable DPBB."},
- {"dfsm", DBG(DFSM), "Enable DFSM."},
{"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"},
{"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"},
{"notiling", DBG(NO_TILING), "Disable tiling"},
@@ -110,9 +107,11 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"nodisplaydcc", DBG(NO_DISPLAY_DCC), "Disable display DCC"},
{"nodcc", DBG(NO_DCC), "Disable DCC."},
{"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."},
- {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"},
+ {"nodccstore", DBG(NO_DCC_STORE), "Disable DCC stores"},
+ {"dccstore", DBG(DCC_STORE), "Enable DCC stores"},
{"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"},
{"nofmask", DBG(NO_FMASK), "Disable MSAA compression"},
+ {"nodma", DBG(NO_DMA), "Disable SDMA-copy for DRI_PRIME"},
{"tmz", DBG(TMZ), "Force allocation of scanout/depth/stencil buffer as encrypted"},
{"sqtt", DBG(SQTT), "Enable SQTT"},
@@ -142,7 +141,6 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil
enum ac_target_machine_options tm_options =
(sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
- (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
(sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
(create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
@@ -150,12 +148,24 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil
ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
compiler->passes = ac_create_llvm_passes(compiler->tm);
- if (compiler->tm_wave32)
- compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
if (compiler->low_opt_tm)
compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
}
+void si_init_aux_async_compute_ctx(struct si_screen *sscreen)
+{
+ assert(!sscreen->async_compute_context);
+ sscreen->async_compute_context = si_create_context(
+ &sscreen->b,
+ SI_CONTEXT_FLAG_AUX |
+ (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+ PIPE_CONTEXT_COMPUTE_ONLY);
+
+ /* Limit the number of waves allocated for this context. */
+ if (sscreen->async_compute_context)
+ ((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2;
+}
+
static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
{
ac_destroy_llvm_compiler(compiler);
@@ -255,8 +265,10 @@ static void si_destroy_context(struct pipe_context *context)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
if (sctx->cs_dcc_decompress)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress);
- if (sctx->cs_dcc_retile)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
+ for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) {
+ if (sctx->cs_dcc_retile[i])
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]);
+ }
if (sctx->no_velems_state)
sctx->b.delete_vertex_elements_state(&sctx->b, sctx->no_velems_state);
@@ -284,17 +296,6 @@ static void si_destroy_context(struct pipe_context *context)
if (sctx->blitter)
util_blitter_destroy(sctx->blitter);
- /* Release DCC stats. */
- for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
- assert(!sctx->dcc_stats[i].query_active);
-
- for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
- if (sctx->dcc_stats[i].ps_stats[j])
- sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]);
-
- si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
- }
-
if (sctx->query_result_shader)
sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
if (sctx->sh_query_result_shader)
@@ -303,6 +304,10 @@ static void si_destroy_context(struct pipe_context *context)
sctx->ws->cs_destroy(&sctx->gfx_cs);
if (sctx->ctx)
sctx->ws->ctx_destroy(sctx->ctx);
+ if (sctx->sdma_cs) {
+ sctx->ws->cs_destroy(sctx->sdma_cs);
+ free(sctx->sdma_cs);
+ }
if (sctx->dirty_implicit_resources)
_mesa_hash_table_destroy(sctx->dirty_implicit_resources,
@@ -321,12 +326,8 @@ static void si_destroy_context(struct pipe_context *context)
u_suballocator_destroy(&sctx->allocator_zeroed_memory);
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
- sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL);
- si_resource_reference(&sctx->index_ring, NULL);
- si_resource_reference(&sctx->barrier_buf, NULL);
- si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
si_resource_reference(&sctx->shadowed_regs, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL);
@@ -503,7 +504,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
/* Initialize private allocators. */
u_suballocator_init(&sctx->allocator_zeroed_memory, &sctx->b, 128 * 1024, 0,
PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false);
+ SI_RESOURCE_FLAG_CLEAR | SI_RESOURCE_FLAG_32BIT, false);
sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0);
if (!sctx->cached_gtt_allocator)
@@ -552,6 +553,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
}
sctx->ngg = sscreen->use_ngg;
+ si_shader_change_notify(sctx);
/* Initialize context functions used by graphics and compute. */
if (sctx->chip_class >= GFX10)
@@ -588,6 +590,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
si_init_state_functions(sctx);
si_init_streamout_functions(sctx);
si_init_viewport_functions(sctx);
+ si_init_spi_map_functions(sctx);
sctx->blitter = util_blitter_create(&sctx->b);
if (sctx->blitter == NULL)
@@ -607,27 +610,46 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter);
sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
- si_init_draw_functions(sctx);
-
- si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX,
- &sctx->prim_discard_vertex_count_threshold,
- &sctx->index_ring_size_per_ib);
- } else {
- sctx->prim_discard_vertex_count_threshold = UINT_MAX;
+ switch (sctx->chip_class) {
+ case GFX6:
+ si_init_draw_functions_GFX6(sctx);
+ break;
+ case GFX7:
+ si_init_draw_functions_GFX7(sctx);
+ break;
+ case GFX8:
+ si_init_draw_functions_GFX8(sctx);
+ break;
+ case GFX9:
+ si_init_draw_functions_GFX9(sctx);
+ break;
+ case GFX10:
+ si_init_draw_functions_GFX10(sctx);
+ break;
+ case GFX10_3:
+ si_init_draw_functions_GFX10_3(sctx);
+ break;
+ default:
+ unreachable("unhandled chip class");
+ }
}
sctx->sample_mask = 0xffff;
/* Initialize multimedia functions. */
- if (sscreen->info.has_hw_decode) {
+ if (sscreen->info.has_video_hw.uvd_decode || sscreen->info.has_video_hw.vcn_decode ||
+ sscreen->info.has_video_hw.jpeg_decode || sscreen->info.has_video_hw.vce_encode ||
+ sscreen->info.has_video_hw.uvd_encode || sscreen->info.has_video_hw.vcn_encode) {
sctx->b.create_video_codec = si_uvd_create_decoder;
sctx->b.create_video_buffer = si_video_buffer_create;
+ if (screen->resource_create_with_modifiers)
+ sctx->b.create_video_buffer_with_modifiers = si_video_buffer_create_with_modifiers;
} else {
sctx->b.create_video_codec = vl_create_decoder;
sctx->b.create_video_buffer = vl_video_buffer_create;
}
- if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
+ if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch =
si_aligned_buffer_create(screen,
SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
@@ -707,11 +729,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
if (!sctx->dirty_implicit_resources)
goto fail;
- sctx->sample_pos_buffer =
- pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions));
- pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions),
- &sctx->sample_positions);
-
/* The remainder of this function initializes the gfx CS and must be last. */
assert(sctx->gfx_cs.current.cdw == 0);
@@ -719,6 +736,23 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
si_init_cp_reg_shadowing(sctx);
}
+ /* Set immutable fields of shader keys. */
+ if (sctx->chip_class >= GFX9) {
+ /* The LS output / HS input layout can be communicated
+ * directly instead of via user SGPRs for merged LS-HS.
+ * This also enables jumping over the VS prolog for HS-only waves.
+ *
+ * When the LS VGPR fix is needed, monolithic shaders can:
+ * - avoid initializing EXEC in both the LS prolog
+ * and the LS main part when !vs_needs_prolog
+ * - remove the fixup for unused input VGPRs
+ */
+ sctx->shader.tcs.key.opt.prefer_mono = 1;
+
+ /* This enables jumping over the VS prolog for GS-only waves. */
+ sctx->shader.gs.key.opt.prefer_mono = 1;
+ }
+
si_begin_new_gfx_cs(sctx, true);
assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size);
@@ -763,6 +797,13 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log);
}
simple_mtx_unlock(&sscreen->aux_context_lock);
+
+ simple_mtx_lock(&sscreen->async_compute_context_lock);
+ if (status != PIPE_NO_RESET && sscreen->async_compute_context) {
+ sscreen->async_compute_context->destroy(sscreen->async_compute_context);
+ sscreen->async_compute_context = NULL;
+ }
+ simple_mtx_unlock(&sscreen->async_compute_context_lock);
}
sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw;
@@ -773,12 +814,23 @@ fail:
return NULL;
}
+static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource,
+ unsigned usage)
+{
+ struct radeon_winsys *ws = ((struct si_screen *)screen)->ws;
+
+ return !ws->buffer_wait(ws, si_resource(resource)->buf, 0,
+ /* If mapping for write, we need to wait for all reads and writes.
+ * If mapping for read, we only need to wait for writes.
+ */
+ usage & PIPE_MAP_WRITE ? RADEON_USAGE_READWRITE : RADEON_USAGE_WRITE);
+}
+
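si_is_resource_busy() above encodes a simple rule: mapping for write must wait for any pending GPU access, while mapping for read only conflicts with pending writes. A self-contained sketch of that decision, with hypothetical types:

#include <stdbool.h>
#include <stdio.h>

enum pending_use { PENDING_NONE, PENDING_READ, PENDING_WRITE };

/* Illustration only: a write mapping conflicts with any pending use,
 * a read mapping conflicts only with pending writes. */
static bool mapping_would_stall(bool map_for_write, enum pending_use pending)
{
   if (pending == PENDING_NONE)
      return false;
   return map_for_write || pending == PENDING_WRITE;
}

int main(void)
{
   printf("%d %d\n", mapping_would_stall(false, PENDING_READ),  /* 0: read vs read */
                     mapping_would_stall(true, PENDING_READ));  /* 1: write vs read */
   return 0;
}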
static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv,
unsigned flags)
{
struct si_screen *sscreen = (struct si_screen *)screen;
struct pipe_context *ctx;
- uint64_t total_ram;
if (sscreen->debug_flags & DBG(CHECK_VM))
flags |= PIPE_CONTEXT_DEBUG;
@@ -806,14 +858,19 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v
/* Use asynchronous flushes only on amdgpu, since the radeon
* implementation for fence_server_sync is incomplete. */
- struct pipe_context * tc = threaded_context_create(
- ctx, &sscreen->pool_transfers, si_replace_buffer_storage,
- sscreen->info.is_amdgpu ? si_create_fence : NULL,
- &((struct si_context *)ctx)->tc);
-
- if (tc && tc != ctx && os_get_total_physical_memory(&total_ram)) {
- ((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 4;
- }
+ struct pipe_context *tc =
+ threaded_context_create(ctx, &sscreen->pool_transfers,
+ si_replace_buffer_storage,
+ &(struct threaded_context_options){
+ .create_fence = sscreen->info.is_amdgpu ?
+ si_create_fence : NULL,
+ .is_resource_busy = si_is_resource_busy,
+ .driver_calls_flush_notify = true,
+ },
+ &((struct si_context *)ctx)->tc);
+
+ if (tc && tc != ctx)
+ threaded_context_init_bytes_mapped_limit((struct threaded_context *)tc, 4);
return tc;
}
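The removed code computed the threaded context's mapped-memory cap as total_ram / 4; the replacement calls threaded_context_init_bytes_mapped_limit(tc, 4), which is assumed here to perform an equivalent division internally. A sketch of that calculation:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the cap the removed code computed inline: limit the threaded
 * context's mapped bytes to total_ram / divisor. The real helper is assumed,
 * not known, to do an equivalent computation. */
static uint64_t bytes_mapped_limit(uint64_t total_ram, unsigned divisor)
{
   return divisor ? total_ram / divisor : total_ram;
}

int main(void)
{
   uint64_t ram = 16ull * 1024 * 1024 * 1024; /* pretend 16 GiB of RAM */
   printf("%llu\n", (unsigned long long)bytes_mapped_limit(ram, 4));
   return 0;
}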
@@ -853,6 +910,11 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
sscreen->aux_context->destroy(sscreen->aux_context);
}
+ simple_mtx_destroy(&sscreen->async_compute_context_lock);
+ if (sscreen->async_compute_context) {
+ sscreen->async_compute_context->destroy(sscreen->async_compute_context);
+ }
+
util_queue_destroy(&sscreen->shader_compiler_queue);
util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
@@ -887,6 +949,9 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
disk_cache_destroy(sscreen->disk_shader_cache);
util_live_shader_cache_deinit(&sscreen->live_shader_cache);
+ util_idalloc_mt_fini(&sscreen->buffer_ids);
+ util_vertex_state_cache_deinit(&sscreen->vertex_state_cache);
+
sscreen->ws->destroy(sscreen->ws);
FREE(sscreen);
}
@@ -1017,22 +1082,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->options.enable_sam,
sscreen->options.disable_sam);
- /* Older LLVM have buggy v_pk_* instructions. */
- if (!sscreen->info.has_packed_math_16bit || LLVM_VERSION_MAJOR < 11)
- sscreen->options.fp16 = false;
-
- if (sscreen->info.chip_class == GFX10_3 && LLVM_VERSION_MAJOR < 11) {
- fprintf(stderr, "radeonsi: GFX 10.3 requires LLVM 11 or higher\n");
- FREE(sscreen);
- return NULL;
- }
-
- if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) {
- fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n");
- FREE(sscreen);
- return NULL;
- }
-
if (sscreen->info.chip_class >= GFX9) {
sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
} else {
@@ -1054,6 +1103,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
return NULL;
}
+ util_idalloc_mt_init_tc(&sscreen->buffer_ids);
/* Set functions first. */
sscreen->b.context_create = si_pipe_create_context;
@@ -1072,8 +1122,12 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
/* Set these flags in debug_flags early, so that the shader cache takes
* them into account.
+ *
+ * Enable FS_CORRECT_DERIVS_AFTER_KILL by default if LLVM is >= 13. This makes
+ * nir_opt_move_discards_to_top more effective.
*/
- if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard"))
+ if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard") ||
+ LLVM_VERSION_MAJOR >= 13)
sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
if (sscreen->debug_flags & DBG(INFO))
@@ -1093,6 +1147,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
}
(void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain);
+ (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain);
(void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
si_init_gs_info(sscreen);
@@ -1107,7 +1162,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
si_disk_cache_create(sscreen);
/* Determine the number of shader compiler threads. */
- hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+ const struct util_cpu_caps_t *caps = util_get_cpu_caps();
+ hw_threads = caps->nr_cpus;
if (hw_threads >= 12) {
num_comp_hi_threads = hw_threads * 3 / 4;
@@ -1131,7 +1187,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
if (!util_queue_init(
&sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads,
- UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
+ UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen);
glsl_type_singleton_decref();
@@ -1141,7 +1197,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64,
num_comp_lo_threads,
UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
- UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen);
glsl_type_singleton_decref();
@@ -1151,11 +1207,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
si_init_perfcounters(sscreen);
- unsigned prim_discard_vertex_count_threshold, tmp;
- si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
- /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
- if (prim_discard_vertex_count_threshold == UINT_MAX)
- sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+ sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3;
/* Determine tessellation ring info. */
bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
@@ -1221,12 +1273,14 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->commutative_blend_add =
driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
driQueryOptionb(config->options, "allow_draw_out_of_order");
+ sscreen->allow_draw_out_of_order = driQueryOptionb(config->options, "allow_draw_out_of_order");
sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) &&
sscreen->info.chip_class >= GFX10 &&
(sscreen->info.family != CHIP_NAVI14 ||
sscreen->info.is_pro_graphics);
sscreen->use_ngg_culling = sscreen->use_ngg &&
+ sscreen->info.max_render_backends >= 2 &&
!((sscreen->debug_flags & DBG(NO_NGG_CULLING)) ||
LLVM_VERSION_MAJOR <= 11 /* hangs on 11, see #4874 */);
sscreen->use_ngg_streamout = false;
@@ -1239,30 +1293,19 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true;
}
- /* Only enable primitive binning on APUs by default. */
- if (sscreen->info.chip_class >= GFX10) {
- sscreen->dpbb_allowed = true;
- /* DFSM is not supported on GFX 10.3 and not beneficial on Navi1x. */
- } else if (sscreen->info.chip_class == GFX9) {
- sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
- /* DFSM reduces the Raven2 draw prim rate by ~43%. Disable it. */
- sscreen->dfsm_allowed = false;
- }
-
- /* Process DPBB enable flags. */
- if (sscreen->debug_flags & DBG(DPBB)) {
- sscreen->dpbb_allowed = true;
- if (sscreen->debug_flags & DBG(DFSM))
- sscreen->dfsm_allowed = true;
- }
+ /* DCC stores have 50% of the performance of uncompressed stores and
+ * sometimes even less than that, so they are risky to enable on dGPUs.
+ */
+ sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) &&
+ ((sscreen->info.chip_class >= GFX10_3 &&
+ !sscreen->info.has_dedicated_vram) ||
+ sscreen->debug_flags & DBG(DCC_STORE));
- /* Process DPBB disable flags. */
- if (sscreen->debug_flags & DBG(NO_DPBB)) {
- sscreen->dpbb_allowed = false;
- sscreen->dfsm_allowed = false;
- } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
- sscreen->dfsm_allowed = false;
- }
+ sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) &&
+ (sscreen->info.chip_class >= GFX10 ||
+ /* Only enable primitive binning on gfx9 APUs by default. */
+ (sscreen->info.chip_class == GFX9 && !sscreen->info.has_dedicated_vram) ||
+ sscreen->debug_flags & DBG(DPBB));
if (sscreen->dpbb_allowed) {
if (sscreen->info.has_dedicated_vram) {
@@ -1289,11 +1332,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->pbb_persistent_states_per_bin <= 32);
}
- /* While it would be nice not to have this flag, we are constrained
- * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
- */
- sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
-
(void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
@@ -1331,6 +1369,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
}
}
+ sscreen->ngg_subgroup_size = 128;
sscreen->ge_wave_size = 64;
sscreen->ps_wave_size = 64;
sscreen->compute_wave_size = 64;
@@ -1406,6 +1445,9 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf
drmVersionPtr version = drmGetVersion(fd);
struct radeon_winsys *rw = NULL;
+ driParseConfigFiles(config->options, config->options_info, 0, "radeonsi",
+ NULL, NULL, NULL, 0, NULL, 0);
+
switch (version->version_major) {
case 2:
rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h
index c9f64a144..2408346c3 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h
@@ -31,6 +31,7 @@
#include "util/u_idalloc.h"
#include "util/u_suballoc.h"
#include "util/u_threaded_context.h"
+#include "util/u_vertex_state_cache.h"
#include "ac_sqtt.h"
#ifdef __cplusplus
@@ -44,7 +45,6 @@ extern "C" {
#endif
#define ATI_VENDOR_ID 0x1002
-#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
@@ -55,7 +55,7 @@ extern "C" {
#define SI_DRAW_ID_UNKNOWN ((unsigned)INT_MIN)
#define SI_RESTART_INDEX_UNKNOWN ((unsigned)INT_MIN)
#define SI_INSTANCE_COUNT_UNKNOWN ((unsigned)INT_MIN)
-#define SI_NUM_SMOOTH_AA_SAMPLES 8
+#define SI_NUM_SMOOTH_AA_SAMPLES 4
#define SI_MAX_POINT_SIZE 2048
#define SI_GS_PER_ES 128
/* Alignment for optimal CP DMA performance. */
@@ -64,7 +64,8 @@ extern "C" {
/* Tunables for compute-based clear_buffer and copy_buffer: */
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
-#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
+/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
+#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU
/* Pipeline & streamout query controls. */
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
@@ -137,6 +138,7 @@ extern "C" {
(((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3)
#define SI_RESOURCE_FLAG_UNCACHED (PIPE_RESOURCE_FLAG_DRV_PRIV << 12)
#define SI_RESOURCE_FLAG_DRIVER_INTERNAL (PIPE_RESOURCE_FLAG_DRV_PRIV << 13)
+#define SI_RESOURCE_AUX_PLANE (PIPE_RESOURCE_FLAG_DRV_PRIV << 14)
enum si_has_gs {
GS_OFF,
@@ -153,11 +155,6 @@ enum si_has_ngg {
NGG_ON,
};
-enum si_has_prim_discard_cs {
- PRIM_DISCARD_CS_OFF,
- PRIM_DISCARD_CS_ON,
-};
-
enum si_clear_code
{
DCC_CLEAR_COLOR_0000 = 0x00000000,
@@ -168,9 +165,8 @@ enum si_clear_code
DCC_UNCOMPRESSED = 0xFFFFFFFF,
};
-#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7)
-#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8)
-#define SI_IMAGE_ACCESS_DCC_WRITE (1 << 9)
+#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8)
+#define SI_IMAGE_ACCESS_ALLOW_DCC_STORE (1 << 9)
/* Debug flags. */
enum
@@ -208,12 +204,14 @@ enum
DBG_COMPUTE,
DBG_VM,
DBG_CACHE_STATS,
+ DBG_IB,
/* Driver options: */
DBG_NO_WC,
DBG_CHECK_VM,
DBG_RESERVE_VMID,
DBG_SHADOW_REGS,
+ DBG_NO_FAST_DISPLAY_LIST,
/* 3D engine options: */
DBG_NO_GFX,
@@ -221,16 +219,10 @@ enum
DBG_ALWAYS_NGG_CULLING_ALL,
DBG_ALWAYS_NGG_CULLING_TESS,
DBG_NO_NGG_CULLING,
- DBG_NO_FAST_LAUNCH,
- DBG_ALWAYS_PD,
- DBG_PD,
- DBG_NO_PD,
DBG_SWITCH_ON_EOP,
DBG_NO_OUT_OF_ORDER,
DBG_NO_DPBB,
- DBG_NO_DFSM,
DBG_DPBB,
- DBG_DFSM,
DBG_NO_HYPERZ,
DBG_NO_2D_TILING,
DBG_NO_TILING,
@@ -238,9 +230,11 @@ enum
DBG_NO_DISPLAY_DCC,
DBG_NO_DCC,
DBG_NO_DCC_CLEAR,
- DBG_NO_DCC_FB,
+ DBG_NO_DCC_STORE,
+ DBG_DCC_STORE,
DBG_NO_DCC_MSAA,
DBG_NO_FMASK,
+ DBG_NO_DMA,
DBG_TMZ,
DBG_SQTT,
@@ -293,16 +287,14 @@ struct si_resource {
struct pb_buffer *buf;
uint64_t gpu_address;
/* Memory usage if the buffer placement is optimal. */
- uint32_t vram_usage_kb;
- uint32_t gart_usage_kb;
+ uint32_t memory_usage_kb;
/* Resource properties. */
uint64_t bo_size;
- unsigned bo_alignment;
- enum radeon_bo_domain domains;
- enum radeon_bo_flag flags;
+ uint8_t bo_alignment_log2;
+ enum radeon_bo_domain domains:8;
+ enum radeon_bo_flag flags:16;
unsigned bind_history;
- int max_forced_staging_uploads;
/* The buffer range which is initialized (with a write transfer,
* streamout, DMA, or as a random access target). The rest of
@@ -331,13 +323,12 @@ struct si_resource {
bool image_handle_allocated;
/* Whether the resource has been exported via resource_get_handle. */
- unsigned external_usage; /* PIPE_HANDLE_USAGE_* */
+ uint8_t external_usage; /* PIPE_HANDLE_USAGE_* */
};
struct si_transfer {
struct threaded_transfer b;
struct si_resource *staging;
- unsigned offset;
};
struct si_texture {
@@ -368,7 +359,8 @@ struct si_texture {
/* Depth buffer compression and fast clear. */
float depth_clear_value[RADEON_SURF_MAX_LEVELS];
uint8_t stencil_clear_value[RADEON_SURF_MAX_LEVELS];
- uint16_t depth_cleared_level_mask; /* if it was cleared at least once */
+ uint16_t depth_cleared_level_mask_once; /* if it was cleared at least once */
+ uint16_t depth_cleared_level_mask; /* track if it's cleared (can be false negative) */
uint16_t stencil_cleared_level_mask; /* if it was cleared at least once */
uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */
uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
@@ -382,40 +374,36 @@ struct si_texture {
bool db_compatible : 1;
bool can_sample_z : 1;
bool can_sample_s : 1;
+ bool need_flush_after_depth_decompression: 1;
/* We need to track DCC dirtiness, because st/dri usually calls
* flush_resource twice per frame (not a bug) and we don't wanna
- * decompress DCC twice. Also, the dirty tracking must be done even
- * if DCC isn't used, because it's required by the DCC usage analysis
- * for a possible future enablement.
+ * decompress DCC twice.
*/
- bool separate_dcc_dirty : 1;
bool displayable_dcc_dirty : 1;
- /* Statistics gathering for the DCC enablement heuristic. */
- bool dcc_gather_statistics : 1;
/* Counter that should be non-zero if the texture is bound to a
* framebuffer.
*/
unsigned framebuffers_bound;
- /* Whether the texture is a displayable back buffer and needs DCC
- * decompression, which is expensive. Therefore, it's enabled only
- * if statistics suggest that it will pay off and it's allocated
- * separately. It can't be bound as a sampler by apps. Limited to
- * target == 2D and last_level == 0. If enabled, dcc_offset contains
- * the absolute GPUVM address, not the relative one.
- */
- struct si_resource *dcc_separate_buffer;
- /* When DCC is temporarily disabled, the separate buffer is here. */
- struct si_resource *last_dcc_separate_buffer;
- /* Estimate of how much this color buffer is written to in units of
- * full-screen draws: ps_invocations / (width * height)
- * Shader kills, late Z, and blending with trivial discards make it
- * inaccurate (we need to count CB updates, not PS invocations).
- */
- unsigned ps_draw_ratio;
- /* The number of clears since the last DCC usage analysis. */
- unsigned num_slow_clears;
+};
+
+/* State trackers create separate textures in a next-chain for extra planes
+ * even if those are planes created purely for modifiers. Because the linking
+ * of the chain happens outside of the driver, and NULL is interpreted as
+ * failure, let's create some dummy texture structs. We could use these
+ * later to use the offsets for linking if we really wanted to.
+ *
+ * For now just create a dummy struct and completely ignore it.
+ *
+ * Potentially in the future we could store stride/offset and use it during
+ * creation, though we might want to change how linking is done first.
+ */
+struct si_auxiliary_texture {
+ struct threaded_resource b;
+ struct pb_buffer *buffer;
+ uint32_t offset;
+ uint32_t stride;
};
struct si_surface {
@@ -533,7 +521,7 @@ struct si_screen {
unsigned width, unsigned height, unsigned depth, uint32_t *state,
uint32_t *fmask_state);
- unsigned num_vbos_in_user_sgprs;
+ unsigned max_memory_usage_kb;
unsigned pa_sc_raster_config;
unsigned pa_sc_raster_config_1;
unsigned se_tile_repeat;
@@ -551,13 +539,13 @@ struct si_screen {
bool has_out_of_order_rast;
bool assume_no_z_fights;
bool commutative_blend_add;
+ bool allow_draw_out_of_order;
bool dpbb_allowed;
- bool dfsm_allowed;
- bool llvm_has_working_vgpr_indexing;
bool use_ngg;
bool use_ngg_culling;
bool use_ngg_streamout;
bool allow_dcc_msaa_clear_to_reg_for_bpp[5]; /* indexed by log2(Bpp) */
+ bool always_allow_dcc_stores;
struct {
#define OPT_BOOL(name, dflt, description) bool name : 1;
@@ -578,6 +566,10 @@ struct si_screen {
struct pipe_context *aux_context;
simple_mtx_t aux_context_lock;
+ /* Async compute context for DRI_PRIME copies. */
+ struct pipe_context *async_compute_context;
+ simple_mtx_t async_compute_context_lock;
+
/* This must be in the screen, because UE4 uses one context for
* compilation and another one for rendering.
*/
@@ -671,6 +663,10 @@ struct si_screen {
unsigned compute_wave_size;
unsigned ps_wave_size;
unsigned ge_wave_size;
+ unsigned ngg_subgroup_size;
+
+ struct util_idalloc_mt buffer_ids;
+ struct util_vertex_state_cache vertex_state_cache;
};
struct si_sampler_view {
@@ -809,6 +805,8 @@ struct si_streamout {
struct si_shader_ctx_state {
struct si_shader_selector *cso;
struct si_shader *current;
+ /* The shader variant key representing the current state. */
+ struct si_shader_key key;
};
#define SI_NUM_VGT_PARAM_KEY_BITS 12
@@ -846,35 +844,6 @@ union si_vgt_param_key {
uint16_t index;
};
-#define SI_NUM_VGT_STAGES_KEY_BITS 6
-#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
-
-/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
- * Some fields are set by state-change calls, most are set by draw_vbo.
- */
-union si_vgt_stages_key {
- struct {
-#if UTIL_ARCH_LITTLE_ENDIAN
- uint8_t tess : 1;
- uint8_t gs : 1;
- uint8_t ngg_gs_fast_launch : 1;
- uint8_t ngg_passthrough : 1;
- uint8_t ngg : 1; /* gfx10+ */
- uint8_t streamout : 1; /* only used with NGG */
- uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
-#else /* UTIL_ARCH_BIG_ENDIAN */
- uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
- uint8_t streamout : 1;
- uint8_t ngg : 1;
- uint8_t ngg_passthrough : 1;
- uint8_t ngg_gs_fast_launch : 1;
- uint8_t gs : 1;
- uint8_t tess : 1;
-#endif
- } u;
- uint8_t index;
-};
-
struct si_texture_handle {
unsigned desc_slot;
bool desc_dirty;
@@ -897,7 +866,6 @@ struct si_saved_cs {
unsigned trace_id;
unsigned gfx_last_dw;
- unsigned compute_last_dw;
bool flushed;
int64_t time_flush;
};
@@ -907,11 +875,24 @@ struct si_small_prim_cull_info {
float small_prim_precision;
};
+struct si_vertex_state {
+ struct pipe_vertex_state b;
+ struct si_vertex_elements velems;
+ uint32_t descriptors[4 * SI_MAX_ATTRIBS];
+};
+
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
const struct pipe_draw_info *info,
+ unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
- const struct pipe_draw_start_count *draws,
+ const struct pipe_draw_start_count_bias *draws,
unsigned num_draws);
+typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx,
+ struct pipe_vertex_state *vstate,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws);
struct si_context {
struct pipe_context b; /* base class */
@@ -922,6 +903,7 @@ struct si_context {
struct radeon_winsys *ws;
struct radeon_winsys_ctx *ctx;
struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */
+ struct radeon_cmdbuf *sdma_cs;
struct pipe_fence_handle *last_gfx_fence;
struct si_resource *eop_bug_scratch;
struct si_resource *eop_bug_scratch_tmz;
@@ -962,7 +944,7 @@ struct si_context {
void *cs_clear_render_target_1d_array;
void *cs_clear_12bytes_buffer;
void *cs_dcc_decompress;
- void *cs_dcc_retile;
+ void *cs_dcc_retile[32];
void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
struct si_screen *screen;
struct pipe_debug_callback debug;
@@ -990,33 +972,11 @@ struct si_context {
unsigned last_num_draw_calls;
unsigned flags; /* flush flags */
/* Current unaccounted memory usage. */
- uint32_t vram_kb;
- uint32_t gtt_kb;
+ uint32_t memory_usage_kb;
- /* Compute-based primitive discard. */
- unsigned prim_discard_vertex_count_threshold;
+ /* NGG streamout. */
struct pb_buffer *gds;
struct pb_buffer *gds_oa;
- struct radeon_cmdbuf prim_discard_compute_cs;
- unsigned compute_gds_offset;
- struct si_shader *compute_ib_last_shader;
- uint32_t compute_rewind_va;
- unsigned compute_num_prims_in_batch;
- bool preserve_prim_restart_gds_at_flush;
- /* index_ring is divided into 2 halves for doublebuffering. */
- struct si_resource *index_ring;
- unsigned index_ring_base; /* offset of a per-IB portion */
- unsigned index_ring_offset; /* offset within a per-IB portion */
- unsigned index_ring_size_per_ib; /* max available size per IB */
- bool prim_discard_compute_ib_initialized;
- /* For tracking the last execution barrier - it can be either
- * a WRITE_DATA packet or a fence. */
- uint32_t *last_pkt3_write_data;
- struct si_resource *barrier_buf;
- unsigned barrier_buf_offset;
- struct pipe_fence_handle *last_ib_barrier_fence;
- struct si_resource *last_ib_barrier_buf;
- unsigned last_ib_barrier_buf_offset;
/* Atoms (direct states). */
union si_state_atoms atoms;
@@ -1065,28 +1025,27 @@ struct si_context {
/* indexed access using pipe_shader_type (not by MESA_SHADER_*) */
struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
};
- struct si_shader_ctx_state cs_prim_discard_state;
struct si_cs_shader_state cs_shader_state;
/* shader information */
+ uint64_t ps_inputs_read_or_disabled;
struct si_vertex_elements *vertex_elements;
unsigned num_vertex_elements;
- unsigned sprite_coord_enable;
unsigned cs_max_waves_per_sh;
- bool flatshade;
+ bool uses_nontrivial_vs_prolog;
+ bool force_trivial_vs_prolog;
bool do_update_shaders;
bool compute_shaderbuf_sgprs_dirty;
bool compute_image_sgprs_dirty;
bool vs_uses_base_instance;
bool vs_uses_draw_id;
+ uint8_t patch_vertices;
/* shader descriptors */
struct si_descriptors descriptors[SI_NUM_DESCS];
unsigned descriptors_dirty;
unsigned shader_pointers_dirty;
unsigned shader_needs_decompress_mask;
- unsigned inlinable_uniforms_valid_mask;
- uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS];
struct si_buffer_resources internal_bindings;
struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
struct si_samplers samplers[SI_NUM_SHADERS];
@@ -1141,11 +1100,7 @@ struct si_context {
bool allow_flat_shading : 1;
/* Emitted draw state. */
- bool gs_tri_strip_adj_fix : 1;
- bool ls_vgpr_fix : 1;
- bool prim_discard_cs_instancing : 1;
bool ngg : 1;
- bool same_patch_vertices : 1;
uint8_t ngg_culling;
unsigned last_index_size;
int last_base_vertex;
@@ -1256,9 +1211,6 @@ struct si_context {
unsigned num_resident_handles;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
- unsigned compute_num_verts_accepted;
- unsigned compute_num_verts_rejected;
- unsigned compute_num_verts_ineligible; /* due to low vertex count */
unsigned context_roll;
/* Queries. */
@@ -1281,25 +1233,6 @@ struct si_context {
bool force_cb_shader_coherent;
- /* Statistics gathering for the DCC enablement heuristic. It can't be
- * in si_texture because si_texture can be shared by multiple
- * contexts. This is for back buffers only. We shouldn't get too many
- * of those.
- *
- * X11 DRI3 rotates among a finite set of back buffers. They should
- * all fit in this array. If they don't, separate DCC might never be
- * enabled by DCC stat gathering.
- */
- struct {
- struct si_texture *tex;
- /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
- struct pipe_query *ps_stats[3];
- /* If all slots are used and another slot is needed,
- * the least recently used slot is evicted based on this. */
- int64_t last_use_timestamp;
- bool query_active;
- } dcc_stats[5];
-
struct si_tracked_regs tracked_regs;
/* Resources that need to be flushed, but will not get an explicit
@@ -1308,7 +1241,12 @@ struct si_context {
*/
struct hash_table *dirty_implicit_resources;
- pipe_draw_vbo_func draw_vbo[NUM_GFX_VERSIONS - GFX6][2][2][2][2];
+ pipe_draw_vbo_func draw_vbo[2][2][2];
+ pipe_draw_vertex_state_func draw_vertex_state[2][2][2];
+ /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
+ pipe_draw_vbo_func real_draw_vbo;
+ pipe_draw_vertex_state_func real_draw_vertex_state;
+ void (*emit_spi_map[33])(struct si_context *sctx);
/* SQTT */
struct ac_thread_trace_data *thread_trace;
@@ -1346,6 +1284,9 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst
void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex);
void si_flush_implicit_resources(struct si_context *sctx);
+/* si_nir_optim.c */
+bool si_nir_is_output_const_if_tex_is_const(nir_shader *shader, float *in, float *out, int *texunit);
+
/* si_buffer.c */
bool si_cs_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
enum radeon_bo_usage usage);
@@ -1359,7 +1300,8 @@ struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, uns
struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
unsigned usage, unsigned size, unsigned alignment);
void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
- struct pipe_resource *src);
+ struct pipe_resource *src, unsigned num_rebinds,
+ uint32_t rebind_mask, uint32_t delete_buffer_id);
void si_init_screen_buffer_functions(struct si_screen *sscreen);
void si_init_buffer_functions(struct si_context *sctx);
@@ -1474,6 +1416,7 @@ void si_init_debug_functions(struct si_context *sctx);
void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved,
enum ring_type ring);
bool si_replace_shader(unsigned num, struct si_shader_binary *binary);
+void si_print_current_ib(struct si_context *sctx, FILE *f);
/* si_fence.c */
void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
@@ -1491,16 +1434,23 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
/* si_get.c */
void si_init_screen_get_functions(struct si_screen *sscreen);
+bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src);
+
/* si_gfx_cs.c */
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
-void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws);
+void si_trace_emit(struct si_context *sctx);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+/* Replace the sctx->b.draw_vbo function with a wrapper. This can be used to implement
+ * optimizations without affecting the performance of the normal draw_vbo functions.
+ */
+void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper,
+ pipe_draw_vertex_state_func vstate_wrapper);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
@@ -1511,33 +1461,9 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_init_compute_functions(struct si_context *sctx);
-/* si_compute_prim_discard.c */
-enum si_prim_discard_outcome
-{
- SI_PRIM_DISCARD_ENABLED,
- SI_PRIM_DISCARD_DISABLED,
- SI_PRIM_DISCARD_DRAW_SPLIT,
- SI_PRIM_DISCARD_MULTI_DRAW_SPLIT,
-};
-
-void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
-enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
- const struct pipe_draw_start_count *draws,
- unsigned num_draws, bool primitive_restart,
- unsigned total_count);
-void si_compute_signal_gfx(struct si_context *sctx);
-void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned count, unsigned index_size,
- unsigned base_vertex, uint64_t input_indexbuf_va,
- unsigned input_indexbuf_max_elements);
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
- unsigned *prim_discard_vertex_count_threshold,
- unsigned *index_ring_size_per_ib);
-
/* si_pipe.c */
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);
+void si_init_aux_async_compute_ctx(struct si_screen *sscreen);
/* si_perfcounters.c */
void si_init_perfcounters(struct si_screen *screen);
@@ -1587,6 +1513,10 @@ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
const struct pipe_video_buffer *tmpl);
+struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe,
+ const struct pipe_video_buffer *tmpl,
+ const uint64_t *modifiers,
+ unsigned int modifiers_count);
/* si_viewport.c */
void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
@@ -1613,10 +1543,6 @@ struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
const struct pipe_surface *templ, unsigned width0,
unsigned height0, unsigned width, unsigned height);
unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap);
-void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex);
-void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex);
-void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex);
-void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex);
bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex);
void si_init_screen_texture_functions(struct si_screen *sscreen);
void si_init_context_texture_functions(struct si_context *sctx);
@@ -1647,6 +1573,9 @@ bool si_init_thread_trace(struct si_context *sctx);
void si_destroy_thread_trace(struct si_context *sctx);
void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs);
+/* si_state_shaders.c */
+struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key);
+
/*
* common helpers
*/
@@ -1698,15 +1627,14 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx,
* Also reserve space for stopping queries at the end of IB, because
* the number of active queries is unlimited in theory.
*/
- return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 9;
+ return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10;
}
static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
{
if (r) {
/* Add memory usage for need_gfx_cs_space */
- sctx->vram_kb += si_resource(r)->vram_usage_kb;
- sctx->gtt_kb += si_resource(r)->gart_usage_kb;
+ sctx->memory_usage_kb += si_resource(r)->memory_usage_kb;
}
}
@@ -1866,7 +1794,19 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi
if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil))
return false;
- return tex->is_depth && tex->surface.meta_offset && level < tex->surface.num_meta_levels;
+ if (!tex->is_depth || !tex->surface.meta_offset)
+ return false;
+
+ struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
+ if (sscreen->info.chip_class >= GFX8) {
+ return level < tex->surface.num_meta_levels;
+ } else {
+ /* GFX6-7 don't have TC-compatible HTILE, which means they have to run
+ * a decompression pass for every mipmap level before texturing, so compress
+ * only one level to reduce the number of decompression passes to a minimum.
+ */
+ return level == 0;
+ }
}
static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level,
@@ -1908,6 +1848,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx)
((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \
(1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
+#define UTIL_ALL_PRIM_TRIANGLE_MODES \
+ ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \
+ (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \
+ (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \
+ (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))
+
static inline bool util_prim_is_lines(unsigned prim)
{
return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
@@ -1920,11 +1866,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim)
static inline bool util_rast_prim_is_triangles(unsigned prim)
{
- return ((1 << prim) &
- ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) |
- (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) |
- (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)));
+ return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0;
+}
+
+static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim)
+{
+ return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0;
}
/**
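The helpers above classify primitive types with a precomputed bitmask rather than a switch; the same idiom in isolation, with a hypothetical reduced set of modes:

#include <stdbool.h>
#include <stdio.h>

enum prim { PRIM_POINTS, PRIM_LINES, PRIM_LINE_STRIP, PRIM_TRIANGLES };

/* Membership becomes a single bit test instead of branching on every type. */
#define LINE_MODES ((1u << PRIM_LINES) | (1u << PRIM_LINE_STRIP))

static bool prim_is_lines(enum prim p)
{
   return ((1u << p) & LINE_MODES) != 0;
}

int main(void)
{
   printf("%d %d\n", prim_is_lines(PRIM_LINES), prim_is_lines(PRIM_TRIANGLES));
   return 0;
}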
@@ -1935,17 +1882,27 @@ static inline bool util_rast_prim_is_triangles(unsigned prim)
* \param gtt GTT memory size not added to the buffer list yet
*/
static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs,
- uint32_t vram_kb, uint32_t gtt_kb)
+ uint32_t kb)
+{
+ return kb + cs->used_vram_kb + cs->used_gart_kb < screen->max_memory_usage_kb;
+}
+
+static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
{
- vram_kb += cs->used_vram_kb;
- gtt_kb += cs->used_gart_kb;
+ struct radeon_cmdbuf *cs = &ctx->gfx_cs;
+
+ /* There are two memory usage counters in the winsys for all buffers
+ * that have been added (cs_add_buffer) and one counter in the pipe
+ * driver for those that haven't been added yet.
+ */
+ uint32_t kb = ctx->memory_usage_kb;
+ ctx->memory_usage_kb = 0;
- /* Anything that goes above the VRAM size should go to GTT. */
- if (vram_kb > screen->info.vram_size_kb)
- gtt_kb += vram_kb - screen->info.vram_size_kb;
+ if (radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, kb) &&
+ ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws), false))
+ return;
- /* Now we just need to check if we have enough GTT (the limit is 75% of max). */
- return gtt_kb < screen->info.gart_size_kb / 4 * 3;
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
/**
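The reworked budgeting replaces separate VRAM/GTT tracking with a single counter checked against max_memory_usage_kb, which the si_pipe.c hunk earlier sets to VRAM plus three quarters of GART. A standalone arithmetic sketch of that check, with made-up sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Flush when not-yet-accounted memory plus what the winsys already counted
 * would exceed the limit. Sizes in main() are illustrative only. */
static bool below_limit(uint32_t pending_kb, uint32_t counted_kb, uint32_t limit_kb)
{
   return (uint64_t)pending_kb + counted_kb < limit_kb;
}

int main(void)
{
   uint32_t vram_kb  = 8u * 1024 * 1024;          /* 8 GiB of VRAM in kB */
   uint32_t gart_kb  = 16u * 1024 * 1024;         /* 16 GiB of GART in kB */
   uint32_t limit_kb = vram_kb + gart_kb / 4 * 3; /* as in si_pipe.c above */

   printf("limit=%u kB, ok=%d\n", limit_kb,
          below_limit(2u * 1024 * 1024, 1u * 1024 * 1024, limit_kb));
   return 0;
}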
@@ -1989,30 +1946,20 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc
bool check_mem)
{
if (check_mem &&
- !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->vram_kb + bo->vram_usage_kb,
- sctx->gtt_kb + bo->gart_usage_kb))
+ !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->memory_usage_kb + bo->memory_usage_kb))
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority);
}
-static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
-{
- return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
-}
-
static inline unsigned si_get_wave_size(struct si_screen *sscreen,
- gl_shader_stage stage, bool ngg, bool es,
- bool gs_fast_launch, bool prim_discard_cs)
+ gl_shader_stage stage, bool ngg, bool es)
{
if (stage == MESA_SHADER_COMPUTE)
return sscreen->compute_wave_size;
else if (stage == MESA_SHADER_FRAGMENT)
return sscreen->ps_wave_size;
- else if (gs_fast_launch)
- return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
- else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
- (stage == MESA_SHADER_VERTEX && es && !ngg) ||
+ else if ((stage == MESA_SHADER_VERTEX && es && !ngg) ||
(stage == MESA_SHADER_TESS_EVAL && es && !ngg) ||
(stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
return 64;
@@ -2024,19 +1971,30 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
{
return si_get_wave_size(shader->selector->screen, shader->selector->info.stage,
shader->key.as_ngg,
- shader->key.as_es,
- shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
- shader->key.opt.vs_as_prim_discard_cs);
+ shader->key.as_es);
}
static inline void si_select_draw_vbo(struct si_context *sctx)
{
- sctx->b.draw_vbo = sctx->draw_vbo[sctx->chip_class - GFX6]
- [!!sctx->shader.tes.cso]
- [!!sctx->shader.gs.cso]
- [sctx->ngg]
- [si_compute_prim_discard_enabled(sctx)];
- assert(sctx->b.draw_vbo);
+ pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]
+ [!!sctx->shader.gs.cso]
+ [sctx->ngg];
+ pipe_draw_vertex_state_func draw_vertex_state =
+ sctx->draw_vertex_state[!!sctx->shader.tes.cso]
+ [!!sctx->shader.gs.cso]
+ [sctx->ngg];
+ assert(draw_vbo);
+ assert(draw_vertex_state);
+
+ if (unlikely(sctx->real_draw_vbo)) {
+ assert(sctx->real_draw_vertex_state);
+ sctx->real_draw_vbo = draw_vbo;
+ sctx->real_draw_vertex_state = draw_vertex_state;
+ } else {
+ assert(!sctx->real_draw_vertex_state);
+ sctx->b.draw_vbo = draw_vbo;
+ sctx->b.draw_vertex_state = draw_vertex_state;
+ }
}
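si_select_draw_vbo() above picks a specialized callback from a small table indexed by three pipeline features, so no per-draw branching is needed; the same pattern in isolation, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

typedef void (*draw_func)(void);

static void draw_plain(void)       { puts("plain pipeline"); }
static void draw_tess_gs_ngg(void) { puts("tess + gs + ngg"); }

/* Table indexed by [has_tess][has_gs][use_ngg]; only two of the eight
 * entries are filled here for brevity. */
static draw_func draw_table[2][2][2] = {
   [0][0][0] = draw_plain,
   [1][1][1] = draw_tess_gs_ngg,
};

static draw_func select_draw(bool has_tess, bool has_gs, bool use_ngg)
{
   return draw_table[has_tess][has_gs][use_ngg];
}

int main(void)
{
   select_draw(false, false, false)();
   select_draw(true, true, true)();
   return 0;
}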
/* Return the number of samples that the rasterizer uses. */
@@ -2053,6 +2011,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx)
return 1;
}
+static unsigned ALWAYS_INLINE
+si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class)
+{
+ /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
+ * have to allocate and count references for the upload buffer.
+ */
+ return chip_class >= GFX9 ? 5 : 1;
+}
+
+static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen)
+{
+ return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class);
+}
+
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c
index 22b6e3ad5..ae4affa1b 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c
@@ -117,13 +117,13 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- if (state->shader) {
- radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo,
+ if (state->is_shader) {
+ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo,
RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
}
radeon_begin(cs);
- radeon_emit_array(cs, state->pm4, state->ndw);
+ radeon_emit_array(state->pm4, state->ndw);
radeon_end();
if (state->atom.emit)
@@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs)
for (unsigned i = 0; i < SI_NUM_STATES; i++) {
struct si_pm4_state *state = sctx->emitted.array[i];
- if (state && state->shader) {
+ if (state && state->is_shader) {
sctx->emitted.array[i] = NULL;
sctx->dirty_states |= 1 << i;
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h
index 06909ff1a..03f79e0ba 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h
@@ -54,7 +54,7 @@ struct si_pm4_state {
uint32_t pm4[SI_PM4_MAX_DW];
/* For shader states only */
- struct si_shader *shader;
+ bool is_shader;
struct si_atom atom;
};
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
index 121feb6fb..546f9da11 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
@@ -218,10 +218,10 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
}
/* Compile a variable block size using the maximum variable size. */
- if (shader->selector->info.base.cs.local_size_variable)
+ if (shader->selector->info.base.workgroup_size_variable)
return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
- uint16_t *local_size = shader->selector->info.base.cs.local_size;
+ uint16_t *local_size = shader->selector->info.base.workgroup_size;
unsigned max_work_group_size = (uint32_t)local_size[0] *
(uint32_t)local_size[1] *
(uint32_t)local_size[2];
@@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
/* VGPRs */
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
-
- /* Return values */
- if (shader->key.opt.vs_as_prim_discard_cs) {
- for (i = 0; i < 4; i++)
- ac_add_return(&ctx->args, AC_ARG_VGPR);
- }
break;
case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@@ -553,11 +547,11 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
declare_vb_descriptor_input_sgprs(ctx);
/* VGPRs (first GS, then VS/TES) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
if (ctx->stage == MESA_SHADER_VERTEX) {
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
@@ -658,7 +652,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
SI_PARAM_LINEAR_CENTER);
si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
SI_PARAM_LINEAR_CENTROID);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
SI_PARAM_POS_X_FLOAT);
si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
@@ -793,9 +787,6 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh
if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
(sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
- /* We add this symbol even on LLVM <= 8 to ensure that
- * shader->config.lds_size is set correctly below.
- */
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
sym->name = "esgs_ring";
sym->size = shader->gs_info.esgs_ring_size * 4;
@@ -835,7 +826,9 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
{
struct ac_rtld_binary rtld;
si_shader_binary_open(screen, shader, &rtld);
- return rtld.exec_size;
+ uint64_t size = rtld.exec_size;
+ ac_rtld_close(&rtld);
+ return size;
}
static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
@@ -865,8 +858,8 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader
si_resource_reference(&shader->bo, NULL);
shader->bo = si_aligned_buffer_create(
&sscreen->b,
- (sscreen->info.cpdma_prefetch_writes_memory ?
- 0 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
+ (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
+ SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
if (!shader->bo)
return false;
@@ -1071,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader)
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
- else if (shader->key.opt.vs_as_prim_discard_cs)
- return "Vertex Shader as Primitive Discard CS";
else if (shader->key.as_ngg)
return "Vertex Shader as ESGS";
else
@@ -1153,8 +1144,6 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
prolog->instance_divisor_is_fetched);
- fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
- prolog->unpack_instance_id_from_vertex_id);
fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
@@ -1186,17 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
fprintf(f, " as_ls = %u\n", key->as_ls);
fprintf(f, " as_ngg = %u\n", key->as_ngg);
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
- fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
- fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
- fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
- fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);
- fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
- fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
- fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
- fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
- fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
- fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
- fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
break;
case MESA_SHADER_TESS_CTRL:
@@ -1297,8 +1275,8 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
/* VGPR initialization fixup for Vega10 and Raven is always done in the
* VS prolog. */
return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
- prolog_key->unpack_instance_id_from_vertex_id ||
- (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+ /* The 2nd VS prolog loads input VGPRs from LDS */
+ (key->opt.ngg_culling && !ngg_cull_shader);
}
/**
@@ -1323,16 +1301,9 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
key->vs_prolog.as_ls = shader_out->key.as_ls;
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
- key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
-
- if (ngg_cull_shader) {
- key->vs_prolog.gs_fast_launch_tri_list =
- !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
- key->vs_prolog.gs_fast_launch_tri_strip =
- !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
- key->vs_prolog.gs_fast_launch_index_size_packed =
- SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling);
- }
+
+ if (!ngg_cull_shader && shader_out->key.opt.ngg_culling)
+ key->vs_prolog.load_vgprs_after_culling = 1;
if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
@@ -1346,8 +1317,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
/* Only one of these combinations can be set. as_ngg can be set with as_es. */
assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
- (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
- 1);
+ (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
/* Enable loading the InstanceID VGPR. */
uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
@@ -1453,8 +1423,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
si_dump_streamout(&sel->so);
}
- memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
- sizeof(shader->info.vs_output_param_offset));
+ /* Initialize vs_output_ps_input_cntl to default. */
+ for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
+ shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
+ shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
shader->info.uses_instanceid = sel->info.uses_instanceid;
@@ -1465,9 +1437,44 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
return false;
- /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
- * LLVM 3.9svn has this bug.
- */
+ /* Compute vs_output_ps_input_cntl. */
+ if ((sel->info.stage == MESA_SHADER_VERTEX ||
+ sel->info.stage == MESA_SHADER_TESS_EVAL ||
+ sel->info.stage == MESA_SHADER_GEOMETRY) &&
+ !shader->key.as_ls && !shader->key.as_es) {
+ ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;
+
+ if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg)
+ vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset;
+
+ /* VS and TES should also set primitive ID output if it's used. */
+ unsigned num_outputs_with_prim_id = sel->info.num_outputs +
+ shader->key.mono.u.vs_export_prim_id;
+
+ for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
+ unsigned semantic = sel->info.output_semantic[i];
+ unsigned offset = vs_output_param_offset[i];
+ unsigned ps_input_cntl;
+
+ if (offset <= AC_EXP_PARAM_OFFSET_31) {
+ /* The input is loaded from parameter memory. */
+ ps_input_cntl = S_028644_OFFSET(offset);
+ } else {
+ /* The input is a DEFAULT_VAL constant. */
+ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
+ offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
+ offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
+
+ /* OFFSET=0x20 means that DEFAULT_VAL is used. */
+ ps_input_cntl = S_028644_OFFSET(0x20) |
+ S_028644_DEFAULT_VAL(offset);
+ }
+
+ shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
+ }
+ }
+
+ /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
if (sel->info.stage == MESA_SHADER_COMPUTE) {
unsigned wave_size = sscreen->compute_wave_size;
unsigned max_vgprs =
@@ -1559,11 +1566,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
shader.key.as_ls = key->vs_prolog.as_ls;
shader.key.as_es = key->vs_prolog.as_es;
shader.key.as_ngg = key->vs_prolog.as_ngg;
- shader.key.opt.ngg_culling =
- (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
- (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
- SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
- shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
break;
case MESA_SHADER_TESS_CTRL:
assert(!prolog);
@@ -1586,9 +1588,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
struct si_shader_context ctx;
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, stage,
- shader.key.as_ngg, shader.key.as_es,
- shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
- shader.key.opt.vs_as_prim_discard_cs));
+ shader.key.as_ngg, shader.key.as_es));
ctx.shader = &shader;
ctx.stage = stage;
@@ -2026,8 +2026,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
- memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
- sizeof(mainp->info.vs_output_param_offset));
+ memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl,
+ sizeof(mainp->info.vs_output_ps_input_cntl));
shader->info.uses_instanceid = mainp->info.uses_instanceid;
shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
shader->info.nr_param_exports = mainp->info.nr_param_exports;
@@ -2115,9 +2115,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
(sel->info.stage == MESA_SHADER_VERTEX &&
/* Used to export PrimitiveID from the correct vertex. */
- (shader->key.mono.u.vs_export_prim_id ||
- /* Used to generate triangle strip vertex IDs for all threads. */
- shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)));
+ shader->key.mono.u.vs_export_prim_id));
shader->uses_vs_state_outprim = sscreen->use_ngg &&
/* Only used by streamout in vertex shaders. */
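The si_compile_shader() hunk above precomputes one SPI_PS_INPUT_CNTL word per VS output, and the si_shader.h hunks below add the matching SI_PS_INPUT_CNTL_* defaults. A minimal standalone sketch of that encoding follows; the S_028644_* bit layouts and the AC_EXP_PARAM_* numeric values are assumptions chosen to resemble sid.h and ac_exp_param.h, not copied from them.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed field layouts and enum values (see sid.h / ac_exp_param.h). */
#define S_028644_OFFSET(x)            (((uint32_t)(x) & 0x3f) << 0)
#define S_028644_DEFAULT_VAL(x)       (((uint32_t)(x) & 0x3) << 8)
#define AC_EXP_PARAM_OFFSET_31        31
#define AC_EXP_PARAM_DEFAULT_VAL_0000 64
#define AC_EXP_PARAM_DEFAULT_VAL_1111 67

/* Mirrors the per-output encoding done in the si_compile_shader() hunk above:
 * either point the PS input at a parameter-cache slot, or select one of the
 * four hardware DEFAULT_VALs when the VS never exported the output. */
static uint32_t ps_input_cntl_for_offset(unsigned offset)
{
   if (offset <= AC_EXP_PARAM_OFFSET_31)
      return S_028644_OFFSET(offset); /* loaded from parameter memory */

   assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
          offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
   /* OFFSET=0x20 means DEFAULT_VAL is used instead of a parameter slot. */
   return S_028644_OFFSET(0x20) |
          S_028644_DEFAULT_VAL(offset - AC_EXP_PARAM_DEFAULT_VAL_0000);
}

int main(void)
{
   printf("param slot 3 -> 0x%08x\n", (unsigned)ps_input_cntl_for_offset(3));
   printf("default val  -> 0x%08x\n",
          (unsigned)ps_input_cntl_for_offset(AC_EXP_PARAM_DEFAULT_VAL_1111));
   return 0;
}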
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
index ab11a1852..d6dbb13ed 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
@@ -138,6 +138,7 @@
#include "util/u_inlines.h"
#include "util/u_live_shader_cache.h"
#include "util/u_queue.h"
+#include "si_pm4.h"
#include <stdio.h>
@@ -158,6 +159,12 @@ struct si_context;
#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))
+#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
+#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
+#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000
+/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
+#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
+
/* SGPR user data indices */
enum
{
@@ -272,14 +279,10 @@ enum
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
-#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
+#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */
-#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3)
-#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */
+#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@@ -323,6 +326,16 @@ enum si_color_output_type {
SI_TYPE_UINT16,
};
+union si_input_info {
+ struct {
+ ubyte semantic;
+ ubyte interpolate;
+ ubyte fp16_lo_hi_valid;
+ ubyte usage_mask;
+ };
+ uint32_t _unused; /* this just forces 4-byte alignment */
+};
+
struct si_shader_info {
shader_info base;
@@ -330,12 +343,8 @@ struct si_shader_info {
ubyte num_inputs;
ubyte num_outputs;
- ubyte input_semantic[PIPE_MAX_SHADER_INPUTS];
- ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
- ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
- ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS];
+ union si_input_info input[PIPE_MAX_SHADER_INPUTS];
ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS];
- char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1];
ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
@@ -402,6 +411,13 @@ struct si_shader_info {
* fragment shader invocations if flat shading.
*/
bool allow_flat_shading;
+
+ /* Optimization: if the texture bound to this texunit has been cleared to 1,
+ * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
+ * value is 0xff (undetermined) and can be later changed to 0 (= false) or
+ * texunit + 1.
+ */
+ uint8_t writes_1_if_tex_is_1;
};
/* A shader selector is a gallium CSO and contains shader variants and
@@ -439,7 +455,6 @@ struct si_shader_selector {
ubyte const_and_shader_buf_descriptors_index;
ubyte sampler_and_images_descriptors_index;
bool vs_needs_prolog;
- bool prim_discard_cs_allowed;
ubyte cs_shaderbufs_sgpr_index;
ubyte cs_num_shaderbufs_in_user_sgprs;
ubyte cs_images_sgpr_index;
@@ -447,7 +462,6 @@ struct si_shader_selector {
ubyte cs_num_images_in_user_sgprs;
ubyte num_vs_inputs;
ubyte num_vbos_in_user_sgprs;
- unsigned pa_cl_vs_out_cntl;
unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
ubyte clipdist_mask;
ubyte culldist_mask;
@@ -521,7 +535,6 @@ struct si_vs_prolog_bits {
uint16_t instance_divisor_is_one; /* bitmask of inputs */
uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
unsigned ls_vgpr_fix : 1;
- unsigned unpack_instance_id_from_vertex_id : 1;
};
/* Common TCS bits between the shader key and the epilog key. */
@@ -571,10 +584,7 @@ union si_shader_part_key {
unsigned as_ls : 1;
unsigned as_es : 1;
unsigned as_ngg : 1;
- unsigned as_prim_discard_cs : 1;
- unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
- unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
- unsigned gs_fast_launch_index_size_packed : 2;
+ unsigned load_vgprs_after_culling : 1;
/* Prologs for monolithic shaders shouldn't set EXEC. */
unsigned is_monolithic : 1;
} vs_prolog;
@@ -633,9 +643,10 @@ struct si_shader_key {
/* These three are initially set according to the NEXT_SHADER property,
* or guessed if the property doesn't seem correct.
*/
- unsigned as_es : 1; /* export shader, which precedes GS */
- unsigned as_ls : 1; /* local shader, which precedes TCS */
- unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */
+ unsigned as_es : 1; /* whether it's a shader before GS */
+ unsigned as_ls : 1; /* whether it's VS before TCS */
+ unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
+ also set for the stage right before GS */
/* Flags for monolithic compilation only. */
struct {
@@ -666,7 +677,7 @@ struct si_shader_key {
unsigned kill_pointsize : 1;
/* For NGG VS and TES. */
- unsigned ngg_culling : 7; /* SI_NGG_CULL_* */
+ unsigned ngg_culling : 4; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*
@@ -676,19 +687,6 @@ struct si_shader_key {
*/
unsigned prefer_mono : 1;
- /* Primitive discard compute shader. */
- unsigned vs_as_prim_discard_cs : 1;
- unsigned cs_prim_type : 4;
- unsigned cs_indexed : 1;
- unsigned cs_instancing : 1;
- unsigned cs_primitive_restart : 1;
- unsigned cs_provoking_vertex_first : 1;
- unsigned cs_need_correct_orientation : 1;
- unsigned cs_cull_front : 1;
- unsigned cs_cull_back : 1;
- unsigned cs_cull_z : 1;
- unsigned cs_halfz_clip_space : 1;
-
/* VS and TCS have the same number of patch vertices. */
unsigned same_patch_vertices:1;
@@ -707,6 +705,7 @@ struct si_shader_key {
/* GCN-specific shader info. */
struct si_shader_binary_info {
ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+ uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
ubyte num_input_sgprs;
ubyte num_input_vgprs;
signed char face_vgpr_index;
@@ -736,7 +735,35 @@ struct gfx9_gs_info {
unsigned esgs_ring_size; /* in bytes */
};
+#define SI_NUM_VGT_STAGES_KEY_BITS 5
+#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
+
+/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
+ * Some fields are set by state-change calls, most are set by draw_vbo.
+ */
+union si_vgt_stages_key {
+ struct {
+#if UTIL_ARCH_LITTLE_ENDIAN
+ uint8_t tess : 1;
+ uint8_t gs : 1;
+ uint8_t ngg_passthrough : 1;
+ uint8_t ngg : 1; /* gfx10+ */
+ uint8_t streamout : 1; /* only used with NGG */
+ uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
+#else /* UTIL_ARCH_BIG_ENDIAN */
+ uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS;
+ uint8_t streamout : 1;
+ uint8_t ngg : 1;
+ uint8_t ngg_passthrough : 1;
+ uint8_t gs : 1;
+ uint8_t tess : 1;
+#endif
+ } u;
+ uint8_t index;
+};
+
struct si_shader {
+ struct si_pm4_state pm4; /* base class */
struct si_compiler_ctx_state compiler_ctx_state;
struct si_shader_selector *selector;
@@ -748,7 +775,6 @@ struct si_shader {
struct si_shader_part *prolog2;
struct si_shader_part *epilog;
- struct si_pm4_state *pm4;
struct si_resource *bo;
struct si_resource *scratch_bo;
struct si_shader_key key;
@@ -803,6 +829,8 @@ struct si_shader {
unsigned vgt_gs_onchip_cntl;
unsigned vgt_gs_max_prims_per_subgroup;
unsigned vgt_esgs_ring_itemsize;
+ unsigned spi_shader_pgm_rsrc3_gs;
+ unsigned spi_shader_pgm_rsrc4_gs;
} gs;
struct {
@@ -819,6 +847,9 @@ struct si_shader {
unsigned pa_cl_ngg_cntl;
unsigned vgt_gs_max_vert_out; /* for API GS */
unsigned ge_pc_alloc; /* uconfig register */
+ unsigned spi_shader_pgm_rsrc3_gs;
+ unsigned spi_shader_pgm_rsrc4_gs;
+ union si_vgt_stages_key vgt_stages;
} ngg;
struct {
@@ -839,6 +870,7 @@ struct si_shader {
unsigned spi_shader_z_format;
unsigned spi_shader_col_format;
unsigned cb_shader_mask;
+ unsigned num_interp;
} ps;
} ctx_reg;
@@ -884,17 +916,18 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_nir_late_opts(nir_shader *nir);
-void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize);
+char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
/* si_state_shaders.c */
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
struct gfx9_gs_info *out);
+bool gfx10_is_ngg_passthrough(struct si_shader *shader);
/* Inline helpers. */
/* Return the pointer to the main shader part's pointer. */
static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
- struct si_shader_key *key)
+ const struct si_shader_key *key)
{
if (key->as_ls)
return &sel->main_shader_part_ls;
@@ -907,15 +940,6 @@ static inline struct si_shader **si_get_main_shader_part(struct si_shader_select
return &sel->main_shader_part;
}
-static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader)
-{
- struct si_shader_selector *sel = shader->selector;
-
- return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag &&
- !shader->key.opt.ngg_culling &&
- (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id);
-}
-
static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
{
return selector ? selector->info.uses_bindless_samplers : false;
@@ -926,6 +950,22 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel
return selector ? selector->info.uses_bindless_images : false;
}
+static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
+{
+ if (shader->selector->info.stage == MESA_SHADER_VERTEX &&
+ !shader->selector->info.base.vs.blit_sgprs_amd &&
+ !(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES))
+ return true;
+
+ return false;
+}
+
+static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
+{
+ return gfx10_edgeflags_have_effect(shader) &&
+ shader->selector->info.writes_edgeflag;
+}
+
#ifdef __cplusplus
}
#endif
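The si_vgt_stages_key union above packs the stage-topology bits into a single byte so the byte can be used directly as an index into a table of precomputed register values. A minimal sketch of that pattern, assuming a hypothetical precomputed_vgt_stages[] table as the payload (the real driver stores pm4 state, not a bare uint32_t):

#include <stdint.h>
#include <stdio.h>

#define NUM_KEY_BITS 5
#define NUM_STATES   (1 << NUM_KEY_BITS)

/* Same idea as union si_vgt_stages_key: bitfields overlaid on an index byte
 * (little-endian layout only, for brevity). */
union vgt_stages_key {
   struct {
      uint8_t tess : 1;
      uint8_t gs : 1;
      uint8_t ngg_passthrough : 1;
      uint8_t ngg : 1;
      uint8_t streamout : 1;
      uint8_t _pad : 8 - NUM_KEY_BITS;
   } u;
   uint8_t index;
};

/* Stand-in for the table of precomputed values mentioned in the comment. */
static uint32_t precomputed_vgt_stages[NUM_STATES];

int main(void)
{
   /* Fill the table once; the payload here just echoes the key. */
   for (unsigned i = 0; i < NUM_STATES; i++)
      precomputed_vgt_stages[i] = 0xC0DE0000u | i;

   /* At draw time, set the bits and index the table directly. */
   union vgt_stages_key key;
   key.index = 0;
   key.u.ngg = 1;
   key.u.tess = 1;

   printf("key.index = %u -> 0x%08x\n", key.index,
          (unsigned)precomputed_vgt_stages[key.index]);
   return 0;
}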
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h
index 46d8e69b9..3970125f5 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -30,8 +30,6 @@
struct pipe_debug_callback;
-#define RADEON_LLVM_MAX_INPUTS 32 * 4
-
/* Ideally pass the sample mask input to the PS epilog as v14, which
* is its usual location, so that the shader doesn't have to add v_mov.
*/
@@ -60,8 +58,6 @@ struct si_shader_context {
struct ac_shader_args args;
struct ac_shader_abi abi;
- LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
-
LLVMBasicBlockRef merged_wrap_if_entry_block;
int merged_wrap_if_label;
@@ -134,10 +130,6 @@ struct si_shader_context {
/* API TES */
struct ac_arg tes_offchip_addr;
- /* API GS */
- struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */
- struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */
- struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */
/* PS */
struct ac_arg pos_fixed_pt;
/* CS */
@@ -194,9 +186,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
LLVMValueRef prim_passthrough);
-void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs);
-void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi);
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi);
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
@@ -242,7 +233,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
/* si_shader_llvm_gs.c */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi);
void si_preload_esgs_ring(struct si_shader_context *ctx);
void si_preload_gs_rings(struct si_shader_context *ctx);
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
@@ -250,7 +241,7 @@ void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);
/* si_shader_llvm_tess.c */
void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi);
void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
@@ -266,7 +257,6 @@ void si_llvm_init_ps_callbacks(struct si_shader_context *ctx);
void si_llvm_init_resource_callbacks(struct si_shader_context *ctx);
/* si_shader_llvm_vs.c */
-void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
LLVMValueRef const *so_write_offsets,
struct pipe_stream_output *stream_out,
@@ -275,7 +265,7 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_outp
unsigned noutput, unsigned stream);
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
struct si_shader_output_values *outputs, unsigned noutput);
-void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi);
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 8420162ca..1a1dd07a5 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -22,6 +22,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "ac_exp_param.h"
#include "ac_nir_to_llvm.h"
#include "ac_rtld.h"
#include "si_pipe.h"
@@ -93,9 +94,7 @@ bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
if (!si_replace_shader(count, binary)) {
struct ac_compiler_passes *passes = compiler->passes;
- if (ac->wave_size == 32)
- passes = compiler->passes_wave32;
- else if (less_optimized && compiler->low_opt_passes)
+ if (less_optimized && compiler->low_opt_passes)
passes = compiler->low_opt_passes;
struct si_llvm_diagnostics diag = {debug};
@@ -190,6 +189,7 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy
}
ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
+ ac_llvm_set_target_features(ctx->main_fn, &ctx->ac);
}
void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
@@ -220,7 +220,7 @@ void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shade
if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
- if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+ if (USE_LDS_SYMBOLS) {
/* The LSHS size is not known until draw time, so we append it
* at the end of whatever LDS use there may be in the rest of
* the shader (currently none, unless LLVM decides to do its
@@ -412,7 +412,7 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- assert(ctx->shader->selector->info.base.cs.local_size_variable &&
+ assert(ctx->shader->selector->info.base.workgroup_size_variable &&
ctx->shader->selector->info.uses_variable_block_size);
LLVMValueRef chan[3] = {
@@ -442,9 +442,7 @@ static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
{
- if (nir->info.stage == MESA_SHADER_VERTEX) {
- si_llvm_load_vs_inputs(ctx, nir);
- } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ if (nir->info.stage == MESA_SHADER_FRAGMENT) {
unsigned colors_read = ctx->shader->selector->info.colors_read;
LLVMValueRef main_fn = ctx->main_fn;
@@ -491,7 +489,6 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *
si_llvm_declare_compute_memory(ctx);
}
- ctx->abi.inputs = &ctx->inputs[0];
ctx->abi.clamp_shadow_reference = true;
ctx->abi.robust_buffer_access = true;
ctx->abi.convert_undef_to_zero = true;
@@ -808,9 +805,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
!same_thread_count && si_is_multi_part_shader(ctx->shader))
ac_build_endif(&ctx->ac, 6507);
- /* Return the value from the last part. It's non-void only for the prim
- * discard compute shader.
- */
if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
LLVMBuildRetVoid(builder);
else
@@ -902,12 +896,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
/* Unconditionally declare scratch space base for streamout and
* vertex compaction. Whether space is actually allocated is
* determined during linking / PM4 creation.
- *
- * Add an extra dword per vertex to ensure an odd stride, which
- * avoids bank conflicts for SoA accesses.
*/
- if (!gfx10_is_ngg_passthrough(shader))
- si_llvm_declare_esgs_ring(ctx);
+ si_llvm_declare_esgs_ring(ctx);
/* This is really only needed when streamout and / or vertex
* compaction is enabled.
@@ -1091,7 +1081,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
LLVMValueRef parts[4];
unsigned num_parts = 0;
- bool has_prolog = false;
+ bool first_is_prolog = false;
LLVMValueRef main_fn = ctx.main_fn;
if (ngg_cull_main_fn) {
@@ -1102,7 +1092,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
- has_prolog = true;
+ first_is_prolog = true;
}
parts[num_parts++] = ngg_cull_main_fn;
}
@@ -1114,21 +1104,31 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
- has_prolog = true;
+ if (num_parts == 1)
+ first_is_prolog = true;
}
parts[num_parts++] = main_fn;
- si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false);
-
- if (ctx.shader->key.opt.vs_as_prim_discard_cs)
- si_build_prim_discard_compute_shader(&ctx);
+ si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
- LLVMValueRef parts[2];
+ LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;
+
+ /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
+ union si_shader_part_key prolog_key;
+ memset(&prolog_key, 0, sizeof(prolog_key));
+ prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+ prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
+ prolog_key.vs_prolog.as_ngg = 1;
+ prolog_key.vs_prolog.load_vgprs_after_culling = 1;
+ prolog_key.vs_prolog.is_monolithic = true;
+ si_llvm_build_vs_prolog(&ctx, &prolog_key);
+ prolog = ctx.main_fn;
parts[0] = ngg_cull_main_fn;
- parts[1] = ctx.main_fn;
+ parts[1] = prolog;
+ parts[2] = main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
+ si_build_wrapper_function(&ctx, parts, 3, 0, 0, false);
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
index 18d8bca3c..450ee8348 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
@@ -24,11 +24,13 @@
#include "si_build_pm4.h"
#include "si_query.h"
+#include "si_shader_internal.h"
#include "sid.h"
#include "util/fast_idiv_by_const.h"
#include "util/format/u_format.h"
#include "util/format/u_format_s3tc.h"
#include "util/u_dual_blend.h"
+#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_resource.h"
#include "util/u_upload_mgr.h"
@@ -92,8 +94,8 @@ static void si_emit_cb_render_state(struct si_context *sctx)
sctx->last_cb_target_mask = cb_target_mask;
radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
radeon_end();
}
@@ -445,6 +447,14 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
blend->alpha_to_one = state->alpha_to_one;
blend->dual_src_blend = util_blend_state_is_dual(state, 0);
blend->logicop_enable = logicop_enable;
+ blend->allows_noop_optimization =
+ state->rt[0].rgb_func == PIPE_BLEND_ADD &&
+ state->rt[0].alpha_func == PIPE_BLEND_ADD &&
+ state->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_DST_COLOR &&
+ state->rt[0].alpha_src_factor == PIPE_BLENDFACTOR_DST_COLOR &&
+ state->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_ZERO &&
+ state->rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_ZERO &&
+ mode == V_028808_CB_NORMAL;
unsigned num_shader_outputs = state->max_rt + 1; /* estimate */
if (blend->dual_src_blend)
@@ -627,6 +637,79 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b
return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
}
+static bool si_check_blend_dst_sampler_noop(struct si_context *sctx)
+{
+ if (sctx->framebuffer.state.nr_cbufs == 1) {
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ bool free_nir;
+ if (unlikely(sel->info.writes_1_if_tex_is_1 == 0xff)) {
+ struct nir_shader *nir = si_get_nir_shader(sel, NULL, &free_nir);
+
+ /* Determine if this fragment shader always writes vec4(1) if a specific texture
+ * is all 1s.
+ */
+ float in[4] = { 1.0, 1.0, 1.0, 1.0 };
+ float out[4];
+ int texunit;
+ if (si_nir_is_output_const_if_tex_is_const(nir, in, out, &texunit) &&
+ !memcmp(in, out, 4 * sizeof(float))) {
+ sel->info.writes_1_if_tex_is_1 = 1 + texunit;
+ } else {
+ sel->info.writes_1_if_tex_is_1 = 0;
+ }
+
+ if (free_nir)
+ ralloc_free(nir);
+ }
+
+ if (sel->info.writes_1_if_tex_is_1 &&
+ sel->info.writes_1_if_tex_is_1 != 0xff) {
+ /* Now check if the texture is cleared to 1 */
+ int unit = sctx->shader.ps.cso->info.writes_1_if_tex_is_1 - 1;
+ struct si_samplers *samp = &sctx->samplers[PIPE_SHADER_FRAGMENT];
+ if ((1u << unit) & samp->enabled_mask) {
+ struct si_texture* tex = (struct si_texture*) samp->views[unit]->texture;
+ if (tex->is_depth &&
+ tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) &&
+ tex->depth_clear_value[0] == 1) {
+ return false;
+ }
+ /* TODO: handle color textures */
+ }
+ }
+ }
+
+ return true;
+}
+
+static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws) {
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (!si_check_blend_dst_sampler_noop(sctx))
+ return;
+
+ sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws);
+}
+
+static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx,
+ struct pipe_vertex_state *state,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws) {
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (!si_check_blend_dst_sampler_noop(sctx))
+ return;
+
+ sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws);
+}
+
static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
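The no-op check added above relies on a blending identity: with PIPE_BLEND_ADD, rgb/alpha src factor DST_COLOR and dst factor ZERO, the blender computes dst' = src * dst, so a fragment shader known to output vec4(1) (the writes_1_if_tex_is_1 case, e.g. a depth texture cleared to 1.0) leaves the framebuffer unchanged and the draw can be skipped by the wrappers installed in the next hunk. A self-contained sketch of that arithmetic, not driver code:

#include <stdbool.h>
#include <stdio.h>

/* dst' = src*src_factor + dst*dst_factor with the factors matched by
 * allows_noop_optimization: src_factor = DST_COLOR, dst_factor = ZERO. */
static float blend_dst_color_zero(float src, float dst)
{
   return src * dst + dst * 0.0f;
}

int main(void)
{
   const float dst[4] = {0.25f, 0.5f, 0.75f, 1.0f};
   bool noop = true;

   for (int i = 0; i < 4; i++) {
      /* Shader output assumed to be 1.0 because the bound texture is
       * cleared to 1 (the condition si_check_blend_dst_sampler_noop tests). */
      float out = blend_dst_color_zero(1.0f, dst[i]);
      noop &= (out == dst[i]);
      printf("dst %.2f -> %.2f\n", dst[i], out);
   }

   printf("draw is a no-op: %s\n", noop ? "yes" : "no");
   return 0;
}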
@@ -649,8 +732,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
old_blend->alpha_to_one != blend->alpha_to_one ||
old_blend->dual_src_blend != blend->dual_src_blend ||
old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
- old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
+ old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) {
+ si_ps_key_update_framebuffer_blend(sctx);
+ si_ps_key_update_blend_rasterizer(sctx);
+ si_update_ps_inputs_read_or_disabled(sctx);
sctx->do_update_shaders = true;
+ }
if (sctx->screen->dpbb_allowed &&
(old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
@@ -664,6 +751,15 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
old_blend->commutative_4bit != blend->commutative_4bit ||
old_blend->logicop_enable != blend->logicop_enable)))
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+ if (likely(!radeon_uses_secure_bos(sctx->ws))) {
+ if (unlikely(blend->allows_noop_optimization)) {
+ si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop,
+ si_draw_vstate_blend_dst_sampler_noop);
+ } else {
+ si_install_draw_wrapper(sctx, NULL, NULL);
+ }
+ }
}
static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -691,8 +787,8 @@ static void si_emit_blend_color(struct si_context *sctx)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
- radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
- radeon_emit_array(cs, (uint32_t *)sctx->blend_color.color, 4);
+ radeon_set_context_reg_seq(R_028414_CB_BLEND_RED, 4);
+ radeon_emit_array((uint32_t *)sctx->blend_color.color, 4);
radeon_end();
}
@@ -725,8 +821,8 @@ static void si_emit_clip_state(struct si_context *sctx)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
- radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
- radeon_emit_array(cs, (uint32_t *)sctx->clip_state.ucp, 6 * 4);
+ radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4);
+ radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4);
radeon_end();
}
@@ -741,7 +837,6 @@ static void si_emit_clip_regs(struct si_context *sctx)
unsigned clipdist_mask = vs_sel->clipdist_mask;
unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
unsigned culldist_mask = vs_sel->culldist_mask;
- unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask;
/* Clip distances on points have no effect, so need to be implemented
* as cull distances. This applies for the clipvertex case as well.
@@ -752,23 +847,14 @@ static void si_emit_clip_regs(struct si_context *sctx)
clipdist_mask &= rs->clip_plane_enable;
culldist_mask |= clipdist_mask;
- unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) |
- S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) |
- S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 &&
+ unsigned pa_cl_cntl = S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 &&
!sctx->screen->options.vrs2x2) |
S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
clipdist_mask | (culldist_mask << 8);
radeon_begin(&sctx->gfx_cs);
-
- if (sctx->chip_class >= GFX10) {
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
- ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
- } else {
- radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
- vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
- }
+ radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL,
+ pa_cl_cntl | vs->pa_cl_vs_out_cntl);
radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));
radeon_end_update_context_roll(sctx);
@@ -834,15 +920,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
return NULL;
}
- if (!state->front_ccw) {
- rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
- rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
- } else {
- rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
- rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
- }
- rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
- rs->provoking_vertex_first = state->flatshade_first;
rs->scissor_enable = state->scissor;
rs->clip_halfz = state->clip_halfz;
rs->two_side = state->light_twoside;
@@ -862,9 +939,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
rs->flatshade_first = state->flatshade_first;
rs->sprite_coord_enable = state->sprite_coord_enable;
rs->rasterizer_discard = state->rasterizer_discard;
- rs->polygon_mode_enabled =
- (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
- (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
rs->polygon_mode_is_lines =
(state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) ||
(state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK));
@@ -882,24 +956,30 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
if (rs->rasterizer_discard) {
- rs->ngg_cull_flags = SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
+ rs->ngg_cull_flags = SI_NGG_CULL_ENABLED |
+ SI_NGG_CULL_FRONT_FACE |
+ SI_NGG_CULL_BACK_FACE;
rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags;
} else {
- /* Polygon mode can't use view and small primitive culling,
- * because it draws points or lines where the culling depends
- * on the point or line width.
- */
- if (!rs->polygon_mode_enabled) {
- rs->ngg_cull_flags |= SI_NGG_CULL_VIEW_SMALLPRIMS;
- rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_VIEW_SMALLPRIMS;
+ rs->ngg_cull_flags = SI_NGG_CULL_ENABLED;
+ rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags;
+
+ bool cull_front, cull_back;
+
+ if (!state->front_ccw) {
+ cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
+ cull_back = !!(state->cull_face & PIPE_FACE_BACK);
+ } else {
+ cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
+ cull_front = !!(state->cull_face & PIPE_FACE_BACK);
}
- if (rs->cull_front) {
+ if (cull_front) {
rs->ngg_cull_flags |= SI_NGG_CULL_FRONT_FACE;
rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_BACK_FACE;
}
- if (rs->cull_back) {
+ if (cull_back) {
rs->ngg_cull_flags |= SI_NGG_CULL_BACK_FACE;
rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_FRONT_FACE;
}
@@ -942,7 +1022,10 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
S_028A48_VPORT_SCISSOR_ENABLE(1) |
S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
- si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
+ bool polygon_mode_enabled =
+ (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
+ (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
+
si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
@@ -951,11 +1034,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
- S_028814_POLY_MODE(rs->polygon_mode_enabled) |
+ S_028814_POLY_MODE(polygon_mode_enabled) |
S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) |
/* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */
- S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? rs->polygon_mode_enabled : 0));
+ S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? polygon_mode_enabled : 0));
if (!rs->uses_poly_offset)
return rs;
@@ -991,11 +1074,12 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
}
}
+ si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
+ si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
- si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
}
return rs;
@@ -1044,6 +1128,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+ if (old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
+ old_rs->flatshade != rs->flatshade)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
+
if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
old_rs->rasterizer_discard != rs->rasterizer_discard ||
old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
@@ -1053,8 +1141,19 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
old_rs->force_persample_interp != rs->force_persample_interp ||
- old_rs->polygon_mode_is_points != rs->polygon_mode_is_points)
+ old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) {
+ si_ps_key_update_blend_rasterizer(sctx);
+ si_ps_key_update_rasterizer(sctx);
+ si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
+ si_update_ps_inputs_read_or_disabled(sctx);
sctx->do_update_shaders = true;
+ }
+
+ if (old_rs->line_smooth != rs->line_smooth ||
+ old_rs->poly_smooth != rs->poly_smooth ||
+ old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+ old_rs->flatshade != rs->flatshade)
+ si_update_vrs_flat_shading(sctx);
}
static void si_delete_rs_state(struct pipe_context *ctx, void *state)
@@ -1079,14 +1178,15 @@ static void si_emit_stencil_ref(struct si_context *sctx)
struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
radeon_begin(cs);
- radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
- radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
- S_028430_STENCILMASK(dsa->valuemask[0]) |
- S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1));
- radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
- S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
- S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
- S_028434_STENCILOPVAL_BF(1));
+ radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2);
+ radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) |
+ S_028430_STENCILMASK(dsa->valuemask[0]) |
+ S_028430_STENCILWRITEMASK(dsa->writemask[0]) |
+ S_028430_STENCILOPVAL(1));
+ radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
+ S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
+ S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
+ S_028434_STENCILOPVAL_BF(1));
radeon_end();
}
@@ -1270,8 +1370,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
}
- if (old_dsa->alpha_func != dsa->alpha_func)
+ if (old_dsa->alpha_func != dsa->alpha_func) {
+ si_ps_key_update_dsa(sctx);
+ si_update_ps_inputs_read_or_disabled(sctx);
+ si_update_ps_kill_enable(sctx);
sctx->do_update_shaders = true;
+ }
if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
old_dsa->stencil_enabled != dsa->stencil_enabled ||
@@ -1446,8 +1550,8 @@ static void si_emit_db_render_state(struct si_context *sctx)
/*
* format translation
*/
-static uint32_t si_translate_colorformat(enum chip_class chip_class,
- enum pipe_format format)
+uint32_t si_translate_colorformat(enum chip_class chip_class,
+ enum pipe_format format)
{
const struct util_format_description *desc = util_format_description(format);
if (!desc)
@@ -2234,6 +2338,13 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format
retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER);
}
+ if (usage & PIPE_BIND_INDEX_BUFFER) {
+ if (format == PIPE_FORMAT_R8_UINT ||
+ format == PIPE_FORMAT_R16_UINT ||
+ format == PIPE_FORMAT_R32_UINT)
+ retval |= PIPE_BIND_INDEX_BUFFER;
+ }
+
if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) &&
!(usage & PIPE_BIND_DEPTH_STENCIL))
retval |= PIPE_BIND_LINEAR;
@@ -2585,8 +2696,6 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
tex->dirty_level_mask |= 1 << surf->u.tex.level;
tex->fmask_is_identity = false;
}
- if (tex->dcc_gather_statistics)
- tex->separate_dcc_dirty = true;
}
}
@@ -2658,15 +2767,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
si_update_fb_dirtiness_after_rendering(sctx);
- for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (!sctx->framebuffer.state.cbufs[i])
- continue;
-
- tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
- if (tex->dcc_gather_statistics)
- vi_separate_dcc_stop_query(sctx, tex);
- }
-
/* Disable DCC if the formats are incompatible. */
for (i = 0; i < state->nr_cbufs; i++) {
if (!state->cbufs[i])
@@ -2823,12 +2923,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
p_atomic_inc(&tex->framebuffers_bound);
- if (tex->dcc_gather_statistics) {
- /* Dirty tracking must be enabled for DCC usage analysis. */
- sctx->framebuffer.compressed_cb_mask |= 1 << i;
- vi_separate_dcc_start_query(sctx, tex);
- }
-
/* Update the minimum but don't keep 0. */
if (!sctx->framebuffer.min_bytes_per_pixel ||
tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
@@ -2889,6 +2983,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ if (!sctx->sample_pos_buffer) {
+ sctx->sample_pos_buffer = pipe_buffer_create_with_data(&sctx->b, 0, PIPE_USAGE_DEFAULT,
+ sizeof(sctx->sample_positions),
+ &sctx->sample_positions);
+ }
constbuf.buffer = sctx->sample_pos_buffer;
/* Set sample locations as fragment shader constants. */
@@ -2922,6 +3021,10 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
}
+ si_ps_key_update_framebuffer(sctx);
+ si_ps_key_update_framebuffer_blend(sctx);
+ si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
+ si_update_ps_inputs_read_or_disabled(sctx);
sctx->do_update_shaders = true;
if (!sctx->decompression_enabled) {
@@ -2953,7 +3056,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
cb = (struct si_surface *)state->cbufs[i];
if (!cb) {
- radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
+ radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C,
S_028C70_FORMAT(V_028C70_COLOR_INVALID));
continue;
}
@@ -2969,11 +3072,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
RADEON_PRIO_SEPARATE_META);
}
- if (tex->dcc_separate_buffer)
- radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->dcc_separate_buffer,
- RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC,
- RADEON_PRIO_SEPARATE_META);
-
/* Compute mutable surface parameters. */
cb_color_base = tex->buffer.gpu_address >> 8;
cb_color_fmask = 0;
@@ -3013,9 +3111,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
if (!is_msaa_resolve_dst)
cb_color_info |= S_028C70_DCC_ENABLE(1);
- cb_dcc_base =
- ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.meta_offset) >>
- 8;
+ cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8;
unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
dcc_tile_swizzle &= ((1 << tex->surface.meta_alignment_log2) - 1) >> 8;
@@ -3039,30 +3135,30 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
S_028EE0_CMASK_PIPE_ALIGNED(1) |
S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned);
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
- radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
-
- radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
- radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
+ radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
+ radeon_emit(cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(0); /* hole */
+ radeon_emit(0); /* hole */
+ radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(0); /* hole */
+ radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(0); /* hole */
+ radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+ radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */
+
+ radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
+ radeon_set_context_reg(R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
cb_color_cmask >> 32);
- radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
+ radeon_set_context_reg(R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
cb_color_fmask >> 32);
- radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
- radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
- radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
+ radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
+ radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
+ radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
} else if (sctx->chip_class == GFX9) {
struct gfx9_surf_meta_flags meta = {
.rb_aligned = 1,
@@ -3084,24 +3180,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
S_028C74_RB_ALIGNED(meta.rb_aligned) |
S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
- radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
- radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
- radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
-
- radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
+ radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
+ radeon_emit(cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
+ radeon_emit(cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */
+ radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
+ radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
+ radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+ radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */
+ radeon_emit(S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
+
+ radeon_set_context_reg(R_0287A0_CB_MRT0_EPITCH + i * 4,
S_0287A0_EPITCH(tex->surface.u.gfx9.epitch));
} else {
/* Compute mutable surface parameters (GFX6-GFX8). */
@@ -3145,29 +3241,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
}
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
+ radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C,
sctx->chip_class >= GFX8 ? 14 : 13);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */
- radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+ radeon_emit(cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(cb_color_pitch); /* CB_COLOR0_PITCH */
+ radeon_emit(cb_color_slice); /* CB_COLOR0_SLICE */
+ radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
+ radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */
+ radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
- radeon_emit(cs, cb_dcc_base);
+ radeon_emit(cb_dcc_base);
}
}
for (; i < 8; i++)
if (sctx->framebuffer.dirty_cbufs & (1 << i))
- radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
+ radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
/* ZS buffer. */
if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
@@ -3203,49 +3299,47 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
unsigned level = zb->base.u.tex.level;
if (sctx->chip_class >= GFX10) {
- radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
- radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
-
- radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
- radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
- radeon_emit(cs, db_z_info | /* DB_Z_INFO */
- S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
- radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
-
- radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
- radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */
- radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
+ radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+ radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
+
+ radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7);
+ radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
+ radeon_emit(db_z_info | /* DB_Z_INFO */
+ S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
+ radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+
+ radeon_set_context_reg_seq(R_028068_DB_Z_READ_BASE_HI, 5);
+ radeon_emit(zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */
+ radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */
+ radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */
+ radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
+ radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
} else if (sctx->chip_class == GFX9) {
- radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
- radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
- radeon_emit(cs,
- S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
- radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
-
- radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
- radeon_emit(cs, db_z_info | /* DB_Z_INFO */
- S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
- radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
- radeon_emit(cs,
- S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
-
- radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
- radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */
- radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
+ radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3);
+ radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
+ radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
+ radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */
+
+ radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10);
+ radeon_emit(db_z_info | /* DB_Z_INFO */
+ S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
+ radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
+ radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+ radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
+
+ radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2);
+ radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */
+ radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
} else {
/* GFX6-GFX8 */
         /* Set fields dependent on tc_compatible_htile. */
@@ -3263,46 +3357,46 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
}
}
- radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+ radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
- radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
- radeon_emit(cs, zb->db_depth_info | /* DB_DEPTH_INFO */
+ radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9);
+ radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */
S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile));
- radeon_emit(cs, db_z_info | /* DB_Z_INFO */
- S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
- radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
- radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
- radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */
+ radeon_emit(db_z_info | /* DB_Z_INFO */
+ S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0));
+ radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+ radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */
+ radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */
}
- radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
- radeon_emit(cs, tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */
- radeon_emit(cs, fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */
+ radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2);
+ radeon_emit(tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */
+ radeon_emit(fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */
- radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
- radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
+ radeon_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
+ radeon_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
} else if (sctx->framebuffer.dirty_zsbuf) {
if (sctx->chip_class == GFX9)
- radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
+ radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 2);
else
- radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
+ radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2);
- radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
- radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
+ radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
+ radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
}
/* Framebuffer dimensions. */
/* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */
- radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
+ radeon_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR,
S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
- if (sctx->screen->dfsm_allowed) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+ if (sctx->screen->dpbb_allowed) {
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}
radeon_end();
@@ -3508,14 +3602,15 @@ static void si_emit_msaa_config(struct si_context *sctx)
}
}
- /* Required by OpenGL line rasterization.
+ /* The DX10 diamond test is optional in GL and decreases line rasterization
+ * performance, so don't use it.
*
* TODO: We should also enable perpendicular endcaps for AA lines,
* but that requires implementing line stippling in the pixel
* shader. SC can only do line stippling with axis-aligned
* endcaps.
*/
- unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+ unsigned sc_line_cntl = 0;
unsigned sc_aa_config = 0;
if (coverage_samples > 1) {
@@ -3559,17 +3654,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
/* R_028A4C_PA_SC_MODE_CNTL_1 */
radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
sc_mode_cntl_1);
-
- if (radeon_packets_added()) {
- sctx->context_roll = true;
-
- /* GFX9: Flush DFSM when the AA mode changes. */
- if (sctx->screen->dfsm_allowed) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
- }
- }
- radeon_end();
+ radeon_end_update_context_roll(sctx);
}
void si_update_ps_iter_samples(struct si_context *sctx)
@@ -3591,6 +3676,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
return;
sctx->ps_iter_samples = min_samples;
+
+ si_ps_key_update_sample_shading(sctx);
+ si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
sctx->do_update_shaders = true;
si_update_ps_iter_samples(sctx);
@@ -3753,8 +3841,8 @@ static void gfx10_make_texture_descriptor(
}
if (tex->upgraded_depth && !is_stencil) {
- assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
- img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
+ assert(img_format == V_008F0C_GFX10_FORMAT_32_FLOAT);
+ img_format = V_008F0C_GFX10_FORMAT_32_FLOAT_CLAMP;
}
} else {
util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
@@ -3818,43 +3906,43 @@ static void gfx10_make_texture_descriptor(
#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
case FMASK(2, 1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F1;
break;
case FMASK(2, 2):
- format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F2;
break;
case FMASK(4, 1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F1;
break;
case FMASK(4, 2):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F2;
break;
case FMASK(4, 4):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F4;
break;
case FMASK(8, 1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
+ format = V_008F0C_GFX10_FORMAT_FMASK8_S8_F1;
break;
case FMASK(8, 2):
- format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
+ format = V_008F0C_GFX10_FORMAT_FMASK16_S8_F2;
break;
case FMASK(8, 4):
- format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
+ format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F4;
break;
case FMASK(8, 8):
- format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
+ format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F8;
break;
case FMASK(16, 1):
- format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
+ format = V_008F0C_GFX10_FORMAT_FMASK16_S16_F1;
break;
case FMASK(16, 2):
- format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
+ format = V_008F0C_GFX10_FORMAT_FMASK32_S16_F2;
break;
case FMASK(16, 4):
- format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
+ format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F4;
break;
case FMASK(16, 8):
- format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
+ format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F8;
break;
default:
unreachable("invalid nr_samples");
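
   For reference, the FMASK(s, f) key driving the switch above just packs the two clamped
   sample counts into one integer, so every case is a direct table lookup. A minimal
   standalone sketch (not part of the patch; MAX2 mirrors util/macros.h):

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))
   #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))

   /* Example: 8 coverage samples with 4 storage samples.
    * FMASK(8, 4) == 8 * 16 + 4 == 132, which matches "case FMASK(8, 4)" above and
    * therefore selects V_008F0C_GFX10_FORMAT_FMASK32_S8_F4. */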
@@ -4223,7 +4311,7 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx
unsigned force_level)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
+ struct si_sampler_view *view = CALLOC_STRUCT_CL(si_sampler_view);
struct si_texture *tex = (struct si_texture *)texture;
unsigned base_level, first_level, last_level;
unsigned char state_swizzle[4];
@@ -4357,7 +4445,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sample
struct si_sampler_view *view = (struct si_sampler_view *)state;
pipe_resource_reference(&state->texture, NULL);
- FREE(view);
+ FREE_CL(view);
}
static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
@@ -4404,9 +4492,13 @@ static uint32_t si_translate_border_color(struct si_context *sctx,
if (i >= SI_MAX_BORDER_COLORS) {
/* Getting 4096 unique border colors is very unlikely. */
- fprintf(stderr, "radeonsi: The border color table is full. "
- "Any new border colors will be just black. "
- "Please file a bug.\n");
+ static bool printed;
+ if (!printed) {
+ fprintf(stderr, "radeonsi: The border color table is full. "
+ "Any new border colors will be just black. "
+ "This is a hardware limitation.\n");
+ printed = true;
+ }
return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
}
@@ -4552,9 +4644,9 @@ static void si_emit_sample_mask(struct si_context *sctx)
(mask & 1 && sctx->blitter_running));
radeon_begin(cs);
- radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
- radeon_emit(cs, mask | (mask << 16));
- radeon_emit(cs, mask | (mask << 16));
+ radeon_set_context_reg_seq(R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
+ radeon_emit(mask | (mask << 16));
+ radeon_emit(mask | (mask << 16));
radeon_end();
}
@@ -4606,8 +4698,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
v->count = count;
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen);
unsigned alloc_count =
- count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0;
+ count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0;
v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
for (i = 0; i < count; ++i) {
@@ -4623,8 +4716,6 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
unsigned instance_divisor = elements[i].instance_divisor;
if (instance_divisor) {
- v->uses_instance_divisors = true;
-
if (instance_divisor == 1) {
v->instance_divisor_is_one |= 1u << i;
} else {
@@ -4820,22 +4911,23 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
sctx->vertex_buffer_user_sgprs_dirty = false;
}
- if (old->count != v->count ||
- old->uses_instance_divisors != v->uses_instance_divisors ||
- /* we don't check which divisors changed */
- v->uses_instance_divisors ||
+ if (old->instance_divisor_is_one != v->instance_divisor_is_one ||
+ old->instance_divisor_is_fetched != v->instance_divisor_is_fetched ||
(old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) &
sctx->vertex_buffer_unaligned ||
((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
- sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+ sizeof(v->vertex_buffer_index[0]) * MAX2(old->count, v->count))) ||
/* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
* functions of fix_fetch and the src_offset alignment.
* If they change and fix_fetch doesn't, it must be due to different
* src_offset alignment, which is reflected in fix_fetch_opencode. */
old->fix_fetch_opencode != v->fix_fetch_opencode ||
- memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))
+ memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) *
+ MAX2(old->count, v->count))) {
+ si_vs_key_update_inputs(sctx);
sctx->do_update_shaders = true;
+ }
if (v->instance_divisor_is_fetched) {
struct pipe_constant_buffer cb;
@@ -4931,8 +5023,82 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot,
* be the case in well-behaved applications anyway.
*/
if ((sctx->vertex_elements->vb_alignment_check_mask &
- (unaligned | orig_unaligned) & updated_mask))
+ (unaligned | orig_unaligned) & updated_mask)) {
+ si_vs_key_update_inputs(sctx);
sctx->do_update_shaders = true;
+ }
+}
+
+static struct pipe_vertex_state *
+si_create_vertex_state(struct pipe_screen *screen,
+ struct pipe_vertex_buffer *buffer,
+ const struct pipe_vertex_element *elements,
+ unsigned num_elements,
+ struct pipe_resource *indexbuf,
+ uint32_t full_velem_mask)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state);
+
+ util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask,
+ &state->b);
+
+   /* Initialize the vertex element state in state->velems.
+ * Do it by creating a vertex element state object and copying it there.
+ */
+ struct si_context ctx = {};
+ ctx.b.screen = screen;
+ struct si_vertex_elements *velems = si_create_vertex_elements(&ctx.b, num_elements, elements);
+ state->velems = *velems;
+ si_delete_vertex_element(&ctx.b, velems);
+
+ assert(!state->velems.instance_divisor_is_one);
+ assert(!state->velems.instance_divisor_is_fetched);
+ assert(!state->velems.fix_fetch_always);
+ assert(buffer->stride % 4 == 0);
+ assert(buffer->buffer_offset % 4 == 0);
+ assert(!buffer->is_user_buffer);
+ for (unsigned i = 0; i < num_elements; i++) {
+ assert(elements[i].src_offset % 4 == 0);
+ assert(!elements[i].dual_slot);
+ }
+
+ for (unsigned i = 0; i < num_elements; i++) {
+ si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i,
+ &state->descriptors[i * 4]);
+ }
+
+ return &state->b;
+}
+
+static void si_vertex_state_destroy(struct pipe_screen *screen,
+ struct pipe_vertex_state *state)
+{
+ pipe_vertex_buffer_unreference(&state->input.vbuffer);
+ pipe_resource_reference(&state->input.indexbuf, NULL);
+ FREE(state);
+}
+
+static struct pipe_vertex_state *
+si_pipe_create_vertex_state(struct pipe_screen *screen,
+ struct pipe_vertex_buffer *buffer,
+ const struct pipe_vertex_element *elements,
+ unsigned num_elements,
+ struct pipe_resource *indexbuf,
+ uint32_t full_velem_mask)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf,
+ full_velem_mask, &sscreen->vertex_state_cache);
+}
+
+static void si_pipe_vertex_state_destroy(struct pipe_screen *screen,
+ struct pipe_vertex_state *state)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state);
}
/*
@@ -4957,6 +5123,13 @@ static void si_set_tess_state(struct pipe_context *ctx, const float default_oute
si_set_internal_const_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
}
+static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ sctx->patch_vertices = patch_vertices;
+}
+
static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
struct si_context *sctx = (struct si_context *)ctx;
@@ -5086,6 +5259,7 @@ void si_init_state_functions(struct si_context *sctx)
sctx->b.texture_barrier = si_texture_barrier;
sctx->b.set_min_samples = si_set_min_samples;
sctx->b.set_tess_state = si_set_tess_state;
+ sctx->b.set_patch_vertices = si_set_patch_vertices;
sctx->b.set_active_query_state = si_set_active_query_state;
}
@@ -5093,12 +5267,17 @@ void si_init_state_functions(struct si_context *sctx)
void si_init_screen_state_functions(struct si_screen *sscreen)
{
sscreen->b.is_format_supported = si_is_format_supported;
+ sscreen->b.create_vertex_state = si_pipe_create_vertex_state;
+ sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy;
if (sscreen->info.chip_class >= GFX10) {
sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
} else {
sscreen->make_texture_descriptor = si_make_texture_descriptor;
}
+
+ util_vertex_state_cache_init(&sscreen->vertex_state_cache,
+ si_create_vertex_state, si_vertex_state_destroy);
}
static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
@@ -5226,6 +5405,12 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
S_028034_BR_X(16384) | S_028034_BR_Y(16384));
}
+ if (sctx->chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL,
+ S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF) |
+ S_028038_POPS_DRAIN_PS_ON_OVERLAP(1));
+ }
+
unsigned cu_mask_ps = 0xffffffff;
/* It's wasteful to enable all CUs for PS if shader arrays have a different
@@ -5239,63 +5424,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);
if (sctx->chip_class >= GFX7) {
- /* Compute LATE_ALLOC_VS.LIMIT. */
- unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
- unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
- unsigned cu_mask_vs = 0xffff;
- unsigned cu_mask_gs = 0xffff;
-
- if (sctx->chip_class >= GFX10) {
- /* For Wave32, the hw will launch twice the number of late
- * alloc waves, so 1 == 2x wave32.
- */
- if (!sscreen->info.use_late_alloc) {
- late_alloc_wave64 = 0;
- } else {
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
- /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
- * Others: CU1 must be disabled to prevent a hw deadlock.
- *
- * The deadlock is caused by late alloc, which usually increases
- * performance.
- */
- cu_mask_vs &= sctx->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) :
- ~BITFIELD_RANGE(1, 1);
-
- /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
- if (sscreen->use_ngg && sctx->family != CHIP_NAVI14)
- cu_mask_gs = cu_mask_vs;
- }
- } else {
- if (!sscreen->info.use_late_alloc) {
- late_alloc_wave64 = 0;
- } else if (num_cu_per_sh <= 4) {
- /* Too few available compute units per SA. Disallowing
- * VS to run on one CU could hurt us more than late VS
- * allocation would help.
- *
- * 2 is the highest safe number that allows us to keep
- * all CUs enabled.
- */
- late_alloc_wave64 = 2;
- } else {
- /* This is a good initial value, allowing 1 late_alloc
- * wave per SIMD on num_cu - 2.
- */
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
- }
-
- if (late_alloc_wave64 > 2)
- cu_mask_vs = 0xfffe; /* 1 CU disabled */
- }
-
- /* VS can't execute on one CU if the limit is > 2. */
- si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
- S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
- si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
- si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
- S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
}
@@ -5316,6 +5444,21 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
}
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS,
+ S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8));
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
+ S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
+ } else if (sscreen->info.chip_class == GFX9) {
+ si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS,
+ S_00B414_MEM_BASE(sscreen->info.address32_hi >> 8));
+ si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES,
+ S_00B214_MEM_BASE(sscreen->info.address32_hi >> 8));
+ } else {
+ si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS,
+ S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8));
+ }
+
if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) {
si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
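
   The SPI_SHADER_PGM_HI_* writes added above replace the per-shader MEM_BASE(va >> 40)
   programming removed elsewhere in this patch. A worked example of why the two are
   equivalent, assuming (as the change implies) that all shader binaries live in the single
   4 GiB window described by address32_hi; the address below is hypothetical:

   uint64_t va           = 0x0000800000000000ull; /* hypothetical shader binary VA        */
   uint32_t address32_hi = (uint32_t)(va >> 32);  /* 0x00008000, high half of the window  */
   unsigned mem_base_old = (unsigned)(va >> 40);  /* 0x80: old per-shader value           */
   unsigned mem_base_new = address32_hi >> 8;     /* 0x80: value now set once in preamble */
   /* Both select VA bits [47:40], so one preamble write covers every shader. */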
@@ -5354,6 +5497,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
+
+ si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL,
+ S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
+ S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
}
if (sctx->chip_class >= GFX9) {
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h
index ea31a2afd..a6daa158b 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h
@@ -65,6 +65,7 @@ struct si_state_blend {
bool alpha_to_one : 1;
bool dual_src_blend : 1;
bool logicop_enable : 1;
+ bool allows_noop_optimization : 1;
};
struct si_state_rasterizer {
@@ -95,11 +96,6 @@ struct si_state_rasterizer {
unsigned rasterizer_discard : 1;
unsigned scissor_enable : 1;
unsigned clip_halfz : 1;
- unsigned cull_front : 1;
- unsigned cull_back : 1;
- unsigned depth_clamp_any : 1;
- unsigned provoking_vertex_first : 1;
- unsigned polygon_mode_enabled : 1;
unsigned polygon_mode_is_lines : 1;
unsigned polygon_mode_is_points : 1;
};
@@ -173,7 +169,6 @@ struct si_vertex_elements {
uint16_t vb_alignment_check_mask;
uint8_t count;
- bool uses_instance_divisors;
uint16_t first_vb_use_mask;
/* Vertex buffer descriptor list size aligned for optimal prefetch. */
@@ -188,13 +183,13 @@ union si_state {
struct si_state_rasterizer *rasterizer;
struct si_state_dsa *dsa;
struct si_pm4_state *poly_offset;
- struct si_pm4_state *ls;
- struct si_pm4_state *hs;
- struct si_pm4_state *es;
- struct si_pm4_state *gs;
+ struct si_shader *ls;
+ struct si_shader *hs;
+ struct si_shader *es;
+ struct si_shader *gs;
struct si_pm4_state *vgt_shader_config;
- struct si_pm4_state *vs;
- struct si_pm4_state *ps;
+ struct si_shader *vs;
+ struct si_shader *ps;
} named;
struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)];
};
@@ -254,12 +249,6 @@ struct si_shader_data {
uint32_t sh_base[SI_NUM_SHADERS];
};
-#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \
- (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \
- S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \
- S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | \
- S_02881C_USE_VTX_VRS_RATE(1))
-
/* The list of registers whose emitted values are remembered by si_context. */
enum si_tracked_reg
{
@@ -285,12 +274,11 @@ enum si_tracked_reg
SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/
- SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
+ SI_TRACKED_PA_CL_VS_OUT_CNTL,
SI_TRACKED_PA_CL_CLIP_CNTL,
SI_TRACKED_PA_SC_BINNER_CNTL_0,
- SI_TRACKED_DB_DFSM_CONTROL,
+
SI_TRACKED_DB_VRS_OVERRIDE_CNTL,
SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
@@ -347,7 +335,10 @@ enum si_tracked_reg
SI_TRACKED_VGT_TF_PARAM,
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ /* Non-context registers: */
SI_TRACKED_GE_PC_ALLOC,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
SI_NUM_TRACKED_REGS,
};
@@ -490,8 +481,10 @@ struct si_buffer_resources {
void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
const struct legacy_surf_level *base_level_info,
unsigned base_level, unsigned first_level, unsigned block_width,
- bool is_stencil, uint16_t access, uint32_t *state);
+ /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */
+ bool is_stencil, uint16_t access, uint32_t * restrict state);
void si_update_ps_colorbuf0_slot(struct si_context *sctx);
+void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader);
void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
struct pipe_constant_buffer *cbuf);
void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot,
@@ -527,6 +520,7 @@ struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, uns
void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf);
/* si_state.c */
+uint32_t si_translate_colorformat(enum chip_class chip_class, enum pipe_format format);
void si_init_state_compute_functions(struct si_context *sctx);
void si_init_state_functions(struct si_context *sctx);
void si_init_screen_state_functions(struct si_screen *sscreen);
@@ -567,7 +561,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha
void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
struct si_shader *shader, bool insert_into_disk_cache);
bool si_shader_mem_ordered(struct si_shader *shader);
-bool si_update_shaders(struct si_context *sctx);
void si_init_screen_live_shader_cache(struct si_screen *sscreen);
void si_init_shader_functions(struct si_context *sctx);
bool si_init_shader_cache(struct si_screen *sscreen);
@@ -578,18 +571,40 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage,
util_queue_execute_func execute);
void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers,
uint64_t *samplers_and_images);
-int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
- struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key, int thread_index, bool optimized_or_none);
-void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
- struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key);
-unsigned si_get_input_prim(const struct si_shader_selector *gs);
+int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state,
+ const struct si_shader_key *key, int thread_index,
+ bool optimized_or_none);
+int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state);
+void si_vs_key_update_inputs(struct si_context *sctx);
+void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,
+ struct si_vs_prolog_bits *prolog_key);
+void si_update_ps_inputs_read_or_disabled(struct si_context *sctx);
+void si_update_ps_kill_enable(struct si_context *sctx);
+void si_update_vrs_flat_shading(struct si_context *sctx);
+unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key);
bool si_update_ngg(struct si_context *sctx);
-
-/* si_state_draw.c */
-void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
-void si_trace_emit(struct si_context *sctx);
-void si_init_draw_functions(struct si_context *sctx);
+void si_ps_key_update_framebuffer(struct si_context *sctx);
+void si_ps_key_update_framebuffer_blend(struct si_context *sctx);
+void si_ps_key_update_blend_rasterizer(struct si_context *sctx);
+void si_ps_key_update_rasterizer(struct si_context *sctx);
+void si_ps_key_update_dsa(struct si_context *sctx);
+void si_ps_key_update_sample_shading(struct si_context *sctx);
+void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx);
+void si_init_tess_factor_ring(struct si_context *sctx);
+bool si_update_gs_ring_buffers(struct si_context *sctx);
+bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes);
+
+/* si_state_draw.cpp */
+void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems,
+ struct pipe_vertex_buffer *vb, unsigned element_index,
+ uint32_t *out);
+void si_init_draw_functions_GFX6(struct si_context *sctx);
+void si_init_draw_functions_GFX7(struct si_context *sctx);
+void si_init_draw_functions_GFX8(struct si_context *sctx);
+void si_init_draw_functions_GFX9(struct si_context *sctx);
+void si_init_draw_functions_GFX10(struct si_context *sctx);
+void si_init_draw_functions_GFX10_3(struct si_context *sctx);
+void si_init_spi_map_functions(struct si_context *sctx);
/* si_state_msaa.c */
void si_init_msaa_functions(struct si_context *sctx);
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
index e5e5f1a65..921bd5446 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
shader_variant_flags |= 1 << 0;
if (sel->nir)
shader_variant_flags |= 1 << 1;
- if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
+ if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32)
shader_variant_flags |= 1 << 2;
if (sel->info.stage == MESA_SHADER_FRAGMENT &&
/* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
@@ -78,11 +78,14 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
sel->info.base.fs.uses_discard &&
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
shader_variant_flags |= 1 << 3;
- if (sel->info.stage == MESA_SHADER_VERTEX) {
- /* This varies depending on whether compute-based culling is enabled. */
- assert(sel->screen->num_vbos_in_user_sgprs <= 7);
- shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4;
- }
+ /* use_ngg_culling disables NGG passthrough for non-culling shaders to reduce context
+    * rolls; it can be toggled with AMD_DEBUG=nonggc or AMD_DEBUG=nggc.
+ */
+ if (sel->screen->use_ngg_culling)
+ shader_variant_flags |= 1 << 4;
+
+ /* bit gap */
+
if (sel->screen->options.no_infinite_interp)
shader_variant_flags |= 1 << 7;
if (sel->screen->options.clamp_div_by_zero)
@@ -370,7 +373,7 @@ bool si_shader_mem_ordered(struct si_shader *shader)
}
static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes,
- struct si_pm4_state *pm4)
+ struct si_shader *shader)
{
const struct si_shader_info *info = &tes->info;
unsigned tes_prim_mode = info->base.tess.primitive_mode;
@@ -427,10 +430,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad
} else
distribution_mode = V_028B6C_NO_DIST;
- assert(pm4->shader);
- pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
- S_028B6C_TOPOLOGY(topology) |
- S_028B6C_DISTRIBUTION_MODE(distribution_mode);
+ shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
+ S_028B6C_TOPOLOGY(topology) |
+ S_028B6C_DISTRIBUTION_MODE(distribution_mode);
}
/* Polaris needs different VTX_REUSE_DEPTH settings depending on
@@ -444,18 +446,16 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad
* VS as ES | ES -> GS -> VS | 30
* TES as VS | LS -> HS -> VS | 14 or 30
* TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30
- *
- * If "shader" is NULL, it's assumed it's not LS or GS copy shader.
*/
static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel,
- struct si_shader *shader, struct si_pm4_state *pm4)
+ struct si_shader *shader)
{
if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10)
return;
/* VS as VS, or VS as ES: */
if ((sel->info.stage == MESA_SHADER_VERTEX &&
- (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
+ (!shader->key.as_ls && !shader->is_gs_copy_shader)) ||
/* TES as VS, or TES as ES: */
sel->info.stage == MESA_SHADER_TESS_EVAL) {
unsigned vtx_reuse_depth = 30;
@@ -464,25 +464,15 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_sh
sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD)
vtx_reuse_depth = 14;
- assert(pm4->shader);
- pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
+ shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
}
}
static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
{
- if (shader->pm4)
- si_pm4_clear_state(shader->pm4);
- else
- shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
- if (shader->pm4) {
- shader->pm4->shader = shader;
- return shader->pm4;
- } else {
- fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
- return NULL;
- }
+ si_pm4_clear_state(&shader->pm4);
+ shader->pm4.is_shader = true;
+ return &shader->pm4;
}
static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
@@ -509,22 +499,30 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha
assert(shader->selector->info.stage == MESA_SHADER_VERTEX ||
(shader->previous_stage_sel && shader->previous_stage_sel->info.stage == MESA_SHADER_VERTEX));
- /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...).
- * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...)
- * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID).
- * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or
- * InstanceID)
+ /* GFX6-9 LS (VertexID, RelAutoIndex, InstanceID / StepRate0, InstanceID)
+ * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID)
+ * GFX10 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID)
+ * GFX10 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID)
*/
bool is_ls = shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->key.as_ls;
+ unsigned max = 0;
- if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
- return 3;
- else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
- return 2;
- else if (is_ls || shader->info.uses_instanceid)
- return 1;
- else
- return 0;
+ if (shader->info.uses_instanceid) {
+ if (sscreen->info.chip_class >= GFX10)
+ max = MAX2(max, 3);
+ else if (is_ls)
+ max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */
+ else
+ max = MAX2(max, 1); /* use (InstanceID / StepRate0) because StepRate0 == 1 */
+ }
+
+ if (legacy_vs_prim_id)
+ max = MAX2(max, 2); /* VSPrimID */
+
+ if (is_ls)
+ max = MAX2(max, 1); /* RelAutoIndex */
+
+ return max;
}
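
   A worked reading of the rewritten helper above: the result is now simply the highest
   input-VGPR index the shader actually needs (per the layout comment), instead of a
   hand-ordered if/else chain. Derived directly from the code above:

   /* GFX10 LS that uses InstanceID:   uses_instanceid  -> max = 3 (InstanceID in VGPR3). */
   /* GFX6-9 ES with legacy VS PrimID: legacy_vs_prim_id -> max = 2 (VSPrimID in VGPR2).  */
   /* GFX6-9 LS without InstanceID:    is_ls            -> max = 1 (RelAutoIndex, VGPR1). */
   /* Plain VS, no InstanceID/PrimID:  max = 0 (only VertexID in VGPR0 is needed).        */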
static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
@@ -540,7 +538,6 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
@@ -565,10 +562,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->info.chip_class >= GFX9) {
if (sscreen->info.chip_class >= GFX10) {
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
} else {
si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
}
unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
@@ -582,7 +577,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
} else {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
- si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
+ S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) |
S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
@@ -607,7 +603,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
static void si_emit_shader_es(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.es->shader;
+ struct si_shader *shader = sctx->queued.named.es;
if (!shader)
return;
@@ -656,7 +652,8 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
+ S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
@@ -667,9 +664,9 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->selector, pm4);
+ si_set_tesseval_regs(sscreen, shader->selector, shader);
- polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
}
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
@@ -767,7 +764,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
static void si_emit_shader_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
+ struct si_shader *shader = sctx->queued.named.gs;
if (!shader)
return;
@@ -822,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx)
shader->vgt_vertex_reuse_block_cntl);
}
radeon_end_update_context_roll(sctx);
+
+ /* These don't cause any context rolls. */
+ radeon_begin_again(&sctx->gfx_cs);
+ if (sctx->chip_class >= GFX7) {
+ radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+ shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs);
+ }
+ if (sctx->chip_class >= GFX10) {
+ radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+ shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs);
+ }
+ radeon_end();
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@@ -868,6 +879,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
shader->ctx_reg.gs.vgt_gs_instance_cnt =
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
+ /* Copy over fields from the GS copy shader to make them easily accessible from GS. */
+ shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl;
+
va = shader->bo->gpu_address;
if (sscreen->info.chip_class >= GFX9) {
@@ -902,10 +916,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
if (sscreen->info.chip_class >= GFX10) {
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
} else {
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
}
uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
@@ -929,10 +941,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
- if (sscreen->info.chip_class >= GFX10) {
- si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
- S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
- }
+ shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
+ S_00B21C_WAVE_LIMIT(0x3F);
+ shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs =
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0);
shader->ctx_reg.gs.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
@@ -943,12 +955,16 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4;
if (es_stage == MESA_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+ si_set_tesseval_regs(sscreen, shader->key.part.gs.es, shader);
- polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
+ polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader);
} else {
+ shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
+ S_00B21C_WAVE_LIMIT(0x3F);
+
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
- si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
+ S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
@@ -960,28 +976,25 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
}
}
-static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
+bool gfx10_is_ngg_passthrough(struct si_shader *shader)
{
- enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- radeon_begin(cs);
-
- if (sctx->chip_class == GFX10) {
- /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
- }
+ struct si_shader_selector *sel = shader->selector;
- radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
- radeon_end();
+ /* Never use NGG passthrough if culling is possible even when it's not used by this shader,
+ * so that we don't get context rolls when enabling and disabling NGG passthrough.
+ */
+ if (sel->screen->use_ngg_culling)
+ return false;
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
+ /* The definition of NGG passthrough is:
+ * - user GS is turned off (no amplification, no GS instancing, and no culling)
+ * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1)
+ * - vertex indices are packed into 1 VGPR
+ * - Dimgrey and later chips can optionally skip the gs_alloc_req message
+ *
+ * NGG passthrough still allows the use of LDS.
+ */
+ return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.opt.ngg_culling;
}
/* Common tail code for NGG primitive shaders. */
@@ -1012,18 +1025,24 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
shader->ctx_reg.ngg.pa_cl_ngg_cntl);
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
radeon_end_update_context_roll(sctx);
- /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
- gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
+ /* These don't cause a context roll. */
+ radeon_begin_again(&sctx->gfx_cs);
+ radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
+ shader->ctx_reg.ngg.ge_pc_alloc);
+ radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+ shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs);
+ radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+ shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs);
+ radeon_end();
}
static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
+ struct si_shader *shader = sctx->queued.named.gs;
if (!shader)
return;
@@ -1032,7 +1051,7 @@ static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
+ struct si_shader *shader = sctx->queued.named.gs;
if (!shader)
return;
@@ -1046,7 +1065,7 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
+ struct si_shader *shader = sctx->queued.named.gs;
if (!shader)
return;
@@ -1060,7 +1079,7 @@ static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
+ struct si_shader *shader = sctx->queued.named.gs;
if (!shader)
return;
@@ -1075,7 +1094,7 @@ static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
gfx10_emit_shader_ngg_tail(sctx, shader);
}
-unsigned si_get_input_prim(const struct si_shader_selector *gs)
+unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key)
{
if (gs->info.stage == MESA_SHADER_GEOMETRY)
return gs->info.base.gs.input_primitive;
@@ -1088,22 +1107,26 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs)
return PIPE_PRIM_TRIANGLES;
}
- /* TODO: Set this correctly if the primitive type is set in the shader key. */
+ if (key->opt.ngg_culling & SI_NGG_CULL_LINES)
+ return PIPE_PRIM_LINES;
+
return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
}
static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
const struct si_shader *shader, bool ngg)
{
- bool writes_psize = sel->info.writes_psize;
-
- if (shader)
- writes_psize &= !shader->key.opt.kill_pointsize;
-
+ /* Clip distances can be killed, but cull distances can't. */
+ unsigned clipcull_mask = (sel->clipdist_mask & ~shader->key.opt.kill_clip_distances) |
+ sel->culldist_mask;
+ bool writes_psize = sel->info.writes_psize && !shader->key.opt.kill_pointsize;
bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) ||
sel->screen->options.vrs2x2 ||
sel->info.writes_layer || sel->info.writes_viewport_index;
- return S_02881C_USE_VTX_POINT_SIZE(writes_psize) |
+
+ return S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipcull_mask & 0x0F) != 0) |
+ S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipcull_mask & 0xF0) != 0) |
+ S_02881C_USE_VTX_POINT_SIZE(writes_psize) |
S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) |
S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
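
   The new clipcull_mask handling above reads as: killed clip distances drop out of the
   mask, cull distances never do, and the two CCDIST vector-enable bits then just test the
   low and high nibble. A worked example with illustrative masks:

   unsigned clipdist_mask       = 0x03;  /* shader writes clip distances 0-1 */
   unsigned kill_clip_distances = 0x02;  /* the key kills clip distance 1    */
   unsigned culldist_mask       = 0x30;  /* shader writes cull distances 4-5 */

   unsigned clipcull_mask = (clipdist_mask & ~kill_clip_distances) | culldist_mask; /* 0x31 */
   /* VS_OUT_CCDIST0_VEC_ENA = (0x31 & 0x0F) != 0 -> 1 (distances 0-3, first vec4)  */
   /* VS_OUT_CCDIST1_VEC_ENA = (0x31 & 0xF0) != 0 -> 1 (distances 4-7, second vec4) */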
@@ -1132,7 +1155,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
gs_info->base.vs.window_space_position : 0;
bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
- unsigned input_prim = si_get_input_prim(gs_sel);
+ unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
bool break_wave_at_eoi = false;
struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
if (!pm4)
@@ -1174,7 +1197,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
* for the GL_LINE polygon mode to skip rendering lines on inner edges.
*/
if (gs_info->uses_invocationid ||
- (gs_stage == MESA_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
+ (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader)))
gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
(gs_stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
@@ -1185,9 +1208,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
unsigned wave_size = si_get_shader_wave_size(shader);
+ unsigned late_alloc_wave64, cu_mask;
+
+ ac_compute_late_alloc(&sscreen->info, true, shader->key.opt.ngg_culling,
+ shader->config.scratch_bytes_per_wave > 0,
+ &late_alloc_wave64, &cu_mask);
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
si_pm4_set_reg(
pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
@@ -1205,32 +1232,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
S_00B22C_LDS_SIZE(shader->config.lds_size));
- /* Determine LATE_ALLOC_GS. */
- unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
- unsigned late_alloc_wave64; /* The limit is per SA. */
-
- /* For Wave32, the hw will launch twice the number of late
- * alloc waves, so 1 == 2x wave32.
- *
- * Don't use late alloc for NGG on Navi14 due to a hw bug.
- */
- if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
- late_alloc_wave64 = 0;
- else if (shader->key.opt.ngg_culling)
- late_alloc_wave64 = num_cu_per_sh * 10;
- else
- late_alloc_wave64 = num_cu_per_sh * 4;
-
-   /* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
- if (sscreen->info.chip_class == GFX10)
- late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
-
- /* Max number that fits into the register field. */
- late_alloc_wave64 = MIN2(late_alloc_wave64, 127);
-
- si_pm4_set_reg(
- pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
- S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+ shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) |
+ S_00B21C_WAVE_LIMIT(0x3F);
+ shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs =
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
nparams = MAX2(shader->info.nr_param_exports, 1);
shader->ctx_reg.ngg.spi_vs_out_config =
@@ -1261,7 +1266,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
}
if (es_stage == MESA_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, es_sel, pm4);
+ si_set_tesseval_regs(sscreen, es_sel, shader);
shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
@@ -1275,59 +1280,55 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
- /* Always output hw-generated edge flags and pass them via the prim
+ /* Output hw-generated edge flags if needed and pass them via the prim
* export to prevent drawing lines on internal edges of decomposed
- * primitives (such as quads) with polygon mode = lines. Only VS needs
- * this.
+ * primitives (such as quads) with polygon mode = lines.
*/
shader->ctx_reg.ngg.pa_cl_ngg_cntl =
- S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_stage == MESA_SHADER_VERTEX) |
+ S_028838_INDEX_BUF_EDGE_FLAG_ENA(gfx10_edgeflags_have_effect(shader)) |
/* Reuse for NGG. */
S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0);
shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true);
/* Oversubscribe PC. This improves performance when there are too many varyings. */
- float oversub_pc_factor = 0.25;
+ unsigned oversub_pc_factor = 1;
if (shader->key.opt.ngg_culling) {
/* Be more aggressive with NGG culling. */
if (shader->info.nr_param_exports > 4)
- oversub_pc_factor = 1;
+ oversub_pc_factor = 4;
else if (shader->info.nr_param_exports > 2)
- oversub_pc_factor = 0.75;
+ oversub_pc_factor = 3;
else
- oversub_pc_factor = 0.5;
+ oversub_pc_factor = 2;
}
- unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
- shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+ unsigned oversub_pc_lines =
+ late_alloc_wave64 ? (sscreen->info.pc_lines / 4) * oversub_pc_factor : 0;
+ shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) |
S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
- if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
- shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
- } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
- shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
- } else {
- shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
- S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
+ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
- /* Bug workaround for a possible hang with non-tessellation cases.
- * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
- *
- * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+ /* On gfx10, the GE only checks against the maximum number of ES verts after
+ * allocating a full GS primitive. So we need to ensure that whenever
+ * this check passes, there is enough space for a full primitive without
+ * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
+ * if we have enough LDS.
+ *
+ * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
+ */
+ if ((sscreen->info.chip_class == GFX10) &&
+ (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
+ shader->ngg.hw_max_esverts != 256 &&
+ shader->ngg.hw_max_esverts > 5) {
+ /* This could be based on the input primitive type. 5 is the worst case
+ * for primitive types with adjacency.
*/
- if ((sscreen->info.chip_class == GFX10) &&
- (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
- shader->ngg.hw_max_esverts != 256) {
- shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
- if (shader->ngg.hw_max_esverts > 5) {
- shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
- }
- }
+ shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+ shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
}
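The workaround above is a clear-then-set update of one GE_CNTL bitfield: VERT_GRP_SIZE is dropped to hw_max_esverts - 5 so a full primitive (worst case: adjacency) still fits after the GE's ES-vertex check passes. A small sketch of that read-modify-write, with a hypothetical field layout standing in for the generated S_03096C_*/C_03096C_* register macros:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical GE_CNTL field layout for illustration only; the real shift and
 * width come from Mesa's generated gfx10 register headers. */
#define VERT_GRP_SIZE_SHIFT 9
#define VERT_GRP_SIZE_MASK  (0x1ffu << VERT_GRP_SIZE_SHIFT)

#define S_VERT_GRP_SIZE(x) (((uint32_t)(x) << VERT_GRP_SIZE_SHIFT) & VERT_GRP_SIZE_MASK)
#define C_VERT_GRP_SIZE    (~VERT_GRP_SIZE_MASK)

int main(void)
{
   uint32_t ge_cntl = S_VERT_GRP_SIZE(200) | 0x7f; /* pretend the prim group size sits in the low bits */
   unsigned hw_max_esverts = 100;

   /* Leave 5 vertices of slack below the subgroup's ES vertex limit. */
   ge_cntl &= C_VERT_GRP_SIZE;
   ge_cntl |= S_VERT_GRP_SIZE(hw_max_esverts - 5);

   printf("VERT_GRP_SIZE = %u\n", (ge_cntl & VERT_GRP_SIZE_MASK) >> VERT_GRP_SIZE_SHIFT);
   return 0;
}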
if (window_space) {
@@ -1338,11 +1339,15 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
}
+
+ shader->ctx_reg.ngg.vgt_stages.u.ngg = 1;
+ shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs;
+ shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader);
}
static void si_emit_shader_vs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.vs->shader;
+ struct si_shader *shader = sctx->queued.named.vs;
if (!shader)
return;
@@ -1385,16 +1390,15 @@ static void si_emit_shader_vs(struct si_context *sctx)
S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
}
- if (sctx->chip_class >= GFX10) {
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
- }
radeon_end_update_context_roll(sctx);
/* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
- if (sctx->chip_class >= GFX10)
- gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
+ if (sctx->chip_class >= GFX10) {
+ radeon_begin_again(&sctx->gfx_cs);
+ radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
+ shader->ctx_reg.vs.ge_pc_alloc);
+ radeon_end();
+ }
}
/**
@@ -1485,14 +1489,26 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
: V_02870C_SPI_SHADER_NONE) |
S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
: V_02870C_SPI_SHADER_NONE);
- shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+ unsigned late_alloc_wave64, cu_mask;
+ ac_compute_late_alloc(&sscreen->info, false, false,
+ shader->config.scratch_bytes_per_wave > 0,
+ &late_alloc_wave64, &cu_mask);
+
+ shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) |
S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false);
oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
+ if (sscreen->info.chip_class >= GFX7) {
+ si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+ S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
+ si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
+ }
+
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
- si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
+ S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
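The PGM_HI register is now programmed from the screen-wide address32_hi instead of the per-shader va >> 40, presumably because shader binaries live in a fixed 4 GiB window whose upper address bits never change per shader. A small sketch of the address split under that assumption, showing both expressions produce the same MEM_BASE value:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Hypothetical 4 GiB window for shader binaries; address32_hi mirrors
    * sscreen->info.address32_hi (the window's upper 32 bits). */
   uint32_t address32_hi = 0x0000ff00u;
   uint64_t window_base  = (uint64_t)address32_hi << 32;

   /* A shader binary somewhere inside that window. */
   uint64_t va = window_base + 0x12345000u;

   /* Only the low byte is compared here for illustration; the real field width
    * comes from the S_00B124_MEM_BASE macro. */
   uint8_t mem_base_old = (uint8_t)(va >> 40);          /* per-shader VA */
   uint8_t mem_base_new = (uint8_t)(address32_hi >> 8); /* screen constant */

   assert(mem_base_old == mem_base_new);
   printf("PGM_LO = 0x%08x, MEM_BASE = 0x%02x\n", (uint32_t)(va >> 8), mem_base_new);
   return 0;
}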
uint32_t rsrc1 =
S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
@@ -1530,9 +1546,9 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->selector, pm4);
+ si_set_tesseval_regs(sscreen, shader->selector, shader);
- polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
}
static unsigned si_get_ps_num_interp(struct si_shader *ps)
@@ -1567,7 +1583,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
static void si_emit_shader_ps(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.ps->shader;
+ struct si_shader *shader = sctx->queued.named.ps;
if (!shader)
return;
@@ -1695,10 +1711,13 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
shader->ctx_reg.ps.spi_ps_input_ena = input_ena;
shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
+ unsigned num_interp = si_get_ps_num_interp(shader);
+
/* Set interpolation controls. */
- spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
+ spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) |
S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
+ shader->ctx_reg.ps.num_interp = num_interp;
shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
shader->ctx_reg.ps.spi_shader_z_format =
@@ -1708,7 +1727,8 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
- si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
+ S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
uint32_t rsrc1 =
S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) |
@@ -1764,31 +1784,41 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
}
}
-static unsigned si_get_alpha_test_func(struct si_context *sctx)
+static void si_clear_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,
+ struct si_vs_prolog_bits *prolog_key)
{
- /* Alpha-test should be disabled if colorbuffer 0 is integer. */
- return sctx->queued.named.dsa->alpha_func;
+ prolog_key->instance_divisor_is_one = 0;
+ prolog_key->instance_divisor_is_fetched = 0;
+ key->mono.vs_fetch_opencode = 0;
+ memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch));
}
-void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
- struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key)
+void si_vs_key_update_inputs(struct si_context *sctx)
{
- if (vs->info.base.vs.blit_sgprs_amd)
+ struct si_shader_selector *vs = sctx->shader.vs.cso;
+ struct si_vertex_elements *elts = sctx->vertex_elements;
+ struct si_shader_key *key = &sctx->shader.vs.key;
+
+ if (!vs)
return;
- struct si_vertex_elements *elts = sctx->vertex_elements;
+ if (vs->info.base.vs.blit_sgprs_amd) {
+ si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog);
+ key->opt.prefer_mono = 0;
+ sctx->uses_nontrivial_vs_prolog = false;
+ return;
+ }
- prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
- prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
- prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing;
+ bool uses_nontrivial_vs_prolog = false;
- /* Prefer a monolithic shader to allow scheduling divisions around
- * VBO loads. */
- if (prolog_key->instance_divisor_is_fetched)
- key->opt.prefer_mono = 1;
+ if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched)
+ uses_nontrivial_vs_prolog = true;
+
+ key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one;
+ key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
+ key->opt.prefer_mono = elts->instance_divisor_is_fetched;
- unsigned count = MIN2(vs->info.num_inputs, elts->count);
- unsigned count_mask = (1 << count) - 1;
+ unsigned count_mask = (1 << vs->info.num_inputs) - 1;
unsigned fix = elts->fix_fetch_always & count_mask;
unsigned opencode = elts->fix_fetch_opencode & count_mask;
@@ -1807,19 +1837,49 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto
}
}
+ memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch));
+
while (fix) {
unsigned i = u_bit_scan(&fix);
- key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+ uint8_t fix_fetch = elts->fix_fetch[i];
+
+ key->mono.vs_fix_fetch[i].bits = fix_fetch;
+ if (fix_fetch)
+ uses_nontrivial_vs_prolog = true;
}
key->mono.vs_fetch_opencode = opencode;
+ if (opencode)
+ uses_nontrivial_vs_prolog = true;
+
+ sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog;
+
+ /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores
+ * the current vertex buffers and vertex elements.
+ *
+ * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog,
+ * so that we know whether the VS prolog should be updated when we switch from
+ * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state.
+ * This should happen rarely because the VS prolog should be trivial in most
+ * cases.
+ */
+ if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog)
+ si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog);
}
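The loop above walks only the inputs the shader declares (count_mask), records per-input fetch fixups in the key, and flags the prolog as nontrivial if any fixup is needed. A standalone sketch of that bit-scan pattern, with a tiny ffs-style helper standing in for Mesa's u_bit_scan and made-up vertex-element data:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for u_bit_scan(): pop and return the lowest set bit's index. */
static int bit_scan(unsigned *mask)
{
   int i = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return i;
}

int main(void)
{
   /* Hypothetical vertex-element state: which inputs need a format fixup. */
   uint8_t fix_fetch[16] = { [1] = 0x23, [5] = 0x11 };
   unsigned fix_fetch_always = (1u << 1) | (1u << 5) | (1u << 9);
   unsigned num_inputs = 8;

   /* Only consider inputs the shader actually declares; input 9 is masked off. */
   unsigned count_mask = (1u << num_inputs) - 1;
   unsigned fix = fix_fetch_always & count_mask;

   uint8_t key_fix_fetch[16];
   memset(key_fix_fetch, 0, sizeof(key_fix_fetch));

   int nontrivial_prolog = 0;
   while (fix) {
      int i = bit_scan(&fix);
      key_fix_fetch[i] = fix_fetch[i];
      if (key_fix_fetch[i])
         nontrivial_prolog = 1;
   }

   printf("input 1 fixup 0x%02x, input 5 fixup 0x%02x, nontrivial=%d\n",
          key_fix_fetch[1], key_fix_fetch[5], nontrivial_prolog);
   return 0;
}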
-static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs,
- struct si_shader_key *key)
+void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,
+ struct si_vs_prolog_bits *prolog_key)
{
- struct si_shader_selector *ps = sctx->shader.ps.cso;
+ prolog_key->instance_divisor_is_one = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_one;
+ prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_fetched;
- key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable;
+ key->mono.vs_fetch_opencode = sctx->shader.vs.key.mono.vs_fetch_opencode;
+ memcpy(key->mono.vs_fix_fetch, sctx->shader.vs.key.mono.vs_fix_fetch,
+ sizeof(key->mono.vs_fix_fetch));
+}
+
+void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
+{
+ struct si_shader_selector *ps = sctx->shader.ps.cso;
/* Find out if PS is disabled. */
bool ps_disabled = true;
@@ -1827,273 +1887,314 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad
bool ps_modifies_zs = ps->info.base.fs.uses_discard || ps->info.writes_z || ps->info.writes_stencil ||
ps->info.writes_samplemask ||
sctx->queued.named.blend->alpha_to_coverage ||
- si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS;
+ sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS;
unsigned ps_colormask = si_get_total_colormask(sctx);
ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
(!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory);
}
- /* Find out which VS outputs aren't used by the PS. */
- uint64_t outputs_written = vs->outputs_written_before_ps;
- uint64_t inputs_read = 0;
+ sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read;
+}
- /* Ignore outputs that are not passed from VS to PS. */
- outputs_written &= ~((1ull << si_shader_io_get_unique_index(VARYING_SLOT_POS, true)) |
- (1ull << si_shader_io_get_unique_index(VARYING_SLOT_PSIZ, true)) |
- (1ull << si_shader_io_get_unique_index(VARYING_SLOT_CLIP_VERTEX, true)));
+static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,
+ struct si_shader_key *key)
+{
- if (!ps_disabled) {
- inputs_read = ps->inputs_read;
- }
+ key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable;
- uint64_t linked = outputs_written & inputs_read;
+ /* Find out which VS outputs aren't used by the PS. */
+ uint64_t outputs_written = vs->outputs_written_before_ps;
+ uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled;
key->opt.kill_outputs = ~linked & outputs_written;
if (vs->info.stage != MESA_SHADER_GEOMETRY) {
key->opt.ngg_culling = sctx->ngg_culling;
-
- if (sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid)
- key->mono.u.vs_export_prim_id = 1;
+ key->mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid;
+ } else {
+ key->opt.ngg_culling = 0;
+ key->mono.u.vs_export_prim_id = 0;
}
- /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */
- if (sctx->chip_class >= GFX10 &&
- vs->info.writes_psize &&
- sctx->current_rast_prim != PIPE_PRIM_POINTS &&
- !sctx->queued.named.rasterizer->polygon_mode_is_points)
- key->opt.kill_pointsize = 1;
+ key->opt.kill_pointsize = vs->info.writes_psize &&
+ sctx->current_rast_prim != PIPE_PRIM_POINTS &&
+ !sctx->queued.named.rasterizer->polygon_mode_is_points;
}
-/* Compute the key for the hw shader variant */
-static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel,
- union si_vgt_stages_key stages_key,
- struct si_shader_key *key)
+static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,
+ struct si_shader_key *key)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ key->opt.kill_clip_distances = 0;
+ key->opt.kill_outputs = 0;
+ key->opt.ngg_culling = 0;
+ key->mono.u.vs_export_prim_id = 0;
+ key->opt.kill_pointsize = 0;
+}
+
+void si_ps_key_update_framebuffer(struct si_context *sctx)
+{
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ struct si_shader_key *key = &sctx->shader.ps.key;
- memset(key, 0, sizeof(*key));
+ if (!sel)
+ return;
- unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms;
- if (num_inlinable_uniforms &&
- sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) {
- key->opt.inline_uniforms = true;
- memcpy(key->opt.inlined_uniform_values,
- sctx->inlinable_uniforms[sel->pipe_shader_type],
- num_inlinable_uniforms * 4);
+ if (sel->info.color0_writes_all_cbufs &&
+ sel->info.colors_written == 0x1)
+ key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+ else
+ key->part.ps.epilog.last_cbuf = 0;
+
+ /* ps_uses_fbfetch is true only if the color buffer is bound. */
+ if (sctx->ps_uses_fbfetch) {
+ struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+ struct pipe_resource *tex = cb0->texture;
+
+ /* 1D textures are allocated and used as 2D on GFX9. */
+ key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
+ key->mono.u.ps.fbfetch_is_1D =
+ sctx->chip_class != GFX9 &&
+ (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY);
+ key->mono.u.ps.fbfetch_layered =
+ tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY ||
+ tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
+ tex->target == PIPE_TEXTURE_3D;
+ } else {
+ key->mono.u.ps.fbfetch_msaa = 0;
+ key->mono.u.ps.fbfetch_is_1D = 0;
+ key->mono.u.ps.fbfetch_layered = 0;
}
+}
- switch (sel->info.stage) {
- case MESA_SHADER_VERTEX:
- si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
+void si_ps_key_update_framebuffer_blend(struct si_context *sctx)
+{
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ struct si_shader_key *key = &sctx->shader.ps.key;
+ struct si_state_blend *blend = sctx->queued.named.blend;
- if (sctx->shader.tes.cso)
- key->as_ls = 1;
- else if (sctx->shader.gs.cso) {
- key->as_es = 1;
- key->as_ngg = stages_key.u.ngg;
- } else {
- key->as_ngg = stages_key.u.ngg;
- si_shader_selector_key_hw_vs(sctx, sel, key);
- }
- break;
- case MESA_SHADER_TESS_CTRL:
- if (sctx->chip_class >= GFX9) {
- si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog);
- key->part.tcs.ls = sctx->shader.vs.cso;
+ if (!sel)
+ return;
- /* When the LS VGPR fix is needed, monolithic shaders
- * can:
- * - avoid initializing EXEC in both the LS prolog
- * and the LS main part when !vs_needs_prolog
- * - remove the fixup for unused input VGPRs
- */
- key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+ /* Select the shader color format based on whether
+ * blending or alpha are needed.
+ */
+ key->part.ps.epilog.spi_shader_col_format =
+ (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_blend_alpha) |
+ (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_blend) |
+ (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_alpha) |
+ (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format);
+ key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
+
+ /* The output for dual source blending should have
+ * the same format as the first output.
+ */
+ if (blend->dual_src_blend) {
+ key->part.ps.epilog.spi_shader_col_format |=
+ (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
+ }
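The export format is chosen per render target from four precomputed candidates using 4-bit-per-RT masks for "blending enabled" and "needs source alpha"; dual-source blending then mirrors RT0's nibble into the second export slot. A self-contained sketch of that mask arithmetic with made-up format words and mask values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Hypothetical per-RT (4 bits each) export formats precomputed for the four
    * blend/alpha combinations, like the framebuffer state in the driver. */
   uint32_t fmt_blend_alpha = 0x0000aaaa;
   uint32_t fmt_blend       = 0x0000bbbb;
   uint32_t fmt_alpha       = 0x0000cccc;
   uint32_t fmt_plain       = 0x0000dddd;

   /* Per-RT masks: RT1 blends, RT0's equations read source alpha, RT0-1 bound. */
   uint32_t blend_enable_4bit      = 0x000000f0;
   uint32_t need_src_alpha_4bit    = 0x0000000f;
   uint32_t cb_target_enabled_4bit = 0x000000ff;

   /* Exactly one of the four terms is non-zero for each 4-bit group. */
   uint32_t col_format =
      (blend_enable_4bit & need_src_alpha_4bit & fmt_blend_alpha) |
      (blend_enable_4bit & ~need_src_alpha_4bit & fmt_blend) |
      (~blend_enable_4bit & need_src_alpha_4bit & fmt_alpha) |
      (~blend_enable_4bit & ~need_src_alpha_4bit & fmt_plain);
   col_format &= cb_target_enabled_4bit;

   /* With dual-source blending, the second export must reuse RT0's format. */
   int dual_src_blend = 0;
   if (dual_src_blend)
      col_format |= (col_format & 0xf) << 4;

   printf("spi_shader_col_format = 0x%08x\n", col_format);
   return 0;
}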
- /* The LS output / HS input layout can be communicated
- * directly instead of via user SGPRs for merged LS-HS.
- * This also enables jumping over the VS prolog for HS-only waves.
- */
- key->opt.prefer_mono = 1;
- key->opt.same_patch_vertices = sctx->same_patch_vertices;
- }
+ /* If alpha-to-coverage is enabled, we have to export alpha
+ * even if there is no color buffer.
+ */
+ if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage)
+ key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
- key->part.tcs.epilog.prim_mode =
- sctx->shader.tes.cso->info.base.tess.primitive_mode;
- key->part.tcs.epilog.invoc0_tess_factors_are_def =
- sel->info.tessfactors_are_def_in_all_invocs;
- key->part.tcs.epilog.tes_reads_tess_factors = sctx->shader.tes.cso->info.reads_tess_factors;
+ /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
+ * to the range supported by the type if a channel has less
+ * than 16 bits and the export format is 16_ABGR.
+ */
+ if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
+ key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
+ key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
+ }
- if (sel == sctx->fixed_func_tcs_shader.cso)
- key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written;
- break;
- case MESA_SHADER_TESS_EVAL:
- key->as_ngg = stages_key.u.ngg;
+ /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
+ if (!key->part.ps.epilog.last_cbuf) {
+ key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
+ key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
+ key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
+ }
- if (sctx->shader.gs.cso)
- key->as_es = 1;
- else {
- si_shader_selector_key_hw_vs(sctx, sel, key);
- }
- break;
- case MESA_SHADER_GEOMETRY:
- if (sctx->chip_class >= GFX9) {
- if (sctx->shader.tes.cso) {
- key->part.gs.es = sctx->shader.tes.cso;
- } else {
- si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.gs.vs_prolog);
- key->part.gs.es = sctx->shader.vs.cso;
- }
+ /* Eliminate shader code computing output values that are unused.
+ * This enables dead code elimination between shader parts.
+ * Check if any output is eliminated.
+ */
+ if (sel->colors_written_4bit &
+ ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit))
+ key->opt.prefer_mono = 1;
+ else
+ key->opt.prefer_mono = 0;
+}
- key->as_ngg = stages_key.u.ngg;
+void si_ps_key_update_blend_rasterizer(struct si_context *sctx)
+{
+ struct si_shader_key *key = &sctx->shader.ps.key;
+ struct si_state_blend *blend = sctx->queued.named.blend;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- /* Only NGG can eliminate GS outputs, because the code is shared with VS. */
- if (stages_key.u.ngg)
- si_shader_selector_key_hw_vs(sctx, sel, key);
+ key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
+}
- /* This enables jumping over the VS prolog for GS-only waves. */
- key->opt.prefer_mono = 1;
- }
- key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
- break;
- case MESA_SHADER_FRAGMENT: {
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_state_blend *blend = sctx->queued.named.blend;
+void si_ps_key_update_rasterizer(struct si_context *sctx)
+{
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ struct si_shader_key *key = &sctx->shader.ps.key;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- if (sel->info.color0_writes_all_cbufs &&
- sel->info.colors_written == 0x1)
- key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+ if (!sel)
+ return;
- /* Select the shader color format based on whether
- * blending or alpha are needed.
- */
- key->part.ps.epilog.spi_shader_col_format =
- (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_blend_alpha) |
- (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_blend) |
- (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_alpha) |
- (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format);
- key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
-
- /* The output for dual source blending should have
- * the same format as the first output.
- */
- if (blend->dual_src_blend) {
- key->part.ps.epilog.spi_shader_col_format |=
- (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
- }
+ key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+ key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color;
+ key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+}
- /* If alpha-to-coverage is enabled, we have to export alpha
- * even if there is no color buffer.
- */
- if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage)
- key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
+void si_ps_key_update_dsa(struct si_context *sctx)
+{
+ struct si_shader_key *key = &sctx->shader.ps.key;
- /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
- * to the range supported by the type if a channel has less
- * than 16 bits and the export format is 16_ABGR.
- */
- if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
- key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
- key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
- }
+ key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func;
+}
- /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
- if (!key->part.ps.epilog.last_cbuf) {
- key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
- key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
- key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
- }
+static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx)
+{
+ struct si_shader_key *key = &sctx->shader.ps.key;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- /* Eliminate shader code computing output values that are unused.
- * This enables dead code elimination between shader parts.
- * Check if any output is eliminated.
- */
- if (sel->colors_written_4bit &
- ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit))
- key->opt.prefer_mono = 1;
+ bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+ bool is_line = util_prim_is_lines(sctx->current_rast_prim);
- bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
- bool is_line = util_prim_is_lines(sctx->current_rast_prim);
+ key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+ key->part.ps.epilog.poly_line_smoothing =
+ ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
+ sctx->framebuffer.nr_samples <= 1;
+}
- key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
- key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color;
+void si_ps_key_update_sample_shading(struct si_context *sctx)
+{
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ struct si_shader_key *key = &sctx->shader.ps.key;
- key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
+ if (!sel)
+ return;
- key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
- key->part.ps.epilog.poly_line_smoothing =
- ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
- sctx->framebuffer.nr_samples <= 1;
- key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+ if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask)
+ key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples);
+ else
+ key->part.ps.prolog.samplemask_log_ps_iter = 0;
+}
- if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) {
- key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples);
- }
+void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx)
+{
+ struct si_shader_selector *sel = sctx->shader.ps.cso;
+ struct si_shader_key *key = &sctx->shader.ps.key;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- bool uses_persp_center = sel->info.uses_persp_center ||
- (!rs->flatshade && sel->info.uses_persp_center_color);
- bool uses_persp_centroid = sel->info.uses_persp_centroid ||
- (!rs->flatshade && sel->info.uses_persp_centroid_color);
- bool uses_persp_sample = sel->info.uses_persp_sample ||
- (!rs->flatshade && sel->info.uses_persp_sample_color);
-
- if (rs->force_persample_interp && rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) {
- key->part.ps.prolog.force_persp_sample_interp =
- uses_persp_center || uses_persp_centroid;
-
- key->part.ps.prolog.force_linear_sample_interp =
- sel->info.uses_linear_center || sel->info.uses_linear_centroid;
- } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) {
- key->part.ps.prolog.bc_optimize_for_persp =
- uses_persp_center && uses_persp_centroid;
- key->part.ps.prolog.bc_optimize_for_linear =
- sel->info.uses_linear_center && sel->info.uses_linear_centroid;
- } else {
- /* Make sure SPI doesn't compute more than 1 pair
- * of (i,j), which is the optimization here. */
- key->part.ps.prolog.force_persp_center_interp = uses_persp_center +
- uses_persp_centroid +
- uses_persp_sample > 1;
-
- key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center +
- sel->info.uses_linear_centroid +
- sel->info.uses_linear_sample > 1;
-
- if (sel->info.uses_interp_at_sample)
- key->mono.u.ps.interpolate_at_sample_force_center = 1;
+ if (!sel)
+ return;
+
+ bool uses_persp_center = sel->info.uses_persp_center ||
+ (!rs->flatshade && sel->info.uses_persp_center_color);
+ bool uses_persp_centroid = sel->info.uses_persp_centroid ||
+ (!rs->flatshade && sel->info.uses_persp_centroid_color);
+ bool uses_persp_sample = sel->info.uses_persp_sample ||
+ (!rs->flatshade && sel->info.uses_persp_sample_color);
+
+ if (rs->force_persample_interp && rs->multisample_enable &&
+ sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) {
+ key->part.ps.prolog.force_persp_sample_interp =
+ uses_persp_center || uses_persp_centroid;
+
+ key->part.ps.prolog.force_linear_sample_interp =
+ sel->info.uses_linear_center || sel->info.uses_linear_centroid;
+
+ key->part.ps.prolog.force_persp_center_interp = 0;
+ key->part.ps.prolog.force_linear_center_interp = 0;
+ key->part.ps.prolog.bc_optimize_for_persp = 0;
+ key->part.ps.prolog.bc_optimize_for_linear = 0;
+ key->mono.u.ps.interpolate_at_sample_force_center = 0;
+ } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) {
+ key->part.ps.prolog.force_persp_sample_interp = 0;
+ key->part.ps.prolog.force_linear_sample_interp = 0;
+ key->part.ps.prolog.force_persp_center_interp = 0;
+ key->part.ps.prolog.force_linear_center_interp = 0;
+ key->part.ps.prolog.bc_optimize_for_persp =
+ uses_persp_center && uses_persp_centroid;
+ key->part.ps.prolog.bc_optimize_for_linear =
+ sel->info.uses_linear_center && sel->info.uses_linear_centroid;
+ key->mono.u.ps.interpolate_at_sample_force_center = 0;
+ } else {
+ key->part.ps.prolog.force_persp_sample_interp = 0;
+ key->part.ps.prolog.force_linear_sample_interp = 0;
+
+ /* Make sure SPI doesn't compute more than 1 pair
+ * of (i,j), which is the optimization here. */
+ key->part.ps.prolog.force_persp_center_interp = uses_persp_center +
+ uses_persp_centroid +
+ uses_persp_sample > 1;
+
+ key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center +
+ sel->info.uses_linear_centroid +
+ sel->info.uses_linear_sample > 1;
+ key->part.ps.prolog.bc_optimize_for_persp = 0;
+ key->part.ps.prolog.bc_optimize_for_linear = 0;
+ key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample;
+ }
+}
+
+/* Compute the key for the hw shader variant */
+static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel,
+ struct si_shader_key *key)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ switch (sel->info.stage) {
+ case MESA_SHADER_VERTEX:
+ if (!sctx->shader.tes.cso && !sctx->shader.gs.cso)
+ si_get_vs_key_outputs(sctx, sel, key);
+ else
+ si_clear_vs_key_outputs(sctx, sel, key);
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ if (sctx->chip_class >= GFX9) {
+ si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog);
+ key->part.tcs.ls = sctx->shader.vs.cso;
}
+ break;
+ case MESA_SHADER_TESS_EVAL:
+ if (!sctx->shader.gs.cso)
+ si_get_vs_key_outputs(sctx, sel, key);
+ else
+ si_clear_vs_key_outputs(sctx, sel, key);
+ break;
+ case MESA_SHADER_GEOMETRY:
+ if (sctx->chip_class >= GFX9) {
+ if (sctx->shader.tes.cso) {
+ si_clear_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog);
+ key->part.gs.es = sctx->shader.tes.cso;
+ } else {
+ si_get_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog);
+ key->part.gs.es = sctx->shader.vs.cso;
+ }
- key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
-
- /* ps_uses_fbfetch is true only if the color buffer is bound. */
- if (sctx->ps_uses_fbfetch && !sctx->blitter_running) {
- struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
- struct pipe_resource *tex = cb0->texture;
-
- /* 1D textures are allocated and used as 2D on GFX9. */
- key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
- key->mono.u.ps.fbfetch_is_1D =
- sctx->chip_class != GFX9 &&
- (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY);
- key->mono.u.ps.fbfetch_layered =
- tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY ||
- tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
- tex->target == PIPE_TEXTURE_3D;
+ /* Only NGG can eliminate GS outputs, because the code is shared with VS. */
+ if (sctx->ngg)
+ si_get_vs_key_outputs(sctx, sel, key);
+ else
+ si_clear_vs_key_outputs(sctx, sel, key);
}
break;
- }
+ case MESA_SHADER_FRAGMENT:
+ si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx);
+ break;
default:
assert(0);
}
-
- if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
- memset(&key->opt, 0, sizeof(key->opt));
}
static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority)
@@ -2138,7 +2239,7 @@ static void si_build_shader_variant(struct si_shader *shader, int thread_index,
si_shader_init_pm4_state(sscreen, shader);
}
-static void si_build_shader_variant_low_priority(void *job, int thread_index)
+static void si_build_shader_variant_low_priority(void *job, void *gdata, int thread_index)
{
struct si_shader *shader = (struct si_shader *)job;
@@ -2151,7 +2252,7 @@ static const struct si_shader_key zeroed;
static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel,
struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key)
+ const struct si_shader_key *key)
{
struct si_shader **mainp = si_get_main_shader_part(sel, key);
@@ -2182,6 +2283,16 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad
return true;
}
+/* A helper to copy *key to *local_key and return local_key. */
+static const struct si_shader_key *
+use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_key)
+{
+ if (key != local_key)
+ memcpy(local_key, key, sizeof(*key));
+
+ return local_key;
+}
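This helper supports a copy-on-write pattern: callers keep a const pointer to the shared key and only switch to a stack copy the first time they need to mutate it, so the state's key is never altered for future shaders. A small sketch of the pattern with a made-up key struct:

#include <stdio.h>
#include <string.h>

struct shader_key {
   int inline_uniforms;
   int opt_level;
};

/* Same idea as use_local_key_copy(): copy only if we aren't already
 * pointing at the local storage, then return the private copy. */
static const struct shader_key *
use_local_copy(const struct shader_key *key, struct shader_key *local)
{
   if (key != local)
      memcpy(local, key, sizeof(*key));
   return local;
}

static void select_variant(const struct shader_key *key, int disable_opts)
{
   struct shader_key local;

   if (disable_opts) {
      /* First mutation: switch to the private copy so the caller's key
       * (shared with future shaders) stays untouched. */
      key = use_local_copy(key, &local);
      local.opt_level = 0;
   }

   printf("selected variant: inline_uniforms=%d opt_level=%d\n",
          key->inline_uniforms, key->opt_level);
}

int main(void)
{
   struct shader_key shared = { .inline_uniforms = 1, .opt_level = 2 };

   select_variant(&shared, 1);
   printf("shared key unchanged: opt_level=%d\n", shared.opt_level);
   return 0;
}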
+
/**
* Select a shader variant according to the shader key.
*
@@ -2189,14 +2300,26 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad
* the compilation isn't finished, don't select any
* shader and return an error.
*/
-int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
- struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key, int thread_index, bool optimized_or_none)
+int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state,
+ const struct si_shader_key *key, int thread_index,
+ bool optimized_or_none)
{
+ struct si_screen *sscreen = sctx->screen;
struct si_shader_selector *sel = state->cso;
struct si_shader_selector *previous_stage_sel = NULL;
struct si_shader *current = state->current;
struct si_shader *iter, *shader = NULL;
+ /* si_shader_select_with_key must not modify 'key' because it would affect future shaders.
+ * If we need to modify it for this specific shader (eg: to disable optimizations), we
+ * use a copy.
+ */
+ struct si_shader_key local_key;
+
+ if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) {
+ /* Disable shader variant optimizations. */
+ key = use_local_key_copy(key, &local_key);
+ memset(&local_key.opt, 0, sizeof(key->opt));
+ }
again:
/* Check if we don't need to change anything.
@@ -2209,7 +2332,8 @@ again:
if (optimized_or_none)
return -1;
- memset(&key->opt, 0, sizeof(key->opt));
+ key = use_local_key_copy(key, &local_key);
+ memset(&local_key.opt, 0, sizeof(key->opt));
goto current_not_ready;
}
@@ -2248,9 +2372,10 @@ current_not_ready:
key->opt.inlined_uniform_values,
MAX_INLINABLE_UNIFORMS * 4) != 0) {
if (variant_count++ > max_inline_uniforms_variants) {
+ key = use_local_key_copy(key, &local_key);
/* Too many variants. Disable inlining for this shader. */
- key->opt.inline_uniforms = 0;
- memset(key->opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4);
+ local_key.opt.inline_uniforms = 0;
+ memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4);
simple_mtx_unlock(&sel->mutex);
goto again;
}
@@ -2267,7 +2392,9 @@ current_not_ready:
if (iter->is_optimized) {
if (optimized_or_none)
return -1;
- memset(&key->opt, 0, sizeof(key->opt));
+
+ key = use_local_key_copy(key, &local_key);
+ memset(&local_key.opt, 0, sizeof(key->opt));
goto again;
}
@@ -2292,9 +2419,14 @@ current_not_ready:
util_queue_fence_init(&shader->ready);
+ if (!sctx->compiler.passes)
+ si_init_compiler(sctx->screen, &sctx->compiler);
+
shader->selector = sel;
shader->key = *key;
- shader->compiler_ctx_state = *compiler_state;
+ shader->compiler_ctx_state.compiler = &sctx->compiler;
+ shader->compiler_ctx_state.debug = sctx->debug;
+ shader->compiler_ctx_state.is_debug_context = sctx->is_debug;
/* If this is a merged shader, get the first shader's selector. */
if (sscreen->info.chip_class >= GFX9) {
@@ -2313,10 +2445,8 @@ current_not_ready:
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong.
- *
- * The prim discard CS doesn't need the main shader part.
*/
- if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
+ if (!is_pure_monolithic) {
bool ok = true;
/* Make sure the main shader part is present. This is needed
@@ -2342,12 +2472,13 @@ current_not_ready:
}
simple_mtx_lock(&previous_stage_sel->mutex);
- ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key);
+ ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state,
+ &shader1_key);
simple_mtx_unlock(&previous_stage_sel->mutex);
}
if (ok) {
- ok = si_check_missing_main_part(sscreen, sel, compiler_state, key);
+ ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, key);
}
if (!ok) {
@@ -2370,8 +2501,7 @@ current_not_ready:
shader->is_monolithic =
is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
- /* The prim discard CS is always optimized. */
- shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
+ shader->is_optimized = !is_pure_monolithic &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* If it's an optimized shader, compile it asynchronously. */
@@ -2391,7 +2521,8 @@ current_not_ready:
}
/* Use the default (unoptimized) shader for now. */
- memset(&key->opt, 0, sizeof(key->opt));
+ key = use_local_key_copy(key, &local_key);
+ memset(&local_key.opt, 0, sizeof(key->opt));
simple_mtx_unlock(&sel->mutex);
if (sscreen->options.sync_compile)
@@ -2426,15 +2557,12 @@ current_not_ready:
return shader->compilation_failed ? -1 : 0;
}
-static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state,
- union si_vgt_stages_key stages_key,
- struct si_compiler_ctx_state *compiler_state)
+int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_key key;
- si_shader_selector_key(ctx, state->cso, stages_key, &key);
- return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false);
+ si_shader_selector_key(ctx, state->cso, &state->key);
+ return si_shader_select_with_key(sctx, state, &state->key, -1, false);
}
static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout,
@@ -2477,7 +2605,7 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, boo
* si_shader_selector initialization. Since it can be done asynchronously,
* there is no way to report compile failures to applications.
*/
-static void si_init_shader_selector_async(void *job, int thread_index)
+static void si_init_shader_selector_async(void *job, void *gdata, int thread_index)
{
struct si_shader_selector *sel = (struct si_shader_selector *)job;
struct si_screen *sscreen = sel->screen;
@@ -2492,6 +2620,19 @@ static void si_init_shader_selector_async(void *job, int thread_index)
if (!compiler->passes)
si_init_compiler(sscreen, compiler);
+ /* The GS copy shader is always pre-compiled. */
+ if (sel->info.stage == MESA_SHADER_GEOMETRY &&
+ (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
+ sel->tess_turns_off_ngg)) {
+ sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
+ if (!sel->gs_copy_shader) {
+ fprintf(stderr, "radeonsi: can't create GS copy shader\n");
+ return;
+ }
+
+ si_shader_vs(sscreen, sel->gs_copy_shader, sel);
+ }
+
/* Serialize NIR to save memory. Monolithic shader variants
* have to deserialize NIR before compilation.
*/
@@ -2576,14 +2717,16 @@ static void si_init_shader_selector_async(void *job, int thread_index)
unsigned i;
for (i = 0; i < sel->info.num_outputs; i++) {
- unsigned offset = shader->info.vs_output_param_offset[i];
+ unsigned semantic = sel->info.output_semantic[i];
+ unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic];
- if (offset <= AC_EXP_PARAM_OFFSET_31)
+ /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */
+ if (G_028644_OFFSET(ps_input_cntl) != 0x20)
continue;
- unsigned semantic = sel->info.output_semantic[i];
unsigned id;
+ /* Remove the output from the mask. */
if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
semantic != VARYING_SLOT_POS &&
semantic != VARYING_SLOT_PSIZ &&
@@ -2596,19 +2739,6 @@ static void si_init_shader_selector_async(void *job, int thread_index)
}
}
- /* The GS copy shader is always pre-compiled. */
- if (sel->info.stage == MESA_SHADER_GEOMETRY &&
- (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
- sel->tess_turns_off_ngg)) {
- sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
- if (!sel->gs_copy_shader) {
- fprintf(stderr, "radeonsi: can't create GS copy shader\n");
- return;
- }
-
- si_shader_vs(sscreen, sel->gs_copy_shader, sel);
- }
-
/* Free NIR. We only keep serialized NIR after this point. */
if (sel->nir) {
ralloc_free(sel->nir);
@@ -2724,18 +2854,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd
? sel->info.num_inputs
: 0;
- sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
+ unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class);
+ sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs);
/* The prolog is a no-op if there are no inputs. */
sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs &&
!sel->info.base.vs.blit_sgprs_amd;
- sel->prim_discard_cs_allowed =
- sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images &&
- !sel->info.uses_bindless_samplers && !sel->info.base.writes_memory &&
- !sel->info.writes_viewport_index &&
- !sel->info.base.vs.window_space_position && !sel->so.num_outputs;
-
if (sel->info.stage == MESA_SHADER_VERTEX ||
sel->info.stage == MESA_SHADER_TESS_CTRL ||
sel->info.stage == MESA_SHADER_TESS_EVAL ||
@@ -2756,8 +2881,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
} else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
semantic != VARYING_SLOT_EDGE) {
sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false);
- sel->outputs_written_before_ps |= 1ull
- << si_shader_io_get_unique_index(semantic, true);
+
+ /* Ignore outputs that are not passed from VS to PS. */
+ if (semantic != VARYING_SLOT_POS &&
+ semantic != VARYING_SLOT_PSIZ &&
+ semantic != VARYING_SLOT_CLIP_VERTEX) {
+ sel->outputs_written_before_ps |= 1ull
+ << si_shader_io_get_unique_index(semantic, true);
+ }
}
}
}
@@ -2824,7 +2955,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
case MESA_SHADER_FRAGMENT:
for (i = 0; i < sel->info.num_inputs; i++) {
- unsigned semantic = sel->info.input_semantic[i];
+ unsigned semantic = sel->info.input[i].semantic;
if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
semantic != VARYING_SLOT_PNTC) {
@@ -2837,9 +2968,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->colors_written_4bit |= 0xf << (4 * i);
for (i = 0; i < sel->info.num_inputs; i++) {
- if (sel->info.input_semantic[i] == VARYING_SLOT_COL0)
+ if (sel->info.input[i].semantic == VARYING_SLOT_COL0)
sel->color_attr_index[0] = i;
- else if (sel->info.input_semantic[i] == VARYING_SLOT_COL1)
+ else if (sel->info.input[i].semantic == VARYING_SLOT_COL1)
sel->color_attr_index[1] = i;
}
break;
@@ -2868,25 +2999,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sscreen->info.chip_class == GFX10_3 ||
(sscreen->info.chip_class == GFX10 &&
sscreen->info.is_pro_graphics)) {
- /* Rough estimates. */
- switch (sctx->family) {
- case CHIP_NAVI10:
- case CHIP_NAVI12:
- case CHIP_SIENNA_CICHLID:
- sel->ngg_cull_vert_threshold = 511;
- break;
- case CHIP_NAVI14:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_VANGOGH:
- sel->ngg_cull_vert_threshold = 255;
- break;
- default:
- assert(!sscreen->use_ngg_culling);
- }
+ sel->ngg_cull_vert_threshold = 128;
}
} else if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
- if (sel->rast_prim == PIPE_PRIM_TRIANGLES &&
+ if (sel->rast_prim != PIPE_PRIM_POINTS &&
(sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) ||
sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS) ||
sscreen->info.chip_class == GFX10_3))
@@ -2894,10 +3010,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
}
}
- /* PA_CL_VS_OUT_CNTL */
- if (sctx->chip_class <= GFX9)
- sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false);
-
sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS :
u_bit_consecutive(0, sel->info.base.clip_distance_array_size);
sel->culldist_mask = u_bit_consecutive(0, sel->info.base.cull_distance_array_size) <<
@@ -3005,11 +3117,10 @@ static void si_update_clip_regs(struct si_context *sctx, struct si_shader_select
(!old_hw_vs ||
(old_hw_vs->info.stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) !=
(next_hw_vs->info.stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) ||
- old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant ||
!next_hw_vs_variant ||
- old_hw_vs_variant->key.opt.kill_clip_distances != next_hw_vs_variant->key.opt.kill_clip_distances))
+ old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl))
si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
}
@@ -3053,9 +3164,10 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha
si_shader_uses_bindless_images(sctx->shader.tcs.cso) ||
si_shader_uses_bindless_images(sctx->shader.tes.cso);
- /* Invalidate inlinable uniforms. */
- sctx->inlinable_uniforms_valid_mask &= ~(1 << type);
+ if (type == PIPE_SHADER_VERTEX || type == PIPE_SHADER_TESS_EVAL || type == PIPE_SHADER_GEOMETRY)
+ sctx->ngg_culling = 0; /* this will be enabled on the first draw if needed */
+ si_invalidate_inlinable_uniforms(sctx, type);
sctx->do_update_shaders = true;
}
@@ -3073,6 +3185,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
sctx->shader.vs.current = sel ? sel->first_variant : NULL;
sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0;
sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false;
+ sctx->fixed_func_tcs_shader.key.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0;
if (si_update_ngg(sctx))
si_shader_change_notify(sctx);
@@ -3084,6 +3197,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
si_get_vs(sctx)->current);
si_update_rasterized_prim(sctx);
+ si_vs_key_update_inputs(sctx);
}
static void si_update_tess_uses_prim_id(struct si_context *sctx)
@@ -3118,7 +3232,7 @@ bool si_update_ngg(struct si_context *sctx)
* VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
* pointers are set.
*/
- if ((sctx->chip_class == GFX10 || sctx->family == CHIP_SIENNA_CICHLID) && !new_ngg) {
+ if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (sctx->chip_class == GFX10) {
/* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
@@ -3179,6 +3293,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
sctx->shader.tcs.cso = sel;
sctx->shader.tcs.current = sel ? sel->first_variant : NULL;
+ sctx->shader.tcs.key.part.tcs.epilog.invoc0_tess_factors_are_def =
+ sel ? sel->info.tessfactors_are_def_in_all_invocs : 0;
si_update_tess_uses_prim_id(sctx);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL);
@@ -3203,6 +3319,14 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
si_update_tess_uses_prim_id(sctx);
+ sctx->shader.tcs.key.part.tcs.epilog.prim_mode =
+ sctx->fixed_func_tcs_shader.key.part.tcs.epilog.prim_mode =
+ sel ? sel->info.base.tess.primitive_mode : 0;
+
+ sctx->shader.tcs.key.part.tcs.epilog.tes_reads_tess_factors =
+ sctx->fixed_func_tcs_shader.key.part.tcs.epilog.tes_reads_tess_factors =
+ sel ? sel->info.reads_tess_factors : 0;
+
si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
si_select_draw_vbo(sctx);
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
@@ -3219,6 +3343,41 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
si_update_rasterized_prim(sctx);
}
+void si_update_ps_kill_enable(struct si_context *sctx)
+{
+ if (!sctx->shader.ps.cso)
+ return;
+
+ unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control |
+ S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS);
+
+ if (sctx->ps_db_shader_control != db_shader_control) {
+ sctx->ps_db_shader_control = db_shader_control;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ if (sctx->screen->dpbb_allowed)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+ }
+}
+
+void si_update_vrs_flat_shading(struct si_context *sctx)
+{
+ if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) {
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_shader_info *info = &sctx->shader.ps.cso->info;
+ bool allow_flat_shading = info->allow_flat_shading;
+
+ if (allow_flat_shading &&
+ (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable ||
+ (!rs->flatshade && info->uses_interp_color)))
+ allow_flat_shading = false;
+
+ if (sctx->allow_flat_shading != allow_flat_shading) {
+ sctx->allow_flat_shading = allow_flat_shading;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+ }
+}
+
static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
@@ -3247,6 +3406,17 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
si_update_ps_colorbuf0_slot(sctx);
+
+ si_ps_key_update_framebuffer(sctx);
+ si_ps_key_update_framebuffer_blend(sctx);
+ si_ps_key_update_blend_rasterizer(sctx);
+ si_ps_key_update_rasterizer(sctx);
+ si_ps_key_update_dsa(sctx);
+ si_ps_key_update_sample_shading(sctx);
+ si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
+ si_update_ps_inputs_read_or_disabled(sctx);
+ si_update_ps_kill_enable(sctx);
+ si_update_vrs_flat_shading(sctx);
}
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
@@ -3257,55 +3427,55 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
util_queue_fence_destroy(&shader->ready);
- if (shader->pm4) {
- /* If destroyed shaders were not unbound, the next compiled
- * shader variant could get the same pointer address and so
- * binding it to the same shader stage would be considered
- * a no-op, causing random behavior.
- */
- switch (shader->selector->info.stage) {
- case MESA_SHADER_VERTEX:
- if (shader->key.as_ls) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ls));
- } else if (shader->key.as_es) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es));
- } else if (shader->key.as_ngg) {
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs));
- } else {
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs));
- }
- break;
- case MESA_SHADER_TESS_CTRL:
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(hs));
- break;
- case MESA_SHADER_TESS_EVAL:
- if (shader->key.as_es) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es));
- } else if (shader->key.as_ngg) {
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs));
- } else {
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs));
- }
- break;
- case MESA_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs));
- else
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs));
- break;
- case MESA_SHADER_FRAGMENT:
- si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ps));
- break;
- default:;
+ /* If destroyed shaders were not unbound, the next compiled
+ * shader variant could get the same pointer address and so
+ * binding it to the same shader stage would be considered
+ * a no-op, causing random behavior.
+ */
+ int state_index = -1;
+
+ switch (shader->selector->info.stage) {
+ case MESA_SHADER_VERTEX:
+ if (shader->key.as_ls) {
+ if (sctx->chip_class <= GFX8)
+ state_index = SI_STATE_IDX(ls);
+ } else if (shader->key.as_es) {
+ if (sctx->chip_class <= GFX8)
+ state_index = SI_STATE_IDX(es);
+ } else if (shader->key.as_ngg) {
+ state_index = SI_STATE_IDX(gs);
+ } else {
+ state_index = SI_STATE_IDX(vs);
+ }
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ state_index = SI_STATE_IDX(hs);
+ break;
+ case MESA_SHADER_TESS_EVAL:
+ if (shader->key.as_es) {
+ if (sctx->chip_class <= GFX8)
+ state_index = SI_STATE_IDX(es);
+ } else if (shader->key.as_ngg) {
+ state_index = SI_STATE_IDX(gs);
+ } else {
+ state_index = SI_STATE_IDX(vs);
}
+ break;
+ case MESA_SHADER_GEOMETRY:
+ if (shader->is_gs_copy_shader)
+ state_index = SI_STATE_IDX(vs);
+ else
+ state_index = SI_STATE_IDX(gs);
+ break;
+ case MESA_SHADER_FRAGMENT:
+ state_index = SI_STATE_IDX(ps);
+ break;
+ default:;
}
si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
si_shader_destroy(shader);
- free(shader);
+ si_pm4_free_state(sctx, &shader->pm4, state_index);
}
static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
@@ -3354,128 +3524,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
si_shader_selector_reference(sctx, &sel, NULL);
}
-static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs,
- unsigned semantic, enum glsl_interp_mode interpolate,
- ubyte fp16_lo_hi_mask)
-{
- struct si_shader_info *vsinfo = &vs->selector->info;
- unsigned offset, ps_input_cntl = 0;
-
- if (interpolate == INTERP_MODE_FLAT ||
- (interpolate == INTERP_MODE_COLOR && sctx->flatshade) ||
- semantic == VARYING_SLOT_PRIMITIVE_ID)
- ps_input_cntl |= S_028644_FLAT_SHADE(1);
-
- if (semantic == VARYING_SLOT_PNTC ||
- (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 &&
- sctx->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) {
- ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
- if (fp16_lo_hi_mask & 0x1) {
- ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
- S_028644_ATTR0_VALID(1);
- }
- }
-
- int vs_slot = vsinfo->output_semantic_to_slot[semantic];
- if (vs_slot >= 0) {
- offset = vs->info.vs_output_param_offset[vs_slot];
-
- if (offset <= AC_EXP_PARAM_OFFSET_31) {
- /* The input is loaded from parameter memory. */
- ps_input_cntl |= S_028644_OFFSET(offset);
- } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
- if (offset == AC_EXP_PARAM_UNDEFINED) {
- /* This can happen with depth-only rendering. */
- offset = 0;
- } else {
- /* The input is a DEFAULT_VAL constant. */
- assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
- offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
- offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
- }
-
- ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
- }
-
- if (fp16_lo_hi_mask && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
- assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000);
-
- ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
- S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) |
- S_028644_DEFAULT_VAL_ATTR1(0) |
- S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */
- S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2));
- }
- } else {
- /* VS output not found. */
- if (semantic == VARYING_SLOT_PRIMITIVE_ID) {
- /* PrimID is written after the last output when HW VS is used. */
- ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
- } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
- /* No corresponding output found, load defaults into input.
- * Don't set any other bits.
- * (FLAT_SHADE=1 completely changes behavior) */
- ps_input_cntl = S_028644_OFFSET(0x20);
- /* D3D 9 behaviour. GL is undefined */
- if (semantic == VARYING_SLOT_COL0)
- ps_input_cntl |= S_028644_DEFAULT_VAL(3);
- }
- }
-
- return ps_input_cntl;
-}
-
-static void si_emit_spi_map(struct si_context *sctx)
-{
- struct si_shader *ps = sctx->shader.ps.current;
- struct si_shader *vs;
- struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
- unsigned i, num_interp, num_written = 0;
- unsigned spi_ps_input_cntl[32];
-
- if (!ps || !ps->selector->info.num_inputs)
- return;
-
- /* With legacy GS, only the GS copy shader contains information about param exports. */
- if (sctx->shader.gs.cso && !sctx->ngg)
- vs = sctx->shader.gs.cso->gs_copy_shader;
- else
- vs = si_get_vs(sctx)->current;
-
- num_interp = si_get_ps_num_interp(ps);
- assert(num_interp > 0);
-
- for (i = 0; i < psinfo->num_inputs; i++) {
- unsigned semantic = psinfo->input_semantic[i];
- unsigned interpolate = psinfo->input_interpolate[i];
- ubyte fp16_lo_hi_mask = psinfo->input_fp16_lo_hi_valid[i];
-
- spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate,
- fp16_lo_hi_mask);
- }
-
- if (ps->key.part.ps.prolog.color_two_side) {
- for (i = 0; i < 2; i++) {
- if (!(psinfo->colors_read & (0xf << (i * 4))))
- continue;
-
- unsigned semantic = VARYING_SLOT_BFC0 + i;
- spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic,
- psinfo->color_interpolate[i],
- false);
- }
- }
- assert(num_interp == num_written);
-
- /* R_028644_SPI_PS_INPUT_CNTL_0 */
- /* Dota 2: Only ~16% of SPI map updates set different values. */
- /* Talos: Only ~9% of SPI map updates set different values. */
- radeon_begin(&sctx->gfx_cs);
- radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
- sctx->tracked_regs.spi_ps_input_cntl, num_interp);
- radeon_end_update_context_roll(sctx);
-}
-
/**
* Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
*/
@@ -3505,17 +3553,17 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs)
radeon_begin(cs);
/* This is required before VGT_FLUSH. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
radeon_end();
}
/* Initialize state related to ESGS / GSVS ring buffers */
-static bool si_update_gs_ring_buffers(struct si_context *sctx)
+bool si_update_gs_ring_buffers(struct si_context *sctx)
{
struct si_shader_selector *es =
sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso;
@@ -3610,11 +3658,11 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
/* Set the GS registers. */
if (sctx->esgs_ring) {
assert(sctx->chip_class <= GFX8);
- radeon_set_uconfig_reg(cs, R_030900_VGT_ESGS_RING_SIZE,
+ radeon_set_uconfig_reg(R_030900_VGT_ESGS_RING_SIZE,
sctx->esgs_ring->width0 / 256);
}
if (sctx->gsvs_ring) {
- radeon_set_uconfig_reg(cs, R_030904_VGT_GSVS_RING_SIZE,
+ radeon_set_uconfig_reg(R_030904_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring->width0 / 256);
}
radeon_end();
@@ -3718,11 +3766,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *s
return 1;
}
-static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
-{
- return shader ? shader->config.scratch_bytes_per_wave : 0;
-}
-
static struct si_shader *si_get_tcs_current(struct si_context *sctx)
{
if (!sctx->shader.tes.cso)
@@ -3745,19 +3788,19 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
if (r < 0)
return false;
if (r == 1)
- si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4);
+ si_pm4_bind_state(sctx, ps, sctx->shader.ps.current);
r = si_update_scratch_buffer(sctx, sctx->shader.gs.current);
if (r < 0)
return false;
if (r == 1)
- si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4);
+ si_pm4_bind_state(sctx, gs, sctx->shader.gs.current);
r = si_update_scratch_buffer(sctx, tcs);
if (r < 0)
return false;
if (r == 1)
- si_pm4_bind_state(sctx, hs, tcs->pm4);
+ si_pm4_bind_state(sctx, hs, tcs);
/* VS can be bound as LS, ES, or VS. */
r = si_update_scratch_buffer(sctx, sctx->shader.vs.current);
@@ -3765,13 +3808,13 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
return false;
if (r == 1) {
if (sctx->shader.vs.current->key.as_ls)
- si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4);
+ si_pm4_bind_state(sctx, ls, sctx->shader.vs.current);
else if (sctx->shader.vs.current->key.as_es)
- si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4);
+ si_pm4_bind_state(sctx, es, sctx->shader.vs.current);
else if (sctx->shader.vs.current->key.as_ngg)
- si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4);
+ si_pm4_bind_state(sctx, gs, sctx->shader.vs.current);
else
- si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4);
+ si_pm4_bind_state(sctx, vs, sctx->shader.vs.current);
}
/* TES can be bound as ES or VS. */
@@ -3780,17 +3823,17 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
return false;
if (r == 1) {
if (sctx->shader.tes.current->key.as_es)
- si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4);
+ si_pm4_bind_state(sctx, es, sctx->shader.tes.current);
else if (sctx->shader.tes.current->key.as_ngg)
- si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4);
+ si_pm4_bind_state(sctx, gs, sctx->shader.tes.current);
else
- si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4);
+ si_pm4_bind_state(sctx, vs, sctx->shader.tes.current);
}
return true;
}
-static bool si_update_spi_tmpring_size(struct si_context *sctx)
+bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
{
/* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
* There are 2 cases to handle:
@@ -3805,17 +3848,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
* Otherwise, the number of waves that can use scratch is
* SPI_TMPRING_SIZE.WAVES.
*/
- unsigned bytes = 0;
-
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.ps.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.gs.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.vs.current));
-
- if (sctx->shader.tes.cso) {
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.tes.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx)));
- }
-
sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
@@ -3834,7 +3866,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
if (!sctx->scratch_buffer)
return false;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b);
}
@@ -3855,7 +3886,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
return true;
}
-static void si_init_tess_factor_ring(struct si_context *sctx)
+void si_init_tess_factor_ring(struct si_context *sctx)
{
assert(!sctx->tess_rings);
assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
@@ -3893,17 +3924,17 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
/* Set tessellation registers. */
radeon_begin(cs);
- radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
+ radeon_set_uconfig_reg(R_030938_VGT_TF_RING_SIZE,
S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
- radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
+ radeon_set_uconfig_reg(R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
if (sctx->chip_class >= GFX10) {
- radeon_set_uconfig_reg(cs, R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
+ radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
S_030984_BASE_HI(factor_va >> 40));
} else if (sctx->chip_class == GFX9) {
- radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
+ radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI,
S_030944_BASE_HI(factor_va >> 40));
}
- radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM,
+ radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM,
sctx->screen->vgt_hs_offchip_param);
radeon_end();
return;
@@ -3955,8 +3986,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
-static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
- union si_vgt_stages_key key)
+struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key)
{
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
uint32_t stages = 0;
@@ -3977,7 +4007,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
}
if (key.u.ngg) {
- stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
+ stages |= S_028B54_PRIMGEN_EN(1) |
S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) |
S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough &&
@@ -3988,9 +4018,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
if (screen->info.chip_class >= GFX9)
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
- if (screen->info.chip_class >= GFX10 &&
- /* GS fast launch hangs with Wave64, so always use Wave32. */
- (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
+ if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
stages |= S_028B54_HS_W32_EN(1) |
S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
S_028B54_VS_W32_EN(1);
@@ -4000,293 +4028,12 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
return pm4;
}
-static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key)
-{
- struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
-
- if (unlikely(!*pm4))
- *pm4 = si_build_vgt_shader_config(sctx->screen, key);
- si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
-}
-
-bool si_update_shaders(struct si_context *sctx)
-{
- struct pipe_context *ctx = (struct pipe_context *)sctx;
- struct si_compiler_ctx_state compiler_state;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_shader *old_vs = si_get_vs(sctx)->current;
- unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0;
- struct si_shader *old_ps = sctx->shader.ps.current;
- union si_vgt_stages_key key;
- unsigned old_spi_shader_col_format =
- old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
- int r;
-
- if (!sctx->compiler.passes)
- si_init_compiler(sctx->screen, &sctx->compiler);
-
- compiler_state.compiler = &sctx->compiler;
- compiler_state.debug = sctx->debug;
- compiler_state.is_debug_context = sctx->is_debug;
-
- key.index = 0;
-
- if (sctx->shader.tes.cso)
- key.u.tess = 1;
- if (sctx->shader.gs.cso)
- key.u.gs = 1;
-
- if (sctx->ngg) {
- key.u.ngg = 1;
- key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
- }
-
- /* Update TCS and TES. */
- if (sctx->shader.tes.cso) {
- if (!sctx->tess_rings) {
- si_init_tess_factor_ring(sctx);
- if (!sctx->tess_rings)
- return false;
- }
-
- if (sctx->shader.tcs.cso) {
- r = si_shader_select(ctx, &sctx->shader.tcs, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4);
- } else {
- if (!sctx->fixed_func_tcs_shader.cso) {
- sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx);
- if (!sctx->fixed_func_tcs_shader.cso)
- return false;
- }
-
- r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4);
- }
-
- if (!sctx->shader.gs.cso || sctx->chip_class <= GFX8) {
- r = si_shader_select(ctx, &sctx->shader.tes, key, &compiler_state);
- if (r)
- return false;
-
- if (sctx->shader.gs.cso) {
- /* TES as ES */
- assert(sctx->chip_class <= GFX8);
- si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4);
- } else if (key.u.ngg) {
- si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4);
- } else {
- si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4);
- }
- }
- } else {
- if (sctx->chip_class <= GFX8)
- si_pm4_bind_state(sctx, ls, NULL);
- si_pm4_bind_state(sctx, hs, NULL);
- }
-
- /* Update GS. */
- if (sctx->shader.gs.cso) {
- r = si_shader_select(ctx, &sctx->shader.gs, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4);
- if (!key.u.ngg) {
- si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4);
-
- if (!si_update_gs_ring_buffers(sctx))
- return false;
- } else {
- si_pm4_bind_state(sctx, vs, NULL);
- }
- } else {
- if (!key.u.ngg) {
- si_pm4_bind_state(sctx, gs, NULL);
- if (sctx->chip_class <= GFX8)
- si_pm4_bind_state(sctx, es, NULL);
- }
- }
-
- /* Update VS. */
- if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
- r = si_shader_select(ctx, &sctx->shader.vs, key, &compiler_state);
- if (r)
- return false;
-
- if (!key.u.tess && !key.u.gs) {
- if (key.u.ngg) {
- si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4);
- si_pm4_bind_state(sctx, vs, NULL);
- } else {
- si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4);
- }
- } else if (sctx->shader.tes.cso) {
- si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4);
- } else {
- assert(sctx->shader.gs.cso);
- si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4);
- }
- }
-
- /* This must be done after the shader variant is selected. */
- if (sctx->ngg) {
- struct si_shader *vs = si_get_vs(sctx)->current;
-
- key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
- key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
- }
-
- sctx->vs_uses_base_instance =
- sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance :
- sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance :
- sctx->shader.gs.current->uses_base_instance;
-
- si_update_vgt_shader_config(sctx, key);
-
- if (old_kill_clip_distances != si_get_vs(sctx)->current->key.opt.kill_clip_distances)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
- if (sctx->shader.ps.cso) {
- unsigned db_shader_control;
-
- r = si_shader_select(ctx, &sctx->shader.ps, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4);
-
- db_shader_control = sctx->shader.ps.cso->db_shader_control |
- S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
-
- if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
- (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
- sctx->sprite_coord_enable != rs->sprite_coord_enable ||
- sctx->flatshade != rs->flatshade) {
- sctx->sprite_coord_enable = rs->sprite_coord_enable;
- sctx->flatshade = rs->flatshade;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
- }
-
- if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) &&
- (!old_ps || old_spi_shader_col_format !=
- sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
- if (sctx->ps_db_shader_control != db_shader_control) {
- sctx->ps_db_shader_control = db_shader_control;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- if (sctx->screen->dpbb_allowed)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
- }
-
- if (sctx->smoothing_enabled !=
- sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) {
- sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
- /* NGG cull state uses smoothing_enabled. */
- if (sctx->screen->use_ngg_culling)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
-
- if (sctx->chip_class == GFX6)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
- if (sctx->framebuffer.nr_samples <= 1)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
- }
-
- if (sctx->chip_class >= GFX10_3) {
- struct si_shader_info *info = &sctx->shader.ps.cso->info;
- bool allow_flat_shading = info->allow_flat_shading;
-
- if (allow_flat_shading &&
- (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable ||
- (!rs->flatshade && info->uses_interp_color)))
- allow_flat_shading = false;
-
- if (sctx->allow_flat_shading != allow_flat_shading) {
- sctx->allow_flat_shading = allow_flat_shading;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
- }
- }
-
- if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
- /* Pretend the bound shaders form a vk pipeline */
- uint32_t pipeline_code_hash = 0;
- uint64_t base_address = ~0;
-
- for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
- struct si_shader *shader = sctx->shaders[i].current;
- if (sctx->shaders[i].cso && shader) {
- pipeline_code_hash = _mesa_hash_data_with_seed(
- shader->binary.elf_buffer,
- shader->binary.elf_size,
- pipeline_code_hash);
- base_address = MIN2(base_address,
- shader->bo->gpu_address);
- }
- }
-
- struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
- if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
- si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
- }
-
- si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
- }
-
- if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) ||
- si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) ||
- si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) {
- if (!si_update_spi_tmpring_size(sctx))
- return false;
- }
-
- if (sctx->chip_class >= GFX7) {
- if (si_pm4_state_enabled_and_changed(sctx, ls))
- sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
- else if (!sctx->queued.named.ls)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
-
- if (si_pm4_state_enabled_and_changed(sctx, hs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
- else if (!sctx->queued.named.hs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
-
- if (si_pm4_state_enabled_and_changed(sctx, es))
- sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
- else if (!sctx->queued.named.es)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
-
- if (si_pm4_state_enabled_and_changed(sctx, gs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
- else if (!sctx->queued.named.gs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
-
- if (si_pm4_state_enabled_and_changed(sctx, vs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
- else if (!sctx->queued.named.vs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
-
- if (si_pm4_state_enabled_and_changed(sctx, ps))
- sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
- else if (!sctx->queued.named.ps)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
- }
-
- sctx->do_update_shaders = false;
- return true;
-}
-
static void si_emit_scratch_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
radeon_begin(cs);
- radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
+ radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
radeon_end();
if (sctx->scratch_buffer) {
@@ -4303,7 +4050,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen)
void si_init_shader_functions(struct si_context *sctx)
{
- sctx->atoms.s.spi_map.emit = si_emit_spi_map;
sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
sctx->b.create_vs_state = si_create_shader;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
index b6656fdc8..e70987d66 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
@@ -46,7 +46,8 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
int modifiers_count = 0;
uint64_t mod = DRM_FORMAT_MOD_LINEAR;
- /* TODO: get tiling working */
+ /* To get tiled buffers, users need to explicitly provide a list of
+ * modifiers. */
vidbuf.bind |= PIPE_BIND_LINEAR;
if (pipe->screen->resource_create_with_modifiers) {
@@ -58,6 +59,33 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
modifiers_count);
}
+struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe,
+ const struct pipe_video_buffer *tmpl,
+ const uint64_t *modifiers,
+ unsigned int modifiers_count)
+{
+ uint64_t *allowed_modifiers;
+ unsigned int allowed_modifiers_count, i;
+
+ /* Filter out DCC modifiers, because DCC isn't supported for video
+ * buffers yet. */
+ allowed_modifiers = calloc(modifiers_count, sizeof(uint64_t));
+ if (!allowed_modifiers)
+ return NULL;
+
+ allowed_modifiers_count = 0;
+ for (i = 0; i < modifiers_count; i++) {
+ if (ac_modifier_has_dcc(modifiers[i]))
+ continue;
+ allowed_modifiers[allowed_modifiers_count++] = modifiers[i];
+ }
+
+ struct pipe_video_buffer *buf =
+ vl_video_buffer_create_as_resource(pipe, tmpl, allowed_modifiers, allowed_modifiers_count);
+ free(allowed_modifiers);
+ return buf;
+}
+
/* set the decoding target buffer offsets */
static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
{