author     Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2022-02-24 02:30:08 +0000
commit     1d35364040c0ffa99133522fa5ab3bd6131d8bf7 (patch)
tree       0ea3d9ca4ad10692c6477168b67e98cb50ea6bd3 /lib/mesa/src/gallium/drivers/radeonsi
parent     b24b5b9049e889ee4eb39b565bcc8d48bd45ab48 (diff)
Merge Mesa 21.3.7
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeonsi')
24 files changed, 2306 insertions, 4872 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk deleted file mode 100644 index e402da639..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk +++ /dev/null @@ -1,95 +0,0 @@ -# Mesa 3-D graphics library -# -# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com> -# Copyright (C) 2010-2011 LunarG Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -LOCAL_PATH := $(call my-dir) - -# get C_SOURCES and GENERATED_SOURCES -include $(LOCAL_PATH)/Makefile.sources - -include $(CLEAR_VARS) - -LOCAL_SRC_FILES := $(C_SOURCES) - -LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions - -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/src/amd/common \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir - -LOCAL_STATIC_LIBRARIES := libmesa_amd_common - -LOCAL_SHARED_LIBRARIES := libdrm_radeon -LOCAL_MODULE := libmesa_pipe_radeonsi - -intermediates := $(call local-generated-sources-dir) - -# We need to get NIR's generated headers. 
-LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES)) - -GEN_DRIINFO_INPUTS := \ - $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ - $(LOCAL_PATH)/driinfo_radeonsi.h - -MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py - -$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) - -GEN10_FORMAT_TABLE_INPUTS := \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \ - $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json - -GEN10_FORMAT_TABLE_DEP := \ - $(MESA_TOP)/src/amd/registers/regdb.py - -GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py - -$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) - -LOCAL_C_INCLUDES += $(intermediates)/radeonsi - -LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) - -$(call mesa-build-with-llvm) - -include $(GALLIUM_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - -ifneq ($(HAVE_GALLIUM_RADEONSI),) -GALLIUM_TARGET_DRIVERS += radeonsi -$(eval GALLIUM_LIBS += \ - $(LOCAL_MODULE) \ - $(LOCAL_STATIC_LIBRARIES) \ - libmesa_winsys_radeon \ - libmesa_winsys_amdgpu) -$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) -endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources b/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources deleted file mode 100644 index 55ef80856..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/Makefile.sources +++ /dev/null @@ -1,75 +0,0 @@ -C_SOURCES := \ - driinfo_radeonsi.h \ - gfx10_query.c \ - gfx10_shader_ngg.c \ - si_blit.c \ - si_buffer.c \ - si_build_pm4.h \ - si_clear.c \ - si_compute.c \ - si_compute_prim_discard.c \ - si_compute.h \ - si_compute_blit.c \ - si_cp_dma.c \ - si_cp_reg_shadowing.c \ - si_debug.c \ - si_descriptors.c \ - si_fence.c \ - si_get.c \ - si_gfx_cs.c \ - si_gpu_load.c \ - si_pipe.c \ - si_pipe.h \ - si_pm4.c \ - si_pm4.h \ - si_perfcounter.c \ - si_public.h \ - si_query.c \ - si_query.h \ - si_shader.c \ - si_shader.h \ - si_shader_internal.h \ - si_shader_llvm.c \ - si_shader_llvm_gs.c \ - si_shader_llvm_ps.c \ - si_shader_llvm_resources.c \ - si_shader_llvm_tess.c \ - si_shader_llvm_vs.c \ - si_shader_nir.c \ - si_shaderlib_nir.c \ - si_shaderlib_tgsi.c \ - si_sqtt.c \ - si_state.c \ - si_state_binning.c \ - si_state_draw.cpp \ - si_state_msaa.c \ - si_state_shaders.c \ - si_state_streamout.c \ - si_state_viewport.c \ - si_state.h \ - si_test_blit.c \ - si_test_dma_perf.c \ - si_texture.c \ - si_uvd.c \ - ../radeon/radeon_uvd.c \ - ../radeon/radeon_uvd.h \ - ../radeon/radeon_vcn_dec_jpeg.c \ - ../radeon/radeon_vcn_dec.c \ - ../radeon/radeon_vcn_dec.h \ - ../radeon/radeon_vcn_av1_default.h \ - ../radeon/radeon_vcn_enc_1_2.c \ - ../radeon/radeon_vcn_enc_2_0.c \ - ../radeon/radeon_vcn_enc_3_0.c \ - ../radeon/radeon_vcn_enc.c \ - ../radeon/radeon_vcn_enc.h \ - ../radeon/radeon_uvd_enc_1_1.c \ - ../radeon/radeon_uvd_enc.c \ - ../radeon/radeon_uvd_enc.h \ - ../radeon/radeon_vce_40_2_2.c \ - ../radeon/radeon_vce_50.c \ - ../radeon/radeon_vce_52.c \ - ../radeon/radeon_vce.c \ - ../radeon/radeon_vce.h \ - ../radeon/radeon_video.c \ - 
../radeon/radeon_video.h \ - ../radeon/radeon_winsys.h diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt deleted file mode 100644 index e69de29bb..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-fails.txt +++ /dev/null diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt deleted file mode 100644 index 69d00870a..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/deqp-radeonsi-stoney-skips.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Note: skips lists for CI are just a list of lines that, when -# non-zero-length and not starting with '#', will regex match to -# delete lines from the test list. Be careful. - -# Skip the perf/stress tests to keep runtime manageable -dEQP-GLES[0-9]*.performance.* -dEQP-GLES[0-9]*.stress.* - -# These are really slow on tiling architectures (including llvmpipe). -dEQP-GLES[0-9]*.functional.flush_finish.* - diff --git a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt b/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt deleted file mode 100644 index e69de29bb..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/ci/radeonsi-stoney-replay.txt +++ /dev/null diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c index 653dfc343..5653ff233 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_blit.c @@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx) /* Restore shader pointers because the VS blit shader changed all * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } @@ -393,11 +395,12 @@ static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex, si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */); } -static void si_decompress_sampler_depth_textures(struct si_context *sctx, +static bool si_decompress_sampler_depth_textures(struct si_context *sctx, struct si_samplers *textures) { unsigned i; unsigned mask = textures->needs_depth_decompress_mask; + bool need_flush = false; while (mask) { struct pipe_sampler_view *view; @@ -416,7 +419,14 @@ static void si_decompress_sampler_depth_textures(struct si_context *sctx, si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? 
PIPE_MASK_S : PIPE_MASK_Z, view->u.tex.first_level, view->u.tex.last_level, 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + + if (tex->need_flush_after_depth_decompression) { + need_flush = true; + tex->need_flush_after_depth_decompression = false; + } } + + return need_flush; } static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex, @@ -755,6 +765,7 @@ static void si_decompress_resident_images(struct si_context *sctx) void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) { unsigned compressed_colortex_counter, mask; + bool need_flush = false; if (sctx->blitter_running) return; @@ -772,7 +783,7 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) unsigned i = u_bit_scan(&mask); if (sctx->samplers[i].needs_depth_decompress_mask) { - si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); + need_flush |= si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); } if (sctx->samplers[i].needs_color_decompress_mask) { si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); @@ -782,6 +793,16 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) } } + if (sctx->chip_class == GFX10_3 && need_flush) { + /* This fixes a corruption with the following sequence: + * - fast clear depth + * - decompress depth + * - draw + * (see https://gitlab.freedesktop.org/drm/amd/-/issues/1810#note_1170171) + */ + sctx->b.flush(&sctx->b, NULL, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW); + } + if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { if (sctx->uses_bindless_samplers) si_decompress_resident_textures(sctx); @@ -1027,7 +1048,7 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst /* Copy. */ si_blitter_begin(sctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, - src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false); + src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false, false); si_blitter_end(sctx); pipe_surface_reference(&dst_view, NULL); @@ -1203,11 +1224,48 @@ resolve_to_temp: static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) { struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *sdst = (struct si_texture *)info->dst.resource; if (do_hardware_msaa_resolve(ctx, info)) { return; } + if (info->is_dri_blit_image && sdst->surface.is_linear && + sctx->chip_class >= GFX7 && sdst->surface.flags & RADEON_SURF_IMPORTED) { + struct si_texture *ssrc = (struct si_texture *)info->src.resource; + /* Use SDMA or async compute when copying to a DRI_PRIME imported linear surface. */ + bool async_copy = info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.z == 0 && + info->src.box.x == 0 && info->src.box.y == 0 && info->src.box.z == 0 && + info->dst.level == 0 && info->src.level == 0 && + info->src.box.width == info->dst.resource->width0 && + info->src.box.height == info->dst.resource->height0 && + info->src.box.depth == 1 && util_can_blit_via_copy_region(info, true); + /* Try SDMA first... */ + /* TODO: figure out why SDMA copies are slow on GFX10_3 */ + if (async_copy && sctx->chip_class < GFX10_3 && si_sdma_copy_image(sctx, sdst, ssrc)) + return; + + /* ... and use async compute as the fallback. 
*/ + if (async_copy) { + struct si_screen *sscreen = sctx->screen; + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (!sscreen->async_compute_context) + si_init_aux_async_compute_ctx(sscreen); + + if (sscreen->async_compute_context) { + si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context, + info->dst.resource, 0, info->src.resource, 0, 0, 0, 0, + &info->src.box, false, 0); + si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL); + simple_mtx_unlock(&sscreen->async_compute_context_lock); + return; + } + + simple_mtx_unlock(&sscreen->async_compute_context_lock); + } + } + if (unlikely(sctx->thread_trace_enabled)) sctx->sqtt_next_event = EventCmdCopyImage; @@ -1276,52 +1334,16 @@ static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *re struct si_texture *tex = (struct si_texture *)res; assert(res->target != PIPE_BUFFER); - assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); - - /* st/dri calls flush twice per frame (not a bug), this prevents double - * decompression. */ - if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) - return; if (!tex->is_depth && (tex->cmask_buffer || vi_dcc_enabled(tex, 0))) { si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0), - tex->dcc_separate_buffer != NULL, false); + false, false); if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { si_retile_dcc(sctx, tex); tex->displayable_dcc_dirty = false; } } - - /* Always do the analysis even if DCC is disabled at the moment. */ - if (tex->dcc_gather_statistics) { - bool separate_dcc_dirty = tex->separate_dcc_dirty; - - /* If the color buffer hasn't been unbound and fast clear hasn't - * been used, separate_dcc_dirty is false, but there may have been - * new rendering. Check if the color buffer is bound and assume - * it's dirty. - * - * Note that DRI2 never unbinds window colorbuffers, which means - * the DCC pipeline statistics query would never be re-set and would - * keep adding new results until all free memory is exhausted if we - * didn't do this. - */ - if (!separate_dcc_dirty) { - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->texture == res) { - separate_dcc_dirty = true; - break; - } - } - } - - if (separate_dcc_dirty) { - tex->separate_dcc_dirty = false; - vi_separate_dcc_process_and_reset_stats(ctx, tex); - } - } } void si_flush_implicit_resources(struct si_context *sctx) diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c index 48ec79ac5..0ae232db2 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c @@ -107,7 +107,7 @@ static void code_object_to_config(const amd_kernel_code_t *code_object, } /* Asynchronous compute shader compilation. 
*/ -static void si_create_compute_state_async(void *job, int thread_index) +static void si_create_compute_state_async(void *job, void *gdata, int thread_index) { struct si_compute *program = (struct si_compute *)job; struct si_shader_selector *sel = &program->sel; @@ -367,11 +367,14 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) { radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); + radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI, + S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + + radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); if (sctx->chip_class == GFX6) { /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID @@ -381,25 +384,25 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf * TODO: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); + radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); } } if (sctx->chip_class >= GFX7) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); /* Disable profiling on compute queues. */ if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) { - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); - radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); + radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); } /* Set the pointer to border colors. 
*/ @@ -407,9 +410,9 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf if (sctx->border_color_buffer) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2, false); - radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ + radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false); + radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ + radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ } } @@ -418,17 +421,19 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf */ if (sctx->chip_class >= GFX9 && (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) { - radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY, + radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY, sctx->chip_class >= GFX10 ? 0x20 : 0); } if (sctx->chip_class >= GFX10) { - radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0); - radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0); - radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0); - radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0); - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 5); + radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */ + radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */ + radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */ + radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */ + radeon_emit(0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */ + + radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); } radeon_end(); } @@ -533,13 +538,11 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute RADEON_PRIO_SHADER_BINARY); radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8); - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, config->rsrc1); - radeon_emit(cs, config->rsrc2); + radeon_set_sh_reg_seq(R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(config->rsrc1); + radeon_emit(config->rsrc2); COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " @@ -549,7 +552,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute sctx->max_seen_compute_scratch_bytes_per_wave = MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); radeon_end(); @@ -592,11 +595,11 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, } radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); - radeon_emit(cs, scratch_dword0); - radeon_emit(cs, scratch_dword1); - radeon_emit(cs, scratch_dword2); - radeon_emit(cs, scratch_dword3); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); + radeon_emit(scratch_dword0); + radeon_emit(scratch_dword1); + radeon_emit(scratch_dword2); + radeon_emit(scratch_dword3); radeon_end(); } @@ -656,9 +659,9 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ dispatch_va 
= dispatch_buf->gpu_address + dispatch_offset; - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, dispatch_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(dispatch_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); si_resource_reference(&dispatch_buf, NULL); user_sgpr += 2; @@ -666,16 +669,16 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, kernel_args_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(kernel_args_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); user_sgpr += 2; } for (i = 0; i < 3 && user_sgpr < 16; i++) { if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); - radeon_emit(cs, info->grid[i]); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); + radeon_emit(info->grid[i]); user_sgpr += 1; } } @@ -740,21 +743,21 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } radeon_begin_again(cs); } else { - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); + radeon_set_sh_reg_seq(grid_size_reg, 3); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); } } if (sel->info.uses_variable_block_size) { - radeon_set_sh_reg(cs, block_size_reg, + radeon_set_sh_reg(block_size_reg, info->block[0] | (info->block[1] << 10) | (info->block[2] << 20)); } if (sel->info.base.cs.user_data_components_amd) { - radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd); - radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); + radeon_set_sh_reg_seq(cs_user_data_reg, sel->info.base.cs.user_data_components_amd); + radeon_emit_array(sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); } radeon_end(); } @@ -780,7 +783,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_begin(cs); radeon_set_sh_reg( - cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + R_00B854_COMPUTE_RESOURCE_LIMITS, ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, sctx->cs_max_waves_per_sh, threadgroups_per_cu)); @@ -793,7 +796,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ const uint *last_block = info->last_block; bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3); if (partial_block_en) { unsigned partial[3]; @@ -803,18 +806,18 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ partial[1] = last_block[1] ? last_block[1] : info->block[1]; partial[2] = last_block[2] ? 
last_block[2] : info->block[2]; - radeon_emit( - cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0])); - radeon_emit( - cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1])); - radeon_emit( - cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]) | + S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]) | + S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]) | + S_00B824_NUM_THREAD_PARTIAL(partial[2])); dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); } else { - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2])); } if (info->indirect) { @@ -823,25 +826,25 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, 1); - radeon_emit(cs, base_va); - radeon_emit(cs, base_va >> 32); + radeon_emit(PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(1); + radeon_emit(base_va); + radeon_emit(base_va >> 32); - radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->indirect_offset); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->indirect_offset); + radeon_emit(dispatch_initiator); } else { - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); + radeon_emit(dispatch_initiator); } if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); } radeon_end(); } @@ -857,6 +860,8 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx) * * buffer object and texture stores performed by shaders are not * automatically synchronized + * + * TODO: Bindless textures are not handled, and thus are not synchronized. 
*/ struct si_shader_info *info = &sctx->cs_shader_state.program->sel.info; struct si_samplers *samplers = &sctx->samplers[PIPE_SHADER_COMPUTE]; @@ -890,18 +895,12 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx) static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) { struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; struct si_compute *program = sctx->cs_shader_state.program; const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc); int i; - /* HW bug workaround when CS threadgroups > 256 threads and async - * compute isn't used, i.e. only one compute job can run at a time. - * If async compute is possible, the threadgroup size must be limited - * to 256 threads on all queues to avoid the bug. - * Only GFX6 and certain GFX7 chips are affected. - */ - bool cs_regalloc_hang = - (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) && - info->block[0] * info->block[1] * info->block[2] > 256; + bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug && + info->block[0] * info->block[1] * info->block[2] > 256; if (cs_regalloc_hang) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c deleted file mode 100644 index 373fd4ffa..000000000 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ /dev/null @@ -1,1580 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" -#include "si_build_pm4.h" -#include "ac_llvm_cull.h" - -#include "util/u_prim.h" -#include "util/u_suballoc.h" -#include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" - -/* Based on: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - -/* This file implements primitive culling using asynchronous compute. - * It's written to be GL conformant. - * - * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it - * in a compute shader. 
The shader processes 1 primitive/thread by invoking - * the VS for each vertex to get the positions, decomposes strips and fans - * into triangles (if needed), eliminates primitive restart (if needed), - * does (W<0) culling, face culling, view XY culling, zero-area and - * small-primitive culling, and generates a new index buffer that doesn't - * contain culled primitives. - * - * The index buffer is generated using the Ordered Count feature of GDS, - * which is an atomic counter that is incremented in the wavefront launch - * order, so that the original primitive order is preserved. - * - * Another GDS ordered counter is used to eliminate primitive restart indices. - * If a restart index lands on an even thread ID, the compute shader has to flip - * the primitive orientation of the whole following triangle strip. The primitive - * orientation has to be correct after strip and fan decomposition for two-sided - * shading to behave correctly. The decomposition also needs to be aware of - * which vertex is the provoking vertex for flat shading to behave correctly. - * - * IB = a GPU command buffer - * - * Both the compute and gfx IBs run in parallel sort of like CE and DE. - * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND - * doesn't continue if its word isn't 0x80000000. Once compute shaders are - * finished culling, the last wave will write the final primitive count from - * GDS directly into the count word of the draw packet in the gfx IB, and - * a CS_DONE event will signal the REWIND packet to continue. It's really - * a direct draw with command buffer patching from the compute queue. - * - * The compute IB doesn't have to start when its corresponding gfx IB starts, - * but can start sooner. The compute IB is signaled to start after the last - * execution barrier in the *previous* gfx IB. This is handled as follows. - * The kernel GPU scheduler starts the compute IB after the previous gfx IB has - * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that - * represents the barrier in the previous gfx IB. - * - * Features: - * - Triangle strips and fans are decomposed into an indexed triangle list. - * The decomposition differs based on the provoking vertex state. - * - Instanced draws are converted into non-instanced draws for 16-bit indices. - * (InstanceID is stored in the high bits of VertexID and unpacked by VS) - * - Primitive restart is fully supported with triangle strips, including - * correct primitive orientation across multiple waves. (restart indices - * reset primitive orientation) - * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling). - * - Back face culling, incl. culling zero-area / degenerate primitives. - * - View XY culling. - * - View Z culling (disabled due to limited impact with perspective projection). - * - Small primitive culling for all MSAA modes and all quant modes. - * - * The following are not implemented: - * - ClipVertex/ClipDistance/CullDistance-based culling. - * - Scissor culling. - * - HiZ culling. - * - * Limitations (and unimplemented features that may be possible to implement): - * - Only triangles, triangle strips, and triangle fans are supported. - * - Primitive restart is only supported with triangle strips. - * - Instancing and primitive restart can't be used together. - * - Instancing is only supported with 16-bit indices and instance count <= 2^16. - * - The instance divisor buffer is unavailable, so all divisors must be - * either 0 or 1. 
- * - Multidraws where the vertex shader reads gl_DrawID are unsupported. - * - No support for tessellation and geometry shaders. - * (patch elimination where tess factors are 0 would be possible to implement) - * - The vertex shader must not contain memory stores. - * - All VS resources must not have a write usage in the command buffer. - * (TODO: all shader buffers currently set the write usage) - * - Bindless textures and images must not occur in the vertex shader. - * - * User data SGPR layout: - * INDEX_BUFFERS: pointer to constants - * 0..3: input index buffer - typed buffer view - * 4..7: output index buffer - typed buffer view - * 8..11: viewport state - scale.xy, translate.xy - * VERTEX_COUNTER: counter address or first primitive ID - * - If unordered memory counter: address of "count" in the draw packet - * and is incremented atomically by the shader. - * - If unordered GDS counter: address of "count" in GDS starting from 0, - * must be initialized to 0 before the dispatch. - * - If ordered GDS counter: the primitive ID that should reset the vertex - * counter to 0 in GDS - * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex - * count to memory if using GDS ordered append - * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if - * using GDS ordered append - * VS.VERTEX_BUFFERS: same value as VS - * VS.CONST_AND_SHADER_BUFFERS: same value as VS - * VS.SAMPLERS_AND_IMAGES: same value as VS - * VS.BASE_VERTEX: same value as VS - * VS.START_INSTANCE: same value as VS - * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives - * per instance for instancing. - * NUM_PRIMS_UDIV_TERMS: - * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing. - * - Bits [5:31]: The number of primitives per instance for computing the remainder. - * PRIMITIVE_RESTART_INDEX - * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number. - * - * - * The code contains 3 codepaths: - * - Unordered memory counter (for debugging, random primitive order, no primitive restart) - * - Unordered GDS counter (for debugging, random primitive order, no primitive restart) - * - Ordered GDS counter (it preserves the primitive order) - * - * How to test primitive restart (the most complicated part because it needs - * to get the primitive orientation right): - * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave - * primitive orientation flips with small draw calls, which is what most tests use. - * You can also enable draw call splitting into draw calls with just 2 primitives. - */ - -/* At least 256 is needed for the fastest wave launch rate from compute queues - * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ -/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ -#define CULL_Z 0 -/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ - -/* Grouping compute dispatches for small draw calls: How many primitives from multiple - * draw calls to process by compute before signaling the gfx IB. 
This reduces the number - * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) -/* Draw call splitting at the packet level. This allows signaling the gfx IB - * for big draw calls sooner, but doesn't allow context flushes between packets. - * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH -/* If there is not enough ring buffer space for the current IB, split draw calls into - * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH - -/* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ - SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ - UINT_MAX & ~(THREADGROUP_SIZE - 1)) - -#define REWIND_SIGNAL_BIT 0x80000000 -/* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 - -void si_initialize_prim_discard_tunables(struct si_context *sctx) -{ - sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sctx->chip_class == GFX6 || /* SI support is not implemented */ - !sctx->screen->info.has_gds_ordered_append || - sctx->screen->debug_flags & DBG(NO_PD) || - /* If aux_context == NULL, we are initializing aux_context right now. */ - !sctx->screen->aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || - sctx->screen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && - sctx->screen->info.is_pro_graphics && - (sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_HAWAII || - sctx->family == CHIP_TONGA || - sctx->family == CHIP_FIJI || - sctx->family == CHIP_POLARIS10 || - sctx->family == CHIP_POLARIS11 || - sctx->family == CHIP_VEGA10 || - sctx->family == CHIP_VEGA20))) { - sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) - sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. - */ - if (sctx->screen->info.vram_size <= 2 * GB) - sctx->index_ring_size_per_ib = 64 * MB; - else if (sctx->screen->info.vram_size <= 4 * GB) - sctx->index_ring_size_per_ib = 128 * MB; - else - sctx->index_ring_size_per_ib = 256 * MB; - } -} - -/* Opcode can be "add" or "swap". 
*/ -static LLVMValueRef -si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, - bool release, bool done) -{ - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->i32_0, /* scope */ - ctx->i1false, /* volatile */ - LLVMConstInt(ctx->i32, ordered_count_index, 0), - LLVMConstInt(ctx->i1, release, 0), - LLVMConstInt(ctx->i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); -} - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) -{ - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); -} - -struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; -}; - -/* Enter a section that only executes on thread 0. */ -static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, - LLVMValueRef thread_id) -{ - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, - ctx->i32_0, ""), 12601); -} - -/* Exit a section that only executes on thread 0 and broadcast the result - * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, - LLVMValueRef *result) -{ - struct si_shader_context *ctx = section->ctx; - - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - - ac_build_endif(&ctx->ac, 12601); - - /* Broadcast the result from thread 0 to all threads. */ - *result = ac_build_readlane(&ctx->ac, - LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); -} - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) -{ - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - LLVMTypeRef const_desc_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_desc_type = ctx->f32; - else - const_desc_type = ctx->v4i32; - - struct si_function_info fninfo; - si_init_function_info(&fninfo); - - LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; - LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; - LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; - LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; - LLVMValueRef last_wave_prim_id, vertex_count_addr; - - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &index_buffers_and_constants); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &vb_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), - &const_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), - &sampler_desc); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); - add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); - - /* Block ID and thread ID inputs. */ - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); - - /* Create the compute shader function. */ - unsigned old_type = ctx->type; - ctx->type = PIPE_SHADER_COMPUTE; - si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); - ctx->type = old_type; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", - GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. 
*/ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = const_desc; - vs_params[num_vs_params++] = sampler_desc; - vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, - S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = base_vertex; - vs_params[num_vs_params++] = start_instance; - vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ - vs_params[num_vs_params++] = vb_desc; - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = - ac_build_imad(&ctx->ac, block_id, - LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->i32_0; - - if (key->opt.cs_instancing) { - /* Unpack num_prims_udiv_terms. */ - LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, - num_prims_udiv_multiplier, - post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, - prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, - LLVMConstInt(ctx->i32, 3, 0), - LLVMConstInt(ctx->i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, - LLVMConstInt(ctx->i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. 
- */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); - index[2] = ctx->i32_0; - } else { - index[0] = ctx->i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, - index[i], ctx->i32_0, 1, - 0, true); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 6, 0), ""); - ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = - LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->i1true; - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->i1false; - - /* Handle primitive restart. */ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. - */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, - LLVMConstInt(ctx->i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); - vertex_counter = LLVMBuildAnd(builder, vertex_counter, - LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - restart_index, ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, - not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. 
- */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = - ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, ""); - - /* This flips the orientatino based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, - gds_prim_restart_continue, ""), ""); - current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, - last_strip_start, ctx->i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. - * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. - * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildOr(builder, is_first_wave, - current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", - ordered_wave_id, last_strip_start, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", - ordered_wave_id, ctx->i32_0, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = LLVMBuildSelect(builder, is_first_wave, - ctx->i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. 
- * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, - strip_start, ctx->i32_0, ""); - first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, - first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = - LLVMBuildXor(builder, first_is_odd, - LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); - - /* Determine the primitive orientation. - * Only swap the vertices that are not the provoking vertex. We need to keep - * the provoking vertex in place. - */ - if (key->opt.cs_provoking_vertex_first) { - LLVMValueRef index1 = index[1]; - LLVMValueRef index2 = index[2]; - index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); - index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); - } else { - LLVMValueRef index0 = index[0]; - LLVMValueRef index1 = index[1]; - index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); - index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); - } - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. */ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, - vp_scale, vp_translate, smallprim_precision, - &options); - - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = - ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, - &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. 
*/
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
- num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
- num_prims_accepted, 0,
- true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(&section, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- last_wave_prim_id, ""), 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count,
- LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - vertex_count_addr, - LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->i32, 4, 0), - LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_MTYPE(3 /* uncached */), 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, - ctx->i32_0, 0, ac_glc | ac_slc, false); - } else { - LLVMBuildStore(builder, count, - si_expand_32bit_pointer(ctx, vertex_count_addr)); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, - LLVMConstInt(ctx->i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. */ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, - vindex, ctx->i32_0, 3, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); -} - -/* Return false if the shader isn't ready. */ -static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. 
*/ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key.opt.cs_need_correct_orientation = - rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = - sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = - sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; - sctx->cs_prim_discard_state.current = NULL; - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, - &compiler_state, &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; -} - -static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) -{ - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : - VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, - RADEON_DOMAIN_GDS, 0); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, - 1, RADEON_DOMAIN_OA, 0); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - - sctx->prim_discard_compute_cs = - ws->cs_add_parallel_compute_ib(sctx->gfx_cs, - num_oa_counters > 0); - if (!sctx->prim_discard_compute_cs) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, - 2 * 1024 * 1024); - if (!sctx->index_ring) - return false; - } - return true; -} - -static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) -{ - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; -} - -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) -{ - /* If the compute shader compilation isn't finished, this returns false. */ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = info->count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && - num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - /* Split draws. */ - struct pipe_draw_info split_draw = *info; - split_draw.primitive_restart = primitive_restart; - - unsigned base_start = split_draw.start; - - if (prim == PIPE_PRIM_TRIANGLES) { - unsigned vert_count_per_subdraw = split_prims_draw_level * 3; - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. 
*/ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - unsigned vert_count_per_subdraw = split_prims_draw_level; - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - - if (start == 0 && - primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } else { - assert(0); - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 20; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; -} - -void si_compute_signal_gfx(struct si_context *sctx) -{ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? 
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : - EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, - sctx->compute_rewind_va | - ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; -} - -/* Dispatch a primitive discard compute shader. */ -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) -{ - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, - sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - * - * TODO: update for GFX10 - */ - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, - COPY_DATA_GDS, NULL, 4, - COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | - S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. */ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | - S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | - S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. 
*/ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, - RADEON_USAGE_READ, RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + - sctx->last_ib_barrier_buf_offset, 1, 1, - WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, - sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, - si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, - (void**)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | - S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : - index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : - V_008F0C_BUF_DATA_FORMAT_32); - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | - S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - - /* Viewport state. - * This is needed by the small primitive culling, because it's done - * in screen space. - */ - float scale[2], translate[2]; - - scale[0] = sctx->viewports.states[0].scale[0]; - scale[1] = sctx->viewports.states[0].scale[1]; - translate[0] = sctx->viewports.states[0].translate[0]; - translate[1] = sctx->viewports.states[0].translate[1]; - - /* The viewport shouldn't flip the X axis for the small prim culling to work. */ - assert(-scale[0] + translate[0] <= scale[0] + translate[0]); - - /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. - * This is because the viewport transformation inverts the clip space - * bounding box, so min becomes max, which breaks small primitive - * culling. 
- */ - if (sctx->viewports.y_inverted) { - scale[1] = -scale[1]; - translate[1] = -translate[1]; - } - - /* Scale the framebuffer up, so that samples become pixels and small - * primitive culling is the same for all sample counts. - * This only works with the standard DX sample positions, because - * the samples are evenly spaced on both X and Y axes. - */ - unsigned num_samples = sctx->framebuffer.nr_samples; - assert(num_samples >= 1); - - for (unsigned i = 0; i < 2; i++) { - scale[i] *= num_samples; - translate[i] *= num_samples; - } - - desc[8] = fui(scale[0]); - desc[9] = fui(scale[1]); - desc[10] = fui(translate[0]); - desc[11] = fui(translate[1]); - - /* Better subpixel precision increases the efficiency of small - * primitive culling. */ - unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; - float small_prim_cull_precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - small_prim_cull_precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - small_prim_cull_precision = num_samples / 1024.0; - else - small_prim_cull_precision = num_samples / 256.0; - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. */ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - } - } - - /* Set shader registers. 
*/ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | - S_00B848_DX10_CLAMP(1)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, - WAVES_PER_TG, - MAX_WAVES_PER_SH, - THREADGROUPS_PER_CU)); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - - si_cp_wait_mem(sctx, gfx_cs, - sctx->compute_rewind_va | - (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, - WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - - /* Continue with the compute IB. 
*/ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && - info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, - VERTEX_COUNTER_GDS_MODE == 0 ? count_va : - VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : - start_prim | - gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | - (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(small_prim_cull_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. */ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, - EOP_DATA_SEL_GDS, - NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), - SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. 
*/ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer(sctx, cs, NULL, offset, - GDS_SIZE_UNORDERED - offset, - 0, - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC | - SI_CPDMA_SKIP_SYNC_BEFORE, - SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } -} diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c index b7aece564..ca2230620 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -100,22 +100,22 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui radeon_begin(cs); if (sctx->chip_class >= GFX7) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); } else { header |= S_411_SRC_ADDR_HI(src_va >> 32); - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(header); /* SRC_ADDR_HI [15:0] + flags. */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(command); } /* CP DMA is executed in ME, but index buffers are read by PFP. @@ -124,8 +124,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui * should precede it. */ if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(0); } radeon_end(); } @@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { + if (coher == SI_COHERENCY_SHADER) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } /** @@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... 
*/ - if (dst && src && (dst != src || dst_offset != src_offset)) { + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, @@ -423,13 +419,13 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(address); /* SRC_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(address); /* DST_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); radeon_end(); } @@ -495,11 +491,11 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned uint64_t va = buf->gpu_address + offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); - radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit_array(cs, (const uint32_t *)data, size / 4); + radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); + radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit_array((const uint32_t *)data, size / 4); radeon_end(); } @@ -519,11 +515,11 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned uint64_t src_va = (src ? 
src->gpu_address : 0ull) + src_offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, src_va); - radeon_emit(cs, src_va >> 32); - radeon_emit(cs, dst_va); - radeon_emit(cs, dst_va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); + radeon_emit(src_va); + radeon_emit(src_va >> 32); + radeon_emit(dst_va); + radeon_emit(dst_va >> 32); radeon_end(); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c index bcc8baa93..540206c15 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_debug.c @@ -344,7 +344,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -390,13 +389,18 @@ static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begi fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); } +void si_print_current_ib(struct si_context *sctx, FILE *f) +{ + si_parse_current_ib(f, &sctx->gfx_cs, 0, sctx->gfx_cs.prev_dw + sctx->gfx_cs.current.cdw, + NULL, 0, "GFX", sctx->chip_class); +} + static void si_log_chunk_type_cs_print(void *data, FILE *f) { struct si_log_chunk_cs *chunk = data; struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -404,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) */ uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -429,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs.priv); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0, - "Compute IB", ctx->chip_class, NULL, NULL); - } else { - si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", - ctx->chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing. 
Time: "); util_dump_ns(f, scs->time_flush); @@ -462,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; - unsigned compute_cur = 0; - - if (ctx->prim_discard_compute_cs.priv) - compute_cur = - ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw; - if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -481,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c index 60daaeb07..f02855743 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_descriptors.c @@ -231,15 +231,6 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_reso priority = si_get_sampler_view_priority(&tex->buffer); radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem); - - if (resource->target == PIPE_BUFFER) - return; - - /* Add separate DCC. */ - if (tex->dcc_separate_buffer) { - radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage, - RADEON_PRIO_SEPARATE_META, check_mem); - } } static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers) @@ -296,7 +287,8 @@ static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, ui void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, const struct legacy_surf_level *base_level_info, unsigned base_level, unsigned first_level, unsigned block_width, - bool is_stencil, uint16_t access, uint32_t *state) + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ + bool is_stencil, uint16_t access, uint32_t * restrict state) { uint64_t va, meta_va = 0; @@ -318,7 +310,6 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture } state[0] = va >> 8; - state[1] &= C_008F14_BASE_ADDRESS_HI; state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); /* Only macrotiled modes can set tile swizzle. @@ -328,11 +319,8 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[0] |= tex->surface.tile_swizzle; if (sscreen->info.chip_class >= GFX8) { - state[6] &= C_008F28_COMPRESSION_EN; - if (!(access & SI_IMAGE_ACCESS_DCC_OFF) && vi_dcc_enabled(tex, first_level)) { - meta_va = - (!tex->dcc_separate_buffer ? 
tex->buffer.gpu_address : 0) + tex->surface.meta_offset; + meta_va = tex->buffer.gpu_address + tex->surface.meta_offset; if (sscreen->info.chip_class == GFX8) { meta_va += tex->surface.u.legacy.color.dcc_level[base_level].dcc_offset; @@ -355,17 +343,12 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[7] = meta_va >> 8; if (sscreen->info.chip_class >= GFX10) { - state[3] &= C_00A00C_SW_MODE; - if (is_stencil) { state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode); } else { state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.swizzle_mode); } - state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED & - C_00A018_WRITE_COMPRESS_ENABLE; - if (meta_va) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, @@ -377,14 +360,21 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8) | - S_00A018_WRITE_COMPRESS_ENABLE((access & SI_IMAGE_ACCESS_DCC_WRITE) != 0); + /* DCC image stores require the following settings: + * - INDEPENDENT_64B_BLOCKS = 0 + * - INDEPENDENT_128B_BLOCKS = 1 + * - MAX_COMPRESSED_BLOCK_SIZE = 128B + * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used) + * + * The same limitations apply to SDMA compressed stores because + * SDMA uses the same DCC codec. + */ + S_00A018_WRITE_COMPRESS_ENABLE(ac_surface_supports_dcc_image_stores(sscreen->info.chip_class, &tex->surface) && + (access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)); } state[7] = meta_va >> 16; } else if (sscreen->info.chip_class == GFX9) { - state[3] &= C_008F1C_SW_MODE; - state[4] &= C_008F20_PITCH; - if (is_stencil) { state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.zs.stencil_swizzle_mode); state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.zs.stencil_epitch); @@ -423,9 +413,7 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture unsigned pitch = base_level_info->nblk_x * block_width; unsigned index = si_tile_mode_index(tex, base_level, is_stencil); - state[3] &= C_008F1C_TILING_INDEX; state[3] |= S_008F1C_TILING_INDEX(index); - state[4] &= C_008F20_PITCH; state[4] |= S_008F20_PITCH(pitch - 1); } @@ -451,13 +439,23 @@ static void si_set_sampler_state_desc(struct si_sampler_state *sstate, } static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview, - struct si_sampler_state *sstate, uint32_t *desc) + struct si_sampler_state *sstate, + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ + uint32_t * restrict desc) { struct pipe_sampler_view *view = &sview->base; struct si_texture *tex = (struct si_texture *)view->texture; - bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; - if (unlikely(!is_buffer && sview->dcc_incompatible)) { + assert(tex); /* views with texture == NULL aren't supported */ + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + memcpy(desc, sview->state, 8 * 4); + memcpy(desc + 8, null_texture_descriptor, 4 * 4); /* Disable FMASK. 
*/ + si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); + return; + } + + if (unlikely(sview->dcc_incompatible)) { if (vi_dcc_enabled(tex, view->u.tex.first_level)) if (!si_texture_disable_dcc(sctx, tex)) si_decompress_dcc(sctx, tex); @@ -465,27 +463,21 @@ static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_ sview->dcc_incompatible = false; } - assert(tex); /* views with texture == NULL aren't supported */ - memcpy(desc, sview->state, 8 * 4); + bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; - if (is_buffer) { - si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); - } else { - bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, - sview->base.u.tex.first_level, sview->block_width, - is_separate_stencil, 0, desc); - } + memcpy(desc, sview->state, 8 * 4); + si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, + sview->base.u.tex.first_level, sview->block_width, + is_separate_stencil, 0, desc); - if (!is_buffer && tex->surface.fmask_size) { + if (tex->surface.fmask_size) { memcpy(desc + 8, sview->fmask_state, 8 * 4); } else { /* Disable FMASK and bind sampler state in [12:15]. */ memcpy(desc + 8, null_texture_descriptor, 4 * 4); if (sstate) - si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12); + si_set_sampler_state_desc(sstate, sview, tex, desc + 12); } } @@ -508,65 +500,106 @@ static bool depth_needs_decompression(struct si_texture *tex) return tex->db_compatible; } -static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot, - struct pipe_sampler_view *view, bool disallow_early_out) +static void si_reset_sampler_view_slot(struct si_samplers *samplers, unsigned slot, + uint32_t * restrict desc) +{ + pipe_sampler_view_reference(&samplers->views[slot], NULL); + memcpy(desc, null_texture_descriptor, 8 * 4); + /* Only clear the lower dwords of FMASK. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + /* Re-set the sampler state if we are transitioning from FMASK. */ + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); +} + +static void si_set_sampler_views(struct si_context *sctx, unsigned shader, + unsigned start_slot, unsigned count, + unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views, + bool disallow_early_out) { struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_sampler_view *sview = (struct si_sampler_view *)view; struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); - unsigned desc_slot = si_get_sampler_slot(slot); - uint32_t *desc = descs->list + desc_slot * 16; + uint32_t unbound_mask = 0; - if (samplers->views[slot] == view && !disallow_early_out) - return; + if (views) { + for (unsigned i = 0; i < count; i++) { + unsigned slot = start_slot + i; + struct si_sampler_view *sview = (struct si_sampler_view *)views[i]; + unsigned desc_slot = si_get_sampler_slot(slot); + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. 
*/ + uint32_t *restrict desc = descs->list + desc_slot * 16; + + if (samplers->views[slot] == &sview->base && !disallow_early_out) { + if (take_ownership) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } + continue; + } - if (view) { - struct si_texture *tex = (struct si_texture *)view->texture; + if (sview) { + struct si_texture *tex = (struct si_texture *)sview->base.texture; + + si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (depth_needs_decompression(tex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + if (vi_dcc_enabled(tex, sview->base.u.tex.first_level) && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } - si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + if (take_ownership) { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + samplers->views[slot] = &sview->base; + } else { + pipe_sampler_view_reference(&samplers->views[slot], &sview->base); + } + samplers->enabled_mask |= 1u << slot; - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } else { - if (depth_needs_decompression(tex)) { - samplers->needs_depth_decompress_mask |= 1u << slot; + /* Since this can flush, it must be done after enabled_mask is + * updated. */ + si_sampler_view_add_buffer(sctx, &tex->buffer.b.b, RADEON_USAGE_READ, + sview->is_stencil_sampler, true); } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); + si_reset_sampler_view_slot(samplers, slot, desc); + unbound_mask |= 1u << slot; } - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - if (vi_dcc_enabled(tex, view->u.tex.first_level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; } - - pipe_sampler_view_reference(&samplers->views[slot], view); - samplers->enabled_mask |= 1u << slot; - - /* Since this can flush, it must be done after enabled_mask is - * updated. */ - si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler, - true); } else { - pipe_sampler_view_reference(&samplers->views[slot], NULL); - memcpy(desc, null_texture_descriptor, 8 * 4); - /* Only clear the lower dwords of FMASK. */ - memcpy(desc + 8, null_texture_descriptor, 4 * 4); - /* Re-set the sampler state if we are transitioning from FMASK. 
*/ - if (samplers->sampler_states[slot]) - si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); + unbind_num_trailing_slots += count; + count = 0; + } - samplers->enabled_mask &= ~(1u << slot); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); + for (unsigned i = 0; i < unbind_num_trailing_slots; i++) { + unsigned slot = start_slot + count + i; + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t * restrict desc = descs->list + desc_slot * 16; + + if (samplers->views[slot]) + si_reset_sampler_view_slot(samplers, slot, desc); } + unbound_mask |= BITFIELD_RANGE(start_slot + count, unbind_num_trailing_slots); + samplers->enabled_mask &= ~unbound_mask; + samplers->needs_depth_decompress_mask &= ~unbound_mask; + samplers->needs_color_decompress_mask &= ~unbound_mask; + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } @@ -582,28 +615,18 @@ static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsi sctx->shader_needs_decompress_mask &= ~shader_bit; } -static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, - unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) +static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start, unsigned count, + unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct si_context *sctx = (struct si_context *)ctx; - int i; if ((!count && !unbind_num_trailing_slots) || shader >= SI_NUM_SHADERS) return; - if (views) { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, views[i], false); - } else { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - } - - for (; i < count + unbind_num_trailing_slots; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - + si_set_sampler_views(sctx, shader, start, count, unbind_num_trailing_slots, + take_ownership, views, false); si_update_shader_needs_decompress_mask(sctx, shader); } @@ -710,7 +733,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i res = si_resource(view->resource); - if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (res->b.b.target == PIPE_BUFFER) { if (view->access & PIPE_IMAGE_ACCESS_WRITE) si_mark_image_range_valid(view); @@ -725,12 +748,15 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i bool uses_dcc = vi_dcc_enabled(tex, level); unsigned access = view->access; + if (uses_dcc && screen->always_allow_dcc_stores) + access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE; + assert(!tex->is_depth); assert(fmask_desc || tex->surface.fmask_offset == 0); if (uses_dcc && !skip_decompress && !(access & SI_IMAGE_ACCESS_DCC_OFF) && - ((!(access & SI_IMAGE_ACCESS_DCC_WRITE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || + ((!(access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { /* If DCC can't be disabled, at least decompress it. 
* The decompression is relatively cheap if the surface @@ -766,7 +792,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc); si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level, util_format_get_blockwidth(view->format), - false, view->access, desc); + false, access, desc); } } @@ -790,7 +816,7 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne if (&images->views[slot] != view) util_copy_image_view(&images->views[slot], view); - if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (res->b.b.target == PIPE_BUFFER) { images->needs_color_decompress_mask &= ~(1 << slot); images->display_dcc_store_mask &= ~(1u << slot); res->bind_history |= PIPE_BIND_SHADER_IMAGE; @@ -804,10 +830,15 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne images->needs_color_decompress_mask &= ~(1 << slot); } - if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) + if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) { images->display_dcc_store_mask |= 1u << slot; - else + + /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). */ + if (shader != PIPE_SHADER_COMPUTE) + tex->displayable_dcc_dirty = true; + } else { images->display_dcc_store_mask &= ~(1u << slot); + } if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) ctx->need_check_render_feedback = true; @@ -992,7 +1023,8 @@ static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_ty /* BUFFER RESOURCES */ -static void si_init_buffer_resources(struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_context *sctx, + struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, short shader_userdata_rel_index, enum radeon_bo_priority priority, @@ -1004,6 +1036,22 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers, buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); + + /* Initialize buffer descriptors, so that we don't have to do it at bind time. 
*/ + for (unsigned i = 0; i < num_buffers; i++) { + uint32_t *desc = descs->list + i * 4; + + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + } } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -1145,7 +1193,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res } } else { if (take_ownership) { - pipe_resource_reference(&buffer, NULL); buffer = input->buffer; } else { pipe_resource_reference(&buffer, input->buffer); @@ -1160,16 +1207,6 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); desc[2] = input->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } buffers->buffers[slot] = buffer; buffers->offsets[slot] = buffer_offset; @@ -1177,14 +1214,27 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res buffers->priority_constbuf, true); buffers->enabled_mask |= 1llu << slot; } else { - /* Clear the descriptor. */ - memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 3); buffers->enabled_mask &= ~(1llu << slot); } sctx->descriptors_dirty |= 1u << descriptors_idx; } +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader) +{ + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (sctx->shaders[shader].key.opt.inline_uniforms) { + sctx->shaders[shader].key.opt.inline_uniforms = false; + memset(sctx->shaders[shader].key.opt.inlined_uniform_values, 0, + sizeof(sctx->shaders[shader].key.opt.inlined_uniform_values)); + sctx->do_update_shaders = true; + } +} + static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, uint slot, bool take_ownership, const struct pipe_constant_buffer *input) @@ -1204,10 +1254,8 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; } - if (slot == 0) { - /* Invalidate current inlinable uniforms. 
*/ - sctx->inlinable_uniforms_valid_mask &= ~(1 << shader); - } + if (slot == 0) + si_invalidate_inlinable_uniforms(sctx, shader); } slot = si_get_constbuf_slot(slot); @@ -1222,9 +1270,24 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); - sctx->inlinable_uniforms_valid_mask |= 1 << shader; - sctx->do_update_shaders = true; + if (shader == PIPE_SHADER_COMPUTE) + return; + + if (!sctx->shaders[shader].key.opt.inline_uniforms) { + /* It's the first time we set the constants. Always update shaders. */ + sctx->shaders[shader].key.opt.inline_uniforms = true; + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); + sctx->do_update_shaders = true; + return; + } + + /* We have already set inlinable constants for this shader. Update the shader only if + * the constants are being changed so as not to update shaders needlessly. + */ + if (memcmp(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4)) { + memcpy(sctx->shaders[shader].key.opt.inlined_uniform_values, values, num_values * 4); + sctx->do_update_shaders = true; + } } void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, @@ -1248,7 +1311,8 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou if (!sbuffer || !sbuffer->buffer) { pipe_resource_reference(&buffers->buffers[slot], NULL); - memset(desc, 0, sizeof(uint32_t) * 4); + /* Clear the descriptor. Only 3 dwords are cleared. The 4th dword is immutable. */ + memset(desc, 0, sizeof(uint32_t) * 3); buffers->enabled_mask &= ~(1llu << slot); buffers->writable_mask &= ~(1llu << slot); sctx->descriptors_dirty |= 1u << descriptors_idx; @@ -1261,16 +1325,6 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); desc[2] = sbuffer->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); buffers->offsets[slot] = sbuffer->buffer_offset; @@ -1417,7 +1471,7 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1879,7 +1933,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx) if (!view || !view->texture || view->texture->target == PIPE_BUFFER) continue; - si_set_sampler_view(sctx, shader, i, samplers->views[i], true); + si_set_sampler_views(sctx, shader, i, 1, 0, false, &samplers->views[i], true); } si_update_shader_needs_decompress_mask(sctx, shader); @@ -1897,11 +1951,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad 
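si_set_inlinable_constants above now keeps the inlined values inside the shader key and requests a shader update only when they actually change, instead of unconditionally forcing one. The pattern on its own, with invented names rather than the driver's structures:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define MAX_INLINE_UNIFORMS 32

struct inline_key {
   bool     inline_uniforms;              /* have any values been set yet? */
   uint32_t values[MAX_INLINE_UNIFORMS];
};

/* Returns true if a new shader variant has to be selected/compiled. */
static bool set_inlinable_constants(struct inline_key *key,
                                    const uint32_t *values, unsigned count)
{
   if (!key->inline_uniforms) {
      /* First time: always take the values and request an update. */
      key->inline_uniforms = true;
      memcpy(key->values, values, count * sizeof(uint32_t));
      return true;
   }

   /* Later calls: request an update only when something changed. */
   if (memcmp(key->values, values, count * sizeof(uint32_t)) != 0) {
      memcpy(key->values, values, count * sizeof(uint32_t));
      return true;
   }
   return false;
}

In the patch the "needs update" result is expressed by setting sctx->do_update_shaders, and the compute stage returns early.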
u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); if (shader == PIPE_SHADER_VERTEX) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; } si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); @@ -1909,12 +1965,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad void si_shader_pointers_mark_dirty(struct si_context *sctx) { + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = - sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; @@ -1963,6 +2021,36 @@ void si_shader_change_notify(struct si_context *sctx) sctx->shader.gs.cso ? GS_ON : GS_OFF, sctx->ngg ? NGG_ON : NGG_OFF, PIPE_SHADER_TESS_EVAL)); + + /* Update as_* flags in shader keys. Ignore disabled shader stages. + * as_ls = VS before TCS + * as_es = VS before GS or TES before GS + * as_ngg = NGG enabled for the last geometry stage. + * If GS sets as_ngg, the previous stage must set as_ngg too. 
+ */ + if (sctx->shader.tes.cso) { + sctx->shader.vs.key.as_ls = 1; + sctx->shader.vs.key.as_es = 0; + sctx->shader.vs.key.as_ngg = 0; + + if (sctx->shader.gs.cso) { + sctx->shader.tes.key.as_es = 1; + sctx->shader.tes.key.as_ngg = sctx->ngg; + sctx->shader.gs.key.as_ngg = sctx->ngg; + } else { + sctx->shader.tes.key.as_es = 0; + sctx->shader.tes.key.as_ngg = sctx->ngg; + } + } else if (sctx->shader.gs.cso) { + sctx->shader.vs.key.as_ls = 0; + sctx->shader.vs.key.as_es = 1; + sctx->shader.vs.key.as_ngg = sctx->ngg; + sctx->shader.gs.key.as_ngg = sctx->ngg; + } else { + sctx->shader.vs.key.as_ls = 0; + sctx->shader.vs.key.as_es = 0; + sctx->shader.vs.key.as_ngg = sctx->ngg; + } } #define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \ @@ -1977,9 +2065,9 @@ void si_shader_change_notify(struct si_context *sctx) struct si_descriptors *descs = &sctx->descriptors[start]; \ unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \ \ - radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \ + radeon_set_sh_reg_seq(sh_offset, count); \ for (int i = 0; i < count; i++) \ - radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \ + radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \ } \ } \ } while (0) @@ -2070,12 +2158,12 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) { struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_shaderbufs_sgpr_index * 4, num_shaderbufs * 4); for (unsigned i = 0; i < num_shaderbufs; i++) - radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4); + radeon_emit_array(&desc->list[si_get_shaderbuf_slot(i) * 4], 4); sctx->compute_shaderbuf_sgprs_dirty = false; } @@ -2085,7 +2173,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_images && sctx->compute_image_sgprs_dirty) { struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_images_sgpr_index * 4, shader->cs_images_num_sgprs); @@ -2099,7 +2187,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) num_sgprs = 4; } - radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs); + radeon_emit_array(&desc->list[desc_offset], num_sgprs); } sctx->compute_image_sgprs_dirty = false; @@ -2123,8 +2211,7 @@ static void si_init_bindless_descriptors(struct si_context *sctx, struct si_desc sctx->num_bindless_descriptors = 1; /* Track which bindless slots are used (or not). */ - util_idalloc_init(&sctx->bindless_used_slots); - util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + util_idalloc_init(&sctx->bindless_used_slots, num_elements); /* Reserve slot 0 because it's an invalid handle for bindless. 
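Spelling out the as_* rules above for the usual stage combinations (a restatement of the code just shown, where "ngg" stands for 1 when NGG is enabled for the context and 0 otherwise; unbound stages are omitted):

  enabled stages          vs.as_ls  vs.as_es  vs.as_ngg  tes.as_es  tes.as_ngg  gs.as_ngg
  VS only                    0         0        ngg          -          -           -
  VS + GS                    0         1        ngg          -          -          ngg
  VS + TCS + TES             1         0         0           0         ngg          -
  VS + TCS + TES + GS        1         0         0           1         ngg         ngg

Note that a VS acting as LS never sets as_ngg in this scheme; NGG only applies from the tessellation-evaluation or geometry stage onwards.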
*/ desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); @@ -2526,7 +2613,7 @@ void si_init_all_descriptors(struct si_context *sctx) rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; } desc = si_const_and_shader_buffer_descriptors(sctx, i); - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots, + si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i], desc, num_buffer_slots, rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER, RADEON_PRIO_CONST_BUFFER); desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); @@ -2556,7 +2643,7 @@ void si_init_all_descriptors(struct si_context *sctx) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } - si_init_buffer_resources(&sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL], + si_init_buffer_resources(sctx, &sctx->internal_bindings, &sctx->descriptors[SI_DESCS_INTERNAL], SI_NUM_INTERNAL_BINDINGS, SI_SGPR_INTERNAL_BINDINGS, /* The second priority is used by * const buffers in RW buffer slots. */ @@ -2577,7 +2664,7 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; sctx->b.set_inlinable_constants = si_set_inlinable_constants; sctx->b.set_shader_buffers = si_set_shader_buffers; - sctx->b.set_sampler_views = si_set_sampler_views; + sctx->b.set_sampler_views = si_pipe_set_sampler_views; sctx->b.create_texture_handle = si_create_texture_handle; sctx->b.delete_texture_handle = si_delete_texture_handle; sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c index fc965cd7a..0bee2f7d0 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -26,141 +26,17 @@ #include "si_query.h" #include "util/u_memory.h" -enum si_pc_block_flags -{ - /* This block is part of the shader engine */ - SI_PC_BLOCK_SE = (1 << 0), - - /* Expose per-instance groups instead of summing all instances (within - * an SE). */ - SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), - - /* Expose per-SE groups instead of summing instances across SEs. */ - SI_PC_BLOCK_SE_GROUPS = (1 << 2), - - /* Shader block */ - SI_PC_BLOCK_SHADER = (1 << 3), - - /* Non-shader block with perfcounters windowed by shaders. */ - SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), -}; - -enum si_pc_reg_layout -{ - /* All secondary selector dwords follow as one block after the primary - * selector dwords for the counters that have secondary selectors. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER1_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT1 - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - */ - SI_PC_MULTI_BLOCK = 0, - - /* Each secondary selector dword follows immediately after the - * corresponding primary. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT - * PERFCOUNTER1_SELECT1 - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - */ - SI_PC_MULTI_ALTERNATE = 1, - - /* All secondary selector dwords follow as one block after all primary - * selector dwords. - * - * Example: - * PERFCOUNTER0_SELECT - * PERFCOUNTER1_SELECT - * PERFCOUNTER2_SELECT - * PERFCOUNTER3_SELECT - * PERFCOUNTER0_SELECT1 - * PERFCOUNTER1_SELECT1 - */ - SI_PC_MULTI_TAIL = 2, - - /* Free-form arrangement of selector registers. */ - SI_PC_MULTI_CUSTOM = 3, - - SI_PC_MULTI_MASK = 3, - - /* Registers are laid out in decreasing rather than increasing order. 
*/ - SI_PC_REG_REVERSE = 4, - - SI_PC_FAKE = 8, -}; - -struct si_pc_block_base { - const char *name; - unsigned num_counters; - unsigned flags; - - unsigned select_or; - unsigned select0; - unsigned counter0_lo; - unsigned *select; - unsigned *counters; - unsigned num_multi; - unsigned num_prelude; - unsigned layout; -}; - -struct si_pc_block_gfxdescr { - struct si_pc_block_base *b; - unsigned selectors; - unsigned instances; -}; - -struct si_pc_block { - const struct si_pc_block_gfxdescr *b; - unsigned num_instances; - - unsigned num_groups; - char *group_names; - unsigned group_name_stride; - - char *selector_names; - unsigned selector_name_stride; -}; - -/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of - * performance counter group IDs. - */ -static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS", - "_PS", "_LS", "_HS", "_CS"}; - -static const unsigned si_pc_shader_type_bits[] = { - 0x7f, - S_036780_ES_EN(1), - S_036780_GS_EN(1), - S_036780_VS_EN(1), - S_036780_PS_EN(1), - S_036780_LS_EN(1), - S_036780_HS_EN(1), - S_036780_CS_EN(1), -}; - -/* Max counters per HW block */ -#define SI_QUERY_MAX_COUNTERS 16 - -#define SI_PC_SHADERS_WINDOWING (1u << 31) +#include "ac_perfcounter.h" struct si_query_group { struct si_query_group *next; - struct si_pc_block *block; + struct ac_pc_block *block; unsigned sub_gid; /* only used during init */ unsigned result_base; /* only used during init */ int se; int instance; unsigned num_counters; - unsigned selectors[SI_QUERY_MAX_COUNTERS]; + unsigned selectors[AC_QUERY_MAX_COUNTERS]; }; struct si_query_counter { @@ -182,525 +58,6 @@ struct si_query_pc { struct si_query_group *groups; }; -static struct si_pc_block_base cik_CB = { - .name = "CB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037000_CB_PERFCOUNTER_FILTER, - .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, - .num_multi = 1, - .num_prelude = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static unsigned cik_CPC_select[] = { - R_036024_CPC_PERFCOUNTER0_SELECT, - R_036010_CPC_PERFCOUNTER0_SELECT1, - R_03600C_CPC_PERFCOUNTER1_SELECT, -}; -static struct si_pc_block_base cik_CPC = { - .name = "CPC", - .num_counters = 2, - - .select = cik_CPC_select, - .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_CPF = { - .name = "CPF", - .num_counters = 2, - - .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, - .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_CPG = { - .name = "CPG", - .num_counters = 2, - - .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, -}; - -static struct si_pc_block_base cik_DB = { - .name = "DB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037100_DB_PERFCOUNTER0_SELECT, - .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, - .num_multi = 3, // really only 2, but there's a gap between registers - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_GDS = { - .name = "GDS", - .num_counters = 4, - - .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, - .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static unsigned cik_GRBM_counters[] = { - 
R_034100_GRBM_PERFCOUNTER0_LO, - R_03410C_GRBM_PERFCOUNTER1_LO, -}; -static struct si_pc_block_base cik_GRBM = { - .name = "GRBM", - .num_counters = 2, - - .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, - .counters = cik_GRBM_counters, -}; - -static struct si_pc_block_base cik_GRBMSE = { - .name = "GRBMSE", - .num_counters = 4, - - .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, - .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, -}; - -static struct si_pc_block_base cik_IA = { - .name = "IA", - .num_counters = 4, - - .select0 = R_036210_IA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_PA_SC = { - .name = "PA_SC", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -/* According to docs, PA_SU counters are only 48 bits wide. */ -static struct si_pc_block_base cik_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_SPI = { - .name = "SPI", - .num_counters = 6, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, - .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_BLOCK, -}; - -static struct si_pc_block_base cik_SQ = { - .name = "SQ", - .num_counters = 16, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, - - .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, - .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15), - .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, -}; - -static struct si_pc_block_base cik_SX = { - .name = "SX", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036900_SX_PERFCOUNTER0_SELECT, - .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_TA = { - .name = "TA", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TD = { - .name = "TD", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCA = { - .name = "TCA", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCC = { - .name = "TCC", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_TCP = { - .name = "TCP", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, - .counter0_lo = 
R_034D00_TCP_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base cik_VGT = { - .name = "VGT", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, - .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, -}; - -static struct si_pc_block_base cik_WD = { - .name = "WD", - .num_counters = 4, - - .select0 = R_036200_WD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034200_WD_PERFCOUNTER0_LO, -}; - -static struct si_pc_block_base cik_MC = { - .name = "MC", - .num_counters = 4, - - .layout = SI_PC_FAKE, -}; - -static struct si_pc_block_base cik_SRBM = { - .name = "SRBM", - .num_counters = 2, - - .layout = SI_PC_FAKE, -}; - -static struct si_pc_block_base gfx10_CHA = { - .name = "CHA", - .num_counters = 4, - - .select0 = R_037780_CHA_PERFCOUNTER0_SELECT, - .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_CHCG = { - .name = "CHCG", - .num_counters = 4, - - .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_CHC = { - .name = "CHC", - .num_counters = 4, - - .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GCR = { - .name = "GCR", - .num_counters = 2, - - .select0 = R_037580_GCR_PERFCOUNTER0_SELECT, - .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GE = { - .name = "GE", - .num_counters = 12, - - .select0 = R_036200_GE_PERFCOUNTER0_SELECT, - .counter0_lo = R_034200_GE_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL1A = { - .name = "GL1A", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT, - .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL1C = { - .name = "GL1C", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL2A = { - .name = "GL2A", - .num_counters = 4, - - .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_GL2C = { - .name = "GL2C", - .num_counters = 4, - - .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static unsigned gfx10_PA_PH_select[] = { - R_037600_PA_PH_PERFCOUNTER0_SELECT, - R_037604_PA_PH_PERFCOUNTER0_SELECT1, - R_037608_PA_PH_PERFCOUNTER1_SELECT, - R_037640_PA_PH_PERFCOUNTER1_SELECT1, - R_03760C_PA_PH_PERFCOUNTER2_SELECT, - R_037644_PA_PH_PERFCOUNTER2_SELECT1, - R_037610_PA_PH_PERFCOUNTER3_SELECT, - R_037648_PA_PH_PERFCOUNTER3_SELECT1, - R_037614_PA_PH_PERFCOUNTER4_SELECT, - R_037618_PA_PH_PERFCOUNTER5_SELECT, - R_03761C_PA_PH_PERFCOUNTER6_SELECT, - R_037620_PA_PH_PERFCOUNTER7_SELECT, -}; -static 
struct si_pc_block_base gfx10_PA_PH = { - .name = "PA_PH", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select = gfx10_PA_PH_select, - .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_CUSTOM, -}; - -static struct si_pc_block_base gfx10_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_RLC = { - .name = "RLC", - .num_counters = 2, - - .select0 = R_037304_RLC_PERFCOUNTER0_SELECT, - .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO, - .num_multi = 0, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_RMI = { - .name = "RMI", - /* Actually 4, but the 2nd counter is missing the secondary selector while - * the 3rd counter has it, which complicates the register layout. */ - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037400_RMI_PERFCOUNTER0_SELECT, - .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -static struct si_pc_block_base gfx10_UTCL1 = { - .name = "UTCL1", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT, - .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO, - .num_multi = 0, - .layout = SI_PC_MULTI_ALTERNATE, -}; - -/* Both the number of instances and selectors varies between chips of the same - * class. We only differentiate by class here and simply expose the maximum - * number over all chips in a class. - * - * Unfortunately, GPUPerfStudio uses the order of performance counter groups - * blindly once it believes it has identified the hardware, so the order of - * blocks here matters. 
- */ -static struct si_pc_block_gfxdescr groups_CIK[] = { - {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, - {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32}, - {&cik_TA, 111}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55}, {&cik_TCP, 154}, - {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19}, - {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22}, - -}; - -static struct si_pc_block_gfxdescr groups_VI[] = { - {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, - {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34}, - {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55}, {&cik_TCP, 180}, - {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27}, - {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24}, - -}; - -static struct si_pc_block_gfxdescr groups_gfx9[] = { - {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16}, - {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208}, - {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57}, {&cik_TCP, 85}, - {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59}, - {&cik_CPC, 35}, -}; - -static struct si_pc_block_gfxdescr groups_gfx10[] = { - {&cik_CB, 461}, - {&gfx10_CHA, 45}, - {&gfx10_CHCG, 35}, - {&gfx10_CHC, 35}, - {&cik_CPC, 47}, - {&cik_CPF, 40}, - {&cik_CPG, 82}, - {&cik_DB, 370}, - {&gfx10_GCR, 94}, - {&cik_GDS, 123}, - {&gfx10_GE, 315}, - {&gfx10_GL1A, 36}, - {&gfx10_GL1C, 64}, - {&gfx10_GL2A, 91}, - {&gfx10_GL2C, 235}, - {&cik_GRBM, 47}, - {&cik_GRBMSE, 19}, - {&gfx10_PA_PH, 960}, - {&cik_PA_SC, 552}, - {&gfx10_PA_SU, 266}, - {&gfx10_RLC, 7}, - {&gfx10_RMI, 258}, - {&cik_SPI, 329}, - {&cik_SQ, 509}, - {&cik_SX, 225}, - {&cik_TA, 226}, - {&cik_TCP, 77}, - {&cik_TD, 61}, - {&gfx10_UTCL1, 15}, -}; - -static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) -{ - return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || - (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); -} - -static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) -{ - return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || - (block->num_instances > 1 && pc->separate_instance); -} - -static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index, - unsigned *base_gid, unsigned *sub_index) -{ - struct si_pc_block *block = pc->blocks; - unsigned bid; - - *base_gid = 0; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - unsigned total = block->num_groups * block->b->selectors; - - if (index < total) { - *sub_index = index; - return block; - } - - index -= total; - *base_gid += block->num_groups; - } - - return NULL; -} - -static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index) -{ - unsigned bid; - struct si_pc_block *block = pc->blocks; - - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - if (*index < block->num_groups) - return block; - *index -= block->num_groups; - } - - return NULL; -} - static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -724,7 +81,7 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) } radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + 
radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value); radeon_end(); } @@ -733,105 +90,37 @@ static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false); - radeon_emit(cs, shaders & 0x7f); - radeon_emit(cs, 0xffffffff); + radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false); + radeon_emit(shaders & 0x7f); + radeon_emit(0xffffffff); radeon_end(); } -static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count, +static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count, unsigned *selectors) { - struct si_pc_block_base *regs = block->b->b; + struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned idx; - unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; - unsigned dw; assert(count <= regs->num_counters); - if (regs->layout & SI_PC_FAKE) + /* Fake counters. */ + if (!regs->select0) return; radeon_begin(cs); - if (layout_multi == SI_PC_MULTI_BLOCK) { - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - dw = count + regs->num_prelude; - if (count >= regs->num_multi) - dw += regs->num_multi; - radeon_set_uconfig_reg_seq(cs, regs->select0, dw, false); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - if (count < regs->num_multi) { - unsigned select1 = regs->select0 + 4 * regs->num_multi; - radeon_set_uconfig_reg_seq(cs, select1, count, false); - } - - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) { + radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false); + radeon_emit(selectors[idx] | regs->select_or); + } - if (count > regs->num_multi) { - for (idx = regs->num_multi; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - } - } else if (layout_multi == SI_PC_MULTI_TAIL) { - unsigned select1, select1_count; - - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude, false); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - select1 = regs->select0 + 4 * regs->num_counters; - select1_count = MIN2(count, regs->num_multi); - radeon_set_uconfig_reg_seq(cs, select1, select1_count, false); - for (idx = 0; idx < select1_count; ++idx) - radeon_emit(cs, 0); - } else if (layout_multi == SI_PC_MULTI_CUSTOM) { - unsigned *reg = regs->select; - for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_set_uconfig_reg(cs, *reg++, 0); - } - } else { - assert(layout_multi == SI_PC_MULTI_ALTERNATE); - - unsigned reg_base = regs->select0; - unsigned reg_count = count + MIN2(count, regs->num_multi); - reg_count += regs->num_prelude; - - if (!(regs->layout & SI_PC_REG_REVERSE)) { - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, false); - - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_emit(cs, 0); - } - } else { - reg_base -= (reg_count - 1) * 4; - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count, 
false); - - for (idx = count; idx > 0; --idx) { - if (idx <= regs->num_multi) - radeon_emit(cs, 0); - radeon_emit(cs, selectors[idx - 1] | regs->select_or); - } - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - } + for (idx = 0; idx < regs->num_spm_counters; idx++) { + radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false); + radeon_emit(0); } + radeon_end(); } @@ -843,11 +132,11 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer COPY_DATA_IMM, NULL, 1); radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); radeon_end(); } @@ -863,20 +152,20 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); radeon_set_uconfig_reg( - cs, R_036020_CP_PERFMON_CNTL, + R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); radeon_end(); } -static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count, +static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count, uint64_t va) { - struct si_pc_block_base *regs = block->b->b; + struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned idx; unsigned reg = regs->counter0_lo; @@ -884,33 +173,31 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, radeon_begin(cs); - if (!(regs->layout & SI_PC_FAKE)) { - if (regs->layout & SI_PC_REG_REVERSE) - reg_delta = -reg_delta; - + if (regs->select0) { for (idx = 0; idx < count; ++idx) { if (regs->counters) reg = regs->counters[idx]; - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_COUNT_SEL); /* 64 bits */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(reg >> 2); + radeon_emit(0); /* unused */ + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); reg += reg_delta; } } else { + /* Fake counters. 
*/ for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); - radeon_emit(cs, 0); /* immediate */ - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); + radeon_emit(0); /* immediate */ + radeon_emit(0); + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); } } @@ -938,10 +225,10 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b radeon_begin(&sctx->gfx_cs); if (sctx->chip_class >= GFX10) { - radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit)); } else if (sctx->chip_class >= GFX8) { - radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit)); } radeon_end(); @@ -966,7 +253,7 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery) si_inhibit_clockgating(sctx, &sctx->gfx_cs, true); for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; if (group->se != current_se || group->instance != current_instance) { current_se = group->se; @@ -997,11 +284,11 @@ static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery si_pc_emit_stop(sctx, query->buffer.buf, va); for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; unsigned se = group->se >= 0 ? 
group->se : 0; unsigned se_end = se + 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0)) se_end = sctx->screen->info.max_se; do { @@ -1102,8 +389,9 @@ static const struct si_query_ops batch_query_ops = { }; static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query, - struct si_pc_block *block, unsigned sub_gid) + struct ac_pc_block *block, unsigned sub_gid) { + struct si_perfcounters *pc = screen->perfcounters; struct si_query_group *group = query->groups; while (group) { @@ -1119,20 +407,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s group->block = block; group->sub_gid = sub_gid; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + if (block->b->b->flags & AC_PC_BLOCK_SHADER) { unsigned sub_gids = block->num_instances; unsigned shader_id; unsigned shaders; unsigned query_shaders; - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) + if (ac_pc_block_has_per_se_groups(&pc->base, block)) sub_gids = sub_gids * screen->info.max_se; shader_id = sub_gid / sub_gids; sub_gid = sub_gid % sub_gids; - shaders = si_pc_shader_type_bits[shader_id]; + shaders = ac_pc_shader_type_bits[shader_id]; - query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; + query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING; if (query_shaders && query_shaders != shaders) { fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); FREE(group); @@ -1141,20 +429,20 @@ static struct si_query_group *get_group_state(struct si_screen *screen, struct s query->shaders = shaders; } - if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { + if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { // A non-zero value in query->shaders ensures that the shader // masking is reset unless the user explicitly requests one. 
- query->shaders = SI_PC_SHADERS_WINDOWING; + query->shaders = AC_PC_SHADERS_WINDOWING; } - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { + if (ac_pc_block_has_per_se_groups(&pc->base, block)) { group->se = sub_gid / block->num_instances; sub_gid = sub_gid % block->num_instances; } else { group->se = -1; } - if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { + if (ac_pc_block_has_per_instance_groups(&pc->base, block)) { group->instance = sub_gid; } else { group->instance = -1; @@ -1171,7 +459,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ { struct si_screen *screen = (struct si_screen *)ctx->screen; struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; struct si_query_group *group; struct si_query_pc *query; unsigned base_gid, sub_gid, sub_index; @@ -1196,7 +484,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ goto error; block = - lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); if (!block) goto error; @@ -1221,11 +509,11 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ i = 0; for (group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + struct ac_pc_block *block = group->block; unsigned read_dw; unsigned instances = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0) instances = screen->info.max_se; if (group->instance < 0) instances *= block->num_instances; @@ -1240,7 +528,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ } if (query->shaders) { - if (query->shaders == SI_PC_SHADERS_WINDOWING) + if (query->shaders == AC_PC_SHADERS_WINDOWING) query->shaders = 0xffffffff; } @@ -1248,10 +536,10 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ query->counters = CALLOC(num_queries, sizeof(*query->counters)); for (i = 0; i < num_queries; ++i) { struct si_query_counter *counter = &query->counters[i]; - struct si_pc_block *block; + struct ac_pc_block *block; block = - lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); sub_gid = sub_index / block->b->selectors; sub_index = sub_index % block->b->selectors; @@ -1268,7 +556,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_ counter->stride = group->num_counters; counter->qwords = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0) counter->qwords = screen->info.max_se; if (group->instance < 0) counter->qwords *= block->num_instances; @@ -1281,96 +569,11 @@ error: return NULL; } -static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block) -{ - bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); - bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); - unsigned i, j, k; - unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; - unsigned namelen; - char *groupname; - char *p; - - if (per_instance_groups) - groups_instance = block->num_instances; - if (per_se_groups) - groups_se = 
screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); - - namelen = strlen(block->b->b->name); - block->group_name_stride = namelen + 1; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->group_name_stride += 3; - if (per_se_groups) { - assert(groups_se <= 10); - block->group_name_stride += 1; - - if (per_instance_groups) - block->group_name_stride += 1; - } - if (per_instance_groups) { - assert(groups_instance <= 100); - block->group_name_stride += 2; - } - - block->group_names = MALLOC(block->num_groups * block->group_name_stride); - if (!block->group_names) - return false; - - groupname = block->group_names; - for (i = 0; i < groups_shader; ++i) { - const char *shader_suffix = si_pc_shader_type_suffixes[i]; - unsigned shaderlen = strlen(shader_suffix); - for (j = 0; j < groups_se; ++j) { - for (k = 0; k < groups_instance; ++k) { - strcpy(groupname, block->b->b->name); - p = groupname + namelen; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - strcpy(p, shader_suffix); - p += shaderlen; - } - - if (per_se_groups) { - p += sprintf(p, "%d", j); - if (per_instance_groups) - *p++ = '_'; - } - - if (per_instance_groups) - p += sprintf(p, "%d", k); - - groupname += block->group_name_stride; - } - } - } - - assert(block->b->selectors <= 1000); - block->selector_name_stride = block->group_name_stride + 4; - block->selector_names = - MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride); - if (!block->selector_names) - return false; - - groupname = block->group_names; - p = block->selector_names; - for (i = 0; i < block->num_groups; ++i) { - for (j = 0; j < block->b->selectors; ++j) { - sprintf(p, "%s_%03d", groupname, j); - p += block->selector_name_stride; - } - groupname += block->group_name_stride; - } - - return true; -} - int si_get_perfcounter_info(struct si_screen *screen, unsigned index, struct pipe_driver_query_info *info) { struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; unsigned base_gid, sub; if (!pc) @@ -1379,19 +582,19 @@ int si_get_perfcounter_info(struct si_screen *screen, unsigned index, if (!info) { unsigned bid, num_queries = 0; - for (bid = 0; bid < pc->num_blocks; ++bid) { - num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups; + for (bid = 0; bid < pc->base.num_blocks; ++bid) { + num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups; } return num_queries; } - block = lookup_counter(pc, index, &base_gid, &sub); + block = ac_lookup_counter(&pc->base, index, &base_gid, &sub); if (!block) return 0; if (!block->selector_names) { - if (!si_init_block_names(screen, block)) + if (!ac_init_block_names(&screen->info, &pc->base, block)) return 0; } info->name = block->selector_names + sub * block->selector_name_stride; @@ -1410,20 +613,20 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, struct pipe_driver_query_group_info *info) { struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; + struct ac_pc_block *block; if (!pc) return 0; if (!info) - return pc->num_groups; + return pc->base.num_groups; - block = lookup_group(pc, &index); + block = ac_lookup_group(&pc->base, &index); if (!block) return 0; if (!block->group_names) { - if (!si_init_block_names(screen, block)) + if (!ac_init_block_names(&screen->info, &pc->base, block)) return 0; } info->name = block->group_names + index * block->group_name_stride; @@ -1435,100 
+638,31 @@ int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, void si_destroy_perfcounters(struct si_screen *screen) { struct si_perfcounters *pc = screen->perfcounters; - unsigned i; if (!pc) return; - for (i = 0; i < pc->num_blocks; ++i) { - FREE(pc->blocks[i].group_names); - FREE(pc->blocks[i].selector_names); - } - FREE(pc->blocks); + ac_destroy_perfcounters(&pc->base); FREE(pc); screen->perfcounters = NULL; } void si_init_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc; - const struct si_pc_block_gfxdescr *blocks; - unsigned num_blocks; - unsigned i; - - switch (screen->info.chip_class) { - case GFX7: - blocks = groups_CIK; - num_blocks = ARRAY_SIZE(groups_CIK); - break; - case GFX8: - blocks = groups_VI; - num_blocks = ARRAY_SIZE(groups_VI); - break; - case GFX9: - blocks = groups_gfx9; - num_blocks = ARRAY_SIZE(groups_gfx9); - break; - case GFX10: - case GFX10_3: - blocks = groups_gfx10; - num_blocks = ARRAY_SIZE(groups_gfx10); - break; - case GFX6: - default: - return; /* not implemented */ - } - - screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); - if (!pc) - return; + bool separate_se, separate_instance; - pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); - pc->num_instance_cs_dwords = 3; - - pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); - pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - - pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); - if (!pc->blocks) - goto error; - pc->num_blocks = num_blocks; - - for (i = 0; i < num_blocks; ++i) { - struct si_pc_block *block = &pc->blocks[i]; - block->b = &blocks[i]; - block->num_instances = MAX2(1, block->b->instances); - - if (!strcmp(block->b->b->name, "CB") || - !strcmp(block->b->b->name, "DB") || - !strcmp(block->b->b->name, "RMI")) - block->num_instances = screen->info.max_se; - else if (!strcmp(block->b->b->name, "TCC")) - block->num_instances = screen->info.max_tcc_blocks; - else if (!strcmp(block->b->b->name, "IA")) - block->num_instances = MAX2(1, screen->info.max_se / 2); - else if (!strcmp(block->b->b->name, "TA") || - !strcmp(block->b->b->name, "TCP") || - !strcmp(block->b->b->name, "TD")) { - block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa); - } + separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - if (si_pc_block_has_per_instance_groups(pc, block)) { - block->num_groups = block->num_instances; - } else { - block->num_groups = 1; - } + screen->perfcounters = CALLOC_STRUCT(si_perfcounters); + if (!screen->perfcounters) + return; - if (si_pc_block_has_per_se_groups(pc, block)) - block->num_groups *= screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); + screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); + screen->perfcounters->num_instance_cs_dwords = 3; - pc->num_groups += block->num_groups; + if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance, + &screen->perfcounters->base)) { + si_destroy_perfcounters(screen); } - - return; - -error: - si_destroy_perfcounters(screen); } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c index 6196f2158..b812f170c 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.c @@ -35,6 +35,7 @@ 
#include "sid.h" #include "ac_shadowed_regs.h" #include "util/disk_cache.h" +#include "util/u_cpu_detect.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_suballoc.h" @@ -80,29 +81,25 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"compute", DBG(COMPUTE), "Print compute info"}, {"vm", DBG(VM), "Print virtual addresses when creating resources"}, {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, + {"ib", DBG(IB), "Print command buffers."}, /* Driver options: */ {"nowc", DBG(NO_WC), "Disable GTT write combining"}, {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, {"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."}, + {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"}, /* 3D engine options: */ {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."}, {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, - {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."}, {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, - {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, - {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, - {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, - {"nodfsm", DBG(NO_DFSM), "Disable DFSM."}, {"dpbb", DBG(DPBB), "Enable DPBB."}, - {"dfsm", DBG(DFSM), "Enable DFSM."}, {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"}, {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"}, {"notiling", DBG(NO_TILING), "Disable tiling"}, @@ -110,9 +107,11 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"nodisplaydcc", DBG(NO_DISPLAY_DCC), "Disable display DCC"}, {"nodcc", DBG(NO_DCC), "Disable DCC."}, {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."}, - {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"}, + {"nodccstore", DBG(NO_DCC_STORE), "Disable DCC stores"}, + {"dccstore", DBG(DCC_STORE), "Enable DCC stores"}, {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"}, {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"}, + {"nodma", DBG(NO_DMA), "Disable SDMA-copy for DRI_PRIME"}, {"tmz", DBG(TMZ), "Force allocation of scanout/depth/stencil buffer as encrypted"}, {"sqtt", DBG(SQTT), "Enable SQTT"}, @@ -142,7 +141,6 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil enum ac_target_machine_options tm_options = (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | - (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | (create_low_opt_compiler ? 
AC_TM_CREATE_LOW_OPT : 0); @@ -150,12 +148,24 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); compiler->passes = ac_create_llvm_passes(compiler->tm); - if (compiler->tm_wave32) - compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); if (compiler->low_opt_tm) compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } +void si_init_aux_async_compute_ctx(struct si_screen *sscreen) +{ + assert(!sscreen->async_compute_context); + sscreen->async_compute_context = si_create_context( + &sscreen->b, + SI_CONTEXT_FLAG_AUX | + (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + PIPE_CONTEXT_COMPUTE_ONLY); + + /* Limit the numbers of waves allocated for this context. */ + if (sscreen->async_compute_context) + ((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2; +} + static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { ac_destroy_llvm_compiler(compiler); @@ -255,8 +265,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); if (sctx->cs_dcc_decompress) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress); - if (sctx->cs_dcc_retile) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) { + if (sctx->cs_dcc_retile[i]) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]); + } if (sctx->no_velems_state) sctx->b.delete_vertex_elements_state(&sctx->b, sctx->no_velems_state); @@ -284,17 +296,6 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->blitter) util_blitter_destroy(sctx->blitter); - /* Release DCC stats. */ - for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - assert(!sctx->dcc_stats[i].query_active); - - for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) - if (sctx->dcc_stats[i].ps_stats[j]) - sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]); - - si_texture_reference(&sctx->dcc_stats[i].tex, NULL); - } - if (sctx->query_result_shader) sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); if (sctx->sh_query_result_shader) @@ -303,6 +304,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->ws->cs_destroy(&sctx->gfx_cs); if (sctx->ctx) sctx->ws->ctx_destroy(sctx->ctx); + if (sctx->sdma_cs) { + sctx->ws->cs_destroy(sctx->sdma_cs); + free(sctx->sdma_cs); + } if (sctx->dirty_implicit_resources) _mesa_hash_table_destroy(sctx->dirty_implicit_resources, @@ -321,12 +326,8 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); si_resource_reference(&sctx->shadowed_regs, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL); @@ -503,7 +504,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign /* Initialize private allocators. 
*/ u_suballocator_init(&sctx->allocator_zeroed_memory, &sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false); + SI_RESOURCE_FLAG_CLEAR | SI_RESOURCE_FLAG_32BIT, false); sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0); if (!sctx->cached_gtt_allocator) @@ -552,6 +553,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign } sctx->ngg = sscreen->use_ngg; + si_shader_change_notify(sctx); /* Initialize context functions used by graphics and compute. */ if (sctx->chip_class >= GFX10) @@ -588,6 +590,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_state_functions(sctx); si_init_streamout_functions(sctx); si_init_viewport_functions(sctx); + si_init_spi_map_functions(sctx); sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) @@ -607,27 +610,46 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter); sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; - si_init_draw_functions(sctx); - - si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } else { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; + switch (sctx->chip_class) { + case GFX6: + si_init_draw_functions_GFX6(sctx); + break; + case GFX7: + si_init_draw_functions_GFX7(sctx); + break; + case GFX8: + si_init_draw_functions_GFX8(sctx); + break; + case GFX9: + si_init_draw_functions_GFX9(sctx); + break; + case GFX10: + si_init_draw_functions_GFX10(sctx); + break; + case GFX10_3: + si_init_draw_functions_GFX10_3(sctx); + break; + default: + unreachable("unhandled chip class"); + } } sctx->sample_mask = 0xffff; /* Initialize multimedia functions. */ - if (sscreen->info.has_hw_decode) { + if (sscreen->info.has_video_hw.uvd_decode || sscreen->info.has_video_hw.vcn_decode || + sscreen->info.has_video_hw.jpeg_decode || sscreen->info.has_video_hw.vce_encode || + sscreen->info.has_video_hw.uvd_encode || sscreen->info.has_video_hw.vcn_encode) { sctx->b.create_video_codec = si_uvd_create_decoder; sctx->b.create_video_buffer = si_video_buffer_create; + if (screen->resource_create_with_modifiers) + sctx->b.create_video_buffer_with_modifiers = si_video_buffer_create_with_modifiers; } else { sctx->b.create_video_codec = vl_create_decoder; sctx->b.create_video_buffer = vl_video_buffer_create; } - if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, @@ -707,11 +729,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign if (!sctx->dirty_implicit_resources) goto fail; - sctx->sample_pos_buffer = - pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions)); - pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions), - &sctx->sample_positions); - /* The remainder of this function initializes the gfx CS and must be last. */ assert(sctx->gfx_cs.current.cdw == 0); @@ -719,6 +736,23 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_cp_reg_shadowing(sctx); } + /* Set immutable fields of shader keys. 
*/ + if (sctx->chip_class >= GFX9) { + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * This also enables jumping over the VS prolog for HS-only waves. + * + * When the LS VGPR fix is needed, monolithic shaders can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + sctx->shader.tcs.key.opt.prefer_mono = 1; + + /* This enables jumping over the VS prolog for GS-only waves. */ + sctx->shader.gs.key.opt.prefer_mono = 1; + } + si_begin_new_gfx_cs(sctx, true); assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size); @@ -763,6 +797,13 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); } simple_mtx_unlock(&sscreen->aux_context_lock); + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (status != PIPE_NO_RESET && sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + sscreen->async_compute_context = NULL; + } + simple_mtx_unlock(&sscreen->async_compute_context_lock); } sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw; @@ -773,12 +814,23 @@ fail: return NULL; } +static bool si_is_resource_busy(struct pipe_screen *screen, struct pipe_resource *resource, + unsigned usage) +{ + struct radeon_winsys *ws = ((struct si_screen *)screen)->ws; + + return !ws->buffer_wait(ws, si_resource(resource)->buf, 0, + /* If mapping for write, we need to wait for all reads and writes. + * If mapping for read, we only need to wait for writes. + */ + usage & PIPE_MAP_WRITE ? RADEON_USAGE_READWRITE : RADEON_USAGE_WRITE); +} + static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv, unsigned flags) { struct si_screen *sscreen = (struct si_screen *)screen; struct pipe_context *ctx; - uint64_t total_ram; if (sscreen->debug_flags & DBG(CHECK_VM)) flags |= PIPE_CONTEXT_DEBUG; @@ -806,14 +858,19 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v /* Use asynchronous flushes only on amdgpu, since the radeon * implementation for fence_server_sync is incomplete. */ - struct pipe_context * tc = threaded_context_create( - ctx, &sscreen->pool_transfers, si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - &((struct si_context *)ctx)->tc); - - if (tc && tc != ctx && os_get_total_physical_memory(&total_ram)) { - ((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 4; - } + struct pipe_context *tc = + threaded_context_create(ctx, &sscreen->pool_transfers, + si_replace_buffer_storage, + &(struct threaded_context_options){ + .create_fence = sscreen->info.is_amdgpu ? 
+ si_create_fence : NULL, + .is_resource_busy = si_is_resource_busy, + .driver_calls_flush_notify = true, + }, + &((struct si_context *)ctx)->tc); + + if (tc && tc != ctx) + threaded_context_init_bytes_mapped_limit((struct threaded_context *)tc, 4); return tc; } @@ -853,6 +910,11 @@ static void si_destroy_screen(struct pipe_screen *pscreen) sscreen->aux_context->destroy(sscreen->aux_context); } + simple_mtx_destroy(&sscreen->async_compute_context_lock); + if (sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + } + util_queue_destroy(&sscreen->shader_compiler_queue); util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); @@ -887,6 +949,9 @@ static void si_destroy_screen(struct pipe_screen *pscreen) disk_cache_destroy(sscreen->disk_shader_cache); util_live_shader_cache_deinit(&sscreen->live_shader_cache); + util_idalloc_mt_fini(&sscreen->buffer_ids); + util_vertex_state_cache_deinit(&sscreen->vertex_state_cache); + sscreen->ws->destroy(sscreen->ws); FREE(sscreen); } @@ -1017,22 +1082,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->options.enable_sam, sscreen->options.disable_sam); - /* Older LLVM have buggy v_pk_* instructions. */ - if (!sscreen->info.has_packed_math_16bit || LLVM_VERSION_MAJOR < 11) - sscreen->options.fp16 = false; - - if (sscreen->info.chip_class == GFX10_3 && LLVM_VERSION_MAJOR < 11) { - fprintf(stderr, "radeonsi: GFX 10.3 requires LLVM 11 or higher\n"); - FREE(sscreen); - return NULL; - } - - if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { - fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); - FREE(sscreen); - return NULL; - } - if (sscreen->info.chip_class >= GFX9) { sscreen->se_tile_repeat = 32 * sscreen->info.max_se; } else { @@ -1054,6 +1103,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, return NULL; } + util_idalloc_mt_init_tc(&sscreen->buffer_ids); /* Set functions first. */ sscreen->b.context_create = si_pipe_create_context; @@ -1072,8 +1122,12 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, /* Set these flags in debug_flags early, so that the shader cache takes * them into account. + * + * Enable FS_CORRECT_DERIVS_AFTER_KILL by default if LLVM is >= 13. This makes + * nir_opt_move_discards_to_top more effective. */ - if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard")) + if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard") || + LLVM_VERSION_MAJOR >= 13) sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); if (sscreen->debug_flags & DBG(INFO)) @@ -1093,6 +1147,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain); (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); si_init_gs_info(sscreen); @@ -1107,7 +1162,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, si_disk_cache_create(sscreen); /* Determine the number of shader compiler threads. 
*/ - hw_threads = sysconf(_SC_NPROCESSORS_ONLN); + const struct util_cpu_caps_t *caps = util_get_cpu_caps(); + hw_threads = caps->nr_cpus; if (hw_threads >= 12) { num_comp_hi_threads = hw_threads * 3 / 4; @@ -1131,7 +1187,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!util_queue_init( &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { + UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen); glsl_type_singleton_decref(); @@ -1141,7 +1197,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64, num_comp_lo_threads, UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { + UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY, NULL)) { si_destroy_shader_cache(sscreen); FREE(sscreen); glsl_type_singleton_decref(); @@ -1151,11 +1207,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; + sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && @@ -1221,12 +1273,14 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add") || driQueryOptionb(config->options, "allow_draw_out_of_order"); + sscreen->allow_draw_out_of_order = driQueryOptionb(config->options, "allow_draw_out_of_order"); sscreen->use_ngg = !(sscreen->debug_flags & DBG(NO_NGG)) && sscreen->info.chip_class >= GFX10 && (sscreen->info.family != CHIP_NAVI14 || sscreen->info.is_pro_graphics); sscreen->use_ngg_culling = sscreen->use_ngg && + sscreen->info.max_render_backends >= 2 && !((sscreen->debug_flags & DBG(NO_NGG_CULLING)) || LLVM_VERSION_MAJOR <= 11 /* hangs on 11, see #4874 */); sscreen->use_ngg_streamout = false; @@ -1239,30 +1293,19 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true; } - /* Only enable primitive binning on APUs by default. */ - if (sscreen->info.chip_class >= GFX10) { - sscreen->dpbb_allowed = true; - /* DFSM is not supported on GFX 10.3 and not beneficial on Navi1x. */ - } else if (sscreen->info.chip_class == GFX9) { - sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram; - /* DFSM reduces the Raven2 draw prim rate by ~43%. Disable it. */ - sscreen->dfsm_allowed = false; - } - - /* Process DPBB enable flags. */ - if (sscreen->debug_flags & DBG(DPBB)) { - sscreen->dpbb_allowed = true; - if (sscreen->debug_flags & DBG(DFSM)) - sscreen->dfsm_allowed = true; - } + /* DCC stores have 50% performance of uncompressed stores and sometimes + * even less than that. It's risky to enable on dGPUs. 
+ */ + sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) && + ((sscreen->info.chip_class >= GFX10_3 && + !sscreen->info.has_dedicated_vram) || + sscreen->debug_flags & DBG(DCC_STORE)); - /* Process DPBB disable flags. */ - if (sscreen->debug_flags & DBG(NO_DPBB)) { - sscreen->dpbb_allowed = false; - sscreen->dfsm_allowed = false; - } else if (sscreen->debug_flags & DBG(NO_DFSM)) { - sscreen->dfsm_allowed = false; - } + sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) && + (sscreen->info.chip_class >= GFX10 || + /* Only enable primitive binning on gfx9 APUs by default. */ + (sscreen->info.chip_class == GFX9 && !sscreen->info.has_dedicated_vram) || + sscreen->debug_flags & DBG(DPBB)); if (sscreen->dpbb_allowed) { if (sscreen->info.has_dedicated_vram) { @@ -1289,11 +1332,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->pbb_persistent_states_per_bin <= 32); } - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. - */ - sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; - (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; @@ -1331,6 +1369,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } } + sscreen->ngg_subgroup_size = 128; sscreen->ge_wave_size = 64; sscreen->ps_wave_size = 64; sscreen->compute_wave_size = 64; @@ -1406,6 +1445,9 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf drmVersionPtr version = drmGetVersion(fd); struct radeon_winsys *rw = NULL; + driParseConfigFiles(config->options, config->options_info, 0, "radeonsi", + NULL, NULL, NULL, 0, NULL, 0); + switch (version->version_major) { case 2: rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h index c9f64a144..2408346c3 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pipe.h @@ -31,6 +31,7 @@ #include "util/u_idalloc.h" #include "util/u_suballoc.h" #include "util/u_threaded_context.h" +#include "util/u_vertex_state_cache.h" #include "ac_sqtt.h" #ifdef __cplusplus @@ -44,7 +45,6 @@ extern "C" { #endif #define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -55,7 +55,7 @@ extern "C" { #define SI_DRAW_ID_UNKNOWN ((unsigned)INT_MIN) #define SI_RESTART_INDEX_UNKNOWN ((unsigned)INT_MIN) #define SI_INSTANCE_COUNT_UNKNOWN ((unsigned)INT_MIN) -#define SI_NUM_SMOOTH_AA_SAMPLES 8 +#define SI_NUM_SMOOTH_AA_SAMPLES 4 #define SI_MAX_POINT_SIZE 2048 #define SI_GS_PER_ES 128 /* Alignment for optimal CP DMA performance. */ @@ -64,7 +64,8 @@ extern "C" { /* Tunables for compute-based clear_buffer and copy_buffer: */ #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 #define SI_COMPUTE_COPY_DW_PER_THREAD 4 -#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM +/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */ +#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU /* Pipeline & streamout query controls. 
*/ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) @@ -137,6 +138,7 @@ extern "C" { (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) #define SI_RESOURCE_FLAG_UNCACHED (PIPE_RESOURCE_FLAG_DRV_PRIV << 12) #define SI_RESOURCE_FLAG_DRIVER_INTERNAL (PIPE_RESOURCE_FLAG_DRV_PRIV << 13) +#define SI_RESOURCE_AUX_PLANE (PIPE_RESOURCE_FLAG_DRV_PRIV << 14) enum si_has_gs { GS_OFF, @@ -153,11 +155,6 @@ enum si_has_ngg { NGG_ON, }; -enum si_has_prim_discard_cs { - PRIM_DISCARD_CS_OFF, - PRIM_DISCARD_CS_ON, -}; - enum si_clear_code { DCC_CLEAR_COLOR_0000 = 0x00000000, @@ -168,9 +165,8 @@ enum si_clear_code DCC_UNCOMPRESSED = 0xFFFFFFFF, }; -#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) -#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) -#define SI_IMAGE_ACCESS_DCC_WRITE (1 << 9) +#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) +#define SI_IMAGE_ACCESS_ALLOW_DCC_STORE (1 << 9) /* Debug flags. */ enum @@ -208,12 +204,14 @@ enum DBG_COMPUTE, DBG_VM, DBG_CACHE_STATS, + DBG_IB, /* Driver options: */ DBG_NO_WC, DBG_CHECK_VM, DBG_RESERVE_VMID, DBG_SHADOW_REGS, + DBG_NO_FAST_DISPLAY_LIST, /* 3D engine options: */ DBG_NO_GFX, @@ -221,16 +219,10 @@ enum DBG_ALWAYS_NGG_CULLING_ALL, DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, - DBG_NO_FAST_LAUNCH, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, - DBG_NO_DFSM, DBG_DPBB, - DBG_DFSM, DBG_NO_HYPERZ, DBG_NO_2D_TILING, DBG_NO_TILING, @@ -238,9 +230,11 @@ enum DBG_NO_DISPLAY_DCC, DBG_NO_DCC, DBG_NO_DCC_CLEAR, - DBG_NO_DCC_FB, + DBG_NO_DCC_STORE, + DBG_DCC_STORE, DBG_NO_DCC_MSAA, DBG_NO_FMASK, + DBG_NO_DMA, DBG_TMZ, DBG_SQTT, @@ -293,16 +287,14 @@ struct si_resource { struct pb_buffer *buf; uint64_t gpu_address; /* Memory usage if the buffer placement is optimal. */ - uint32_t vram_usage_kb; - uint32_t gart_usage_kb; + uint32_t memory_usage_kb; /* Resource properties. */ uint64_t bo_size; - unsigned bo_alignment; - enum radeon_bo_domain domains; - enum radeon_bo_flag flags; + uint8_t bo_alignment_log2; + enum radeon_bo_domain domains:8; + enum radeon_bo_flag flags:16; unsigned bind_history; - int max_forced_staging_uploads; /* The buffer range which is initialized (with a write transfer, * streamout, DMA, or as a random access target). The rest of @@ -331,13 +323,12 @@ struct si_resource { bool image_handle_allocated; /* Whether the resource has been exported via resource_get_handle. */ - unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ + uint8_t external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct si_transfer { struct threaded_transfer b; struct si_resource *staging; - unsigned offset; }; struct si_texture { @@ -368,7 +359,8 @@ struct si_texture { /* Depth buffer compression and fast clear. 
*/ float depth_clear_value[RADEON_SURF_MAX_LEVELS]; uint8_t stencil_clear_value[RADEON_SURF_MAX_LEVELS]; - uint16_t depth_cleared_level_mask; /* if it was cleared at least once */ + uint16_t depth_cleared_level_mask_once; /* if it was cleared at least once */ + uint16_t depth_cleared_level_mask; /* track if it's cleared (can be false negative) */ uint16_t stencil_cleared_level_mask; /* if it was cleared at least once */ uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ @@ -382,40 +374,36 @@ struct si_texture { bool db_compatible : 1; bool can_sample_z : 1; bool can_sample_s : 1; + bool need_flush_after_depth_decompression: 1; /* We need to track DCC dirtiness, because st/dri usually calls * flush_resource twice per frame (not a bug) and we don't wanna - * decompress DCC twice. Also, the dirty tracking must be done even - * if DCC isn't used, because it's required by the DCC usage analysis - * for a possible future enablement. + * decompress DCC twice. */ - bool separate_dcc_dirty : 1; bool displayable_dcc_dirty : 1; - /* Statistics gathering for the DCC enablement heuristic. */ - bool dcc_gather_statistics : 1; /* Counter that should be non-zero if the texture is bound to a * framebuffer. */ unsigned framebuffers_bound; - /* Whether the texture is a displayable back buffer and needs DCC - * decompression, which is expensive. Therefore, it's enabled only - * if statistics suggest that it will pay off and it's allocated - * separately. It can't be bound as a sampler by apps. Limited to - * target == 2D and last_level == 0. If enabled, dcc_offset contains - * the absolute GPUVM address, not the relative one. - */ - struct si_resource *dcc_separate_buffer; - /* When DCC is temporarily disabled, the separate buffer is here. */ - struct si_resource *last_dcc_separate_buffer; - /* Estimate of how much this color buffer is written to in units of - * full-screen draws: ps_invocations / (width * height) - * Shader kills, late Z, and blending with trivial discards make it - * inaccurate (we need to count CB updates, not PS invocations). - */ - unsigned ps_draw_ratio; - /* The number of clears since the last DCC usage analysis. */ - unsigned num_slow_clears; +}; + +/* State trackers create separate textures in a next-chain for extra planes + * even if those are planes created purely for modifiers. Because the linking + * of the chain happens outside of the driver, and NULL is interpreted as + * failure, let's create some dummy texture structs. We could use these + * later to use the offsets for linking if we really wanted to. + * + * For now just create a dummy struct and completely ignore it. + * + * Potentially in the future we could store stride/offset and use it during + * creation, though we might want to change how linking is done first. 
+ */ +struct si_auxiliary_texture { + struct threaded_resource b; + struct pb_buffer *buffer; + uint32_t offset; + uint32_t stride; }; struct si_surface { @@ -533,7 +521,7 @@ struct si_screen { unsigned width, unsigned height, unsigned depth, uint32_t *state, uint32_t *fmask_state); - unsigned num_vbos_in_user_sgprs; + unsigned max_memory_usage_kb; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; @@ -551,13 +539,13 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool commutative_blend_add; + bool allow_draw_out_of_order; bool dpbb_allowed; - bool dfsm_allowed; - bool llvm_has_working_vgpr_indexing; bool use_ngg; bool use_ngg_culling; bool use_ngg_streamout; bool allow_dcc_msaa_clear_to_reg_for_bpp[5]; /* indexed by log2(Bpp) */ + bool always_allow_dcc_stores; struct { #define OPT_BOOL(name, dflt, description) bool name : 1; @@ -578,6 +566,10 @@ struct si_screen { struct pipe_context *aux_context; simple_mtx_t aux_context_lock; + /* Async compute context for DRI_PRIME copies. */ + struct pipe_context *async_compute_context; + simple_mtx_t async_compute_context_lock; + /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -671,6 +663,10 @@ struct si_screen { unsigned compute_wave_size; unsigned ps_wave_size; unsigned ge_wave_size; + unsigned ngg_subgroup_size; + + struct util_idalloc_mt buffer_ids; + struct util_vertex_state_cache vertex_state_cache; }; struct si_sampler_view { @@ -809,6 +805,8 @@ struct si_streamout { struct si_shader_ctx_state { struct si_shader_selector *cso; struct si_shader *current; + /* The shader variant key representing the current state. */ + struct si_shader_key key; }; #define SI_NUM_VGT_PARAM_KEY_BITS 12 @@ -846,35 +844,6 @@ union si_vgt_param_key { uint16_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. 
- */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_texture_handle { unsigned desc_slot; bool desc_dirty; @@ -897,7 +866,6 @@ struct si_saved_cs { unsigned trace_id; unsigned gfx_last_dw; - unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -907,11 +875,24 @@ struct si_small_prim_cull_info { float small_prim_precision; }; +struct si_vertex_state { + struct pipe_vertex_state b; + struct si_vertex_elements velems; + uint32_t descriptors[4 * SI_MAX_ATTRIBS]; +}; + typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe, const struct pipe_draw_info *info, + unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draws, + const struct pipe_draw_start_count_bias *draws, unsigned num_draws); +typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); struct si_context { struct pipe_context b; /* base class */ @@ -922,6 +903,7 @@ struct si_context { struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */ + struct radeon_cmdbuf *sdma_cs; struct pipe_fence_handle *last_gfx_fence; struct si_resource *eop_bug_scratch; struct si_resource *eop_bug_scratch_tmz; @@ -962,7 +944,7 @@ struct si_context { void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; void *cs_dcc_decompress; - void *cs_dcc_retile; + void *cs_dcc_retile[32]; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; struct pipe_debug_callback debug; @@ -990,33 +972,11 @@ struct si_context { unsigned last_num_draw_calls; unsigned flags; /* flush flags */ /* Current unaccounted memory usage. */ - uint32_t vram_kb; - uint32_t gtt_kb; + uint32_t memory_usage_kb; - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; + /* NGG streamout. */ struct pb_buffer *gds; struct pb_buffer *gds_oa; - struct radeon_cmdbuf prim_discard_compute_cs; - unsigned compute_gds_offset; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - bool preserve_prim_restart_gds_at_flush; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; /* Atoms (direct states). 
*/ union si_state_atoms atoms; @@ -1065,28 +1025,27 @@ struct si_context { /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */ struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; - struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ + uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; - unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; - bool flatshade; + bool uses_nontrivial_vs_prolog; + bool force_trivial_vs_prolog; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; bool vs_uses_base_instance; bool vs_uses_draw_id; + uint8_t patch_vertices; /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; unsigned descriptors_dirty; unsigned shader_pointers_dirty; unsigned shader_needs_decompress_mask; - unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS]; struct si_buffer_resources internal_bindings; struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; @@ -1141,11 +1100,7 @@ struct si_context { bool allow_flat_shading : 1; /* Emitted draw state. */ - bool gs_tri_strip_adj_fix : 1; - bool ls_vgpr_fix : 1; - bool prim_discard_cs_instancing : 1; bool ngg : 1; - bool same_patch_vertices : 1; uint8_t ngg_culling; unsigned last_index_size; int last_base_vertex; @@ -1256,9 +1211,6 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ unsigned context_roll; /* Queries. */ @@ -1281,25 +1233,6 @@ struct si_context { bool force_cb_shader_coherent; - /* Statistics gathering for the DCC enablement heuristic. It can't be - * in si_texture because si_texture can be shared by multiple - * contexts. This is for back buffers only. We shouldn't get too many - * of those. - * - * X11 DRI3 rotates among a finite set of back buffers. They should - * all fit in this array. If they don't, separate DCC might never be - * enabled by DCC stat gathering. - */ - struct { - struct si_texture *tex; - /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ - struct pipe_query *ps_stats[3]; - /* If all slots are used and another slot is needed, - * the least recently used slot is evicted based on this. 
*/ - int64_t last_use_timestamp; - bool query_active; - } dcc_stats[5]; - struct si_tracked_regs tracked_regs; /* Resources that need to be flushed, but will not get an explicit @@ -1308,7 +1241,12 @@ struct si_context { */ struct hash_table *dirty_implicit_resources; - pipe_draw_vbo_func draw_vbo[NUM_GFX_VERSIONS - GFX6][2][2][2][2]; + pipe_draw_vbo_func draw_vbo[2][2][2]; + pipe_draw_vertex_state_func draw_vertex_state[2][2][2]; + /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ + pipe_draw_vbo_func real_draw_vbo; + pipe_draw_vertex_state_func real_draw_vertex_state; + void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ struct ac_thread_trace_data *thread_trace; @@ -1346,6 +1284,9 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex); void si_flush_implicit_resources(struct si_context *sctx); +/* si_nir_optim.c */ +bool si_nir_is_output_const_if_tex_is_const(nir_shader *shader, float *in, float *out, int *texunit); + /* si_buffer.c */ bool si_cs_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, enum radeon_bo_usage usage); @@ -1359,7 +1300,8 @@ struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, uns struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, unsigned usage, unsigned size, unsigned alignment); void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src); + struct pipe_resource *src, unsigned num_rebinds, + uint32_t rebind_mask, uint32_t delete_buffer_id); void si_init_screen_buffer_functions(struct si_screen *sscreen); void si_init_buffer_functions(struct si_context *sctx); @@ -1474,6 +1416,7 @@ void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); +void si_print_current_ib(struct si_context *sctx, FILE *f); /* si_fence.c */ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, @@ -1491,16 +1434,23 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, /* si_get.c */ void si_init_screen_get_functions(struct si_screen *sscreen); +bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src); + /* si_gfx_cs.c */ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); -void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws); +void si_trace_emit(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); +/* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement + * optimizations without affecting the normal draw_vbo functions perf. 
+ */ +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); @@ -1511,33 +1461,9 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); -/* si_compute_prim_discard.c */ -enum si_prim_discard_outcome -{ - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, - SI_PRIM_DISCARD_MULTI_DRAW_SPLIT, -}; - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draws, - unsigned num_draws, bool primitive_restart, - unsigned total_count); -void si_compute_signal_gfx(struct si_context *sctx); -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned count, unsigned index_size, - unsigned base_vertex, uint64_t input_indexbuf_va, - unsigned input_indexbuf_max_elements); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); - /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); +void si_init_aux_async_compute_ctx(struct si_screen *sscreen); /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); @@ -1587,6 +1513,10 @@ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl); +struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe, + const struct pipe_video_buffer *tmpl, + const uint64_t *modifiers, + unsigned int modifiers_count); /* si_viewport.c */ void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out); @@ -1613,10 +1543,6 @@ struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, const struct pipe_surface *templ, unsigned width0, unsigned height0, unsigned width, unsigned height); unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap); -void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex); -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex); bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex); void si_init_screen_texture_functions(struct si_screen *sscreen); void si_init_context_texture_functions(struct si_context *sctx); @@ -1647,6 +1573,9 @@ bool si_init_thread_trace(struct si_context *sctx); void si_destroy_thread_trace(struct si_context *sctx); void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs); +/* si_state_shaders.c */ +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key); + /* * common helpers */ @@ -1698,15 +1627,14 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx, * Also reserve space for stopping queries 
at the end of IB, because * the number of active queries is unlimited in theory. */ - return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 9; + return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10; } static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) { if (r) { /* Add memory usage for need_gfx_cs_space */ - sctx->vram_kb += si_resource(r)->vram_usage_kb; - sctx->gtt_kb += si_resource(r)->gart_usage_kb; + sctx->memory_usage_kb += si_resource(r)->memory_usage_kb; } } @@ -1866,7 +1794,19 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil)) return false; - return tex->is_depth && tex->surface.meta_offset && level < tex->surface.num_meta_levels; + if (!tex->is_depth || !tex->surface.meta_offset) + return false; + + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + if (sscreen->info.chip_class >= GFX8) { + return level < tex->surface.num_meta_levels; + } else { + /* GFX6-7 don't have TC-compatible HTILE, which means they have to run + * a decompression pass for every mipmap level before texturing, so compress + * only one level to reduce the number of decompression passes to a minimum. + */ + return level == 0; + } } static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, @@ -1908,6 +1848,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx) ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_TRIANGLE_MODES \ + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \ + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \ + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \ + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)) + static inline bool util_prim_is_lines(unsigned prim) { return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; @@ -1920,11 +1866,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim) static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & - ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0; +} + +static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim) +{ + return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0; } /** @@ -1935,17 +1882,27 @@ static inline bool util_rast_prim_is_triangles(unsigned prim) * \param gtt GTT memory size not added to the buffer list yet */ static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs, - uint32_t vram_kb, uint32_t gtt_kb) + uint32_t kb) +{ + return kb + cs->used_vram_kb + cs->used_gart_kb < screen->max_memory_usage_kb; +} + +static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws) { - vram_kb += cs->used_vram_kb; - gtt_kb += cs->used_gart_kb; + struct radeon_cmdbuf *cs = &ctx->gfx_cs; + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_buffer) and one counter in the pipe + * driver for those that haven't been added 
yet. + */ + uint32_t kb = ctx->memory_usage_kb; + ctx->memory_usage_kb = 0; - /* Anything that goes above the VRAM size should go to GTT. */ - if (vram_kb > screen->info.vram_size_kb) - gtt_kb += vram_kb - screen->info.vram_size_kb; + if (radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, kb) && + ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws), false)) + return; - /* Now we just need to check if we have enough GTT (the limit is 75% of max). */ - return gtt_kb < screen->info.gart_size_kb / 4 * 3; + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } /** @@ -1989,30 +1946,20 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc bool check_mem) { if (check_mem && - !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->vram_kb + bo->vram_usage_kb, - sctx->gtt_kb + bo->gart_usage_kb)) + !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->memory_usage_kb + bo->memory_usage_kb)) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority); } -static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) -{ - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; -} - static inline unsigned si_get_wave_size(struct si_screen *sscreen, - gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch, bool prim_discard_cs) + gl_shader_stage stage, bool ngg, bool es) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; else if (stage == MESA_SHADER_FRAGMENT) return sscreen->ps_wave_size; - else if (gs_fast_launch) - return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ - else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (stage == MESA_SHADER_VERTEX && es && !ngg) || + else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ return 64; @@ -2024,19 +1971,30 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, shader->key.as_ngg, - shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader->key.opt.vs_as_prim_discard_cs); + shader->key.as_es); } static inline void si_select_draw_vbo(struct si_context *sctx) { - sctx->b.draw_vbo = sctx->draw_vbo[sctx->chip_class - GFX6] - [!!sctx->shader.tes.cso] - [!!sctx->shader.gs.cso] - [sctx->ngg] - [si_compute_prim_discard_enabled(sctx)]; - assert(sctx->b.draw_vbo); + pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; + pipe_draw_vertex_state_func draw_vertex_state = + sctx->draw_vertex_state[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; + assert(draw_vbo); + assert(draw_vertex_state); + + if (unlikely(sctx->real_draw_vbo)) { + assert(sctx->real_draw_vertex_state); + sctx->real_draw_vbo = draw_vbo; + sctx->real_draw_vertex_state = draw_vertex_state; + } else { + assert(!sctx->real_draw_vertex_state); + sctx->b.draw_vbo = draw_vbo; + sctx->b.draw_vertex_state = draw_vertex_state; + } } /* Return the number of samples that the rasterizer uses. 
*/ @@ -2053,6 +2011,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) return 1; } +static unsigned ALWAYS_INLINE +si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class) +{ + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + return chip_class >= GFX9 ? 5 : 1; +} + +static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen) +{ + return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c index 22b6e3ad5..ae4affa1b 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.c @@ -117,13 +117,13 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (state->shader) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo, + if (state->is_shader) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } radeon_begin(cs); - radeon_emit_array(cs, state->pm4, state->ndw); + radeon_emit_array(state->pm4, state->ndw); radeon_end(); if (state->atom.emit) @@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs) for (unsigned i = 0; i < SI_NUM_STATES; i++) { struct si_pm4_state *state = sctx->emitted.array[i]; - if (state && state->shader) { + if (state && state->is_shader) { sctx->emitted.array[i] = NULL; sctx->dirty_states |= 1 << i; } diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h index 06909ff1a..03f79e0ba 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_pm4.h @@ -54,7 +54,7 @@ struct si_pm4_state { uint32_t pm4[SI_PM4_MAX_DW]; /* For shader states only */ - struct si_shader *shader; + bool is_shader; struct si_atom atom; }; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c index 121feb6fb..546f9da11 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c @@ -218,10 +218,10 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) } /* Compile a variable block size using the maximum variable size. 
*/ - if (shader->selector->info.base.cs.local_size_variable) + if (shader->selector->info.base.workgroup_size_variable) return SI_MAX_VARIABLE_THREADS_PER_BLOCK; - uint16_t *local_size = shader->selector->info.base.cs.local_size; + uint16_t *local_size = shader->selector->info.base.workgroup_size; unsigned max_work_group_size = (uint32_t)local_size[0] * (uint32_t)local_size[1] * (uint32_t)local_size[2]; @@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VGPRs */ declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - } break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -553,11 +547,11 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); if (ctx->stage == MESA_SHADER_VERTEX) { declare_vs_input_vgprs(ctx, &num_prolog_vgprs); @@ -658,7 +652,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) SI_PARAM_LINEAR_CENTER); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], @@ -793,9 +787,6 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; sym->size = shader->gs_info.esgs_ring_size * 4; @@ -835,7 +826,9 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh { struct ac_rtld_binary rtld; si_shader_binary_open(screen, shader, &rtld); - return rtld.exec_size; + uint64_t size = rtld.exec_size; + ac_rtld_close(&rtld); + return size; } static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) @@ -865,8 +858,8 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create( &sscreen->b, - (sscreen->info.cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL, + (sscreen->info.cpdma_prefetch_writes_memory ? 
0 : SI_RESOURCE_FLAG_READ_ONLY) | + SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); if (!shader->bo) return false; @@ -1071,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader) return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; else if (shader->key.as_ngg) return "Vertex Shader as ESGS"; else @@ -1153,8 +1144,6 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, prolog->instance_divisor_is_fetched); - fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix, - prolog->unpack_instance_id_from_vertex_id); fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); @@ -1186,17 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " as_ngg = %u\n", key->as_ngg); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing); - fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation); - fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); - fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z); - fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space); break; case MESA_SHADER_TESS_CTRL: @@ -1297,8 +1275,8 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel, /* VGPR initialization fixup for Vega10 and Raven is always done in the * VS prolog. 
*/ return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || - prolog_key->unpack_instance_id_from_vertex_id || - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + /* The 2nd VS prolog loads input VGPRs from LDS */ + (key->opt.ngg_culling && !ngg_cull_shader); } /** @@ -1323,16 +1301,9 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.as_ls = shader_out->key.as_ls; key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; - - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - key->vs_prolog.gs_fast_launch_index_size_packed = - SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); - } + + if (!ngg_cull_shader && shader_out->key.opt.ngg_culling) + key->vs_prolog.load_vgprs_after_culling = 1; if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; @@ -1346,8 +1317,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ /* Only one of these combinations can be set. as_ngg can be set with as_es. */ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= - 1); + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); /* Enable loading the InstanceID VGPR. */ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); @@ -1453,8 +1423,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi si_dump_streamout(&sel->so); } - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); + /* Initialize vs_output_ps_input_cntl to default. */ + for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) + shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; + shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; shader->info.uses_instanceid = sel->info.uses_instanceid; @@ -1465,9 +1437,44 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; - /* Validate SGPR and VGPR usage for compute to detect compiler bugs. - * LLVM 3.9svn has this bug. - */ + /* Compute vs_output_ps_input_cntl. */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.as_ls && !shader->key.as_es) { + ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) + vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + + /* VS and TES should also set primitive ID output if it's used. */ + unsigned num_outputs_with_prim_id = sel->info.num_outputs + + shader->key.mono.u.vs_export_prim_id; + + for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned offset = vs_output_param_offset[i]; + unsigned ps_input_cntl; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. 
*/ + ps_input_cntl = S_028644_OFFSET(offset); + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; + } + } + + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ if (sel->info.stage == MESA_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; unsigned max_vgprs = @@ -1559,11 +1566,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, shader.key.as_ls = key->vs_prolog.as_ls; shader.key.as_es = key->vs_prolog.as_es; shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.ngg_culling = - (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; break; case MESA_SHADER_TESS_CTRL: assert(!prolog); @@ -1586,9 +1588,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, struct si_shader_context ctx; si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, stage, - shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader.key.opt.vs_as_prim_discard_cs)); + shader.key.as_ngg, shader.key.as_es)); ctx.shader = &shader; ctx.stage = stage; @@ -2026,8 +2026,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); + memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, + sizeof(mainp->info.vs_output_ps_input_cntl)); shader->info.uses_instanceid = mainp->info.uses_instanceid; shader->info.nr_pos_exports = mainp->info.nr_pos_exports; shader->info.nr_param_exports = mainp->info.nr_param_exports; @@ -2115,9 +2115,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || (sel->info.stage == MESA_SHADER_VERTEX && /* Used to export PrimitiveID from the correct vertex. */ - (shader->key.mono.u.vs_export_prim_id || - /* Used to generate triangle strip vertex IDs for all threads. */ - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))); + shader->key.mono.u.vs_export_prim_id)); shader->uses_vs_state_outprim = sscreen->use_ngg && /* Only used by streamout in vertex shaders. 
*/ diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h index ab11a1852..d6dbb13ed 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h @@ -138,6 +138,7 @@ #include "util/u_inlines.h" #include "util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "si_pm4.h" #include <stdio.h> @@ -158,6 +159,12 @@ struct si_context; #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) +#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) +#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 +/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ +#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 + /* SGPR user data indices */ enum { @@ -272,14 +279,10 @@ enum SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ -#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -323,6 +326,16 @@ enum si_color_output_type { SI_TYPE_UINT16, }; +union si_input_info { + struct { + ubyte semantic; + ubyte interpolate; + ubyte fp16_lo_hi_valid; + ubyte usage_mask; + }; + uint32_t _unused; /* this just forces 4-byte alignment */ +}; + struct si_shader_info { shader_info base; @@ -330,12 +343,8 @@ struct si_shader_info { ubyte num_inputs; ubyte num_outputs; - ubyte input_semantic[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS]; + union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; @@ -402,6 +411,13 @@ struct si_shader_info { * fragment shader invocations if flat shading. */ bool allow_flat_shading; + + /* Optimization: if the texture bound to this texunit has been cleared to 1, + * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the + * value is 0xff (undetermined) and can be later changed to 0 (= false) or + * texunit + 1. 
+ */ + uint8_t writes_1_if_tex_is_1; }; /* A shader selector is a gallium CSO and contains shader variants and @@ -439,7 +455,6 @@ struct si_shader_selector { ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; - bool prim_discard_cs_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -447,7 +462,6 @@ struct si_shader_selector { ubyte cs_num_images_in_user_sgprs; ubyte num_vs_inputs; ubyte num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ ubyte clipdist_mask; ubyte culldist_mask; @@ -521,7 +535,6 @@ struct si_vs_prolog_bits { uint16_t instance_divisor_is_one; /* bitmask of inputs */ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ unsigned ls_vgpr_fix : 1; - unsigned unpack_instance_id_from_vertex_id : 1; }; /* Common TCS bits between the shader key and the epilog key. */ @@ -571,10 +584,7 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned as_prim_discard_cs : 1; - unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ - unsigned gs_fast_launch_index_size_packed : 2; + unsigned load_vgprs_after_culling : 1; /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic : 1; } vs_prolog; @@ -633,9 +643,10 @@ struct si_shader_key { /* These three are initially set according to the NEXT_SHADER property, * or guessed if the property doesn't seem correct. */ - unsigned as_es : 1; /* export shader, which precedes GS */ - unsigned as_ls : 1; /* local shader, which precedes TCS */ - unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */ + unsigned as_es : 1; /* whether it's a shader before GS */ + unsigned as_ls : 1; /* whether it's VS before TCS */ + unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled, + also set for the stage right before GS */ /* Flags for monolithic compilation only. */ struct { @@ -666,7 +677,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * @@ -676,19 +687,6 @@ struct si_shader_key { */ unsigned prefer_mono : 1; - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs : 1; - unsigned cs_prim_type : 4; - unsigned cs_indexed : 1; - unsigned cs_instancing : 1; - unsigned cs_primitive_restart : 1; - unsigned cs_provoking_vertex_first : 1; - unsigned cs_need_correct_orientation : 1; - unsigned cs_cull_front : 1; - unsigned cs_cull_back : 1; - unsigned cs_cull_z : 1; - unsigned cs_halfz_clip_space : 1; - /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; @@ -707,6 +705,7 @@ struct si_shader_key { /* GCN-specific shader info. */ struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; signed char face_vgpr_index; @@ -736,7 +735,35 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; +#define SI_NUM_VGT_STAGES_KEY_BITS 5 +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) + +/* The VGT_SHADER_STAGES key used to index the table of precomputed values. 
+ * Some fields are set by state-change calls, most are set by draw_vbo. + */ +union si_vgt_stages_key { + struct { +#if UTIL_ARCH_LITTLE_ENDIAN + uint8_t tess : 1; + uint8_t gs : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg : 1; /* gfx10+ */ + uint8_t streamout : 1; /* only used with NGG */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; +#else /* UTIL_ARCH_BIG_ENDIAN */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; + uint8_t streamout : 1; + uint8_t ngg : 1; + uint8_t ngg_passthrough : 1; + uint8_t gs : 1; + uint8_t tess : 1; +#endif + } u; + uint8_t index; +}; + struct si_shader { + struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; @@ -748,7 +775,6 @@ struct si_shader { struct si_shader_part *prolog2; struct si_shader_part *epilog; - struct si_pm4_state *pm4; struct si_resource *bo; struct si_resource *scratch_bo; struct si_shader_key key; @@ -803,6 +829,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -819,6 +847,9 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; + union si_vgt_stages_key vgt_stages; } ngg; struct { @@ -839,6 +870,7 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned num_interp; } ps; } ctx_reg; @@ -884,17 +916,18 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); void si_nir_late_opts(nir_shader *nir); -void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); +char *si_finalize_nir(struct pipe_screen *screen, void *nirptr); /* si_state_shaders.c */ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, struct gfx9_gs_info *out); +bool gfx10_is_ngg_passthrough(struct si_shader *shader); /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) + const struct si_shader_key *key) { if (key->as_ls) return &sel->main_shader_part_ls; @@ -907,15 +940,6 @@ static inline struct si_shader **si_get_main_shader_part(struct si_shader_select return &sel->main_shader_part; } -static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader) -{ - struct si_shader_selector *sel = shader->selector; - - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); -} - static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { return selector ? selector->info.uses_bindless_samplers : false; @@ -926,6 +950,22 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel return selector ? 
selector->info.uses_bindless_images : false; } +static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) +{ + if (shader->selector->info.stage == MESA_SHADER_VERTEX && + !shader->selector->info.base.vs.blit_sgprs_amd && + !(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES)) + return true; + + return false; +} + +static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) +{ + return gfx10_edgeflags_have_effect(shader) && + shader->selector->info.writes_edgeflag; +} + #ifdef __cplusplus } #endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h index 46d8e69b9..3970125f5 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -30,8 +30,6 @@ struct pipe_debug_callback; -#define RADEON_LLVM_MAX_INPUTS 32 * 4 - /* Ideally pass the sample mask input to the PS epilog as v14, which * is its usual location, so that the shader doesn't have to add v_mov. */ @@ -60,8 +58,6 @@ struct si_shader_context { struct ac_shader_args args; struct ac_shader_abi abi; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - LLVMBasicBlockRef merged_wrap_if_entry_block; int merged_wrap_if_label; @@ -134,10 +130,6 @@ struct si_shader_context { /* API TES */ struct ac_arg tes_offchip_addr; - /* API GS */ - struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ /* PS */ struct ac_arg pos_fixed_pt; /* CS */ @@ -194,9 +186,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader); void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi); void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); @@ -242,7 +233,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi); void si_preload_esgs_ring(struct si_shader_context *ctx); void si_preload_gs_rings(struct si_shader_context *ctx); void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); @@ -250,7 +241,7 @@ void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi); void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context 
*ctx); void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); @@ -266,7 +257,6 @@ void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ -void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, LLVMValueRef const *so_write_offsets, struct pipe_stream_output *stream_out, @@ -275,7 +265,7 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_outp unsigned noutput, unsigned stream); void si_llvm_build_vs_exports(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput); -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi); void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c index 8420162ca..1a1dd07a5 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_nir_to_llvm.h" #include "ac_rtld.h" #include "si_pipe.h" @@ -93,9 +94,7 @@ bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, if (!si_replace_shader(count, binary)) { struct ac_compiler_passes *passes = compiler->passes; - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) + if (less_optimized && compiler->low_opt_passes) passes = compiler->low_opt_passes; struct si_llvm_diagnostics diag = {debug}; @@ -190,6 +189,7 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy } ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + ac_llvm_set_target_features(ctx->main_fn, &ctx->ac); } void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader) @@ -220,7 +220,7 @@ void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shade if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + if (USE_LDS_SYMBOLS) { /* The LSHS size is not known until draw time, so we append it * at the end of whatever LDS use there may be in the rest of * the shader (currently none, unless LLVM decides to do its @@ -412,7 +412,7 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - assert(ctx->shader->selector->info.base.cs.local_size_variable && + assert(ctx->shader->selector->info.base.workgroup_size_variable && ctx->shader->selector->info.uses_variable_block_size); LLVMValueRef chan[3] = { @@ -442,9 +442,7 @@ static void si_llvm_declare_compute_memory(struct si_shader_context *ctx) static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + if (nir->info.stage == MESA_SHADER_FRAGMENT) { unsigned colors_read = ctx->shader->selector->info.colors_read; 
LLVMValueRef main_fn = ctx->main_fn; @@ -491,7 +489,6 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader * si_llvm_declare_compute_memory(ctx); } - ctx->abi.inputs = &ctx->inputs[0]; ctx->abi.clamp_shadow_reference = true; ctx->abi.robust_buffer_access = true; ctx->abi.convert_undef_to_zero = true; @@ -808,9 +805,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); - /* Return the value from the last part. It's non-void only for the prim - * discard compute shader. - */ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) LLVMBuildRetVoid(builder); else @@ -902,12 +896,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* Unconditionally declare scratch space base for streamout and * vertex compaction. Whether space is actually allocated is * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. @@ -1091,7 +1081,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) { LLVMValueRef parts[4]; unsigned num_parts = 0; - bool has_prolog = false; + bool first_is_prolog = false; LLVMValueRef main_fn = ctx.main_fn; if (ngg_cull_main_fn) { @@ -1102,7 +1092,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; - has_prolog = true; + first_is_prolog = true; } parts[num_parts++] = ngg_cull_main_fn; } @@ -1114,21 +1104,31 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; - has_prolog = true; + if (num_parts == 1) + first_is_prolog = true; } parts[num_parts++] = main_fn; - si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); + si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { - LLVMValueRef parts[2]; + LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; + + /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. 
*/ + union si_shader_part_key prolog_key; + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5; + prolog_key.vs_prolog.as_ngg = 1; + prolog_key.vs_prolog.load_vgprs_after_culling = 1; + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + prolog = ctx.main_fn; parts[0] = ngg_cull_main_fn; - parts[1] = ctx.main_fn; + parts[1] = prolog; + parts[2] = main_fn; - si_build_wrapper_function(&ctx, parts, 2, 0, 0, false); + si_build_wrapper_function(&ctx, parts, 3, 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c index 18d8bca3c..450ee8348 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c @@ -24,11 +24,13 @@ #include "si_build_pm4.h" #include "si_query.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/fast_idiv_by_const.h" #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" #include "util/u_dual_blend.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -92,8 +94,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) sctx->last_cb_target_mask = cb_target_mask; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); radeon_end(); } @@ -445,6 +447,14 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); blend->logicop_enable = logicop_enable; + blend->allows_noop_optimization = + state->rt[0].rgb_func == PIPE_BLEND_ADD && + state->rt[0].alpha_func == PIPE_BLEND_ADD && + state->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_DST_COLOR && + state->rt[0].alpha_src_factor == PIPE_BLENDFACTOR_DST_COLOR && + state->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_ZERO && + state->rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_ZERO && + mode == V_028808_CB_NORMAL; unsigned num_shader_outputs = state->max_rt + 1; /* estimate */ if (blend->dual_src_blend) @@ -627,6 +637,79 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } +static bool si_check_blend_dst_sampler_noop(struct si_context *sctx) +{ + if (sctx->framebuffer.state.nr_cbufs == 1) { + struct si_shader_selector *sel = sctx->shader.ps.cso; + bool free_nir; + if (unlikely(sel->info.writes_1_if_tex_is_1 == 0xff)) { + struct nir_shader *nir = si_get_nir_shader(sel, NULL, &free_nir); + + /* Determine if this fragment shader always writes vec4(1) if a specific texture + * is all 1s. 
+ */ + float in[4] = { 1.0, 1.0, 1.0, 1.0 }; + float out[4]; + int texunit; + if (si_nir_is_output_const_if_tex_is_const(nir, in, out, &texunit) && + !memcmp(in, out, 4 * sizeof(float))) { + sel->info.writes_1_if_tex_is_1 = 1 + texunit; + } else { + sel->info.writes_1_if_tex_is_1 = 0; + } + + if (free_nir) + ralloc_free(nir); + } + + if (sel->info.writes_1_if_tex_is_1 && + sel->info.writes_1_if_tex_is_1 != 0xff) { + /* Now check if the texture is cleared to 1 */ + int unit = sctx->shader.ps.cso->info.writes_1_if_tex_is_1 - 1; + struct si_samplers *samp = &sctx->samplers[PIPE_SHADER_FRAGMENT]; + if ((1u << unit) & samp->enabled_mask) { + struct si_texture* tex = (struct si_texture*) samp->views[unit]->texture; + if (tex->is_depth && + tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) && + tex->depth_clear_value[0] == 1) { + return false; + } + /* TODO: handle color textures */ + } + } + } + + return true; +} + +static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); +} + +static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + static void si_bind_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -649,8 +732,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -664,6 +751,15 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->commutative_4bit != blend->commutative_4bit || old_blend->logicop_enable != blend->logicop_enable))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (likely(!radeon_uses_secure_bos(sctx->ws))) { + if (unlikely(blend->allows_noop_optimization)) { + si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop, + si_draw_vstate_blend_dst_sampler_noop); + } else { + si_install_draw_wrapper(sctx, NULL, NULL); + } + } } static void si_delete_blend_state(struct pipe_context *ctx, void *state) @@ -691,8 +787,8 @@ static void si_emit_blend_color(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t *)sctx->blend_color.color, 4); + 
radeon_set_context_reg_seq(R_028414_CB_BLEND_RED, 4); + radeon_emit_array((uint32_t *)sctx->blend_color.color, 4); radeon_end(); } @@ -725,8 +821,8 @@ static void si_emit_clip_state(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); - radeon_emit_array(cs, (uint32_t *)sctx->clip_state.ucp, 6 * 4); + radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4); radeon_end(); } @@ -741,7 +837,6 @@ static void si_emit_clip_regs(struct si_context *sctx) unsigned clipdist_mask = vs_sel->clipdist_mask; unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; unsigned culldist_mask = vs_sel->culldist_mask; - unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask; /* Clip distances on points have no effect, so need to be implemented * as cull distances. This applies for the clipvertex case as well. @@ -752,23 +847,14 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) | - S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && + unsigned pa_cl_cntl = S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && !sctx->screen->options.vrs2x2) | S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | clipdist_mask | (culldist_mask << 8); radeon_begin(&sctx->gfx_cs); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_cntl | vs->pa_cl_vs_out_cntl); radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); radeon_end_update_context_roll(sctx); @@ -834,15 +920,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast return NULL; } - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; @@ -862,9 +939,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->flatshade_first = state->flatshade_first; rs->sprite_coord_enable = state->sprite_coord_enable; rs->rasterizer_discard = state->rasterizer_discard; - rs->polygon_mode_enabled = - (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back == 
PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK)); @@ -882,24 +956,30 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); if (rs->rasterizer_discard) { - rs->ngg_cull_flags = SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED | + SI_NGG_CULL_FRONT_FACE | + SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; } else { - /* Polygon mode can't use view and small primitive culling, - * because it draws points or lines where the culling depends - * on the point or line width. - */ - if (!rs->polygon_mode_enabled) { - rs->ngg_cull_flags |= SI_NGG_CULL_VIEW_SMALLPRIMS; - rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_VIEW_SMALLPRIMS; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED; + rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; + + bool cull_front, cull_back; + + if (!state->front_ccw) { + cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + cull_front = !!(state->cull_face & PIPE_FACE_BACK); } - if (rs->cull_front) { + if (cull_front) { rs->ngg_cull_flags |= SI_NGG_CULL_FRONT_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_BACK_FACE; } - if (rs->cull_back) { + if (cull_back) { rs->ngg_cull_flags |= SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_FRONT_FACE; } @@ -942,7 +1022,10 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028A48_VPORT_SCISSOR_ENABLE(1) | S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + bool polygon_mode_enabled = + (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | @@ -951,11 +1034,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLY_MODE(polygon_mode_enabled) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? rs->polygon_mode_enabled : 0)); + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? 
polygon_mode_enabled : 0)); if (!rs->uses_poly_offset) return rs; @@ -991,11 +1074,12 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } } + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); } return rs; @@ -1044,6 +1128,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade) + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || @@ -1053,8 +1141,19 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } + + if (old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->flatshade != rs->flatshade) + si_update_vrs_flat_shading(sctx); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1079,14 +1178,15 @@ static void si_emit_stencil_ref(struct si_context *sctx) struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); + radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); radeon_end(); } @@ -1270,8 +1370,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) 
{ + si_ps_key_update_dsa(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || @@ -1446,8 +1550,8 @@ static void si_emit_db_render_state(struct si_context *sctx) /* * format translation */ -static uint32_t si_translate_colorformat(enum chip_class chip_class, - enum pipe_format format) +uint32_t si_translate_colorformat(enum chip_class chip_class, + enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); if (!desc) @@ -2234,6 +2338,13 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER); } + if (usage & PIPE_BIND_INDEX_BUFFER) { + if (format == PIPE_FORMAT_R8_UINT || + format == PIPE_FORMAT_R16_UINT || + format == PIPE_FORMAT_R32_UINT) + retval |= PIPE_BIND_INDEX_BUFFER; + } + if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) && !(usage & PIPE_BIND_DEPTH_STENCIL)) retval |= PIPE_BIND_LINEAR; @@ -2585,8 +2696,6 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) tex->dirty_level_mask |= 1 << surf->u.tex.level; tex->fmask_is_identity = false; } - if (tex->dcc_gather_statistics) - tex->separate_dcc_dirty = true; } } @@ -2658,15 +2767,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_update_fb_dirtiness_after_rendering(sctx); - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; - if (tex->dcc_gather_statistics) - vi_separate_dcc_stop_query(sctx, tex); - } - /* Disable DCC if the formats are incompatible. */ for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2823,12 +2923,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, p_atomic_inc(&tex->framebuffers_bound); - if (tex->dcc_gather_statistics) { - /* Dirty tracking must be enabled for DCC usage analysis. */ - sctx->framebuffer.compressed_cb_mask |= 1 << i; - vi_separate_dcc_start_query(sctx, tex); - } - /* Update the minimum but don't keep 0. */ if (!sctx->framebuffer.min_bytes_per_pixel || tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) @@ -2889,6 +2983,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (!sctx->sample_pos_buffer) { + sctx->sample_pos_buffer = pipe_buffer_create_with_data(&sctx->b, 0, PIPE_USAGE_DEFAULT, + sizeof(sctx->sample_positions), + &sctx->sample_positions); + } constbuf.buffer = sctx->sample_pos_buffer; /* Set sample locations as fragment shader constants. 
*/ @@ -2922,6 +3021,10 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -2953,7 +3056,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb = (struct si_surface *)state->cbufs[i]; if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); continue; } @@ -2969,11 +3072,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx) RADEON_PRIO_SEPARATE_META); } - if (tex->dcc_separate_buffer) - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->dcc_separate_buffer, - RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC, - RADEON_PRIO_SEPARATE_META); - /* Compute mutable surface parameters. */ cb_color_base = tex->buffer.gpu_address >> 8; cb_color_fmask = 0; @@ -3013,9 +3111,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (!is_msaa_resolve_dst) cb_color_info |= S_028C70_DCC_ENABLE(1); - cb_dcc_base = - ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.meta_offset) >> - 8; + cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; dcc_tile_swizzle &= ((1 << tex->surface.meta_alignment_log2) - 1) >> 8; @@ -3039,30 +3135,30 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028EE0_CMASK_PIPE_ALIGNED(1) | S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(0); /* hole */ + radeon_emit(0); /* hole */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(0); /* hole */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(0); /* hole */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + 
+ radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + radeon_set_context_reg(R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); } else if (sctx->chip_class == GFX9) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, @@ -3084,24 +3180,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028C74_RB_ALIGNED(meta.rb_aligned) | S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(R_0287A0_CB_MRT0_EPITCH + i * 4, S_0287A0_EPITCH(tex->surface.u.gfx9.epitch)); } else { /* Compute mutable surface parameters (GFX6-GFX8). 
*/ @@ -3145,29 +3241,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, sctx->chip_class >= GFX8 ? 14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); + radeon_emit(cb_dcc_base); } } for (; i < 8; i++) if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); /* ZS buffer. 
*/ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { @@ -3203,49 +3299,47 @@ static void si_emit_framebuffer_state(struct si_context *sctx) unsigned level = zb->base.u.tex.level; if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, - S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, - 
S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10); + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2); + radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ } else { /* GFX6-GFX8 */ /* Set fields dependent on tc_compatile_htile. */ @@ -3263,46 +3357,46 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } } - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info | /* DB_DEPTH_INFO */ + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */ S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile)); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */ } - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ + radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ + 
radeon_emit(fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + radeon_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); } else if (sctx->framebuffer.dirty_zsbuf) { if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 2); else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2); - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } /* Framebuffer dimensions. */ /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + radeon_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + if (sctx->screen->dpbb_allowed) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } radeon_end(); @@ -3508,14 +3602,15 @@ static void si_emit_msaa_config(struct si_context *sctx) } } - /* Required by OpenGL line rasterization. + /* The DX10 diamond test is optional in GL and decreases line rasterization + * performance, so don't use it. * * TODO: We should also enable perpendicular endcaps for AA lines, * but that requires implementing line stippling in the pixel * shader. SC can only do line stippling with axis-aligned * endcaps. */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_line_cntl = 0; unsigned sc_aa_config = 0; if (coverage_samples > 1) { @@ -3559,17 +3654,7 @@ static void si_emit_msaa_config(struct si_context *sctx) /* R_028A4C_PA_SC_MODE_CNTL_1 */ radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - - if (radeon_packets_added()) { - sctx->context_roll = true; - - /* GFX9: Flush DFSM when the AA mode changes. 
*/ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - } - radeon_end(); + radeon_end_update_context_roll(sctx); } void si_update_ps_iter_samples(struct si_context *sctx) @@ -3591,6 +3676,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) return; sctx->ps_iter_samples = min_samples; + + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; si_update_ps_iter_samples(sctx); @@ -3753,8 +3841,8 @@ static void gfx10_make_texture_descriptor( } if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); - img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; + assert(img_format == V_008F0C_GFX10_FORMAT_32_FLOAT); + img_format = V_008F0C_GFX10_FORMAT_32_FLOAT_CLAMP; } } else { util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); @@ -3818,43 +3906,43 @@ static void gfx10_make_texture_descriptor( #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) switch (FMASK(res->nr_samples, res->nr_storage_samples)) { case FMASK(2, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F1; break; case FMASK(2, 2): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; + format = V_008F0C_GFX10_FORMAT_FMASK8_S2_F2; break; case FMASK(4, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F1; break; case FMASK(4, 2): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F2; break; case FMASK(4, 4): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; + format = V_008F0C_GFX10_FORMAT_FMASK8_S4_F4; break; case FMASK(8, 1): - format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; + format = V_008F0C_GFX10_FORMAT_FMASK8_S8_F1; break; case FMASK(8, 2): - format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; + format = V_008F0C_GFX10_FORMAT_FMASK16_S8_F2; break; case FMASK(8, 4): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F4; break; case FMASK(8, 8): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; + format = V_008F0C_GFX10_FORMAT_FMASK32_S8_F8; break; case FMASK(16, 1): - format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; + format = V_008F0C_GFX10_FORMAT_FMASK16_S16_F1; break; case FMASK(16, 2): - format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; + format = V_008F0C_GFX10_FORMAT_FMASK32_S16_F2; break; case FMASK(16, 4): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; + format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F4; break; case FMASK(16, 8): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; + format = V_008F0C_GFX10_FORMAT_FMASK64_S16_F8; break; default: unreachable("invalid nr_samples"); @@ -4223,7 +4311,7 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx unsigned force_level) { struct si_context *sctx = (struct si_context *)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_sampler_view *view = CALLOC_STRUCT_CL(si_sampler_view); struct si_texture *tex = (struct si_texture *)texture; unsigned base_level, first_level, last_level; unsigned char state_swizzle[4]; @@ -4357,7 +4445,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sample struct si_sampler_view *view = (struct si_sampler_view *)state; pipe_resource_reference(&state->texture, NULL); - FREE(view); + FREE_CL(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) @@ 
-4404,9 +4492,13 @@ static uint32_t si_translate_border_color(struct si_context *sctx, if (i >= SI_MAX_BORDER_COLORS) { /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. " - "Please file a bug.\n"); + static bool printed; + if (!printed) { + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "This is a hardware limitation.\n"); + printed = true; + } return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); } @@ -4552,9 +4644,9 @@ static void si_emit_sample_mask(struct si_context *sctx) (mask & 1 && sctx->blitter_running)); radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + radeon_set_context_reg_seq(R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(mask | (mask << 16)); + radeon_emit(mask | (mask << 16)); radeon_end(); } @@ -4606,8 +4698,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->count = count; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen); unsigned alloc_count = - count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0; + count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0; v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { @@ -4623,8 +4716,6 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, unsigned instance_divisor = elements[i].instance_divisor; if (instance_divisor) { - v->uses_instance_divisors = true; - if (instance_divisor == 1) { v->instance_divisor_is_one |= 1u << i; } else { @@ -4820,22 +4911,23 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_buffer_user_sgprs_dirty = false; } - if (old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || + if (old->instance_divisor_is_one != v->instance_divisor_is_one || + old->instance_divisor_is_fetched != v->instance_divisor_is_fetched || (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || + sizeof(v->vertex_buffer_index[0]) * MAX2(old->count, v->count))) || /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are * functions of fix_fetch and the src_offset alignment. * If they change and fix_fetch doesn't, it must be due to different * src_offset alignment, which is reflected in fix_fetch_opencode. */ old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)) + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * + MAX2(old->count, v->count))) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } if (v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; @@ -4931,8 +5023,82 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * be the case in well-behaved applications anyway. 
*/ if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) + (unaligned | orig_unaligned) & updated_mask)) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } +} + +static struct pipe_vertex_state * +si_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state); + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask, + &state->b); + + /* Initialize the vertex element state in state->element. + * Do it by creating a vertex element state object and copying it there. + */ + struct si_context ctx = {}; + ctx.b.screen = screen; + struct si_vertex_elements *velems = si_create_vertex_elements(&ctx.b, num_elements, elements); + state->velems = *velems; + si_delete_vertex_element(&ctx.b, velems); + + assert(!state->velems.instance_divisor_is_one); + assert(!state->velems.instance_divisor_is_fetched); + assert(!state->velems.fix_fetch_always); + assert(buffer->stride % 4 == 0); + assert(buffer->buffer_offset % 4 == 0); + assert(!buffer->is_user_buffer); + for (unsigned i = 0; i < num_elements; i++) { + assert(elements[i].src_offset % 4 == 0); + assert(!elements[i].dual_slot); + } + + for (unsigned i = 0; i < num_elements; i++) { + si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i, + &state->descriptors[i * 4]); + } + + return &state->b; +} + +static void si_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + +static struct pipe_vertex_state * +si_pipe_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, &sscreen->vertex_state_cache); +} + +static void si_pipe_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state); } /* @@ -4957,6 +5123,13 @@ static void si_set_tess_state(struct pipe_context *ctx, const float default_oute si_set_internal_const_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); } +static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct si_context *sctx = (struct si_context *)ctx; + + sctx->patch_vertices = patch_vertices; +} + static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; @@ -5086,6 +5259,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.texture_barrier = si_texture_barrier; sctx->b.set_min_samples = si_set_min_samples; sctx->b.set_tess_state = si_set_tess_state; + sctx->b.set_patch_vertices = si_set_patch_vertices; sctx->b.set_active_query_state = si_set_active_query_state; } @@ -5093,12 +5267,17 @@ void si_init_state_functions(struct si_context *sctx) void 
si_init_screen_state_functions(struct si_screen *sscreen) { sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.create_vertex_state = si_pipe_create_vertex_state; + sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy; if (sscreen->info.chip_class >= GFX10) { sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; } else { sscreen->make_texture_descriptor = si_make_texture_descriptor; } + + util_vertex_state_cache_init(&sscreen->vertex_state_cache, + si_create_vertex_state, si_vertex_state_destroy); } static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) @@ -5226,6 +5405,12 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_028034_BR_X(16384) | S_028034_BR_Y(16384)); } + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL, + S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF) | + S_028038_POPS_DRAIN_PS_ON_OVERLAP(1)); + } + unsigned cu_mask_ps = 0xffffffff; /* It's wasteful to enable all CUs for PS if shader arrays have a different @@ -5239,63 +5424,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa); if (sctx->chip_class >= GFX7) { - /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa; - unsigned late_alloc_wave64 = 0; /* The limit is per SA. */ - unsigned cu_mask_vs = 0xffff; - unsigned cu_mask_gs = 0xffff; - - if (sctx->chip_class >= GFX10) { - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - */ - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else { - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock. - * Others: CU1 must be disabled to prevent a hw deadlock. - * - * The deadlock is caused by late alloc, which usually increases - * performance. - */ - cu_mask_vs &= sctx->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) : - ~BITFIELD_RANGE(1, 1); - - /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ - if (sscreen->use_ngg && sctx->family != CHIP_NAVI14) - cu_mask_gs = cu_mask_vs; - } - } else { - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SA. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_wave64 = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - } - - if (late_alloc_wave64 > 2) - cu_mask_vs = 0xfffe; /* 1 CU disabled */ - } - - /* VS can't execute on one CU if the limit is > 2. 
*/ - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F)); } @@ -5316,6 +5444,21 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); } + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else if (sscreen->info.chip_class == GFX9) { + si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, + S_00B414_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, + S_00B214_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else { + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + } + if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); @@ -5354,6 +5497,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); + + si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | + S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); } if (sctx->chip_class >= GFX9) { diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h index ea31a2afd..a6daa158b 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.h @@ -65,6 +65,7 @@ struct si_state_blend { bool alpha_to_one : 1; bool dual_src_blend : 1; bool logicop_enable : 1; + bool allows_noop_optimization : 1; }; struct si_state_rasterizer { @@ -95,11 +96,6 @@ struct si_state_rasterizer { unsigned rasterizer_discard : 1; unsigned scissor_enable : 1; unsigned clip_halfz : 1; - unsigned cull_front : 1; - unsigned cull_back : 1; - unsigned depth_clamp_any : 1; - unsigned provoking_vertex_first : 1; - unsigned polygon_mode_enabled : 1; unsigned polygon_mode_is_lines : 1; unsigned polygon_mode_is_points : 1; }; @@ -173,7 +169,6 @@ struct si_vertex_elements { uint16_t vb_alignment_check_mask; uint8_t count; - bool uses_instance_divisors; uint16_t first_vb_use_mask; /* Vertex buffer descriptor list size aligned for optimal prefetch. 
*/ @@ -188,13 +183,13 @@ union si_state { struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; + struct si_shader *ls; + struct si_shader *hs; + struct si_shader *es; + struct si_shader *gs; struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; + struct si_shader *vs; + struct si_shader *ps; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; @@ -254,12 +249,6 @@ struct si_shader_data { uint32_t sh_base[SI_NUM_SHADERS]; }; -#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ - (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \ - S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ - S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | \ - S_02881C_USE_VTX_VRS_RATE(1)) - /* The list of registers whose emitted values are remembered by si_context. */ enum si_tracked_reg { @@ -285,12 +274,11 @@ enum si_tracked_reg SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ + SI_TRACKED_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, SI_TRACKED_PA_SC_BINNER_CNTL_0, - SI_TRACKED_DB_DFSM_CONTROL, + SI_TRACKED_DB_VRS_OVERRIDE_CNTL, SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ @@ -347,7 +335,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; @@ -490,8 +481,10 @@ struct si_buffer_resources { void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, const struct legacy_surf_level *base_level_info, unsigned base_level, unsigned first_level, unsigned block_width, - bool is_stencil, uint16_t access, uint32_t *state); + /* restrict decreases overhead of si_set_sampler_view_desc ~8x. 
*/ + bool is_stencil, uint16_t access, uint32_t * restrict state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader); void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, struct pipe_constant_buffer *cbuf); void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, @@ -527,6 +520,7 @@ struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, uns void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab); void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf); /* si_state.c */ +uint32_t si_translate_colorformat(enum chip_class chip_class, enum pipe_format format); void si_init_state_compute_functions(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx); void si_init_screen_state_functions(struct si_screen *sscreen); @@ -567,7 +561,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_shader_mem_ordered(struct si_shader *shader); -bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); @@ -578,18 +571,40 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, util_queue_execute_func execute); void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); -unsigned si_get_input_prim(const struct si_shader_selector *gs); +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state); +void si_vs_key_update_inputs(struct si_context *sctx); +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key); +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); +void si_update_ps_kill_enable(struct si_context *sctx); +void si_update_vrs_flat_shading(struct si_context *sctx); +unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key); bool si_update_ngg(struct si_context *sctx); - -/* si_state_draw.c */ -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); -void si_trace_emit(struct si_context *sctx); -void si_init_draw_functions(struct si_context *sctx); +void si_ps_key_update_framebuffer(struct si_context *sctx); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx); +void si_ps_key_update_blend_rasterizer(struct si_context *sctx); +void si_ps_key_update_rasterizer(struct si_context *sctx); +void si_ps_key_update_dsa(struct si_context *sctx); +void si_ps_key_update_sample_shading(struct si_context *sctx); 
+void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); +void si_init_tess_factor_ring(struct si_context *sctx); +bool si_update_gs_ring_buffers(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes); + +/* si_state_draw.cpp */ +void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems, + struct pipe_vertex_buffer *vb, unsigned element_index, + uint32_t *out); +void si_init_draw_functions_GFX6(struct si_context *sctx); +void si_init_draw_functions_GFX7(struct si_context *sctx); +void si_init_draw_functions_GFX8(struct si_context *sctx); +void si_init_draw_functions_GFX9(struct si_context *sctx); +void si_init_draw_functions_GFX10(struct si_context *sctx); +void si_init_draw_functions_GFX10_3(struct si_context *sctx); +void si_init_spi_map_functions(struct si_context *sctx); /* si_state_msaa.c */ void si_init_msaa_functions(struct si_context *sctx); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c index e5e5f1a65..921bd5446 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, shader_variant_flags |= 1 << 0; if (sel->nir) shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32) + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32) shader_variant_flags |= 1 << 2; if (sel->info.stage == MESA_SHADER_FRAGMENT && /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ @@ -78,11 +78,14 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, sel->info.base.fs.uses_discard && sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) shader_variant_flags |= 1 << 3; - if (sel->info.stage == MESA_SHADER_VERTEX) { - /* This varies depending on whether compute-based culling is enabled. */ - assert(sel->screen->num_vbos_in_user_sgprs <= 7); - shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4; - } + /* use_ngg_culling disables NGG passthrough for non-culling shaders to reduce context + * rolls, which can be changed with AMD_DEBUG=nonggc or AMD_DEBUG=nggc. 
+ */ + if (sel->screen->use_ngg_culling) + shader_variant_flags |= 1 << 4; + + /* bit gap */ + if (sel->screen->options.no_infinite_interp) shader_variant_flags |= 1 << 7; if (sel->screen->options.clamp_div_by_zero) @@ -370,7 +373,7 @@ bool si_shader_mem_ordered(struct si_shader *shader) } static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, - struct si_pm4_state *pm4) + struct si_shader *shader) { const struct si_shader_info *info = &tes->info; unsigned tes_prim_mode = info->base.tess.primitive_mode; @@ -427,10 +430,9 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad } else distribution_mode = V_028B6C_NO_DIST; - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); + shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -444,18 +446,16 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shad * VS as ES | ES -> GS -> VS | 30 * TES as VS | LS -> HS -> VS | 14 or 30 * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30 - * - * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, - struct si_shader *shader, struct si_pm4_state *pm4) + struct si_shader *shader) { if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) return; /* VS as VS, or VS as ES: */ if ((sel->info.stage == MESA_SHADER_VERTEX && - (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + (!shader->key.as_ls && !shader->is_gs_copy_shader)) || /* TES as VS, or TES as ES: */ sel->info.stage == MESA_SHADER_TESS_EVAL) { unsigned vtx_reuse_depth = 30; @@ -464,25 +464,15 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_sh sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD) vtx_reuse_depth = 14; - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if (shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + si_pm4_clear_state(&shader->pm4); + shader->pm4.is_shader = true; + return &shader->pm4; } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, @@ -509,22 +499,30 @@ static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_sha assert(shader->selector->info.stage == MESA_SHADER_VERTEX || (shader->previous_stage_sel && shader->previous_stage_sel->info.stage == MESA_SHADER_VERTEX)); - /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). - * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) - * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). 
- * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or - * InstanceID) + /* GFX6-9 LS (VertexID, RelAutoIndex, InstanceID / StepRate0, InstanceID) + * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID) + * GFX10 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID) + * GFX10 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID) */ bool is_ls = shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->key.as_ls; + unsigned max = 0; - if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) - return 3; - else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) - return 2; - else if (is_ls || shader->info.uses_instanceid) - return 1; - else - return 0; + if (shader->info.uses_instanceid) { + if (sscreen->info.chip_class >= GFX10) + max = MAX2(max, 3); + else if (is_ls) + max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + else + max = MAX2(max, 1); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + } + + if (legacy_vs_prim_id) + max = MAX2(max, 2); /* VSPrimID */ + + if (is_ls) + max = MAX2(max, 1); /* RelAutoIndex */ + + return max; } static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) @@ -540,7 +538,6 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | @@ -565,10 +562,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.chip_class >= GFX9) { if (sscreen->info.chip_class >= GFX10) { si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); } else { si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); } unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); @@ -582,7 +577,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); } else { si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); - si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, + S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8)); shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) | S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); @@ -607,7 +603,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; + struct si_shader *shader = sctx->queued.named.es; if (!shader) return; @@ -656,7 +652,8 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 
1 : 0; si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | @@ -667,9 +664,9 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, @@ -767,7 +764,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -822,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); } radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. */ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -868,6 +879,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); + /* Copy over fields from the GS copy shader to make them easily accessible from GS. 
*/ + shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl; + va = shader->bo->gpu_address; if (sscreen->info.chip_class >= GFX9) { @@ -902,10 +916,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.chip_class >= GFX10) { si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); } else { si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); } uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | @@ -929,10 +941,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -943,12 +955,16 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader); } else { + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); - si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, + S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | @@ -960,28 +976,25 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } } -static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) +bool gfx10_is_ngg_passthrough(struct si_shader *shader) { - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - radeon_begin(cs); - - if (sctx->chip_class == GFX10) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } + struct si_shader_selector *sel = shader->selector; - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - radeon_end(); + /* Never use NGG passthrough if culling is possible even when it's not used by this shader, + * so that we don't get context rolls when enabling and disabling NGG passthrough. 
+ */ + if (sel->screen->use_ngg_culling) + return false; - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + /* The definition of NGG passthrough is: + * - user GS is turned off (no amplification, no GS instancing, and no culling) + * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1) + * - vertex indices are packed into 1 VGPR + * - Dimgrey and later chips can optionally skip the gs_alloc_req message + * + * NGG passthrough still allows the use of LDS. + */ + return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.opt.ngg_culling; } /* Common tail code for NGG primitive shaders. */ @@ -1012,18 +1025,24 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, shader->ctx_reg.ngg.pa_cl_ngg_cntl); - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); radeon_end_update_context_roll(sctx); - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); + /* These don't cause a context roll. */ + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.ngg.ge_pc_alloc); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1032,7 +1051,7 @@ static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1046,7 +1065,7 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1060,7 +1079,7 @@ static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; + struct si_shader *shader = sctx->queued.named.gs; if (!shader) return; @@ -1075,7 +1094,7 @@ static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) gfx10_emit_shader_ngg_tail(sctx, shader); } -unsigned si_get_input_prim(const struct si_shader_selector *gs) +unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key) { if (gs->info.stage == MESA_SHADER_GEOMETRY) return gs->info.base.gs.input_primitive; @@ -1088,22 +1107,26 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs) return PIPE_PRIM_TRIANGLES; } - /* TODO: Set this correctly if the primitive type is set in the shader key. 
*/ + if (key->opt.ngg_culling & SI_NGG_CULL_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ } static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, const struct si_shader *shader, bool ngg) { - bool writes_psize = sel->info.writes_psize; - - if (shader) - writes_psize &= !shader->key.opt.kill_pointsize; - + /* Clip distances can be killed, but cull distances can't. */ + unsigned clipcull_mask = (sel->clipdist_mask & ~shader->key.opt.kill_clip_distances) | + sel->culldist_mask; + bool writes_psize = sel->info.writes_psize && !shader->key.opt.kill_pointsize; bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) || sel->screen->options.vrs2x2 || sel->info.writes_layer || sel->info.writes_viewport_index; - return S_02881C_USE_VTX_POINT_SIZE(writes_psize) | + + return S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipcull_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipcull_mask & 0xF0) != 0) | + S_02881C_USE_VTX_POINT_SIZE(writes_psize) | S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) | S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | @@ -1132,7 +1155,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_info->base.vs.window_space_position : 0; bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); - unsigned input_prim = si_get_input_prim(gs_sel); + unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); bool break_wave_at_eoi = false; struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -1174,7 +1197,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader * for the GL_LINE polygon mode to skip rendering lines on inner edges. */ if (gs_info->uses_invocationid || - (gs_stage == MESA_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) + (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader))) gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) || (gs_stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) @@ -1185,9 +1208,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ unsigned wave_size = si_get_shader_wave_size(shader); + unsigned late_alloc_wave64, cu_mask; + + ac_compute_late_alloc(&sscreen->info, true, shader->key.opt.ngg_culling, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); si_pm4_set_reg( pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | @@ -1205,32 +1232,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - /* Determine LATE_ALLOC_GS. */ - unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa; - unsigned late_alloc_wave64; /* The limit is per SA. */ - - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - * - * Don't use late alloc for NGG on Navi14 due to a hw bug. 
- */ - if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) - late_alloc_wave64 = 0; - else if (shader->key.opt.ngg_culling) - late_alloc_wave64 = num_cu_per_sh * 10; - else - late_alloc_wave64 = num_cu_per_sh * 4; - - /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */ - if (sscreen->info.chip_class == GFX10) - late_alloc_wave64 = MIN2(late_alloc_wave64, 64); - - /* Max number that fits into the register field. */ - late_alloc_wave64 = MIN2(late_alloc_wave64, 127); - - si_pm4_set_reg( - pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = @@ -1261,7 +1266,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader } if (es_stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); + si_set_tesseval_regs(sscreen, es_sel, shader); shader->ctx_reg.ngg.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | @@ -1275,59 +1280,55 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); - /* Always output hw-generated edge flags and pass them via the prim + /* Output hw-generated edge flags if needed and pass them via the prim * export to prevent drawing lines on internal edges of decomposed - * primitives (such as quads) with polygon mode = lines. Only VS needs - * this. + * primitives (such as quads) with polygon mode = lines. */ shader->ctx_reg.ngg.pa_cl_ngg_cntl = - S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_stage == MESA_SHADER_VERTEX) | + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gfx10_edgeflags_have_effect(shader)) | /* Reuse for NGG. */ S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true); /* Oversubscribe PC. This improves performance when there are too many varyings. */ - float oversub_pc_factor = 0.25; + unsigned oversub_pc_factor = 1; if (shader->key.opt.ngg_culling) { /* Be more aggressive with NGG culling. */ if (shader->info.nr_param_exports > 4) - oversub_pc_factor = 1; + oversub_pc_factor = 4; else if (shader->info.nr_param_exports > 2) - oversub_pc_factor = 0.75; + oversub_pc_factor = 3; else - oversub_pc_factor = 0.5; + oversub_pc_factor = 2; } - unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; - shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + unsigned oversub_pc_lines = + late_alloc_wave64 ? 
(sscreen->info.pc_lines / 4) * oversub_pc_factor : 0; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); - } else { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + /* On gfx10, the GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256 + * if we have enough LDS. + * + * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0. + */ + if ((sscreen->info.chip_class == GFX10) && + (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256 && + shader->ngg.hw_max_esverts > 5) { + /* This could be based on the input primitive type. 5 is the worst case + * for primitive types with adjacency. */ - if ((sscreen->info.chip_class == GFX10) && - (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); - } - } + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); } if (window_space) { @@ -1338,11 +1339,15 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); } + + shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; + shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs; + shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); } static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; + struct si_shader *shader = sctx->queued.named.vs; if (!shader) return; @@ -1385,16 +1390,15 @@ static void si_emit_shader_vs(struct si_context *sctx) S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); } - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + if (sctx->chip_class >= GFX10) { + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.vs.ge_pc_alloc); + radeon_end(); + } } /** @@ -1485,14 +1489,26 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, : V_02870C_SPI_SHADER_NONE) | S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE); - shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + unsigned late_alloc_wave64, cu_mask; + ac_compute_late_alloc(&sscreen->info, false, false, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); + + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) | S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false); oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0; + if (sscreen->info.chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); + } + si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); - si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, + S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | @@ -1530,9 +1546,9 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + si_set_tesseval_regs(sscreen, shader->selector, shader); - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); } static unsigned si_get_ps_num_interp(struct si_shader *ps) @@ -1567,7 +1583,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; + struct si_shader *shader = sctx->queued.named.ps; if (!shader) return; @@ -1695,10 +1711,13 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.ps.spi_ps_input_ena = input_ena; shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + unsigned num_interp = si_get_ps_num_interp(shader); + /* Set interpolation controls. 
*/ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + shader->ctx_reg.ps.num_interp = num_interp; shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; shader->ctx_reg.ps.spi_shader_z_format = @@ -1708,7 +1727,8 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); - si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, + S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) | @@ -1764,31 +1784,41 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader } } -static unsigned si_get_alpha_test_func(struct si_context *sctx) +static void si_clear_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - /* Alpha-test should be disabled if colorbuffer 0 is integer. */ - return sctx->queued.named.dsa->alpha_func; + prolog_key->instance_divisor_is_one = 0; + prolog_key->instance_divisor_is_fetched = 0; + key->mono.vs_fetch_opencode = 0; + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); } -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) +void si_vs_key_update_inputs(struct si_context *sctx) { - if (vs->info.base.vs.blit_sgprs_amd) + struct si_shader_selector *vs = sctx->shader.vs.cso; + struct si_vertex_elements *elts = sctx->vertex_elements; + struct si_shader_key *key = &sctx->shader.vs.key; + + if (!vs) return; - struct si_vertex_elements *elts = sctx->vertex_elements; + if (vs->info.base.vs.blit_sgprs_amd) { + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); + key->opt.prefer_mono = 0; + sctx->uses_nontrivial_vs_prolog = false; + return; + } - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; - prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing; + bool uses_nontrivial_vs_prolog = false; - /* Prefer a monolithic shader to allow scheduling divisions around - * VBO loads. 
*/ - if (prolog_key->instance_divisor_is_fetched) - key->opt.prefer_mono = 1; + if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched) + uses_nontrivial_vs_prolog = true; + + key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; + key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + key->opt.prefer_mono = elts->instance_divisor_is_fetched; - unsigned count = MIN2(vs->info.num_inputs, elts->count); - unsigned count_mask = (1 << count) - 1; + unsigned count_mask = (1 << vs->info.num_inputs) - 1; unsigned fix = elts->fix_fetch_always & count_mask; unsigned opencode = elts->fix_fetch_opencode & count_mask; @@ -1807,19 +1837,49 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selecto } } + memset(key->mono.vs_fix_fetch, 0, sizeof(key->mono.vs_fix_fetch)); + while (fix) { unsigned i = u_bit_scan(&fix); - key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + uint8_t fix_fetch = elts->fix_fetch[i]; + + key->mono.vs_fix_fetch[i].bits = fix_fetch; + if (fix_fetch) + uses_nontrivial_vs_prolog = true; } key->mono.vs_fetch_opencode = opencode; + if (opencode) + uses_nontrivial_vs_prolog = true; + + sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog; + + /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores + * the current vertex buffers and vertex elements. + * + * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog, + * so that we know whether the VS prolog should be updated when we switch from + * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state. + * This should happen rarely because the VS prolog should be trivial in most + * cases. + */ + if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog) + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); } -static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key) +void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) { - struct si_shader_selector *ps = sctx->shader.ps.cso; + prolog_key->instance_divisor_is_one = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.part.vs.prolog.instance_divisor_is_fetched; - key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + key->mono.vs_fetch_opencode = sctx->shader.vs.key.mono.vs_fetch_opencode; + memcpy(key->mono.vs_fix_fetch, sctx->shader.vs.key.mono.vs_fix_fetch, + sizeof(key->mono.vs_fix_fetch)); +} + +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) +{ + struct si_shader_selector *ps = sctx->shader.ps.cso; /* Find out if PS is disabled. */ bool ps_disabled = true; @@ -1827,273 +1887,314 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shad bool ps_modifies_zs = ps->info.base.fs.uses_discard || ps->info.writes_z || ps->info.writes_stencil || ps->info.writes_samplemask || sctx->queued.named.blend->alpha_to_coverage || - si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; + sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS; unsigned ps_colormask = si_get_total_colormask(sctx); ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); } - /* Find out which VS outputs aren't used by the PS. 
*/ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; +} - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(VARYING_SLOT_POS, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_PSIZ, true)) | - (1ull << si_shader_io_get_unique_index(VARYING_SLOT_CLIP_VERTEX, true))); +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) +{ - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } + key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; - uint64_t linked = outputs_written & inputs_read; + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; key->opt.kill_outputs = ~linked & outputs_written; if (vs->info.stage != MESA_SHADER_GEOMETRY) { key->opt.ngg_culling = sctx->ngg_culling; - - if (sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; + key->mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; + } else { + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; } - /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */ - if (sctx->chip_class >= GFX10 && - vs->info.writes_psize && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - !sctx->queued.named.rasterizer->polygon_mode_is_points) - key->opt.kill_pointsize = 1; + key->opt.kill_pointsize = vs->info.writes_psize && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + !sctx->queued.named.rasterizer->polygon_mode_is_points; } -/* Compute the key for the hw shader variant */ -static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, - union si_vgt_stages_key stages_key, - struct si_shader_key *key) +static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) { - struct si_context *sctx = (struct si_context *)ctx; + key->opt.kill_clip_distances = 0; + key->opt.kill_outputs = 0; + key->opt.ngg_culling = 0; + key->mono.u.vs_export_prim_id = 0; + key->opt.kill_pointsize = 0; +} + +void si_ps_key_update_framebuffer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; - memset(key, 0, sizeof(*key)); + if (!sel) + return; - unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; - if (num_inlinable_uniforms && - sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { - key->opt.inline_uniforms = true; - memcpy(key->opt.inlined_uniform_values, - sctx->inlinable_uniforms[sel->pipe_shader_type], - num_inlinable_uniforms * 4); + if (sel->info.color0_writes_all_cbufs && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + else + key->part.ps.epilog.last_cbuf = 0; + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. 
*/ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } else { + key->mono.u.ps.fbfetch_msaa = 0; + key->mono.u.ps.fbfetch_is_1D = 0; + key->mono.u.ps.fbfetch_layered = 0; } +} - switch (sel->info.stage) { - case MESA_SHADER_VERTEX: - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; - if (sctx->shader.tes.cso) - key->as_ls = 1; - else if (sctx->shader.gs.cso) { - key->as_es = 1; - key->as_ngg = stages_key.u.ngg; - } else { - key->as_ngg = stages_key.u.ngg; - si_shader_selector_key_hw_vs(sctx, sel, key); - } - break; - case MESA_SHADER_TESS_CTRL: - if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.tcs.ls_prolog); - key->part.tcs.ls = sctx->shader.vs.cso; + if (!sel) + return; - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; + /* Select the shader color format based on whether + * blending or alpha are needed. + */ + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * This also enables jumping over the VS prolog for HS-only waves. - */ - key->opt.prefer_mono = 1; - key->opt.same_patch_vertices = sctx->same_patch_vertices; - } + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. 
+ */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - key->part.tcs.epilog.prim_mode = - sctx->shader.tes.cso->info.base.tess.primitive_mode; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = sctx->shader.tes.cso->info.reads_tess_factors; + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->shader.vs.cso->outputs_written; - break; - case MESA_SHADER_TESS_EVAL: - key->as_ngg = stages_key.u.ngg; + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } - if (sctx->shader.gs.cso) - key->as_es = 1; - else { - si_shader_selector_key_hw_vs(sctx, sel, key); - } - break; - case MESA_SHADER_GEOMETRY: - if (sctx->chip_class >= GFX9) { - if (sctx->shader.tes.cso) { - key->part.gs.es = sctx->shader.tes.cso; - } else { - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, key, &key->part.gs.vs_prolog); - key->part.gs.es = sctx->shader.vs.cso; - } + /* Eliminate shader code computing output values that are unused. + * This enables dead code elimination between shader parts. + * Check if any output is eliminated. + */ + if (sel->colors_written_4bit & + ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) + key->opt.prefer_mono = 1; + else + key->opt.prefer_mono = 0; +} - key->as_ngg = stages_key.u.ngg; +void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - /* Only NGG can eliminate GS outputs, because the code is shared with VS. */ - if (stages_key.u.ngg) - si_shader_selector_key_hw_vs(sctx, sel, key); + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; +} - /* This enables jumping over the VS prolog for GS-only waves. */ - key->opt.prefer_mono = 1; - } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; - break; - case MESA_SHADER_FRAGMENT: { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; +void si_ps_key_update_rasterizer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (sel->info.color0_writes_all_cbufs && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + if (!sel) + return; - /* Select the shader color format based on whether - * blending or alpha are needed. 
- */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; +} - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. - */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; +void si_ps_key_update_dsa(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } + key->part.ps.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } +static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) +{ + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - /* Eliminate shader code computing output values that are unused. - * This enables dead code elimination between shader parts. - * Check if any output is eliminated. 
- */ - if (sel->colors_written_4bit & - ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) - key->opt.prefer_mono = 1; + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; +} - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; +void si_ps_key_update_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; + if (!sel) + return; - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = - ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + else + key->part.ps.prolog.samplemask_log_ps_iter = 0; +} - if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); - } +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + struct si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - bool uses_persp_center = sel->info.uses_persp_center || - (!rs->flatshade && sel->info.uses_persp_center_color); - bool uses_persp_centroid = sel->info.uses_persp_centroid || - (!rs->flatshade && sel->info.uses_persp_centroid_color); - bool uses_persp_sample = sel->info.uses_persp_sample || - (!rs->flatshade && sel->info.uses_persp_sample_color); - - if (rs->force_persample_interp && rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - uses_persp_center || uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - uses_persp_center && uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute more than 1 pair - * of (i,j), which is the optimization here. 
*/ - key->part.ps.prolog.force_persp_center_interp = uses_persp_center + - uses_persp_centroid + - uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_interp_at_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; + if (!sel) + return; + + bool uses_persp_center = sel->info.uses_persp_center || + (!rs->flatshade && sel->info.uses_persp_center_color); + bool uses_persp_centroid = sel->info.uses_persp_centroid || + (!rs->flatshade && sel->info.uses_persp_centroid_color); + bool uses_persp_sample = sel->info.uses_persp_sample || + (!rs->flatshade && sel->info.uses_persp_sample_color); + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + uses_persp_center || uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + key->part.ps.prolog.force_persp_center_interp = 0; + key->part.ps.prolog.force_linear_center_interp = 0; + key->part.ps.prolog.bc_optimize_for_persp = + uses_persp_center && uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + key->mono.u.ps.interpolate_at_sample_force_center = 0; + } else { + key->part.ps.prolog.force_persp_sample_interp = 0; + key->part.ps.prolog.force_linear_sample_interp = 0; + + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. 
*/ + key->part.ps.prolog.force_persp_center_interp = uses_persp_center + + uses_persp_centroid + + uses_persp_sample > 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > 1; + key->part.ps.prolog.bc_optimize_for_persp = 0; + key->part.ps.prolog.bc_optimize_for_linear = 0; + key->mono.u.ps.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; + } +} + +/* Compute the key for the hw shader variant */ +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, + struct si_shader_key *key) +{ + struct si_context *sctx = (struct si_context *)ctx; + + switch (sel->info.stage) { + case MESA_SHADER_VERTEX: + if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_TESS_CTRL: + if (sctx->chip_class >= GFX9) { + si_get_vs_key_inputs(sctx, key, &key->part.tcs.ls_prolog); + key->part.tcs.ls = sctx->shader.vs.cso; } + break; + case MESA_SHADER_TESS_EVAL: + if (!sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_GEOMETRY: + if (sctx->chip_class >= GFX9) { + if (sctx->shader.tes.cso) { + si_clear_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->shader.tes.cso; + } else { + si_get_vs_key_inputs(sctx, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->shader.vs.cso; + } - key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter_running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. */ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = - sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = - tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; + /* Only NGG can eliminate GS outputs, because the code is shared with VS. 
*/ + if (sctx->ngg) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); } break; - } + case MESA_SHADER_FRAGMENT: + si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); + break; default: assert(0); } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); } static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) @@ -2138,7 +2239,7 @@ static void si_build_shader_variant(struct si_shader *shader, int thread_index, si_shader_init_pm4_state(sscreen, shader); } -static void si_build_shader_variant_low_priority(void *job, int thread_index) +static void si_build_shader_variant_low_priority(void *job, void *gdata, int thread_index) { struct si_shader *shader = (struct si_shader *)job; @@ -2151,7 +2252,7 @@ static const struct si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) + const struct si_shader_key *key) { struct si_shader **mainp = si_get_main_shader_part(sel, key); @@ -2182,6 +2283,16 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad return true; } +/* A helper to copy *key to *local_key and return local_key. */ +static const struct si_shader_key * +use_local_key_copy(const struct si_shader_key *key, struct si_shader_key *local_key) +{ + if (key != local_key) + memcpy(local_key, key, sizeof(*key)); + + return local_key; +} + /** * Select a shader variant according to the shader key. * @@ -2189,14 +2300,26 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad * the compilation isn't finished, don't select any * shader and return an error. */ -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none) +int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const struct si_shader_key *key, int thread_index, + bool optimized_or_none) { + struct si_screen *sscreen = sctx->screen; struct si_shader_selector *sel = state->cso; struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; + /* si_shader_select_with_key must not modify 'key' because it would affect future shaders. + * If we need to modify it for this specific shader (eg: to disable optimizations), we + * use a copy. + */ + struct si_shader_key local_key; + + if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) { + /* Disable shader variant optimizations. */ + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); + } again: /* Check if we don't need to change anything. @@ -2209,7 +2332,8 @@ again: if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto current_not_ready; } @@ -2248,9 +2372,10 @@ current_not_ready: key->opt.inlined_uniform_values, MAX_INLINABLE_UNIFORMS * 4) != 0) { if (variant_count++ > max_inline_uniforms_variants) { + key = use_local_key_copy(key, &local_key); /* Too many variants. Disable inlining for this shader. 
*/ - key->opt.inline_uniforms = 0; - memset(key->opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); + local_key.opt.inline_uniforms = 0; + memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); simple_mtx_unlock(&sel->mutex); goto again; } @@ -2267,7 +2392,9 @@ current_not_ready: if (iter->is_optimized) { if (optimized_or_none) return -1; - memset(&key->opt, 0, sizeof(key->opt)); + + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); goto again; } @@ -2292,9 +2419,14 @@ current_not_ready: util_queue_fence_init(&shader->ready); + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + shader->selector = sel; shader->key = *key; - shader->compiler_ctx_state = *compiler_state; + shader->compiler_ctx_state.compiler = &sctx->compiler; + shader->compiler_ctx_state.debug = sctx->debug; + shader->compiler_ctx_state.is_debug_context = sctx->is_debug; /* If this is a merged shader, get the first shader's selector. */ if (sscreen->info.chip_class >= GFX9) { @@ -2313,10 +2445,8 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. */ - if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + if (!is_pure_monolithic) { bool ok = true; /* Make sure the main shader part is present. This is needed @@ -2342,12 +2472,13 @@ current_not_ready: } simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, + &shader1_key); simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { - ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, key); } if (!ok) { @@ -2370,8 +2501,7 @@ current_not_ready: shader->is_monolithic = is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - /* The prim discard CS is always optimized. */ - shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ @@ -2391,7 +2521,8 @@ current_not_ready: } /* Use the default (unoptimized) shader for now. */ - memset(&key->opt, 0, sizeof(key->opt)); + key = use_local_key_copy(key, &local_key); + memset(&local_key.opt, 0, sizeof(key->opt)); simple_mtx_unlock(&sel->mutex); if (sscreen->options.sync_compile) @@ -2426,15 +2557,12 @@ current_not_ready: return shader->compilation_failed ? 
-1 : 0; } -static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, - union si_vgt_stages_key stages_key, - struct si_compiler_ctx_state *compiler_state) +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_key key; - si_shader_selector_key(ctx, state->cso, stages_key, &key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false); + si_shader_selector_key(ctx, state->cso, &state->key); + return si_shader_select_with_key(sctx, state, &state->key, -1, false); } static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, @@ -2477,7 +2605,7 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, boo * si_shader_selector initialization. Since it can be done asynchronously, * there is no way to report compile failures to applications. */ -static void si_init_shader_selector_async(void *job, int thread_index) +static void si_init_shader_selector_async(void *job, void *gdata, int thread_index) { struct si_shader_selector *sel = (struct si_shader_selector *)job; struct si_screen *sscreen = sel->screen; @@ -2492,6 +2620,19 @@ static void si_init_shader_selector_async(void *job, int thread_index) if (!compiler->passes) si_init_compiler(sscreen, compiler); + /* The GS copy shader is always pre-compiled. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + /* Serialize NIR to save memory. Monolithic shader variants * have to deserialize NIR before compilation. */ @@ -2576,14 +2717,16 @@ static void si_init_shader_selector_async(void *job, int thread_index) unsigned i; for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; + unsigned semantic = sel->info.output_semantic[i]; + unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic]; - if (offset <= AC_EXP_PARAM_OFFSET_31) + /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */ + if (G_028644_OFFSET(ps_input_cntl) != 0x20) continue; - unsigned semantic = sel->info.output_semantic[i]; unsigned id; + /* Remove the output from the mask. */ if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_POS && semantic != VARYING_SLOT_PSIZ && @@ -2596,19 +2739,6 @@ static void si_init_shader_selector_async(void *job, int thread_index) } } - /* The GS copy shader is always pre-compiled. */ - if (sel->info.stage == MESA_SHADER_GEOMETRY && - (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - /* Free NIR. We only keep serialized NIR after this point. 
*/ if (sel->nir) { ralloc_free(sel->nir); @@ -2724,18 +2854,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd ? sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && !sel->info.base.vs.blit_sgprs_amd; - sel->prim_discard_cs_allowed = - sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && !sel->info.base.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.base.vs.window_space_position && !sel->so.num_outputs; - if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_CTRL || sel->info.stage == MESA_SHADER_TESS_EVAL || @@ -2756,8 +2881,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_EDGE) { sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); - sel->outputs_written_before_ps |= 1ull - << si_shader_io_get_unique_index(semantic, true); + + /* Ignore outputs that are not passed from VS to PS. */ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } } } } @@ -2824,7 +2955,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, case MESA_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_inputs; i++) { - unsigned semantic = sel->info.input_semantic[i]; + unsigned semantic = sel->info.input[i].semantic; if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && semantic != VARYING_SLOT_PNTC) { @@ -2837,9 +2968,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->colors_written_4bit |= 0xf << (4 * i); for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic[i] == VARYING_SLOT_COL0) + if (sel->info.input[i].semantic == VARYING_SLOT_COL0) sel->color_attr_index[0] = i; - else if (sel->info.input_semantic[i] == VARYING_SLOT_COL1) + else if (sel->info.input[i].semantic == VARYING_SLOT_COL1) sel->color_attr_index[1] = i; } break; @@ -2868,25 +2999,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sscreen->info.chip_class == GFX10_3 || (sscreen->info.chip_class == GFX10 && sscreen->info.is_pro_graphics)) { - /* Rough estimates. 
*/ - switch (sctx->family) { - case CHIP_NAVI10: - case CHIP_NAVI12: - case CHIP_SIENNA_CICHLID: - sel->ngg_cull_vert_threshold = 511; - break; - case CHIP_NAVI14: - case CHIP_NAVY_FLOUNDER: - case CHIP_DIMGREY_CAVEFISH: - case CHIP_VANGOGH: - sel->ngg_cull_vert_threshold = 255; - break; - default: - assert(!sscreen->use_ngg_culling); - } + sel->ngg_cull_vert_threshold = 128; } } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { - if (sel->rast_prim == PIPE_PRIM_TRIANGLES && + if (sel->rast_prim != PIPE_PRIM_POINTS && (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) || sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS) || sscreen->info.chip_class == GFX10_3)) @@ -2894,10 +3010,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } } - /* PA_CL_VS_OUT_CNTL */ - if (sctx->chip_class <= GFX9) - sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false); - sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : u_bit_consecutive(0, sel->info.base.clip_distance_array_size); sel->culldist_mask = u_bit_consecutive(0, sel->info.base.cull_distance_array_size) << @@ -3005,11 +3117,10 @@ static void si_update_clip_regs(struct si_context *sctx, struct si_shader_select (!old_hw_vs || (old_hw_vs->info.stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) != (next_hw_vs->info.stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) || - old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant || !next_hw_vs_variant || - old_hw_vs_variant->key.opt.kill_clip_distances != next_hw_vs_variant->key.opt.kill_clip_distances)) + old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl)) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); } @@ -3053,9 +3164,10 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha si_shader_uses_bindless_images(sctx->shader.tcs.cso) || si_shader_uses_bindless_images(sctx->shader.tes.cso); - /* Invalidate inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << type); + if (type == PIPE_SHADER_VERTEX || type == PIPE_SHADER_TESS_EVAL || type == PIPE_SHADER_GEOMETRY) + sctx->ngg_culling = 0; /* this will be enabled on the first draw if needed */ + si_invalidate_inlinable_uniforms(sctx, type); sctx->do_update_shaders = true; } @@ -3073,6 +3185,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->shader.vs.current = sel ? sel->first_variant : NULL; sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; + sctx->fixed_func_tcs_shader.key.mono.u.ff_tcs_inputs_to_copy = sel ? sel->outputs_written : 0; if (si_update_ngg(sctx)) si_shader_change_notify(sctx); @@ -3084,6 +3197,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, si_get_vs(sctx)->current); si_update_rasterized_prim(sctx); + si_vs_key_update_inputs(sctx); } static void si_update_tess_uses_prim_id(struct si_context *sctx) @@ -3118,7 +3232,7 @@ bool si_update_ngg(struct si_context *sctx) * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring * pointers are set. 
*/ - if ((sctx->chip_class == GFX10 || sctx->family == CHIP_SIENNA_CICHLID) && !new_ngg) { + if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) { sctx->flags |= SI_CONTEXT_VGT_FLUSH; if (sctx->chip_class == GFX10) { /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */ @@ -3179,6 +3293,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->shader.tcs.cso = sel; sctx->shader.tcs.current = sel ? sel->first_variant : NULL; + sctx->shader.tcs.key.part.tcs.epilog.invoc0_tess_factors_are_def = + sel ? sel->info.tessfactors_are_def_in_all_invocs : 0; si_update_tess_uses_prim_id(sctx); si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); @@ -3203,6 +3319,14 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; si_update_tess_uses_prim_id(sctx); + sctx->shader.tcs.key.part.tcs.epilog.prim_mode = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.prim_mode = + sel ? sel->info.base.tess.primitive_mode : 0; + + sctx->shader.tcs.key.part.tcs.epilog.tes_reads_tess_factors = + sctx->fixed_func_tcs_shader.key.part.tcs.epilog.tes_reads_tess_factors = + sel ? sel->info.reads_tess_factors : 0; + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); si_select_draw_vbo(sctx); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ @@ -3219,6 +3343,41 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) si_update_rasterized_prim(sctx); } +void si_update_ps_kill_enable(struct si_context *sctx) +{ + if (!sctx->shader.ps.cso) + return; + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } +} + +void si_update_vrs_flat_shading(struct si_context *sctx) +{ + if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } +} + static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -3247,6 +3406,17 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } si_update_ps_colorbuf0_slot(sctx); + + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); + si_update_vrs_flat_shading(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) @@ -3257,55 +3427,55 @@ static void 
si_delete_shader(struct si_context *sctx, struct si_shader *shader) util_queue_fence_destroy(&shader->ready); - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. - */ - switch (shader->selector->info.stage) { - case MESA_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ls)); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_TESS_CTRL: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(hs)); - break; - case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(es)); - } else if (shader->key.as_ngg) { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - } else { - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - } - break; - case MESA_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(vs)); - else - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(gs)); - break; - case MESA_SHADER_FRAGMENT: - si_pm4_free_state(sctx, shader->pm4, SI_STATE_IDX(ps)); - break; - default:; + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. 
+ */ + int state_index = -1; + + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.as_ls) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(ls); + } else if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_TESS_CTRL: + state_index = SI_STATE_IDX(hs); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); } + break; + case MESA_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + state_index = SI_STATE_IDX(vs); + else + state_index = SI_STATE_IDX(gs); + break; + case MESA_SHADER_FRAGMENT: + state_index = SI_STATE_IDX(ps); + break; + default:; } si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); si_shader_destroy(shader); - free(shader); + si_pm4_free_state(sctx, &shader->pm4, state_index); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) @@ -3354,128 +3524,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) si_shader_selector_reference(sctx, &sel, NULL); } -static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, - unsigned semantic, enum glsl_interp_mode interpolate, - ubyte fp16_lo_hi_mask) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - unsigned offset, ps_input_cntl = 0; - - if (interpolate == INTERP_MODE_FLAT || - (interpolate == INTERP_MODE_COLOR && sctx->flatshade) || - semantic == VARYING_SLOT_PRIMITIVE_ID) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (semantic == VARYING_SLOT_PNTC || - (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 && - sctx->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - if (fp16_lo_hi_mask & 0x1) { - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_ATTR0_VALID(1); - } - } - - int vs_slot = vsinfo->output_semantic_to_slot[semantic]; - if (vs_slot >= 0) { - offset = vs->info.vs_output_param_offset[vs_slot]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - if (offset == AC_EXP_PARAM_UNDEFINED) { - /* This can happen with depth-only rendering. */ - offset = 0; - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - } - - ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); - } - - if (fp16_lo_hi_mask && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000); - - ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | - S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) | - S_028644_DEFAULT_VAL_ATTR1(0) | - S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ - S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2)); - } - } else { - /* VS output not found. */ - if (semantic == VARYING_SLOT_PRIMITIVE_ID) { - /* PrimID is written after the last output when HW VS is used. 
*/ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. GL is undefined */ - if (semantic == VARYING_SLOT_COL0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); - } - } - - return ps_input_cntl; -} - -static void si_emit_spi_map(struct si_context *sctx) -{ - struct si_shader *ps = sctx->shader.ps.current; - struct si_shader *vs; - struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - /* With legacy GS, only the GS copy shader contains information about param exports. */ - if (sctx->shader.gs.cso && !sctx->ngg) - vs = sctx->shader.gs.cso->gs_copy_shader; - else - vs = si_get_vs(sctx)->current; - - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - - for (i = 0; i < psinfo->num_inputs; i++) { - unsigned semantic = psinfo->input_semantic[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - ubyte fp16_lo_hi_mask = psinfo->input_fp16_lo_hi_valid[i]; - - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, interpolate, - fp16_lo_hi_mask); - } - - if (ps->key.part.ps.prolog.color_two_side) { - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - unsigned semantic = VARYING_SLOT_BFC0 + i; - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, semantic, - psinfo->color_interpolate[i], - false); - } - } - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - radeon_end_update_context_roll(sctx); -} - /** * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. */ @@ -3505,17 +3553,17 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) radeon_begin(cs); /* This is required before VGT_FLUSH. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); radeon_end(); } /* Initialize state related to ESGS / GSVS ring buffers */ -static bool si_update_gs_ring_buffers(struct si_context *sctx) +bool si_update_gs_ring_buffers(struct si_context *sctx) { struct si_shader_selector *es = sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; @@ -3610,11 +3658,11 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) /* Set the GS registers. 
*/ if (sctx->esgs_ring) { assert(sctx->chip_class <= GFX8); - radeon_set_uconfig_reg(cs, R_030900_VGT_ESGS_RING_SIZE, + radeon_set_uconfig_reg(R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); } if (sctx->gsvs_ring) { - radeon_set_uconfig_reg(cs, R_030904_VGT_GSVS_RING_SIZE, + radeon_set_uconfig_reg(R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); } radeon_end(); @@ -3718,11 +3766,6 @@ static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *s return 1; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) -{ - return shader ? shader->config.scratch_bytes_per_wave : 0; -} - static struct si_shader *si_get_tcs_current(struct si_context *sctx) { if (!sctx->shader.tes.cso) @@ -3745,19 +3788,19 @@ static bool si_update_scratch_relocs(struct si_context *sctx) if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4); + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); r = si_update_scratch_buffer(sctx, sctx->shader.gs.current); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); r = si_update_scratch_buffer(sctx, tcs); if (r < 0) return false; if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); + si_pm4_bind_state(sctx, hs, tcs); /* VS can be bound as LS, ES, or VS. */ r = si_update_scratch_buffer(sctx, sctx->shader.vs.current); @@ -3765,13 +3808,13 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.vs.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); else if (sctx->shader.vs.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); } /* TES can be bound as ES or VS. */ @@ -3780,17 +3823,17 @@ static bool si_update_scratch_relocs(struct si_context *sctx) return false; if (r == 1) { if (sctx->shader.tes.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); else if (sctx->shader.tes.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); else - si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4); + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); } return true; } -static bool si_update_spi_tmpring_size(struct si_context *sctx) +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. * There are 2 cases to handle: @@ -3805,17 +3848,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) * Otherwise, the number of waves that can use scratch is * SPI_TMPRING_SIZE.WAVES. 
*/ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.ps.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.gs.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.vs.current)); - - if (sctx->shader.tes.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->shader.tes.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; @@ -3834,7 +3866,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) if (!sctx->scratch_buffer) return false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b); } @@ -3855,7 +3886,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) return true; } -static void si_init_tess_factor_ring(struct si_context *sctx) +void si_init_tess_factor_ring(struct si_context *sctx) { assert(!sctx->tess_rings); assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); @@ -3893,17 +3924,17 @@ static void si_init_tess_factor_ring(struct si_context *sctx) /* Set tessellation registers. */ radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, + radeon_set_uconfig_reg(R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); - radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + radeon_set_uconfig_reg(R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); if (sctx->chip_class >= GFX10) { - radeon_set_uconfig_reg(cs, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI_UMD, S_030984_BASE_HI(factor_va >> 40)); } else if (sctx->chip_class == GFX9) { - radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI, + radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(factor_va >> 40)); } - radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, + radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->vgt_hs_offchip_param); radeon_end(); return; @@ -3955,8 +3986,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } -static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key) { struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); uint32_t stages = 0; @@ -3977,7 +4007,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, } if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) | S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough && @@ -3988,9 +4018,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, if (screen->info.chip_class >= GFX9) stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); - if (screen->info.chip_class >= GFX10 && - /* GS fast launch hangs with Wave64, so always use Wave32. 
*/
-       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
+   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);
@@ -4000,293 +4028,12 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
    return pm4;
 }
 
-static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key)
-{
-   struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
-
-   if (unlikely(!*pm4))
-      *pm4 = si_build_vgt_shader_config(sctx->screen, key);
-   si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
-}
-
-bool si_update_shaders(struct si_context *sctx)
-{
-   struct pipe_context *ctx = (struct pipe_context *)sctx;
-   struct si_compiler_ctx_state compiler_state;
-   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-   struct si_shader *old_vs = si_get_vs(sctx)->current;
-   unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0;
-   struct si_shader *old_ps = sctx->shader.ps.current;
-   union si_vgt_stages_key key;
-   unsigned old_spi_shader_col_format =
-      old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
-   int r;
-
-   if (!sctx->compiler.passes)
-      si_init_compiler(sctx->screen, &sctx->compiler);
-
-   compiler_state.compiler = &sctx->compiler;
-   compiler_state.debug = sctx->debug;
-   compiler_state.is_debug_context = sctx->is_debug;
-
-   key.index = 0;
-
-   if (sctx->shader.tes.cso)
-      key.u.tess = 1;
-   if (sctx->shader.gs.cso)
-      key.u.gs = 1;
-
-   if (sctx->ngg) {
-      key.u.ngg = 1;
-      key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
-   }
-
-   /* Update TCS and TES. */
-   if (sctx->shader.tes.cso) {
-      if (!sctx->tess_rings) {
-         si_init_tess_factor_ring(sctx);
-         if (!sctx->tess_rings)
-            return false;
-      }
-
-      if (sctx->shader.tcs.cso) {
-         r = si_shader_select(ctx, &sctx->shader.tcs, key, &compiler_state);
-         if (r)
-            return false;
-         si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current->pm4);
-      } else {
-         if (!sctx->fixed_func_tcs_shader.cso) {
-            sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx);
-            if (!sctx->fixed_func_tcs_shader.cso)
-               return false;
-         }
-
-         r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state);
-         if (r)
-            return false;
-         si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4);
-      }
-
-      if (!sctx->shader.gs.cso || sctx->chip_class <= GFX8) {
-         r = si_shader_select(ctx, &sctx->shader.tes, key, &compiler_state);
-         if (r)
-            return false;
-
-         if (sctx->shader.gs.cso) {
-            /* TES as ES */
-            assert(sctx->chip_class <= GFX8);
-            si_pm4_bind_state(sctx, es, sctx->shader.tes.current->pm4);
-         } else if (key.u.ngg) {
-            si_pm4_bind_state(sctx, gs, sctx->shader.tes.current->pm4);
-         } else {
-            si_pm4_bind_state(sctx, vs, sctx->shader.tes.current->pm4);
-         }
-      }
-   } else {
-      if (sctx->chip_class <= GFX8)
-         si_pm4_bind_state(sctx, ls, NULL);
-      si_pm4_bind_state(sctx, hs, NULL);
-   }
-
-   /* Update GS. */
-   if (sctx->shader.gs.cso) {
-      r = si_shader_select(ctx, &sctx->shader.gs, key, &compiler_state);
-      if (r)
-         return false;
-      si_pm4_bind_state(sctx, gs, sctx->shader.gs.current->pm4);
-      if (!key.u.ngg) {
-         si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader->pm4);
-
-         if (!si_update_gs_ring_buffers(sctx))
-            return false;
-      } else {
-         si_pm4_bind_state(sctx, vs, NULL);
-      }
-   } else {
-      if (!key.u.ngg) {
-         si_pm4_bind_state(sctx, gs, NULL);
-         if (sctx->chip_class <= GFX8)
-            si_pm4_bind_state(sctx, es, NULL);
-      }
-   }
-
-   /* Update VS. */
-   if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
-      r = si_shader_select(ctx, &sctx->shader.vs, key, &compiler_state);
-      if (r)
-         return false;
-
-      if (!key.u.tess && !key.u.gs) {
-         if (key.u.ngg) {
-            si_pm4_bind_state(sctx, gs, sctx->shader.vs.current->pm4);
-            si_pm4_bind_state(sctx, vs, NULL);
-         } else {
-            si_pm4_bind_state(sctx, vs, sctx->shader.vs.current->pm4);
-         }
-      } else if (sctx->shader.tes.cso) {
-         si_pm4_bind_state(sctx, ls, sctx->shader.vs.current->pm4);
-      } else {
-         assert(sctx->shader.gs.cso);
-         si_pm4_bind_state(sctx, es, sctx->shader.vs.current->pm4);
-      }
-   }
-
-   /* This must be done after the shader variant is selected. */
-   if (sctx->ngg) {
-      struct si_shader *vs = si_get_vs(sctx)->current;
-
-      key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
-      key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
-   }
-
-   sctx->vs_uses_base_instance =
-      sctx->shader.vs.current ? sctx->shader.vs.current->uses_base_instance :
-      sctx->queued.named.hs ? sctx->queued.named.hs->shader->uses_base_instance :
-      sctx->shader.gs.current->uses_base_instance;
-
-   si_update_vgt_shader_config(sctx, key);
-
-   if (old_kill_clip_distances != si_get_vs(sctx)->current->key.opt.kill_clip_distances)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
-   if (sctx->shader.ps.cso) {
-      unsigned db_shader_control;
-
-      r = si_shader_select(ctx, &sctx->shader.ps, key, &compiler_state);
-      if (r)
-         return false;
-      si_pm4_bind_state(sctx, ps, sctx->shader.ps.current->pm4);
-
-      db_shader_control = sctx->shader.ps.cso->db_shader_control |
-                          S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
-
-      if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
-          (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
-          sctx->sprite_coord_enable != rs->sprite_coord_enable ||
-          sctx->flatshade != rs->flatshade) {
-         sctx->sprite_coord_enable = rs->sprite_coord_enable;
-         sctx->flatshade = rs->flatshade;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
-      }
-
-      if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) &&
-          (!old_ps || old_spi_shader_col_format !=
-                         sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format))
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
-      if (sctx->ps_db_shader_control != db_shader_control) {
-         sctx->ps_db_shader_control = db_shader_control;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-         if (sctx->screen->dpbb_allowed)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-      }
-
-      if (sctx->smoothing_enabled !=
-          sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing) {
-         sctx->smoothing_enabled = sctx->shader.ps.current->key.part.ps.epilog.poly_line_smoothing;
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
-         /* NGG cull state uses smoothing_enabled. */
-         if (sctx->screen->use_ngg_culling)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
-
-         if (sctx->chip_class == GFX6)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-         if (sctx->framebuffer.nr_samples <= 1)
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
-      }
-
-      if (sctx->chip_class >= GFX10_3) {
-         struct si_shader_info *info = &sctx->shader.ps.cso->info;
-         bool allow_flat_shading = info->allow_flat_shading;
-
-         if (allow_flat_shading &&
-             (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable ||
-              (!rs->flatshade && info->uses_interp_color)))
-            allow_flat_shading = false;
-
-         if (sctx->allow_flat_shading != allow_flat_shading) {
-            sctx->allow_flat_shading = allow_flat_shading;
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-         }
-      }
-   }
-
-   if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
-      /* Pretend the bound shaders form a vk pipeline */
-      uint32_t pipeline_code_hash = 0;
-      uint64_t base_address = ~0;
-
-      for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
-         struct si_shader *shader = sctx->shaders[i].current;
-         if (sctx->shaders[i].cso && shader) {
-            pipeline_code_hash = _mesa_hash_data_with_seed(
-               shader->binary.elf_buffer,
-               shader->binary.elf_size,
-               pipeline_code_hash);
-            base_address = MIN2(base_address,
-                                shader->bo->gpu_address);
-         }
-      }
-
-      struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-      if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
-         si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
-      }
-
-      si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
-   }
-
-   if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) ||
-       si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) ||
-       si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) {
-      if (!si_update_spi_tmpring_size(sctx))
-         return false;
-   }
-
-   if (sctx->chip_class >= GFX7) {
-      if (si_pm4_state_enabled_and_changed(sctx, ls))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
-      else if (!sctx->queued.named.ls)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, hs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
-      else if (!sctx->queued.named.hs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, es))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
-      else if (!sctx->queued.named.es)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
-
-      if (si_pm4_state_enabled_and_changed(sctx, gs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
-      else if (!sctx->queued.named.gs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, vs))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
-      else if (!sctx->queued.named.vs)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
-
-      if (si_pm4_state_enabled_and_changed(sctx, ps))
-         sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-      else if (!sctx->queued.named.ps)
-         sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
-   }
-
-   sctx->do_update_shaders = false;
-   return true;
-}
-
 static void si_emit_scratch_state(struct si_context *sctx)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
 
    radeon_begin(cs);
-   radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
+   radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
    radeon_end();
 
    if (sctx->scratch_buffer) {
@@ -4303,7 +4050,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen)
 
 void si_init_shader_functions(struct si_context *sctx)
 {
-   sctx->atoms.s.spi_map.emit = si_emit_spi_map;
    sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
 
    sctx->b.create_vs_state = si_create_shader;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
index b6656fdc8..e70987d66 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_uvd.c
@@ -46,7 +46,8 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
    int modifiers_count = 0;
    uint64_t mod = DRM_FORMAT_MOD_LINEAR;
 
-   /* TODO: get tiling working */
+   /* To get tiled buffers, users need to explicitly provide a list of
+    * modifiers. */
    vidbuf.bind |= PIPE_BIND_LINEAR;
 
    if (pipe->screen->resource_create_with_modifiers) {
@@ -58,6 +59,33 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                              modifiers_count);
 }
 
+struct pipe_video_buffer *si_video_buffer_create_with_modifiers(struct pipe_context *pipe,
+                                                                const struct pipe_video_buffer *tmpl,
+                                                                const uint64_t *modifiers,
+                                                                unsigned int modifiers_count)
+{
+   uint64_t *allowed_modifiers;
+   unsigned int allowed_modifiers_count, i;
+
+   /* Filter out DCC modifiers, because we don't support them for video
+    * for now. */
+   allowed_modifiers = calloc(modifiers_count, sizeof(uint64_t));
+   if (!allowed_modifiers)
+      return NULL;
+
+   allowed_modifiers_count = 0;
+   for (i = 0; i < modifiers_count; i++) {
+      if (ac_modifier_has_dcc(modifiers[i]))
+         continue;
+      allowed_modifiers[allowed_modifiers_count++] = modifiers[i];
+   }
+
+   struct pipe_video_buffer *buf =
+      vl_video_buffer_create_as_resource(pipe, tmpl, allowed_modifiers, allowed_modifiers_count);
+   free(allowed_modifiers);
+   return buf;
+}
+
 /* set the decoding target buffer offsets */
 static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {