summaryrefslogtreecommitdiff
path: root/lib/mesa/src/gallium
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/src/gallium')
-rw-r--r--lib/mesa/src/gallium/Android.common.mk1
-rw-r--r--lib/mesa/src/gallium/Android.mk3
-rw-r--r--lib/mesa/src/gallium/auxiliary/Android.mk50
-rw-r--r--lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk1
-rw-r--r--lib/mesa/src/gallium/drivers/etnaviv/Android.mk5
-rw-r--r--lib/mesa/src/gallium/drivers/freedreno/Android.mk27
-rw-r--r--lib/mesa/src/gallium/drivers/iris/Android.mk62
-rw-r--r--lib/mesa/src/gallium/drivers/iris/Makefile.sources10
-rw-r--r--lib/mesa/src/gallium/drivers/kmsro/Android.mk10
-rw-r--r--lib/mesa/src/gallium/drivers/lima/Android.mk18
-rw-r--r--lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c253
-rw-r--r--lib/mesa/src/gallium/drivers/r600/Android.mk15
-rw-r--r--lib/mesa/src/gallium/drivers/radeonsi/Android.mk42
-rw-r--r--lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c2684
-rw-r--r--lib/mesa/src/gallium/drivers/virgl/Android.mk17
-rw-r--r--lib/mesa/src/gallium/targets/dri/Android.mk18
-rw-r--r--lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk4
-rw-r--r--lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk2
-rw-r--r--lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk3
-rw-r--r--lib/mesa/src/gallium/winsys/virgl/drm/Android.mk2
-rw-r--r--lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk2
21 files changed, 1636 insertions, 1593 deletions
diff --git a/lib/mesa/src/gallium/Android.common.mk b/lib/mesa/src/gallium/Android.common.mk
index 3f7779892..0d55f04ac 100644
--- a/lib/mesa/src/gallium/Android.common.mk
+++ b/lib/mesa/src/gallium/Android.common.mk
@@ -28,7 +28,6 @@ LOCAL_C_INCLUDES += \
$(GALLIUM_TOP)/auxiliary \
$(GALLIUM_TOP)/winsys \
$(GALLIUM_TOP)/drivers \
- $(MESA_TOP)/src/etnaviv \
$(MESA_TOP)/src/freedreno \
$(MESA_TOP)/src/freedreno/ir3 \
$(MESA_TOP)/src/freedreno/registers
diff --git a/lib/mesa/src/gallium/Android.mk b/lib/mesa/src/gallium/Android.mk
index 78e821581..37e923c22 100644
--- a/lib/mesa/src/gallium/Android.mk
+++ b/lib/mesa/src/gallium/Android.mk
@@ -46,10 +46,9 @@ SUBDIRS += winsys/vc4/drm drivers/vc4
SUBDIRS += winsys/virgl/common winsys/virgl/drm winsys/virgl/vtest drivers/virgl
SUBDIRS += winsys/svga/drm drivers/svga
SUBDIRS += winsys/etnaviv/drm drivers/etnaviv drivers/renderonly
-SUBDIRS += frontends/dri
+SUBDIRS += state_trackers/dri
SUBDIRS += winsys/iris/drm drivers/iris
SUBDIRS += winsys/lima/drm drivers/lima
-SUBDIRS += winsys/panfrost/drm drivers/panfrost
# sort to eliminate any duplicates
INC_DIRS := $(call all-named-subdir-makefiles,$(sort $(SUBDIRS)))
diff --git a/lib/mesa/src/gallium/auxiliary/Android.mk b/lib/mesa/src/gallium/auxiliary/Android.mk
index f668e5237..a2d5fa60d 100644
--- a/lib/mesa/src/gallium/auxiliary/Android.mk
+++ b/lib/mesa/src/gallium/auxiliary/Android.mk
@@ -28,17 +28,14 @@ include $(LOCAL_PATH)/Makefile.sources
include $(CLEAR_VARS)
-# filter-out tessellator/tessellator.hpp to avoid "Unused source files" error
LOCAL_SRC_FILES := \
- $(filter-out tessellator/tessellator.hpp, $(C_SOURCES)) \
+ $(C_SOURCES) \
$(NIR_SOURCES) \
$(RENDERONLY_SOURCES) \
$(VL_STUB_SOURCES)
ifeq ($(USE_LIBBACKTRACE),true)
- LOCAL_CFLAGS += -DHAVE_ANDROID_PLATFORM
- LOCAL_SHARED_LIBRARIES += libbacktrace
- LOCAL_SRC_FILES += ../../util/u_debug_stack_android.cpp
+ LOCAL_SRC_FILES += util/u_debug_stack_android.cpp
endif
LOCAL_C_INCLUDES := \
@@ -55,7 +52,6 @@ LOCAL_CPPFLAGS += -std=c++14
# We need libmesa_nir to get NIR's generated include directories.
LOCAL_MODULE := libmesa_gallium
-LOCAL_SHARED_LIBRARIES += libsync
LOCAL_STATIC_LIBRARIES += libmesa_nir
LOCAL_WHOLE_STATIC_LIBRARIES += cpufeatures
@@ -66,44 +62,18 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
intermediates := $(call local-generated-sources-dir)
LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, $(GENERATED_SOURCES))
-u_indices_gen_deps := \
- $(MESA_TOP)/src/gallium/auxiliary/indices/u_indices_gen.py
+$(LOCAL_GENERATED_SOURCES): PRIVATE_PYTHON := $(MESA_PYTHON2)
+$(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@
-$(intermediates)/indices/u_indices_gen.c: $(u_indices_gen_deps)
- @mkdir -p $(dir $@)
- $(hide) $(MESA_PYTHON3) $< > $@
+$(intermediates)/indices/u_indices_gen.c \
+$(intermediates)/indices/u_unfilled_gen.c \
+$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py
+ $(transform-generated-source)
-u_unfilled_gen_deps := \
- $(MESA_TOP)/src/gallium/auxiliary/indices/u_unfilled_gen.py
-
-$(intermediates)/indices/u_unfilled_gen.c: $(u_unfilled_gen_deps)
- @mkdir -p $(dir $@)
- $(hide) $(MESA_PYTHON3) $< > $@
-
-u_tracepoints_deps := \
- $(MESA_TOP)/src/gallium/auxiliary/util/u_tracepoints.py \
- $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py
-
-u_tracepoints_c := $(intermediates)/util/u_tracepoints.c
-u_tracepoints_h := $(intermediates)/util/u_tracepoints.h
-
-$(intermediates)/util/u_tracepoints.c \
-$(intermediates)/util/u_tracepoints.h: $(u_tracepoints_deps)
- @mkdir -p $(dir $@)
- $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(u_tracepoints_c) -H $(u_tracepoints_h)
+$(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv
+ $(transform-generated-source)
LOCAL_GENERATED_SOURCES += $(MESA_GEN_NIR_H)
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
-
-# Build libmesa_galliumvl used by radeonsi
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := \
- $(VL_SOURCES)
-
-LOCAL_MODULE := libmesa_galliumvl
-
-include $(GALLIUM_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk
index de07a03ce..075bf8af4 100644
--- a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk
+++ b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk
@@ -31,6 +31,7 @@ include $(CLEAR_VARS)
LOCAL_CFLAGS := \
-DHAVE_PIPE_LOADER_DRI \
-DHAVE_PIPE_LOADER_KMS \
+ -DDROP_PIPE_LOADER_MISC \
-DGALLIUM_STATIC_TARGETS
LOCAL_SRC_FILES := \
diff --git a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk
index 3ba6b819f..6976d223c 100644
--- a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk
+++ b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk
@@ -28,10 +28,7 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := \
$(C_SOURCES)
-LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
-
-LOCAL_SHARED_LIBRARIES := libdrm
-LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm
+LOCAL_SHARED_LIBRARIES := libdrm_etnaviv
LOCAL_MODULE := libmesa_pipe_etnaviv
include $(GALLIUM_COMMON_MK)
diff --git a/lib/mesa/src/gallium/drivers/freedreno/Android.mk b/lib/mesa/src/gallium/drivers/freedreno/Android.mk
index 86db01a59..f0b29b116 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/Android.mk
+++ b/lib/mesa/src/gallium/drivers/freedreno/Android.mk
@@ -39,34 +39,15 @@ LOCAL_SRC_FILES := \
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/ir3 \
- $(MESA_TOP)/include \
- $(MESA_TOP)/src/freedreno/common \
- $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_gallium,,)/util
+ $(MESA_TOP)/include
LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
-LOCAL_SHARED_LIBRARIES := libdrm libsync
-LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_perfcntrs libfreedreno_registers
+LOCAL_SHARED_LIBRARIES := libdrm
+LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_registers
LOCAL_MODULE := libmesa_pipe_freedreno
-LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-
-intermediates := $(call local-generated-sources-dir)
-
-LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(GENERATED_SOURCES))
-
-freedreno_tracepoints_deps := \
- $(MESA_TOP)/src/gallium/drivers/freedreno/freedreno_tracepoints.py \
- $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py
-
-freedreno_tracepoints_c := $(intermediates)/freedreno_tracepoints.c
-freedreno_tracepoints_h := $(intermediates)/freedreno_tracepoints.h
-
-$(intermediates)/freedreno_tracepoints.c \
-$(intermediates)/freedreno_tracepoints.h: $(freedreno_tracepoints_deps)
- @mkdir -p $(dir $@)
- $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(freedreno_tracepoints_c) -H $(freedreno_tracepoints_h)
-
+include $(LOCAL_PATH)/Android.gen.mk
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/gallium/drivers/iris/Android.mk b/lib/mesa/src/gallium/drivers/iris/Android.mk
index 5d5744025..71ec0cf58 100644
--- a/lib/mesa/src/gallium/drivers/iris/Android.mk
+++ b/lib/mesa/src/gallium/drivers/iris/Android.mk
@@ -42,15 +42,15 @@ IRIS_COMMON_INCLUDES := \
$(MESA_TOP)/src/gallium/auxiliary
#
-# libiris for gfx8
+# libiris for gen8
#
include $(CLEAR_VARS)
-LOCAL_MODULE := libmesa_iris_gfx8
+LOCAL_MODULE := libmesa_iris_gen8
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES)
-LOCAL_CFLAGS := -DGFX_VERx10=80
+LOCAL_CFLAGS := -DGEN_VERSIONx10=80
LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES)
@@ -62,15 +62,15 @@ include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
#
-# libiris for gfx9
+# libiris for gen9
#
include $(CLEAR_VARS)
-LOCAL_MODULE := libmesa_iris_gfx9
+LOCAL_MODULE := libmesa_iris_gen9
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES)
-LOCAL_CFLAGS := -DGFX_VERx10=90
+LOCAL_CFLAGS := -DGEN_VERSIONx10=90
LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES)
@@ -82,15 +82,15 @@ include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
#
-# libiris for gfx11
+# libiris for gen10
#
include $(CLEAR_VARS)
-LOCAL_MODULE := libmesa_iris_gfx11
+LOCAL_MODULE := libmesa_iris_gen10
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES)
-LOCAL_CFLAGS := -DGFX_VERx10=110
+LOCAL_CFLAGS := -DGEN_VERSIONx10=100
LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES)
@@ -102,15 +102,15 @@ include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
#
-# libiris for gfx12
+# libiris for gen11
#
include $(CLEAR_VARS)
-LOCAL_MODULE := libmesa_iris_gfx12
+LOCAL_MODULE := libmesa_iris_gen11
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES)
-LOCAL_CFLAGS := -DGFX_VERx10=120
+LOCAL_CFLAGS := -DGEN_VERSIONx10=110
LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES)
@@ -121,30 +121,29 @@ LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
-#
-# libiris for gfx125
-#
+###########################################################
include $(CLEAR_VARS)
-LOCAL_MODULE := libmesa_iris_gfx125
+
+LOCAL_MODULE := libmesa_pipe_iris
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES)
-LOCAL_CFLAGS := -DGFX_VERx10=125
+intermediates := $(call local-generated-sources-dir)
-LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES)
+LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/iris/,$(GENERATED_SOURCES))
-LOCAL_STATIC_LIBRARIES := $(LIBIRIS_STATIC_LIBS)
+GEN_DRIINFO_INPUTS := \
+ $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \
+ $(LOCAL_PATH)/driinfo_iris.h
-LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
+MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py
-include $(MESA_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)
+$(intermediates)/iris/iris_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS)
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false)
-###########################################################
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libmesa_pipe_iris
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
LOCAL_SRC_FILES := \
$(IRIS_C_SOURCES)
@@ -167,11 +166,10 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_intel_common \
libmesa_intel_compiler \
libmesa_intel_perf \
- libmesa_iris_gfx8 \
- libmesa_iris_gfx9 \
- libmesa_iris_gfx11 \
- libmesa_iris_gfx12 \
- libmesa_iris_gfx125
+ libmesa_iris_gen8 \
+ libmesa_iris_gen9 \
+ libmesa_iris_gen10 \
+ libmesa_iris_gen11
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/gallium/drivers/iris/Makefile.sources b/lib/mesa/src/gallium/drivers/iris/Makefile.sources
index c727bce86..bc8f592d3 100644
--- a/lib/mesa/src/gallium/drivers/iris/Makefile.sources
+++ b/lib/mesa/src/gallium/drivers/iris/Makefile.sources
@@ -20,7 +20,11 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
+GENERATED_SOURCES := \
+ iris_driinfo.h
+
IRIS_C_SOURCES = \
+ $(GENERATED_SOURCES) \
driinfo_iris.h \
iris_batch.c \
iris_batch.h \
@@ -37,16 +41,10 @@ IRIS_C_SOURCES = \
iris_draw.c \
iris_fence.c \
iris_fence.h \
- iris_fine_fence.c \
- iris_fine_fence.h \
iris_formats.c \
iris_genx_macros.h \
iris_genx_protos.h \
- iris_measure.c \
- iris_measure.h \
iris_monitor.c \
- iris_performance_query.c \
- iris_perf.c \
iris_pipe.h \
iris_pipe_control.c \
iris_program.c \
diff --git a/lib/mesa/src/gallium/drivers/kmsro/Android.mk b/lib/mesa/src/gallium/drivers/kmsro/Android.mk
index e0e26482b..2f637b8bf 100644
--- a/lib/mesa/src/gallium/drivers/kmsro/Android.mk
+++ b/lib/mesa/src/gallium/drivers/kmsro/Android.mk
@@ -39,20 +39,14 @@ GALLIUM_TARGET_DRIVERS += exynos
GALLIUM_TARGET_DRIVERS += hx8357d
GALLIUM_TARGET_DRIVERS += ili9225
GALLIUM_TARGET_DRIVERS += ili9341
-GALLIUM_TARGET_DRIVERS += imx-drm
-GALLIUM_TARGET_DRIVERS += imx-dcss
-GALLIUM_TARGET_DRIVERS += ingenic-drm
-GALLIUM_TARGET_DRIVERS += mcde
-GALLIUM_TARGET_DRIVERS += mediatek
-GALLIUM_TARGET_DRIVERS += meson
+GALLIUM_TARGET_DRIVERS += imx
+GALLIUM_TARGET_DRIVERS += stm
GALLIUM_TARGET_DRIVERS += mi0283qt
GALLIUM_TARGET_DRIVERS += mxsfb-drm
GALLIUM_TARGET_DRIVERS += pl111
GALLIUM_TARGET_DRIVERS += repaper
-GALLIUM_TARGET_DRIVERS += rockchip
GALLIUM_TARGET_DRIVERS += st7586
GALLIUM_TARGET_DRIVERS += st7735r
-GALLIUM_TARGET_DRIVERS += stm
GALLIUM_TARGET_DRIVERS += sun4i-drm
$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_kmsro)
endif
diff --git a/lib/mesa/src/gallium/drivers/lima/Android.mk b/lib/mesa/src/gallium/drivers/lima/Android.mk
index 09487d9dc..069ecc4b2 100644
--- a/lib/mesa/src/gallium/drivers/lima/Android.mk
+++ b/lib/mesa/src/gallium/drivers/lima/Android.mk
@@ -31,15 +31,11 @@ LOCAL_SRC_FILES := \
ir/gp/lower.c \
ir/gp/nir.c \
ir/gp/node.c \
- ir/gp/optimize.c \
ir/gp/regalloc.c \
ir/gp/reduce_scheduler.c \
ir/gp/scheduler.c \
ir/lima_ir.h \
- ir/lima_nir_duplicate_consts.c \
- ir/lima_nir_duplicate_intrinsic.c \
ir/lima_nir_lower_uniform_to_scalar.c \
- ir/lima_nir_split_load_input.c \
ir/pp/codegen.c \
ir/pp/codegen.h \
ir/pp/disasm.c \
@@ -50,19 +46,14 @@ LOCAL_SRC_FILES := \
ir/pp/node_to_instr.c \
ir/pp/ppir.h \
ir/pp/regalloc.c \
- ir/pp/liveness.c \
ir/pp/scheduler.c \
lima_bo.c \
lima_bo.h \
lima_context.c \
lima_context.h \
- lima_disk_cache.c \
- lima_disk_cache.h \
lima_draw.c \
lima_fence.c \
lima_fence.h \
- lima_parser.c \
- lima_parser.h \
lima_program.c \
lima_program.h \
lima_query.c \
@@ -71,15 +62,12 @@ LOCAL_SRC_FILES := \
lima_screen.c \
lima_screen.h \
lima_state.c \
- lima_job.c \
- lima_job.h \
+ lima_submit.c \
+ lima_submit.h \
lima_texture.c \
lima_texture.h \
lima_util.c \
- lima_util.h \
- lima_format.c \
- lima_format.h \
- lima_gpu.h
+ lima_util.h
LOCAL_MODULE := libmesa_pipe_lima
diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c
index 157c23491..43121335f 100644
--- a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c
+++ b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c
@@ -27,11 +27,8 @@
#include <stdio.h>
#include "util/u_memory.h"
-#include "gallium/auxiliary/util/u_blend.h"
-#include "pan_context.h"
-#include "pan_blend_cso.h"
-#include "pan_bo.h"
-#include "panfrost-quirks.h"
+#include "pan_blend_shaders.h"
+#include "pan_blending.h"
/* A given Gallium blend state can be encoded to the hardware in numerous,
* dramatically divergent ways due to the interactions of blending with
@@ -60,6 +57,41 @@
* (our subclass of pipe_blend_state).
*/
+/* Given an initialized CSO and a particular framebuffer format, grab a
+ * blend shader, generating and compiling it if it doesn't exist
+ * (lazy-loading in a way). This routine, when the cache hits, should
+ * be fast, suitable for calling every draw to avoid wacky dirty
+ * tracking paths. If the cache hits, boom, done. */
+
+static struct panfrost_blend_shader *
+panfrost_get_blend_shader(
+ struct panfrost_context *ctx,
+ struct panfrost_blend_state *blend,
+ enum pipe_format fmt,
+ unsigned rt)
+{
+ /* Prevent NULL collision issues.. */
+ assert(fmt != 0);
+
+ /* Check the cache */
+ struct hash_table_u64 *shaders = blend->rt[rt].shaders;
+
+ struct panfrost_blend_shader *shader =
+ _mesa_hash_table_u64_search(shaders, fmt);
+
+ if (shader)
+ return shader;
+
+ /* Cache miss. Build one instead, cache it, and go */
+
+ struct panfrost_blend_shader generated =
+ panfrost_compile_blend_shader(ctx, &blend->base, fmt);
+
+ shader = mem_dup(&generated, sizeof(generated));
+ _mesa_hash_table_u64_insert(shaders, fmt, shader);
+ return shader;
+}
+
/* Create a blend CSO. Essentially, try to compile a fixed-function
* expression and initialize blend shaders */
@@ -71,34 +103,33 @@ panfrost_create_blend_state(struct pipe_context *pipe,
struct panfrost_blend_state *so = rzalloc(ctx, struct panfrost_blend_state);
so->base = *blend;
- so->pan.dither = blend->dither;
- so->pan.logicop_enable = blend->logicop_enable;
- so->pan.logicop_func = blend->logicop_func;
- so->pan.rt_count = blend->max_rt + 1;
-
/* TODO: The following features are not yet implemented */
+ assert(!blend->logicop_enable);
+ assert(!blend->alpha_to_coverage);
assert(!blend->alpha_to_one);
- for (unsigned c = 0; c < so->pan.rt_count; ++c) {
- unsigned g = blend->independent_blend_enable ? c : 0;
- const struct pipe_rt_blend_state *pipe = &blend->rt[g];
- struct pan_blend_equation *equation = &so->pan.rts[c].equation;
-
- equation->color_mask = pipe->colormask;
- equation->blend_enable = pipe->blend_enable;
- if (!equation->blend_enable)
- continue;
-
- equation->rgb_func = util_blend_func_to_shader(pipe->rgb_func);
- equation->rgb_src_factor = util_blend_factor_to_shader(pipe->rgb_src_factor);
- equation->rgb_invert_src_factor = util_blend_factor_is_inverted(pipe->rgb_src_factor);
- equation->rgb_dst_factor = util_blend_factor_to_shader(pipe->rgb_dst_factor);
- equation->rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe->rgb_dst_factor);
- equation->alpha_func = util_blend_func_to_shader(pipe->alpha_func);
- equation->alpha_src_factor = util_blend_factor_to_shader(pipe->alpha_src_factor);
- equation->alpha_invert_src_factor = util_blend_factor_is_inverted(pipe->alpha_src_factor);
- equation->alpha_dst_factor = util_blend_factor_to_shader(pipe->alpha_dst_factor);
- equation->alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe->alpha_dst_factor);
+ for (unsigned c = 0; c < PIPE_MAX_COLOR_BUFS; ++c) {
+ struct panfrost_blend_rt *rt = &so->rt[c];
+
+ /* There are two paths. First, we would like to try a
+ * fixed-function if we can */
+
+ /* Without indep blending, the first RT settings replicate */
+
+ unsigned g =
+ blend->independent_blend_enable ? c : 0;
+
+ rt->has_fixed_function =
+ panfrost_make_fixed_blend_mode(
+ &blend->rt[g],
+ &rt->equation,
+ &rt->constant_mask,
+ blend->rt[g].colormask);
+
+ /* Regardless of whether that works, we also need to initialize
+ * the blend shaders */
+
+ rt->shaders = _mesa_hash_table_u64_create(so);
}
return so;
@@ -109,7 +140,28 @@ panfrost_bind_blend_state(struct pipe_context *pipe,
void *cso)
{
struct panfrost_context *ctx = pan_context(pipe);
- ctx->blend = (struct panfrost_blend_state *) cso;
+ struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+ struct pipe_blend_state *blend = (struct pipe_blend_state *) cso;
+ struct panfrost_blend_state *pblend = (struct panfrost_blend_state *) cso;
+ ctx->blend = pblend;
+
+ if (!blend)
+ return;
+
+ if (screen->require_sfbd) {
+ SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_DITHER, !blend->dither);
+ }
+
+ /* Shader itself is not dirty, but the shader core is */
+ ctx->dirty |= PAN_DIRTY_FS;
+}
+
+static void
+panfrost_delete_blend_shader(struct hash_entry *entry)
+{
+ struct panfrost_blend_shader *shader = (struct panfrost_blend_shader *)entry->data;
+ free(shader->buffer);
+ free(shader);
}
static void
@@ -117,6 +169,11 @@ panfrost_delete_blend_state(struct pipe_context *pipe,
void *cso)
{
struct panfrost_blend_state *blend = (struct panfrost_blend_state *) cso;
+
+ for (unsigned c = 0; c < 4; ++c) {
+ struct panfrost_blend_rt *rt = &blend->rt[c];
+ _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader);
+ }
ralloc_free(blend);
}
@@ -130,73 +187,105 @@ panfrost_set_blend_color(struct pipe_context *pipe,
ctx->blend_color = *blend_color;
}
+/* Given a vec4 of constants, reduce it to just a single constant according to
+ * the mask (if we can) */
+
+static bool
+panfrost_blend_constant(float *out, float *in, unsigned mask)
+{
+ /* If no components are used, it automatically works. Do set a
+ * dummy constant just to avoid reading uninitialized memory. */
+
+ if (!mask) {
+ *out = 0.0;
+ return true;
+ }
+
+ /* Find some starter mask */
+ unsigned first = ffs(mask) - 1;
+ float cons = in[first];
+ mask ^= (1 << first);
+
+ /* Ensure the rest are equal */
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+
+ if (in[i] != cons) {
+ *out = 0.0;
+ return false;
+ }
+ }
+
+ /* Otherwise, we're good to go */
+ *out = cons;
+ return true;
+}
+
/* Create a final blend given the context */
struct panfrost_blend_final
-panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset)
+panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
{
- struct panfrost_device *dev = pan_device(ctx->base.screen);
- struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+ struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+ struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
+ /* Grab the format, falling back gracefully if called invalidly (which
+ * has to happen for no-color-attachment FBOs, for instance) */
struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
- enum pipe_format fmt = fb->cbufs[rti]->format;
- unsigned nr_samples = fb->cbufs[rti]->nr_samples ? :
- fb->cbufs[rti]->texture->nr_samples;
+ enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM;
+
+ if ((fb->nr_cbufs > rti) && fb->cbufs[rti])
+ fmt = fb->cbufs[rti]->format;
/* Grab the blend state */
struct panfrost_blend_state *blend = ctx->blend;
- struct pan_blend_state pan_blend = blend->pan;
-
- pan_blend.rts[rti].format = fmt;
- pan_blend.rts[rti].nr_samples = nr_samples;
- memcpy(pan_blend.constants, ctx->blend_color.color,
- sizeof(pan_blend.constants));
-
- /* First, we'll try fixed function, matching equation and constant */
- if (pan_blend_can_fixed_function(dev, &pan_blend, rti)) {
- struct panfrost_blend_final final = {
- .load_dest = pan_blend_reads_dest(pan_blend.rts[rti].equation),
- .equation.constant = pan_blend_get_constant(dev, &pan_blend, rti),
- .opaque = pan_blend_is_opaque(pan_blend.rts[rti].equation),
- .no_colour = pan_blend.rts[rti].equation.color_mask == 0,
- };
-
- pan_blend_to_fixed_function_equation(dev, &pan_blend, rti,
- &final.equation.equation);
- return final;
- }
+ assert(blend);
+ struct panfrost_blend_rt *rt = &blend->rt[rti];
- /* Otherwise, we need to grab a shader */
- /* Upload the shader, sharing a BO */
- if (!(*bo)) {
- *bo = panfrost_batch_create_bo(batch, 4096,
- PAN_BO_EXECUTE,
- PAN_BO_ACCESS_PRIVATE |
- PAN_BO_ACCESS_READ |
- PAN_BO_ACCESS_FRAGMENT);
+ struct panfrost_blend_final final;
+
+ /* First, we'll try a fixed function path */
+ if (rt->has_fixed_function && panfrost_can_fixed_blend(fmt)) {
+ if (panfrost_blend_constant(
+ &final.equation.constant,
+ ctx->blend_color.color,
+ rt->constant_mask)) {
+ /* There's an equation and suitable constant, so we're good to go */
+ final.is_shader = false;
+ final.equation.equation = &rt->equation;
+
+ final.no_blending =
+ (rt->equation.rgb_mode == 0x122) &&
+ (rt->equation.alpha_mode == 0x122) &&
+ (rt->equation.color_mask == 0xf);
+
+ return final;
+ }
}
- pthread_mutex_lock(&dev->blend_shaders.lock);
- struct pan_blend_shader_variant *shader =
- pan_blend_get_shader_locked(dev, &pan_blend, rti);
+ /* Otherwise, we need to grab a shader */
+ struct panfrost_blend_shader *shader = panfrost_get_blend_shader(ctx, blend, fmt, rti);
+ final.is_shader = true;
+ final.no_blending = false;
+ final.shader.work_count = shader->work_count;
+ final.shader.first_tag = shader->first_tag;
- /* Size check */
- assert((*shader_offset + shader->binary.size) < 4096);
+ /* Upload the shader */
+ final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE);
+ memcpy(final.shader.bo->cpu, shader->buffer, shader->size);
- memcpy((*bo)->ptr.cpu + *shader_offset, shader->binary.data, shader->binary.size);
+ /* Pass BO ownership to job */
+ panfrost_job_add_bo(job, final.shader.bo);
+ panfrost_bo_unreference(ctx->base.screen, final.shader.bo);
- struct panfrost_blend_final final = {
- .is_shader = true,
- .shader = {
- .first_tag = shader->first_tag,
- .gpu = (*bo)->ptr.gpu + *shader_offset,
- },
- .load_dest = pan_blend.logicop_enable ||
- pan_blend_reads_dest(pan_blend.rts[rti].equation),
- };
+ if (shader->patch_index) {
+ /* We have to specialize the blend shader to use constants, so
+ * patch in the current constants */
- *shader_offset += shader->binary.size;
- pthread_mutex_unlock(&dev->blend_shaders.lock);
+ float *patch = (float *) (final.shader.bo->cpu + shader->patch_index);
+ memcpy(patch, ctx->blend_color.color, sizeof(float) * 4);
+ }
return final;
}
diff --git a/lib/mesa/src/gallium/drivers/r600/Android.mk b/lib/mesa/src/gallium/drivers/r600/Android.mk
index b87fc91e6..9f684cf24 100644
--- a/lib/mesa/src/gallium/drivers/r600/Android.mk
+++ b/lib/mesa/src/gallium/drivers/r600/Android.mk
@@ -30,12 +30,8 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES)
-LOCAL_C_INCLUDES += \
- $(MESA_TOP)/src/amd/common \
- $(MESA_TOP)/src/amd/llvm \
- $(MESA_TOP)/src/mesa
+LOCAL_C_INCLUDES += $(MESA_TOP)/src/amd/common
-LOCAL_STATIC_LIBRARIES := libmesa_nir
LOCAL_SHARED_LIBRARIES := libdrm_radeon
LOCAL_MODULE := libmesa_pipe_r600
@@ -49,15 +45,6 @@ $(intermediates)/egd_tables.h: $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.p
@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
$(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h > $@
-sfn_nir_algebraic_gen := $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py
-sfn_nir_algebraic_deps := \
- $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py \
- $(MESA_TOP)/src/compiler/nir/nir_algebraic.py
-
-$(intermediates)/sfn_nir_algebraic.c: $(sfn_nir_algebraic_deps)
- @mkdir -p $(dir $@)
- $(hide) $(MESA_PYTHON2) $(sfn_nir_algebraic_gen) -p $(MESA_TOP)/src/compiler/nir/ > $@
-
ifeq ($(MESA_ENABLE_LLVM),true)
$(call mesa-build-with-llvm)
endif
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk
index 75f30f621..e402da639 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk
+++ b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk
@@ -21,8 +21,6 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
-ifeq ($(MESA_ENABLE_LLVM),true)
-
LOCAL_PATH := $(call my-dir)
# get C_SOURCES and GENERATED_SOURCES
@@ -38,20 +36,48 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/amd/common \
- $(MESA_TOP)/src/amd/llvm \
- $(MESA_TOP)/src/compiler/nir \
$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \
$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir
-LOCAL_STATIC_LIBRARIES := \
- libmesa_amd_common \
- libmesa_galliumvl
+LOCAL_STATIC_LIBRARIES := libmesa_amd_common
LOCAL_SHARED_LIBRARIES := libdrm_radeon
LOCAL_MODULE := libmesa_pipe_radeonsi
+intermediates := $(call local-generated-sources-dir)
+
# We need to get NIR's generated headers.
LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES))
+
+GEN_DRIINFO_INPUTS := \
+ $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \
+ $(LOCAL_PATH)/driinfo_radeonsi.h
+
+MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py
+
+$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS)
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false)
+
+GEN10_FORMAT_TABLE_INPUTS := \
+ $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \
+ $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json
+
+GEN10_FORMAT_TABLE_DEP := \
+ $(MESA_TOP)/src/amd/registers/regdb.py
+
+GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py
+
+$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP)
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false)
+
+LOCAL_C_INCLUDES += $(intermediates)/radeonsi
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
$(call mesa-build-with-llvm)
@@ -67,5 +93,3 @@ $(eval GALLIUM_LIBS += \
libmesa_winsys_amdgpu)
$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
endif
-
-endif # MESA_ENABLE_LLVM==true
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index 3d17f08ca..373fd4ffa 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -23,15 +23,16 @@
*
*/
-#include "ac_llvm_cull.h"
-#include "si_build_pm4.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
-#include "util/fast_idiv_by_const.h"
+#include "si_build_pm4.h"
+#include "ac_llvm_cull.h"
+
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
/* Based on:
* https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
@@ -107,6 +108,7 @@
* (patch elimination where tess factors are 0 would be possible to implement)
* - The vertex shader must not contain memory stores.
* - All VS resources must not have a write usage in the command buffer.
+ * (TODO: all shader buffers currently set the write usage)
* - Bindless textures and images must not occur in the vertex shader.
*
* User data SGPR layout:
@@ -153,1400 +155,1426 @@
/* At least 256 is needed for the fastest wave launch rate from compute queues
* due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
-#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
-#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
-#define MAX_WAVES_PER_SH 0 /* no limit */
-#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
+#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
+#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
+#define MAX_WAVES_PER_SH 0 /* no limit */
+#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
-#define CULL_Z 0
+#define CULL_Z 0
/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
-#define VERTEX_COUNTER_GDS_MODE 2
-#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
+#define VERTEX_COUNTER_GDS_MODE 2
+#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
/* Grouping compute dispatches for small draw calls: How many primitives from multiple
* draw calls to process by compute before signaling the gfx IB. This reduces the number
* of EOP events + REWIND packets, because they decrease performance. */
-#define PRIMS_PER_BATCH (512 * 1024)
+#define PRIMS_PER_BATCH (512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
* for big draw calls sooner, but doesn't allow context flushes between packets.
* Primitive restart is supported. Only implemented for ordered append. */
-#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
+#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
* this number of primitives, so that we can flush the context and get free ring space. */
-#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
+#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
/* Derived values. */
-#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
-#define SPLIT_PRIMS_PACKET_LEVEL \
- (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \
- : UINT_MAX & ~(THREADGROUP_SIZE - 1))
+#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) /* threadgroup size counted in groups of 64 threads */
+#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \
+ SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
+ UINT_MAX & ~(THREADGROUP_SIZE - 1)) /* other modes: UINT_MAX with the bits below THREADGROUP_SIZE cleared */
-#define REWIND_SIGNAL_BIT 0x80000000
+#define REWIND_SIGNAL_BIT 0x80000000 /* bit 31 */
/* For emulating the rewind packet on CI. */
-#define FORCE_REWIND_EMULATION 0
+#define FORCE_REWIND_EMULATION 0
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
- unsigned *prim_discard_vertex_count_threshold,
- unsigned *index_ring_size_per_ib)
+void si_initialize_prim_discard_tunables(struct si_context *sctx) /* sets sctx->prim_discard_vertex_count_threshold and, when enabled, sctx->index_ring_size_per_ib */
{
- *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
-
- if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
- !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
- return;
-
- /* TODO: enable this after the GDS kernel memory management is fixed */
- bool enable_on_pro_graphics_by_default = false;
-
- if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
- (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
- (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
- sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
- sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
- sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
- *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
-
- if (sscreen->debug_flags & DBG(ALWAYS_PD))
- *prim_discard_vertex_count_threshold = 0; /* always enable */
-
- const uint32_t MB = 1024 * 1024;
- const uint64_t GB = 1024 * 1024 * 1024;
-
- /* The total size is double this per context.
- * Greater numbers allow bigger gfx IBs.
- */
- if (sscreen->info.vram_size <= 2 * GB)
- *index_ring_size_per_ib = 64 * MB;
- else if (sscreen->info.vram_size <= 4 * GB)
- *index_ring_size_per_ib = 128 * MB;
- else
- *index_ring_size_per_ib = 256 * MB;
- }
+ sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
+
+ if (sctx->chip_class == GFX6 || /* SI support is not implemented */
+ !sctx->screen->info.has_gds_ordered_append ||
+ sctx->screen->debug_flags & DBG(NO_PD) ||
+ /* If aux_context == NULL, we are initializing aux_context right now. */
+ !sctx->screen->aux_context)
+ return; /* threshold stays UINT_MAX; index_ring_size_per_ib is not written on this path */
+
+ /* TODO: enable this after the GDS kernel memory management is fixed */
+ bool enable_on_pro_graphics_by_default = false; /* while false, only the PD / ALWAYS_PD debug flags satisfy the condition below */
+
+ if (sctx->screen->debug_flags & DBG(ALWAYS_PD) ||
+ sctx->screen->debug_flags & DBG(PD) ||
+ (enable_on_pro_graphics_by_default &&
+ sctx->screen->info.is_pro_graphics &&
+ (sctx->family == CHIP_BONAIRE ||
+ sctx->family == CHIP_HAWAII ||
+ sctx->family == CHIP_TONGA ||
+ sctx->family == CHIP_FIJI ||
+ sctx->family == CHIP_POLARIS10 ||
+ sctx->family == CHIP_POLARIS11 ||
+ sctx->family == CHIP_VEGA10 ||
+ sctx->family == CHIP_VEGA20))) {
+ sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles, 3 vertices each */
+
+ if (sctx->screen->debug_flags & DBG(ALWAYS_PD))
+ sctx->prim_discard_vertex_count_threshold = 0; /* always enable */
+
+ const uint32_t MB = 1024 * 1024;
+ const uint64_t GB = 1024 * 1024 * 1024; /* 64-bit so that the 2 * GB and 4 * GB comparisons below are done in 64 bits */
+
+ /* The total size is double this per context.
+ * Greater numbers allow bigger gfx IBs.
+ */
+ if (sctx->screen->info.vram_size <= 2 * GB)
+ sctx->index_ring_size_per_ib = 64 * MB;
+ else if (sctx->screen->info.vram_size <= 4 * GB)
+ sctx->index_ring_size_per_ib = 128 * MB;
+ else
+ sctx->index_ring_size_per_ib = 256 * MB; /* vram_size above 4 GB */
+ }
}
/* Opcode can be "add" or "swap". */
-static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
- LLVMValueRef m0, LLVMValueRef value,
- unsigned ordered_count_index, bool release, bool done)
+static LLVMValueRef
+si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
+ LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
+ bool release, bool done) /* returns the i32 result of the llvm.amdgcn.ds.ordered.<opcode> call built below */
{
- if (ctx->screen->info.chip_class >= GFX10)
- ordered_count_index |= 1 << 24; /* number of dwords == 1 */
-
- LLVMValueRef args[] = {
- LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
- value,
- LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
- ctx->ac.i32_0, /* scope */
- ctx->ac.i1false, /* volatile */
- LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
- LLVMConstInt(ctx->ac.i1, release, 0),
- LLVMConstInt(ctx->ac.i1, done, 0),
- };
-
- char intrinsic[64];
- snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
- return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
+ LLVMValueRef args[] = {
+ LLVMBuildIntToPtr(ctx->ac.builder, m0,
+ LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), /* m0 converted to an i32 pointer in the GDS address space */
+ value,
+ LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
+ ctx->i32_0, /* scope */
+ ctx->i1false, /* volatile */
+ LLVMConstInt(ctx->i32, ordered_count_index, 0), /* passed through unmodified in this version */
+ LLVMConstInt(ctx->i1, release, 0),
+ LLVMConstInt(ctx->i1, done, 0),
+ };
+
+ char intrinsic[64]; /* receives "llvm.amdgcn.ds.ordered." followed by opcode */
+ snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
+ return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0);
}
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
- uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
- ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
- ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
- return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
+ uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; /* address32_hi placed in the upper 32 bits */
+ ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); /* zero-extend the incoming value to 64 bits */
+ ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); /* OR in the upper half */
+ return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
+ LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); /* i32 pointer in the global address space */
}
struct si_thread0_section {
- struct si_shader_context *ctx;
- LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
- LLVMValueRef saved_exec;
+ struct si_shader_context *ctx; /* filled in by si_enter_thread0_section, read back by si_exit_thread0_section */
+ LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
+ LLVMValueRef saved_exec; /* NOTE(review): not touched by the enter/exit helpers visible here */
};
/* Enter a section that only executes on thread 0. */
static void si_enter_thread0_section(struct si_shader_context *ctx,
- struct si_thread0_section *section, LLVMValueRef thread_id)
+ struct si_thread0_section *section,
+ LLVMValueRef thread_id)
{
- section->ctx = ctx;
- section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
-
- /* This IF has 4 instructions:
- * v_and_b32_e32 v, 63, v ; get the thread ID
- * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
- * s_and_saveexec_b64 s, vcc
- * s_cbranch_execz BB0_4
- *
- * It could just be s_and_saveexec_b64 s, 1.
- */
- ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""),
- 12601);
+ section->ctx = ctx;
+ section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); /* i32 slot that si_exit_thread0_section stores the result into */
+
+ /* This IF has 4 instructions:
+ * v_and_b32_e32 v, 63, v ; get the thread ID
+ * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
+ * s_and_saveexec_b64 s, vcc
+ * s_cbranch_execz BB0_4
+ *
+ * It could just be s_and_saveexec_b64 s, 1.
+ */
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
+ ctx->i32_0, ""), 12601); /* 12601 is the same label si_exit_thread0_section passes to ac_build_endif */
}
/* Exit a section that only executes on thread 0 and broadcast the result
* to all threads. */
-static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
+static void si_exit_thread0_section(struct si_thread0_section *section,
+ LLVMValueRef *result) /* in: value computed inside the section; out: replaced by the readlane result */
{
- struct si_shader_context *ctx = section->ctx;
+ struct si_shader_context *ctx = section->ctx;
- LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
+ LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); /* emitted before the endif, i.e. still inside the thread-0 IF */
- ac_build_endif(&ctx->ac, 12601);
+ ac_build_endif(&ctx->ac, 12601); /* same label as the ac_build_ifcc in si_enter_thread0_section */
- /* Broadcast the result from thread 0 to all threads. */
- *result =
- ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
+ /* Broadcast the result from thread 0 to all threads. */
+ *result = ac_build_readlane(&ctx->ac,
+ LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
}
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
- struct si_shader_key *key = &ctx->shader->key;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef vs = ctx->main_fn;
-
- /* Always inline the VS function. */
- ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
- LLVMSetLinkage(vs, LLVMPrivateLinkage);
-
- enum ac_arg_type const_desc_type;
- if (ctx->shader->selector->info.base.num_ubos == 1 &&
- ctx->shader->selector->info.base.num_ssbos == 0)
- const_desc_type = AC_ARG_CONST_FLOAT_PTR;
- else
- const_desc_type = AC_ARG_CONST_DESC_PTR;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
- struct ac_arg param_vb_desc, param_const_desc;
- struct ac_arg param_base_vertex, param_start_instance;
- struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
- struct ac_arg param_restart_index, param_smallprim_precision;
- struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
- struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &param_index_buffers_and_constants);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
-
- /* Block ID and thread ID inputs. */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
- if (VERTEX_COUNTER_GDS_MODE == 2)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
-
- /* Create the compute shader function. */
- gl_shader_stage old_stage = ctx->stage;
- ctx->stage = MESA_SHADER_COMPUTE;
- si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
- ctx->stage = old_stage;
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
- }
-
- /* Assemble parameters for VS. */
- LLVMValueRef vs_params[16];
- unsigned num_vs_params = 0;
- unsigned param_vertex_id, param_instance_id;
-
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
- vs_params[num_vs_params++] =
- LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
-
- vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
- vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
-
- assert(num_vs_params <= ARRAY_SIZE(vs_params));
- assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
-
- /* Load descriptors. (load 8 dwords at once) */
- LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
-
- LLVMValueRef index_buffers_and_constants =
- ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
- tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
- ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
- tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
-
- for (unsigned i = 0; i < 8; i++)
- desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
-
- input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
- output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
-
- /* Compute PrimID and InstanceID. */
- LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
- LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
- ac_get_arg(&ctx->ac, param_local_id));
- LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
- LLVMValueRef instance_id = ctx->ac.i32_0;
-
- if (key->opt.cs_instancing) {
- LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
- LLVMValueRef num_prims_udiv_multiplier =
- ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
- /* Unpack num_prims_udiv_terms. */
- LLVMValueRef post_shift =
- LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
- LLVMValueRef prims_per_instance =
- LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
- /* Divide the total prim_id by the number of prims per instance. */
- instance_id =
- ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
- /* Compute the remainder. */
- prim_id = LLVMBuildSub(builder, prim_id,
- LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
- }
-
- /* Generate indices (like a non-indexed draw call). */
- LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
- unsigned vertices_per_prim = 3;
-
- switch (key->opt.cs_prim_type) {
- case PIPE_PRIM_TRIANGLES:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
- LLVMConstInt(ctx->ac.i32, i, 0));
- }
- break;
- case PIPE_PRIM_TRIANGLE_STRIP:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
- }
- break;
- case PIPE_PRIM_TRIANGLE_FAN:
- /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
- * and rasterizer as a normal triangle, so we need to put the provoking
- * vertex into the correct index variable and preserve orientation at the same time.
- * gl_VertexID is preserved, because it's equal to the index.
- */
- if (key->opt.cs_provoking_vertex_first) {
- index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- index[2] = ctx->ac.i32_0;
- } else {
- index[0] = ctx->ac.i32_0;
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- }
- break;
- default:
- unreachable("unexpected primitive type");
- }
-
- /* Fetch indices. */
- if (key->opt.cs_indexed) {
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
- 1, 0, true, false, false);
- index[i] = ac_to_integer(&ctx->ac, index[i]);
- }
- }
-
- LLVMValueRef ordered_wave_id = NULL;
-
- /* Extract the ordered wave ID. */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
- ordered_wave_id =
- LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
- ordered_wave_id =
- LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
- }
- LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
- LLVMConstInt(ctx->ac.i32, 63, 0), "");
-
- /* Every other triangle in a strip has a reversed vertex order, so we
- * need to swap vertices of odd primitives to get the correct primitive
- * orientation when converting triangle strips to triangles. Primitive
- * restart complicates it, because a strip can start anywhere.
- */
- LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
- LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
-
- if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
- /* Without primitive restart, odd primitives have reversed orientation.
- * Only primitive restart can flip it with respect to the first vertex
- * of the draw call.
- */
- LLVMValueRef first_is_odd = ctx->ac.i1false;
-
- /* Handle primitive restart. */
- if (key->opt.cs_primitive_restart) {
- /* Get the GDS primitive restart continue flag and clear
- * the flag in vertex_counter. This flag is used when the draw
- * call was split and we need to load the primitive orientation
- * flag from GDS for the first wave too.
- */
- LLVMValueRef gds_prim_restart_continue =
- LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
- gds_prim_restart_continue =
- LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
- vertex_counter =
- LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
-
- LLVMValueRef index0_is_reset;
-
- for (unsigned i = 0; i < 3; i++) {
- LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
- ac_get_arg(&ctx->ac, param_restart_index), "");
- if (i == 0)
- index0_is_reset = LLVMBuildNot(builder, not_reset, "");
- prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
- }
-
- /* If the previous waves flip the primitive orientation
- * of the current triangle strip, it will be stored in GDS.
- *
- * Sometimes the correct orientation is not needed, in which case
- * we don't need to execute this.
- */
- if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
- /* If there are reset indices in this wave, get the thread index
- * where the most recent strip starts relative to each thread.
- */
- LLVMValueRef preceding_threads_mask =
- LLVMBuildSub(builder,
- LLVMBuildShl(builder, ctx->ac.i64_1,
- LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
- ctx->ac.i64_1, "");
-
- LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
- LLVMValueRef preceding_reset_threadmask =
- LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
- LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
- strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
-
- /* This flips the orientation based on reset indices within this wave only. */
- first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
-
- LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
- LLVMValueRef is_first_wave, current_wave_resets_index;
-
- /* Get the thread index where the last strip starts in this wave.
- *
- * If the last strip doesn't start in this wave, the thread index
- * will be 0.
- *
- * If the last strip starts in the next wave, the thread index will
- * be 64.
- */
- last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
- last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
-
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
-
- /* This must be done in the thread 0 section, because
- * we expect PrimID to be 0 for the whole first wave
- * in this expression.
- *
- * NOTE: This will need to be different if we wanna support
- * instancing with primitive restart.
- */
- is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
- is_first_wave = LLVMBuildAnd(builder, is_first_wave,
- LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
- current_wave_resets_index =
- LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
-
- ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
-
- /* Save the last strip start primitive index in GDS and read
- * the value that previous waves stored.
- *
- * if (is_first_wave || current_wave_resets_strip)
- * // Read the value that previous waves stored and store a new one.
- * first_is_odd = ds.ordered.swap(last_strip_start);
- * else
- * // Just read the value that previous waves stored.
- * first_is_odd = ds.ordered.add(0);
- */
- ac_build_ifcc(
- &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
- {
- /* The GDS address is always 0 with ordered append. */
- tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
- false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_else(&ctx->ac, 12603);
- {
- /* Just read the value from GDS. */
- tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
- false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_endif(&ctx->ac, 12602);
-
- prev_wave_state = LLVMBuildLoad(builder, ret, "");
- /* Ignore the return value if this is the first wave. */
- prev_wave_state =
- LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
- si_exit_thread0_section(&section, &prev_wave_state);
- prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
-
- /* If the strip start appears to be on thread 0 for the current primitive
- * (meaning the reset index is not present in this wave and might have
- * appeared in previous waves), use the value from GDS to determine
- * primitive orientation.
- *
- * If the strip start is in this wave for the current primitive, use
- * the value from the current wave to determine primitive orientation.
- */
- LLVMValueRef strip_start_is0 =
- LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
- first_is_odd =
- LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
- }
- }
- /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
- LLVMValueRef prim_is_odd = LLVMBuildXor(
- builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
-
- /* Convert triangle strip indices to triangle indices. */
- ac_build_triangle_strip_indices_to_triangle(
- &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
- index);
- }
-
- /* Execute the vertex shader for each vertex to get vertex positions. */
- LLVMValueRef pos[3][4];
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- vs_params[param_vertex_id] = index[i];
- vs_params[param_instance_id] = instance_id;
-
- LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
- for (unsigned chan = 0; chan < 4; chan++)
- pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
- }
-
- /* Divide XYZ by W. */
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- for (unsigned chan = 0; chan < 3; chan++)
- pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
- }
-
- /* Load the viewport state. */
- LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
- LLVMConstInt(ctx->ac.i32, 2, 0));
- vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
- LLVMValueRef vp_scale[2], vp_translate[2];
- vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
- vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
- vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
- vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
- /* Do culling. */
- struct ac_cull_options options = {};
- options.cull_front = key->opt.cs_cull_front;
- options.cull_back = key->opt.cs_cull_back;
- options.cull_view_xy = true;
- options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_small_prims = true;
- options.cull_zero_area = true;
- options.cull_w = true;
- options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
-
- LLVMValueRef accepted =
- ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
- ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
-
- ac_build_optimization_barrier(&ctx->ac, &accepted);
- LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
-
- /* Count the number of active threads by doing bitcount(accepted). */
- LLVMValueRef num_prims_accepted = ac_build_intrinsic(
- &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
- num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
-
- LLVMValueRef start;
-
- /* Execute atomic_add on the vertex count. */
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(
- builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(
- builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
- 0, true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(&section, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
- 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */
- if (ctx->screen->info.chip_class <= GFX8) {
- LLVMValueRef desc[] = {
- ac_get_arg(&ctx->ac, param_vertex_count_addr),
- LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
- LLVMConstInt(ctx->ac.i32, 4, 0),
- LLVMConstInt(
- ctx->ac.i32,
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
- 0),
- };
- LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
- ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
- ac_glc | ac_slc);
- } else {
- LLVMBuildStore(
- builder, count,
- si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
- }
- ac_build_endif(&ctx->ac, 12606);
- } else {
- /* For unordered modes that increment a vertex count instead of
- * primitive count, convert it into the primitive index.
- */
- start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- }
-
- /* Now we need to store the indices of accepted primitives into
- * the output index buffer.
- */
- ac_build_ifcc(&ctx->ac, accepted, 16607);
- {
- /* Get the number of bits set before the index of this thread. */
- LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
-
- /* We have lowered instancing. Pack the instance ID into vertex ID. */
- if (key->opt.cs_instancing) {
- instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
-
- for (unsigned i = 0; i < vertices_per_prim; i++)
- index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
- }
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- /* vertex_counter contains the first primitive ID
- * for this dispatch. If the draw call was split into
- * multiple subdraws, the first primitive ID is > 0
- * for subsequent subdraws. Each subdraw uses a different
- * portion of the output index buffer. Offset the store
- * vindex by the first primitive ID to get the correct
- * store address for the subdraw.
- */
- start = LLVMBuildAdd(builder, start, vertex_counter, "");
- }
-
- /* Write indices for accepted primitives. */
- LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
- LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
-
- if (!ac_has_vec3_support(ctx->ac.chip_class, true))
- vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
-
- ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
- ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
- }
- ac_build_endif(&ctx->ac, 16607);
-
- LLVMBuildRetVoid(builder);
+ struct si_shader_key *key = &ctx->shader->key;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef vs = ctx->main_fn;
+
+ /* Always inline the VS function. */
+ ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
+ LLVMSetLinkage(vs, LLVMPrivateLinkage);
+
+ LLVMTypeRef const_desc_type;
+ if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+ ctx->shader->selector->info.shader_buffers_declared == 0)
+ const_desc_type = ctx->f32;
+ else
+ const_desc_type = ctx->v4i32;
+
+ struct si_function_info fninfo;
+ si_init_function_info(&fninfo);
+
+ LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc;
+ LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id;
+ LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision;
+ LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc;
+ LLVMValueRef last_wave_prim_id, vertex_count_addr;
+
+ add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
+ &index_buffers_and_constants);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr);
+ add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32),
+ &vb_desc);
+ add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type),
+ &const_desc);
+ add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32),
+ &sampler_desc);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index);
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision);
+
+ /* Block ID and thread ID inputs. */
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id);
+ if (VERTEX_COUNTER_GDS_MODE == 2)
+ add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id);
+ add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id);
+
+ /* Create the compute shader function. */
+ unsigned old_type = ctx->type;
+ ctx->type = PIPE_SHADER_COMPUTE;
+ si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE);
+ ctx->type = old_type;
+
+ if (VERTEX_COUNTER_GDS_MODE == 1) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
+ GDS_SIZE_UNORDERED);
+ }
+
+ /* Assemble parameters for VS. */
+ LLVMValueRef vs_params[16];
+ unsigned num_vs_params = 0;
+ unsigned param_vertex_id, param_instance_id;
+
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
+ vs_params[num_vs_params++] = const_desc;
+ vs_params[num_vs_params++] = sampler_desc;
+ vs_params[num_vs_params++] = LLVMConstInt(ctx->i32,
+ S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
+ vs_params[num_vs_params++] = base_vertex;
+ vs_params[num_vs_params++] = start_instance;
+ vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */
+ vs_params[num_vs_params++] = vb_desc;
+
+ vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
+ vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
+ vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */
+ vs_params[num_vs_params++] = ctx->i32_0; /* unused */
+
+ assert(num_vs_params <= ARRAY_SIZE(vs_params));
+ assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
+
+ /* Load descriptors. (load 8 dwords at once) */
+ LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
+
+ tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
+ ac_array_in_const32_addr_space(ctx->v8i32), "");
+ tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0);
+
+ for (unsigned i = 0; i < 8; i++)
+ desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
+
+ input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
+ output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
+
+ /* Compute PrimID and InstanceID. */
+ LLVMValueRef global_thread_id =
+ ac_build_imad(&ctx->ac, block_id,
+ LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id);
+ LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
+ LLVMValueRef instance_id = ctx->i32_0;
+
+ if (key->opt.cs_instancing) {
+ /* Unpack num_prims_udiv_terms. */
+ LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
+ LLVMConstInt(ctx->i32, 0x1f, 0), "");
+ LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
+ LLVMConstInt(ctx->i32, 5, 0), "");
+ /* Divide the total prim_id by the number of prims per instance. */
+ instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
+ num_prims_udiv_multiplier,
+ post_shift);
+ /* Compute the remainder. */
+ prim_id = LLVMBuildSub(builder, prim_id,
+ LLVMBuildMul(builder, instance_id,
+ prims_per_instance, ""), "");
+ }
+
+ /* Generate indices (like a non-indexed draw call). */
+ LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)};
+ unsigned vertices_per_prim = 3;
+
+ switch (key->opt.cs_prim_type) {
+ case PIPE_PRIM_TRIANGLES:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_imad(&ctx->ac, prim_id,
+ LLVMConstInt(ctx->i32, 3, 0),
+ LLVMConstInt(ctx->i32, i, 0));
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = LLVMBuildAdd(builder, prim_id,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_FAN:
+ /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
+ * and rasterizer as a normal triangle, so we need to put the provoking
+ * vertex into the correct index variable and preserve orientation at the same time.
+ * gl_VertexID is preserved, because it's equal to the index.
+ */
+ if (key->opt.cs_provoking_vertex_first) {
+ index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
+ index[2] = ctx->i32_0;
+ } else {
+ index[0] = ctx->i32_0;
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), "");
+ index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), "");
+ }
+ break;
+ default:
+ unreachable("unexpected primitive type");
+ }
+
+ /* Fetch indices. */
+ if (key->opt.cs_indexed) {
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
+ index[i], ctx->i32_0, 1,
+ 0, true);
+ index[i] = ac_to_integer(&ctx->ac, index[i]);
+ }
+ }
+
+ /* Extract the ordered wave ID. */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
+ LLVMConstInt(ctx->i32, 6, 0), "");
+ ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
+ LLVMConstInt(ctx->i32, 0xfff, 0), "");
+ }
+ LLVMValueRef thread_id =
+ LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), "");
+
+ /* Every other triangle in a strip has a reversed vertex order, so we
+ * need to swap vertices of odd primitives to get the correct primitive
+ * orientation when converting triangle strips to triangles. Primitive
+ * restart complicates it, because a strip can start anywhere.
+ */
+ LLVMValueRef prim_restart_accepted = ctx->i1true;
+
+ if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* Without primitive restart, odd primitives have reversed orientation.
+ * Only primitive restart can flip it with respect to the first vertex
+ * of the draw call.
+ */
+ LLVMValueRef first_is_odd = ctx->i1false;
+
+ /* Handle primitive restart. */
+ if (key->opt.cs_primitive_restart) {
+ /* Get the GDS primitive restart continue flag and clear
+ * the flag in vertex_counter. This flag is used when the draw
+ * call was split and we need to load the primitive orientation
+ * flag from GDS for the first wave too.
+ */
+ LLVMValueRef gds_prim_restart_continue =
+ LLVMBuildLShr(builder, vertex_counter,
+ LLVMConstInt(ctx->i32, 31, 0), "");
+ gds_prim_restart_continue =
+ LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, "");
+ vertex_counter = LLVMBuildAnd(builder, vertex_counter,
+ LLVMConstInt(ctx->i32, 0x7fffffff, 0), "");
+
+ LLVMValueRef index0_is_reset;
+
+ for (unsigned i = 0; i < 3; i++) {
+ LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
+ restart_index, "");
+ if (i == 0)
+ index0_is_reset = LLVMBuildNot(builder, not_reset, "");
+ prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
+ not_reset, "");
+ }
+
+ /* If the previous waves flip the primitive orientation
+ * of the current triangle strip, it will be stored in GDS.
+ *
+ * Sometimes the correct orientation is not needed, in which case
+ * we don't need to execute this.
+ */
+ if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
+ /* If there are reset indices in this wave, get the thread index
+ * where the most recent strip starts relative to each thread.
+ */
+ LLVMValueRef preceding_threads_mask =
+ LLVMBuildSub(builder,
+ LLVMBuildShl(builder, ctx->ac.i64_1,
+ LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
+ ctx->ac.i64_1, "");
+
+ LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+ LLVMValueRef preceding_reset_threadmask =
+ LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+ LLVMValueRef strip_start =
+ ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+ strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");
+
+ /* This flips the orientation based on reset indices within this wave only. */
+ first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");
+
+ LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+ LLVMValueRef is_first_wave, current_wave_resets_index;
+
+ /* Get the thread index where the last strip starts in this wave.
+ *
+ * If the last strip doesn't start in this wave, the thread index
+ * will be 0.
+ *
+ * If the last strip starts in the next wave, the thread index will
+ * be 64.
+ */
+ last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+ last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");
+
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, &section, thread_id);
+
+ /* This must be done in the thread 0 section, because
+ * we expect PrimID to be 0 for the whole first wave
+ * in this expression.
+ *
+ * NOTE: This will need to be different if we wanna support
+ * instancing with primitive restart.
+ */
+ is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
+ is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+ LLVMBuildNot(builder,
+ gds_prim_restart_continue, ""), "");
+ current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
+ last_strip_start, ctx->i32_0, "");
+
+ ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");
+
+ /* Save the last strip start primitive index in GDS and read
+ * the value that previous waves stored.
+ *
+ * if (is_first_wave || current_wave_resets_strip)
+ * // Read the value that previous waves stored and store a new one.
+ * first_is_odd = ds.ordered.swap(last_strip_start);
+ * else
+ * // Just read the value that previous waves stored.
+ * first_is_odd = ds.ordered.add(0);
+ */
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildOr(builder, is_first_wave,
+ current_wave_resets_index, ""), 12602);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ tmp = si_build_ds_ordered_op(ctx, "swap",
+ ordered_wave_id, last_strip_start,
+ 1, true, false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_else(&ctx->ac, 12603);
+ {
+ /* Just read the value from GDS. */
+ tmp = si_build_ds_ordered_op(ctx, "add",
+ ordered_wave_id, ctx->i32_0,
+ 1, true, false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_endif(&ctx->ac, 12602);
+
+ prev_wave_state = LLVMBuildLoad(builder, ret, "");
+ /* Ignore the return value if this is the first wave. */
+ prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
+ ctx->i32_0, prev_wave_state, "");
+ si_exit_thread0_section(&section, &prev_wave_state);
+ prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");
+
+ /* If the strip start appears to be on thread 0 for the current primitive
+ * (meaning the reset index is not present in this wave and might have
+ * appeared in previous waves), use the value from GDS to determine
+ * primitive orientation.
+ *
+ * If the strip start is in this wave for the current primitive, use
+ * the value from the current wave to determine primitive orientation.
+ */
+ LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
+ strip_start, ctx->i32_0, "");
+ first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
+ first_is_odd, "");
+ }
+ }
+ /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
+ LLVMValueRef prim_is_odd =
+ LLVMBuildXor(builder, first_is_odd,
+ LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), "");
+
+ /* Determine the primitive orientation.
+ * Only swap the vertices that are not the provoking vertex. We need to keep
+ * the provoking vertex in place.
+ */
+ if (key->opt.cs_provoking_vertex_first) {
+ LLVMValueRef index1 = index[1];
+ LLVMValueRef index2 = index[2];
+ index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, "");
+ index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, "");
+ } else {
+ LLVMValueRef index0 = index[0];
+ LLVMValueRef index1 = index[1];
+ index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, "");
+ index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, "");
+ }
+ }
+
+ /* Execute the vertex shader for each vertex to get vertex positions. */
+ LLVMValueRef pos[3][4];
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ vs_params[param_vertex_id] = index[i];
+ vs_params[param_instance_id] = instance_id;
+
+ LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
+ for (unsigned chan = 0; chan < 4; chan++)
+ pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
+ }
+
+ /* Divide XYZ by W. */
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ for (unsigned chan = 0; chan < 3; chan++)
+ pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
+ }
+
+ /* Load the viewport state. */
+ LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
+ LLVMConstInt(ctx->i32, 2, 0));
+ vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, "");
+ vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+ vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+ vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+ vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+ /* Do culling. */
+ struct ac_cull_options options = {};
+ options.cull_front = key->opt.cs_cull_front;
+ options.cull_back = key->opt.cs_cull_back;
+ options.cull_view_xy = true;
+ options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_small_prims = true;
+ options.cull_zero_area = true;
+ options.cull_w = true;
+ options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
+
+ LLVMValueRef accepted =
+ ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
+ vp_scale, vp_translate, smallprim_precision,
+ &options);
+
+ LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
+
+ /* Count the number of active threads by doing bitcount(accepted). */
+ LLVMValueRef num_prims_accepted =
+ ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64,
+ &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
+ num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, "");
+
+ LLVMValueRef start;
+
+ /* Execute atomic_add on the vertex count. */
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, &section, thread_id);
+ {
+ if (VERTEX_COUNTER_GDS_MODE == 0) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+ LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+ LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+
+ /* If the draw call was split into multiple subdraws, each using
+ * a separate draw packet, we need to start counting from 0 for
+ * the first compute wave of the subdraw.
+ *
+ * vertex_counter contains the primitive ID of the first thread
+ * in the first wave.
+ *
+ * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+ */
+ LLVMValueRef is_first_wave =
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ vertex_counter, "");
+
+ /* Store the primitive count for ordered append, not vertex count.
+ * The idea is to avoid GDS initialization via CP DMA. The shader
+ * effectively stores the first count using "swap".
+ *
+ * if (first_wave) {
+ * ds.ordered.swap(num_prims_accepted); // store the first primitive count
+ * previous = 0;
+ * } else {
+ * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
+ * }
+ */
+ ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
+ num_prims_accepted, 0, true, true);
+ LLVMBuildStore(builder, ctx->i32_0, tmp_store);
+ }
+ ac_build_else(&ctx->ac, 12605);
+ {
+ LLVMBuildStore(builder,
+ si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
+ num_prims_accepted, 0,
+ true, true),
+ tmp_store);
+ }
+ ac_build_endif(&ctx->ac, 12604);
+
+ start = LLVMBuildLoad(builder, tmp_store, "");
+ }
+ }
+ si_exit_thread0_section(&section, &start);
+
+ /* Write the final vertex count to memory. An EOS/EOP event could do this,
+ * but those events are super slow and should be avoided if performance
+ * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+ * event like this.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ last_wave_prim_id, ""), 12606);
+ LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+ count = LLVMBuildMul(builder, count,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+
+ /* GFX8 needs to disable caching, so that the CP can see the stored value.
+ * MTYPE=3 bypasses TC L2.
+ */
+ if (ctx->screen->info.chip_class <= GFX8) {
+ LLVMValueRef desc[] = {
+ vertex_count_addr,
+ LLVMConstInt(ctx->i32,
+ S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
+ LLVMConstInt(ctx->i32, 4, 0),
+ LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+ S_008F0C_MTYPE(3 /* uncached */), 0),
+ };
+ LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
+ ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0,
+ ctx->i32_0, 0, ac_glc | ac_slc, false);
+ } else {
+ LLVMBuildStore(builder, count,
+ si_expand_32bit_pointer(ctx, vertex_count_addr));
+ }
+ ac_build_endif(&ctx->ac, 12606);
+ } else {
+ /* For unordered modes that increment a vertex count instead of
+ * primitive count, convert it into the primitive index.
+ */
+ start = LLVMBuildUDiv(builder, start,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ }
+
+ /* Now we need to store the indices of accepted primitives into
+ * the output index buffer.
+ */
+ ac_build_ifcc(&ctx->ac, accepted, 16607);
+ {
+ /* Get the number of bits set before the index of this thread. */
+ LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
+
+ /* We have lowered instancing. Pack the instance ID into vertex ID. */
+ if (key->opt.cs_instancing) {
+ instance_id = LLVMBuildShl(builder, instance_id,
+ LLVMConstInt(ctx->i32, 16, 0), "");
+
+ for (unsigned i = 0; i < vertices_per_prim; i++)
+ index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
+ }
+
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ /* vertex_counter contains the first primitive ID
+ * for this dispatch. If the draw call was split into
+ * multiple subdraws, the first primitive ID is > 0
+ * for subsequent subdraws. Each subdraw uses a different
+ * portion of the output index buffer. Offset the store
+ * vindex by the first primitive ID to get the correct
+ * store address for the subdraw.
+ */
+ start = LLVMBuildAdd(builder, start, vertex_counter, "");
+ }
+
+ /* Write indices for accepted primitives. */
+ LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
+ LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
+
+ if (!ac_has_vec3_support(ctx->ac.chip_class, true))
+ vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
+
+ ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
+ vindex, ctx->i32_0, 3,
+ ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
+ }
+ ac_build_endif(&ctx->ac, 16607);
+
+ LLVMBuildRetVoid(builder);
}
/* Select the primitive-discard compute shader variant for the current draw.
 *
 * Builds a shader key from the bound vertex shader, the draw info (primitive
 * type, index size, instance count), the primitive_restart flag and the
 * queued rasterizer state, then calls si_shader_select_with_key().
 *
 * Return false if the shader isn't ready (the select call returned non-zero)
 * or if the selected shader uses the scratch buffer.
 */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart)
+ const struct pipe_draw_info *info,
+ bool primitive_restart)
{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_shader_key key;
-
- /* Primitive restart needs ordered counters. */
- assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
- assert(!primitive_restart || info->instance_count == 1);
-
- memset(&key, 0, sizeof(key));
- si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog);
- assert(!key.part.vs.prolog.instance_divisor_is_fetched);
-
- key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
- key.opt.vs_as_prim_discard_cs = 1;
- key.opt.cs_prim_type = info->mode;
- key.opt.cs_indexed = info->index_size != 0;
- key.opt.cs_instancing = info->instance_count > 1;
- key.opt.cs_primitive_restart = primitive_restart;
- key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
-
- /* Primitive restart with triangle strips needs to preserve primitive
- * orientation for cases where front and back primitive orientation matters.
- */
- if (primitive_restart) {
- struct si_shader_selector *ps = sctx->shader.ps.cso;
-
- key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back ||
- ps->info.uses_frontface ||
- (rs->two_side && ps->info.colors_read);
- }
-
- if (rs->rasterizer_discard) {
- /* Just for performance testing and analysis of trivial bottlenecks.
- * This should result in a very short compute shader. */
- key.opt.cs_cull_front = 1;
- key.opt.cs_cull_back = 1;
- } else {
- key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front;
- key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back;
- }
-
- if (!rs->depth_clamp_any && CULL_Z) {
- key.opt.cs_cull_z = 1;
- key.opt.cs_halfz_clip_space = rs->clip_halfz;
- }
-
- sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso;
- sctx->cs_prim_discard_state.current = NULL;
-
- if (!sctx->compiler.passes)
- si_init_compiler(sctx->screen, &sctx->compiler);
-
- struct si_compiler_ctx_state compiler_state;
- compiler_state.compiler = &sctx->compiler;
- compiler_state.debug = sctx->debug;
- compiler_state.is_debug_context = sctx->is_debug;
-
- return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
- &key, -1, true) == 0 &&
- /* Disallow compute shaders using the scratch buffer. */
- sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_shader_key key;
+
+ /* Primitive restart needs ordered counters. */
+ assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
+ assert(!primitive_restart || info->instance_count == 1);
+
+ memset(&key, 0, sizeof(key));
+ si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
+ assert(!key.part.vs.prolog.instance_divisor_is_fetched);
+
+ key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
+ key.opt.vs_as_prim_discard_cs = 1;
+ key.opt.cs_prim_type = info->mode;
+ key.opt.cs_indexed = info->index_size != 0;
+ key.opt.cs_instancing = info->instance_count > 1;
+ key.opt.cs_primitive_restart = primitive_restart;
+ key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
+
+ /* Primitive restart with triangle strips needs to preserve primitive
+ * orientation for cases where front and back primitive orientation matters.
+ */
+ if (primitive_restart) {
+ struct si_shader_selector *ps = sctx->ps_shader.cso;
+
+ key.opt.cs_need_correct_orientation =
+ rs->cull_front != rs->cull_back ||
+ ps->info.uses_frontface ||
+ (rs->two_side && ps->info.colors_read);
+ }
+
+ if (rs->rasterizer_discard) {
+ /* Just for performance testing and analysis of trivial bottlenecks.
+ * This should result in a very short compute shader. */
+ key.opt.cs_cull_front = 1;
+ key.opt.cs_cull_back = 1;
+ } else {
+ key.opt.cs_cull_front =
+ sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
+ key.opt.cs_cull_back =
+ sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
+ }
+
+ if (!rs->depth_clamp_any && CULL_Z) {
+ key.opt.cs_cull_z = 1;
+ key.opt.cs_halfz_clip_space = rs->clip_halfz;
+ }
+
+ sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
+ sctx->cs_prim_discard_state.current = NULL;
+
+ struct si_compiler_ctx_state compiler_state;
+ compiler_state.compiler = &sctx->compiler;
+ compiler_state.debug = sctx->debug;
+ compiler_state.is_debug_context = sctx->is_debug;
+
+ return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
+ &compiler_state, &key, -1, true) == 0 &&
+ /* Disallow compute shaders using the scratch buffer. */
+ sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}
/* Lazily create the resources used by the primitive-discard compute path.
 *
 * Returns true right away when sctx->index_ring already exists. Otherwise,
 * if the parallel compute IB has not been set up yet, it creates the GDS
 * buffer and the OA buffer (whether each is needed depends on
 * VERTEX_COUNTER_GDS_MODE), adds them to the gfx command stream and adds the
 * parallel compute IB. Finally it allocates the index ring with a size of
 * index_ring_size_per_ib * 2.
 *
 * Returns false as soon as any of these steps fails.
 *
 * NOTE(review): a buffer created before a later step fails is not released
 * here, and a repeated call appears to allocate it again -- confirm that
 * callers stop using this path after a failure.
 */
static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
- if (sctx->index_ring)
- return true;
-
- if (!sctx->prim_discard_compute_cs.priv) {
- struct radeon_winsys *ws = sctx->ws;
- unsigned gds_size =
- VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
- unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
-
- if (gds_size) {
- sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS,
- RADEON_FLAG_DRIVER_INTERNAL);
- if (!sctx->gds)
- return false;
-
- ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
- }
- if (num_oa_counters) {
- assert(gds_size);
- sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA,
- RADEON_FLAG_DRIVER_INTERNAL);
- if (!sctx->gds_oa)
- return false;
-
- ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
- }
-
- if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs,
- &sctx->gfx_cs, num_oa_counters > 0))
- return false;
- }
-
- if (!sctx->index_ring) {
- sctx->index_ring = si_aligned_buffer_create(
- sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
- PIPE_USAGE_DEFAULT,
- sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
- if (!sctx->index_ring)
- return false;
- }
- return true;
+ if (sctx->index_ring)
+ return true;
+
+ if (!sctx->prim_discard_compute_cs) {
+ struct radeon_winsys *ws = sctx->ws;
+ unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
+ VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
+ unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
+
+ if (gds_size) {
+ sctx->gds = ws->buffer_create(ws, gds_size, 4,
+ RADEON_DOMAIN_GDS, 0);
+ if (!sctx->gds)
+ return false;
+
+ ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
+ RADEON_USAGE_READWRITE, 0, 0);
+ }
+ if (num_oa_counters) {
+ assert(gds_size);
+ sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
+ 1, RADEON_DOMAIN_OA, 0);
+ if (!sctx->gds_oa)
+ return false;
+
+ ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
+ RADEON_USAGE_READWRITE, 0, 0);
+ }
+
+ sctx->prim_discard_compute_cs =
+ ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
+ num_oa_counters > 0);
+ if (!sctx->prim_discard_compute_cs)
+ return false;
+ }
+
+ if (!sctx->index_ring) {
+ sctx->index_ring =
+ si_aligned_buffer_create(sctx->b.screen,
+ SI_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
+ sctx->index_ring_size_per_ib * 2,
+ 2 * 1024 * 1024);
+ if (!sctx->index_ring)
+ return false;
+ }
+ return true;
}
/* Return true if an output index buffer of out_indexbuf_size bytes, aligned
 * to the TCC cache line size, still fits at the current index ring offset
 * without exceeding index_ring_size_per_ib.
 */
static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
- return sctx->index_ring_offset +
- align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
- sctx->index_ring_size_per_ib;
+ return sctx->index_ring_offset +
+ align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
+ sctx->index_ring_size_per_ib;
}
enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
- const struct pipe_draw_start_count *draws,
- unsigned num_draws, bool primitive_restart,
- unsigned total_count)
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
+ const struct pipe_draw_info *info,
+ bool primitive_restart)
{
- /* If the compute shader compilation isn't finished, this returns false. */
- if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
- return SI_PRIM_DISCARD_DISABLED;
-
- if (!si_initialize_prim_discard_cmdbuf(sctx))
- return SI_PRIM_DISCARD_DISABLED;
-
- struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
- unsigned prim = info->mode;
- unsigned count = total_count;
- unsigned instance_count = info->instance_count;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
- unsigned num_prims = num_prims_per_instance * instance_count;
- unsigned out_indexbuf_size = num_prims * 12;
- bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
- const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
-
- /* Split draws at the draw call level if the ring is full. This makes
- * better use of the ring space.
- */
- if (ring_full && num_prims > split_prims_draw_level &&
- instance_count == 1 && /* TODO: support splitting instanced draws */
- (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
- unsigned vert_count_per_subdraw = 0;
-
- if (prim == PIPE_PRIM_TRIANGLES)
- vert_count_per_subdraw = split_prims_draw_level * 3;
- else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
- vert_count_per_subdraw = split_prims_draw_level;
- else
- unreachable("shouldn't get here");
-
- /* Split multi draws first. */
- if (num_draws > 1) {
- unsigned count = 0;
- unsigned first_draw = 0;
- unsigned num_draws_split = 0;
-
- for (unsigned i = 0; i < num_draws; i++) {
- if (count && count + draws[i].count > vert_count_per_subdraw) {
- /* Submit previous draws. */
- sctx->b.draw_vbo(&sctx->b, info, NULL, draws + first_draw, num_draws_split);
- count = 0;
- first_draw = i;
- num_draws_split = 0;
- }
-
- if (draws[i].count > vert_count_per_subdraw) {
- /* Submit just 1 draw. It will be split. */
- sctx->b.draw_vbo(&sctx->b, info, NULL, draws + i, 1);
- assert(count == 0);
- assert(first_draw == i);
- assert(num_draws_split == 0);
- first_draw = i + 1;
- continue;
- }
-
- count += draws[i].count;
- num_draws_split++;
- }
- return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT;
- }
-
- /* Split single draws if splitting multi draws isn't enough. */
- struct pipe_draw_info split_draw = *info;
- struct pipe_draw_start_count split_draw_range = draws[0];
- unsigned base_start = split_draw_range.start;
-
- split_draw.primitive_restart = primitive_restart;
-
- if (prim == PIPE_PRIM_TRIANGLES) {
- assert(vert_count_per_subdraw < count);
-
- for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
- split_draw_range.start = base_start + start;
- split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1);
- }
- } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
- /* No primitive pair can be split, because strips reverse orientation
- * for odd primitives. */
- STATIC_ASSERT(split_prims_draw_level % 2 == 0);
-
- for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
- split_draw_range.start = base_start + start;
- split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1);
-
- if (start == 0 && primitive_restart &&
- sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
- sctx->preserve_prim_restart_gds_at_flush = true;
- }
- sctx->preserve_prim_restart_gds_at_flush = false;
- }
-
- return SI_PRIM_DISCARD_DRAW_SPLIT;
- }
-
- /* Just quit if the draw call doesn't fit into the ring and can't be split. */
- if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
- if (SI_PRIM_DISCARD_DEBUG)
- puts("PD failed: draw call too big, can't be split");
- return SI_PRIM_DISCARD_DISABLED;
- }
-
- unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL) * num_draws;
- unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
- 24 * (num_subdraws - 1) + /* subdraws */
- 30; /* leave some space at the end */
- unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
- need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
- else
- need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
-
- if (ring_full ||
- (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
- !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
- /* If the current IB is empty but the size is too small, add a NOP
- * packet to force a flush and get a bigger IB.
- */
- if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
- gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
- radeon_end();
- }
-
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- }
-
- /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
- struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
- ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
- assert(compute_has_space);
- assert(si_check_ring_space(sctx, out_indexbuf_size));
- return SI_PRIM_DISCARD_ENABLED;
+ /* If the compute shader compilation isn't finished, this returns false. */
+ if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
+ return SI_PRIM_DISCARD_DISABLED;
+
+ if (!si_initialize_prim_discard_cmdbuf(sctx))
+ return SI_PRIM_DISCARD_DISABLED;
+
+ struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+ unsigned prim = info->mode;
+ unsigned count = info->count;
+ unsigned instance_count = info->instance_count;
+ unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
+ unsigned num_prims = num_prims_per_instance * instance_count;
+ unsigned out_indexbuf_size = num_prims * 12;
+ bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
+ const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
+
+ /* Split draws at the draw call level if the ring is full. This makes
+ * better use of the ring space.
+ */
+ if (ring_full &&
+ num_prims > split_prims_draw_level &&
+ instance_count == 1 && /* TODO: support splitting instanced draws */
+ (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
+ (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
+ /* Split draws. */
+ struct pipe_draw_info split_draw = *info;
+ split_draw.primitive_restart = primitive_restart;
+
+ unsigned base_start = split_draw.start;
+
+ if (prim == PIPE_PRIM_TRIANGLES) {
+ unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
+ assert(vert_count_per_subdraw < count);
+
+ for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
+ split_draw.start = base_start + start;
+ split_draw.count = MIN2(count - start, vert_count_per_subdraw);
+
+ sctx->b.draw_vbo(&sctx->b, &split_draw);
+ }
+ } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* No primitive pair can be split, because strips reverse orientation
+ * for odd primitives. */
+ STATIC_ASSERT(split_prims_draw_level % 2 == 0);
+
+ unsigned vert_count_per_subdraw = split_prims_draw_level;
+
+ for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
+ split_draw.start = base_start + start;
+ split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
+
+ sctx->b.draw_vbo(&sctx->b, &split_draw);
+
+ if (start == 0 &&
+ primitive_restart &&
+ sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
+ sctx->preserve_prim_restart_gds_at_flush = true;
+ }
+ sctx->preserve_prim_restart_gds_at_flush = false;
+ } else {
+ assert(0);
+ }
+
+ return SI_PRIM_DISCARD_DRAW_SPLIT;
+ }
+
+ /* Just quit if the draw call doesn't fit into the ring and can't be split. */
+ if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
+ if (SI_PRIM_DISCARD_DEBUG)
+ puts("PD failed: draw call too big, can't be split");
+ return SI_PRIM_DISCARD_DISABLED;
+ }
+
+ unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
+ unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
+ 24 * (num_subdraws - 1) + /* subdraws */
+ 20; /* leave some space at the end */
+ unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
+
+ if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
+ need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
+ else
+ need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+
+ if (ring_full ||
+ (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
+ !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
+ /* If the current IB is empty but the size is too small, add a NOP
+ * packet to force a flush and get a bigger IB.
+ */
+ if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
+ gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
+ radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(gfx_cs, 0);
+ }
+
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ }
+
+ /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
+ assert(compute_has_space);
+ assert(si_check_ring_space(sctx, out_indexbuf_size));
+ return SI_PRIM_DISCARD_ENABLED;
}
void si_compute_signal_gfx(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
- unsigned writeback_L2_flags = 0;
-
- /* The writeback L2 flags vary with each chip generation. */
- /* CI needs to flush vertex indices to memory. */
- if (sctx->chip_class <= GFX7)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
- else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
-
- if (!sctx->compute_num_prims_in_batch)
- return;
-
- assert(sctx->compute_rewind_va);
-
- /* After the queued dispatches are done and vertex counts are written to
- * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
- * the dispatches to finish, it only adds the CS_DONE event into the event
- * queue.
- */
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT, NULL,
- sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
- REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
- SI_NOT_QUERY);
-
- sctx->compute_rewind_va = 0;
- sctx->compute_num_prims_in_batch = 0;
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ unsigned writeback_L2_flags = 0;
+
+ /* The writeback L2 flags vary with each chip generation. */
+ /* CI needs to flush vertex indices to memory. */
+ if (sctx->chip_class <= GFX7)
+ writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
+ else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
+ writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
+
+ if (!sctx->compute_num_prims_in_batch)
+ return;
+
+ assert(sctx->compute_rewind_va);
+
+ /* After the queued dispatches are done and vertex counts are written to
+ * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
+ * the dispatches to finish, it only adds the CS_DONE event into the event
+ * queue.
+ */
+ si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
+ sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+ writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
+ EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT,
+ NULL,
+ sctx->compute_rewind_va |
+ ((uint64_t)sctx->screen->info.address32_hi << 32),
+ REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
+ SI_NOT_QUERY);
+
+ sctx->compute_rewind_va = 0;
+ sctx->compute_num_prims_in_batch = 0;
}
/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned count, unsigned index_size,
- unsigned base_vertex, uint64_t input_indexbuf_va,
- unsigned input_indexbuf_num_elements)
+ const struct pipe_draw_info *info,
+ unsigned index_size,
+ unsigned base_vertex,
+ uint64_t input_indexbuf_va,
+ unsigned input_indexbuf_num_elements)
{
- struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
- struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count);
- if (!num_prims_per_instance)
- return;
-
- unsigned num_prims = num_prims_per_instance * info->instance_count;
- unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
-
- switch (info->mode) {
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- vertices_per_prim = 3;
- output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
- gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
- break;
- default:
- unreachable("unsupported primitive type");
- return;
- }
-
- unsigned out_indexbuf_offset;
- uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
- bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
-
- /* Initialize the compute IB if it's empty. */
- if (!sctx->prim_discard_compute_ib_initialized) {
- /* 1) State initialization. */
- sctx->compute_gds_offset = 0;
- sctx->compute_ib_last_shader = NULL;
-
- if (sctx->last_ib_barrier_fence) {
- assert(!sctx->last_ib_barrier_buf);
- sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
- RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
- }
-
- /* 2) IB initialization. */
-
- /* This needs to be done at the beginning of IBs due to possible
- * TTM buffer moves in the kernel.
- */
- if (sctx->chip_class >= GFX10) {
- radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
- radeon_emit(cs, 0); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- radeon_emit(cs, /* GCR_CNTL */
- S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
- S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
- S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
- radeon_end();
- } else {
- si_emit_surface_sync(sctx, cs,
- S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
- S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
- S_0085F0_SH_ICACHE_ACTION_ENA(1) |
- S_0085F0_SH_KCACHE_ACTION_ENA(1));
- }
-
- /* Restore the GDS prim restart counter if needed. */
- if (sctx->preserve_prim_restart_gds_at_flush) {
- si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM,
- sctx->wait_mem_scratch, 4);
- }
-
- si_emit_initial_compute_regs(sctx, cs);
-
- radeon_begin(cs);
- radeon_set_sh_reg(
- cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
-
- /* Only 1D grids are launched. */
- radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
-
- radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
-
- /* Disable ordered alloc for OA resources. */
- for (unsigned i = 0; i < 2; i++) {
- radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3, false);
- radeon_emit(cs, S_031074_INDEX(i));
- radeon_emit(cs, 0);
- radeon_emit(cs, S_03107C_ENABLE(0));
- }
- radeon_end();
-
- if (sctx->last_ib_barrier_buf) {
- assert(!sctx->last_ib_barrier_fence);
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
- RADEON_PRIO_FENCE);
- si_cp_wait_mem(sctx, cs,
- sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
- 1, 1, WAIT_REG_MEM_EQUAL);
- }
-
- sctx->prim_discard_compute_ib_initialized = true;
- }
-
- /* Allocate the output index buffer. */
- output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
- assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
- out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
- sctx->index_ring_offset += output_indexbuf_size;
-
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SHADER_RW_BUFFER);
- uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
-
- /* Prepare index buffer descriptors. */
- struct si_resource *indexbuf_desc = NULL;
- unsigned indexbuf_desc_offset;
- unsigned desc_size = 12 * 4;
- uint32_t *desc;
-
- u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
- &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
- radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
-
- /* Input index buffer. */
- desc[0] = input_indexbuf_va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
- desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT
- : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT
- : V_008F0C_IMG_FORMAT_32_UINT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] =
- S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
- : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
- : V_008F0C_BUF_DATA_FORMAT_32);
- }
-
- /* Output index buffer. */
- desc[4] = out_indexbuf_va;
- desc[5] =
- S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
- desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
-
- if (sctx->chip_class >= GFX10) {
- desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
- S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(output_indexbuf_format);
- }
-
- /* Viewport state. */
- struct si_small_prim_cull_info cull_info;
- si_get_small_prim_cull_info(sctx, &cull_info);
-
- desc[8] = fui(cull_info.scale[0]);
- desc[9] = fui(cull_info.scale[1]);
- desc[10] = fui(cull_info.translate[0]);
- desc[11] = fui(cull_info.translate[1]);
-
- /* Set user data SGPRs. */
- /* This can't be greater than 14 if we want the fastest launch rate. */
- unsigned user_sgprs = 13;
-
- uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
- unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
- unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
- uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
- uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
- uint64_t vb_desc_va = sctx->vb_descriptors_buffer
- ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
- : 0;
- unsigned gds_offset, gds_size;
- struct si_fast_udiv_info32 num_prims_udiv = {};
-
- if (info->instance_count > 1)
- num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
-
- /* Limitations on how these two are packed in the user SGPR. */
- assert(num_prims_udiv.post_shift < 32);
- assert(num_prims_per_instance < 1 << 27);
-
- si_resource_reference(&indexbuf_desc, NULL);
-
- bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
-
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- gds_offset = sctx->compute_gds_offset;
- gds_size = primitive_restart ? 8 : 4;
- sctx->compute_gds_offset += gds_size;
-
- /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
- * The remainder of the GDS will be cleared after the dispatch packet
- * in parallel with compute shaders.
- */
- if (first_dispatch) {
- radeon_begin(cs);
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
- radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
- radeon_emit(cs, gds_offset);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0); /* value to write */
- if (gds_size == 8)
- radeon_emit(cs, 0);
- radeon_end();
- }
- }
-
- /* Set shader registers. */
- struct si_shader *shader = sctx->cs_prim_discard_state.current;
-
- if (shader != sctx->compute_ib_last_shader) {
- radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
- RADEON_PRIO_SHADER_BINARY);
- uint64_t shader_va = shader->bo->gpu_address;
-
- assert(shader->config.scratch_bytes_per_wave == 0);
- assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
-
- radeon_begin(cs);
- radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
- radeon_emit(cs, shader_va >> 8);
- radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
- radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
- radeon_emit(
- cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
- S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
- S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
- S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
- radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
- S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
- S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
- S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
- S_00B84C_LDS_SIZE(shader->config.lds_size));
-
- radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
- ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
- MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
- radeon_end();
- sctx->compute_ib_last_shader = shader;
- }
-
- STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
-
- /* Big draw calls are split into smaller dispatches and draw packets. */
- for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
- unsigned num_subdraw_prims;
-
- if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
- num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
- else
- num_subdraw_prims = num_prims - start_prim;
-
- /* Small dispatches are executed back to back until a specific primitive
- * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
- * to start drawing the batch. This batching adds latency to the gfx IB,
- * but CS_DONE and REWIND are too slow.
- */
- if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
- si_compute_signal_gfx(sctx);
-
- if (sctx->compute_num_prims_in_batch == 0) {
- assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
- sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
- radeon_end();
-
- si_cp_wait_mem(
- sctx, gfx_cs,
- sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
- REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
-
- /* Use INDIRECT_BUFFER to chain to a different buffer
- * to discard the CP prefetch cache.
- */
- sctx->ws->cs_check_space(gfx_cs, 0, true);
- } else {
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
- radeon_emit(gfx_cs, 0);
- radeon_end();
- }
- }
-
- sctx->compute_num_prims_in_batch += num_subdraw_prims;
-
- uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
- uint64_t index_va = out_indexbuf_va + start_prim * 12;
-
- /* Emit the draw packet into the gfx IB. */
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
- radeon_emit(gfx_cs, num_prims * vertices_per_prim);
- radeon_emit(gfx_cs, index_va);
- radeon_emit(gfx_cs, index_va >> 32);
- radeon_emit(gfx_cs, 0);
- radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
- radeon_end();
-
- radeon_begin_again(cs);
-
- /* Continue with the compute IB. */
- if (start_prim == 0) {
- uint32_t gds_prim_restart_continue_bit = 0;
-
- if (sctx->preserve_prim_restart_gds_at_flush) {
- assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP);
- assert(start_prim < 1 << 31);
- gds_prim_restart_continue_bit = 1 << 31;
- }
-
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
- radeon_emit(cs, index_buffers_va);
- radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0
- ? count_va
- : VERTEX_COUNTER_GDS_MODE == 1
- ? gds_offset
- : start_prim | gds_prim_restart_continue_bit);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- radeon_emit(cs, vb_desc_va);
- radeon_emit(cs, vs_const_desc_va);
- radeon_emit(cs, vs_sampler_desc_va);
- radeon_emit(cs, base_vertex);
- radeon_emit(cs, info->start_instance);
- radeon_emit(cs, num_prims_udiv.multiplier);
- radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
- radeon_emit(cs, info->restart_index);
- /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
- radeon_emit(cs, fui(cull_info.small_prim_precision));
- } else {
- assert(VERTEX_COUNTER_GDS_MODE == 2);
- /* Only update the SGPRs that changed. */
- radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
- radeon_emit(cs, start_prim);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- }
-
- /* Set grid dimensions. */
- unsigned start_block = start_prim / THREADGROUP_SIZE;
- unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
- unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
-
- radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
- radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
- S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
- S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
-
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
- radeon_emit(cs, 1);
- radeon_emit(cs, 1);
- radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
- S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
- S_00B800_ORDER_MODE(0 /* launch in order */));
- radeon_end();
-
- /* This is only for unordered append. Ordered append writes this from
- * the shader.
- *
- * Note that EOP and EOS events are super slow, so emulating the event
- * in a shader is an important optimization.
- */
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL,
- count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
- EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY);
-
- /* Now that compute shaders are running, clear the remainder of GDS. */
- if (first_dispatch) {
- unsigned offset = gds_offset + gds_size;
- si_cp_dma_clear_buffer(
- sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0,
- SI_OP_CPDMA_SKIP_CHECK_CS_SPACE, SI_COHERENCY_NONE, L2_BYPASS);
- }
- }
- first_dispatch = false;
-
- assert(cs->current.cdw <= cs->current.max_dw);
- assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
- }
+ struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
+ if (!num_prims_per_instance)
+ return;
+
+ unsigned num_prims = num_prims_per_instance * info->instance_count;
+ unsigned vertices_per_prim, output_indexbuf_format;
+
+ switch (info->mode) {
+ case PIPE_PRIM_TRIANGLES:
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_TRIANGLE_FAN:
+ vertices_per_prim = 3;
+ output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
+ break;
+ default:
+ unreachable("unsupported primitive type");
+ return;
+ }
+
+ unsigned out_indexbuf_offset;
+ uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
+ bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
+
+ /* Initialize the compute IB if it's empty. */
+ if (!sctx->prim_discard_compute_ib_initialized) {
+ /* 1) State initialization. */
+ sctx->compute_gds_offset = 0;
+ sctx->compute_ib_last_shader = NULL;
+
+ if (sctx->last_ib_barrier_fence) {
+ assert(!sctx->last_ib_barrier_buf);
+ sctx->ws->cs_add_fence_dependency(gfx_cs,
+ sctx->last_ib_barrier_fence,
+ RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
+ }
+
+ /* 2) IB initialization. */
+
+ /* This needs to be done at the beginning of IBs due to possible
+ * TTM buffer moves in the kernel.
+ *
+ * TODO: update for GFX10
+ */
+ si_emit_surface_sync(sctx, cs,
+ S_0085F0_TC_ACTION_ENA(1) |
+ S_0085F0_TCL1_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
+ S_0085F0_SH_ICACHE_ACTION_ENA(1) |
+ S_0085F0_SH_KCACHE_ACTION_ENA(1));
+
+ /* Restore the GDS prim restart counter if needed. */
+ if (sctx->preserve_prim_restart_gds_at_flush) {
+ si_cp_copy_data(sctx, cs,
+ COPY_DATA_GDS, NULL, 4,
+ COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
+ }
+
+ si_emit_initial_compute_regs(sctx, cs);
+
+ radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves) |
+ S_00B860_WAVESIZE(0)); /* no scratch */
+
+ /* Only 1D grids are launched. */
+ radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
+ radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
+ S_00B820_NUM_THREAD_PARTIAL(1));
+ radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
+ S_00B824_NUM_THREAD_PARTIAL(1));
+
+ radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
+ radeon_emit(cs, 0);
+ radeon_emit(cs, 0);
+
+ /* Disable ordered alloc for OA resources. */
+ for (unsigned i = 0; i < 2; i++) {
+ radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
+ radeon_emit(cs, S_031074_INDEX(i));
+ radeon_emit(cs, 0);
+ radeon_emit(cs, S_03107C_ENABLE(0));
+ }
+
+ if (sctx->last_ib_barrier_buf) {
+ assert(!sctx->last_ib_barrier_fence);
+ radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
+ RADEON_USAGE_READ, RADEON_PRIO_FENCE);
+ si_cp_wait_mem(sctx, cs,
+ sctx->last_ib_barrier_buf->gpu_address +
+ sctx->last_ib_barrier_buf_offset, 1, 1,
+ WAIT_REG_MEM_EQUAL);
+ }
+
+ sctx->prim_discard_compute_ib_initialized = true;
+ }
+
+ /* Allocate the output index buffer. */
+ output_indexbuf_size = align(output_indexbuf_size,
+ sctx->screen->info.tcc_cache_line_size);
+ assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
+ out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
+ sctx->index_ring_offset += output_indexbuf_size;
+
+ radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RW_BUFFER);
+ uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
+
+ /* Prepare index buffer descriptors. */
+ struct si_resource *indexbuf_desc = NULL;
+ unsigned indexbuf_desc_offset;
+ unsigned desc_size = 12 * 4;
+ uint32_t *desc;
+
+ u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
+ si_optimal_tcc_alignment(sctx, desc_size),
+ &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
+ (void**)&desc);
+ radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
+
+ /* Input index buffer. */
+ desc[0] = input_indexbuf_va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
+ S_008F04_STRIDE(index_size);
+ desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
+ index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
+ V_008F0C_BUF_DATA_FORMAT_32);
+
+ /* Output index buffer. */
+ desc[4] = out_indexbuf_va;
+ desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
+ S_008F04_STRIDE(vertices_per_prim * 4);
+ desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
+ desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+ S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(output_indexbuf_format);
+
+ /* Viewport state.
+ * This is needed by the small primitive culling, because it's done
+ * in screen space.
+ */
+ float scale[2], translate[2];
+
+ scale[0] = sctx->viewports.states[0].scale[0];
+ scale[1] = sctx->viewports.states[0].scale[1];
+ translate[0] = sctx->viewports.states[0].translate[0];
+ translate[1] = sctx->viewports.states[0].translate[1];
+
+ /* The viewport shouldn't flip the X axis for the small prim culling to work. */
+ assert(-scale[0] + translate[0] <= scale[0] + translate[0]);
+
+ /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
+ * This is because the viewport transformation inverts the clip space
+ * bounding box, so min becomes max, which breaks small primitive
+ * culling.
+ */
+ if (sctx->viewports.y_inverted) {
+ scale[1] = -scale[1];
+ translate[1] = -translate[1];
+ }
+
+ /* Scale the framebuffer up, so that samples become pixels and small
+ * primitive culling is the same for all sample counts.
+ * This only works with the standard DX sample positions, because
+ * the samples are evenly spaced on both X and Y axes.
+ */
+ unsigned num_samples = sctx->framebuffer.nr_samples;
+ assert(num_samples >= 1);
+
+ for (unsigned i = 0; i < 2; i++) {
+ scale[i] *= num_samples;
+ translate[i] *= num_samples;
+ }
+
+ desc[8] = fui(scale[0]);
+ desc[9] = fui(scale[1]);
+ desc[10] = fui(translate[0]);
+ desc[11] = fui(translate[1]);
+
+ /* Better subpixel precision increases the efficiency of small
+ * primitive culling. */
+ unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+ float small_prim_cull_precision;
+
+ if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+ small_prim_cull_precision = num_samples / 4096.0;
+ else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+ small_prim_cull_precision = num_samples / 1024.0;
+ else
+ small_prim_cull_precision = num_samples / 256.0;
+
+ /* Set user data SGPRs. */
+ /* This can't be greater than 14 if we want the fastest launch rate. */
+ unsigned user_sgprs = 13;
+
+ uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
+ unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
+ unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
+ uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
+ uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
+ uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
+ sctx->vb_descriptors_buffer->gpu_address +
+ sctx->vb_descriptors_offset : 0;
+ unsigned gds_offset, gds_size;
+ struct si_fast_udiv_info32 num_prims_udiv = {};
+
+ if (info->instance_count > 1)
+ num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
+
+ /* Limitations on how these two are packed in the user SGPR. */
+ assert(num_prims_udiv.post_shift < 32);
+ assert(num_prims_per_instance < 1 << 27);
+
+ si_resource_reference(&indexbuf_desc, NULL);
+
+ bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
+
+ if (VERTEX_COUNTER_GDS_MODE == 1) {
+ gds_offset = sctx->compute_gds_offset;
+ gds_size = primitive_restart ? 8 : 4;
+ sctx->compute_gds_offset += gds_size;
+
+ /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
+ * The remainder of the GDS will be cleared after the dispatch packet
+ * in parallel with compute shaders.
+ */
+ if (first_dispatch) {
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
+ radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
+ radeon_emit(cs, gds_offset);
+ radeon_emit(cs, 0);
+ radeon_emit(cs, 0); /* value to write */
+ if (gds_size == 8)
+ radeon_emit(cs, 0);
+ }
+ }
+
+ /* Set shader registers. */
+ struct si_shader *shader = sctx->cs_prim_discard_state.current;
+
+ if (shader != sctx->compute_ib_last_shader) {
+ radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
+ RADEON_PRIO_SHADER_BINARY);
+ uint64_t shader_va = shader->bo->gpu_address;
+
+ assert(shader->config.scratch_bytes_per_wave == 0);
+ assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
+
+ radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+ radeon_emit(cs, shader_va >> 8);
+ radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+
+ radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
+ S_00B848_FLOAT_MODE(shader->config.float_mode) |
+ S_00B848_DX10_CLAMP(1));
+ radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
+ S_00B84C_USER_SGPR(user_sgprs) |
+ S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
+ S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
+ S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
+ S_00B84C_LDS_SIZE(shader->config.lds_size));
+
+ radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+ ac_get_compute_resource_limits(&sctx->screen->info,
+ WAVES_PER_TG,
+ MAX_WAVES_PER_SH,
+ THREADGROUPS_PER_CU));
+ sctx->compute_ib_last_shader = shader;
+ }
+
+ STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
+
+ /* Big draw calls are split into smaller dispatches and draw packets. */
+ for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
+ unsigned num_subdraw_prims;
+
+ if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
+ num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
+ else
+ num_subdraw_prims = num_prims - start_prim;
+
+ /* Small dispatches are executed back to back until a specific primitive
+ * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
+ * to start drawing the batch. This batching adds latency to the gfx IB,
+ * but CS_DONE and REWIND are too slow.
+ */
+ if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
+ si_compute_signal_gfx(sctx);
+
+ if (sctx->compute_num_prims_in_batch == 0) {
+ assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
+ sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
+
+ if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
+ radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(gfx_cs, 0);
+
+ si_cp_wait_mem(sctx, gfx_cs,
+ sctx->compute_rewind_va |
+ (uint64_t)sctx->screen->info.address32_hi << 32,
+ REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
+ WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
+
+ /* Use INDIRECT_BUFFER to chain to a different buffer
+ * to discard the CP prefetch cache.
+ */
+ sctx->ws->cs_check_space(gfx_cs, 0, true);
+ } else {
+ radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
+ radeon_emit(gfx_cs, 0);
+ }
+ }
+
+ sctx->compute_num_prims_in_batch += num_subdraw_prims;
+
+ uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
+ uint64_t index_va = out_indexbuf_va + start_prim * 12;
+
+ /* Emit the draw packet into the gfx IB. */
+ radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
+ radeon_emit(gfx_cs, num_prims * vertices_per_prim);
+ radeon_emit(gfx_cs, index_va);
+ radeon_emit(gfx_cs, index_va >> 32);
+ radeon_emit(gfx_cs, 0);
+ radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
+
+ /* Continue with the compute IB. */
+ if (start_prim == 0) {
+ uint32_t gds_prim_restart_continue_bit = 0;
+
+ if (sctx->preserve_prim_restart_gds_at_flush) {
+ assert(primitive_restart &&
+ info->mode == PIPE_PRIM_TRIANGLE_STRIP);
+ assert(start_prim < 1 << 31);
+ gds_prim_restart_continue_bit = 1 << 31;
+ }
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
+ radeon_emit(cs, index_buffers_va);
+ radeon_emit(cs,
+ VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
+ VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
+ start_prim |
+ gds_prim_restart_continue_bit);
+ radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, vb_desc_va);
+ radeon_emit(cs, vs_const_desc_va);
+ radeon_emit(cs, vs_sampler_desc_va);
+ radeon_emit(cs, base_vertex);
+ radeon_emit(cs, info->start_instance);
+ radeon_emit(cs, num_prims_udiv.multiplier);
+ radeon_emit(cs, num_prims_udiv.post_shift |
+ (num_prims_per_instance << 5));
+ radeon_emit(cs, info->restart_index);
+ /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
+ radeon_emit(cs, fui(small_prim_cull_precision));
+ } else {
+ assert(VERTEX_COUNTER_GDS_MODE == 2);
+ /* Only update the SGPRs that changed. */
+ radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
+ radeon_emit(cs, start_prim);
+ radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+ radeon_emit(cs, count_va);
+ }
+
+ /* Set grid dimensions. */
+ unsigned start_block = start_prim / THREADGROUP_SIZE;
+ unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
+ unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
+
+ radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
+ radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
+ S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
+ S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
+ S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
+ S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
+ S_00B800_ORDER_MODE(0 /* launch in order */));
+
+ /* This is only for unordered append. Ordered append writes this from
+ * the shader.
+ *
+ * Note that EOP and EOS events are super slow, so emulating the event
+ * in a shader is an important optimization.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 1) {
+ si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
+ sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+ EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_GDS,
+ NULL,
+ count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
+ EOP_DATA_GDS(gds_offset / 4, 1),
+ SI_NOT_QUERY);
+
+ /* Now that compute shaders are running, clear the remainder of GDS. */
+ if (first_dispatch) {
+ unsigned offset = gds_offset + gds_size;
+ si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
+ GDS_SIZE_UNORDERED - offset,
+ 0,
+ SI_CPDMA_SKIP_CHECK_CS_SPACE |
+ SI_CPDMA_SKIP_GFX_SYNC |
+ SI_CPDMA_SKIP_SYNC_BEFORE,
+ SI_COHERENCY_NONE, L2_BYPASS);
+ }
+ }
+ first_dispatch = false;
+
+ assert(cs->current.cdw <= cs->current.max_dw);
+ assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
+ }
}
diff --git a/lib/mesa/src/gallium/drivers/virgl/Android.mk b/lib/mesa/src/gallium/drivers/virgl/Android.mk
index a64828e90..c06c16558 100644
--- a/lib/mesa/src/gallium/drivers/virgl/Android.mk
+++ b/lib/mesa/src/gallium/drivers/virgl/Android.mk
@@ -30,7 +30,22 @@ LOCAL_SRC_FILES := \
LOCAL_MODULE := libmesa_pipe_virgl
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+intermediates := $(call local-generated-sources-dir)
+LOCAL_GENERATED_SOURCES := $(intermediates)/virgl/virgl_driinfo.h
+
+GEN_DRIINFO_INPUTS := \
+ $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \
+ $(LOCAL_PATH)/virgl_driinfo.h.in
+
+MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py
+
+# Generate virgl_driinfo.h by merging the common gallium driinfo header with
+# the virgl-specific template. The recipe uses $^ (this rule's own
+# prerequisites, fixed when the rule is parsed) rather than the global
+# MERGE_DRIINFO / GEN_DRIINFO_INPUTS variables: recipes are expanded only when
+# they run, so a later Android.mk reassigning those names would otherwise
+# make this rule merge the wrong inputs.
+$(intermediates)/virgl/virgl_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS)
+	@mkdir -p $(dir $@)
+	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+	$(hide) $(MESA_PYTHON2) $^ > $@ || ($(RM) $@; false)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/gallium/targets/dri/Android.mk b/lib/mesa/src/gallium/targets/dri/Android.mk
index 6ec4055f1..c7d564a23 100644
--- a/lib/mesa/src/gallium/targets/dri/Android.mk
+++ b/lib/mesa/src/gallium/targets/dri/Android.mk
@@ -42,9 +42,7 @@ LOCAL_LDFLAGS := \
LOCAL_SHARED_LIBRARIES := \
libdl \
libglapi \
- libz \
- liblog \
- libsync
+ libz
# If Android version >=8 MESA should static link libexpat else should dynamic link
ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
@@ -56,20 +54,9 @@ LOCAL_SHARED_LIBRARIES += \
endif
LOCAL_STATIC_LIBRARIES += \
- libetnaviv_drm \
- libfreedreno_common \
libfreedreno_drm \
- libfreedreno_ir2 \
libfreedreno_ir3 \
- libfreedreno_perfcntrs \
- libmesa_gallium \
- libpanfrost_lib \
- libpanfrost_bifrost \
- libpanfrost_bifrost_disasm \
- libpanfrost_midgard \
- libpanfrost_midgard_disasm \
libpanfrost_shared \
- libpanfrost_util \
ifeq ($(USE_LIBBACKTRACE),true)
LOCAL_SHARED_LIBRARIES += libbacktrace
@@ -87,12 +74,11 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_nir \
libmesa_dri_common \
libmesa_megadriver_stub \
+ libmesa_gallium \
libmesa_pipe_loader \
libmesa_util \
libmesa_loader
-LOCAL_SHARED_LIBRARIES += libcutils
-
# sort GALLIUM_SHARED_LIBS to remove any duplicates
LOCAL_SHARED_LIBRARIES += $(sort $(GALLIUM_SHARED_LIBS))
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk
index 90f56e45b..0b8edf972 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk
@@ -21,8 +21,6 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
-ifeq ($(MESA_ENABLE_LLVM),true)
-
LOCAL_PATH := $(call my-dir)
# get C_SOURCES
@@ -48,5 +46,3 @@ ifneq ($(HAVE_GALLIUM_RADEONSI),)
$(eval GALLIUM_LIBS += $(LOCAL_MODULE) $(LOCAL_STATIC_LIBRARIES))
$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
endif
-
-endif # MESA_ENABLE_LLVM==true
diff --git a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk
index 31edabd68..32091bea0 100644
--- a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk
+++ b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk
@@ -25,7 +25,7 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(C_SOURCES)
-LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm
+LOCAL_SHARED_LIBRARIES := libdrm_etnaviv
LOCAL_MODULE := libmesa_winsys_etnaviv
diff --git a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk
index 669559583..09edab391 100644
--- a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk
+++ b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk
@@ -27,9 +27,6 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(C_SOURCES)
-LOCAL_C_INCLUDES := \
- $(MESA_TOP)/src/freedreno/common
-
LOCAL_SHARED_LIBRARIES := libdrm_freedreno
LOCAL_STATIC_LIBRARIES := libfreedreno_registers
diff --git a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk
index f3d9df79c..5e2500774 100644
--- a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk
+++ b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk
@@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES)
LOCAL_MODULE := libmesa_winsys_virgl
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio
-
LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common
include $(GALLIUM_COMMON_MK)
diff --git a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk
index 454d830d0..5b33f6771 100644
--- a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk
+++ b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk
@@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES)
LOCAL_MODULE := libmesa_winsys_virgl_vtest
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio
-
LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common
include $(GALLIUM_COMMON_MK)