Diffstat (limited to 'lib/mesa')
62 files changed, 4274 insertions, 3864 deletions
diff --git a/lib/mesa/Android.common.mk b/lib/mesa/Android.common.mk index 7ef6a90a1..6bf64f55c 100644 --- a/lib/mesa/Android.common.mk +++ b/lib/mesa/Android.common.mk @@ -39,7 +39,7 @@ LOCAL_CFLAGS += \ -Wno-initializer-overrides \ -Wno-mismatched-tags \ -DPACKAGE_VERSION=\"$(MESA_VERSION)\" \ - -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/-/issues\" + -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/issues\" # XXX: The following __STDC_*_MACROS defines should not be needed. # It's likely due to a bug elsewhere, but let's temporarily add them @@ -73,7 +73,6 @@ LOCAL_CFLAGS += \ -DHAVE_LINUX_FUTEX_H \ -DHAVE_ENDIAN_H \ -DHAVE_ZLIB \ - -DHAVE_COMPRESSION \ -DMAJOR_IN_SYSMACROS \ -DVK_USE_PLATFORM_ANDROID_KHR \ -fvisibility=hidden \ @@ -104,9 +103,12 @@ ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 26 && echo true),true) LOCAL_CFLAGS += -DHAVE_SYS_SHM_H endif +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_CFLAGS += \ -DUSE_X86_ASM + +endif endif ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_CFLAGS_arm += -DUSE_ARM_ASM diff --git a/lib/mesa/Android.mk b/lib/mesa/Android.mk index 07eba7b83..e86c9bd51 100644 --- a/lib/mesa/Android.mk +++ b/lib/mesa/Android.mk @@ -24,7 +24,7 @@ # BOARD_GPU_DRIVERS should be defined. The valid values are # # classic drivers: i915 i965 -# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima panfrost +# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima # # The main target is libGLES_mesa. For each classic driver enabled, a DRI # module will also be built. DRI modules will be loaded by libGLES_mesa. @@ -43,12 +43,6 @@ MESA_DRI_LDFLAGS := -Wl,--build-id=sha1 MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk MESA_PYTHON2 := python -MESA_PYTHON3 := python3 -ifeq ($(filter 5 6 7 8 9 10, $(MESA_ANDROID_MAJOR_VERSION)),) -MESA_LEX := M4=$(M4) $(LEX) -else -MESA_LEX := $(LEX) -endif # Lists to convert driver names to boolean variables # in form of <driver name>.<boolean make variable> @@ -67,8 +61,7 @@ gallium_drivers := \ virgl.HAVE_GALLIUM_VIRGL \ etnaviv.HAVE_GALLIUM_ETNAVIV \ iris.HAVE_GALLIUM_IRIS \ - lima.HAVE_GALLIUM_LIMA \ - panfrost.HAVE_GALLIUM_PANFROST + lima.HAVE_GALLIUM_LIMA ifeq ($(BOARD_GPU_DRIVERS),all) MESA_BUILD_CLASSIC := $(filter HAVE_%, $(subst ., , $(classic_drivers))) @@ -90,20 +83,33 @@ endif $(foreach d, $(MESA_BUILD_CLASSIC) $(MESA_BUILD_GALLIUM), $(eval $(d) := true)) +# host and target must be the same arch to generate matypes.h +ifeq ($(TARGET_ARCH),$(HOST_ARCH)) +MESA_ENABLE_ASM := true +else +MESA_ENABLE_ASM := false +endif + ifneq ($(filter true, $(HAVE_GALLIUM_RADEONSI)),) MESA_ENABLE_LLVM := true endif define mesa-build-with-llvm - $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7), \ + $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5), \ $(warning Unsupported LLVM version in Android $(MESA_ANDROID_MAJOR_VERSION)),) \ - $(eval LOCAL_CFLAGS += -DLLVM_AVAILABLE -DDRAW_LLVM_AVAILABLE -DLLVM_IS_SHARED=1 -DMESA_LLVM_VERSION_STRING=\"3.9\") \ + $(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_STRING=\"3.7\")) \ + $(if $(filter 7,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_STRING=\"3.8\")) \ + $(if $(filter 8,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ + $(if $(filter 
P,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ $(eval LOCAL_SHARED_LIBRARIES += libLLVM) endef # add subdirectories SUBDIRS := \ - src/etnaviv \ src/freedreno \ src/gbm \ src/loader \ diff --git a/lib/mesa/REVIEWERS b/lib/mesa/REVIEWERS index ece5394cf..921e0ba38 100644 --- a/lib/mesa/REVIEWERS +++ b/lib/mesa/REVIEWERS @@ -1,11 +1,30 @@ Overview: This file is similar in syntax (or more precisely a subset) of what is - used by the MAINTAINERS file in the linux kernel. + used by the MAINTAINERS file in the linux kernel. Some fields do not + apply, for example, in all cases, send patches to: + + mesa-dev@lists.freedesktop.org + + and in all cases the patchwork instance is: + + https://patchwork.freedesktop.org/project/mesa/ + The purpose is not exactly the same as the MAINTAINERS file in the linux kernel, as there are no official/formal maintainers of different subsystems in mesa, but is meant to give an idea of who to CC for - various patches for review. + various patches for review, and to allow the use of + scripts/get_reviewer.pl as git --cc-cmd. + +Usage: + + When sending patches: + + git send-email --cc-cmd ./scripts/get_reviewer.pl ... + + Or to configure as default: + + git config sendemail.cccmd ./scripts/get_reviewer.pl Descriptions of section entries: @@ -17,6 +36,14 @@ Descriptions of section entries: F: drivers/net/* all files in drivers/net, but not below F: */net/* all files in "any top level directory"/net One pattern per line. Multiple F: lines acceptable. + N: Files and directories with regex patterns. + N: [^a-z]tegra all files whose path contains the word tegra + One pattern per line. Multiple N: lines acceptable. + scripts/get_maintainer.pl has different behavior for files that + match F: pattern and matches of N: patterns. By default, + get_maintainer will not look at git log history when an F: pattern + match occurs. When an N: match occurs, git log history is used + to also notify the people that have git commit signatures.
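The F:/N: distinction described above is easy to misread: F: entries are path patterns, N: entries are regular expressions, and only N: matches pull extra reviewers from git log history. A minimal sketch of the matching semantics in Python, purely illustrative, with hypothetical helper names (the real matcher is scripts/get_reviewer.pl, written in Perl):

import fnmatch
import re

def matches_f(pattern: str, path: str) -> bool:
    # F: a trailing '/' means "everything below that directory".
    # Glob matching is approximated with fnmatch here; the real script
    # keeps '*' from crossing '/' boundaries ("but not below").
    if pattern.endswith('/'):
        return path.startswith(pattern)
    return fnmatch.fnmatch(path, pattern)

def matches_n(pattern: str, path: str) -> bool:
    # N: entries are regexes searched anywhere in the path.
    return re.search(pattern, path) is not None

# '[^a-z]tegra' from the example above: 'tegra' not preceded by a
# lowercase letter, so 'integrator' does not count as a match.
assert matches_f('src/gallium/targets/', 'src/gallium/targets/haiku-softpipe/x.c')
assert matches_n('[^a-z]tegra', 'drivers/gpu/host1x/tegra_dc.c')
assert not matches_n('[^a-z]tegra', 'drivers/integrator.c')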
Maintainers List (try to look for most precise areas first) @@ -53,7 +80,7 @@ HAIKU R: Alexander von Gluck IV <kallisti5@unixzen.com> F: include/HaikuGL/ F: src/egl/drivers/haiku/ -F: src/gallium/frontends/hgl/ +F: src/gallium/state_trackers/hgl/ F: src/gallium/targets/haiku-softpipe/ F: src/gallium/winsys/sw/hgl/ F: src/hgl/ @@ -67,6 +94,11 @@ GALLIUM TARGETS R: Emil Velikov <emil.l.velikov@gmail.com> F: src/gallium/targets/ +SCONS BUILD +F: scons/ +F: */SConscript* +F: */Makefile.sources + ANDROID BUILD R: Emil Velikov <emil.l.velikov@gmail.com> R: Rob Herring <robh@kernel.org> @@ -103,13 +135,3 @@ VULKAN R: Eric Engestrom <eric@engestrom.ch> F: src/vulkan/ F: include/vulkan/ - -VMWARE DRIVER -R: Brian Paul <brianp@vmware.com> -R: Charmaine Lee <charmainel@vmware.com> -F: src/gallium/drivers/svga/ - -VMWARE WINSYS CODE -R: Thomas Hellstrom <thellstrom@vmware.com> -R: Deepak Rawat <drawat@vmware.com> -F: src/gallium/winsys/svga/ diff --git a/lib/mesa/src/amd/Android.addrlib.mk b/lib/mesa/src/amd/Android.addrlib.mk index 4e13ae1fd..eec78fc8b 100644 --- a/lib/mesa/src/amd/Android.addrlib.mk +++ b/lib/mesa/src/amd/Android.addrlib.mk @@ -30,8 +30,6 @@ LOCAL_MODULE := libmesa_amdgpu_addrlib LOCAL_SRC_FILES := $(ADDRLIB_FILES) -LOCAL_CPPFLAGS += -DLITTLEENDIAN_CPU - LOCAL_C_INCLUDES := \ $(MESA_TOP)/src \ $(MESA_TOP)/src/amd/common \ diff --git a/lib/mesa/src/amd/Android.common.mk b/lib/mesa/src/amd/Android.common.mk index 23bf129d1..d5a266215 100644 --- a/lib/mesa/src/amd/Android.common.mk +++ b/lib/mesa/src/amd/Android.common.mk @@ -20,8 +20,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -ifeq ($(MESA_ENABLE_LLVM),true) - # --------------------------------------- # Build libmesa_amd_common # --------------------------------------- @@ -32,8 +30,9 @@ LOCAL_MODULE := libmesa_amd_common LOCAL_SRC_FILES := \ $(AMD_COMMON_FILES) \ - $(AMD_COMMON_LLVM_FILES) \ - $(AMD_DEBUG_FILES) + $(AMD_COMPILER_FILES) \ + $(AMD_DEBUG_FILES) \ + $(AMD_NIR_FILES) LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions @@ -42,23 +41,14 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES intermediates := $(call local-generated-sources-dir) LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, $(AMD_GENERATED_FILES)) -AMD_JSON_FILES := \ - $(LOCAL_PATH)/registers/gfx6.json \ - $(LOCAL_PATH)/registers/gfx7.json \ - $(LOCAL_PATH)/registers/gfx8.json \ - $(LOCAL_PATH)/registers/gfx81.json \ - $(LOCAL_PATH)/registers/gfx9.json \ - $(LOCAL_PATH)/registers/gfx10.json \ - $(LOCAL_PATH)/registers/gfx103.json \ - $(LOCAL_PATH)/registers/pkt3.json \ - $(LOCAL_PATH)/registers/gfx10-rsrc.json \ - $(LOCAL_PATH)/registers/registers-manually-defined.json - SID_TABLES := $(LOCAL_PATH)/common/sid_tables.py SID_TABLES_INPUTS := \ $(LOCAL_PATH)/common/sid.h \ - $(AMD_JSON_FILES) + $(LOCAL_PATH)/registers/amdgfxregs.json \ + $(LOCAL_PATH)/registers/pkt3.json \ + $(LOCAL_PATH)/registers/gfx10.json \ + $(LOCAL_PATH)/registers/gfx10-rsrc.json $(intermediates)/common/sid_tables.h: $(SID_TABLES) $(SID_TABLES_INPUTS) @mkdir -p $(dir $@) @@ -68,34 +58,21 @@ $(intermediates)/common/sid_tables.h: $(SID_TABLES) $(SID_TABLES_INPUTS) AMDGFXREGS := $(LOCAL_PATH)/registers/makeregheader.py AMDGFXREGS_INPUTS := \ - $(AMD_JSON_FILES) + $(LOCAL_PATH)/registers/amdgfxregs.json \ + $(LOCAL_PATH)/registers/pkt3.json \ + $(LOCAL_PATH)/registers/gfx10.json \ + $(LOCAL_PATH)/registers/gfx10-rsrc.json $(intermediates)/common/amdgfxregs.h: $(AMDGFXREGS) 
$(AMDGFXREGS_INPUTS) @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) $(MESA_PYTHON2) $(AMDGFXREGS) $(AMDGFXREGS_INPUTS) --sort address --guard AMDGFXREGS_H > $@ || ($(RM) $@; false) -GEN10_FORMAT_TABLE_INPUTS := \ - $(MESA_TOP)/src/util/format/u_format.csv \ - $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json - -GEN10_FORMAT_TABLE_DEP := \ - $(MESA_TOP)/src/amd/registers/regdb.py - -GEN10_FORMAT_TABLE := $(LOCAL_PATH)/common/gfx10_format_table.py - -$(intermediates)/common/gfx10_format_table.c: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) - LOCAL_C_INCLUDES := \ $(MESA_TOP)/include \ $(MESA_TOP)/src \ $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ $(MESA_TOP)/src/compiler \ - $(MESA_TOP)/src/compiler/nir \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ @@ -104,7 +81,6 @@ LOCAL_C_INCLUDES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(LOCAL_PATH)/common \ - $(LOCAL_PATH)/llvm \ $(intermediates)/common LOCAL_SHARED_LIBRARIES := \ @@ -120,5 +96,3 @@ $(call mesa-build-with-llvm) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) - -endif # MESA_ENABLE_LLVM == true diff --git a/lib/mesa/src/amd/Android.mk b/lib/mesa/src/amd/Android.mk index c9dbeafde..e40e7da01 100644 --- a/lib/mesa/src/amd/Android.mk +++ b/lib/mesa/src/amd/Android.mk @@ -28,6 +28,5 @@ include $(LOCAL_PATH)/Makefile.sources include $(LOCAL_PATH)/Android.addrlib.mk include $(LOCAL_PATH)/Android.common.mk ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),) -include $(LOCAL_PATH)/Android.compiler.mk include $(LOCAL_PATH)/vulkan/Android.mk endif diff --git a/lib/mesa/src/amd/vulkan/Android.mk b/lib/mesa/src/amd/vulkan/Android.mk index f0eb5119a..d0002b8aa 100644 --- a/lib/mesa/src/amd/vulkan/Android.mk +++ b/lib/mesa/src/amd/vulkan/Android.mk @@ -30,7 +30,6 @@ include $(LOCAL_PATH)/Makefile.sources RADV_COMMON_INCLUDES := \ $(MESA_TOP)/include \ $(MESA_TOP)/src/ \ - $(MESA_TOP)/src/amd/vulkan \ $(MESA_TOP)/src/vulkan/wsi \ $(MESA_TOP)/src/vulkan/util \ $(MESA_TOP)/src/amd \ @@ -68,7 +67,6 @@ $(call mesa-build-with-llvm) LOCAL_C_INCLUDES := $(RADV_COMMON_INCLUDES) LOCAL_STATIC_LIBRARIES := \ - libmesa_aco \ libmesa_amd_common \ libmesa_nir \ libmesa_util \ @@ -77,23 +75,58 @@ LOCAL_STATIC_LIBRARIES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.c LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.h +LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.c +LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.h +LOCAL_GENERATED_SOURCES += $(intermediates)/vk_format_table.c +LOCAL_GENERATED_SOURCES += $(intermediates)/gfx10_format_table.h -RADV_ENTRYPOINTS_SCRIPT := $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py +RADV_ENTRYPOINTS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_entrypoints_gen.py +RADV_EXTENSIONS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_extensions.py +VK_FORMAT_TABLE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_table.py +VK_FORMAT_PARSE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_parse.py vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml +vk_format_layout_csv = $(MESA_TOP)/src/amd/vulkan/vk_format_layout.csv $(intermediates)/radv_entrypoints.c: $(RADV_ENTRYPOINTS_SCRIPT) \ + $(RADV_EXTENSIONS_SCRIPT) \ $(vulkan_api_xml) @mkdir 
-p $(dir $@) $(MESA_PYTHON2) $(RADV_ENTRYPOINTS_SCRIPT) \ --xml $(vulkan_api_xml) \ - --proto --weak \ - --out-c $@ \ - --out-h $(addsuffix .h,$(basename $@)) \ - --prefix radv --device-prefix sqtt + --outdir $(dir $@) $(intermediates)/radv_entrypoints.h: $(intermediates)/radv_entrypoints.c +$(intermediates)/radv_extensions.c: $(RADV_EXTENSIONS_SCRIPT) $(vulkan_api_xml) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(RADV_EXTENSIONS_SCRIPT) \ + --xml $(vulkan_api_xml) \ + --out-c $@ \ + --out-h $(addsuffix .h,$(basename $@)) + +$(intermediates)/radv_extensions.h: $(intermediates)/radv_extensions.c + +$(intermediates)/vk_format_table.c: $(VK_FORMAT_TABLE_SCRIPT) \ + $(VK_FORMAT_PARSE_SCRIPT) \ + $(vk_format_layout_csv) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(VK_FORMAT_TABLE_SCRIPT) $(vk_format_layout_csv) > $@ + +RADV_GEN10_FORMAT_TABLE_INPUTS := \ + $(MESA_TOP)/src/amd/vulkan/vk_format_layout.csv \ + $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json + +RADV_GEN10_FORMAT_TABLE_DEP := \ + $(MESA_TOP)/src/amd/registers/regdb.py + +RADV_GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py + +$(intermediates)/gfx10_format_table.h: $(RADV_GEN10_FORMAT_TABLE) $(RADV_GEN10_FORMAT_TABLE_INPUTS) $(RADV_GEN10_FORMAT_TABLE_DEP) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(RADV_GEN10_FORMAT_TABLE) $(RADV_GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) + LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) LOCAL_EXPORT_C_INCLUDE_DIRS := \ @@ -134,10 +167,9 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_amdgpu_addrlib \ libmesa_amd_common \ libmesa_radv_common \ - libmesa_vulkan_util \ - libmesa_aco + libmesa_vulkan_util -LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog libcutils +LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog # If Android version >=8 MESA should static link libexpat else should dynamic link ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) diff --git a/lib/mesa/src/compiler/Android.glsl.gen.mk b/lib/mesa/src/compiler/Android.glsl.gen.mk index f654c869f..1308de2db 100644 --- a/lib/mesa/src/compiler/Android.glsl.gen.mk +++ b/lib/mesa/src/compiler/Android.glsl.gen.mk @@ -53,7 +53,7 @@ MESA_GEN_GLSL_H := $(addprefix $(call local-generated-sources-dir)/, \ define local-l-or-ll-to-c-or-cpp @mkdir -p $(dir $@) @echo "Mesa Lex: $(PRIVATE_MODULE) <= $<" - $(hide) $(MESA_LEX) --nounistd -o$@ $< + $(hide) $(LEX) --nounistd -o$@ $< endef define glsl_local-y-to-c-and-h @@ -102,6 +102,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< strings > $@ -$(intermediates)/glsl/float64_glsl.h: $(MESA_TOP)/src/util/xxd.py +$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@ diff --git a/lib/mesa/src/compiler/Android.nir.gen.mk b/lib/mesa/src/compiler/Android.nir.gen.mk index ad640dd04..e753bb77a 100644 --- a/lib/mesa/src/compiler/Android.nir.gen.mk +++ b/lib/mesa/src/compiler/Android.nir.gen.mk @@ -33,7 +33,6 @@ LOCAL_SRC_FILES := $(LOCAL_SRC_FILES) LOCAL_C_INCLUDES += \ $(intermediates)/nir \ - $(intermediates)/spirv \ $(MESA_TOP)/src/compiler/nir LOCAL_EXPORT_C_INCLUDE_DIRS += \ @@ -100,11 +99,7 @@ $(intermediates)/spirv/spirv_info.c: $(LOCAL_PATH)/spirv/spirv_info_c.py $(LOCAL @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) 
-$(intermediates)/spirv/vtn_gather_types.c: $(LOCAL_PATH)/spirv/vtn_gather_types_c.py $(LOCAL_PATH)/spirv/spirv.core.grammar.json - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) - -$(intermediates)/spirv/vtn_generator_ids.h: $(LOCAL_PATH)/spirv/vtn_generator_ids_h.py $(LOCAL_PATH)/spirv/spir-v.xml +$(intermediates)/spirv/vtn_gather_types.c:: $(LOCAL_PATH)/spirv/vtn_gather_types_c.py $(LOCAL_PATH)/spirv/spirv.core.grammar.json @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) @@ -117,8 +112,3 @@ nir_intrinsics_c_gen := $(LOCAL_PATH)/nir/nir_intrinsics_c.py $(intermediates)/nir/nir_intrinsics.c: $(LOCAL_PATH)/nir/nir_intrinsics.py $(nir_intrinsics_c_gen) @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $(nir_intrinsics_c_gen) --outdir $(dir $@) || ($(RM) $@; false) - -nir_intrinsics_indices_h_gen := $(LOCAL_PATH)/nir/nir_intrinsics_indices_h.py -$(intermediates)/nir/nir_intrinsics_indices.h: $(LOCAL_PATH)/nir/nir_intrinsics.py $(nir_intrinsics_indices_h_gen) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $(nir_intrinsics_indices_h_gen) --outdir $(dir $@) || ($(RM) $@; false) diff --git a/lib/mesa/src/egl/Android.mk b/lib/mesa/src/egl/Android.mk index 83bd442de..01c33298e 100644 --- a/lib/mesa/src/egl/Android.mk +++ b/lib/mesa/src/egl/Android.mk @@ -37,8 +37,7 @@ LOCAL_SRC_FILES := \ $(LIBEGL_C_FILES) \ $(dri2_backend_core_FILES) \ drivers/dri2/platform_device.c \ - drivers/dri2/platform_android.c \ - drivers/dri2/platform_surfaceless.c \ + drivers/dri2/platform_android.c LOCAL_CFLAGS := \ -D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \ diff --git a/lib/mesa/src/freedreno/Android.drm.mk b/lib/mesa/src/freedreno/Android.drm.mk index 0a79fcf9a..dfa9bed7d 100644 --- a/lib/mesa/src/freedreno/Android.drm.mk +++ b/lib/mesa/src/freedreno/Android.drm.mk @@ -37,7 +37,5 @@ LOCAL_C_INCLUDES := \ LOCAL_MODULE := libfreedreno_drm -LOCAL_STATIC_LIBRARIES := libfreedreno_registers - include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/Android.ir3.mk b/lib/mesa/src/freedreno/Android.ir3.mk index 0fbb8c50c..c6a9d3288 100644 --- a/lib/mesa/src/freedreno/Android.ir3.mk +++ b/lib/mesa/src/freedreno/Android.ir3.mk @@ -31,72 +31,21 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := \ $(ir3_SOURCES) -LOCAL_MODULE := libfreedreno_ir3 - -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -intermediates := $(call local-generated-sources-dir) - LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/compiler/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/prebuilt-intermediates/nir \ - $(MESA_TOP)/src/freedreno/common \ - $(MESA_TOP)/src/freedreno/ir3 \ - $(intermediates)/ir3 - -LOCAL_WHOLE_STATIC_LIBRARIES := \ - libir3decode \ - libir3encode # We need libmesa_nir to get NIR's generated include directories. 
LOCAL_STATIC_LIBRARIES := \ libmesa_nir +LOCAL_MODULE := libfreedreno_ir3 + LOCAL_GENERATED_SOURCES := \ $(MESA_GEN_GLSL_H) \ $(MESA_GEN_NIR_H) -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \ - $(ir3_GENERATED_FILES)) - -ir3_lexer_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_lexer.l - -ir3_nir_imul_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_nir_imul.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -ir3_nir_trig_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_nir_trig.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -ir3_parser_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_parser.y - -$(intermediates)/ir3/ir3_lexer.c: $(ir3_lexer_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_LEX) -o $@ $< - -$(intermediates)/ir3/ir3_nir_imul.c: $(ir3_nir_imul_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/compiler/nir > $@ - -$(intermediates)/ir3/ir3_nir_trig.c: $(ir3_nir_trig_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/compiler/nir > $@ - -$(intermediates)/ir3/ir3_parser.c: $(ir3_parser_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(BISON) $< --name-prefix=ir3_yy --output=$@ - -$(intermediates)/ir3/ir3_parser.h: $(ir3_parser_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(BISON) $< --name-prefix=ir3_yy --defines=$@ - include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/Android.mk b/lib/mesa/src/freedreno/Android.mk index a488803f9..84d0c82c2 100644 --- a/lib/mesa/src/freedreno/Android.mk +++ b/lib/mesa/src/freedreno/Android.mk @@ -25,10 +25,7 @@ LOCAL_PATH := $(call my-dir) include $(LOCAL_PATH)/Makefile.sources -include $(LOCAL_PATH)/Android.common.mk +include $(MESA_TOP)/src/gallium/drivers/freedreno/Android.gen.mk include $(LOCAL_PATH)/Android.drm.mk -include $(LOCAL_PATH)/Android.ir2.mk include $(LOCAL_PATH)/Android.ir3.mk -include $(LOCAL_PATH)/Android.isa.mk -include $(LOCAL_PATH)/Android.perfcntrs.mk include $(LOCAL_PATH)/Android.registers.mk diff --git a/lib/mesa/src/freedreno/Android.registers.mk b/lib/mesa/src/freedreno/Android.registers.mk index f66e57794..085eb5f07 100644 --- a/lib/mesa/src/freedreno/Android.registers.mk +++ b/lib/mesa/src/freedreno/Android.registers.mk @@ -42,59 +42,48 @@ $(intermediates)/dummy.c: @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) touch $@ -RNN_SRC_PATH := $(MESA_TOP)/src/freedreno/registers/ - # This is the list of auto-generated files headers -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/registers/adreno/, \ - a2xx.xml.h a3xx.xml.h a4xx.xml.h a5xx.xml.h a6xx.xml.h a6xx-pack.xml.h adreno_common.xml.h adreno_pm4.xml.h adreno-pm4-pack.xml.h) - -$(intermediates)/registers/adreno/a2xx.xml.h: $(LOCAL_PATH)/registers/adreno/a2xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/registers/, \ + a2xx.xml.h a3xx.xml.h a4xx.xml.h a5xx.xml.h a6xx.xml.h adreno_common.xml.h adreno_pm4.xml.h) -$(intermediates)/registers/adreno/a3xx.xml.h: $(LOCAL_PATH)/registers/adreno/a3xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a2xx.xml.h: $(LOCAL_PATH)/registers/a2xx.xml 
$(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a4xx.xml.h: $(LOCAL_PATH)/registers/adreno/a4xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a3xx.xml.h: $(LOCAL_PATH)/registers/a3xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a5xx.xml.h: $(LOCAL_PATH)/registers/adreno/a5xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a4xx.xml.h: $(LOCAL_PATH)/registers/a4xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a6xx.xml.h: $(LOCAL_PATH)/registers/adreno/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a5xx.xml.h: $(LOCAL_PATH)/registers/a5xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a6xx-pack.xml.h: $(LOCAL_PATH)/registers/adreno/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a6xx.xml.h: $(LOCAL_PATH)/registers/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< --pack-structs > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/adreno_common.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_common.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/adreno_common.xml.h: $(LOCAL_PATH)/registers/adreno_common.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/adreno_pm4.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/adreno_pm4.xml.h: $(LOCAL_PATH)/registers/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ 
-$(intermediates)/registers/adreno/adreno-pm4-pack.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< --pack-structs > $@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ - $(intermediates)/registers/adreno/ + $(intermediates)/registers/ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/vulkan/tu_extensions.py b/lib/mesa/src/freedreno/vulkan/tu_extensions.py index 762f5b595..0a45b859e 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_extensions.py +++ b/lib/mesa/src/freedreno/vulkan/tu_extensions.py @@ -25,17 +25,24 @@ COPYRIGHT = """\ """ import argparse -import os.path +import copy import re -import sys +import xml.etree.cElementTree as et -VULKAN_UTIL = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../vulkan/util')) -sys.path.append(VULKAN_UTIL) +from mako.template import Template -from vk_extensions import * -from vk_extensions_gen import * +MAX_API_VERSION = '1.1.82' -MAX_API_VERSION = '1.2.131' +class Extension: + def __init__(self, name, ext_version, enable): + self.name = name + self.ext_version = int(ext_version) + if enable is True: + self.enable = 'true'; + elif enable is False: + self.enable = 'false'; + else: + self.enable = enable; # On Android, we disable all surface and swapchain extensions. Android's Vulkan # loader implements VK_KHR_surface and VK_KHR_swapchain, and applications @@ -53,8 +60,6 @@ EXTENSIONS = [ Extension('VK_KHR_maintenance1', 1, True), Extension('VK_KHR_maintenance2', 1, True), Extension('VK_KHR_maintenance3', 1, True), - Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), - Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), Extension('VK_KHR_surface', 25, 'TU_HAS_SURFACE'), Extension('VK_KHR_swapchain', 68, 'TU_HAS_SURFACE'), Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'), @@ -70,56 +75,180 @@ EXTENSIONS = [ Extension('VK_KHR_external_memory', 1, True), Extension('VK_KHR_external_memory_fd', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), - Extension('VK_EXT_image_drm_format_modifier', 1, True), - Extension('VK_EXT_sample_locations', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_sampler_filter_minmax', 1, True), - Extension('VK_EXT_transform_feedback', 1, True), - Extension('VK_ANDROID_native_buffer', 1, 'ANDROID'), - Extension('VK_KHR_external_fence', 1, True), - Extension('VK_KHR_external_fence_fd', 1, True), - Extension('VK_KHR_external_semaphore', 1, True), - Extension('VK_KHR_external_semaphore_capabilities', 1, True), - Extension('VK_KHR_external_semaphore_fd', 1, True), - Extension('VK_IMG_filter_cubic', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_filter_cubic', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_index_type_uint8', 1, True), - Extension('VK_EXT_vertex_attribute_divisor', 1, True), - Extension('VK_KHR_shader_draw_parameters', 1, True), - Extension('VK_KHR_variable_pointers', 1, True), - Extension('VK_EXT_private_data', 1, True), - Extension('VK_EXT_shader_stencil_export', 1, True), - Extension('VK_EXT_depth_clip_enable', 1, True), - Extension('VK_KHR_draw_indirect_count', 1, True), - Extension('VK_EXT_4444_formats', 1, True), - Extension('VK_EXT_conditional_rendering', 1, True), - Extension('VK_EXT_custom_border_color', 12, True), - Extension('VK_KHR_multiview', 1, True), - Extension('VK_EXT_host_query_reset', 1, 
True), - Extension('VK_EXT_shader_viewport_index_layer', 1, True), - Extension('VK_EXT_extended_dynamic_state', 1, True), - Extension('VK_KHR_push_descriptor', 1, True), - Extension('VK_KHR_incremental_present', 1, 'TU_HAS_SURFACE'), - Extension('VK_KHR_image_format_list', 1, True), - Extension('VK_KHR_depth_stencil_resolve', 1, True), - Extension('VK_KHR_performance_query', 1, 'device->instance->debug_flags & TU_DEBUG_PERFC'), - Extension('VK_EXT_memory_budget', 1, True), - Extension('VK_KHR_device_group', 4, True), - Extension('VK_KHR_device_group_creation', 1, True), - Extension('VK_EXT_descriptor_indexing', 2, True), - Extension('VK_KHR_descriptor_update_template', 1, True), - Extension('VK_KHR_storage_buffer_storage_class', 1, True), - Extension('VK_KHR_external_fence_capabilities', 1, True), - Extension('VK_KHR_pipeline_executable_properties', 1, True), - Extension('VK_KHR_shader_float_controls', 1, True), - Extension('VK_KHR_shader_float16_int8', 1, True), - Extension('VK_KHR_16bit_storage', 1, 'device->gpu_id >= 650'), - Extension('VK_EXT_scalar_block_layout', 1, True), - Extension('VK_KHR_spirv_1_4', 1, True), - Extension('VK_KHR_relaxed_block_layout', 1, True), ] +class VkVersion: + def __init__(self, string): + split = string.split('.') + self.major = int(split[0]) + self.minor = int(split[1]) + if len(split) > 2: + assert len(split) == 3 + self.patch = int(split[2]) + else: + self.patch = None + + # Sanity check. The range bits are required by the definition of the + # VK_MAKE_VERSION macro + assert self.major < 1024 and self.minor < 1024 + assert self.patch is None or self.patch < 4096 + assert(str(self) == string) + + def __str__(self): + ver_list = [str(self.major), str(self.minor)] + if self.patch is not None: + ver_list.append(str(self.patch)) + return '.'.join(ver_list) + + def c_vk_version(self): + patch = self.patch if self.patch is not None else 0 + ver_list = [str(self.major), str(self.minor), str(patch)] + return 'VK_MAKE_VERSION(' + ', '.join(ver_list) + ')' + + def __int_ver(self): + # This is just an expansion of VK_VERSION + patch = self.patch if self.patch is not None else 0 + return (self.major << 22) | (self.minor << 12) | patch + + def __gt__(self, other): + # If only one of them has a patch version, "ignore" it by making + # other's patch version match self. + if (self.patch is None) != (other.patch is None): + other = copy.copy(other) + other.patch = self.patch + + return self.__int_ver() > other.__int_ver() + MAX_API_VERSION = VkVersion(MAX_API_VERSION) -API_VERSIONS = [ ApiVersion(MAX_API_VERSION, True) ] + +def _init_exts_from_xml(xml): + """ Walk the Vulkan XML and fill out extra extension information. 
""" + + xml = et.parse(xml) + + ext_name_map = {} + for ext in EXTENSIONS: + ext_name_map[ext.name] = ext + + for ext_elem in xml.findall('.extensions/extension'): + ext_name = ext_elem.attrib['name'] + if ext_name not in ext_name_map: + continue + + ext = ext_name_map[ext_name] + ext.type = ext_elem.attrib['type'] + +_TEMPLATE_H = Template(COPYRIGHT + """ +#ifndef TU_EXTENSIONS_H +#define TU_EXTENSIONS_H + +enum { + TU_INSTANCE_EXTENSION_COUNT = ${len(instance_extensions)}, + TU_DEVICE_EXTENSION_COUNT = ${len(device_extensions)}, +}; + +struct tu_instance_extension_table { + union { + bool extensions[TU_INSTANCE_EXTENSION_COUNT]; + struct { +%for ext in instance_extensions: + bool ${ext.name[3:]}; +%endfor + }; + }; +}; + +struct tu_device_extension_table { + union { + bool extensions[TU_DEVICE_EXTENSION_COUNT]; + struct { +%for ext in device_extensions: + bool ${ext.name[3:]}; +%endfor + }; + }; +}; + +extern const VkExtensionProperties tu_instance_extensions[TU_INSTANCE_EXTENSION_COUNT]; +extern const VkExtensionProperties tu_device_extensions[TU_DEVICE_EXTENSION_COUNT]; +extern const struct tu_instance_extension_table tu_supported_instance_extensions; + + +struct tu_physical_device; + +void tu_fill_device_extension_table(const struct tu_physical_device *device, + struct tu_device_extension_table* table); +#endif +""") + +_TEMPLATE_C = Template(COPYRIGHT + """ +#include "tu_private.h" + +#include "vk_util.h" + +/* Convert the VK_USE_PLATFORM_* defines to booleans */ +%for platform in ['ANDROID_KHR', 'WAYLAND_KHR', 'XCB_KHR', 'XLIB_KHR', 'DISPLAY_KHR', 'XLIB_XRANDR_EXT']: +#ifdef VK_USE_PLATFORM_${platform} +# undef VK_USE_PLATFORM_${platform} +# define VK_USE_PLATFORM_${platform} true +#else +# define VK_USE_PLATFORM_${platform} false +#endif +%endfor + +/* And ANDROID too */ +#ifdef ANDROID +# undef ANDROID +# define ANDROID true +#else +# define ANDROID false +#endif + +#define TU_HAS_SURFACE (VK_USE_PLATFORM_WAYLAND_KHR || \\ + VK_USE_PLATFORM_XCB_KHR || \\ + VK_USE_PLATFORM_XLIB_KHR || \\ + VK_USE_PLATFORM_DISPLAY_KHR) + + +const VkExtensionProperties tu_instance_extensions[TU_INSTANCE_EXTENSION_COUNT] = { +%for ext in instance_extensions: + {"${ext.name}", ${ext.ext_version}}, +%endfor +}; + +const VkExtensionProperties tu_device_extensions[TU_DEVICE_EXTENSION_COUNT] = { +%for ext in device_extensions: + {"${ext.name}", ${ext.ext_version}}, +%endfor +}; + +const struct tu_instance_extension_table tu_supported_instance_extensions = { +%for ext in instance_extensions: + .${ext.name[3:]} = ${ext.enable}, +%endfor +}; + +void tu_fill_device_extension_table(const struct tu_physical_device *device, + struct tu_device_extension_table* table) +{ +%for ext in device_extensions: + table->${ext.name[3:]} = ${ext.enable}; +%endfor +} + +VkResult tu_EnumerateInstanceVersion( + uint32_t* pApiVersion) +{ + *pApiVersion = ${MAX_API_VERSION.c_vk_version()}; + return VK_SUCCESS; +} + +uint32_t +tu_physical_device_api_version(struct tu_physical_device *dev) +{ + return VK_MAKE_VERSION(1, 1, 82); +} +""") if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -132,5 +261,19 @@ if __name__ == '__main__': dest='xml_files') args = parser.parse_args() - gen_extensions('tu', args.xml_files, API_VERSIONS, MAX_API_VERSION, - EXTENSIONS, args.out_c, args.out_h) + for filename in args.xml_files: + _init_exts_from_xml(filename) + + for ext in EXTENSIONS: + assert ext.type == 'instance' or ext.type == 'device' + + template_env = { + 'MAX_API_VERSION': MAX_API_VERSION, + 'instance_extensions': [e for e 
in EXTENSIONS if e.type == 'instance'], + 'device_extensions': [e for e in EXTENSIONS if e.type == 'device'], + } + + with open(args.out_c, 'w') as f: + f.write(_TEMPLATE_C.render(**template_env)) + with open(args.out_h, 'w') as f: + f.write(_TEMPLATE_H.render(**template_env)) diff --git a/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c b/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c index cfaea0622..b9148a1e2 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c +++ b/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c @@ -53,7 +53,7 @@ tu_CreateWaylandSurfaceKHR(VkInstance _instance, if (pAllocator) alloc = pAllocator; else - alloc = &instance->vk.alloc; + alloc = &instance->alloc; return wsi_create_wl_surface(alloc, pCreateInfo, pSurface); } diff --git a/lib/mesa/src/gallium/Android.common.mk b/lib/mesa/src/gallium/Android.common.mk index 3f7779892..0d55f04ac 100644 --- a/lib/mesa/src/gallium/Android.common.mk +++ b/lib/mesa/src/gallium/Android.common.mk @@ -28,7 +28,6 @@ LOCAL_C_INCLUDES += \ $(GALLIUM_TOP)/auxiliary \ $(GALLIUM_TOP)/winsys \ $(GALLIUM_TOP)/drivers \ - $(MESA_TOP)/src/etnaviv \ $(MESA_TOP)/src/freedreno \ $(MESA_TOP)/src/freedreno/ir3 \ $(MESA_TOP)/src/freedreno/registers diff --git a/lib/mesa/src/gallium/Android.mk b/lib/mesa/src/gallium/Android.mk index 78e821581..37e923c22 100644 --- a/lib/mesa/src/gallium/Android.mk +++ b/lib/mesa/src/gallium/Android.mk @@ -46,10 +46,9 @@ SUBDIRS += winsys/vc4/drm drivers/vc4 SUBDIRS += winsys/virgl/common winsys/virgl/drm winsys/virgl/vtest drivers/virgl SUBDIRS += winsys/svga/drm drivers/svga SUBDIRS += winsys/etnaviv/drm drivers/etnaviv drivers/renderonly -SUBDIRS += frontends/dri +SUBDIRS += state_trackers/dri SUBDIRS += winsys/iris/drm drivers/iris SUBDIRS += winsys/lima/drm drivers/lima -SUBDIRS += winsys/panfrost/drm drivers/panfrost # sort to eliminate any duplicates INC_DIRS := $(call all-named-subdir-makefiles,$(sort $(SUBDIRS))) diff --git a/lib/mesa/src/gallium/auxiliary/Android.mk b/lib/mesa/src/gallium/auxiliary/Android.mk index f668e5237..a2d5fa60d 100644 --- a/lib/mesa/src/gallium/auxiliary/Android.mk +++ b/lib/mesa/src/gallium/auxiliary/Android.mk @@ -28,17 +28,14 @@ include $(LOCAL_PATH)/Makefile.sources include $(CLEAR_VARS) -# filter-out tessellator/tessellator.hpp to avoid "Unused source files" error LOCAL_SRC_FILES := \ - $(filter-out tessellator/tessellator.hpp, $(C_SOURCES)) \ + $(C_SOURCES) \ $(NIR_SOURCES) \ $(RENDERONLY_SOURCES) \ $(VL_STUB_SOURCES) ifeq ($(USE_LIBBACKTRACE),true) - LOCAL_CFLAGS += -DHAVE_ANDROID_PLATFORM - LOCAL_SHARED_LIBRARIES += libbacktrace - LOCAL_SRC_FILES += ../../util/u_debug_stack_android.cpp + LOCAL_SRC_FILES += util/u_debug_stack_android.cpp endif LOCAL_C_INCLUDES := \ @@ -55,7 +52,6 @@ LOCAL_CPPFLAGS += -std=c++14 # We need libmesa_nir to get NIR's generated include directories. 
LOCAL_MODULE := libmesa_gallium -LOCAL_SHARED_LIBRARIES += libsync LOCAL_STATIC_LIBRARIES += libmesa_nir LOCAL_WHOLE_STATIC_LIBRARIES += cpufeatures @@ -66,44 +62,18 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES intermediates := $(call local-generated-sources-dir) LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, $(GENERATED_SOURCES)) -u_indices_gen_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/indices/u_indices_gen.py +$(LOCAL_GENERATED_SOURCES): PRIVATE_PYTHON := $(MESA_PYTHON2) +$(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@ -$(intermediates)/indices/u_indices_gen.c: $(u_indices_gen_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< > $@ +$(intermediates)/indices/u_indices_gen.c \ +$(intermediates)/indices/u_unfilled_gen.c \ +$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py + $(transform-generated-source) -u_unfilled_gen_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/indices/u_unfilled_gen.py - -$(intermediates)/indices/u_unfilled_gen.c: $(u_unfilled_gen_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< > $@ - -u_tracepoints_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_tracepoints.py \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py - -u_tracepoints_c := $(intermediates)/util/u_tracepoints.c -u_tracepoints_h := $(intermediates)/util/u_tracepoints.h - -$(intermediates)/util/u_tracepoints.c \ -$(intermediates)/util/u_tracepoints.h: $(u_tracepoints_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(u_tracepoints_c) -H $(u_tracepoints_h) +$(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv + $(transform-generated-source) LOCAL_GENERATED_SOURCES += $(MESA_GEN_NIR_H) include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) - -# Build libmesa_galliumvl used by radeonsi -include $(CLEAR_VARS) - -LOCAL_SRC_FILES := \ - $(VL_SOURCES) - -LOCAL_MODULE := libmesa_galliumvl - -include $(GALLIUM_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk index de07a03ce..075bf8af4 100644 --- a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk +++ b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk @@ -31,6 +31,7 @@ include $(CLEAR_VARS) LOCAL_CFLAGS := \ -DHAVE_PIPE_LOADER_DRI \ -DHAVE_PIPE_LOADER_KMS \ + -DDROP_PIPE_LOADER_MISC \ -DGALLIUM_STATIC_TARGETS LOCAL_SRC_FILES := \ diff --git a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk index 3ba6b819f..6976d223c 100644 --- a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk +++ b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk @@ -28,10 +28,7 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := \ $(C_SOURCES) -LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) - -LOCAL_SHARED_LIBRARIES := libdrm -LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm +LOCAL_SHARED_LIBRARIES := libdrm_etnaviv LOCAL_MODULE := libmesa_pipe_etnaviv include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/gallium/drivers/freedreno/Android.mk b/lib/mesa/src/gallium/drivers/freedreno/Android.mk index 86db01a59..f0b29b116 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/Android.mk +++ b/lib/mesa/src/gallium/drivers/freedreno/Android.mk @@ -39,34 +39,15 @@ LOCAL_SRC_FILES := \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/ir3 \ - $(MESA_TOP)/include \ - $(MESA_TOP)/src/freedreno/common \ - $(call 
generated-sources-dir-for,STATIC_LIBRARIES,libmesa_gallium,,)/util + $(MESA_TOP)/include LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) -LOCAL_SHARED_LIBRARIES := libdrm libsync -LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_perfcntrs libfreedreno_registers +LOCAL_SHARED_LIBRARIES := libdrm +LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_registers LOCAL_MODULE := libmesa_pipe_freedreno -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -intermediates := $(call local-generated-sources-dir) - -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(GENERATED_SOURCES)) - -freedreno_tracepoints_deps := \ - $(MESA_TOP)/src/gallium/drivers/freedreno/freedreno_tracepoints.py \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py - -freedreno_tracepoints_c := $(intermediates)/freedreno_tracepoints.c -freedreno_tracepoints_h := $(intermediates)/freedreno_tracepoints.h - -$(intermediates)/freedreno_tracepoints.c \ -$(intermediates)/freedreno_tracepoints.h: $(freedreno_tracepoints_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(freedreno_tracepoints_c) -H $(freedreno_tracepoints_h) - +include $(LOCAL_PATH)/Android.gen.mk include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/drivers/iris/Android.mk b/lib/mesa/src/gallium/drivers/iris/Android.mk index 5d5744025..71ec0cf58 100644 --- a/lib/mesa/src/gallium/drivers/iris/Android.mk +++ b/lib/mesa/src/gallium/drivers/iris/Android.mk @@ -42,15 +42,15 @@ IRIS_COMMON_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary # -# libiris for gfx8 +# libiris for gen8 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx8 +LOCAL_MODULE := libmesa_iris_gen8 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -62,15 +62,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx9 +# libiris for gen9 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx9 +LOCAL_MODULE := libmesa_iris_gen9 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -82,15 +82,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx11 +# libiris for gen10 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx11 +LOCAL_MODULE := libmesa_iris_gen10 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -102,15 +102,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx12 +# libiris for gen11 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx12 +LOCAL_MODULE := libmesa_iris_gen11 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -121,30 +121,29 @@ LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) -# -# libiris for gfx125 -# +########################################################### include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx125 + +LOCAL_MODULE := libmesa_pipe_iris 
LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=125 +intermediates := $(call local-generated-sources-dir) -LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) +LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/iris/,$(GENERATED_SOURCES)) -LOCAL_STATIC_LIBRARIES := $(LIBIRIS_STATIC_LIBS) +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/driinfo_iris.h -LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) +$(intermediates)/iris/iris_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) -########################################################### -include $(CLEAR_VARS) - -LOCAL_MODULE := libmesa_pipe_iris +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) LOCAL_SRC_FILES := \ $(IRIS_C_SOURCES) @@ -167,11 +166,10 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_intel_common \ libmesa_intel_compiler \ libmesa_intel_perf \ - libmesa_iris_gfx8 \ - libmesa_iris_gfx9 \ - libmesa_iris_gfx11 \ - libmesa_iris_gfx12 \ - libmesa_iris_gfx125 + libmesa_iris_gen8 \ + libmesa_iris_gen9 \ + libmesa_iris_gen10 \ + libmesa_iris_gen11 include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/drivers/iris/Makefile.sources b/lib/mesa/src/gallium/drivers/iris/Makefile.sources index c727bce86..bc8f592d3 100644 --- a/lib/mesa/src/gallium/drivers/iris/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/iris/Makefile.sources @@ -20,7 +20,11 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
+GENERATED_SOURCES := \ + iris_driinfo.h + IRIS_C_SOURCES = \ + $(GENERATED_SOURCES) \ driinfo_iris.h \ iris_batch.c \ iris_batch.h \ @@ -37,16 +41,10 @@ IRIS_C_SOURCES = \ iris_draw.c \ iris_fence.c \ iris_fence.h \ - iris_fine_fence.c \ - iris_fine_fence.h \ iris_formats.c \ iris_genx_macros.h \ iris_genx_protos.h \ - iris_measure.c \ - iris_measure.h \ iris_monitor.c \ - iris_performance_query.c \ - iris_perf.c \ iris_pipe.h \ iris_pipe_control.c \ iris_program.c \ diff --git a/lib/mesa/src/gallium/drivers/kmsro/Android.mk b/lib/mesa/src/gallium/drivers/kmsro/Android.mk index e0e26482b..2f637b8bf 100644 --- a/lib/mesa/src/gallium/drivers/kmsro/Android.mk +++ b/lib/mesa/src/gallium/drivers/kmsro/Android.mk @@ -39,20 +39,14 @@ GALLIUM_TARGET_DRIVERS += exynos GALLIUM_TARGET_DRIVERS += hx8357d GALLIUM_TARGET_DRIVERS += ili9225 GALLIUM_TARGET_DRIVERS += ili9341 -GALLIUM_TARGET_DRIVERS += imx-drm -GALLIUM_TARGET_DRIVERS += imx-dcss -GALLIUM_TARGET_DRIVERS += ingenic-drm -GALLIUM_TARGET_DRIVERS += mcde -GALLIUM_TARGET_DRIVERS += mediatek -GALLIUM_TARGET_DRIVERS += meson +GALLIUM_TARGET_DRIVERS += imx +GALLIUM_TARGET_DRIVERS += stm GALLIUM_TARGET_DRIVERS += mi0283qt GALLIUM_TARGET_DRIVERS += mxsfb-drm GALLIUM_TARGET_DRIVERS += pl111 GALLIUM_TARGET_DRIVERS += repaper -GALLIUM_TARGET_DRIVERS += rockchip GALLIUM_TARGET_DRIVERS += st7586 GALLIUM_TARGET_DRIVERS += st7735r -GALLIUM_TARGET_DRIVERS += stm GALLIUM_TARGET_DRIVERS += sun4i-drm $(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_kmsro) endif diff --git a/lib/mesa/src/gallium/drivers/lima/Android.mk b/lib/mesa/src/gallium/drivers/lima/Android.mk index 09487d9dc..069ecc4b2 100644 --- a/lib/mesa/src/gallium/drivers/lima/Android.mk +++ b/lib/mesa/src/gallium/drivers/lima/Android.mk @@ -31,15 +31,11 @@ LOCAL_SRC_FILES := \ ir/gp/lower.c \ ir/gp/nir.c \ ir/gp/node.c \ - ir/gp/optimize.c \ ir/gp/regalloc.c \ ir/gp/reduce_scheduler.c \ ir/gp/scheduler.c \ ir/lima_ir.h \ - ir/lima_nir_duplicate_consts.c \ - ir/lima_nir_duplicate_intrinsic.c \ ir/lima_nir_lower_uniform_to_scalar.c \ - ir/lima_nir_split_load_input.c \ ir/pp/codegen.c \ ir/pp/codegen.h \ ir/pp/disasm.c \ @@ -50,19 +46,14 @@ LOCAL_SRC_FILES := \ ir/pp/node_to_instr.c \ ir/pp/ppir.h \ ir/pp/regalloc.c \ - ir/pp/liveness.c \ ir/pp/scheduler.c \ lima_bo.c \ lima_bo.h \ lima_context.c \ lima_context.h \ - lima_disk_cache.c \ - lima_disk_cache.h \ lima_draw.c \ lima_fence.c \ lima_fence.h \ - lima_parser.c \ - lima_parser.h \ lima_program.c \ lima_program.h \ lima_query.c \ @@ -71,15 +62,12 @@ LOCAL_SRC_FILES := \ lima_screen.c \ lima_screen.h \ lima_state.c \ - lima_job.c \ - lima_job.h \ + lima_submit.c \ + lima_submit.h \ lima_texture.c \ lima_texture.h \ lima_util.c \ - lima_util.h \ - lima_format.c \ - lima_format.h \ - lima_gpu.h + lima_util.h LOCAL_MODULE := libmesa_pipe_lima diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c index 157c23491..43121335f 100644 --- a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c +++ b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c @@ -27,11 +27,8 @@ #include <stdio.h> #include "util/u_memory.h" -#include "gallium/auxiliary/util/u_blend.h" -#include "pan_context.h" -#include "pan_blend_cso.h" -#include "pan_bo.h" -#include "panfrost-quirks.h" +#include "pan_blend_shaders.h" +#include "pan_blending.h" /* A given Gallium blend state can be encoded to the hardware in numerous, * dramatically divergent ways due to the interactions of blending with @@ -60,6 +57,41 @@ * 
(our subclass of pipe_blend_state). */ +/* Given an initialized CSO and a particular framebuffer format, grab a + * blend shader, generating and compiling it if it doesn't exist + * (lazy-loading in a way). This routine, when the cache hits, should + * be fast, suitable for calling every draw to avoid wacky dirty + * tracking paths. If the cache hits, boom, done. */ + +static struct panfrost_blend_shader * +panfrost_get_blend_shader( + struct panfrost_context *ctx, + struct panfrost_blend_state *blend, + enum pipe_format fmt, + unsigned rt) +{ + /* Prevent NULL collision issues.. */ + assert(fmt != 0); + + /* Check the cache */ + struct hash_table_u64 *shaders = blend->rt[rt].shaders; + + struct panfrost_blend_shader *shader = + _mesa_hash_table_u64_search(shaders, fmt); + + if (shader) + return shader; + + /* Cache miss. Build one instead, cache it, and go */ + + struct panfrost_blend_shader generated = + panfrost_compile_blend_shader(ctx, &blend->base, fmt); + + shader = mem_dup(&generated, sizeof(generated)); + _mesa_hash_table_u64_insert(shaders, fmt, shader); + return shader; +} + /* Create a blend CSO. Essentially, try to compile a fixed-function * expression and initialize blend shaders */ @@ -71,34 +103,33 @@ panfrost_create_blend_state(struct pipe_context *pipe, struct panfrost_blend_state *so = rzalloc(ctx, struct panfrost_blend_state); so->base = *blend; - so->pan.dither = blend->dither; - so->pan.logicop_enable = blend->logicop_enable; - so->pan.logicop_func = blend->logicop_func; - so->pan.rt_count = blend->max_rt + 1; - /* TODO: The following features are not yet implemented */ + assert(!blend->logicop_enable); + assert(!blend->alpha_to_coverage); assert(!blend->alpha_to_one); - for (unsigned c = 0; c < so->pan.rt_count; ++c) { - unsigned g = blend->independent_blend_enable ? c : 0; - const struct pipe_rt_blend_state *pipe = &blend->rt[g]; - struct pan_blend_equation *equation = &so->pan.rts[c].equation; - - equation->color_mask = pipe->colormask; - equation->blend_enable = pipe->blend_enable; - if (!equation->blend_enable) - continue; - - equation->rgb_func = util_blend_func_to_shader(pipe->rgb_func); - equation->rgb_src_factor = util_blend_factor_to_shader(pipe->rgb_src_factor); - equation->rgb_invert_src_factor = util_blend_factor_is_inverted(pipe->rgb_src_factor); - equation->rgb_dst_factor = util_blend_factor_to_shader(pipe->rgb_dst_factor); - equation->rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe->rgb_dst_factor); - equation->alpha_func = util_blend_func_to_shader(pipe->alpha_func); - equation->alpha_src_factor = util_blend_factor_to_shader(pipe->alpha_src_factor); - equation->alpha_invert_src_factor = util_blend_factor_is_inverted(pipe->alpha_src_factor); - equation->alpha_dst_factor = util_blend_factor_to_shader(pipe->alpha_dst_factor); - equation->alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe->alpha_dst_factor); + for (unsigned c = 0; c < PIPE_MAX_COLOR_BUFS; ++c) { + struct panfrost_blend_rt *rt = &so->rt[c]; + + /* There are two paths. First, we would like to try a + * fixed-function if we can */ + + /* Without indep blending, the first RT settings replicate */ + + unsigned g = + blend->independent_blend_enable ?
c : 0; + + rt->has_fixed_function = + panfrost_make_fixed_blend_mode( + &blend->rt[g], + &rt->equation, + &rt->constant_mask, + blend->rt[g].colormask); + + /* Regardless if that works, we also need to initialize + * the blend shaders */ + + rt->shaders = _mesa_hash_table_u64_create(so); } return so; @@ -109,7 +140,28 @@ panfrost_bind_blend_state(struct pipe_context *pipe, void *cso) { struct panfrost_context *ctx = pan_context(pipe); - ctx->blend = (struct panfrost_blend_state *) cso; + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct pipe_blend_state *blend = (struct pipe_blend_state *) cso; + struct panfrost_blend_state *pblend = (struct panfrost_blend_state *) cso; + ctx->blend = pblend; + + if (!blend) + return; + + if (screen->require_sfbd) { + SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_DITHER, !blend->dither); + } + + /* Shader itself is not dirty, but the shader core is */ + ctx->dirty |= PAN_DIRTY_FS; +} + +static void +panfrost_delete_blend_shader(struct hash_entry *entry) +{ + struct panfrost_blend_shader *shader = (struct panfrost_blend_shader *)entry->data; + free(shader->buffer); + free(shader); } static void @@ -117,6 +169,11 @@ panfrost_delete_blend_state(struct pipe_context *pipe, void *cso) { struct panfrost_blend_state *blend = (struct panfrost_blend_state *) cso; + + for (unsigned c = 0; c < 4; ++c) { + struct panfrost_blend_rt *rt = &blend->rt[c]; + _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader); + } ralloc_free(blend); } @@ -130,73 +187,105 @@ panfrost_set_blend_color(struct pipe_context *pipe, ctx->blend_color = *blend_color; } +/* Given a vec4 of constants, reduce it to just a single constant according to + * the mask (if we can) */ + +static bool +panfrost_blend_constant(float *out, float *in, unsigned mask) +{ + /* If there is no components used, it automatically works. Do set a + * dummy constant just to avoid reading uninitialized memory. */ + + if (!mask) { + *out = 0.0; + return true; + } + + /* Find some starter mask */ + unsigned first = ffs(mask) - 1; + float cons = in[first]; + mask ^= (1 << first); + + /* Ensure the rest are equal */ + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (in[i] != cons) { + *out = 0.0; + return false; + } + } + + /* Otherwise, we're good to go */ + *out = cons; + return true; +} + /* Create a final blend given the context */ struct panfrost_blend_final -panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset) +panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); + + /* Grab the format, falling back gracefully if called invalidly (which + * has to happen for no-color-attachment FBOs, for instance) */ struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; - enum pipe_format fmt = fb->cbufs[rti]->format; - unsigned nr_samples = fb->cbufs[rti]->nr_samples ? 
: - fb->cbufs[rti]->texture->nr_samples; + enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM; + + if ((fb->nr_cbufs > rti) && fb->cbufs[rti]) + fmt = fb->cbufs[rti]->format; /* Grab the blend state */ struct panfrost_blend_state *blend = ctx->blend; - struct pan_blend_state pan_blend = blend->pan; - - pan_blend.rts[rti].format = fmt; - pan_blend.rts[rti].nr_samples = nr_samples; - memcpy(pan_blend.constants, ctx->blend_color.color, - sizeof(pan_blend.constants)); - - /* First, we'll try fixed function, matching equation and constant */ - if (pan_blend_can_fixed_function(dev, &pan_blend, rti)) { - struct panfrost_blend_final final = { - .load_dest = pan_blend_reads_dest(pan_blend.rts[rti].equation), - .equation.constant = pan_blend_get_constant(dev, &pan_blend, rti), - .opaque = pan_blend_is_opaque(pan_blend.rts[rti].equation), - .no_colour = pan_blend.rts[rti].equation.color_mask == 0, - }; - - pan_blend_to_fixed_function_equation(dev, &pan_blend, rti, - &final.equation.equation); - return final; - } + assert(blend); + struct panfrost_blend_rt *rt = &blend->rt[rti]; - /* Otherwise, we need to grab a shader */ - /* Upload the shader, sharing a BO */ - if (!(*bo)) { - *bo = panfrost_batch_create_bo(batch, 4096, - PAN_BO_EXECUTE, - PAN_BO_ACCESS_PRIVATE | - PAN_BO_ACCESS_READ | - PAN_BO_ACCESS_FRAGMENT); + struct panfrost_blend_final final; + + /* First, we'll try a fixed function path */ + if (rt->has_fixed_function && panfrost_can_fixed_blend(fmt)) { + if (panfrost_blend_constant( + &final.equation.constant, + ctx->blend_color.color, + rt->constant_mask)) { + /* There's an equation and suitable constant, so we're good to go */ + final.is_shader = false; + final.equation.equation = &rt->equation; + + final.no_blending = + (rt->equation.rgb_mode == 0x122) && + (rt->equation.alpha_mode == 0x122) && + (rt->equation.color_mask == 0xf); + + return final; + } } - pthread_mutex_lock(&dev->blend_shaders.lock); - struct pan_blend_shader_variant *shader = - pan_blend_get_shader_locked(dev, &pan_blend, rti); + /* Otherwise, we need to grab a shader */ + struct panfrost_blend_shader *shader = panfrost_get_blend_shader(ctx, blend, fmt, rti); + final.is_shader = true; + final.no_blending = false; + final.shader.work_count = shader->work_count; + final.shader.first_tag = shader->first_tag; - /* Size check */ - assert((*shader_offset + shader->binary.size) < 4096); + /* Upload the shader */ + final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE); + memcpy(final.shader.bo->cpu, shader->buffer, shader->size); - memcpy((*bo)->ptr.cpu + *shader_offset, shader->binary.data, shader->binary.size); + /* Pass BO ownership to job */ + panfrost_job_add_bo(job, final.shader.bo); + panfrost_bo_unreference(ctx->base.screen, final.shader.bo); - struct panfrost_blend_final final = { - .is_shader = true, - .shader = { - .first_tag = shader->first_tag, - .gpu = (*bo)->ptr.gpu + *shader_offset, - }, - .load_dest = pan_blend.logicop_enable || - pan_blend_reads_dest(pan_blend.rts[rti].equation), - }; + if (shader->patch_index) { + /* We have to specialize the blend shader to use constants, so + * patch in the current constants */ - *shader_offset += shader->binary.size; - pthread_mutex_unlock(&dev->blend_shaders.lock); + float *patch = (float *) (final.shader.bo->cpu + shader->patch_index); + memcpy(patch, ctx->blend_color.color, sizeof(float) * 4); + } return final; } diff --git a/lib/mesa/src/gallium/drivers/r600/Android.mk b/lib/mesa/src/gallium/drivers/r600/Android.mk index 
b87fc91e6..9f684cf24 100644 --- a/lib/mesa/src/gallium/drivers/r600/Android.mk +++ b/lib/mesa/src/gallium/drivers/r600/Android.mk @@ -30,12 +30,8 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES) -LOCAL_C_INCLUDES += \ - $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ - $(MESA_TOP)/src/mesa +LOCAL_C_INCLUDES += $(MESA_TOP)/src/amd/common -LOCAL_STATIC_LIBRARIES := libmesa_nir LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_r600 @@ -49,15 +45,6 @@ $(intermediates)/egd_tables.h: $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.p @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h > $@ -sfn_nir_algebraic_gen := $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py -sfn_nir_algebraic_deps := \ - $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -$(intermediates)/sfn_nir_algebraic.c: $(sfn_nir_algebraic_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $(sfn_nir_algebraic_gen) -p $(MESA_TOP)/src/compiler/nir/ > $@ - ifeq ($(MESA_ENABLE_LLVM),true) $(call mesa-build-with-llvm) endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk index 75f30f621..e402da639 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk +++ b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk @@ -21,8 +21,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -ifeq ($(MESA_ENABLE_LLVM),true) - LOCAL_PATH := $(call my-dir) # get C_SOURCES and GENERATED_SOURCES @@ -38,20 +36,48 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ - $(MESA_TOP)/src/compiler/nir \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir -LOCAL_STATIC_LIBRARIES := \ - libmesa_amd_common \ - libmesa_galliumvl +LOCAL_STATIC_LIBRARIES := libmesa_amd_common LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_radeonsi +intermediates := $(call local-generated-sources-dir) + # We need to get NIR's generated headers. 
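 # (As far as I can tell, MESA_GEN_NIR_H expands to the headers produced by
 # the NIR code generators, so listing it here makes them build before
 # anything in this module compiles.)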
LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES)) + +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/driinfo_radeonsi.h + +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py + +$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) + +GEN10_FORMAT_TABLE_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \ + $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json + +GEN10_FORMAT_TABLE_DEP := \ + $(MESA_TOP)/src/amd/registers/regdb.py + +GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py + +$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) + +LOCAL_C_INCLUDES += $(intermediates)/radeonsi + +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) $(call mesa-build-with-llvm) @@ -67,5 +93,3 @@ $(eval GALLIUM_LIBS += \ libmesa_winsys_amdgpu) $(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) endif - -endif # MESA_ENABLE_LLVM==true diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 3d17f08ca..373fd4ffa 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -23,15 +23,16 @@ * */ -#include "ac_llvm_cull.h" -#include "si_build_pm4.h" #include "si_pipe.h" #include "si_shader_internal.h" #include "sid.h" -#include "util/fast_idiv_by_const.h" +#include "si_build_pm4.h" +#include "ac_llvm_cull.h" + #include "util/u_prim.h" #include "util/u_suballoc.h" #include "util/u_upload_mgr.h" +#include "util/fast_idiv_by_const.h" /* Based on: * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf @@ -107,6 +108,7 @@ * (patch elimination where tess factors are 0 would be possible to implement) * - The vertex shader must not contain memory stores. * - All VS resources must not have a write usage in the command buffer. + * (TODO: all shader buffers currently set the write usage) * - Bindless textures and images must not occur in the vertex shader. * * User data SGPR layout: @@ -153,1400 +155,1426 @@ /* At least 256 is needed for the fastest wave launch rate from compute queues * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. 
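  * A vertex with w < 0 lies behind the eye, so a triangle whose three
  * vertices all have w < 0 can never be visible; rejecting on W alone
  * already covers that case without any Z bounds test.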
*/ -#define CULL_Z 0 +#define CULL_Z 0 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ /* Grouping compute dispatches for small draw calls: How many primitives from multiple * draw calls to process by compute before signaling the gfx IB. This reduces the number * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) +#define PRIMS_PER_BATCH (512 * 1024) /* Draw call splitting at the packet level. This allows signaling the gfx IB * for big draw calls sooner, but doesn't allow context flushes between packets. * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH /* If there is not enough ring buffer space for the current IB, split draw calls into * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH /* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL \ - (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \ - : UINT_MAX & ~(THREADGROUP_SIZE - 1)) +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ + SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ + UINT_MAX & ~(THREADGROUP_SIZE - 1)) -#define REWIND_SIGNAL_BIT 0x80000000 +#define REWIND_SIGNAL_BIT 0x80000000 /* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 +#define FORCE_REWIND_EMULATION 0 -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) +void si_initialize_prim_discard_tunables(struct si_context *sctx) { - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ - !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics && - (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII || - sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. 
- */ - if (sscreen->info.vram_size <= 2 * GB) - *index_ring_size_per_ib = 64 * MB; - else if (sscreen->info.vram_size <= 4 * GB) - *index_ring_size_per_ib = 128 * MB; - else - *index_ring_size_per_ib = 256 * MB; - } + sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sctx->chip_class == GFX6 || /* SI support is not implemented */ + !sctx->screen->info.has_gds_ordered_append || + sctx->screen->debug_flags & DBG(NO_PD) || + /* If aux_context == NULL, we are initializing aux_context right now. */ + !sctx->screen->aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || + sctx->screen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && + sctx->screen->info.is_pro_graphics && + (sctx->family == CHIP_BONAIRE || + sctx->family == CHIP_HAWAII || + sctx->family == CHIP_TONGA || + sctx->family == CHIP_FIJI || + sctx->family == CHIP_POLARIS10 || + sctx->family == CHIP_POLARIS11 || + sctx->family == CHIP_VEGA10 || + sctx->family == CHIP_VEGA20))) { + sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) + sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. + */ + if (sctx->screen->info.vram_size <= 2 * GB) + sctx->index_ring_size_per_ib = 64 * MB; + else if (sctx->screen->info.vram_size <= 4 * GB) + sctx->index_ring_size_per_ib = 128 * MB; + else + sctx->index_ring_size_per_ib = 256 * MB; + } } /* Opcode can be "add" or "swap". 
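  * Both lower to the llvm.amdgcn.ds.ordered.<op> intrinsics: the GDS slot
  * is accessed in ordered-append (wave launch) order, "add" returns the
  * running total of earlier waves before adding our value, and "swap"
  * returns the earlier value while storing ours.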
*/ -static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, - unsigned ordered_count_index, bool release, bool done) +static LLVMValueRef +si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, + bool release, bool done) { - if (ctx->screen->info.chip_class >= GFX10) - ordered_count_index |= 1 << 24; /* number of dwords == 1 */ - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->ac.i32_0, /* scope */ - ctx->ac.i1false, /* volatile */ - LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), - LLVMConstInt(ctx->ac.i1, release, 0), - LLVMConstInt(ctx->ac.i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->i32_0, /* scope */ + ctx->i1false, /* volatile */ + LLVMConstInt(ctx->i32, ordered_count_index, 0), + LLVMConstInt(ctx->i1, release, 0), + LLVMConstInt(ctx->i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); } static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) { - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); } struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; + struct si_shader_context *ctx; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; }; /* Enter a section that only executes on thread 0. */ static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, LLVMValueRef thread_id) + struct si_thread0_section *section, + LLVMValueRef thread_id) { - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. 
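- * The matching si_exit_thread0_section() stores the lane-0 result to that
- * VGPR, closes the branch, and broadcasts it with readfirstlane, so the
- * pair behaves roughly like:
- *
- *    if (thread_id == 0)
- *       vgpr_result = value;
- *    value = readfirstlane(vgpr_result);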
- */ - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""), - 12601); + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, + ctx->i32_0, ""), 12601); } /* Exit a section that only executes on thread 0 and broadcast the result * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) +static void si_exit_thread0_section(struct si_thread0_section *section, + LLVMValueRef *result) { - struct si_shader_context *ctx = section->ctx; + struct si_shader_context *ctx = section->ctx; - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - ac_build_endif(&ctx->ac, 12601); + ac_build_endif(&ctx->ac, 12601); - /* Broadcast the result from thread 0 to all threads. */ - *result = - ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); + /* Broadcast the result from thread 0 to all threads. */ + *result = ac_build_readlane(&ctx->ac, + LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); } void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) { - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. */ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.base.num_ubos == 1 && - ctx->shader->selector->info.base.num_ssbos == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc; - struct ac_arg param_base_vertex, param_start_instance; - struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; - struct ac_arg param_restart_index, param_smallprim_precision; - struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; - struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); - 
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - gl_shader_stage old_stage = ctx->stage; - ctx->stage = MESA_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->stage = old_stage; - - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. */ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = - LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = - ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->ac.i32_0; - - if (key->opt.cs_instancing) { - LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); - LLVMValueRef num_prims_udiv_multiplier = - ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); - /* Unpack num_prims_udiv_terms. 
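- * One SGPR carries both division terms, bits [4:0] the post-shift and
- * bits [31:5] prims_per_instance, so the instancing path below is roughly:
- *
- *    instance_id = umul_hi(prim_id, multiplier) >> post_shift;
- *    prim_id     = prim_id - instance_id * prims_per_instance;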
*/ - LLVMValueRef post_shift = - LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = - LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = - ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. - */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - index[2] = ctx->ac.i32_0; - } else { - index[0] = ctx->ac.i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, - 1, 0, true, false, false); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef ordered_wave_id = NULL; - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); - ordered_wave_id = - LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), ""); - ordered_wave_id = - LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->ac.i1false; - - /* Handle primitive restart. 
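- * A primitive is dropped when any of its three indices equals the
- * restart index: prim_restart_accepted starts out true and is ANDed
- * with the per-index comparisons below.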
*/ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. - */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); - vertex_counter = - LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - ac_get_arg(&ctx->ac, param_restart_index), ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. - */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); - - /* This flips the orientation based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, gds_prim_restart_continue, ""), ""); - current_wave_resets_index = - LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. 
- * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. - * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc( - &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true, - false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true, - false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = - LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. - * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = - LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, ""); - first_is_odd = - LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = LLVMBuildXor( - builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle( - &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. 
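- * This replays the rasterizer's trivial-reject tests in software:
- * face culling, view-volume XY, zero-area and W < 0 rejection, plus
- * small primitives whose screen-space bounding box cannot cover a
- * sample (smallprim_precision supplies the rounding for that test).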
*/ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), &options); - - ac_build_optimization_barrier(&ctx->ac, &accepted); - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = ac_build_intrinsic( - &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - { - if (VERTEX_COUNTER_GDS_MODE == 0) { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 2) { - LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* If the draw call was split into multiple subdraws, each using - * a separate draw packet, we need to start counting from 0 for - * the first compute wave of the subdraw. - * - * vertex_counter contains the primitive ID of the first thread - * in the first wave. - * - * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: - */ - LLVMValueRef is_first_wave = - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, ""); - - /* Store the primitive count for ordered append, not vertex count. - * The idea is to avoid GDS initialization via CP DMA. The shader - * effectively stores the first count using "swap". - * - * if (first_wave) { - * ds.ordered.swap(num_prims_accepted); // store the first primitive count - * previous = 0; - * } else { - * previous = ds.ordered.add(num_prims_accepted) // add the primitive count - * } - */ - ac_build_ifcc(&ctx->ac, is_first_wave, 12604); - { - /* The GDS address is always 0 with ordered append. 
*/ - si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true); - LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); - } - ac_build_else(&ctx->ac, 12605); - { - LLVMBuildStore(builder, - si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted, - 0, true, true), - tmp_store); - } - ac_build_endif(&ctx->ac, 12604); - - start = LLVMBuildLoad(builder, tmp_store, ""); - } - } - si_exit_thread0_section(§ion, &start); - - /* Write the final vertex count to memory. An EOS/EOP event could do this, - * but those events are super slow and should be avoided if performance - * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE - * event like this. - */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), - 12606); - LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); - count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* GFX8 needs to disable caching, so that the CP can see the stored value. - * MTYPE=3 bypasses TC L2. - */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - ac_get_arg(&ctx->ac, param_vertex_count_addr), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt( - ctx->ac.i32, - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */), - 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0, - ac_glc | ac_slc); - } else { - LLVMBuildStore( - builder, count, - si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr))); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. 
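- * Together with the mbcnt prefix count this is a stream compaction:
- * each surviving lane writes at
- *
- *    vindex = start + popcount(accepted_mask & lanes_below_me)
- *
- * so accepted triangles land densely packed in the output buffer.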
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. */ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + LLVMTypeRef const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = ctx->f32; + else + const_desc_type = ctx->v4i32; + + struct si_function_info fninfo; + si_init_function_info(&fninfo); + + LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; + LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; + LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; + LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; + LLVMValueRef last_wave_prim_id, vertex_count_addr; + + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &index_buffers_and_constants); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &vb_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), + &const_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), + &sampler_desc); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); + add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); + + /* Block ID and thread ID inputs. */ + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", + GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
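+ * The inlined VS is called as an ordinary LLVM function, so its whole
+ * argument list is rebuilt by hand here; VertexID and InstanceID stay
+ * NULL and are filled in per vertex by the fetch loop further down.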
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = const_desc; + vs_params[num_vs_params++] = sampler_desc; + vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, + S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = base_vertex; + vs_params[num_vs_params++] = start_instance; + vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ + vs_params[num_vs_params++] = vb_desc; + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = + ac_build_imad(&ctx->ac, block_id, + LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->i32_0; + + if (key->opt.cs_instancing) { + /* Unpack num_prims_udiv_terms. */ + LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, + num_prims_udiv_multiplier, + post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, + prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). */ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, + LLVMConstInt(ctx->i32, 3, 0), + LLVMConstInt(ctx->i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, + LLVMConstInt(ctx->i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. 
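+ * For a fan, triangle i uses vertices {0, i + 1, i + 2}; the rotation
+ * below only keeps vertex 0 in the slot that the provoking-vertex
+ * convention expects.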
+ */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + index[2] = ctx->i32_0; + } else { + index[0] = ctx->i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. */ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, + index[i], ctx->i32_0, 1, + 0, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 6, 0), ""); + ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = + LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->i1true; + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. + */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, + LLVMConstInt(ctx->i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); + vertex_counter = LLVMBuildAnd(builder, vertex_counter, + LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + restart_index, ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, + not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. 
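+ * With lanes_below = (1 << tid) - 1 this boils down to:
+ *
+ *    strip_start = umsb(reset_mask & lanes_below) + 1;
+ *
+ * which yields 0 when no reset index precedes this thread (umsb
+ * returns -1 for an empty mask).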
+                                */
+                               LLVMValueRef preceding_threads_mask =
+                                       LLVMBuildSub(builder,
+                                                    LLVMBuildShl(builder, ctx->ac.i64_1,
+                                                                 LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
+                                                    ctx->ac.i64_1, "");
+
+                               LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+                               LLVMValueRef preceding_reset_threadmask =
+                                       LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+                               LLVMValueRef strip_start =
+                                       ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+                               strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");
+
+                               /* This flips the orientation based on reset indices within this wave only. */
+                               first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");
+
+                               LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+                               LLVMValueRef is_first_wave, current_wave_resets_index;
+
+                               /* Get the thread index where the last strip starts in this wave.
+                                *
+                                * If the last strip doesn't start in this wave, the thread index
+                                * will be 0.
+                                *
+                                * If the last strip starts in the next wave, the thread index will
+                                * be 64.
+                                */
+                               last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+                               last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");
+
+                               struct si_thread0_section section;
+                               si_enter_thread0_section(ctx, &section, thread_id);
+
+                               /* This must be done in the thread 0 section, because
+                                * we expect PrimID to be 0 for the whole first wave
+                                * in this expression.
+                                *
+                                * NOTE: This will need to be different if we wanna support
+                                * instancing with primitive restart.
+                                */
+                               is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
+                               is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+                                                            LLVMBuildNot(builder,
+                                                                         gds_prim_restart_continue, ""), "");
+                               current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
+                                                                         last_strip_start, ctx->i32_0, "");
+
+                               ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");
+
+                               /* Save the last strip start primitive index in GDS and read
+                                * the value that previous waves stored.
+                                *
+                                * if (is_first_wave || current_wave_resets_strip)
+                                *    // Read the value that previous waves stored and store a new one.
+                                *    first_is_odd = ds.ordered.swap(last_strip_start);
+                                * else
+                                *    // Just read the value that previous waves stored.
+                                *    first_is_odd = ds.ordered.add(0);
+                                */
+                               ac_build_ifcc(&ctx->ac,
+                                             LLVMBuildOr(builder, is_first_wave,
+                                                         current_wave_resets_index, ""), 12602);
+                               {
+                                       /* The GDS address is always 0 with ordered append. */
+                                       tmp = si_build_ds_ordered_op(ctx, "swap",
+                                                                    ordered_wave_id, last_strip_start,
+                                                                    1, true, false);
+                                       LLVMBuildStore(builder, tmp, ret);
+                               }
+                               ac_build_else(&ctx->ac, 12603);
+                               {
+                                       /* Just read the value from GDS. */
+                                       tmp = si_build_ds_ordered_op(ctx, "add",
+                                                                    ordered_wave_id, ctx->i32_0,
+                                                                    1, true, false);
+                                       LLVMBuildStore(builder, tmp, ret);
+                               }
+                               ac_build_endif(&ctx->ac, 12602);
+
+                               prev_wave_state = LLVMBuildLoad(builder, ret, "");
+                               /* Ignore the return value if this is the first wave. */
+                               prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
+                                                                 ctx->i32_0, prev_wave_state, "");
+                               si_exit_thread0_section(&section, &prev_wave_state);
+                               prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");
+
+                               /* If the strip start appears to be on thread 0 for the current primitive
+                                * (meaning the reset index is not present in this wave and might have
+                                * appeared in previous waves), use the value from GDS to determine
+                                * primitive orientation.
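+                                * (That single dword travels through the same ordered GDS slot as
+                                * the swap/add above, so each wave sees its predecessor's state in
+                                * launch order.)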
+ * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, + strip_start, ctx->i32_0, ""); + first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, + first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = + LLVMBuildXor(builder, first_is_odd, + LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); + + /* Determine the primitive orientation. + * Only swap the vertices that are not the provoking vertex. We need to keep + * the provoking vertex in place. + */ + if (key->opt.cs_provoking_vertex_first) { + LLVMValueRef index1 = index[1]; + LLVMValueRef index2 = index[2]; + index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); + index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); + } else { + LLVMValueRef index0 = index[0]; + LLVMValueRef index1 = index[1]; + index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); + index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); + } + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. */ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, + vp_scale, vp_translate, smallprim_precision, + &options); + + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = + ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, + &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. 
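+ * Only lane 0 issues the atomic; the start offset it gets back is then
+ * broadcast to the whole wave. Which counter is bumped depends on
+ * VERTEX_COUNTER_GDS_MODE: 0 = memory atomic, 1 = plain GDS atomic,
+ * 2 = ordered GDS append.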
*/
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, &section, thread_id);
+ {
+ if (VERTEX_COUNTER_GDS_MODE == 0) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+ LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+ LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+
+ /* If the draw call was split into multiple subdraws, each using
+ * a separate draw packet, we need to start counting from 0 for
+ * the first compute wave of the subdraw.
+ *
+ * vertex_counter contains the primitive ID of the first thread
+ * in the first wave.
+ *
+ * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+ */
+ LLVMValueRef is_first_wave =
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ vertex_counter, "");
+
+ /* Store the primitive count for ordered append, not vertex count.
+ * The idea is to avoid GDS initialization via CP DMA. The shader
+ * effectively stores the first count using "swap".
+ *
+ * if (first_wave) {
+ * ds.ordered.swap(num_prims_accepted); // store the first primitive count
+ * previous = 0;
+ * } else {
+ * previous = ds.ordered.add(num_prims_accepted); // add the primitive count
+ * }
+ */
+ ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
+ num_prims_accepted, 0, true, true);
+ LLVMBuildStore(builder, ctx->i32_0, tmp_store);
+ }
+ ac_build_else(&ctx->ac, 12605);
+ {
+ LLVMBuildStore(builder,
+ si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
+ num_prims_accepted, 0,
+ true, true),
+ tmp_store);
+ }
+ ac_build_endif(&ctx->ac, 12604);
+
+ start = LLVMBuildLoad(builder, tmp_store, "");
+ }
+ }
+ si_exit_thread0_section(&section, &start);
+
+ /* Write the final vertex count to memory. An EOS/EOP event could do this,
+ * but those events are super slow and should be avoided if performance
+ * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+ * event like this.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ last_wave_prim_id, ""), 12606);
+ LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+ count = LLVMBuildMul(builder, count,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+
+ /* GFX8 needs to disable caching, so that the CP can see the stored value.
+ * MTYPE=3 bypasses TC L2.
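+ * The buffer descriptor built by hand below exists only to set MTYPE;
+ * a plain pointer store would be cached in L2 and not visible to the CP.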
+ */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + vertex_count_addr, + LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->i32, 4, 0), + LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_MTYPE(3 /* uncached */), 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, + ctx->i32_0, 0, ac_glc | ac_slc, false); + } else { + LLVMBuildStore(builder, count, + si_expand_32bit_pointer(ctx, vertex_count_addr)); + } + ac_build_endif(&ctx->ac, 12606); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + ac_build_ifcc(&ctx->ac, accepted, 16607); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, + LLVMConstInt(ctx->i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. + */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. */ + LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); + LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); + + if (!ac_has_vec3_support(ctx->ac.chip_class, true)) + vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); + + ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, + vindex, ctx->i32_0, 3, + ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); + } + ac_build_endif(&ctx->ac, 16607); + + LLVMBuildRetVoid(builder); } /* Return false if the shader isn't ready. */ static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) + const struct pipe_draw_info *info, + bool primitive_restart) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. 
*/ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->shader.ps.cso; - - key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, - &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. */ + assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. + */ + if (primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = + rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. 
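+ * Culling both front and back faces rejects every primitive.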
*/ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = + sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = + sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, + &compiler_state, &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; } static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) { - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs.priv) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = - VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, - RADEON_FLAG_DRIVER_INTERNAL); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, - RADEON_FLAG_DRIVER_INTERNAL); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); - } - - if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs, - &sctx->gfx_cs, num_oa_counters > 0)) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = si_aligned_buffer_create( - sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : + VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, + RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, + 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, + num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = + si_aligned_buffer_create(sctx->b.screen, + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, + 2 * 1024 * 1024); + if (!sctx->index_ring) + return false; + } + return true; } static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) { - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; } enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draws, - unsigned num_draws, bool primitive_restart, - unsigned total_count) +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + bool primitive_restart) { - /* If the compute shader compilation isn't finished, this returns false. */ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = total_count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - unsigned vert_count_per_subdraw = 0; - - if (prim == PIPE_PRIM_TRIANGLES) - vert_count_per_subdraw = split_prims_draw_level * 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - vert_count_per_subdraw = split_prims_draw_level; - else - unreachable("shouldn't get here"); - - /* Split multi draws first. */ - if (num_draws > 1) { - unsigned count = 0; - unsigned first_draw = 0; - unsigned num_draws_split = 0; - - for (unsigned i = 0; i < num_draws; i++) { - if (count && count + draws[i].count > vert_count_per_subdraw) { - /* Submit previous draws. */ - sctx->b.draw_vbo(&sctx->b, info, NULL, draws + first_draw, num_draws_split); - count = 0; - first_draw = i; - num_draws_split = 0; - } - - if (draws[i].count > vert_count_per_subdraw) { - /* Submit just 1 draw. It will be split. 
*/ - sctx->b.draw_vbo(&sctx->b, info, NULL, draws + i, 1); - assert(count == 0); - assert(first_draw == i); - assert(num_draws_split == 0); - first_draw = i + 1; - continue; - } - - count += draws[i].count; - num_draws_split++; - } - return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT; - } - - /* Split single draws if splitting multi draws isn't enough. */ - struct pipe_draw_info split_draw = *info; - struct pipe_draw_start_count split_draw_range = draws[0]; - unsigned base_start = split_draw_range.start; - - split_draw.primitive_restart = primitive_restart; - - if (prim == PIPE_PRIM_TRIANGLES) { - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1); - - if (start == 0 && primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL) * num_draws; - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 30; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; + /* If the compute shader compilation isn't finished, this returns false. 
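+ * In that case the draw falls back to the normal gfx path.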
*/ + if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && + num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + split_draw.primitive_restart = primitive_restart; + + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. */ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && + primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. */ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. 
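+ * The NOP makes the IB non-empty, so the flush below actually
+ * submits it and a larger IB can be allocated.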
+ */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; } void si_compute_signal_gfx(struct si_context *sctx) { - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, NULL, - sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. */ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : + EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + NULL, + sctx->compute_rewind_va | + ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; } /* Dispatch a primitive discard compute shader. 
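+ * The shader culls primitives on the compute queue, writes the
+ * surviving indices into the index ring, and the matching draw
+ * packets in the gfx IB consume them.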
*/ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned count, unsigned index_size, - unsigned base_vertex, uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) { - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - */ - if (sctx->chip_class >= GFX10) { - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | - S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); - radeon_end(); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM, - sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_begin(cs); - radeon_set_sh_reg( - cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. 
*/ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. */ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3, false); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - radeon_end(); - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, - RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, - 1, 1, WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT - : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT - : V_008F0C_IMG_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 - : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 - : V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = - S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? 
vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer - ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset - : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. */ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - radeon_end(); - } - } - - /* Set shader registers. 
*/ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit( - cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, - MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); - radeon_end(); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - - si_cp_wait_mem( - sctx, gfx_cs, - sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. 
*/ - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - radeon_end(); - - radeon_begin_again(cs); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0 - ? count_va - : VERTEX_COUNTER_GDS_MODE == 1 - ? gds_offset - : start_prim | gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(cull_info.small_prim_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. */ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - radeon_end(); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. 
*/ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer( - sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0, - SI_OP_CPDMA_SKIP_CHECK_CS_SPACE, SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, + sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + + /* This needs to be done at the beginning of IBs due to possible + * TTM buffer moves in the kernel. + * + * TODO: update for GFX10 + */ + si_emit_surface_sync(sctx, cs, + S_0085F0_TC_ACTION_ENA(1) | + S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | + S_0085F0_SH_ICACHE_ACTION_ENA(1) | + S_0085F0_SH_KCACHE_ACTION_ENA(1)); + + /* Restore the GDS prim restart counter if needed. */ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, + COPY_DATA_GDS, NULL, 4, + COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | + S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | + S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, + RADEON_USAGE_READ, RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + + sctx->last_ib_barrier_buf_offset, 1, 1, + WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. 
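+ * The buffer is suballocated from the per-IB index ring; each
+ * accepted primitive needs vertices_per_prim 32-bit indices.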
*/ + output_indexbuf_size = align(output_indexbuf_size, + sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. */ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, + si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, + (void**)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | + S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : + index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : + V_008F0C_BUF_DATA_FORMAT_32); + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | + S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + + /* Viewport state. + * This is needed by the small primitive culling, because it's done + * in screen space. + */ + float scale[2], translate[2]; + + scale[0] = sctx->viewports.states[0].scale[0]; + scale[1] = sctx->viewports.states[0].scale[1]; + translate[0] = sctx->viewports.states[0].translate[0]; + translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-scale[0] + translate[0] <= scale[0] + translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. + * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + scale[1] = -scale[1]; + translate[1] = -translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + for (unsigned i = 0; i < 2; i++) { + scale[i] *= num_samples; + translate[i] *= num_samples; + } + + desc[8] = fui(scale[0]); + desc[9] = fui(scale[1]); + desc[10] = fui(translate[0]); + desc[11] = fui(translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. 
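+ * The precision below matches the rasterizer quantization mode
+ * (1/4096th, 1/1024th or 1/256th of a pixel), scaled by the
+ * sample count.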
*/ + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. */ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. */ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. 
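+ * Emitted only when the shader changes; consecutive dispatches of
+ * the same shader reuse the registers.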
*/ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | + S_00B848_DX10_CLAMP(1)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | + S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sctx->screen->info, + WAVES_PER_TG, + MAX_WAVES_PER_SH, + THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. + */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem(sctx, gfx_cs, + sctx->compute_rewind_va | + (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, + WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. 
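+ * The first subdraw initializes all user SGPRs; later subdraws only
+ * update the three that change.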
*/ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(primitive_restart && + info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, + VERTEX_COUNTER_GDS_MODE == 0 ? count_va : + VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : + start_prim | + gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | + (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. */ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. + */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, + EOP_DATA_SEL_GDS, + NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), + SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. 
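+ * The CP DMA clear runs behind the dispatch, in parallel with the
+ * compute shaders, so it costs no extra time.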
*/ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer(sctx, cs, NULL, offset, + GDS_SIZE_UNORDERED - offset, + 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | + SI_CPDMA_SKIP_GFX_SYNC | + SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } } diff --git a/lib/mesa/src/gallium/drivers/virgl/Android.mk b/lib/mesa/src/gallium/drivers/virgl/Android.mk index a64828e90..c06c16558 100644 --- a/lib/mesa/src/gallium/drivers/virgl/Android.mk +++ b/lib/mesa/src/gallium/drivers/virgl/Android.mk @@ -30,7 +30,22 @@ LOCAL_SRC_FILES := \ LOCAL_MODULE := libmesa_pipe_virgl -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +intermediates := $(call local-generated-sources-dir) +LOCAL_GENERATED_SOURCES := $(intermediates)/virgl/virgl_driinfo.h + +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/virgl_driinfo.h.in + +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py + +$(intermediates)/virgl/virgl_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) + +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/targets/dri/Android.mk b/lib/mesa/src/gallium/targets/dri/Android.mk index 6ec4055f1..c7d564a23 100644 --- a/lib/mesa/src/gallium/targets/dri/Android.mk +++ b/lib/mesa/src/gallium/targets/dri/Android.mk @@ -42,9 +42,7 @@ LOCAL_LDFLAGS := \ LOCAL_SHARED_LIBRARIES := \ libdl \ libglapi \ - libz \ - liblog \ - libsync + libz # If Android version >=8 MESA should static link libexpat else should dynamic link ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) @@ -56,20 +54,9 @@ LOCAL_SHARED_LIBRARIES += \ endif LOCAL_STATIC_LIBRARIES += \ - libetnaviv_drm \ - libfreedreno_common \ libfreedreno_drm \ - libfreedreno_ir2 \ libfreedreno_ir3 \ - libfreedreno_perfcntrs \ - libmesa_gallium \ - libpanfrost_lib \ - libpanfrost_bifrost \ - libpanfrost_bifrost_disasm \ - libpanfrost_midgard \ - libpanfrost_midgard_disasm \ libpanfrost_shared \ - libpanfrost_util \ ifeq ($(USE_LIBBACKTRACE),true) LOCAL_SHARED_LIBRARIES += libbacktrace @@ -87,12 +74,11 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_nir \ libmesa_dri_common \ libmesa_megadriver_stub \ + libmesa_gallium \ libmesa_pipe_loader \ libmesa_util \ libmesa_loader -LOCAL_SHARED_LIBRARIES += libcutils - # sort GALLIUM_SHARED_LIBS to remove any duplicates LOCAL_SHARED_LIBRARIES += $(sort $(GALLIUM_SHARED_LIBS)) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk index 90f56e45b..0b8edf972 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk @@ -21,8 +21,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-ifeq ($(MESA_ENABLE_LLVM),true) - LOCAL_PATH := $(call my-dir) # get C_SOURCES @@ -48,5 +46,3 @@ ifneq ($(HAVE_GALLIUM_RADEONSI),) $(eval GALLIUM_LIBS += $(LOCAL_MODULE) $(LOCAL_STATIC_LIBRARIES)) $(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) endif - -endif # MESA_ENABLE_LLVM==true diff --git a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk index 31edabd68..32091bea0 100644 --- a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk @@ -25,7 +25,7 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm +LOCAL_SHARED_LIBRARIES := libdrm_etnaviv LOCAL_MODULE := libmesa_winsys_etnaviv diff --git a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk index 669559583..09edab391 100644 --- a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk @@ -27,9 +27,6 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/src/freedreno/common - LOCAL_SHARED_LIBRARIES := libdrm_freedreno LOCAL_STATIC_LIBRARIES := libfreedreno_registers diff --git a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk index f3d9df79c..5e2500774 100644 --- a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk @@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES) LOCAL_MODULE := libmesa_winsys_virgl -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio - LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk index 454d830d0..5b33f6771 100644 --- a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk +++ b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk @@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES) LOCAL_MODULE := libmesa_winsys_virgl_vtest -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio - LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/intel/Android.common.mk b/lib/mesa/src/intel/Android.common.mk index 0e1118e65..79d9f1284 100644 --- a/lib/mesa/src/intel/Android.common.mk +++ b/lib/mesa/src/intel/Android.common.mk @@ -36,8 +36,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/mapi \ - $(MESA_TOP)/src/mesa \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_git_sha1,,) + $(MESA_TOP)/src/mesa LOCAL_SHARED_LIBRARIES := libz liblog diff --git a/lib/mesa/src/intel/Android.dev.mk b/lib/mesa/src/intel/Android.dev.mk index 5c7ddd4d3..4f14b0362 100644 --- a/lib/mesa/src/intel/Android.dev.mk +++ b/lib/mesa/src/intel/Android.dev.mk @@ -29,12 +29,7 @@ LOCAL_MODULE := libmesa_intel_dev LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_STATIC_LIBRARIES := \ - libmesa_git_sha1 - -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/include \ - $(MESA_TOP)/src +LOCAL_C_INCLUDES := $(MESA_TOP)/include LOCAL_SRC_FILES := $(DEV_FILES) diff --git a/lib/mesa/src/intel/Android.genxml.mk b/lib/mesa/src/intel/Android.genxml.mk index e5548e2b7..8b867920c 100644 --- a/lib/mesa/src/intel/Android.genxml.mk +++ b/lib/mesa/src/intel/Android.genxml.mk @@ -96,21 +96,16 @@ $(intermediates)/genxml/gen9_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen9.xm $(intermediates)/genxml/gen9_pack.h: 
$(LOCAL_PATH)/genxml/gen9.xml $(LOCAL_PATH)/genxml/gen_pack_header.py $(call header-gen) +$(intermediates)/genxml/gen10_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py +$(intermediates)/genxml/gen10_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen10.xml +$(intermediates)/genxml/gen10_pack.h: $(LOCAL_PATH)/genxml/gen10.xml $(LOCAL_PATH)/genxml/gen_pack_header.py + $(call header-gen) + $(intermediates)/genxml/gen11_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py $(intermediates)/genxml/gen11_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen11.xml $(intermediates)/genxml/gen11_pack.h: $(LOCAL_PATH)/genxml/gen11.xml $(LOCAL_PATH)/genxml/gen_pack_header.py $(call header-gen) -$(intermediates)/genxml/gen12_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py -$(intermediates)/genxml/gen12_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen12.xml -$(intermediates)/genxml/gen12_pack.h: $(LOCAL_PATH)/genxml/gen12.xml $(LOCAL_PATH)/genxml/gen_pack_header.py - $(call header-gen) - -$(intermediates)/genxml/gen125_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py -$(intermediates)/genxml/gen125_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen125.xml -$(intermediates)/genxml/gen125_pack.h: $(LOCAL_PATH)/genxml/gen125.xml $(LOCAL_PATH)/genxml/gen_pack_header.py - $(call header-gen) - $(intermediates)/genxml/genX_xml.h: $(addprefix $(MESA_TOP)/src/intel/,$(GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" diff --git a/lib/mesa/src/intel/Android.isl.mk b/lib/mesa/src/intel/Android.isl.mk index e1ef62b73..07a64b8ed 100644 --- a/lib/mesa/src/intel/Android.isl.mk +++ b/lib/mesa/src/intel/Android.isl.mk @@ -25,20 +25,19 @@ # --------------------------------------- LIBISL_GENX_COMMON_INCLUDES := \ - $(MESA_TOP)/src/ \ - $(MESA_TOP)/src/gallium/include/ + $(MESA_TOP)/src/ # --------------------------------------- -# Build libmesa_isl_gfx4 +# Build libmesa_isl_gen4 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx4 +LOCAL_MODULE := libmesa_isl_gen4 -LOCAL_SRC_FILES := $(ISL_GFX4_FILES) +LOCAL_SRC_FILES := $(ISL_GEN4_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=40 +LOCAL_CFLAGS := -DGEN_VERSIONx10=40 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -48,16 +47,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx5 +# Build libmesa_isl_gen5 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx5 +LOCAL_MODULE := libmesa_isl_gen5 -LOCAL_SRC_FILES := $(ISL_GFX5_FILES) +LOCAL_SRC_FILES := $(ISL_GEN5_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=50 +LOCAL_CFLAGS := -DGEN_VERSIONx10=50 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -67,16 +66,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx6 +# Build libmesa_isl_gen6 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx6 +LOCAL_MODULE := libmesa_isl_gen6 -LOCAL_SRC_FILES := $(ISL_GFX6_FILES) +LOCAL_SRC_FILES := $(ISL_GEN6_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=60 +LOCAL_CFLAGS := -DGEN_VERSIONx10=60 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -86,16 +85,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build 
libmesa_isl_gfx7 +# Build libmesa_isl_gen7 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx7 +LOCAL_MODULE := libmesa_isl_gen7 -LOCAL_SRC_FILES := $(ISL_GFX7_FILES) +LOCAL_SRC_FILES := $(ISL_GEN7_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -105,16 +104,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx75 +# Build libmesa_isl_gen75 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx75 +LOCAL_MODULE := libmesa_isl_gen75 -LOCAL_SRC_FILES := $(ISL_GFX75_FILES) +LOCAL_SRC_FILES := $(ISL_GEN75_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -124,16 +123,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx8 +# Build libmesa_isl_gen8 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx8 +LOCAL_MODULE := libmesa_isl_gen8 -LOCAL_SRC_FILES := $(ISL_GFX8_FILES) +LOCAL_SRC_FILES := $(ISL_GEN8_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -143,16 +142,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx9 +# Build libmesa_isl_gen9 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx9 +LOCAL_MODULE := libmesa_isl_gen9 -LOCAL_SRC_FILES := $(ISL_GFX9_FILES) +LOCAL_SRC_FILES := $(ISL_GEN9_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -162,16 +161,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx11 +# Build libmesa_isl_gen10 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx11 +LOCAL_MODULE := libmesa_isl_gen10 -LOCAL_SRC_FILES := $(ISL_GFX11_FILES) +LOCAL_SRC_FILES := $(ISL_GEN10_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -181,35 +180,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx12 +# Build libmesa_isl_gen11 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx12 +LOCAL_MODULE := libmesa_isl_gen11 -LOCAL_SRC_FILES := $(ISL_GFX12_FILES) +LOCAL_SRC_FILES := $(ISL_GEN11_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 - -LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) - -LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml - -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - -# --------------------------------------- -# Build libmesa_isl_gfx125 -# --------------------------------------- - -include $(CLEAR_VARS) - -LOCAL_MODULE := libmesa_isl_gfx125 - -LOCAL_SRC_FILES := $(ISL_GFX125_FILES) - -LOCAL_CFLAGS := -DGFX_VERx10=125 +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -279,16 +259,15 @@ LOCAL_C_INCLUDES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := $(MESA_TOP)/src/intel LOCAL_WHOLE_STATIC_LIBRARIES := \ - libmesa_isl_gfx4 \ - libmesa_isl_gfx5 \ - libmesa_isl_gfx6 \ - libmesa_isl_gfx7 \ - 
libmesa_isl_gfx75 \ - libmesa_isl_gfx8 \ - libmesa_isl_gfx9 \ - libmesa_isl_gfx11 \ - libmesa_isl_gfx12 \ - libmesa_isl_gfx125 \ + libmesa_isl_gen4 \ + libmesa_isl_gen5 \ + libmesa_isl_gen6 \ + libmesa_isl_gen7 \ + libmesa_isl_gen75 \ + libmesa_isl_gen8 \ + libmesa_isl_gen9 \ + libmesa_isl_gen10 \ + libmesa_isl_gen11 \ libmesa_genxml \ libmesa_isl_tiled_memcpy diff --git a/lib/mesa/src/intel/Android.vulkan.mk b/lib/mesa/src/intel/Android.vulkan.mk index 81ced17e2..00eb49a38 100644 --- a/lib/mesa/src/intel/Android.vulkan.mk +++ b/lib/mesa/src/intel/Android.vulkan.mk @@ -23,7 +23,9 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) include $(LOCAL_PATH)/Makefile.sources -VK_ENTRYPOINTS_GEN_SCRIPT := $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py +ANV_ENTRYPOINTS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py +ANV_EXTENSIONS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_EXTENSIONS_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions.py VULKAN_API_XML := $(MESA_TOP)/src/vulkan/registry/vk.xml VULKAN_COMMON_INCLUDES := \ @@ -51,7 +53,6 @@ VULKAN_COMMON_HEADER_LIBRARIES := \ endif ANV_STATIC_LIBRARIES := \ - libmesa_vulkan_util \ libmesa_vulkan_common \ libmesa_genxml \ libmesa_nir @@ -63,15 +64,15 @@ ANV_SHARED_LIBRARIES += libnativewindow endif # -# libanv for gfx7 +# libanv for gen7 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx7 +LOCAL_MODULE := libmesa_anv_gen7 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX7_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_SRC_FILES := $(VULKAN_GEN7_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -84,15 +85,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx75 +# libanv for gen75 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx75 +LOCAL_MODULE := libmesa_anv_gen75 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX75_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_SRC_FILES := $(VULKAN_GEN75_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -105,15 +106,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx8 +# libanv for gen8 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx8 +LOCAL_MODULE := libmesa_anv_gen8 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX8_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_SRC_FILES := $(VULKAN_GEN8_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -126,15 +127,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx9 +# libanv for gen9 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx9 +LOCAL_MODULE := libmesa_anv_gen9 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX9_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_SRC_FILES := $(VULKAN_GEN9_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -147,15 +148,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx11 +# libanv for gen10 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx11 +LOCAL_MODULE := libmesa_anv_gen10 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX11_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_SRC_FILES := $(VULKAN_GEN10_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -168,15 +169,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# 
libanv for gfx12 +# libanv for gen11 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx12 +LOCAL_MODULE := libmesa_anv_gen11 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX12_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 +LOCAL_SRC_FILES := $(VULKAN_GEN11_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -189,28 +190,6 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx125 -# - -include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx125 -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -LOCAL_SRC_FILES := $(VULKAN_GFX125_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=125 - -LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) - -LOCAL_STATIC_LIBRARIES := $(ANV_STATIC_LIBRARIES) - -LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) -LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) - -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - - -# # libmesa_vulkan_common # @@ -235,25 +214,39 @@ LOCAL_STATIC_LIBRARIES := \ libmesa_vulkan_util \ libmesa_util -LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(VULKAN_GENERATED_FILES)) +# The rule generates both C and H files, but due to some strange +# reason generating the files once leads to link-time issues. +# Work around create them here as well - we're safe from race +# conditions since they are stored in another location. -ANV_VK_ENTRYPOINTS_GEN_ARGS= \ - --proto --weak --prefix anv \ - --device-prefix gfx7 --device-prefix gfx75 \ - --device-prefix gfx8 --device-prefix gfx9 \ - --device-prefix gfx11 --device-prefix gfx12 \ - --device-prefix gfx125 +LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(VULKAN_GENERATED_FILES)) -$(intermediates)/vulkan/anv_entrypoints.c: $(VK_ENTRYPOINTS_GEN_SCRIPT) \ +$(intermediates)/vulkan/anv_entrypoints.c: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(MESA_PYTHON2) $(VK_ENTRYPOINTS_GEN_SCRIPT) \ + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ --xml $(VULKAN_API_XML) \ - $(ANV_VK_ENTRYPOINTS_GEN_ARGS) \ - --out-c $@ --out-h $(dir $@)/anv_entrypoints.h + --outdir $(dir $@) $(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/anv_entrypoints.c +$(intermediates)/vulkan/anv_extensions.c: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-c $@ + +$(intermediates)/vulkan/anv_extensions.h: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-h $@ + LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) @@ -290,19 +283,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_compiler \ libmesa_intel_common \ libmesa_intel_dev \ - libmesa_intel_perf \ libmesa_vulkan_common \ libmesa_vulkan_util \ - libmesa_anv_gfx7 \ - libmesa_anv_gfx75 \ - libmesa_anv_gfx8 \ - libmesa_anv_gfx9 \ - libmesa_anv_gfx11 \ - libmesa_anv_gfx12 \ - libmesa_anv_gfx125 \ + libmesa_anv_gen7 \ + libmesa_anv_gen75 \ + libmesa_anv_gen8 \ + libmesa_anv_gen9 \ + libmesa_anv_gen10 \ + libmesa_anv_gen11 \ libmesa_intel_compiler -LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog libcutils +LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) # If Android version >=8 MESA 
should static link libexpat else should dynamic link @@ -314,5 +305,9 @@ else libexpat endif +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES += libgrallocusage +endif + include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/lib/mesa/src/intel/dev/gen_debug.c b/lib/mesa/src/intel/dev/gen_debug.c index a99abe48e..a4823286d 100644 --- a/lib/mesa/src/intel/dev/gen_debug.c +++ b/lib/mesa/src/intel/dev/gen_debug.c @@ -29,17 +29,14 @@ * miscellaneous debugging code. */ -#include <stdio.h> #include <stdlib.h> -#include <string.h> #include "dev/gen_debug.h" -#include "git_sha1.h" #include "util/macros.h" #include "util/debug.h" #include "c11/threads.h" -uint64_t intel_debug = 0; +uint64_t INTEL_DEBUG = 0; static const struct debug_control debug_control[] = { { "tex", DEBUG_TEXTURE}, @@ -92,12 +89,6 @@ static const struct debug_control debug_control[] = { { "tcs8", DEBUG_TCS_EIGHT_PATCH }, { "bt", DEBUG_BT }, { "pc", DEBUG_PIPE_CONTROL }, - { "nofc", DEBUG_NO_FAST_CLEAR }, - { "no32", DEBUG_NO32 }, - { "shaders", DEBUG_WM | DEBUG_VS | DEBUG_TCS | - DEBUG_TES | DEBUG_GS | DEBUG_CS | - DEBUG_RT }, - { "rt", DEBUG_RT }, { NULL, 0 } }; @@ -111,21 +102,15 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage) [MESA_SHADER_GEOMETRY] = DEBUG_GS, [MESA_SHADER_FRAGMENT] = DEBUG_WM, [MESA_SHADER_COMPUTE] = DEBUG_CS, - - [MESA_SHADER_RAYGEN] = DEBUG_RT, - [MESA_SHADER_ANY_HIT] = DEBUG_RT, - [MESA_SHADER_CLOSEST_HIT] = DEBUG_RT, - [MESA_SHADER_MISS] = DEBUG_RT, - [MESA_SHADER_INTERSECTION] = DEBUG_RT, - [MESA_SHADER_CALLABLE] = DEBUG_RT, }; + STATIC_ASSERT(MESA_SHADER_STAGES == 6); return flags[stage]; } static void brw_process_intel_debug_variable_once(void) { - intel_debug = parse_debug_string(getenv("INTEL_DEBUG"), debug_control); + INTEL_DEBUG = parse_debug_string(getenv("INTEL_DEBUG"), debug_control); } void @@ -136,108 +121,3 @@ brw_process_intel_debug_variable(void) call_once(&process_intel_debug_variable_flag, brw_process_intel_debug_variable_once); } - -static uint64_t debug_identifier[4] = { - 0xffeeddccbbaa9988, - 0x7766554433221100, - 0xffeeddccbbaa9988, - 0x7766554433221100, -}; - -void * -intel_debug_identifier(void) -{ - return debug_identifier; -} - -uint32_t -intel_debug_identifier_size(void) -{ - return sizeof(debug_identifier); -} - -uint32_t -intel_debug_write_identifiers(void *_output, - uint32_t output_size, - const char *driver_name) -{ - void *output = _output, *output_end = _output + output_size; - - assert(output_size > intel_debug_identifier_size()); - - memcpy(output, intel_debug_identifier(), intel_debug_identifier_size()); - output += intel_debug_identifier_size(); - - for (uint32_t id = GEN_DEBUG_BLOCK_TYPE_DRIVER; id < GEN_DEBUG_BLOCK_TYPE_MAX; id++) { - switch (id) { - case GEN_DEBUG_BLOCK_TYPE_DRIVER: { - struct gen_debug_block_driver driver_desc = { - .base = { - .type = id, - }, - }; - int len = snprintf(output + sizeof(driver_desc), - output_end - (output + sizeof(driver_desc)), - "%s " PACKAGE_VERSION " build " MESA_GIT_SHA1, - driver_name); - driver_desc.base.length = sizeof(driver_desc) + len + 1; - memcpy(output, &driver_desc, sizeof(driver_desc)); - output += driver_desc.base.length; - break; - } - - case GEN_DEBUG_BLOCK_TYPE_FRAME: { - struct gen_debug_block_frame frame_desc = { - .base = { - .type = GEN_DEBUG_BLOCK_TYPE_FRAME, - .length = sizeof(frame_desc), - }, - }; - memcpy(output, &frame_desc, sizeof(frame_desc)); - output += sizeof(frame_desc); - break; - } - - default: - unreachable("Missing 
identifier write"); - } - - assert(output < output_end); - } - - struct gen_debug_block_base end = { - .type = GEN_DEBUG_BLOCK_TYPE_END, - .length = sizeof(end), - }; - memcpy(output, &end, sizeof(end)); - output += sizeof(end); - - assert(output < output_end); - - /* Return the how many bytes where written, so that the rest of the buffer - * can be used for other things. - */ - return output - _output; -} - -void * -intel_debug_get_identifier_block(void *_buffer, - uint32_t buffer_size, - enum gen_debug_block_type type) -{ - void *buffer = _buffer + intel_debug_identifier_size(), - *end_buffer = _buffer + buffer_size; - - while (buffer < end_buffer) { - struct gen_debug_block_base *item = buffer; - - if (item->type == type) - return item; - if (item->type == GEN_DEBUG_BLOCK_TYPE_END) - return NULL; - - buffer += item->length; - } - - return NULL; -} diff --git a/lib/mesa/src/intel/dev/gen_debug.h b/lib/mesa/src/intel/dev/gen_debug.h index efaea6f34..edd3f8a66 100644 --- a/lib/mesa/src/intel/dev/gen_debug.h +++ b/lib/mesa/src/intel/dev/gen_debug.h @@ -28,7 +28,6 @@ #include <stdint.h> #include "compiler/shader_enums.h" -#include "util/macros.h" #ifdef __cplusplus extern "C" { @@ -40,9 +39,7 @@ extern "C" { * list of debugging flags, as well as some macros for handling them. */ -extern uint64_t intel_debug; - -#define INTEL_DEBUG __builtin_expect(intel_debug, 0) +extern uint64_t INTEL_DEBUG; #define DEBUG_TEXTURE (1ull << 0) #define DEBUG_STATE (1ull << 1) @@ -90,9 +87,6 @@ extern uint64_t intel_debug; #define DEBUG_TCS_EIGHT_PATCH (1ull << 43) #define DEBUG_BT (1ull << 44) #define DEBUG_PIPE_CONTROL (1ull << 45) -#define DEBUG_NO_FAST_CLEAR (1ull << 46) -#define DEBUG_NO32 (1ull << 47) -#define DEBUG_RT (1ull << 48) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME @@ -101,7 +95,7 @@ extern uint64_t intel_debug; #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \ DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \ - DEBUG_TCS_EIGHT_PATCH | DEBUG_NO32) + DEBUG_TCS_EIGHT_PATCH) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" @@ -119,7 +113,7 @@ extern uint64_t intel_debug; #endif /* HAVE_ANDROID_PLATFORM */ #define DBG(...) do { \ - if (INTEL_DEBUG & FILE_DEBUG_FLAG) \ + if (unlikely(INTEL_DEBUG & FILE_DEBUG_FLAG)) \ dbg_printf(__VA_ARGS__); \ } while(0) @@ -127,50 +121,6 @@ extern uint64_t intel_debug_flag_for_shader_stage(gl_shader_stage stage); extern void brw_process_intel_debug_variable(void); -/* Below is a list of structure located in the identifier buffer. The driver - * can fill those in for debug purposes. 
- */ - -enum gen_debug_block_type { - /* End of the debug blocks */ - GEN_DEBUG_BLOCK_TYPE_END = 1, - - /* Driver identifier (struct gen_debug_block_driver) */ - GEN_DEBUG_BLOCK_TYPE_DRIVER, - - /* Frame identifier (struct gen_debug_block_frame) */ - GEN_DEBUG_BLOCK_TYPE_FRAME, - - /* Internal, never to be written out */ - GEN_DEBUG_BLOCK_TYPE_MAX, -}; - -struct gen_debug_block_base { - uint32_t type; /* enum gen_debug_block_type */ - uint32_t length; /* inclusive of this structure size */ -}; - -struct gen_debug_block_driver { - struct gen_debug_block_base base; - uint8_t description[]; -}; - -struct gen_debug_block_frame { - struct gen_debug_block_base base; - uint64_t frame_id; -}; - -extern void *intel_debug_identifier(void); -extern uint32_t intel_debug_identifier_size(void); - -extern uint32_t intel_debug_write_identifiers(void *output, - uint32_t output_size, - const char *driver_name); - -extern void *intel_debug_get_identifier_block(void *buffer, - uint32_t buffer_size, - enum gen_debug_block_type type); - #ifdef __cplusplus } #endif diff --git a/lib/mesa/src/intel/dev/gen_device_info_test.c b/lib/mesa/src/intel/dev/gen_device_info_test.c index 815213938..495772f18 100644 --- a/lib/mesa/src/intel/dev/gen_device_info_test.c +++ b/lib/mesa/src/intel/dev/gen_device_info_test.c @@ -13,9 +13,8 @@ main(int argc, char *argv[]) const char *name; } chipsets[] = { #undef CHIPSET -#define CHIPSET(id, family, family_str, str_name) { .pci_id = id, .name = str_name, }, +#define CHIPSET(id, family, str_name) { .pci_id = id, .name = str_name, }, #include "pci_ids/i965_pci_ids.h" -#include "pci_ids/iris_pci_ids.h" }; for (uint32_t i = 0; i < ARRAY_SIZE(chipsets); i++) { @@ -23,11 +22,11 @@ main(int argc, char *argv[]) assert(gen_get_device_info_from_pci_id(chipsets[i].pci_id, &devinfo)); - assert(devinfo.ver != 0); + assert(devinfo.gen != 0); + assert(devinfo.urb.size != 0); assert(devinfo.num_eu_per_subslice != 0); assert(devinfo.num_thread_per_eu != 0); assert(devinfo.timestamp_frequency != 0); - assert(devinfo.cs_prefetch_size > 0); } return 0; diff --git a/lib/mesa/src/intel/perf/gen_perf.c b/lib/mesa/src/intel/perf/gen_perf.c index 0d3c9a22e..6b10b9d53 100644 --- a/lib/mesa/src/intel/perf/gen_perf.c +++ b/lib/mesa/src/intel/perf/gen_perf.c @@ -29,44 +29,363 @@ #include <unistd.h> #include <errno.h> -#ifndef HAVE_DIRENT_D_TYPE -#include <limits.h> // PATH_MAX -#endif - #include <drm-uapi/i915_drm.h> -#include "common/intel_gem.h" - -#include "dev/gen_debug.h" -#include "dev/gen_device_info.h" - -#include "perf/gen_perf.h" -#include "perf/gen_perf_regs.h" +#include "common/gen_gem.h" +#include "gen_perf.h" #include "perf/gen_perf_mdapi.h" #include "perf/gen_perf_metrics.h" -#include "perf/gen_perf_private.h" +#include "dev/gen_debug.h" +#include "dev/gen_device_info.h" #include "util/bitscan.h" -#include "util/macros.h" -#include "util/mesa-sha1.h" #include "util/u_math.h" #define FILE_DEBUG_FLAG DEBUG_PERFMON +#define MI_RPC_BO_SIZE 4096 +#define MI_FREQ_START_OFFSET_BYTES (3072) +#define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2) +#define MI_FREQ_END_OFFSET_BYTES (3076) + +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) + +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK 
INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + +#define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8) +#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8) + +#define MAP_READ (1 << 0) +#define MAP_WRITE (1 << 1) #define OA_REPORT_INVALID_CTX_ID (0xffffffff) +/** + * Periodic OA samples are read() into these buffer structures via the + * i915 perf kernel interface and appended to the + * perf_ctx->sample_buffers linked list. When we process the + * results of an OA metrics query we need to consider all the periodic + * samples between the Begin and End MI_REPORT_PERF_COUNT command + * markers. + * + * 'Periodic' is a simplification as there are other automatic reports + * written by the hardware also buffered here. + * + * Considering three queries, A, B and C: + * + * Time ----> + * ________________A_________________ + * | | + * | ________B_________ _____C___________ + * | | | | | | + * + * And an illustration of sample buffers read over this time frame: + * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ] + * + * These nodes may hold samples for query A: + * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ] + * + * These nodes may hold samples for query B: + * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ] + * + * These nodes may hold samples for query C: + * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ] + * + * The illustration assumes we have an even distribution of periodic + * samples so all nodes have the same size plotted against time: + * + * Note, to simplify code, the list is never empty. + * + * With overlapping queries we can see that periodic OA reports may + * relate to multiple queries and care needs to be taken to keep + * track of sample buffers until there are no queries that might + * depend on their contents. + * + * We use a node ref counting system where a reference ensures that a + * node and all following nodes can't be freed/recycled until the + * reference drops to zero. + * + * E.g. with a ref of one here: + * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * These nodes could be freed or recycled ("reaped"): + * [ 0 ][ 0 ] + * + * These must be preserved until the leading ref drops to zero: + * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * When a query starts we take a reference on the current tail of + * the list, knowing that no already-buffered samples can possibly + * relate to the newly-started query. A pointer to this node is + * also saved in the query object's ->oa.samples_head. + * + * E.g. starting query A while there are two nodes in .sample_buffers: + * ________________A________ + * | + * + * [ 0 ][ 1 ] + * ^_______ Add a reference and store pointer to node in + * A->oa.samples_head + * + * Moving forward to when the B query starts with no new buffer nodes: + * (for reference, i915 perf reads() are only done when queries finish) + * ________________A_______ + * | ________B___ + * | | + * + * [ 0 ][ 2 ] + * ^_______ Add a reference and store pointer to + * node in B->oa.samples_head + * + * Once a query is finished, after an OA query has become 'Ready', + * once the End OA report has landed and after we have processed + * all the intermediate periodic samples then we drop the + * ->oa.samples_head reference we took at the start. 
+ * + * So when the B query has finished we have: + * ________________A________ + * | ______B___________ + * | | | + * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ] + * ^_______ Drop B->oa.samples_head reference + * + * We still can't free these due to the A->oa.samples_head ref: + * [ 1 ][ 0 ][ 0 ][ 0 ] + * + * When the A query finishes: (note there's a new ref for C's samples_head) + * ________________A_________________ + * | | + * | _____C_________ + * | | | + * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ] + * ^_______ Drop A->oa.samples_head reference + * + * And we can now reap these nodes up to the C->oa.samples_head: + * [ X ][ X ][ X ][ X ] + * keeping -> [ 1 ][ 0 ][ 0 ] + * + * We reap old sample buffers each time we finish processing an OA + * query by iterating the sample_buffers list from the head until we + * find a referenced node and stop. + * + * Reaped buffers move to a perfquery.free_sample_buffers list and + * when we come to read() we first look to recycle a buffer from the + * free_sample_buffers list before allocating a new buffer. + */ +struct oa_sample_buf { + struct exec_node link; + int refcount; + int len; + uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10]; + uint32_t last_timestamp; +}; + +/** + * gen representation of a performance query object. + * + * NB: We want to keep this structure relatively lean considering that + * applications may expect to allocate enough objects to be able to + * query around all draw calls in a frame. + */ +struct gen_perf_query_object { -#ifdef HAVE_DIRENT_D_TYPE - return entry->d_type == DT_DIR || entry->d_type == DT_LNK; -#else - struct stat st; - char path[PATH_MAX + 1]; - snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name); - lstat(path, &st); - return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode); -#endif + const struct gen_perf_query_info *queryinfo; + + /* See query->kind to know which state below is in use... */ + union { + struct { + + /** + * BO containing OA counter snapshots at query Begin/End time. + */ + void *bo; + + /** + * Address of the mapping of @bo + */ + void *map; + + /** + * The MI_REPORT_PERF_COUNT command lets us specify a unique + * ID that will be reflected in the resulting OA report + * that's written by the GPU. This is the ID we're expecting + * in the begin report and the end report should be + * @begin_report_id + 1. + */ + int begin_report_id; + + /** + * Reference the head of the brw->perfquery.sample_buffers + * list at the time that the query started (so we only need + * to look at nodes after this point when looking for samples + * related to this query) + * + * (See struct brw_oa_sample_buf description for more details) + */ + struct exec_node *samples_head; + + /** + * false while in the unaccumulated_elements list, and set to + * true when the final, end MI_RPC snapshot has been + * accumulated. + */ + bool results_accumulated; + + /** + * Frequency of the GT at begin and end of the query. + */ + uint64_t gt_frequency[2]; + + /** + * Accumulated OA results between begin and end of the query. + */ + struct gen_perf_query_result result; + } oa; + + struct { + /** + * BO containing starting and ending snapshots for the + * statistics counters. 
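A minimal sketch of the reaping walk described in the comment above: starting at the head, nodes are recycled until the first one still holding a reference (reap_sample_buffers is a hypothetical helper built on Mesa's exec_list API):

static void reap_sample_buffers(struct exec_list *sample_buffers,
                                struct exec_list *free_sample_buffers)
{
   foreach_list_typed_safe(struct oa_sample_buf, buf, link, sample_buffers) {
      if (buf->refcount > 0)
         break; /* this node and everything after it may still be needed */

      exec_node_remove(&buf->link);
      exec_list_push_tail(free_sample_buffers, &buf->link);
   }
}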
+ */ + void *bo; + } pipeline_stats; + }; +}; + +struct gen_perf_context { + struct gen_perf_config *perf; + + void * ctx; /* driver context (e.g. brw_context) */ + void * bufmgr; + const struct gen_device_info *devinfo; + + uint32_t hw_ctx; + int drm_fd; + + /* The i915 perf stream we open to setup + enable the OA counters */ + int oa_stream_fd; + + /* An i915 perf stream fd gives exclusive access to the OA unit that will + * report counter snapshots for a specific counter set/profile in a + * specific layout/format so we can only start OA queries that are + * compatible with the currently open fd... + */ + int current_oa_metrics_set_id; + int current_oa_format; + + /* List of buffers containing OA reports */ + struct exec_list sample_buffers; + + /* Cached list of empty sample buffers */ + struct exec_list free_sample_buffers; + + int n_active_oa_queries; + int n_active_pipeline_stats_queries; + + /* The number of queries depending on running OA counters which + * extends beyond brw_end_perf_query() since we need to wait until + * the last MI_RPC command has been parsed by the GPU. + * + * Accurate accounting is important here as emitting an + * MI_REPORT_PERF_COUNT command while the OA unit is disabled will + * effectively hang the gpu. + */ + int n_oa_users; + + /* To help catch a spurious problem with the hardware or perf + * forwarding samples, we emit each MI_REPORT_PERF_COUNT command + * with a unique ID that we can explicitly check for... + */ + int next_query_start_report_id; + + /** + * An array of queries whose results haven't yet been assembled + * based on the data in buffer objects. + * + * These may be active, or have already ended. However, the + * results have not been requested. + */ + struct gen_perf_query_object **unaccumulated; + int unaccumulated_elements; + int unaccumulated_array_size; + + /* The total number of query objects so we can relinquish + * our exclusive access to perf if the application deletes + * all of its objects. (NB: We only disable perf while + * there are no active queries) + */ + int n_query_instances; +}; + +const struct gen_perf_query_info* +gen_perf_query_info(const struct gen_perf_query_object *query) +{ + return query->queryinfo; +} + +struct gen_perf_context * +gen_perf_new_context(void *parent) +{ + struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); + if (! 
ctx) + fprintf(stderr, "%s: failed to alloc context\n", __func__); + return ctx; +} + +struct gen_perf_config * +gen_perf_config(struct gen_perf_context *ctx) +{ + return ctx->perf; +} + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) +{ + const struct gen_perf_query_info *query = + &perf_ctx->perf->queries[query_index]; + struct gen_perf_query_object *obj = + calloc(1, sizeof(struct gen_perf_query_object)); + + if (!obj) + return NULL; + + obj->queryinfo = query; + + perf_ctx->n_query_instances++; + return obj; +} + +int +gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query) +{ + assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); + + switch (query->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return perf_ctx->n_active_oa_queries; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + return perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } } static bool @@ -80,9 +399,6 @@ get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) perf->sysfs_dev_dir[0] = '\0'; - if (INTEL_DEBUG & DEBUG_NO_OACONFIG) - return true; - if (fstat(fd, &sb)) { DBG("Failed to stat DRM fd\n"); return false; @@ -111,7 +427,8 @@ get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) } while ((drm_entry = readdir(drmdir))) { - if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) && + if ((drm_entry->d_type == DT_DIR || + drm_entry->d_type == DT_LNK) && strncmp(drm_entry->d_name, "card", 4) == 0) { len = snprintf(perf->sysfs_dev_dir, @@ -172,26 +489,41 @@ read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf, return read_file_uint64(buf, value); } +static inline struct gen_perf_query_info * +append_query_info(struct gen_perf_config *perf, int max_counters) +{ + struct gen_perf_query_info *query; + + perf->queries = reralloc(perf, perf->queries, + struct gen_perf_query_info, + ++perf->n_queries); + query = &perf->queries[perf->n_queries - 1]; + memset(query, 0, sizeof(*query)); + + if (max_counters > 0) { + query->max_counters = max_counters; + query->counters = + rzalloc_array(perf, struct gen_perf_query_counter, max_counters); + } + + return query; +} + static void register_oa_config(struct gen_perf_config *perf, - const struct gen_device_info *devinfo, const struct gen_perf_query_info *query, uint64_t config_id) { - struct gen_perf_query_info *registered_query = - gen_perf_append_query_info(perf, 0); + struct gen_perf_query_info *registered_query = append_query_info(perf, 0); *registered_query = *query; - registered_query->oa_format = devinfo->ver >= 8 ? 
- I915_OA_FORMAT_A32u40_A4u32_B8_C8 : I915_OA_FORMAT_A45_B8_C8; registered_query->oa_metrics_set_id = config_id; DBG("metric set registered: id = %" PRIu64", guid = %s\n", registered_query->oa_metrics_set_id, query->guid); } static void -enumerate_sysfs_metrics(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) +enumerate_sysfs_metrics(struct gen_perf_config *perf) { DIR *metricsdir = NULL; struct dirent *metric_entry; @@ -212,7 +544,9 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, while ((metric_entry = readdir(metricsdir))) { struct hash_entry *entry; - if (!is_dir_or_link(metric_entry, buf) || + + if ((metric_entry->d_type != DT_DIR && + metric_entry->d_type != DT_LNK) || metric_entry->d_name[0] == '.') continue; @@ -221,13 +555,20 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, metric_entry->d_name); if (entry) { uint64_t id; - if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) { + + len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id", + perf->sysfs_dev_dir, metric_entry->d_name); + if (len < 0 || len >= sizeof(buf)) { + DBG("Failed to concatenate path to sysfs metric id file\n"); + continue; + } + + if (!read_file_uint64(buf, &id)) { DBG("Failed to read metric set id from %s: %m", buf); continue; } - register_oa_config(perf, devinfo, - (const struct gen_perf_query_info *)entry->data, id); + register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id); } else DBG("metric set not known by mesa (skipping)\n"); } @@ -235,133 +576,64 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, closedir(metricsdir); } -static void -add_all_metrics(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) -{ - hash_table_foreach(perf->oa_metrics_table, entry) { - const struct gen_perf_query_info *query = entry->data; - register_oa_config(perf, devinfo, query, 0); - } -} - static bool kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd) { uint64_t invalid_config_id = UINT64_MAX; - return intel_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, + return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &invalid_config_id) < 0 && errno == ENOENT; } -static int -i915_query_items(struct gen_perf_config *perf, int fd, - struct drm_i915_query_item *items, uint32_t n_items) -{ - struct drm_i915_query q = { - .num_items = n_items, - .items_ptr = to_user_pointer(items), - }; - return intel_ioctl(fd, DRM_IOCTL_I915_QUERY, &q); -} - -static bool -i915_query_perf_config_supported(struct gen_perf_config *perf, int fd) -{ - struct drm_i915_query_item item = { - .query_id = DRM_I915_QUERY_PERF_CONFIG, - .flags = DRM_I915_QUERY_PERF_CONFIG_LIST, - }; - - return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0; -} - static bool -i915_query_perf_config_data(struct gen_perf_config *perf, - int fd, const char *guid, - struct drm_i915_perf_oa_config *config) -{ - struct { - struct drm_i915_query_perf_config query; - struct drm_i915_perf_oa_config config; - } item_data; - struct drm_i915_query_item item = { - .query_id = DRM_I915_QUERY_PERF_CONFIG, - .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, - .data_ptr = to_user_pointer(&item_data), - .length = sizeof(item_data), - }; - - memset(&item_data, 0, sizeof(item_data)); - memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid)); - memcpy(&item_data.config, config, sizeof(item_data.config)); - - if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0)) - return false; - - memcpy(config, &item_data.config, sizeof(item_data.config)); - - return 
true; -} - -bool -gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, - const char *guid, - uint64_t *metric_id) +load_metric_id(struct gen_perf_config *perf, const char *guid, + uint64_t *metric_id) { char config_path[280]; snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id", - perf_cfg->sysfs_dev_dir, guid); + perf->sysfs_dev_dir, guid); /* Don't recreate already loaded configs. */ return read_file_uint64(config_path, metric_id); } -static uint64_t -i915_add_config(struct gen_perf_config *perf, int fd, - const struct gen_perf_registers *config, - const char *guid) -{ - struct drm_i915_perf_oa_config i915_config = { 0, }; - - memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); - - i915_config.n_mux_regs = config->n_mux_regs; - i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); - - i915_config.n_boolean_regs = config->n_b_counter_regs; - i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); - - i915_config.n_flex_regs = config->n_flex_regs; - i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); - - int ret = intel_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config); - return ret > 0 ? ret : 0; -} - static void -init_oa_configs(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) +init_oa_configs(struct gen_perf_config *perf, int fd) { hash_table_foreach(perf->oa_metrics_table, entry) { const struct gen_perf_query_info *query = entry->data; + struct drm_i915_perf_oa_config config; uint64_t config_id; + int ret; - if (gen_perf_load_metric_id(perf, query->guid, &config_id)) { + if (load_metric_id(perf, query->guid, &config_id)) { DBG("metric set: %s (already loaded)\n", query->guid); - register_oa_config(perf, devinfo, query, config_id); + register_oa_config(perf, query, config_id); continue; } - int ret = i915_add_config(perf, fd, &query->config, query->guid); + memset(&config, 0, sizeof(config)); + + memcpy(config.uuid, query->guid, sizeof(config.uuid)); + + config.n_mux_regs = query->n_mux_regs; + config.mux_regs_ptr = (uintptr_t) query->mux_regs; + + config.n_boolean_regs = query->n_b_counter_regs; + config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs; + + config.n_flex_regs = query->n_flex_regs; + config.flex_regs_ptr = (uintptr_t) query->flex_regs; + + ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config); if (ret < 0) { DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n", query->name, query->guid, strerror(errno)); continue; } - register_oa_config(perf, devinfo, query, ret); + register_oa_config(perf, query, ret); DBG("metric set: %s (added)\n", query->guid); } } @@ -375,16 +647,16 @@ compute_topology_builtins(struct gen_perf_config *perf, for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) { perf->sys_vars.n_eu_sub_slices += - util_bitcount(devinfo->subslice_masks[i]); + __builtin_popcount(devinfo->subslice_masks[i]); } for (int i = 0; i < sizeof(devinfo->eu_masks); i++) - perf->sys_vars.n_eus += util_bitcount(devinfo->eu_masks[i]); + perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]); perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu; - /* The subslice mask builtin contains bits for all slices. Prior to Gfx11 - * it had groups of 3bits for each slice, on Gfx11 it's 8bits for each + /* The subslice mask builtin contains bits for all slices. Prior to Gen11 + * it had groups of 3bits for each slice, on Gen11 it's 8bits for each * slice. 
* * Ideally equations would be updated to have a slice/subslice query @@ -392,7 +664,7 @@ compute_topology_builtins(struct gen_perf_config *perf, */ perf->sys_vars.subslice_mask = 0; - int bits_per_subslice = devinfo->ver == 11 ? 8 : 3; + int bits_per_subslice = devinfo->gen == 11 ? 8 : 3; for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) { for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) { @@ -407,23 +679,17 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev { uint64_t min_freq_mhz = 0, max_freq_mhz = 0; - if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) { - if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz)) - return false; + if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz)) + return false; - if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz)) - return false; - } else { - min_freq_mhz = 300; - max_freq_mhz = 1000; - } + if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz)) + return false; memset(&perf->sys_vars, 0, sizeof(perf->sys_vars)); perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000; perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000; perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency; perf->sys_vars.revision = devinfo->revision; - perf->sys_vars.query_mode = true; compute_topology_builtins(perf, devinfo); return true; @@ -464,270 +730,143 @@ get_register_queries_function(const struct gen_device_info *devinfo) if (devinfo->gt == 3) return gen_oa_register_queries_cflgt3; } - if (devinfo->ver == 11) { - if (devinfo->is_elkhartlake) - return gen_oa_register_queries_ehl; + if (devinfo->is_cannonlake) + return gen_oa_register_queries_cnl; + if (devinfo->gen == 11) return gen_oa_register_queries_icl; - } - if (devinfo->is_tigerlake) { - if (devinfo->gt == 1) - return gen_oa_register_queries_tglgt1; - if (devinfo->gt == 2) - return gen_oa_register_queries_tglgt2; - } - if (devinfo->is_rocketlake) - return gen_oa_register_queries_rkl; - if (devinfo->is_dg1) - return gen_oa_register_queries_dg1; - if (devinfo->is_alderlake) - return gen_oa_register_queries_adl; return NULL; } -static int -gen_perf_compare_counter_names(const void *v1, const void *v2) +static inline void +add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, + uint32_t numerator, uint32_t denominator, + const char *name, const char *description) { - const struct gen_perf_query_counter *c1 = v1; - const struct gen_perf_query_counter *c2 = v2; + struct gen_perf_query_counter *counter; + + assert(query->n_counters < query->max_counters); + + counter = &query->counters[query->n_counters]; + counter->name = name; + counter->desc = description; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; + counter->offset = sizeof(uint64_t) * query->n_counters; + counter->pipeline_stat.reg = reg; + counter->pipeline_stat.numerator = numerator; + counter->pipeline_stat.denominator = denominator; - return strcmp(c1->name, c2->name); + query->n_counters++; } -static void -sort_query(struct gen_perf_query_info *q) +static inline void +add_basic_stat_reg(struct gen_perf_query_info *query, + uint32_t reg, const char *name) { - qsort(q->counters, q->n_counters, sizeof(q->counters[0]), - gen_perf_compare_counter_names); + add_stat_reg(query, reg, 1, 1, name, name); } static void load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) + const struct 
gen_device_info *devinfo) { struct gen_perf_query_info *query = - gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); + append_query_info(perf_cfg, MAX_STAT_COUNTERS); query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; query->name = "Pipeline Statistics Registers"; - gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - - if (devinfo->ver == 6) { - gen_perf_query_add_stat_reg(query, GFX6_SO_PRIM_STORAGE_NEEDED, 1, 1, - "SO_PRIM_STORAGE_NEEDED", - "N geometry shader stream-out primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX6_SO_NUM_PRIMS_WRITTEN, 1, 1, - "SO_NUM_PRIMS_WRITTEN", - "N geometry shader stream-out primitives (written)"); + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + + if (devinfo->gen == 6) { + add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, + "SO_PRIM_STORAGE_NEEDED", + "N geometry shader stream-out primitives (total)"); + add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, + "SO_NUM_PRIMS_WRITTEN", + "N geometry shader stream-out primitives (written)"); } else { - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 0)", - "N stream-out (stream 0) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 1)", - "N stream-out (stream 1) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 2)", - "N stream-out (stream 2) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 3)", - "N stream-out (stream 3) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 0)", - "N stream-out (stream 0) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 1)", - "N stream-out (stream 1) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 2)", - "N stream-out (stream 2) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 3)", - "N stream-out (stream 3) primitives (written)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 0)", + "N stream-out (stream 0) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 1)", + "N stream-out (stream 1) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 2)", + "N stream-out (stream 2) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 3)", + "N stream-out (stream 3) primitives (total)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 0)", + "N stream-out (stream 0) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 
1)", + "N stream-out (stream 1) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 2)", + "N stream-out (stream 2) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 3)", + "N stream-out (stream 3) primitives (written)"); } - gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - - gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - - gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - - if (devinfo->is_haswell || devinfo->ver == 8) { - gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); } else { - gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); } - gen_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT, - "N z-pass fragments"); + add_basic_stat_reg(query, PS_DEPTH_COUNT, + "N z-pass fragments"); - if (devinfo->ver >= 7) { - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); } query->data_size = sizeof(uint64_t) * query->n_counters; - - sort_query(query); -} - -static int -i915_perf_version(int drm_fd) -{ - int tmp; - drm_i915_getparam_t gp = { - .param = I915_PARAM_PERF_REVISION, - .value = &tmp, - }; - - int ret = intel_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp); - - /* Return 0 if this getparam is not supported, the first version supported - * is 1. - */ - return ret < 0 ? 
0 : tmp; -} - -static void -i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu) -{ - struct drm_i915_gem_context_param arg = { - .param = I915_CONTEXT_PARAM_SSEU, - .size = sizeof(*sseu), - .value = to_user_pointer(sseu) - }; - - intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg); -} - -static inline int -compare_str_or_null(const char *s1, const char *s2) -{ - if (s1 == NULL && s2 == NULL) - return 0; - if (s1 == NULL) - return -1; - if (s2 == NULL) - return 1; - - return strcmp(s1, s2); -} - -static int -compare_counter_categories_and_names(const void *_c1, const void *_c2) -{ - const struct gen_perf_query_counter_info *c1 = (const struct gen_perf_query_counter_info *)_c1; - const struct gen_perf_query_counter_info *c2 = (const struct gen_perf_query_counter_info *)_c2; - - /* pipeline counters don't have an assigned category */ - int r = compare_str_or_null(c1->counter->category, c2->counter->category); - if (r) - return r; - - return strcmp(c1->counter->name, c2->counter->name); -} - -static void -build_unique_counter_list(struct gen_perf_config *perf) -{ - assert(perf->n_queries < 64); - - size_t max_counters = 0; - - for (int q = 0; q < perf->n_queries; q++) - max_counters += perf->queries[q].n_counters; - - /* - * Allocate big enough array to hold maximum possible number of counters. - * We can't alloc it small and realloc when needed because the hash table - * below contains pointers to this array. - */ - struct gen_perf_query_counter_info *counter_infos = - ralloc_array_size(perf, sizeof(counter_infos[0]), max_counters); - - perf->n_counters = 0; - - struct hash_table *counters_table = - _mesa_hash_table_create(perf, - _mesa_hash_string, - _mesa_key_string_equal); - struct hash_entry *entry; - for (int q = 0; q < perf->n_queries ; q++) { - struct gen_perf_query_info *query = &perf->queries[q]; - - for (int c = 0; c < query->n_counters; c++) { - struct gen_perf_query_counter *counter; - struct gen_perf_query_counter_info *counter_info; - - counter = &query->counters[c]; - entry = _mesa_hash_table_search(counters_table, counter->symbol_name); - - if (entry) { - counter_info = entry->data; - counter_info->query_mask |= BITFIELD64_BIT(q); - continue; - } - assert(perf->n_counters < max_counters); - - counter_info = &counter_infos[perf->n_counters++]; - counter_info->counter = counter; - counter_info->query_mask = BITFIELD64_BIT(q); - - counter_info->location.group_idx = q; - counter_info->location.counter_idx = c; - - _mesa_hash_table_insert(counters_table, counter->symbol_name, counter_info); - } - } - - _mesa_hash_table_destroy(counters_table, NULL); - - /* Now we can realloc counter_infos array because hash table doesn't exist. */ - perf->counter_infos = reralloc_array_size(perf, counter_infos, - sizeof(counter_infos[0]), perf->n_counters); - - qsort(perf->counter_infos, perf->n_counters, sizeof(perf->counter_infos[0]), - compare_counter_categories_and_names); } static bool -oa_metrics_available(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) +load_oa_metrics(struct gen_perf_config *perf, int fd, + const struct gen_device_info *devinfo) { perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); bool i915_perf_oa_available = false; struct stat sb; - perf->i915_query_supported = i915_query_perf_config_supported(perf, fd); - perf->i915_perf_version = i915_perf_version(fd); - - /* Record the default SSEU configuration. 
*/ - i915_get_sseu(fd, &perf->sseu); - /* The existence of this sysctl parameter implies the kernel supports * the i915 perf interface. */ if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) { - /* If _paranoid == 1 then on Gfx8+ we won't be able to access OA + /* If _paranoid == 1 then on Gen8+ we won't be able to access OA * metrics unless running as root. */ if (devinfo->is_haswell) @@ -740,26 +879,16 @@ oa_metrics_available(struct gen_perf_config *perf, int fd, if (paranoid == 0 || geteuid() == 0) i915_perf_oa_available = true; } - - perf->platform_supported = oa_register != NULL; } - return i915_perf_oa_available && - oa_register && - get_sysfs_dev_dir(perf, fd) && - init_oa_sys_vars(perf, devinfo); -} - -static void -load_oa_metrics(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) -{ - int existing_queries = perf->n_queries; - - perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); + if (!i915_perf_oa_available || + !oa_register || + !get_sysfs_dev_dir(perf, fd) || + !init_oa_sys_vars(perf, devinfo)) + return false; perf->oa_metrics_table = - _mesa_hash_table_create(perf, _mesa_hash_string, + _mesa_hash_table_create(perf, _mesa_key_hash_string, _mesa_key_string_equal); /* Index all the metric sets mesa knows about before looking to see what @@ -767,188 +896,13 @@ load_oa_metrics(struct gen_perf_config *perf, int fd, */ oa_register(perf); - if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) { - if (kernel_has_dynamic_config_support(perf, fd)) - init_oa_configs(perf, fd, devinfo); - else - enumerate_sysfs_metrics(perf, devinfo); - } else { - add_all_metrics(perf, devinfo); - } - - /* sort counters in each individual group created by this function by name */ - for (int i = existing_queries; i < perf->n_queries; ++i) - sort_query(&perf->queries[i]); - - /* Select a fallback OA metric. Look for the TestOa metric or use the last - * one if no present (on HSW). - */ - for (int i = existing_queries; i < perf->n_queries; i++) { - if (perf->queries[i].symbol_name && - strcmp(perf->queries[i].symbol_name, "TestOa") == 0) { - perf->fallback_raw_oa_metric = perf->queries[i].oa_metrics_set_id; - break; - } - } - if (perf->fallback_raw_oa_metric == 0 && perf->n_queries > 0) - perf->fallback_raw_oa_metric = perf->queries[perf->n_queries - 1].oa_metrics_set_id; -} - -struct gen_perf_registers * -gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid) -{ - if (!perf_cfg->i915_query_supported) - return NULL; - - struct drm_i915_perf_oa_config i915_config = { 0, }; - if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) - return NULL; - - struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers); - config->n_flex_regs = i915_config.n_flex_regs; - config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs); - config->n_mux_regs = i915_config.n_mux_regs; - config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs); - config->n_b_counter_regs = i915_config.n_boolean_regs; - config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs); - - /* - * struct gen_perf_query_register_prog maps exactly to the tuple of - * (register offset, register value) returned by the i915. 
- */ - i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); - i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); - i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); - if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { - ralloc_free(config); - return NULL; - } - - return config; -} - -uint64_t -gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, - const struct gen_perf_registers *config, - const char *guid) -{ - if (guid) - return i915_add_config(perf_cfg, fd, config, guid); - - struct mesa_sha1 sha1_ctx; - _mesa_sha1_init(&sha1_ctx); - - if (config->flex_regs) { - _mesa_sha1_update(&sha1_ctx, config->flex_regs, - sizeof(config->flex_regs[0]) * - config->n_flex_regs); - } - if (config->mux_regs) { - _mesa_sha1_update(&sha1_ctx, config->mux_regs, - sizeof(config->mux_regs[0]) * - config->n_mux_regs); - } - if (config->b_counter_regs) { - _mesa_sha1_update(&sha1_ctx, config->b_counter_regs, - sizeof(config->b_counter_regs[0]) * - config->n_b_counter_regs); - } - - uint8_t hash[20]; - _mesa_sha1_final(&sha1_ctx, hash); - - char formatted_hash[41]; - _mesa_sha1_format(formatted_hash, hash); - - char generated_guid[37]; - snprintf(generated_guid, sizeof(generated_guid), - "%.8s-%.4s-%.4s-%.4s-%.12s", - &formatted_hash[0], &formatted_hash[8], - &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4], - &formatted_hash[8 + 4 + 4 + 4]); - - /* Check if already present. */ - uint64_t id; - if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id)) - return id; - - return i915_add_config(perf_cfg, fd, config, generated_guid); -} - -static uint64_t -get_passes_mask(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count) -{ - uint64_t queries_mask = 0; - - assert(perf->n_queries < 64); - - /* Compute the number of passes by going through all counters N times (with - * N the number of queries) to make sure we select the most constraining - * counters first and look at the more flexible ones (that could be - * obtained from multiple queries) later. That way we minimize the number - * of passes required. 
- */ - for (uint32_t q = 0; q < perf->n_queries; q++) { - for (uint32_t i = 0; i < counter_indices_count; i++) { - assert(counter_indices[i] < perf->n_counters); - - uint32_t idx = counter_indices[i]; - if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1)) - continue; - - if (queries_mask & perf->counter_infos[idx].query_mask) - continue; - - queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1); - } - } - - return queries_mask; -} - -uint32_t -gen_perf_get_n_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_query_info **pass_queries) -{ - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); - - if (pass_queries) { - uint32_t pass = 0; - for (uint32_t q = 0; q < perf->n_queries; q++) { - if ((1ULL << q) & queries_mask) - pass_queries[pass++] = &perf->queries[q]; - } - } - - return util_bitcount64(queries_mask); -} - -void -gen_perf_get_counters_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_counter_pass *counter_pass) -{ - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); - ASSERTED uint32_t n_passes = util_bitcount64(queries_mask); - - for (uint32_t i = 0; i < counter_indices_count; i++) { - assert(counter_indices[i] < perf->n_counters); - - uint32_t idx = counter_indices[i]; - counter_pass[i].counter = perf->counter_infos[idx].counter; - - uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1; - counter_pass[i].query = &perf->queries[query_idx]; + if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) && + kernel_has_dynamic_config_support(perf, fd)) + init_oa_configs(perf, fd); + else + enumerate_sysfs_metrics(perf); - uint32_t clear_bits = 63 - query_idx; - counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1; - assert(counter_pass[i].pass < n_passes); - } + return true; } /* Accumulate 32bits OA counters */ @@ -984,7 +938,7 @@ accumulate_uint40(int a_index, } static void -gfx8_read_report_clock_ratios(const uint32_t *report, +gen8_read_report_clock_ratios(const uint32_t *report, uint64_t *slice_freq_hz, uint64_t *unslice_freq_hz) { @@ -1012,94 +966,67 @@ gfx8_read_report_clock_ratios(const uint32_t *report, *unslice_freq_hz = unslice_freq * 16666667ULL; } -void -gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +static void +query_result_read_frequencies(struct gen_perf_query_result *result, + const struct gen_device_info *devinfo, + const uint32_t *start, + const uint32_t *end) { /* Slice/Unslice frequency is only available in the OA reports when the * "Disable OA reports due to clock ratio change" field in * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this * global register (see drivers/gpu/drm/i915/i915_perf.c) * - * Documentation says this should be available on Gfx9+ but experimentation - * shows that Gfx8 reports similar values, so we enable it there too. + * Documentation says this should be available on Gen9+ but experimentation + * shows that Gen8 reports similar values, so we enable it there too. 
*/ - if (devinfo->ver < 8) + if (devinfo->gen < 8) return; - gfx8_read_report_clock_ratios(start, + gen8_read_report_clock_ratios(start, &result->slice_frequency[0], &result->unslice_frequency[0]); - gfx8_read_report_clock_ratios(end, + gen8_read_report_clock_ratios(end, &result->slice_frequency[1], &result->unslice_frequency[1]); } -static inline bool -can_use_mi_rpc_bc_counters(const struct gen_device_info *devinfo) -{ - return devinfo->ver <= 11; -} - -void -gen_perf_query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +static void +query_result_accumulate(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint32_t *start, + const uint32_t *end) { - int i; + int i, idx = 0; if (result->hw_id == OA_REPORT_INVALID_CTX_ID && start[2] != OA_REPORT_INVALID_CTX_ID) result->hw_id = start[2]; - if (result->reports_accumulated == 0) - result->begin_timestamp = start[1]; result->reports_accumulated++; switch (query->oa_format) { case I915_OA_FORMAT_A32u40_A4u32_B8_C8: - accumulate_uint32(start + 1, end + 1, - result->accumulator + query->gpu_time_offset); /* timestamp */ - accumulate_uint32(start + 3, end + 3, - result->accumulator + query->gpu_clock_offset); /* clock */ + accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */ + accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */ /* 32x 40bit A counters... */ - for (i = 0; i < 32; i++) { - accumulate_uint40(i, start, end, - result->accumulator + query->a_offset + i); - } + for (i = 0; i < 32; i++) + accumulate_uint40(i, start, end, result->accumulator + idx++); /* 4x 32bit A counters... */ - for (i = 0; i < 4; i++) { - accumulate_uint32(start + 36 + i, end + 36 + i, - result->accumulator + query->a_offset + 32 + i); - } + for (i = 0; i < 4; i++) + accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++); - if (can_use_mi_rpc_bc_counters(devinfo)) { - /* 8x 32bit B counters */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 48 + i, end + 48 + i, - result->accumulator + query->b_offset + i); - } - - /* 8x 32bit C counters... */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 56 + i, end + 56 + i, - result->accumulator + query->c_offset + i); - } - } + /* 8x 32bit B counters + 8x 32bit C counters... 
*/ + for (i = 0; i < 16; i++) + accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++); break; case I915_OA_FORMAT_A45_B8_C8: accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */ - for (i = 0; i < 61; i++) { - accumulate_uint32(start + 3 + i, end + 3 + i, - result->accumulator + query->a_offset + i); - } + for (i = 0; i < 61; i++) + accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i); break; default: @@ -1108,287 +1035,1458 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result, } -#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) +static void +query_result_clear(struct gen_perf_query_result *result) +{ + memset(result, 0, sizeof(*result)); + result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ +} -void -gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t start, - const uint32_t end) +static void +register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) { - switch (devinfo->ver) { - case 7: - case 8: - result->gt_frequency[0] = GET_FIELD(start, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; - result->gt_frequency[1] = GET_FIELD(end, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Intel_Raw_Pipeline_Statistics_Query"; + + /* The order has to match mdapi_pipeline_metrics. */ + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + if (devinfo->gen >= 10) { + /* Reuse existing CS invocation register until we can expose this new + * one. 
+ */ + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "Reserved1"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + +static void +fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, + const char *name, + uint32_t data_offset, + uint32_t data_size, + enum gen_perf_counter_data_type data_type) +{ + struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; + + assert(query->n_counters <= query->max_counters); + + counter->name = name; + counter->desc = "Raw counter value"; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = data_type; + counter->offset = data_offset; + + query->n_counters++; + + assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); +} + +#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ + fill_mdapi_perf_query_counter(query, #field_name, \ + (uint8_t *) &struct_name.field_name - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) +#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ + fill_mdapi_perf_query_counter(query, \ + ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ + (uint8_t *) &struct_name.field_name[idx] - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name[0]), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) + +static void +register_mdapi_oa_query(const struct gen_device_info *devinfo, + struct gen_perf_config *perf) +{ + struct gen_perf_query_info *query = NULL; + + /* MDAPI requires different structures for pretty much every generation + * (right now we have definitions for gen 7 to 11). + */ + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + switch (devinfo->gen) { + case 7: { + query = append_query_info(perf, 1 + 45 + 16 + 7); + query->oa_format = I915_OA_FORMAT_A45_B8_C8; + + struct gen7_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, ACounters, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NOACounters, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + break; + } + case 8: { + query = append_query_info(perf, 2 + 36 + 16 + 16); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen8_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, 
metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); break; + } case 9: - case 11: - case 12: - result->gt_frequency[0] = GET_FIELD(start, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; - result->gt_frequency[1] = GET_FIELD(end, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + case 10: + case 11: { + query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen9_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, UserCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); break; + } default: - unreachable("unexpected gen"); + unreachable("Unsupported gen"); + break; } - /* Put the numbers into Hz. 
*/ - result->gt_frequency[0] *= 1000000ULL; - result->gt_frequency[1] *= 1000000ULL; + query->kind = GEN_PERF_QUERY_TYPE_RAW; + query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; + query->guid = GEN_PERF_QUERY_GUID_MDAPI; + + { + /* Accumulation buffer offsets copied from an actual query... */ + const struct gen_perf_query_info *copy_query = + &perf->queries[0]; + + query->gpu_time_offset = copy_query->gpu_time_offset; + query->gpu_clock_offset = copy_query->gpu_clock_offset; + query->a_offset = copy_query->a_offset; + query->b_offset = copy_query->b_offset; + query->c_offset = copy_query->c_offset; + } } -void -gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint64_t *start, - const uint64_t *end) +static uint64_t +get_metric_id(struct gen_perf_config *perf, + const struct gen_perf_query_info *query) { - for (uint32_t i = 0; i < 2; i++) { - uint64_t v0 = start[i] & PERF_CNT_VALUE_MASK; - uint64_t v1 = end[i] & PERF_CNT_VALUE_MASK; + /* These queries are known never to change; their config ID was + * loaded upon the first query creation. No need to look them up again. + */ + if (query->kind == GEN_PERF_QUERY_TYPE_OA) + return query->oa_metrics_set_id; + + assert(query->kind == GEN_PERF_QUERY_TYPE_RAW); - result->accumulator[query->perfcnt_offset + i] = v0 > v1 ? - (PERF_CNT_VALUE_MASK + 1 + v1 - v0) : - (v1 - v0); + /* Raw queries can be reprogrammed by an external application/library. + * When a raw query is used for the first time its ID is set to a value != + * 0. When it stops being used the ID returns to 0. No need to reload the + * ID when it's already loaded. + */ + if (query->oa_metrics_set_id != 0) { + DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + return query->oa_metrics_set_id; } + + struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query; + if (!load_metric_id(perf, query->guid, + &raw_query->oa_metrics_set_id)) { + DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid); + raw_query->oa_metrics_set_id = 1ULL; + } else { + DBG("Raw query '%s' guid=%s loaded ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + } + return query->oa_metrics_set_id; } -static uint32_t -query_accumulator_offset(const struct gen_perf_query_info *query, - enum gen_perf_query_field_type type, - uint8_t index) +static struct oa_sample_buf * +get_free_sample_buf(struct gen_perf_context *perf_ctx) { - switch (type) { - case GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: - return query->perfcnt_offset + index; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: - return query->b_offset + index; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: - return query->c_offset + index; - default: - unreachable("Invalid register type"); - return 0; + struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); + struct oa_sample_buf *buf; + + if (node) + buf = exec_node_data(struct oa_sample_buf, node, link); + else { + buf = ralloc_size(perf_ctx->perf, sizeof(*buf)); + + exec_node_init(&buf->link); + buf->refcount = 0; + } + buf->len = 0; + + return buf; +} + +static void +reap_old_sample_buffers(struct gen_perf_context *perf_ctx) +{ + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + + /* Remove all old, unreferenced sample buffers walking forward from + * the head of the
list, except always leave at least one node in + * the list so we always have a node to reference when we Begin + * a new query. + */ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->sample_buffers) + { + if (buf->refcount == 0 && buf != tail_buf) { + exec_node_remove(&buf->link); + exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link); + } else + return; + } +} + +static void +free_sample_bufs(struct gen_perf_context *perf_ctx) +{ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->free_sample_buffers) + ralloc_free(buf); + + exec_list_make_empty(&perf_ctx->free_sample_buffers); +} + +/******************************************************************************/ + +/** + * Emit MI_STORE_REGISTER_MEM commands to capture all of the + * pipeline statistics for the performance query object. + */ +static void +snapshot_statistics_registers(void *context, + struct gen_perf_config *perf, + struct gen_perf_query_object *obj, + uint32_t offset_in_bytes) +{ + const struct gen_perf_query_info *query = obj->queryinfo; + const int n_counters = query->n_counters; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &query->counters[i]; + + assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); + + perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo, + counter->pipeline_stat.reg, + offset_in_bytes + i * sizeof(uint64_t)); + } +} + +static void +gen_perf_close(struct gen_perf_context *perfquery, + const struct gen_perf_query_info *query) +{ + if (perfquery->oa_stream_fd != -1) { + close(perfquery->oa_stream_fd); + perfquery->oa_stream_fd = -1; + } + if (query->kind == GEN_PERF_QUERY_TYPE_RAW) { + struct gen_perf_query_info *raw_query = + (struct gen_perf_query_info *) query; + raw_query->oa_metrics_set_id = 0; + } +} + +static bool +gen_perf_open(struct gen_perf_context *perf_ctx, + int metrics_set_id, + int report_format, + int period_exponent, + int drm_fd, + uint32_t ctx_id) +{ + uint64_t properties[] = { + /* Single context sampling */ + DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id, + + /* Include OA reports in samples */ + DRM_I915_PERF_PROP_SAMPLE_OA, true, + + /* OA unit configuration */ + DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id, + DRM_I915_PERF_PROP_OA_FORMAT, report_format, + DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, + }; + struct drm_i915_perf_open_param param = { + .flags = I915_PERF_FLAG_FD_CLOEXEC | + I915_PERF_FLAG_FD_NONBLOCK | + I915_PERF_FLAG_DISABLED, + .num_properties = ARRAY_SIZE(properties) / 2, + .properties_ptr = (uintptr_t) properties, + }; + int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + if (fd == -1) { + DBG("Error opening gen perf OA stream: %m\n"); + return false; + } + + perf_ctx->oa_stream_fd = fd; + + perf_ctx->current_oa_metrics_set_id = metrics_set_id; + perf_ctx->current_oa_format = report_format; + + return true; +} + +static bool +inc_n_users(struct gen_perf_context *perf_ctx) +{ + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) + { + return false; + } + ++perf_ctx->n_oa_users; + + return true; +} + +static void +dec_n_users(struct gen_perf_context *perf_ctx) +{ + /* Disabling the i915 perf stream will effectively disable the OA + * counters. Note it's important to be sure there are no outstanding + * MI_RPC commands at this point since they could stall the CS + * indefinitely once OACONTROL is disabled. 
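+    * The n_oa_users count ensures the stream is only disabled once the last outstanding query has dropped its reference.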
+ */ + --perf_ctx->n_oa_users; + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0) + { + DBG("WARNING: Error disabling gen perf stream: %m\n"); } } void -gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *start, - const void *end, - bool no_oa_accumulate) +gen_perf_init_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo, + int drm_fd) +{ + load_pipeline_statistic_metrics(perf_cfg, devinfo); + register_mdapi_statistic_query(perf_cfg, devinfo); + if (load_oa_metrics(perf_cfg, drm_fd, devinfo)) + register_mdapi_oa_query(devinfo, perf_cfg); +} + +void +gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd) +{ + perf_ctx->perf = perf_cfg; + perf_ctx->ctx = ctx; + perf_ctx->bufmgr = bufmgr; + perf_ctx->drm_fd = drm_fd; + perf_ctx->hw_ctx = hw_ctx; + perf_ctx->devinfo = devinfo; + + perf_ctx->unaccumulated = + ralloc_array(ctx, struct gen_perf_query_object *, 2); + perf_ctx->unaccumulated_elements = 0; + perf_ctx->unaccumulated_array_size = 2; + + exec_list_make_empty(&perf_ctx->sample_buffers); + exec_list_make_empty(&perf_ctx->free_sample_buffers); + + /* It's convenient to guarantee that this linked list of sample + * buffers is never empty so we add an empty head so when we + * Begin an OA query we can always take a reference on a buffer + * in this list. + */ + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); + + perf_ctx->oa_stream_fd = -1; + perf_ctx->next_query_start_report_id = 1000; +} + +/** + * Add a query to the global list of "unaccumulated queries." + * + * Queries are tracked here until all the associated OA reports have + * been accumulated via accumulate_oa_reports() after the end + * MI_REPORT_PERF_COUNT has landed in query->oa.bo. + */ +static void +add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) +{ + if (perf_ctx->unaccumulated_elements >= + perf_ctx->unaccumulated_array_size) + { + perf_ctx->unaccumulated_array_size *= 1.5; + perf_ctx->unaccumulated = + reralloc(perf_ctx->ctx, perf_ctx->unaccumulated, + struct gen_perf_query_object *, + perf_ctx->unaccumulated_array_size); + } + + perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj; +} + +bool +gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - struct gen_perf_query_field_layout *layout = &query->perf->query_layout; - - for (uint32_t r = 0; r < layout->n_fields; r++) { - struct gen_perf_query_field *field = &layout->fields[r]; - - if (field->type == GEN_PERF_QUERY_FIELD_TYPE_MI_RPC) { - gen_perf_query_result_read_frequencies(result, devinfo, - start + field->location, - end + field->location); - /* no_oa_accumulate=true is used when doing GL perf queries, we - * manually parse the OA reports from the OA buffer and substract - * unrelated deltas, so don't accumulate the begin/end reports here. 
+ struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + + /* XXX: We have to consider that the command parser unit that parses batch + * buffer commands and is used to capture begin/end counter snapshots isn't + * implicitly synchronized with what's currently running across other GPU + * units (such as the EUs running shaders) that the performance counters are + * associated with. + * + * The intention of performance queries is to measure the work associated + * with commands between the begin/end delimiters and so for that to be the + * case we need to explicitly synchronize the parsing of commands to capture + * Begin/End counter snapshots with what's running across other parts of the + * GPU. + * + * When the command parser reaches a Begin marker it effectively needs to + * drain everything currently running on the GPU until the hardware is idle + * before capturing the first snapshot of counters - otherwise the results + * would also be measuring the effects of earlier commands. + * + * When the command parser reaches an End marker it needs to stall until + * everything currently running on the GPU has finished before capturing the + * end snapshot - otherwise the results won't be a complete representation + * of the work. + * + * Theoretically there could be opportunities to minimize how much of the + * GPU pipeline is drained, or how long we stall, when we know what specific + * units the performance counters being queried relate to but we don't + * currently attempt to be clever here. + * + * Note: with our current simple approach, for back-to-back queries + * we will redundantly emit duplicate commands to synchronize the command + * streamer with the rest of the GPU pipeline, but we assume that in HW the + * second synchronization is effectively a NOOP. + * + * N.B. The final results are based on deltas of counters between (inside) + * Begin/End markers so even though the total wall clock time of the + * workload is stretched by larger pipeline bubbles the bubbles themselves + * are generally invisible to the query results. Whether that's a good or a + * bad thing depends on the use case. For a lower real-time impact while + * capturing metrics, periodic sampling may be a better choice than + * INTEL_performance_query. + * + * This is our Begin synchronization point to drain current work on the + * GPU before we capture our first counter snapshot... + */ + perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + + switch (queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: { + + /* Opening an i915 perf stream implies exclusive access to the OA unit + * which will generate counter reports for a specific counter set with a + * specific layout/format so we can't begin any OA-based queries that + * require a different counter set or format unless we get an opportunity + * to close the stream and open a new one... + */ + uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); + + if (perf_ctx->oa_stream_fd != -1 && + perf_ctx->current_oa_metrics_set_id != metric_id) { + + if (perf_ctx->n_oa_users != 0) { + DBG("WARNING: Begin failed, already using perf config=%i/%"PRIu64"\n", + perf_ctx->current_oa_metrics_set_id, metric_id); + return false; + } else + gen_perf_close(perf_ctx, queryinfo); + } + + /* If the OA counters aren't already on, enable them.
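+       * Opening the i915 perf stream below selects the metric set, report format and sampling period in a single step.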
*/ + if (perf_ctx->oa_stream_fd == -1) { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + /* The period_exponent gives a sampling period as follows: + * sample_period = timestamp_period * 2^(period_exponent + 1) + * + * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or + * ~83ns (GEN8/9). + * + * The counter overflow period is derived from the EuActive counter + * which reads a counter that increments by the number of clock + * cycles multiplied by the number of EUs. It can be calculated as: + * + * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2) + * + * (E.g. 40 EUs @ 1GHz = ~53ms) + * + * We select a sampling period shorter than that overflow period to + * ensure we cannot see more than 1 counter overflow, otherwise we + * could lose information. */ - if (!no_oa_accumulate) { - gen_perf_query_result_accumulate(result, query, devinfo, - start + field->location, - end + field->location); - } - } else { - uint64_t v0, v1; - - if (field->size == 4) { - v0 = *(const uint32_t *)(start + field->location); - v1 = *(const uint32_t *)(end + field->location); - } else { - assert(field->size == 8); - v0 = *(const uint64_t *)(start + field->location); - v1 = *(const uint64_t *)(end + field->location); + + int a_counter_in_bits = 32; + if (devinfo->gen >= 8) + a_counter_in_bits = 40; + + uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus * + /* drop 1GHz freq to have units in nanoseconds */ + 2); + + DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n", + overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus); + + int period_exponent = 0; + uint64_t prev_sample_period, next_sample_period; + for (int e = 0; e < 30; e++) { + prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency; + next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency; + + /* Take the previous sampling period, lower than the overflow + * period. + */ + if (prev_sample_period < overflow_period && + next_sample_period > overflow_period) + period_exponent = e + 1; } - if (field->mask) { - v0 = field->mask & v0; - v1 = field->mask & v1; + if (period_exponent == 0) { + DBG("WARNING: unable to find a sampling exponent\n"); + return false; } - /* RPSTAT is a bit of a special case because its begin/end values - * represent frequencies. We store it in a separate location. - */ - if (field->type == GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT) - gen_perf_query_result_read_gt_frequency(result, devinfo, v0, v1); - else - result->accumulator[query_accumulator_offset(query, field->type, field->index)] = v1 - v0; + DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent, + prev_sample_period / 1000000ul); + + if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format, + period_exponent, perf_ctx->drm_fd, + perf_ctx->hw_ctx)) + return false; + } else { + assert(perf_ctx->current_oa_metrics_set_id == metric_id && + perf_ctx->current_oa_format == queryinfo->oa_format); + } + + if (!inc_n_users(perf_ctx)) { + DBG("WARNING: Error enabling i915 perf stream: %m\n"); + return false; + } + + if (query->oa.bo) { + perf_cfg->vtbl.bo_unreference(query->oa.bo); + query->oa.bo = NULL; + } + + query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query OA MI_RPC bo", + MI_RPC_BO_SIZE); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed.
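+       * (the 0x80 fill pattern makes untouched report memory easy to spot)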
*/ + void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE); + memset(map, 0x80, MI_RPC_BO_SIZE); + perf_cfg->vtbl.bo_unmap(query->oa.bo); +#endif + + query->oa.begin_report_id = perf_ctx->next_query_start_report_id; + perf_ctx->next_query_start_report_id += 2; + + /* We flush the batchbuffer here to minimize the chances that MI_RPC + * delimiting commands end up in different batchbuffers. If that's the + * case, the measurement will include the time it takes for the kernel + * scheduler to load a new request into the hardware. This is manifested in + * tools like frameretrace by spikes in the "GPU Core Clocks" counter. + */ + perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); + + /* Take a starting OA counter snapshot. */ + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0, + query->oa.begin_report_id); + perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, + MI_FREQ_START_OFFSET_BYTES); + + ++perf_ctx->n_active_oa_queries; + + /* No already-buffered samples can possibly be associated with this query + * so create a marker within the list of sample buffers enabling us to + * easily ignore earlier samples when processing this query after + * completion. + */ + assert(!exec_list_is_empty(&perf_ctx->sample_buffers)); + query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers); + + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); + + /* This reference will ensure that future/following sample + * buffers (that may relate to this query) can't be freed until + * this drops to zero. + */ + buf->refcount++; + + query_result_clear(&query->oa.result); + query->oa.results_accumulated = false; + + add_to_unaccumulated_query_list(perf_ctx, query); + break; + } + + case GEN_PERF_QUERY_TYPE_PIPELINE: + if (query->pipeline_stats.bo) { + perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); + query->pipeline_stats.bo = NULL; } + + query->pipeline_stats.bo = + perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query pipeline stats bo", + STATS_BO_SIZE); + + /* Take starting snapshots. */ + snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); + + ++perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; } + + return true; } void -gen_perf_query_result_clear(struct gen_perf_query_result *result) +gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - memset(result, 0, sizeof(*result)); - result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* Ensure that the work associated with the queried commands will have + * finished before taking our query end counter readings. + * + * For more details see comment in brw_begin_perf_query for + * corresponding flush. + */ + perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + + /* NB: It's possible that the query will have already been marked + * as 'accumulated' if an error was seen while reading samples + * from perf. In this case we mustn't try and emit a closing + * MI_RPC command in case the OA unit has already been disabled + */ + if (!query->oa.results_accumulated) { + /* Take an ending OA counter snapshot. 
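+       * It lands at MI_RPC_BO_END_OFFSET_BYTES with report id begin_report_id + 1 so it can be told apart from the begin report.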
*/ + perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, + MI_FREQ_END_OFFSET_BYTES); + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, + MI_RPC_BO_END_OFFSET_BYTES, + query->oa.begin_report_id + 1); + } + + --perf_ctx->n_active_oa_queries; + + /* NB: even though the query has now ended, it can't be accumulated + * until the end MI_REPORT_PERF_COUNT snapshot has been written + * to query->oa.bo + */ + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, + STATS_BO_END_OFFSET_BYTES); + --perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } } -void -gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *data) +enum OaReadStatus { + OA_READ_STATUS_ERROR, + OA_READ_STATUS_UNFINISHED, + OA_READ_STATUS_FINISHED, +}; + +static enum OaReadStatus +read_oa_samples_until(struct gen_perf_context *perf_ctx, + uint32_t start_timestamp, + uint32_t end_timestamp) { - const struct gen_perf_query_field_layout *layout = &query->perf->query_layout; + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + uint32_t last_timestamp = + tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp; + + while (1) { + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + uint32_t offset; + int len; + + while ((len = read(perf_ctx->oa_stream_fd, buf->buf, + sizeof(buf->buf))) < 0 && errno == EINTR) + ; + + if (len <= 0) { + exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link); + + if (len < 0) { + if (errno == EAGAIN) { + return ((last_timestamp - start_timestamp) < INT32_MAX && + (last_timestamp - start_timestamp) >= + (end_timestamp - start_timestamp)) ? + OA_READ_STATUS_FINISHED : + OA_READ_STATUS_UNFINISHED; + } else { + DBG("Error reading i915 perf samples: %m\n"); + } + } else + DBG("Spurious EOF reading i915 perf samples\n"); + + return OA_READ_STATUS_ERROR; + } - for (uint32_t r = 0; r < layout->n_fields; r++) { - const struct gen_perf_query_field *field = &layout->fields[r]; - const uint32_t *value32 = data + field->location; + buf->len = len; + exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link); - switch (field->type) { - case GEN_PERF_QUERY_FIELD_TYPE_MI_RPC: - fprintf(stderr, "MI_RPC:\n"); - fprintf(stderr, " TS: 0x%08x\n", *(value32 + 1)); - fprintf(stderr, " CLK: 0x%08x\n", *(value32 + 3)); - break; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: - fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32); - break; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: - fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32); - break; - default: - break; + /* Go through the reports and update the last timestamp. */ + offset = 0; + while (offset < buf->len) { + const struct drm_i915_perf_record_header *header = + (const struct drm_i915_perf_record_header *) &buf->buf[offset]; + uint32_t *report = (uint32_t *) (header + 1); + + if (header->type == DRM_I915_PERF_RECORD_SAMPLE) + last_timestamp = report[1]; + + offset += header->size; } + + buf->last_timestamp = last_timestamp; } + + unreachable("not reached"); + return OA_READ_STATUS_ERROR; } -static int -gen_perf_compare_query_names(const void *v1, const void *v2) +/** + * Try to read all the reports until either the delimiting timestamp + * or an error arises. 
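+ * + * Returns false when the end timestamp hasn't been reached yet and the read should be retried; returns true once all reports are in or an error was recorded for accumulate_oa_reports() to handle.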
+ */ +static bool +read_oa_samples_for_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) { - const struct gen_perf_query_info *q1 = v1; - const struct gen_perf_query_info *q2 = v2; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* We need the MI_REPORT_PERF_COUNT to land before we can start + * accumulating. */ + assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo)); + + /* Map the BO once here and let accumulate_oa_reports() unmap + * it. */ + if (query->oa.map == NULL) + query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + return true; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + return true; + } + + /* Read the reports until the end timestamp. */ + switch (read_oa_samples_until(perf_ctx, start[1], end[1])) { + case OA_READ_STATUS_ERROR: + /* Fallthrough and let accumulate_oa_reports() deal with the + * error. */ + case OA_READ_STATUS_FINISHED: + return true; + case OA_READ_STATUS_UNFINISHED: + return false; + } - return strcmp(q1->name, q2->name); + unreachable("invalid read status"); + return false; } -static inline struct gen_perf_query_field * -add_query_register(struct gen_perf_query_field_layout *layout, - enum gen_perf_query_field_type type, - uint16_t offset, - uint16_t size, - uint8_t index) +void +gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) { - /* Align MI_RPC to 64bytes (HW requirement) & 64bit registers to 8bytes - * (shows up nicely in the debugger). + struct gen_perf_config *perf_cfg = perf_ctx->perf; + struct brw_bo *bo = NULL; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + bo = query->oa.bo; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + bo = query->pipeline_stats.bo; + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bo == NULL) + return; + + /* If the current batch references our results bo then we need to + * flush first... */ - if (type == GEN_PERF_QUERY_FIELD_TYPE_MI_RPC) - layout->size = align(layout->size, 64); - else if (size % 8 == 0) - layout->size = align(layout->size, 8); - - layout->fields[layout->n_fields++] = (struct gen_perf_query_field) { - .mmio_offset = offset, - .location = layout->size, - .type = type, - .index = index, - .size = size, - }; - layout->size += size; + if (perf_cfg->vtbl.batch_references(current_batch, bo)) + perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); + + perf_cfg->vtbl.bo_wait_rendering(bo); + + /* Due to a race condition between the OA unit signaling report + * availability and the report actually being written into memory, + * we need to wait for all the reports to come in before we can + * read them.
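+    * (hence the busy loop over read_oa_samples_for_query() below)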
+ */ + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA || + query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) { + while (!read_oa_samples_for_query(perf_ctx, query, current_batch)) + ; + } +} - return &layout->fields[layout->n_fields - 1]; +bool +gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return (query->oa.results_accumulated || + (query->oa.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo) && + read_oa_samples_for_query(perf_ctx, query, current_batch))); + case GEN_PERF_QUERY_TYPE_PIPELINE: + return (query->pipeline_stats.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) && + !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo)); + + default: + unreachable("Unknown query type"); + break; + } + + return false; } +/** + * Remove a query from the global list of unaccumulated queries once + * the OA reports associated with the query have been accumulated in + * accumulate_oa_reports(), or when discarding unwanted query results. + */ static void -gen_perf_init_query_fields(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) +drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - struct gen_perf_query_field_layout *layout = &perf_cfg->query_layout; + for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) { + if (perf_ctx->unaccumulated[i] == query) { + int last_elt = --perf_ctx->unaccumulated_elements; + + if (i == last_elt) + perf_ctx->unaccumulated[i] = NULL; + else { + perf_ctx->unaccumulated[i] = + perf_ctx->unaccumulated[last_elt]; + } + + break; + } + } - layout->n_fields = 0; + /* Drop our samples_head reference so that associated periodic + * sample data buffers can potentially be reaped if they aren't + * referenced by any other queries... + */ - /* MI_RPC requires a 64byte alignment. */ - layout->alignment = 64; + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); - layout->fields = rzalloc_array(perf_cfg, struct gen_perf_query_field, 5 + 16); + assert(buf->refcount > 0); + buf->refcount--; - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, - 0, 256, 0); + query->oa.samples_head = NULL; - if (devinfo->ver <= 11) { - struct gen_perf_query_field *field = - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - PERF_CNT_1_DW0, 8, 0); - field->mask = PERF_CNT_VALUE_MASK; + reap_old_sample_buffers(perf_ctx); +} - field = add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - PERF_CNT_2_DW0, 8, 1); - field->mask = PERF_CNT_VALUE_MASK; - } +/* In general, if we see anything spurious while accumulating results, + * we don't try to continue accumulating the current query hoping for + * the best; we scrap anything outstanding, and then hope for the + * best with new queries.
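+ * Each discarded query also drops its stream reference via dec_n_users().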
+ */ +static void +discard_all_queries(struct gen_perf_context *perf_ctx) +{ + while (perf_ctx->unaccumulated_elements) { + struct gen_perf_query_object *query = perf_ctx->unaccumulated[0]; - if (devinfo->ver == 8 && !devinfo->is_cherryview) { - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GFX7_RPSTAT1, 4, 0); + query->oa.results_accumulated = true; + drop_from_unaccumulated_query_list(perf_ctx, query); + + dec_n_users(perf_ctx); } +} - if (devinfo->ver >= 9) { - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GFX9_RPSTAT0, 4, 0); +/* Looks for the validity bit of context ID (dword 2) of an OA report. */ +static bool +oa_report_ctx_id_valid(const struct gen_device_info *devinfo, + const uint32_t *report) +{ + assert(devinfo->gen >= 8); + if (devinfo->gen == 8) + return (report[0] & (1 << 25)) != 0; + return (report[0] & (1 << 16)) != 0; +} + +/** + * Accumulate raw OA counter values based on deltas between pairs of + * OA reports. + * + * Accumulation starts from the first report captured via + * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the + * last MI_RPC report requested by brw_end_perf_query(). Between these + * two reports there may also be some number of periodically sampled OA + * reports collected via the i915 perf interface - depending on the + * duration of the query. + * + * These periodic snapshots help ensure we handle counter overflow + * correctly by being frequent enough that we don't miss multiple + * overflows of a counter between snapshots. For Gen8+ the i915 perf + * snapshots provide the extra context-switch reports that let us + * subtract out the progress of counters associated with other + * contexts running on the system. + */ +static void +accumulate_oa_reports(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct exec_node *first_samples_node; + bool last_report_ctx_match = true; + int out_duration = 0; + + assert(query->oa.map != NULL); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + goto error; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + goto error; } - if (!can_use_mi_rpc_bc_counters(devinfo)) { - if (devinfo->ver >= 8 && devinfo->ver <= 11) { - for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) { - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, - GFX8_OA_PERF_B32(i), 4, i); - } - for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) { - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, - GFX8_OA_PERF_C32(i), 4, i); + /* See if we have any periodic reports to accumulate too... */ + + /* N.B. The oa.samples_head was set when the query began and + * pointed to the tail of the perf_ctx->sample_buffers list at + * the time the query started. Since the buffer existed before the + * first MI_REPORT_PERF_COUNT command was emitted we therefore know + * that no data in this particular node's buffer can possibly be + * associated with the query - so skip ahead one...
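+    * (first_samples_node below therefore starts at samples_head->next)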
+/**
+ * Accumulate raw OA counter values based on deltas between pairs of
+ * OA reports.
+ *
+ * Accumulation starts from the first report captured via
+ * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
+ * last MI_RPC report requested by brw_end_perf_query(). Between these
+ * two reports there may also be some number of periodically sampled OA
+ * reports collected via the i915 perf interface - depending on the
+ * duration of the query.
+ *
+ * These periodic snapshots help to ensure we handle counter overflow
+ * correctly, by being frequent enough that we don't miss multiple
+ * overflows of a counter between snapshots. For Gen8+ the i915 perf
+ * snapshots provide the extra context-switch reports that let us
+ * subtract out the progress of counters associated with other
+ * contexts running on the system.
+ */
+static void
+accumulate_oa_reports(struct gen_perf_context *perf_ctx,
+                      struct gen_perf_query_object *query)
+{
+   const struct gen_device_info *devinfo = perf_ctx->devinfo;
+   uint32_t *start;
+   uint32_t *last;
+   uint32_t *end;
+   struct exec_node *first_samples_node;
+   bool last_report_ctx_match = true;
+   int out_duration = 0;
+
+   assert(query->oa.map != NULL);
+
+   start = last = query->oa.map;
+   end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
+   if (start[0] != query->oa.begin_report_id) {
+      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+      goto error;
+   }
+   if (end[0] != (query->oa.begin_report_id + 1)) {
+      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+      goto error;
    }

-   if (!can_use_mi_rpc_bc_counters(devinfo)) {
-      if (devinfo->ver >= 8 && devinfo->ver <= 11) {
-         for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
-                               GFX8_OA_PERF_B32(i), 4, i);
-         }
-         for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
-                               GFX8_OA_PERF_C32(i), 4, i);
+   /* See if we have any periodic reports to accumulate too... */
+
+   /* N.B. The oa.samples_head was set when the query began and
+    * pointed to the tail of the perf_ctx->sample_buffers list at
+    * the time the query started. Since the buffer existed before the
+    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
+    * that no data in this particular node's buffer can possibly be
+    * associated with the query - so skip ahead one...
+    */
+   first_samples_node = query->oa.samples_head->next;
+
+   foreach_list_typed_from(struct oa_sample_buf, buf, link,
+                           &perf_ctx->sample_buffers,
+                           first_samples_node)
+   {
+      int offset = 0;
+
+      while (offset < buf->len) {
+         const struct drm_i915_perf_record_header *header =
+            (const struct drm_i915_perf_record_header *)(buf->buf + offset);
+
+         assert(header->size != 0);
+         assert(header->size <= buf->len);
+
+         offset += header->size;
+
+         switch (header->type) {
+         case DRM_I915_PERF_RECORD_SAMPLE: {
+            uint32_t *report = (uint32_t *)(header + 1);
+            bool report_ctx_match = true;
+            bool add = true;
+
+            /* Ignore reports that come before the start marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - start[1]) > 5000000000) {
+               continue;
+            }
+
+            /* Ignore reports that come after the end marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - end[1]) <= 5000000000) {
+               goto end;
+            }
+
+            /* For Gen8+, since the counters continue while other
+             * contexts are running, we need to discount any unrelated
+             * deltas. The hardware automatically generates a report
+             * on context switch, which gives us a new reference point
+             * to continue adding deltas from.
+             *
+             * For Haswell we can rely on the HW to stop the progress
+             * of OA counters while any other context is active.
+             */
+            if (devinfo->gen >= 8) {
+               /* Consider that the current report matches our context only if
+                * the report says its context ID is valid.
+                */
+               report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
+                                  report[2] == start[2];
+               if (report_ctx_match)
+                  out_duration = 0;
+               else
+                  out_duration++;
+
+               /* Only add the delta between <last, report> if the last report
+                * was clearly identified as our context, or if we have at most
+                * 1 report without a matching ID.
+                *
+                * The OA unit will sometimes label reports with an invalid
+                * context ID when i915 rewrites the execlist submit register
+                * with the same context as the one currently running. This
+                * happens when i915 wants to notify the HW of a ringbuffer
+                * tail register update. We have to consider this report as
+                * part of our context, as the 3d pipeline behind the OACS unit
+                * is still processing the operations started at the previous
+                * execlist submission.
+                */
+               add = last_report_ctx_match && out_duration < 2;
+            }
+
+            if (add) {
+               query_result_accumulate(&query->oa.result, query->queryinfo,
+                                       last, report);
+            }
+
+            last = report;
+            last_report_ctx_match = report_ctx_match;
+
+            break;
          }

-      } else if (devinfo->ver == 12) {
-         for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
-                               GFX12_OAG_PERF_B32(i), 4, i);
+
+         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
+            DBG("i915 perf: OA error: all reports lost\n");
+            goto error;
+         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
+            DBG("i915 perf: OA report lost\n");
+            break;
          }
-         for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
-                               GFX12_OAG_PERF_C32(i), 4, i);
+      }
+   }
+
+end:
+
+   query_result_accumulate(&query->oa.result, query->queryinfo,
+                           last, end);
+
+   query->oa.results_accumulated = true;
+   drop_from_unaccumulated_query_list(perf_ctx, query);
+   dec_n_users(perf_ctx);
+
+   return;
+
+error:
+
+   discard_all_queries(perf_ctx);
+}
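
query_result_accumulate() above adds per-counter deltas between consecutive report pairs. The OA counters are free-running, so each delta is again an unsigned subtraction that wraps cleanly, which is exactly why the periodic samples must arrive before a counter can wrap twice. A hedged sketch of the 32-bit case (accumulate_uint32_sketch is a hypothetical name; the real accumulator also handles 40-bit A counters):

    /* Sketch: one 32-bit counter delta between two OA reports.  The
     * modulo-2^32 subtraction absorbs a single wrap; two wraps between
     * reports would be silently lost, hence the frequent snapshots.
     */
    static void
    accumulate_uint32_sketch(const uint32_t *report0, const uint32_t *report1,
                             uint64_t *accumulator)
    {
       *accumulator += (uint32_t)(*report1 - *report0);
    }
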
+
+void
+gen_perf_delete_query(struct gen_perf_context *perf_ctx,
+                      struct gen_perf_query_object *query)
+{
+   struct gen_perf_config *perf_cfg = perf_ctx->perf;
+
+   /* We can assume that the frontend waits for a query to complete
+    * before ever calling into here, so we don't have to worry about
+    * deleting an in-flight query object.
+    */
+   switch (query->queryinfo->kind) {
+   case GEN_PERF_QUERY_TYPE_OA:
+   case GEN_PERF_QUERY_TYPE_RAW:
+      if (query->oa.bo) {
+         if (!query->oa.results_accumulated) {
+            drop_from_unaccumulated_query_list(perf_ctx, query);
+            dec_n_users(perf_ctx);
          }
+
+         perf_cfg->vtbl.bo_unreference(query->oa.bo);
+         query->oa.bo = NULL;
+      }
+
+      query->oa.results_accumulated = false;
+      break;
+
+   case GEN_PERF_QUERY_TYPE_PIPELINE:
+      if (query->pipeline_stats.bo) {
+         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
+         query->pipeline_stats.bo = NULL;
       }
+      break;
+
+   default:
+      unreachable("Unknown query type");
+      break;
    }

-   /* Align the whole package to 64bytes so that 2 snapshots can be put
-    * together without extra alignment for the user.
+   /* As an indication that the INTEL_performance_query extension is no
+    * longer in use, it's a good time to free our cache of sample
+    * buffers and close any current i915-perf stream.
*/ - layout->size = align(layout->size, 64); + if (--perf_ctx->n_query_instances == 0) { + free_sample_bufs(perf_ctx); + gen_perf_close(perf_ctx, query->queryinfo); + } + + free(query); } -void -gen_perf_init_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo, - int drm_fd, - bool include_pipeline_statistics) +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +static void +read_gt_frequency(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) { - gen_perf_init_query_fields(perf_cfg, devinfo); + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)), + end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES)); - if (include_pipeline_statistics) { - load_pipeline_statistic_metrics(perf_cfg, devinfo); - gen_perf_register_mdapi_statistic_query(perf_cfg, devinfo); + switch (devinfo->gen) { + case 7: + case 8: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + break; + case 9: + case 10: + case 11: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + break; + default: + unreachable("unexpected gen"); } - bool oa_metrics = oa_metrics_available(perf_cfg, drm_fd, devinfo); - if (oa_metrics) - load_oa_metrics(perf_cfg, drm_fd, devinfo); + /* Put the numbers into Hz. */ + obj->oa.gt_frequency[0] *= 1000000ULL; + obj->oa.gt_frequency[1] *= 1000000ULL; +} - /* sort query groups by name */ - qsort(perf_cfg->queries, perf_cfg->n_queries, - sizeof(perf_cfg->queries[0]), gen_perf_compare_query_names); +static int +get_oa_counter_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + int written = 0; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t *out_uint64; + float *out_float; + size_t counter_size = gen_perf_query_counter_get_size(counter); + + if (counter_size) { + switch (counter->data_type) { + case GEN_PERF_COUNTER_DATA_TYPE_UINT64: + out_uint64 = (uint64_t *)(data + counter->offset); + *out_uint64 = + counter->oa_counter_read_uint64(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: + out_float = (float *)(data + counter->offset); + *out_float = + counter->oa_counter_read_float(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + default: + /* So far we aren't using uint32, double or bool32... 
*/ + unreachable("unexpected counter data type"); + } + written = counter->offset + counter_size; + } + } + + return written; +} + +static int +get_pipeline_stats_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) + +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + uint8_t *p = data; + + uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ); + uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t)); + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t value = end[i] - start[i]; + + if (counter->pipeline_stat.numerator != + counter->pipeline_stat.denominator) { + value *= counter->pipeline_stat.numerator; + value /= counter->pipeline_stat.denominator; + } + + *((uint64_t *)p) = value; + p += 8; + } - build_unique_counter_list(perf_cfg); + perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo); - if (oa_metrics) - gen_perf_register_mdapi_oa_query(perf_cfg, devinfo); + return p - data; +} + +void +gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + int written = 0; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (!query->oa.results_accumulated) { + read_gt_frequency(perf_ctx, query); + uint32_t *begin_report = query->oa.map; + uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + query_result_read_frequencies(&query->oa.result, + perf_ctx->devinfo, + begin_report, + end_report); + accumulate_oa_reports(perf_ctx, query); + assert(query->oa.results_accumulated); + + perf_cfg->vtbl.bo_unmap(query->oa.bo); + query->oa.map = NULL; + } + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) { + written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data); + } else { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size, + devinfo, &query->oa.result, + query->oa.gt_frequency[0], + query->oa.gt_frequency[1]); + } + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data); + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bytes_written) + *bytes_written = written; +} + +void +gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) +{ + DBG("Queries: (Open queries = %d, OA users = %d)\n", + perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); +} + +void +gen_perf_dump_query(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + void *current_batch) +{ + switch (obj->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + DBG("BO: %-4s OA data: %-10s %-15s\n", + obj->oa.bo ? "yes," : "no,", + gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", + obj->oa.results_accumulated ? "accumulated" : "not accumulated"); + break; + case GEN_PERF_QUERY_TYPE_PIPELINE: + DBG("BO: %-4s\n", + obj->pipeline_stats.bo ? 
"yes" : "no"); + break; + default: + unreachable("Unknown query type"); + break; + } } diff --git a/lib/mesa/src/intel/perf/gen_perf.h b/lib/mesa/src/intel/perf/gen_perf.h index 6b061c420..e33d9b0c9 100644 --- a/lib/mesa/src/intel/perf/gen_perf.h +++ b/lib/mesa/src/intel/perf/gen_perf.h @@ -25,7 +25,6 @@ #define GEN_PERF_H #include <stdio.h> -#include <stdbool.h> #include <stdint.h> #include <string.h> @@ -39,13 +38,23 @@ #include "compiler/glsl/list.h" #include "util/ralloc.h" -#include "drm-uapi/i915_drm.h" - struct gen_device_info; struct gen_perf_config; struct gen_perf_query_info; +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + enum gen_perf_counter_type { GEN_PERF_COUNTER_TYPE_EVENT, GEN_PERF_COUNTER_TYPE_DURATION_NORM, @@ -63,39 +72,6 @@ enum gen_perf_counter_data_type { GEN_PERF_COUNTER_DATA_TYPE_DOUBLE, }; -enum gen_perf_counter_units { - /* size */ - GEN_PERF_COUNTER_UNITS_BYTES, - - /* frequency */ - GEN_PERF_COUNTER_UNITS_HZ, - - /* time */ - GEN_PERF_COUNTER_UNITS_NS, - GEN_PERF_COUNTER_UNITS_US, - - /**/ - GEN_PERF_COUNTER_UNITS_PIXELS, - GEN_PERF_COUNTER_UNITS_TEXELS, - GEN_PERF_COUNTER_UNITS_THREADS, - GEN_PERF_COUNTER_UNITS_PERCENT, - - /* events */ - GEN_PERF_COUNTER_UNITS_MESSAGES, - GEN_PERF_COUNTER_UNITS_NUMBER, - GEN_PERF_COUNTER_UNITS_CYCLES, - GEN_PERF_COUNTER_UNITS_EVENTS, - GEN_PERF_COUNTER_UNITS_UTILIZATION, - - /**/ - GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE, - - GEN_PERF_COUNTER_UNITS_MAX -}; - struct gen_pipeline_stat { uint32_t reg; uint32_t numerator; @@ -106,12 +82,23 @@ struct gen_pipeline_stat { * The largest OA formats we can use include: * For Haswell: * 1 timestamp, 45 A counters, 8 B counters and 8 C counters. - * For Gfx8+ + * For Gen8+ * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters - * - * Plus 2 PERF_CNT registers and 1 RPSTAT register. */ -#define MAX_OA_REPORT_COUNTERS (62 + 2 + 1) +#define MAX_OA_REPORT_COUNTERS 62 + +#define IA_VERTICES_COUNT 0x2310 +#define IA_PRIMITIVES_COUNT 0x2318 +#define VS_INVOCATION_COUNT 0x2320 +#define HS_INVOCATION_COUNT 0x2300 +#define DS_INVOCATION_COUNT 0x2308 +#define GS_INVOCATION_COUNT 0x2328 +#define GS_PRIMITIVES_COUNT 0x2330 +#define CL_INVOCATION_COUNT 0x2338 +#define CL_PRIMITIVES_COUNT 0x2340 +#define PS_INVOCATION_COUNT 0x2348 +#define CS_INVOCATION_COUNT 0x2290 +#define PS_DEPTH_COUNT 0x2350 /* * When currently allocate only one page for pipeline statistics queries. Here @@ -151,41 +138,23 @@ struct gen_perf_query_result { * query. */ uint64_t unslice_frequency[2]; - - /** - * Frequency of the whole GT at the begin and end of the query. - */ - uint64_t gt_frequency[2]; - - /** - * Timestamp of the query. - */ - uint64_t begin_timestamp; - - /** - * Whether the query was interrupted by another workload (aka preemption). 
- */ - bool query_disjoint; }; struct gen_perf_query_counter { const char *name; const char *desc; - const char *symbol_name; - const char *category; enum gen_perf_counter_type type; enum gen_perf_counter_data_type data_type; - enum gen_perf_counter_units units; uint64_t raw_max; size_t offset; union { uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const struct gen_perf_query_result *results); + const uint64_t *accumulator); float (*oa_counter_read_float)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const struct gen_perf_query_result *results); + const uint64_t *accumulator); struct gen_pipeline_stat pipeline_stat; }; }; @@ -195,28 +164,13 @@ struct gen_perf_query_register_prog { uint32_t val; }; -/* Register programming for a given query */ -struct gen_perf_registers { - const struct gen_perf_query_register_prog *flex_regs; - uint32_t n_flex_regs; - - const struct gen_perf_query_register_prog *mux_regs; - uint32_t n_mux_regs; - - const struct gen_perf_query_register_prog *b_counter_regs; - uint32_t n_b_counter_regs; -}; - struct gen_perf_query_info { - struct gen_perf_config *perf; - enum gen_perf_query_type { GEN_PERF_QUERY_TYPE_OA, GEN_PERF_QUERY_TYPE_RAW, GEN_PERF_QUERY_TYPE_PIPELINE, } kind; const char *name; - const char *symbol_name; const char *guid; struct gen_perf_query_counter *counters; int n_counters; @@ -233,90 +187,22 @@ struct gen_perf_query_info { int a_offset; int b_offset; int c_offset; - int perfcnt_offset; - int rpstat_offset; - - struct gen_perf_registers config; -}; - -/* When not using the MI_RPC command, this structure describes the list of - * register offsets as well as their storage location so that they can be - * stored through a series of MI_SRM commands and accumulated with - * gen_perf_query_result_accumulate_snapshots(). - */ -struct gen_perf_query_field_layout { - /* Alignment for the layout */ - uint32_t alignment; - - /* Size of the whole layout */ - uint32_t size; - - uint32_t n_fields; - - struct gen_perf_query_field { - /* MMIO location of this register */ - uint16_t mmio_offset; - - /* Location of this register in the storage */ - uint16_t location; - - /* Type of register, for accumulation (see gen_perf_query_info:*_offset - * fields) - */ - enum gen_perf_query_field_type { - GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, - GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, - } type; - - /* Index of register in the given type (for instance A31 or B2, - * etc...) - */ - uint8_t index; - - /* 4, 8 or 256 */ - uint16_t size; - - /* If not 0, mask to apply to the register value. */ - uint64_t mask; - } *fields; -}; -struct gen_perf_query_counter_info { - struct gen_perf_query_counter *counter; + /* Register programming for a given query */ + struct gen_perf_query_register_prog *flex_regs; + uint32_t n_flex_regs; - uint64_t query_mask; + struct gen_perf_query_register_prog *mux_regs; + uint32_t n_mux_regs; - /** - * Each counter can be a part of many groups, each time at different index. - * This struct stores one of those locations. - */ - struct { - int group_idx; /* query/group number */ - int counter_idx; /* index inside of query/group */ - } location; + struct gen_perf_query_register_prog *b_counter_regs; + uint32_t n_b_counter_regs; }; struct gen_perf_config { - /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. 
*/ - bool i915_query_supported; - - /* Version of the i915-perf subsystem, refer to i915_drm.h. */ - int i915_perf_version; - - /* Powergating configuration for the running the query. */ - struct drm_i915_gem_context_param_sseu sseu; - struct gen_perf_query_info *queries; int n_queries; - struct gen_perf_query_counter_info *counter_infos; - int n_counters; - - struct gen_perf_query_field_layout query_layout; - /* Variables referenced in the XML meta data for OA performance * counters, e.g in the normalization equations. * @@ -333,7 +219,6 @@ struct gen_perf_config { uint64_t gt_min_freq; /** $GpuMinFrequency */ uint64_t gt_max_freq; /** $GpuMaxFrequency */ uint64_t revision; /** $SkuRevisionId */ - bool query_mode; /** $QueryMode */ } sys_vars; /* OA metric sets, indexed by GUID, as know by Mesa at build time, to @@ -342,17 +227,6 @@ struct gen_perf_config { */ struct hash_table *oa_metrics_table; - /* When MDAPI hasn't configured the metric we need to use by the time the - * query begins, this OA metric is used as a fallback. - */ - uint64_t fallback_raw_oa_metric; - - /* Whether we have support for this platform. If true && n_queries == 0, - * this means we will not be able to use i915-perf because of it is in - * paranoid mode. - */ - bool platform_supported; - /* Location of the device's sysfs entry. */ char sysfs_dev_dir[256]; @@ -364,96 +238,41 @@ struct gen_perf_config { bool (*batch_references)(void *batch, void *bo); void (*bo_wait_rendering)(void *bo); int (*bo_busy)(void *bo); - void (*emit_stall_at_pixel_scoreboard)(void *ctx); + void (*emit_mi_flush)(void *ctx); void (*emit_mi_report_perf_count)(void *ctx, void *bo, uint32_t offset_in_bytes, uint32_t report_id); void (*batchbuffer_flush)(void *ctx, const char *file, int line); - void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset); + void (*capture_frequency_stat_register)(void *ctx, void *bo, + uint32_t bo_offset); + void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset); } vtbl; }; -struct gen_perf_counter_pass { - struct gen_perf_query_info *query; - struct gen_perf_query_counter *counter; - uint32_t pass; -}; +struct gen_perf_query_object; +const struct gen_perf_query_info* gen_perf_query_info(const struct gen_perf_query_object *); + +struct gen_perf_context; +struct gen_perf_context *gen_perf_new_context(void *parent); void gen_perf_init_metrics(struct gen_perf_config *perf_cfg, const struct gen_device_info *devinfo, - int drm_fd, - bool include_pipeline_statistics); - -/** Query i915 for a metric id using guid. - */ -bool gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, - const char *guid, - uint64_t *metric_id); - -/** Load a configuation's content from i915 using a guid. - */ -struct gen_perf_registers *gen_perf_load_configuration(struct gen_perf_config *perf_cfg, - int fd, const char *guid); - -/** Store a configuration into i915 using guid and return a new metric id. - * - * If guid is NULL, then a generated one will be provided by hashing the - * content of the configuration. - */ -uint64_t gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, - const struct gen_perf_registers *config, - const char *guid); - -/** Read the slice/unslice frequency from 2 OA reports and store then into - * result. 
- */ -void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end); - -/** Store the GT frequency as reported by the RPSTAT register. - */ -void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t start, - const uint32_t end); - -/** Store PERFCNT registers values. - */ -void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint64_t *start, - const uint64_t *end); - -/** Accumulate the delta between 2 OA reports into result for a given query. - */ -void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end); - -/** Accumulate the delta between 2 snapshots of OA perf registers (layout - * should match description specified through gen_perf_query_register_layout). - */ -void gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *start, - const void *end, - bool no_oa_accumulate); + int drm_fd); +void gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd); -void gen_perf_query_result_clear(struct gen_perf_query_result *result); +struct gen_perf_config *gen_perf_config(struct gen_perf_context *ctx); -/** Debug helper printing out query data. - */ -void gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *data); +int gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query); static inline size_t gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter) @@ -481,33 +300,31 @@ gen_perf_new(void *ctx) return perf; } -/** Whether we have the ability to hold off preemption on a batch so we don't - * have to look at the OA buffer to subtract unrelated workloads off the - * values captured through MI_* commands. - */ -static inline bool -gen_perf_has_hold_preemption(const struct gen_perf_config *perf) -{ - return perf->i915_perf_version >= 3; -} - -/** Whether we have the ability to lock EU array power configuration for the - * duration of the performance recording. This is useful on Gfx11 where the HW - * architecture requires half the EU for particular workloads. 
- */ -static inline bool -gen_perf_has_global_sseu(const struct gen_perf_config *perf) -{ - return perf->i915_perf_version >= 4; -} - -uint32_t gen_perf_get_n_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_query_info **pass_queries); -void gen_perf_get_counters_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_counter_pass *counter_pass); +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *, unsigned query_index); + + +bool gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +bool gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +void gen_perf_delete_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written); + +void gen_perf_dump_query_count(struct gen_perf_context *perf_ctx); +void gen_perf_dump_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj, + void *current_batch); #endif /* GEN_PERF_H */ diff --git a/lib/mesa/src/intel/perf/gen_perf_mdapi.c b/lib/mesa/src/intel/perf/gen_perf_mdapi.c index 5508baba5..38ca23088 100644 --- a/lib/mesa/src/intel/perf/gen_perf_mdapi.c +++ b/lib/mesa/src/intel/perf/gen_perf_mdapi.c @@ -23,23 +23,18 @@ #include "gen_perf.h" #include "gen_perf_mdapi.h" -#include "gen_perf_private.h" -#include "gen_perf_regs.h" #include "dev/gen_device_info.h" -#include <drm-uapi/i915_drm.h> - - int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_device_info *devinfo, - const struct gen_perf_query_info *query, - const struct gen_perf_query_result *result) + const struct gen_perf_query_result *result, + uint64_t freq_start, uint64_t freq_end) { - switch (devinfo->ver) { + switch (devinfo->gen) { case 7: { - struct gfx7_mdapi_metrics *mdapi_data = (struct gfx7_mdapi_metrics *) data; + struct gen7_mdapi_metrics *mdapi_data = (struct gen7_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -54,19 +49,15 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; - mdapi_data->SplitOccured = result->query_disjoint; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; return sizeof(*mdapi_data); } case 8: { - struct gfx8_mdapi_metrics *mdapi_data = (struct gfx8_mdapi_metrics *) data; + struct gen8_mdapi_metrics *mdapi_data = (struct gen8_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -78,29 +69,23 @@ 
gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->BeginTimestamp = - gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; - mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } case 9: - case 11: - case 12:{ - struct gfx9_mdapi_metrics *mdapi_data = (struct gfx9_mdapi_metrics *) data; + case 10: + case 11: { + struct gen9_mdapi_metrics *mdapi_data = (struct gen9_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -112,257 +97,20 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->BeginTimestamp = - gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; - mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } default: unreachable("unexpected gen"); } } - -void -gen_perf_register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) -{ - if (!(devinfo->ver >= 7 && devinfo->ver <= 12)) - return; - - struct gen_perf_query_info *query = - gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); - - query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; - query->name = "Intel_Raw_Pipeline_Statistics_Query"; - - /* The order has to match mdapi_pipeline_metrics. 
*/ - gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - if (devinfo->is_haswell || devinfo->ver == 8) { - gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); - } else { - gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); - } - gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - if (devinfo->ver >= 7) { - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); - } - - if (devinfo->ver >= 10) { - /* Reuse existing CS invocation register until we can expose this new - * one. - */ - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "Reserved1"); - } - - query->data_size = sizeof(uint64_t) * query->n_counters; -} - -static void -fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, - const char *name, - uint32_t data_offset, - uint32_t data_size, - enum gen_perf_counter_data_type data_type) -{ - struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; - - assert(query->n_counters <= query->max_counters); - - counter->name = name; - counter->desc = "Raw counter value"; - counter->type = GEN_PERF_COUNTER_TYPE_RAW; - counter->data_type = data_type; - counter->offset = data_offset; - - query->n_counters++; - - assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); -} - -#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ - fill_mdapi_perf_query_counter(query, #field_name, \ - (uint8_t *) &struct_name.field_name - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) -#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ - fill_mdapi_perf_query_counter(query, \ - ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ - (uint8_t *) &struct_name.field_name[idx] - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name[0]), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) - -void -gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) -{ - struct gen_perf_query_info *query = NULL; - - /* MDAPI requires different structures for pretty much every generation - * (right now we have definitions for gen 7 to 12). 
- */ - if (!(devinfo->ver >= 7 && devinfo->ver <= 12)) - return; - - switch (devinfo->ver) { - case 7: { - query = gen_perf_append_query_info(perf, 1 + 45 + 16 + 7); - query->oa_format = I915_OA_FORMAT_A45_B8_C8; - - struct gfx7_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, ACounters, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NOACounters, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 8: { - query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gfx8_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 9: - case 11: - case 12: { - query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gfx9_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < 
ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, UserCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); - break; - } - default: - unreachable("Unsupported gen"); - break; - } - - query->kind = GEN_PERF_QUERY_TYPE_RAW; - query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; - query->guid = GEN_PERF_QUERY_GUID_MDAPI; - - { - /* Accumulation buffer offsets copied from an actual query... */ - const struct gen_perf_query_info *copy_query = - &perf->queries[0]; - - query->gpu_time_offset = copy_query->gpu_time_offset; - query->gpu_clock_offset = copy_query->gpu_clock_offset; - query->a_offset = copy_query->a_offset; - query->b_offset = copy_query->b_offset; - query->c_offset = copy_query->c_offset; - query->perfcnt_offset = copy_query->perfcnt_offset; - } -} diff --git a/lib/mesa/src/intel/perf/gen_perf_mdapi.h b/lib/mesa/src/intel/perf/gen_perf_mdapi.h index 4e77e2beb..3c3aec2c6 100644 --- a/lib/mesa/src/intel/perf/gen_perf_mdapi.h +++ b/lib/mesa/src/intel/perf/gen_perf_mdapi.h @@ -26,8 +26,7 @@ #include <stdint.h> -#include "dev/gen_device_info.h" - +struct gen_device_info; struct gen_perf_query_result; /* Guid has to matches with MDAPI's. */ @@ -37,7 +36,7 @@ struct gen_perf_query_result; * Data format expected by MDAPI. 
*/ -struct gfx7_mdapi_metrics { +struct gen7_mdapi_metrics { uint64_t TotalTime; uint64_t ACounters[45]; @@ -55,7 +54,7 @@ struct gfx7_mdapi_metrics { #define GTDI_QUERY_BDW_METRICS_OA_COUNT 36 #define GTDI_QUERY_BDW_METRICS_OA_40b_COUNT 32 #define GTDI_QUERY_BDW_METRICS_NOA_COUNT 16 -struct gfx8_mdapi_metrics { +struct gen8_mdapi_metrics { uint64_t TotalTime; uint64_t GPUTicks; uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; @@ -81,7 +80,7 @@ struct gfx8_mdapi_metrics { #define GTDI_MAX_READ_REGS 16 -struct gfx9_mdapi_metrics { +struct gen9_mdapi_metrics { uint64_t TotalTime; uint64_t GPUTicks; uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; @@ -110,7 +109,8 @@ struct gfx9_mdapi_metrics { }; /* Add new definition */ -#define gfx11_mdapi_metrics gfx9_mdapi_metrics +#define gen10_mdapi_metrics gen9_mdapi_metrics +#define gen11_mdapi_metrics gen9_mdapi_metrics struct mdapi_pipeline_metrics { uint64_t IAVertices; @@ -124,37 +124,12 @@ struct mdapi_pipeline_metrics { uint64_t HSInvocations; uint64_t DSInvocations; uint64_t CSInvocations; - uint64_t Reserved1; /* Gfx10+ */ + uint64_t Reserved1; /* Gen10+ */ }; int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_device_info *devinfo, - const struct gen_perf_query_info *query, - const struct gen_perf_query_result *result); - -static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size, - const struct gen_device_info *devinfo, - uint64_t value) -{ - switch (devinfo->ver) { - case 8: { - if (data_size < sizeof(struct gfx8_mdapi_metrics)) - return; - struct gfx8_mdapi_metrics *mdapi_data = data; - mdapi_data->MarkerUser = value; - break; - } - case 9: - case 11: { - if (data_size < sizeof(struct gfx9_mdapi_metrics)) - return; - struct gfx9_mdapi_metrics *mdapi_data = data; - mdapi_data->MarkerUser = value; - break; - } - default: - break; - } -} + const struct gen_perf_query_result *result, + uint64_t freq_start, uint64_t freq_end); #endif /* GEN_PERF_MDAPI_H */ diff --git a/lib/mesa/src/loader/Android.mk b/lib/mesa/src/loader/Android.mk index 6aaaa1dac..ca9218846 100644 --- a/lib/mesa/src/loader/Android.mk +++ b/lib/mesa/src/loader/Android.mk @@ -35,10 +35,6 @@ LOCAL_SRC_FILES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH) -ifneq ($(HAVE_GALLIUM_IRIS),) -LOCAL_CFLAGS += -DPREFER_IRIS -endif - LOCAL_MODULE := libmesa_loader include $(MESA_COMMON_MK) diff --git a/lib/mesa/src/mesa/Android.gen.mk b/lib/mesa/src/mesa/Android.gen.mk index ae79a7cc0..ff4f5e4e4 100644 --- a/lib/mesa/src/mesa/Android.gen.mk +++ b/lib/mesa/src/mesa/Android.gen.mk @@ -36,17 +36,11 @@ sources := \ main/dispatch.h \ main/format_fallback.c \ main/format_pack.c \ + main/format_unpack.c \ main/format_info.h \ main/remap_helper.h \ main/get_hash.h \ - main/marshal_generated0.c \ - main/marshal_generated1.c \ - main/marshal_generated2.c \ - main/marshal_generated3.c \ - main/marshal_generated4.c \ - main/marshal_generated5.c \ - main/marshal_generated6.c \ - main/marshal_generated7.c \ + main/marshal_generated.c \ main/marshal_generated.h LOCAL_SRC_FILES := $(filter-out $(sources), $(LOCAL_SRC_FILES)) @@ -93,52 +87,10 @@ $(intermediates)/main/api_exec.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml $(intermediates)/main/api_exec.c: $(dispatch_deps) $(call es-gen) -$(intermediates)/main/marshal_generated0.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated0.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 0 -n 8 +$(intermediates)/main/marshal_generated.c: 
PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py +$(intermediates)/main/marshal_generated.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -$(intermediates)/main/marshal_generated0.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated1.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated1.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 1 -n 8 - -$(intermediates)/main/marshal_generated1.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated2.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated2.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 2 -n 8 - -$(intermediates)/main/marshal_generated2.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated3.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated3.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 3 -n 8 - -$(intermediates)/main/marshal_generated3.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated4.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated4.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 4 -n 8 - -$(intermediates)/main/marshal_generated4.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated5.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated5.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 5 -n 8 - -$(intermediates)/main/marshal_generated5.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated6.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated6.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 6 -n 8 - -$(intermediates)/main/marshal_generated6.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated7.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated7.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 7 -n 8 - -$(intermediates)/main/marshal_generated7.c: $(dispatch_deps) +$(intermediates)/main/marshal_generated.c: $(dispatch_deps) $(call es-gen) $(intermediates)/main/marshal_generated.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal_h.py @@ -187,3 +139,14 @@ $(intermediates)/main/format_pack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_ $(intermediates)/main/format_pack.c: PRIVATE_XML := $(intermediates)/main/format_pack.c: $(format_pack_deps) $(call es-gen, $<) + +FORMAT_UNPACK := $(LOCAL_PATH)/main/format_unpack.py +format_unpack_deps := \ + $(LOCAL_PATH)/main/formats.csv \ + $(LOCAL_PATH)/main/format_parser.py \ + $(FORMAT_UNPACK) + +$(intermediates)/main/format_unpack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_UNPACK) +$(intermediates)/main/format_unpack.c: PRIVATE_XML := +$(intermediates)/main/format_unpack.c: $(format_unpack_deps) + $(call es-gen, $<) diff --git a/lib/mesa/src/mesa/Android.libmesa_dricore.mk b/lib/mesa/src/mesa/Android.libmesa_dricore.mk index 8eb6aabe8..792117767 100644 --- a/lib/mesa/src/mesa/Android.libmesa_dricore.mk +++ b/lib/mesa/src/mesa/Android.libmesa_dricore.mk @@ -39,9 +39,11 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := \ $(MESA_FILES) +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 +endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := 
\ diff --git a/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk b/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk index 66b6ef13a..0d83cd5a9 100644 --- a/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk +++ b/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk @@ -43,6 +43,7 @@ LOCAL_C_INCLUDES := \ LOCAL_SRC_FILES := \ main/extensions_table.c \ + main/imports.c \ program/symbol_table.c \ program/dummy_errors.c @@ -67,6 +68,7 @@ LOCAL_C_INCLUDES := \ LOCAL_SRC_FILES := \ main/extensions_table.c \ + main/imports.c \ program/symbol_table.c \ program/dummy_errors.c diff --git a/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk b/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk index 16153a3c5..ddfd03059 100644 --- a/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk +++ b/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk @@ -42,9 +42,11 @@ LOCAL_GENERATED_SOURCES := \ $(MESA_GEN_GLSL_H) \ $(MESA_GEN_NIR_H) +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 +endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := \ diff --git a/lib/mesa/src/mesa/drivers/dri/Android.mk b/lib/mesa/src/mesa/drivers/dri/Android.mk index fb7d97f1a..60c8476a3 100644 --- a/lib/mesa/src/mesa/drivers/dri/Android.mk +++ b/lib/mesa/src/mesa/drivers/dri/Android.mk @@ -51,7 +51,6 @@ MESA_DRI_SHARED_LIBRARIES := \ libdl \ libglapi \ liblog \ - libsync \ libz # If Android version >=8 MESA should static link libexpat else should dynamic link diff --git a/lib/mesa/src/mesa/drivers/dri/i965/Android.mk b/lib/mesa/src/mesa/drivers/dri/i965/Android.mk index 816492581..29b46147f 100644 --- a/lib/mesa/src/mesa/drivers/dri/i965/Android.mk +++ b/lib/mesa/src/mesa/drivers/dri/i965/Android.mk @@ -40,201 +40,223 @@ I965_PERGEN_STATIC_LIBRARIES := \ libmesa_nir I965_PERGEN_LIBS := \ - libmesa_i965_gfx4 \ - libmesa_i965_gfx45 \ - libmesa_i965_gfx5 \ - libmesa_i965_gfx6 \ - libmesa_i965_gfx7 \ - libmesa_i965_gfx75 \ - libmesa_i965_gfx8 \ - libmesa_i965_gfx9 \ - libmesa_i965_gfx11 + libmesa_i965_gen4 \ + libmesa_i965_gen45 \ + libmesa_i965_gen5 \ + libmesa_i965_gen6 \ + libmesa_i965_gen7 \ + libmesa_i965_gen75 \ + libmesa_i965_gen8 \ + libmesa_i965_gen9 \ + libmesa_i965_gen10 \ + libmesa_i965_gen11 # --------------------------------------- -# Build libmesa_i965_gfx4 +# Build libmesa_i965_gen4 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx4 +LOCAL_MODULE := libmesa_i965_gen4 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx4_FILES) +LOCAL_SRC_FILES := $(i965_gen4_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=40 +LOCAL_CFLAGS := -DGEN_VERSIONx10=40 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx45 +# Build libmesa_i965_gen45 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx45 +LOCAL_MODULE := libmesa_i965_gen45 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx45_FILES) +LOCAL_SRC_FILES := $(i965_gen45_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=45 +LOCAL_CFLAGS := -DGEN_VERSIONx10=45 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx5 +# Build 
libmesa_i965_gen5 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx5 +LOCAL_MODULE := libmesa_i965_gen5 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx5_FILES) +LOCAL_SRC_FILES := $(i965_gen5_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=50 +LOCAL_CFLAGS := -DGEN_VERSIONx10=50 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx6 +# Build libmesa_i965_gen6 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx6 +LOCAL_MODULE := libmesa_i965_gen6 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx6_FILES) +LOCAL_SRC_FILES := $(i965_gen6_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=60 +LOCAL_CFLAGS := -DGEN_VERSIONx10=60 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx7 +# Build libmesa_i965_gen7 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx7 +LOCAL_MODULE := libmesa_i965_gen7 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx7_FILES) +LOCAL_SRC_FILES := $(i965_gen7_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx75 +# Build libmesa_i965_gen75 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx75 +LOCAL_MODULE := libmesa_i965_gen75 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx75_FILES) +LOCAL_SRC_FILES := $(i965_gen75_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx8 +# Build libmesa_i965_gen8 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx8 +LOCAL_MODULE := libmesa_i965_gen8 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx8_FILES) +LOCAL_SRC_FILES := $(i965_gen8_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx9 +# Build libmesa_i965_gen9 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx9 +LOCAL_MODULE := libmesa_i965_gen9 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx9_FILES) +LOCAL_SRC_FILES := $(i965_gen9_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # 
--------------------------------------- -# Build libmesa_i965_gfx11 +# Build libmesa_i965_gen10 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx11 +LOCAL_MODULE := libmesa_i965_gen10 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx11_FILES) +LOCAL_SRC_FILES := $(i965_gen10_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + +# --------------------------------------- +# Build libmesa_i965_gen11 +# --------------------------------------- + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_i965_gen11 + +LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) + +LOCAL_SRC_FILES := $(i965_gen11_FILES) + +LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) + +LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) + +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/mesa/program/Android.mk b/lib/mesa/src/mesa/program/Android.mk index 6b4e19167..c6470e628 100644 --- a/lib/mesa/src/mesa/program/Android.mk +++ b/lib/mesa/src/mesa/program/Android.mk @@ -23,7 +23,7 @@ LOCAL_PATH := $(call my-dir) define local-l-to-c @mkdir -p $(dir $@) @echo "Mesa Lex: $(PRIVATE_MODULE) <= $<" - $(hide) $(MESA_LEX) -o$@ $< + $(hide) $(LEX) -o$@ $< endef define mesa_local-y-to-c-and-h diff --git a/lib/mesa/src/panfrost/Android.mk b/lib/mesa/src/panfrost/Android.mk index 0681651ab..9ab5ddf9f 100644 --- a/lib/mesa/src/panfrost/Android.mk +++ b/lib/mesa/src/panfrost/Android.mk @@ -25,8 +25,4 @@ LOCAL_PATH := $(call my-dir) include $(LOCAL_PATH)/Makefile.sources -include $(LOCAL_PATH)/Android.util.mk -include $(LOCAL_PATH)/Android.bifrost.mk -include $(LOCAL_PATH)/Android.lib.mk -include $(LOCAL_PATH)/Android.midgard.mk include $(LOCAL_PATH)/Android.shared.mk diff --git a/lib/mesa/src/panfrost/Android.shared.mk b/lib/mesa/src/panfrost/Android.shared.mk index 81024607e..6b921756e 100644 --- a/lib/mesa/src/panfrost/Android.shared.mk +++ b/lib/mesa/src/panfrost/Android.shared.mk @@ -33,7 +33,7 @@ LOCAL_SRC_FILES := \ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary/ \ - $(MESA_TOP)/src/gallium/include/ + $(MESA_TOP)/src/gallium/include/ \ LOCAL_STATIC_LIBRARIES := \ diff --git a/lib/mesa/src/panfrost/Makefile.sources b/lib/mesa/src/panfrost/Makefile.sources index 2f55e07db..3ab90f279 100644 --- a/lib/mesa/src/panfrost/Makefile.sources +++ b/lib/mesa/src/panfrost/Makefile.sources @@ -1,121 +1,3 @@ -bifrost_FILES := \ - bifrost/bifrost.h \ - bifrost/bifrost_compile.c \ - bifrost/bifrost_compile.h \ - bifrost/bi_layout.c \ - bifrost/bi_liveness.c \ - bifrost/bi_lower_swizzle.c \ - bifrost/bi_schedule.c \ - bifrost/bi_scoreboard.c \ - bifrost/bi_pack.c \ - bifrost/bi_print.c \ - bifrost/bi_print.h \ - bifrost/bi_ra.c \ - bifrost/bi_opt_copy_prop.c \ - bifrost/bi_opt_dce.c \ - bifrost/bi_opt_push_ubo.c \ - bifrost/bi_quirks.h \ - bifrost/bi_test_pack.c \ - bifrost/bir.c \ - bifrost/compiler.h \ - bifrost/cmdline.c - -bifrost_disasm_FILES := \ - bifrost/disassemble.c \ - bifrost/disassemble.h \ - bifrost/bi_print_common.c \ - bifrost/bi_print_common.h - -lib_FILES := \ - lib/decode_common.c \ - lib/decode.c \ - lib/pan_afbc.c \ - lib/pan_attributes.c \ - lib/pan_bo.c \ - lib/pan_bo.h \ - lib/pan_blend.c \ - lib/pan_blend.h \ - lib/pan_blitter.c \ - lib/pan_blitter.h \ 
-	lib/pan_cs.c \
-	lib/pan_cs.h \
-	lib/pan_device.h \
-	lib/pan_encoder.h \
-	lib/pan_format.c \
-	lib/pan_indirect_draw.c \
-	lib/pan_indirect_draw.h \
-	lib/pan_invocation.c \
-	lib/pan_pool.c \
-	lib/pan_pool.h \
-	lib/pan_props.c \
-	lib/pan_sampler.c \
-	lib/pan_samples.c \
-	lib/pan_shader.c \
-	lib/pan_shader.h \
-	lib/pan_scoreboard.c \
-	lib/pan_scoreboard.h \
-	lib/pan_tiler.c \
-	lib/pan_texture.c \
-	lib/pan_scratch.c \
-	lib/pan_util.h
-
-midgard_FILES := \
-	midgard/compiler.h \
-	midgard/disassemble.c \
-	midgard/disassemble.h \
-	midgard/helpers.h \
-	midgard/midgard_address.c \
-	midgard/midgard_compile.c \
-	midgard/midgard_compile.h \
-	midgard/midgard_derivatives.c \
-	midgard/midgard_emit.c \
-	midgard/midgard.h \
-	midgard/midgard_liveness.c \
-	midgard/midgard_nir_lower_helper_writes.c \
-	midgard/midgard_helper_invocations.c \
-	midgard/midgard_nir.h \
-	midgard/midgard_nir_lower_image_bitsize.c \
-	midgard/midgard_ops.c \
-	midgard/midgard_ops.h \
-	midgard/midgard_opt_copy_prop.c \
-	midgard/midgard_opt_dce.c \
-	midgard/midgard_opt_perspective.c \
-	midgard/midgard-parse.h \
-	midgard/midgard_print.c \
-	midgard/midgard_ra.c \
-	midgard/midgard_ra_pipeline.c \
-	midgard/midgard_schedule.c \
-	midgard/midgard_errata_lod.c \
-	midgard/mir.c \
-	midgard/mir_promote_uniforms.c \
-	midgard/mir_squeeze.c \
-	midgard/nir_fuse_io_16.c \
-
-midgard_disasm_FILES := \
-	midgard/disassemble.c \
-	midgard/disassemble.h \
-	midgard/midgard_ops.c \
-	midgard/midgard_ops.h \
-	midgard/midgard_print_constant.c
-
shared_FILES := \
-	shared/pan_minmax_cache.c \
	shared/pan_tiling.c \
-	shared/pan_minmax_cache.h \
-	shared/pan_tiling.h \
-
-util_FILES := \
-	util/lcra.c \
-	util/lcra.h \
-	util/nir_lower_blend.c \
-	util/nir_lower_blend.h \
-	util/nir_mod_helpers.c \
-	util/pan_ir.c \
-	util/pan_ir.h \
-	util/pan_liveness.c \
-	util/pan_lower_framebuffer.c \
-	util/pan_lower_helper_invocation.c \
-	util/pan_lower_sample_position.c \
-	util/pan_lower_writeout.c \
-	util/pan_lower_64bit_intrin.c \
-	util/pan_sysval.c \
+	shared/pan_tiling.h
diff --git a/lib/mesa/src/util/Android.mk b/lib/mesa/src/util/Android.mk
index 829699db6..6d770ca95 100644
--- a/lib/mesa/src/util/Android.mk
+++ b/lib/mesa/src/util/Android.mk
@@ -34,21 +34,12 @@ LOCAL_SRC_FILES := \
	$(MESA_UTIL_FILES) \
	$(XMLCONFIG_FILES)

-LOCAL_MODULE := libmesa_util
-
-LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-
-intermediates := $(call local-generated-sources-dir)
-
LOCAL_C_INCLUDES := \
	external/zlib \
	$(MESA_TOP)/src/mesa \
	$(MESA_TOP)/src/mapi \
	$(MESA_TOP)/src/gallium/include \
-	$(MESA_TOP)/src/gallium/auxiliary \
-	$(MESA_TOP)/src/util/format \
-	$(intermediates)/util/format \
-	$(intermediates)
+	$(MESA_TOP)/src/gallium/auxiliary

# If Android version >=8 MESA should static link libexpat else should dynamic link
ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
@@ -59,41 +50,70 @@ LOCAL_SHARED_LIBRARIES := \
	libexpat
endif

-LOCAL_SHARED_LIBRARIES += liblog libsync libcutils
+LOCAL_MODULE := libmesa_util

# Generated sources

-LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
-
-# Some sources do require "util/format/u_format_pack.h" generated header
-UTIL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(subst format/u_format_pack.h,util/format/u_format_pack.h,$(MESA_UTIL_GENERATED_FILES)))
-LOCAL_GENERATED_SOURCES := $(UTIL_GENERATED_SOURCES)
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES

-driconf_static_gen := $(LOCAL_PATH)/driconf_static.py
-driconf_static_deps := $(LOCAL_PATH)/00-mesa-defaults.conf
+intermediates := $(call local-generated-sources-dir)

-$(intermediates)/driconf_static.h: $(driconf_static_deps)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(driconf_static_gen) $^ $@
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)

-format_srgb_gen := $(LOCAL_PATH)/format_srgb.py
+UTIL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(MESA_UTIL_GENERATED_FILES))
+LOCAL_GENERATED_SOURCES := $(UTIL_GENERATED_SOURCES)

-$(intermediates)/format_srgb.c: $(format_srgb_gen)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(format_srgb_gen) $< > $@
+MESA_DRI_OPTIONS_H := $(intermediates)/xmlpool/options.h
+LOCAL_GENERATED_SOURCES += $(MESA_DRI_OPTIONS_H)

-u_format_gen := $(LOCAL_PATH)/format/u_format_table.py
-u_format_deps := $(LOCAL_PATH)/format/u_format.csv \
-	$(LOCAL_PATH)/format/u_format_pack.py \
-	$(LOCAL_PATH)/format/u_format_parse.py
+#
+# Generate options.h from gettext translations.
+#

-$(intermediates)/util/format/u_format_pack.h: $(u_format_deps)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(u_format_gen) --header $< > $@
+MESA_DRI_OPTIONS_LANGS := de es nl fr sv
+POT := $(intermediates)/xmlpool.pot

-$(intermediates)/format/u_format_table.c: $(u_format_deps)
+$(POT): $(LOCAL_PATH)/xmlpool/t_options.h
	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(u_format_gen) $< > $@
+	xgettext -L C --from-code utf-8 -o $@ $<
+
+$(MESA_DRI_OPTIONS_LANGS:%=$(intermediates)/xmlpool/%.po): $(intermediates)/xmlpool/%.po: $(LOCAL_PATH)/xmlpool/%.po $(POT)
+	lang=$(basename $(notdir $@)); \
+	mkdir -p $(dir $@); \
+	if [ -f $< ]; then \
+		msgmerge -o $@ $^; \
+	else \
+		msginit -i $(POT) \
+			-o $@ \
+			--locale=$$lang \
+			--no-translator; \
+		sed -i -e 's/charset=.*\\n/charset=UTF-8\\n/' $@; \
+	fi
+
+PRIVATE_SCRIPT := $(LOCAL_PATH)/xmlpool/gen_xmlpool.py
+PRIVATE_LOCALEDIR := $(intermediates)/xmlpool
+PRIVATE_TEMPLATE_HEADER := $(LOCAL_PATH)/xmlpool/t_options.h
+PRIVATE_MO_FILES := $(MESA_DRI_OPTIONS_LANGS:%=$(intermediates)/xmlpool/%.gmo)
+
+LOCAL_GENERATED_SOURCES += $(PRIVATE_MO_FILES)
+
+$(LOCAL_GENERATED_SOURCES): PRIVATE_PYTHON := $(MESA_PYTHON2)
+
+$(PRIVATE_MO_FILES): $(intermediates)/xmlpool/%.gmo: $(intermediates)/xmlpool/%.po
+	mkdir -p $(dir $@)
+	msgfmt -o $@ $<
+
+$(UTIL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@
+$(UTIL_GENERATED_SOURCES): $(intermediates)/%.c: $(LOCAL_PATH)/%.py
+	$(transform-generated-source)
+
+$(MESA_DRI_OPTIONS_H): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $< \
+		--template $(PRIVATE_TEMPLATE_HEADER) \
+		--output $@ \
+		--localedir $(PRIVATE_LOCALEDIR) \
+		--languages $(MESA_DRI_OPTIONS_LANGS)
+$(MESA_DRI_OPTIONS_H): $(PRIVATE_SCRIPT) $(PRIVATE_TEMPLATE_HEADER) $(PRIVATE_MO_FILES)
+	$(transform-generated-source)

include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/vulkan/Android.mk b/lib/mesa/src/vulkan/Android.mk
index 295c57d3c..71aa5e5f0 100644
--- a/lib/mesa/src/vulkan/Android.mk
+++ b/lib/mesa/src/vulkan/Android.mk
@@ -37,9 +37,7 @@ intermediates := $(call local-generated-sources-dir)

LOCAL_C_INCLUDES := \
	$(MESA_TOP)/include/vulkan \
-	$(MESA_TOP)/src/vulkan/util \
-	$(MESA_TOP)/src/gallium/include \
-	$(intermediates)/util \
+	$(MESA_TOP)/src/vulkan/util

ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
LOCAL_C_INCLUDES += \
@@ -56,7 +54,7 @@ LOCAL_SRC_FILES := $(VULKAN_UTIL_FILES) $(VULKAN_WSI_FILES)

vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml

-$(intermediates)/util/vk_enum_to_str.c: $(MESA_TOP)/src/vulkan/util/gen_enum_to_str.py \
+$(firstword $(LOCAL_GENERATED_SOURCES)): $(MESA_TOP)/src/vulkan/util/gen_enum_to_str.py \
		$(vulkan_api_xml)
	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
	@mkdir -p $(dir $@)
@@ -64,50 +62,7 @@ $(intermediates)/util/vk_enum_to_str.c: $(MESA_TOP)/src/vulkan/util/gen_enum_to_
		--xml $(vulkan_api_xml) \
		--outdir $(dir $@)

-$(intermediates)/util/vk_enum_to_str.h: $(intermediates)/util/vk_enum_to_str.c
-
-$(intermediates)/util/vk_common_entrypoints.c: $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--proto --weak --prefix vk_common \
-		--out-c $@ --out-h $(dir $@)/vk_common_entrypoints.h
-
-$(intermediates)/util/vk_common_entrypoints.h: $(intermediates)/util/vk_common_entrypoints.c
-
-$(intermediates)/util/vk_dispatch_table.c: $(MESA_TOP)/src/vulkan/util/vk_dispatch_table_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-c $@
-
-$(intermediates)/util/vk_dispatch_table.h: $(MESA_TOP)/src/vulkan/util/vk_dispatch_table_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-h $@
-
-$(intermediates)/util/vk_extensions.c: $(MESA_TOP)/src/vulkan/util/vk_extensions_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-c $@
-
-$(intermediates)/util/vk_extensions.h: $(MESA_TOP)/src/vulkan/util/vk_extensions_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-h $@
+$(lastword $(LOCAL_GENERATED_SOURCES)): $(firstword $(LOCAL_GENERATED_SOURCES))

LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)/util