Diffstat (limited to 'lib/mesa')
62 files changed, 4274 insertions, 3864 deletions
diff --git a/lib/mesa/Android.common.mk b/lib/mesa/Android.common.mk index 7ef6a90a1..6bf64f55c 100644 --- a/lib/mesa/Android.common.mk +++ b/lib/mesa/Android.common.mk @@ -39,7 +39,7 @@ LOCAL_CFLAGS += \ -Wno-initializer-overrides \ -Wno-mismatched-tags \ -DPACKAGE_VERSION=\"$(MESA_VERSION)\" \ - -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/-/issues\" + -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/issues\" # XXX: The following __STDC_*_MACROS defines should not be needed. # It's likely due to a bug elsewhere, but let's temporarily add them @@ -73,7 +73,6 @@ LOCAL_CFLAGS += \ -DHAVE_LINUX_FUTEX_H \ -DHAVE_ENDIAN_H \ -DHAVE_ZLIB \ - -DHAVE_COMPRESSION \ -DMAJOR_IN_SYSMACROS \ -DVK_USE_PLATFORM_ANDROID_KHR \ -fvisibility=hidden \ @@ -104,9 +103,12 @@ ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 26 && echo true),true) LOCAL_CFLAGS += -DHAVE_SYS_SHM_H endif +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_CFLAGS += \ -DUSE_X86_ASM + +endif endif ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_CFLAGS_arm += -DUSE_ARM_ASM diff --git a/lib/mesa/Android.mk b/lib/mesa/Android.mk index 07eba7b83..e86c9bd51 100644 --- a/lib/mesa/Android.mk +++ b/lib/mesa/Android.mk @@ -24,7 +24,7 @@ # BOARD_GPU_DRIVERS should be defined. The valid values are # # classic drivers: i915 i965 -# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima panfrost +# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima # # The main target is libGLES_mesa. For each classic driver enabled, a DRI # module will also be built. DRI modules will be loaded by libGLES_mesa. @@ -43,12 +43,6 @@ MESA_DRI_LDFLAGS := -Wl,--build-id=sha1 MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk MESA_PYTHON2 := python -MESA_PYTHON3 := python3 -ifeq ($(filter 5 6 7 8 9 10, $(MESA_ANDROID_MAJOR_VERSION)),) -MESA_LEX := M4=$(M4) $(LEX) -else -MESA_LEX := $(LEX) -endif # Lists to convert driver names to boolean variables # in form of <driver name>.<boolean make variable> @@ -67,8 +61,7 @@ gallium_drivers := \ virgl.HAVE_GALLIUM_VIRGL \ etnaviv.HAVE_GALLIUM_ETNAVIV \ iris.HAVE_GALLIUM_IRIS \ - lima.HAVE_GALLIUM_LIMA \ - panfrost.HAVE_GALLIUM_PANFROST + lima.HAVE_GALLIUM_LIMA ifeq ($(BOARD_GPU_DRIVERS),all) MESA_BUILD_CLASSIC := $(filter HAVE_%, $(subst ., , $(classic_drivers))) @@ -90,20 +83,33 @@ endif $(foreach d, $(MESA_BUILD_CLASSIC) $(MESA_BUILD_GALLIUM), $(eval $(d) := true)) +# host and target must be the same arch to generate matypes.h +ifeq ($(TARGET_ARCH),$(HOST_ARCH)) +MESA_ENABLE_ASM := true +else +MESA_ENABLE_ASM := false +endif + ifneq ($(filter true, $(HAVE_GALLIUM_RADEONSI)),) MESA_ENABLE_LLVM := true endif define mesa-build-with-llvm - $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7), \ + $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5), \ $(warning Unsupported LLVM version in Android $(MESA_ANDROID_MAJOR_VERSION)),) \ - $(eval LOCAL_CFLAGS += -DLLVM_AVAILABLE -DDRAW_LLVM_AVAILABLE -DLLVM_IS_SHARED=1 -DMESA_LLVM_VERSION_STRING=\"3.9\") \ + $(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_STRING=\"3.7\")) \ + $(if $(filter 7,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_STRING=\"3.8\")) \ + $(if $(filter 8,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ + $(if $(filter 
P,$(MESA_ANDROID_MAJOR_VERSION)), \ + $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ $(eval LOCAL_SHARED_LIBRARIES += libLLVM) endef # add subdirectories SUBDIRS := \ - src/etnaviv \ src/freedreno \ src/gbm \ src/loader \ diff --git a/lib/mesa/REVIEWERS b/lib/mesa/REVIEWERS index ece5394cf..921e0ba38 100644 --- a/lib/mesa/REVIEWERS +++ b/lib/mesa/REVIEWERS @@ -1,11 +1,30 @@ Overview: This file is similar in syntax (or more precisely a subset) of what is - used by the MAINTAINERS file in the linux kernel. + used by the MAINTAINERS file in the linux kernel. Some fields do not + apply, for example, in all cases, send patches to: + + mesa-dev@lists.freedesktop.org + + and in all cases the patchwork instance is: + + https://patchwork.freedesktop.org/project/mesa/ + The purpose is not exactly the same as the MAINTAINERS file in the linux kernel, as there are no official/formal maintainers of different subsystems in mesa, but is meant to give an idea of who to CC for - various patches for review. + various patches for review, and to allow the use of + scripts/get_reviewer.pl as git --cc-cmd. + +Usage: + + When sending patches: + + git send-email --cc-cmd ./scripts/get_reviewer.pl ... + + Or to configure as default: + + git config sendemail.cccmd ./scripts/get_reviewer.pl Descriptions of section entries: @@ -17,6 +36,14 @@ Descriptions of section entries: F: drivers/net/* all files in drivers/net, but not below F: */net/* all files in "any top level directory"/net One pattern per line. Multiple F: lines acceptable. + N: Files and directories with regex patterns. + N: [^a-z]tegra all files whose path contains the word tegra + One pattern per line. Multiple N: lines acceptable. + scripts/get_maintainer.pl has different behavior for files that + match F: pattern and matches of N: patterns. By default, + get_maintainer will not look at git log history when an F: pattern + match occurs. When an N: match occurs, git log history is used + to also notify the people that have git commit signatures.
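The F:/N: distinction described above is easy to misread: F: entries are path patterns, N: entries are regular expressions, and only N: matches pull extra reviewers from git log history. A minimal sketch of the matching semantics in Python, purely illustrative, with hypothetical helper names (the real matcher is scripts/get_reviewer.pl, written in Perl):

import fnmatch
import re

def matches_f(pattern: str, path: str) -> bool:
    # F: a trailing '/' means "everything below that directory".
    # Glob matching is approximated with fnmatch here; the real script
    # keeps '*' from crossing '/' boundaries ("but not below").
    if pattern.endswith('/'):
        return path.startswith(pattern)
    return fnmatch.fnmatch(path, pattern)

def matches_n(pattern: str, path: str) -> bool:
    # N: entries are regexes searched anywhere in the path.
    return re.search(pattern, path) is not None

# '[^a-z]tegra' from the example above: 'tegra' not preceded by a
# lowercase letter, so 'integrator' does not count as a match.
assert matches_f('src/gallium/targets/', 'src/gallium/targets/haiku-softpipe/x.c')
assert matches_n('[^a-z]tegra', 'drivers/gpu/host1x/tegra_dc.c')
assert not matches_n('[^a-z]tegra', 'drivers/integrator.c')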
Maintainers List (try to look for most precise areas first) @@ -53,7 +80,7 @@ HAIKU R: Alexander von Gluck IV <kallisti5@unixzen.com> F: include/HaikuGL/ F: src/egl/drivers/haiku/ -F: src/gallium/frontends/hgl/ +F: src/gallium/state_trackers/hgl/ F: src/gallium/targets/haiku-softpipe/ F: src/gallium/winsys/sw/hgl/ F: src/hgl/ @@ -67,6 +94,11 @@ GALLIUM TARGETS R: Emil Velikov <emil.l.velikov@gmail.com> F: src/gallium/targets/ +SCONS BUILD +F: scons/ +F: */SConscript* +F: */Makefile.sources + ANDROID BUILD R: Emil Velikov <emil.l.velikov@gmail.com> R: Rob Herring <robh@kernel.org> @@ -103,13 +135,3 @@ VULKAN R: Eric Engestrom <eric@engestrom.ch> F: src/vulkan/ F: include/vulkan/ - -VMWARE DRIVER -R: Brian Paul <brianp@vmware.com> -R: Charmaine Lee <charmainel@vmware.com> -F: src/gallium/drivers/svga/ - -VMWARE WINSYS CODE -R: Thomas Hellstrom <thellstrom@vmware.com> -R: Deepak Rawat <drawat@vmware.com> -F: src/gallium/winsys/svga/ diff --git a/lib/mesa/src/amd/Android.addrlib.mk b/lib/mesa/src/amd/Android.addrlib.mk index 4e13ae1fd..eec78fc8b 100644 --- a/lib/mesa/src/amd/Android.addrlib.mk +++ b/lib/mesa/src/amd/Android.addrlib.mk @@ -30,8 +30,6 @@ LOCAL_MODULE := libmesa_amdgpu_addrlib LOCAL_SRC_FILES := $(ADDRLIB_FILES) -LOCAL_CPPFLAGS += -DLITTLEENDIAN_CPU - LOCAL_C_INCLUDES := \ $(MESA_TOP)/src \ $(MESA_TOP)/src/amd/common \ diff --git a/lib/mesa/src/amd/Android.common.mk b/lib/mesa/src/amd/Android.common.mk index 23bf129d1..d5a266215 100644 --- a/lib/mesa/src/amd/Android.common.mk +++ b/lib/mesa/src/amd/Android.common.mk @@ -20,8 +20,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -ifeq ($(MESA_ENABLE_LLVM),true) - # --------------------------------------- # Build libmesa_amd_common # --------------------------------------- @@ -32,8 +30,9 @@ LOCAL_MODULE := libmesa_amd_common LOCAL_SRC_FILES := \ $(AMD_COMMON_FILES) \ - $(AMD_COMMON_LLVM_FILES) \ - $(AMD_DEBUG_FILES) + $(AMD_COMPILER_FILES) \ + $(AMD_DEBUG_FILES) \ + $(AMD_NIR_FILES) LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions @@ -42,23 +41,14 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES intermediates := $(call local-generated-sources-dir) LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, $(AMD_GENERATED_FILES)) -AMD_JSON_FILES := \ - $(LOCAL_PATH)/registers/gfx6.json \ - $(LOCAL_PATH)/registers/gfx7.json \ - $(LOCAL_PATH)/registers/gfx8.json \ - $(LOCAL_PATH)/registers/gfx81.json \ - $(LOCAL_PATH)/registers/gfx9.json \ - $(LOCAL_PATH)/registers/gfx10.json \ - $(LOCAL_PATH)/registers/gfx103.json \ - $(LOCAL_PATH)/registers/pkt3.json \ - $(LOCAL_PATH)/registers/gfx10-rsrc.json \ - $(LOCAL_PATH)/registers/registers-manually-defined.json - SID_TABLES := $(LOCAL_PATH)/common/sid_tables.py SID_TABLES_INPUTS := \ $(LOCAL_PATH)/common/sid.h \ - $(AMD_JSON_FILES) + $(LOCAL_PATH)/registers/amdgfxregs.json \ + $(LOCAL_PATH)/registers/pkt3.json \ + $(LOCAL_PATH)/registers/gfx10.json \ + $(LOCAL_PATH)/registers/gfx10-rsrc.json $(intermediates)/common/sid_tables.h: $(SID_TABLES) $(SID_TABLES_INPUTS) @mkdir -p $(dir $@) @@ -68,34 +58,21 @@ $(intermediates)/common/sid_tables.h: $(SID_TABLES) $(SID_TABLES_INPUTS) AMDGFXREGS := $(LOCAL_PATH)/registers/makeregheader.py AMDGFXREGS_INPUTS := \ - $(AMD_JSON_FILES) + $(LOCAL_PATH)/registers/amdgfxregs.json \ + $(LOCAL_PATH)/registers/pkt3.json \ + $(LOCAL_PATH)/registers/gfx10.json \ + $(LOCAL_PATH)/registers/gfx10-rsrc.json $(intermediates)/common/amdgfxregs.h: $(AMDGFXREGS) 
$(AMDGFXREGS_INPUTS) @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) $(MESA_PYTHON2) $(AMDGFXREGS) $(AMDGFXREGS_INPUTS) --sort address --guard AMDGFXREGS_H > $@ || ($(RM) $@; false) -GEN10_FORMAT_TABLE_INPUTS := \ - $(MESA_TOP)/src/util/format/u_format.csv \ - $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json - -GEN10_FORMAT_TABLE_DEP := \ - $(MESA_TOP)/src/amd/registers/regdb.py - -GEN10_FORMAT_TABLE := $(LOCAL_PATH)/common/gfx10_format_table.py - -$(intermediates)/common/gfx10_format_table.c: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) - LOCAL_C_INCLUDES := \ $(MESA_TOP)/include \ $(MESA_TOP)/src \ $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ $(MESA_TOP)/src/compiler \ - $(MESA_TOP)/src/compiler/nir \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ @@ -104,7 +81,6 @@ LOCAL_C_INCLUDES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(LOCAL_PATH)/common \ - $(LOCAL_PATH)/llvm \ $(intermediates)/common LOCAL_SHARED_LIBRARIES := \ @@ -120,5 +96,3 @@ $(call mesa-build-with-llvm) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) - -endif # MESA_ENABLE_LLVM == true diff --git a/lib/mesa/src/amd/Android.mk b/lib/mesa/src/amd/Android.mk index c9dbeafde..e40e7da01 100644 --- a/lib/mesa/src/amd/Android.mk +++ b/lib/mesa/src/amd/Android.mk @@ -28,6 +28,5 @@ include $(LOCAL_PATH)/Makefile.sources include $(LOCAL_PATH)/Android.addrlib.mk include $(LOCAL_PATH)/Android.common.mk ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),) -include $(LOCAL_PATH)/Android.compiler.mk include $(LOCAL_PATH)/vulkan/Android.mk endif diff --git a/lib/mesa/src/amd/vulkan/Android.mk b/lib/mesa/src/amd/vulkan/Android.mk index f0eb5119a..d0002b8aa 100644 --- a/lib/mesa/src/amd/vulkan/Android.mk +++ b/lib/mesa/src/amd/vulkan/Android.mk @@ -30,7 +30,6 @@ include $(LOCAL_PATH)/Makefile.sources RADV_COMMON_INCLUDES := \ $(MESA_TOP)/include \ $(MESA_TOP)/src/ \ - $(MESA_TOP)/src/amd/vulkan \ $(MESA_TOP)/src/vulkan/wsi \ $(MESA_TOP)/src/vulkan/util \ $(MESA_TOP)/src/amd \ @@ -68,7 +67,6 @@ $(call mesa-build-with-llvm) LOCAL_C_INCLUDES := $(RADV_COMMON_INCLUDES) LOCAL_STATIC_LIBRARIES := \ - libmesa_aco \ libmesa_amd_common \ libmesa_nir \ libmesa_util \ @@ -77,23 +75,58 @@ LOCAL_STATIC_LIBRARIES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.c LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.h +LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.c +LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.h +LOCAL_GENERATED_SOURCES += $(intermediates)/vk_format_table.c +LOCAL_GENERATED_SOURCES += $(intermediates)/gfx10_format_table.h -RADV_ENTRYPOINTS_SCRIPT := $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py +RADV_ENTRYPOINTS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_entrypoints_gen.py +RADV_EXTENSIONS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_extensions.py +VK_FORMAT_TABLE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_table.py +VK_FORMAT_PARSE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_parse.py vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml +vk_format_layout_csv = $(MESA_TOP)/src/amd/vulkan/vk_format_layout.csv $(intermediates)/radv_entrypoints.c: $(RADV_ENTRYPOINTS_SCRIPT) \ + $(RADV_EXTENSIONS_SCRIPT) \ $(vulkan_api_xml) @mkdir 
-p $(dir $@) $(MESA_PYTHON2) $(RADV_ENTRYPOINTS_SCRIPT) \ --xml $(vulkan_api_xml) \ - --proto --weak \ - --out-c $@ \ - --out-h $(addsuffix .h,$(basename $@)) \ - --prefix radv --device-prefix sqtt + --outdir $(dir $@) $(intermediates)/radv_entrypoints.h: $(intermediates)/radv_entrypoints.c +$(intermediates)/radv_extensions.c: $(RADV_EXTENSIONS_SCRIPT) $(vulkan_api_xml) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(RADV_EXTENSIONS_SCRIPT) \ + --xml $(vulkan_api_xml) \ + --out-c $@ \ + --out-h $(addsuffix .h,$(basename $@)) + +$(intermediates)/radv_extensions.h: $(intermediates)/radv_extensions.c + +$(intermediates)/vk_format_table.c: $(VK_FORMAT_TABLE_SCRIPT) \ + $(VK_FORMAT_PARSE_SCRIPT) \ + $(vk_format_layout_csv) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(VK_FORMAT_TABLE_SCRIPT) $(vk_format_layout_csv) > $@ + +RADV_GEN10_FORMAT_TABLE_INPUTS := \ + $(MESA_TOP)/src/amd/vulkan/vk_format_layout.csv \ + $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json + +RADV_GEN10_FORMAT_TABLE_DEP := \ + $(MESA_TOP)/src/amd/registers/regdb.py + +RADV_GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py + +$(intermediates)/gfx10_format_table.h: $(RADV_GEN10_FORMAT_TABLE) $(RADV_GEN10_FORMAT_TABLE_INPUTS) $(RADV_GEN10_FORMAT_TABLE_DEP) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(RADV_GEN10_FORMAT_TABLE) $(RADV_GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) + LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) LOCAL_EXPORT_C_INCLUDE_DIRS := \ @@ -134,10 +167,9 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_amdgpu_addrlib \ libmesa_amd_common \ libmesa_radv_common \ - libmesa_vulkan_util \ - libmesa_aco + libmesa_vulkan_util -LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog libcutils +LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog # If Android version >=8 MESA should static link libexpat else should dynamic link ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) diff --git a/lib/mesa/src/compiler/Android.glsl.gen.mk b/lib/mesa/src/compiler/Android.glsl.gen.mk index f654c869f..1308de2db 100644 --- a/lib/mesa/src/compiler/Android.glsl.gen.mk +++ b/lib/mesa/src/compiler/Android.glsl.gen.mk @@ -53,7 +53,7 @@ MESA_GEN_GLSL_H := $(addprefix $(call local-generated-sources-dir)/, \ define local-l-or-ll-to-c-or-cpp @mkdir -p $(dir $@) @echo "Mesa Lex: $(PRIVATE_MODULE) <= $<" - $(hide) $(MESA_LEX) --nounistd -o$@ $< + $(hide) $(LEX) --nounistd -o$@ $< endef define glsl_local-y-to-c-and-h @@ -102,6 +102,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< strings > $@ -$(intermediates)/glsl/float64_glsl.h: $(MESA_TOP)/src/util/xxd.py +$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@ diff --git a/lib/mesa/src/compiler/Android.nir.gen.mk b/lib/mesa/src/compiler/Android.nir.gen.mk index ad640dd04..e753bb77a 100644 --- a/lib/mesa/src/compiler/Android.nir.gen.mk +++ b/lib/mesa/src/compiler/Android.nir.gen.mk @@ -33,7 +33,6 @@ LOCAL_SRC_FILES := $(LOCAL_SRC_FILES) LOCAL_C_INCLUDES += \ $(intermediates)/nir \ - $(intermediates)/spirv \ $(MESA_TOP)/src/compiler/nir LOCAL_EXPORT_C_INCLUDE_DIRS += \ @@ -100,11 +99,7 @@ $(intermediates)/spirv/spirv_info.c: $(LOCAL_PATH)/spirv/spirv_info_c.py $(LOCAL @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) 
-$(intermediates)/spirv/vtn_gather_types.c: $(LOCAL_PATH)/spirv/vtn_gather_types_c.py $(LOCAL_PATH)/spirv/spirv.core.grammar.json - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) - -$(intermediates)/spirv/vtn_generator_ids.h: $(LOCAL_PATH)/spirv/vtn_generator_ids_h.py $(LOCAL_PATH)/spirv/spir-v.xml +$(intermediates)/spirv/vtn_gather_types.c:: $(LOCAL_PATH)/spirv/vtn_gather_types_c.py $(LOCAL_PATH)/spirv/spirv.core.grammar.json @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false) @@ -117,8 +112,3 @@ nir_intrinsics_c_gen := $(LOCAL_PATH)/nir/nir_intrinsics_c.py $(intermediates)/nir/nir_intrinsics.c: $(LOCAL_PATH)/nir/nir_intrinsics.py $(nir_intrinsics_c_gen) @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $(nir_intrinsics_c_gen) --outdir $(dir $@) || ($(RM) $@; false) - -nir_intrinsics_indices_h_gen := $(LOCAL_PATH)/nir/nir_intrinsics_indices_h.py -$(intermediates)/nir/nir_intrinsics_indices.h: $(LOCAL_PATH)/nir/nir_intrinsics.py $(nir_intrinsics_indices_h_gen) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $(nir_intrinsics_indices_h_gen) --outdir $(dir $@) || ($(RM) $@; false) diff --git a/lib/mesa/src/egl/Android.mk b/lib/mesa/src/egl/Android.mk index 83bd442de..01c33298e 100644 --- a/lib/mesa/src/egl/Android.mk +++ b/lib/mesa/src/egl/Android.mk @@ -37,8 +37,7 @@ LOCAL_SRC_FILES := \ $(LIBEGL_C_FILES) \ $(dri2_backend_core_FILES) \ drivers/dri2/platform_device.c \ - drivers/dri2/platform_android.c \ - drivers/dri2/platform_surfaceless.c \ + drivers/dri2/platform_android.c LOCAL_CFLAGS := \ -D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \ diff --git a/lib/mesa/src/freedreno/Android.drm.mk b/lib/mesa/src/freedreno/Android.drm.mk index 0a79fcf9a..dfa9bed7d 100644 --- a/lib/mesa/src/freedreno/Android.drm.mk +++ b/lib/mesa/src/freedreno/Android.drm.mk @@ -37,7 +37,5 @@ LOCAL_C_INCLUDES := \ LOCAL_MODULE := libfreedreno_drm -LOCAL_STATIC_LIBRARIES := libfreedreno_registers - include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/Android.ir3.mk b/lib/mesa/src/freedreno/Android.ir3.mk index 0fbb8c50c..c6a9d3288 100644 --- a/lib/mesa/src/freedreno/Android.ir3.mk +++ b/lib/mesa/src/freedreno/Android.ir3.mk @@ -31,72 +31,21 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := \ $(ir3_SOURCES) -LOCAL_MODULE := libfreedreno_ir3 - -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -intermediates := $(call local-generated-sources-dir) - LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/compiler/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/prebuilt-intermediates/nir \ - $(MESA_TOP)/src/freedreno/common \ - $(MESA_TOP)/src/freedreno/ir3 \ - $(intermediates)/ir3 - -LOCAL_WHOLE_STATIC_LIBRARIES := \ - libir3decode \ - libir3encode # We need libmesa_nir to get NIR's generated include directories. 
LOCAL_STATIC_LIBRARIES := \ libmesa_nir +LOCAL_MODULE := libfreedreno_ir3 + LOCAL_GENERATED_SOURCES := \ $(MESA_GEN_GLSL_H) \ $(MESA_GEN_NIR_H) -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \ - $(ir3_GENERATED_FILES)) - -ir3_lexer_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_lexer.l - -ir3_nir_imul_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_nir_imul.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -ir3_nir_trig_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_nir_trig.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -ir3_parser_deps := \ - $(MESA_TOP)/src/freedreno/ir3/ir3_parser.y - -$(intermediates)/ir3/ir3_lexer.c: $(ir3_lexer_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_LEX) -o $@ $< - -$(intermediates)/ir3/ir3_nir_imul.c: $(ir3_nir_imul_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/compiler/nir > $@ - -$(intermediates)/ir3/ir3_nir_trig.c: $(ir3_nir_trig_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/compiler/nir > $@ - -$(intermediates)/ir3/ir3_parser.c: $(ir3_parser_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(BISON) $< --name-prefix=ir3_yy --output=$@ - -$(intermediates)/ir3/ir3_parser.h: $(ir3_parser_deps) - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(BISON) $< --name-prefix=ir3_yy --defines=$@ - include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/Android.mk b/lib/mesa/src/freedreno/Android.mk index a488803f9..84d0c82c2 100644 --- a/lib/mesa/src/freedreno/Android.mk +++ b/lib/mesa/src/freedreno/Android.mk @@ -25,10 +25,7 @@ LOCAL_PATH := $(call my-dir) include $(LOCAL_PATH)/Makefile.sources -include $(LOCAL_PATH)/Android.common.mk +include $(MESA_TOP)/src/gallium/drivers/freedreno/Android.gen.mk include $(LOCAL_PATH)/Android.drm.mk -include $(LOCAL_PATH)/Android.ir2.mk include $(LOCAL_PATH)/Android.ir3.mk -include $(LOCAL_PATH)/Android.isa.mk -include $(LOCAL_PATH)/Android.perfcntrs.mk include $(LOCAL_PATH)/Android.registers.mk diff --git a/lib/mesa/src/freedreno/Android.registers.mk b/lib/mesa/src/freedreno/Android.registers.mk index f66e57794..085eb5f07 100644 --- a/lib/mesa/src/freedreno/Android.registers.mk +++ b/lib/mesa/src/freedreno/Android.registers.mk @@ -42,59 +42,48 @@ $(intermediates)/dummy.c: @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) touch $@ -RNN_SRC_PATH := $(MESA_TOP)/src/freedreno/registers/ - # This is the list of auto-generated files headers -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/registers/adreno/, \ - a2xx.xml.h a3xx.xml.h a4xx.xml.h a5xx.xml.h a6xx.xml.h a6xx-pack.xml.h adreno_common.xml.h adreno_pm4.xml.h adreno-pm4-pack.xml.h) - -$(intermediates)/registers/adreno/a2xx.xml.h: $(LOCAL_PATH)/registers/adreno/a2xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/registers/, \ + a2xx.xml.h a3xx.xml.h a4xx.xml.h a5xx.xml.h a6xx.xml.h adreno_common.xml.h adreno_pm4.xml.h) -$(intermediates)/registers/adreno/a3xx.xml.h: $(LOCAL_PATH)/registers/adreno/a3xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a2xx.xml.h: $(LOCAL_PATH)/registers/a2xx.xml 
$(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a4xx.xml.h: $(LOCAL_PATH)/registers/adreno/a4xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a3xx.xml.h: $(LOCAL_PATH)/registers/a3xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a5xx.xml.h: $(LOCAL_PATH)/registers/adreno/a5xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a4xx.xml.h: $(LOCAL_PATH)/registers/a4xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a6xx.xml.h: $(LOCAL_PATH)/registers/adreno/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a5xx.xml.h: $(LOCAL_PATH)/registers/a5xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/a6xx-pack.xml.h: $(LOCAL_PATH)/registers/adreno/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/a6xx.xml.h: $(LOCAL_PATH)/registers/a6xx.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< --pack-structs > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/adreno_common.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_common.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/adreno_common.xml.h: $(LOCAL_PATH)/registers/adreno_common.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ -$(intermediates)/registers/adreno/adreno_pm4.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py +$(intermediates)/registers/adreno_pm4.xml.h: $(LOCAL_PATH)/registers/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< > $@ + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/freedreno/registers/gen_header.py $< > $@ 
-$(intermediates)/registers/adreno/adreno-pm4-pack.xml.h: $(LOCAL_PATH)/registers/adreno/adreno_pm4.xml $(MESA_TOP)/src/freedreno/registers/gen_header.py - @mkdir -p $(dir $@) - @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" - $(hide) $(MESA_PYTHON3) $(MESA_TOP)/src/freedreno/registers/gen_header.py $(RNN_SRC_PATH) $< --pack-structs > $@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ - $(intermediates)/registers/adreno/ + $(intermediates)/registers/ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/freedreno/vulkan/tu_extensions.py b/lib/mesa/src/freedreno/vulkan/tu_extensions.py index 762f5b595..0a45b859e 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_extensions.py +++ b/lib/mesa/src/freedreno/vulkan/tu_extensions.py @@ -25,17 +25,24 @@ COPYRIGHT = """\ """ import argparse -import os.path +import copy import re -import sys +import xml.etree.cElementTree as et -VULKAN_UTIL = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../vulkan/util')) -sys.path.append(VULKAN_UTIL) +from mako.template import Template -from vk_extensions import * -from vk_extensions_gen import * +MAX_API_VERSION = '1.1.82' -MAX_API_VERSION = '1.2.131' +class Extension: + def __init__(self, name, ext_version, enable): + self.name = name + self.ext_version = int(ext_version) + if enable is True: + self.enable = 'true'; + elif enable is False: + self.enable = 'false'; + else: + self.enable = enable; # On Android, we disable all surface and swapchain extensions. Android's Vulkan # loader implements VK_KHR_surface and VK_KHR_swapchain, and applications @@ -53,8 +60,6 @@ EXTENSIONS = [ Extension('VK_KHR_maintenance1', 1, True), Extension('VK_KHR_maintenance2', 1, True), Extension('VK_KHR_maintenance3', 1, True), - Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), - Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), Extension('VK_KHR_surface', 25, 'TU_HAS_SURFACE'), Extension('VK_KHR_swapchain', 68, 'TU_HAS_SURFACE'), Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'), @@ -70,56 +75,180 @@ EXTENSIONS = [ Extension('VK_KHR_external_memory', 1, True), Extension('VK_KHR_external_memory_fd', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), - Extension('VK_EXT_image_drm_format_modifier', 1, True), - Extension('VK_EXT_sample_locations', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_sampler_filter_minmax', 1, True), - Extension('VK_EXT_transform_feedback', 1, True), - Extension('VK_ANDROID_native_buffer', 1, 'ANDROID'), - Extension('VK_KHR_external_fence', 1, True), - Extension('VK_KHR_external_fence_fd', 1, True), - Extension('VK_KHR_external_semaphore', 1, True), - Extension('VK_KHR_external_semaphore_capabilities', 1, True), - Extension('VK_KHR_external_semaphore_fd', 1, True), - Extension('VK_IMG_filter_cubic', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_filter_cubic', 1, 'device->gpu_id == 650'), - Extension('VK_EXT_index_type_uint8', 1, True), - Extension('VK_EXT_vertex_attribute_divisor', 1, True), - Extension('VK_KHR_shader_draw_parameters', 1, True), - Extension('VK_KHR_variable_pointers', 1, True), - Extension('VK_EXT_private_data', 1, True), - Extension('VK_EXT_shader_stencil_export', 1, True), - Extension('VK_EXT_depth_clip_enable', 1, True), - Extension('VK_KHR_draw_indirect_count', 1, True), - Extension('VK_EXT_4444_formats', 1, True), - Extension('VK_EXT_conditional_rendering', 1, True), - Extension('VK_EXT_custom_border_color', 12, True), - Extension('VK_KHR_multiview', 1, True), - Extension('VK_EXT_host_query_reset', 1, 
True), - Extension('VK_EXT_shader_viewport_index_layer', 1, True), - Extension('VK_EXT_extended_dynamic_state', 1, True), - Extension('VK_KHR_push_descriptor', 1, True), - Extension('VK_KHR_incremental_present', 1, 'TU_HAS_SURFACE'), - Extension('VK_KHR_image_format_list', 1, True), - Extension('VK_KHR_depth_stencil_resolve', 1, True), - Extension('VK_KHR_performance_query', 1, 'device->instance->debug_flags & TU_DEBUG_PERFC'), - Extension('VK_EXT_memory_budget', 1, True), - Extension('VK_KHR_device_group', 4, True), - Extension('VK_KHR_device_group_creation', 1, True), - Extension('VK_EXT_descriptor_indexing', 2, True), - Extension('VK_KHR_descriptor_update_template', 1, True), - Extension('VK_KHR_storage_buffer_storage_class', 1, True), - Extension('VK_KHR_external_fence_capabilities', 1, True), - Extension('VK_KHR_pipeline_executable_properties', 1, True), - Extension('VK_KHR_shader_float_controls', 1, True), - Extension('VK_KHR_shader_float16_int8', 1, True), - Extension('VK_KHR_16bit_storage', 1, 'device->gpu_id >= 650'), - Extension('VK_EXT_scalar_block_layout', 1, True), - Extension('VK_KHR_spirv_1_4', 1, True), - Extension('VK_KHR_relaxed_block_layout', 1, True), ] +class VkVersion: + def __init__(self, string): + split = string.split('.') + self.major = int(split[0]) + self.minor = int(split[1]) + if len(split) > 2: + assert len(split) == 3 + self.patch = int(split[2]) + else: + self.patch = None + + # Sanity check. The range bits are required by the definition of the + # VK_MAKE_VERSION macro + assert self.major < 1024 and self.minor < 1024 + assert self.patch is None or self.patch < 4096 + assert(str(self) == string) + + def __str__(self): + ver_list = [str(self.major), str(self.minor)] + if self.patch is not None: + ver_list.append(str(self.patch)) + return '.'.join(ver_list) + + def c_vk_version(self): + patch = self.patch if self.patch is not None else 0 + ver_list = [str(self.major), str(self.minor), str(patch)] + return 'VK_MAKE_VERSION(' + ', '.join(ver_list) + ')' + + def __int_ver(self): + # This is just an expansion of VK_VERSION + patch = self.patch if self.patch is not None else 0 + return (self.major << 22) | (self.minor << 12) | patch + + def __gt__(self, other): + # If only one of them has a patch version, "ignore" it by making + # other's patch version match self. + if (self.patch is None) != (other.patch is None): + other = copy.copy(other) + other.patch = self.patch + + return self.__int_ver() > other.__int_ver() + MAX_API_VERSION = VkVersion(MAX_API_VERSION) -API_VERSIONS = [ ApiVersion(MAX_API_VERSION, True) ] + +def _init_exts_from_xml(xml): + """ Walk the Vulkan XML and fill out extra extension information. 
""" + + xml = et.parse(xml) + + ext_name_map = {} + for ext in EXTENSIONS: + ext_name_map[ext.name] = ext + + for ext_elem in xml.findall('.extensions/extension'): + ext_name = ext_elem.attrib['name'] + if ext_name not in ext_name_map: + continue + + ext = ext_name_map[ext_name] + ext.type = ext_elem.attrib['type'] + +_TEMPLATE_H = Template(COPYRIGHT + """ +#ifndef TU_EXTENSIONS_H +#define TU_EXTENSIONS_H + +enum { + TU_INSTANCE_EXTENSION_COUNT = ${len(instance_extensions)}, + TU_DEVICE_EXTENSION_COUNT = ${len(device_extensions)}, +}; + +struct tu_instance_extension_table { + union { + bool extensions[TU_INSTANCE_EXTENSION_COUNT]; + struct { +%for ext in instance_extensions: + bool ${ext.name[3:]}; +%endfor + }; + }; +}; + +struct tu_device_extension_table { + union { + bool extensions[TU_DEVICE_EXTENSION_COUNT]; + struct { +%for ext in device_extensions: + bool ${ext.name[3:]}; +%endfor + }; + }; +}; + +extern const VkExtensionProperties tu_instance_extensions[TU_INSTANCE_EXTENSION_COUNT]; +extern const VkExtensionProperties tu_device_extensions[TU_DEVICE_EXTENSION_COUNT]; +extern const struct tu_instance_extension_table tu_supported_instance_extensions; + + +struct tu_physical_device; + +void tu_fill_device_extension_table(const struct tu_physical_device *device, + struct tu_device_extension_table* table); +#endif +""") + +_TEMPLATE_C = Template(COPYRIGHT + """ +#include "tu_private.h" + +#include "vk_util.h" + +/* Convert the VK_USE_PLATFORM_* defines to booleans */ +%for platform in ['ANDROID_KHR', 'WAYLAND_KHR', 'XCB_KHR', 'XLIB_KHR', 'DISPLAY_KHR', 'XLIB_XRANDR_EXT']: +#ifdef VK_USE_PLATFORM_${platform} +# undef VK_USE_PLATFORM_${platform} +# define VK_USE_PLATFORM_${platform} true +#else +# define VK_USE_PLATFORM_${platform} false +#endif +%endfor + +/* And ANDROID too */ +#ifdef ANDROID +# undef ANDROID +# define ANDROID true +#else +# define ANDROID false +#endif + +#define TU_HAS_SURFACE (VK_USE_PLATFORM_WAYLAND_KHR || \\ + VK_USE_PLATFORM_XCB_KHR || \\ + VK_USE_PLATFORM_XLIB_KHR || \\ + VK_USE_PLATFORM_DISPLAY_KHR) + + +const VkExtensionProperties tu_instance_extensions[TU_INSTANCE_EXTENSION_COUNT] = { +%for ext in instance_extensions: + {"${ext.name}", ${ext.ext_version}}, +%endfor +}; + +const VkExtensionProperties tu_device_extensions[TU_DEVICE_EXTENSION_COUNT] = { +%for ext in device_extensions: + {"${ext.name}", ${ext.ext_version}}, +%endfor +}; + +const struct tu_instance_extension_table tu_supported_instance_extensions = { +%for ext in instance_extensions: + .${ext.name[3:]} = ${ext.enable}, +%endfor +}; + +void tu_fill_device_extension_table(const struct tu_physical_device *device, + struct tu_device_extension_table* table) +{ +%for ext in device_extensions: + table->${ext.name[3:]} = ${ext.enable}; +%endfor +} + +VkResult tu_EnumerateInstanceVersion( + uint32_t* pApiVersion) +{ + *pApiVersion = ${MAX_API_VERSION.c_vk_version()}; + return VK_SUCCESS; +} + +uint32_t +tu_physical_device_api_version(struct tu_physical_device *dev) +{ + return VK_MAKE_VERSION(1, 1, 82); +} +""") if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -132,5 +261,19 @@ if __name__ == '__main__': dest='xml_files') args = parser.parse_args() - gen_extensions('tu', args.xml_files, API_VERSIONS, MAX_API_VERSION, - EXTENSIONS, args.out_c, args.out_h) + for filename in args.xml_files: + _init_exts_from_xml(filename) + + for ext in EXTENSIONS: + assert ext.type == 'instance' or ext.type == 'device' + + template_env = { + 'MAX_API_VERSION': MAX_API_VERSION, + 'instance_extensions': [e for e 
in EXTENSIONS if e.type == 'instance'], + 'device_extensions': [e for e in EXTENSIONS if e.type == 'device'], + } + + with open(args.out_c, 'w') as f: + f.write(_TEMPLATE_C.render(**template_env)) + with open(args.out_h, 'w') as f: + f.write(_TEMPLATE_H.render(**template_env)) diff --git a/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c b/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c index cfaea0622..b9148a1e2 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c +++ b/lib/mesa/src/freedreno/vulkan/tu_wsi_wayland.c @@ -53,7 +53,7 @@ tu_CreateWaylandSurfaceKHR(VkInstance _instance, if (pAllocator) alloc = pAllocator; else - alloc = &instance->vk.alloc; + alloc = &instance->alloc; return wsi_create_wl_surface(alloc, pCreateInfo, pSurface); } diff --git a/lib/mesa/src/gallium/Android.common.mk b/lib/mesa/src/gallium/Android.common.mk index 3f7779892..0d55f04ac 100644 --- a/lib/mesa/src/gallium/Android.common.mk +++ b/lib/mesa/src/gallium/Android.common.mk @@ -28,7 +28,6 @@ LOCAL_C_INCLUDES += \ $(GALLIUM_TOP)/auxiliary \ $(GALLIUM_TOP)/winsys \ $(GALLIUM_TOP)/drivers \ - $(MESA_TOP)/src/etnaviv \ $(MESA_TOP)/src/freedreno \ $(MESA_TOP)/src/freedreno/ir3 \ $(MESA_TOP)/src/freedreno/registers diff --git a/lib/mesa/src/gallium/Android.mk b/lib/mesa/src/gallium/Android.mk index 78e821581..37e923c22 100644 --- a/lib/mesa/src/gallium/Android.mk +++ b/lib/mesa/src/gallium/Android.mk @@ -46,10 +46,9 @@ SUBDIRS += winsys/vc4/drm drivers/vc4 SUBDIRS += winsys/virgl/common winsys/virgl/drm winsys/virgl/vtest drivers/virgl SUBDIRS += winsys/svga/drm drivers/svga SUBDIRS += winsys/etnaviv/drm drivers/etnaviv drivers/renderonly -SUBDIRS += frontends/dri +SUBDIRS += state_trackers/dri SUBDIRS += winsys/iris/drm drivers/iris SUBDIRS += winsys/lima/drm drivers/lima -SUBDIRS += winsys/panfrost/drm drivers/panfrost # sort to eliminate any duplicates INC_DIRS := $(call all-named-subdir-makefiles,$(sort $(SUBDIRS))) diff --git a/lib/mesa/src/gallium/auxiliary/Android.mk b/lib/mesa/src/gallium/auxiliary/Android.mk index f668e5237..a2d5fa60d 100644 --- a/lib/mesa/src/gallium/auxiliary/Android.mk +++ b/lib/mesa/src/gallium/auxiliary/Android.mk @@ -28,17 +28,14 @@ include $(LOCAL_PATH)/Makefile.sources include $(CLEAR_VARS) -# filter-out tessellator/tessellator.hpp to avoid "Unused source files" error LOCAL_SRC_FILES := \ - $(filter-out tessellator/tessellator.hpp, $(C_SOURCES)) \ + $(C_SOURCES) \ $(NIR_SOURCES) \ $(RENDERONLY_SOURCES) \ $(VL_STUB_SOURCES) ifeq ($(USE_LIBBACKTRACE),true) - LOCAL_CFLAGS += -DHAVE_ANDROID_PLATFORM - LOCAL_SHARED_LIBRARIES += libbacktrace - LOCAL_SRC_FILES += ../../util/u_debug_stack_android.cpp + LOCAL_SRC_FILES += util/u_debug_stack_android.cpp endif LOCAL_C_INCLUDES := \ @@ -55,7 +52,6 @@ LOCAL_CPPFLAGS += -std=c++14 # We need libmesa_nir to get NIR's generated include directories. 
LOCAL_MODULE := libmesa_gallium -LOCAL_SHARED_LIBRARIES += libsync LOCAL_STATIC_LIBRARIES += libmesa_nir LOCAL_WHOLE_STATIC_LIBRARIES += cpufeatures @@ -66,44 +62,18 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES intermediates := $(call local-generated-sources-dir) LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, $(GENERATED_SOURCES)) -u_indices_gen_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/indices/u_indices_gen.py +$(LOCAL_GENERATED_SOURCES): PRIVATE_PYTHON := $(MESA_PYTHON2) +$(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@ -$(intermediates)/indices/u_indices_gen.c: $(u_indices_gen_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< > $@ +$(intermediates)/indices/u_indices_gen.c \ +$(intermediates)/indices/u_unfilled_gen.c \ +$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py + $(transform-generated-source) -u_unfilled_gen_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/indices/u_unfilled_gen.py - -$(intermediates)/indices/u_unfilled_gen.c: $(u_unfilled_gen_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< > $@ - -u_tracepoints_deps := \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_tracepoints.py \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py - -u_tracepoints_c := $(intermediates)/util/u_tracepoints.c -u_tracepoints_h := $(intermediates)/util/u_tracepoints.h - -$(intermediates)/util/u_tracepoints.c \ -$(intermediates)/util/u_tracepoints.h: $(u_tracepoints_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(u_tracepoints_c) -H $(u_tracepoints_h) +$(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv + $(transform-generated-source) LOCAL_GENERATED_SOURCES += $(MESA_GEN_NIR_H) include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) - -# Build libmesa_galliumvl used by radeonsi -include $(CLEAR_VARS) - -LOCAL_SRC_FILES := \ - $(VL_SOURCES) - -LOCAL_MODULE := libmesa_galliumvl - -include $(GALLIUM_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk index de07a03ce..075bf8af4 100644 --- a/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk +++ b/lib/mesa/src/gallium/auxiliary/pipe-loader/Android.mk @@ -31,6 +31,7 @@ include $(CLEAR_VARS) LOCAL_CFLAGS := \ -DHAVE_PIPE_LOADER_DRI \ -DHAVE_PIPE_LOADER_KMS \ + -DDROP_PIPE_LOADER_MISC \ -DGALLIUM_STATIC_TARGETS LOCAL_SRC_FILES := \ diff --git a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk index 3ba6b819f..6976d223c 100644 --- a/lib/mesa/src/gallium/drivers/etnaviv/Android.mk +++ b/lib/mesa/src/gallium/drivers/etnaviv/Android.mk @@ -28,10 +28,7 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := \ $(C_SOURCES) -LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) - -LOCAL_SHARED_LIBRARIES := libdrm -LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm +LOCAL_SHARED_LIBRARIES := libdrm_etnaviv LOCAL_MODULE := libmesa_pipe_etnaviv include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/gallium/drivers/freedreno/Android.mk b/lib/mesa/src/gallium/drivers/freedreno/Android.mk index 86db01a59..f0b29b116 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/Android.mk +++ b/lib/mesa/src/gallium/drivers/freedreno/Android.mk @@ -39,34 +39,15 @@ LOCAL_SRC_FILES := \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/ir3 \ - $(MESA_TOP)/include \ - $(MESA_TOP)/src/freedreno/common \ - $(call 
generated-sources-dir-for,STATIC_LIBRARIES,libmesa_gallium,,)/util + $(MESA_TOP)/include LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) -LOCAL_SHARED_LIBRARIES := libdrm libsync -LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_perfcntrs libfreedreno_registers +LOCAL_SHARED_LIBRARIES := libdrm +LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_registers LOCAL_MODULE := libmesa_pipe_freedreno -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -intermediates := $(call local-generated-sources-dir) - -LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(GENERATED_SOURCES)) - -freedreno_tracepoints_deps := \ - $(MESA_TOP)/src/gallium/drivers/freedreno/freedreno_tracepoints.py \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_trace.py - -freedreno_tracepoints_c := $(intermediates)/freedreno_tracepoints.c -freedreno_tracepoints_h := $(intermediates)/freedreno_tracepoints.h - -$(intermediates)/freedreno_tracepoints.c \ -$(intermediates)/freedreno_tracepoints.h: $(freedreno_tracepoints_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON3) $< -p $(MESA_TOP)/src/gallium/auxiliary/util -C $(freedreno_tracepoints_c) -H $(freedreno_tracepoints_h) - +include $(LOCAL_PATH)/Android.gen.mk include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/drivers/iris/Android.mk b/lib/mesa/src/gallium/drivers/iris/Android.mk index 5d5744025..71ec0cf58 100644 --- a/lib/mesa/src/gallium/drivers/iris/Android.mk +++ b/lib/mesa/src/gallium/drivers/iris/Android.mk @@ -42,15 +42,15 @@ IRIS_COMMON_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary # -# libiris for gfx8 +# libiris for gen8 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx8 +LOCAL_MODULE := libmesa_iris_gen8 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -62,15 +62,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx9 +# libiris for gen9 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx9 +LOCAL_MODULE := libmesa_iris_gen9 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -82,15 +82,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx11 +# libiris for gen10 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx11 +LOCAL_MODULE := libmesa_iris_gen10 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -102,15 +102,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libiris for gfx12 +# libiris for gen11 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx12 +LOCAL_MODULE := libmesa_iris_gen11 LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) @@ -121,30 +121,29 @@ LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) -# -# libiris for gfx125 -# +########################################################### include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_iris_gfx125 + +LOCAL_MODULE := libmesa_pipe_iris 
LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=125 +intermediates := $(call local-generated-sources-dir) -LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) +LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/iris/,$(GENERATED_SOURCES)) -LOCAL_STATIC_LIBRARIES := $(LIBIRIS_STATIC_LIBS) +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/driinfo_iris.h -LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) +$(intermediates)/iris/iris_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) -########################################################### -include $(CLEAR_VARS) - -LOCAL_MODULE := libmesa_pipe_iris +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) LOCAL_SRC_FILES := \ $(IRIS_C_SOURCES) @@ -167,11 +166,10 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_intel_common \ libmesa_intel_compiler \ libmesa_intel_perf \ - libmesa_iris_gfx8 \ - libmesa_iris_gfx9 \ - libmesa_iris_gfx11 \ - libmesa_iris_gfx12 \ - libmesa_iris_gfx125 + libmesa_iris_gen8 \ + libmesa_iris_gen9 \ + libmesa_iris_gen10 \ + libmesa_iris_gen11 include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/drivers/iris/Makefile.sources b/lib/mesa/src/gallium/drivers/iris/Makefile.sources index c727bce86..bc8f592d3 100644 --- a/lib/mesa/src/gallium/drivers/iris/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/iris/Makefile.sources @@ -20,7 +20,11 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
+GENERATED_SOURCES := \ + iris_driinfo.h + IRIS_C_SOURCES = \ + $(GENERATED_SOURCES) \ driinfo_iris.h \ iris_batch.c \ iris_batch.h \ @@ -37,16 +41,10 @@ IRIS_C_SOURCES = \ iris_draw.c \ iris_fence.c \ iris_fence.h \ - iris_fine_fence.c \ - iris_fine_fence.h \ iris_formats.c \ iris_genx_macros.h \ iris_genx_protos.h \ - iris_measure.c \ - iris_measure.h \ iris_monitor.c \ - iris_performance_query.c \ - iris_perf.c \ iris_pipe.h \ iris_pipe_control.c \ iris_program.c \ diff --git a/lib/mesa/src/gallium/drivers/kmsro/Android.mk b/lib/mesa/src/gallium/drivers/kmsro/Android.mk index e0e26482b..2f637b8bf 100644 --- a/lib/mesa/src/gallium/drivers/kmsro/Android.mk +++ b/lib/mesa/src/gallium/drivers/kmsro/Android.mk @@ -39,20 +39,14 @@ GALLIUM_TARGET_DRIVERS += exynos GALLIUM_TARGET_DRIVERS += hx8357d GALLIUM_TARGET_DRIVERS += ili9225 GALLIUM_TARGET_DRIVERS += ili9341 -GALLIUM_TARGET_DRIVERS += imx-drm -GALLIUM_TARGET_DRIVERS += imx-dcss -GALLIUM_TARGET_DRIVERS += ingenic-drm -GALLIUM_TARGET_DRIVERS += mcde -GALLIUM_TARGET_DRIVERS += mediatek -GALLIUM_TARGET_DRIVERS += meson +GALLIUM_TARGET_DRIVERS += imx +GALLIUM_TARGET_DRIVERS += stm GALLIUM_TARGET_DRIVERS += mi0283qt GALLIUM_TARGET_DRIVERS += mxsfb-drm GALLIUM_TARGET_DRIVERS += pl111 GALLIUM_TARGET_DRIVERS += repaper -GALLIUM_TARGET_DRIVERS += rockchip GALLIUM_TARGET_DRIVERS += st7586 GALLIUM_TARGET_DRIVERS += st7735r -GALLIUM_TARGET_DRIVERS += stm GALLIUM_TARGET_DRIVERS += sun4i-drm $(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_kmsro) endif diff --git a/lib/mesa/src/gallium/drivers/lima/Android.mk b/lib/mesa/src/gallium/drivers/lima/Android.mk index 09487d9dc..069ecc4b2 100644 --- a/lib/mesa/src/gallium/drivers/lima/Android.mk +++ b/lib/mesa/src/gallium/drivers/lima/Android.mk @@ -31,15 +31,11 @@ LOCAL_SRC_FILES := \ ir/gp/lower.c \ ir/gp/nir.c \ ir/gp/node.c \ - ir/gp/optimize.c \ ir/gp/regalloc.c \ ir/gp/reduce_scheduler.c \ ir/gp/scheduler.c \ ir/lima_ir.h \ - ir/lima_nir_duplicate_consts.c \ - ir/lima_nir_duplicate_intrinsic.c \ ir/lima_nir_lower_uniform_to_scalar.c \ - ir/lima_nir_split_load_input.c \ ir/pp/codegen.c \ ir/pp/codegen.h \ ir/pp/disasm.c \ @@ -50,19 +46,14 @@ LOCAL_SRC_FILES := \ ir/pp/node_to_instr.c \ ir/pp/ppir.h \ ir/pp/regalloc.c \ - ir/pp/liveness.c \ ir/pp/scheduler.c \ lima_bo.c \ lima_bo.h \ lima_context.c \ lima_context.h \ - lima_disk_cache.c \ - lima_disk_cache.h \ lima_draw.c \ lima_fence.c \ lima_fence.h \ - lima_parser.c \ - lima_parser.h \ lima_program.c \ lima_program.h \ lima_query.c \ @@ -71,15 +62,12 @@ LOCAL_SRC_FILES := \ lima_screen.c \ lima_screen.h \ lima_state.c \ - lima_job.c \ - lima_job.h \ + lima_submit.c \ + lima_submit.h \ lima_texture.c \ lima_texture.h \ lima_util.c \ - lima_util.h \ - lima_format.c \ - lima_format.h \ - lima_gpu.h + lima_util.h LOCAL_MODULE := libmesa_pipe_lima diff --git a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c index 157c23491..43121335f 100644 --- a/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c +++ b/lib/mesa/src/gallium/drivers/panfrost/pan_blend_cso.c @@ -27,11 +27,8 @@ #include <stdio.h> #include "util/u_memory.h" -#include "gallium/auxiliary/util/u_blend.h" -#include "pan_context.h" -#include "pan_blend_cso.h" -#include "pan_bo.h" -#include "panfrost-quirks.h" +#include "pan_blend_shaders.h" +#include "pan_blending.h" /* A given Gallium blend state can be encoded to the hardware in numerous, * dramatically divergent ways due to the interactions of blending with @@ -60,6 +57,41 @@ * 
(our subclass of pipe_blend_state). */ +/* Given an initialized CSO and a particular framebuffer format, grab a + * blend shader, generating and compiling it if it doesn't exist + * (lazy-loading in a way). This routine, when the cache hits, should + * be fast, suitable for calling every draw to avoid wacky dirty + * tracking paths. If the cache hits, boom, done. */ + +static struct panfrost_blend_shader * +panfrost_get_blend_shader( + struct panfrost_context *ctx, + struct panfrost_blend_state *blend, + enum pipe_format fmt, + unsigned rt) +{ + /* Prevent NULL collision issues.. */ + assert(fmt != 0); + + /* Check the cache */ + struct hash_table_u64 *shaders = blend->rt[rt].shaders; + + struct panfrost_blend_shader *shader = + _mesa_hash_table_u64_search(shaders, fmt); + + if (shader) + return shader; + + /* Cache miss. Build one instead, cache it, and go */ + + struct panfrost_blend_shader generated = + panfrost_compile_blend_shader(ctx, &blend->base, fmt); + + shader = mem_dup(&generated, sizeof(generated)); + _mesa_hash_table_u64_insert(shaders, fmt, shader); + return shader; +} + /* Create a blend CSO. Essentially, try to compile a fixed-function * expression and initialize blend shaders */ @@ -71,34 +103,33 @@ panfrost_create_blend_state(struct pipe_context *pipe, struct panfrost_blend_state *so = rzalloc(ctx, struct panfrost_blend_state); so->base = *blend; - so->pan.dither = blend->dither; - so->pan.logicop_enable = blend->logicop_enable; - so->pan.logicop_func = blend->logicop_func; - so->pan.rt_count = blend->max_rt + 1; - /* TODO: The following features are not yet implemented */ + assert(!blend->logicop_enable); + assert(!blend->alpha_to_coverage); assert(!blend->alpha_to_one); - for (unsigned c = 0; c < so->pan.rt_count; ++c) { - unsigned g = blend->independent_blend_enable ? c : 0; - const struct pipe_rt_blend_state *pipe = &blend->rt[g]; - struct pan_blend_equation *equation = &so->pan.rts[c].equation; - - equation->color_mask = pipe->colormask; - equation->blend_enable = pipe->blend_enable; - if (!equation->blend_enable) - continue; - - equation->rgb_func = util_blend_func_to_shader(pipe->rgb_func); - equation->rgb_src_factor = util_blend_factor_to_shader(pipe->rgb_src_factor); - equation->rgb_invert_src_factor = util_blend_factor_is_inverted(pipe->rgb_src_factor); - equation->rgb_dst_factor = util_blend_factor_to_shader(pipe->rgb_dst_factor); - equation->rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe->rgb_dst_factor); - equation->alpha_func = util_blend_func_to_shader(pipe->alpha_func); - equation->alpha_src_factor = util_blend_factor_to_shader(pipe->alpha_src_factor); - equation->alpha_invert_src_factor = util_blend_factor_is_inverted(pipe->alpha_src_factor); - equation->alpha_dst_factor = util_blend_factor_to_shader(pipe->alpha_dst_factor); - equation->alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe->alpha_dst_factor); + for (unsigned c = 0; c < PIPE_MAX_COLOR_BUFS; ++c) { + struct panfrost_blend_rt *rt = &so->rt[c]; + + /* There are two paths. First, we would like to try a + * fixed-function if we can */ + + /* Without indep blending, the first RT settings replicate */ + + unsigned g = + blend->independent_blend_enable ?
c : 0; + + rt->has_fixed_function = + panfrost_make_fixed_blend_mode( + &blend->rt[g], + &rt->equation, + &rt->constant_mask, + blend->rt[g].colormask); + + /* Regardless if that works, we also need to initialize + * the blend shaders */ + + rt->shaders = _mesa_hash_table_u64_create(so); } return so; @@ -109,7 +140,28 @@ panfrost_bind_blend_state(struct pipe_context *pipe, void *cso) { struct panfrost_context *ctx = pan_context(pipe); - ctx->blend = (struct panfrost_blend_state *) cso; + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct pipe_blend_state *blend = (struct pipe_blend_state *) cso; + struct panfrost_blend_state *pblend = (struct panfrost_blend_state *) cso; + ctx->blend = pblend; + + if (!blend) + return; + + if (screen->require_sfbd) { + SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_DITHER, !blend->dither); + } + + /* Shader itself is not dirty, but the shader core is */ + ctx->dirty |= PAN_DIRTY_FS; +} + +static void +panfrost_delete_blend_shader(struct hash_entry *entry) +{ + struct panfrost_blend_shader *shader = (struct panfrost_blend_shader *)entry->data; + free(shader->buffer); + free(shader); } static void @@ -117,6 +169,11 @@ panfrost_delete_blend_state(struct pipe_context *pipe, void *cso) { struct panfrost_blend_state *blend = (struct panfrost_blend_state *) cso; + + for (unsigned c = 0; c < 4; ++c) { + struct panfrost_blend_rt *rt = &blend->rt[c]; + _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader); + } ralloc_free(blend); } @@ -130,73 +187,105 @@ panfrost_set_blend_color(struct pipe_context *pipe, ctx->blend_color = *blend_color; } +/* Given a vec4 of constants, reduce it to just a single constant according to + * the mask (if we can) */ + +static bool +panfrost_blend_constant(float *out, float *in, unsigned mask) +{ + /* If there is no components used, it automatically works. Do set a + * dummy constant just to avoid reading uninitialized memory. */ + + if (!mask) { + *out = 0.0; + return true; + } + + /* Find some starter mask */ + unsigned first = ffs(mask) - 1; + float cons = in[first]; + mask ^= (1 << first); + + /* Ensure the rest are equal */ + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (in[i] != cons) { + *out = 0.0; + return false; + } + } + + /* Otherwise, we're good to go */ + *out = cons; + return true; +} + /* Create a final blend given the context */ struct panfrost_blend_final -panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset) +panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); + + /* Grab the format, falling back gracefully if called invalidly (which + * has to happen for no-color-attachment FBOs, for instance) */ struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; - enum pipe_format fmt = fb->cbufs[rti]->format; - unsigned nr_samples = fb->cbufs[rti]->nr_samples ? 
: - fb->cbufs[rti]->texture->nr_samples; + enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM; + + if ((fb->nr_cbufs > rti) && fb->cbufs[rti]) + fmt = fb->cbufs[rti]->format; /* Grab the blend state */ struct panfrost_blend_state *blend = ctx->blend; - struct pan_blend_state pan_blend = blend->pan; - - pan_blend.rts[rti].format = fmt; - pan_blend.rts[rti].nr_samples = nr_samples; - memcpy(pan_blend.constants, ctx->blend_color.color, - sizeof(pan_blend.constants)); - - /* First, we'll try fixed function, matching equation and constant */ - if (pan_blend_can_fixed_function(dev, &pan_blend, rti)) { - struct panfrost_blend_final final = { - .load_dest = pan_blend_reads_dest(pan_blend.rts[rti].equation), - .equation.constant = pan_blend_get_constant(dev, &pan_blend, rti), - .opaque = pan_blend_is_opaque(pan_blend.rts[rti].equation), - .no_colour = pan_blend.rts[rti].equation.color_mask == 0, - }; - - pan_blend_to_fixed_function_equation(dev, &pan_blend, rti, - &final.equation.equation); - return final; - } + assert(blend); + struct panfrost_blend_rt *rt = &blend->rt[rti]; - /* Otherwise, we need to grab a shader */ - /* Upload the shader, sharing a BO */ - if (!(*bo)) { - *bo = panfrost_batch_create_bo(batch, 4096, - PAN_BO_EXECUTE, - PAN_BO_ACCESS_PRIVATE | - PAN_BO_ACCESS_READ | - PAN_BO_ACCESS_FRAGMENT); + struct panfrost_blend_final final; + + /* First, we'll try a fixed function path */ + if (rt->has_fixed_function && panfrost_can_fixed_blend(fmt)) { + if (panfrost_blend_constant( + &final.equation.constant, + ctx->blend_color.color, + rt->constant_mask)) { + /* There's an equation and suitable constant, so we're good to go */ + final.is_shader = false; + final.equation.equation = &rt->equation; + + final.no_blending = + (rt->equation.rgb_mode == 0x122) && + (rt->equation.alpha_mode == 0x122) && + (rt->equation.color_mask == 0xf); + + return final; + } } - pthread_mutex_lock(&dev->blend_shaders.lock); - struct pan_blend_shader_variant *shader = - pan_blend_get_shader_locked(dev, &pan_blend, rti); + /* Otherwise, we need to grab a shader */ + struct panfrost_blend_shader *shader = panfrost_get_blend_shader(ctx, blend, fmt, rti); + final.is_shader = true; + final.no_blending = false; + final.shader.work_count = shader->work_count; + final.shader.first_tag = shader->first_tag; - /* Size check */ - assert((*shader_offset + shader->binary.size) < 4096); + /* Upload the shader */ + final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE); + memcpy(final.shader.bo->cpu, shader->buffer, shader->size); - memcpy((*bo)->ptr.cpu + *shader_offset, shader->binary.data, shader->binary.size); + /* Pass BO ownership to job */ + panfrost_job_add_bo(job, final.shader.bo); + panfrost_bo_unreference(ctx->base.screen, final.shader.bo); - struct panfrost_blend_final final = { - .is_shader = true, - .shader = { - .first_tag = shader->first_tag, - .gpu = (*bo)->ptr.gpu + *shader_offset, - }, - .load_dest = pan_blend.logicop_enable || - pan_blend_reads_dest(pan_blend.rts[rti].equation), - }; + if (shader->patch_index) { + /* We have to specialize the blend shader to use constants, so + * patch in the current constants */ - *shader_offset += shader->binary.size; - pthread_mutex_unlock(&dev->blend_shaders.lock); + float *patch = (float *) (final.shader.bo->cpu + shader->patch_index); + memcpy(patch, ctx->blend_color.color, sizeof(float) * 4); + } return final; } diff --git a/lib/mesa/src/gallium/drivers/r600/Android.mk b/lib/mesa/src/gallium/drivers/r600/Android.mk index 
b87fc91e6..9f684cf24 100644 --- a/lib/mesa/src/gallium/drivers/r600/Android.mk +++ b/lib/mesa/src/gallium/drivers/r600/Android.mk @@ -30,12 +30,8 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES) -LOCAL_C_INCLUDES += \ - $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ - $(MESA_TOP)/src/mesa +LOCAL_C_INCLUDES += $(MESA_TOP)/src/amd/common -LOCAL_STATIC_LIBRARIES := libmesa_nir LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_r600 @@ -49,15 +45,6 @@ $(intermediates)/egd_tables.h: $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.p @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h > $@ -sfn_nir_algebraic_gen := $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py -sfn_nir_algebraic_deps := \ - $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py \ - $(MESA_TOP)/src/compiler/nir/nir_algebraic.py - -$(intermediates)/sfn_nir_algebraic.c: $(sfn_nir_algebraic_deps) - @mkdir -p $(dir $@) - $(hide) $(MESA_PYTHON2) $(sfn_nir_algebraic_gen) -p $(MESA_TOP)/src/compiler/nir/ > $@ - ifeq ($(MESA_ENABLE_LLVM),true) $(call mesa-build-with-llvm) endif diff --git a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk index 75f30f621..e402da639 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/Android.mk +++ b/lib/mesa/src/gallium/drivers/radeonsi/Android.mk @@ -21,8 +21,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -ifeq ($(MESA_ENABLE_LLVM),true) - LOCAL_PATH := $(call my-dir) # get C_SOURCES and GENERATED_SOURCES @@ -38,20 +36,48 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/amd/common \ - $(MESA_TOP)/src/amd/llvm \ - $(MESA_TOP)/src/compiler/nir \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir -LOCAL_STATIC_LIBRARIES := \ - libmesa_amd_common \ - libmesa_galliumvl +LOCAL_STATIC_LIBRARIES := libmesa_amd_common LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_radeonsi +intermediates := $(call local-generated-sources-dir) + # We need to get NIR's generated headers. 
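 # (As far as I can tell, MESA_GEN_NIR_H expands to the headers produced by
 # the NIR code generators, so listing it here makes them build before
 # anything in this module compiles.)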
LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/radeonsi/,$(GENERATED_SOURCES)) + +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/driinfo_radeonsi.h + +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py + +$(intermediates)/radeonsi/si_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) + +GEN10_FORMAT_TABLE_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \ + $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json + +GEN10_FORMAT_TABLE_DEP := \ + $(MESA_TOP)/src/amd/registers/regdb.py + +GEN10_FORMAT_TABLE := $(LOCAL_PATH)/gfx10_format_table.py + +$(intermediates)/radeonsi/gfx10_format_table.h: $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) $(GEN10_FORMAT_TABLE_DEP) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(GEN10_FORMAT_TABLE) $(GEN10_FORMAT_TABLE_INPUTS) > $@ || ($(RM) $@; false) + +LOCAL_C_INCLUDES += $(intermediates)/radeonsi + +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) $(call mesa-build-with-llvm) @@ -67,5 +93,3 @@ $(eval GALLIUM_LIBS += \ libmesa_winsys_amdgpu) $(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) endif - -endif # MESA_ENABLE_LLVM==true diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 3d17f08ca..373fd4ffa 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -23,15 +23,16 @@ * */ -#include "ac_llvm_cull.h" -#include "si_build_pm4.h" #include "si_pipe.h" #include "si_shader_internal.h" #include "sid.h" -#include "util/fast_idiv_by_const.h" +#include "si_build_pm4.h" +#include "ac_llvm_cull.h" + #include "util/u_prim.h" #include "util/u_suballoc.h" #include "util/u_upload_mgr.h" +#include "util/fast_idiv_by_const.h" /* Based on: * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf @@ -107,6 +108,7 @@ * (patch elimination where tess factors are 0 would be possible to implement) * - The vertex shader must not contain memory stores. * - All VS resources must not have a write usage in the command buffer. + * (TODO: all shader buffers currently set the write usage) * - Bindless textures and images must not occur in the vertex shader. * * User data SGPR layout: @@ -153,1400 +155,1426 @@ /* At least 256 is needed for the fastest wave launch rate from compute queues * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. 
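  * A vertex with w < 0 lies behind the eye, so a triangle whose three
  * vertices all have w < 0 can never be visible; rejecting on W alone
  * already covers that case without any Z bounds test.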
*/ -#define CULL_Z 0 +#define CULL_Z 0 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ /* Grouping compute dispatches for small draw calls: How many primitives from multiple * draw calls to process by compute before signaling the gfx IB. This reduces the number * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) +#define PRIMS_PER_BATCH (512 * 1024) /* Draw call splitting at the packet level. This allows signaling the gfx IB * for big draw calls sooner, but doesn't allow context flushes between packets. * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH /* If there is not enough ring buffer space for the current IB, split draw calls into * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH /* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL \ - (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \ - : UINT_MAX & ~(THREADGROUP_SIZE - 1)) +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ + SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ + UINT_MAX & ~(THREADGROUP_SIZE - 1)) -#define REWIND_SIGNAL_BIT 0x80000000 +#define REWIND_SIGNAL_BIT 0x80000000 /* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 +#define FORCE_REWIND_EMULATION 0 -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) +void si_initialize_prim_discard_tunables(struct si_context *sctx) { - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ - !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics && - (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII || - sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. 
- */ - if (sscreen->info.vram_size <= 2 * GB) - *index_ring_size_per_ib = 64 * MB; - else if (sscreen->info.vram_size <= 4 * GB) - *index_ring_size_per_ib = 128 * MB; - else - *index_ring_size_per_ib = 256 * MB; - } + sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sctx->chip_class == GFX6 || /* SI support is not implemented */ + !sctx->screen->info.has_gds_ordered_append || + sctx->screen->debug_flags & DBG(NO_PD) || + /* If aux_context == NULL, we are initializing aux_context right now. */ + !sctx->screen->aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || + sctx->screen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && + sctx->screen->info.is_pro_graphics && + (sctx->family == CHIP_BONAIRE || + sctx->family == CHIP_HAWAII || + sctx->family == CHIP_TONGA || + sctx->family == CHIP_FIJI || + sctx->family == CHIP_POLARIS10 || + sctx->family == CHIP_POLARIS11 || + sctx->family == CHIP_VEGA10 || + sctx->family == CHIP_VEGA20))) { + sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) + sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. + */ + if (sctx->screen->info.vram_size <= 2 * GB) + sctx->index_ring_size_per_ib = 64 * MB; + else if (sctx->screen->info.vram_size <= 4 * GB) + sctx->index_ring_size_per_ib = 128 * MB; + else + sctx->index_ring_size_per_ib = 256 * MB; + } } /* Opcode can be "add" or "swap". 
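  * Both lower to the llvm.amdgcn.ds.ordered.<op> intrinsics: the GDS slot
  * is accessed in ordered-append (wave launch) order, "add" returns the
  * running total of earlier waves before adding our value, and "swap"
  * returns the earlier value while storing ours.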
*/ -static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, - unsigned ordered_count_index, bool release, bool done) +static LLVMValueRef +si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, + bool release, bool done) { - if (ctx->screen->info.chip_class >= GFX10) - ordered_count_index |= 1 << 24; /* number of dwords == 1 */ - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->ac.i32_0, /* scope */ - ctx->ac.i1false, /* volatile */ - LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), - LLVMConstInt(ctx->ac.i1, release, 0), - LLVMConstInt(ctx->ac.i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->i32_0, /* scope */ + ctx->i1false, /* volatile */ + LLVMConstInt(ctx->i32, ordered_count_index, 0), + LLVMConstInt(ctx->i1, release, 0), + LLVMConstInt(ctx->i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); } static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) { - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); } struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; + struct si_shader_context *ctx; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; }; /* Enter a section that only executes on thread 0. */ static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, LLVMValueRef thread_id) + struct si_thread0_section *section, + LLVMValueRef thread_id) { - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. 
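- * The matching si_exit_thread0_section() stores the lane-0 result to that
- * VGPR, closes the branch, and broadcasts it with readfirstlane, so the
- * pair behaves roughly like:
- *
- *    if (thread_id == 0)
- *       vgpr_result = value;
- *    value = readfirstlane(vgpr_result);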
- */ - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""), - 12601); + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, + ctx->i32_0, ""), 12601); } /* Exit a section that only executes on thread 0 and broadcast the result * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) +static void si_exit_thread0_section(struct si_thread0_section *section, + LLVMValueRef *result) { - struct si_shader_context *ctx = section->ctx; + struct si_shader_context *ctx = section->ctx; - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - ac_build_endif(&ctx->ac, 12601); + ac_build_endif(&ctx->ac, 12601); - /* Broadcast the result from thread 0 to all threads. */ - *result = - ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); + /* Broadcast the result from thread 0 to all threads. */ + *result = ac_build_readlane(&ctx->ac, + LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); } void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) { - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. */ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.base.num_ubos == 1 && - ctx->shader->selector->info.base.num_ssbos == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc; - struct ac_arg param_base_vertex, param_start_instance; - struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; - struct ac_arg param_restart_index, param_smallprim_precision; - struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; - struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); - 
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - gl_shader_stage old_stage = ctx->stage; - ctx->stage = MESA_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->stage = old_stage; - - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. */ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = - LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = - ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->ac.i32_0; - - if (key->opt.cs_instancing) { - LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); - LLVMValueRef num_prims_udiv_multiplier = - ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); - /* Unpack num_prims_udiv_terms. 
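- * One SGPR carries both division terms, bits [4:0] the post-shift and
- * bits [31:5] prims_per_instance, so the instancing path below is roughly:
- *
- *    instance_id = umul_hi(prim_id, multiplier) >> post_shift;
- *    prim_id     = prim_id - instance_id * prims_per_instance;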
*/ - LLVMValueRef post_shift = - LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = - LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = - ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. - */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - index[2] = ctx->ac.i32_0; - } else { - index[0] = ctx->ac.i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, - 1, 0, true, false, false); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef ordered_wave_id = NULL; - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); - ordered_wave_id = - LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), ""); - ordered_wave_id = - LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->ac.i1false; - - /* Handle primitive restart. 
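- * A primitive is dropped when any of its three indices equals the
- * restart index: prim_restart_accepted starts out true and is ANDed
- * with the per-index comparisons below.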
*/ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. - */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); - vertex_counter = - LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - ac_get_arg(&ctx->ac, param_restart_index), ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. - */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); - - /* This flips the orientation based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, gds_prim_restart_continue, ""), ""); - current_wave_resets_index = - LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. 
- * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. - * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc( - &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true, - false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true, - false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = - LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. - * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = - LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, ""); - first_is_odd = - LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = LLVMBuildXor( - builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle( - &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. 
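- * This replays the rasterizer's trivial-reject tests in software:
- * face culling, view-volume XY, zero-area and W < 0 rejection, plus
- * small primitives whose screen-space bounding box cannot cover a
- * sample (smallprim_precision supplies the rounding for that test).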
*/ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), &options); - - ac_build_optimization_barrier(&ctx->ac, &accepted); - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = ac_build_intrinsic( - &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - { - if (VERTEX_COUNTER_GDS_MODE == 0) { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 2) { - LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* If the draw call was split into multiple subdraws, each using - * a separate draw packet, we need to start counting from 0 for - * the first compute wave of the subdraw. - * - * vertex_counter contains the primitive ID of the first thread - * in the first wave. - * - * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: - */ - LLVMValueRef is_first_wave = - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, ""); - - /* Store the primitive count for ordered append, not vertex count. - * The idea is to avoid GDS initialization via CP DMA. The shader - * effectively stores the first count using "swap". - * - * if (first_wave) { - * ds.ordered.swap(num_prims_accepted); // store the first primitive count - * previous = 0; - * } else { - * previous = ds.ordered.add(num_prims_accepted) // add the primitive count - * } - */ - ac_build_ifcc(&ctx->ac, is_first_wave, 12604); - { - /* The GDS address is always 0 with ordered append. 
*/ - si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true); - LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); - } - ac_build_else(&ctx->ac, 12605); - { - LLVMBuildStore(builder, - si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted, - 0, true, true), - tmp_store); - } - ac_build_endif(&ctx->ac, 12604); - - start = LLVMBuildLoad(builder, tmp_store, ""); - } - } - si_exit_thread0_section(§ion, &start); - - /* Write the final vertex count to memory. An EOS/EOP event could do this, - * but those events are super slow and should be avoided if performance - * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE - * event like this. - */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), - 12606); - LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); - count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* GFX8 needs to disable caching, so that the CP can see the stored value. - * MTYPE=3 bypasses TC L2. - */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - ac_get_arg(&ctx->ac, param_vertex_count_addr), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt( - ctx->ac.i32, - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */), - 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0, - ac_glc | ac_slc); - } else { - LLVMBuildStore( - builder, count, - si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr))); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. 
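- * Together with the mbcnt prefix count this is a stream compaction:
- * each surviving lane writes at
- *
- *    vindex = start + popcount(accepted_mask & lanes_below_me)
- *
- * so accepted triangles land densely packed in the output buffer.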
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. */ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + LLVMTypeRef const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = ctx->f32; + else + const_desc_type = ctx->v4i32; + + struct si_function_info fninfo; + si_init_function_info(&fninfo); + + LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; + LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; + LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; + LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; + LLVMValueRef last_wave_prim_id, vertex_count_addr; + + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &index_buffers_and_constants); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &vb_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), + &const_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), + &sampler_desc); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); + add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); + + /* Block ID and thread ID inputs. */ + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", + GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
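+ * The inlined VS is called as an ordinary LLVM function, so its whole
+ * argument list is rebuilt by hand here; VertexID and InstanceID stay
+ * NULL and are filled in per vertex by the fetch loop further down.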
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = const_desc; + vs_params[num_vs_params++] = sampler_desc; + vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, + S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = base_vertex; + vs_params[num_vs_params++] = start_instance; + vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ + vs_params[num_vs_params++] = vb_desc; + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = + ac_build_imad(&ctx->ac, block_id, + LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->i32_0; + + if (key->opt.cs_instancing) { + /* Unpack num_prims_udiv_terms. */ + LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, + num_prims_udiv_multiplier, + post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, + prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). */ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, + LLVMConstInt(ctx->i32, 3, 0), + LLVMConstInt(ctx->i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, + LLVMConstInt(ctx->i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. 
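+ * For a fan, triangle i uses vertices {0, i + 1, i + 2}; the rotation
+ * below only keeps vertex 0 in the slot that the provoking-vertex
+ * convention expects.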
+ */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + index[2] = ctx->i32_0; + } else { + index[0] = ctx->i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. */ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, + index[i], ctx->i32_0, 1, + 0, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 6, 0), ""); + ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = + LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->i1true; + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. + */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, + LLVMConstInt(ctx->i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); + vertex_counter = LLVMBuildAnd(builder, vertex_counter, + LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + restart_index, ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, + not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. 
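+ * With lanes_below = (1 << tid) - 1 this boils down to:
+ *
+ *    strip_start = umsb(reset_mask & lanes_below) + 1;
+ *
+ * which yields 0 when no reset index precedes this thread (umsb
+ * returns -1 for an empty mask).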
+                                */
+                               LLVMValueRef preceding_threads_mask =
+                                       LLVMBuildSub(builder,
+                                                    LLVMBuildShl(builder, ctx->ac.i64_1,
+                                                                 LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""),
+                                                    ctx->ac.i64_1, "");
+
+                               LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+                               LLVMValueRef preceding_reset_threadmask =
+                                       LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+                               LLVMValueRef strip_start =
+                                       ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+                               strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, "");
+
+                               /* This flips the orientation based on reset indices within this wave only. */
+                               first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, "");
+
+                               LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+                               LLVMValueRef is_first_wave, current_wave_resets_index;
+
+                               /* Get the thread index where the last strip starts in this wave.
+                                *
+                                * If the last strip doesn't start in this wave, the thread index
+                                * will be 0.
+                                *
+                                * If the last strip starts in the next wave, the thread index will
+                                * be 64.
+                                */
+                               last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+                               last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, "");
+
+                               struct si_thread0_section section;
+                               si_enter_thread0_section(ctx, &section, thread_id);
+
+                               /* This must be done in the thread 0 section, because
+                                * we expect PrimID to be 0 for the whole first wave
+                                * in this expression.
+                                *
+                                * NOTE: This will need to be different if we wanna support
+                                * instancing with primitive restart.
+                                */
+                               is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, "");
+                               is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+                                                            LLVMBuildNot(builder,
+                                                                         gds_prim_restart_continue, ""), "");
+                               current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
+                                                                         last_strip_start, ctx->i32_0, "");
+
+                               ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state");
+
+                               /* Save the last strip start primitive index in GDS and read
+                                * the value that previous waves stored.
+                                *
+                                * if (is_first_wave || current_wave_resets_strip)
+                                *    // Read the value that previous waves stored and store a new one.
+                                *    first_is_odd = ds.ordered.swap(last_strip_start);
+                                * else
+                                *    // Just read the value that previous waves stored.
+                                *    first_is_odd = ds.ordered.add(0);
+                                */
+                               ac_build_ifcc(&ctx->ac,
+                                             LLVMBuildOr(builder, is_first_wave,
+                                                         current_wave_resets_index, ""), 12602);
+                               {
+                                       /* The GDS address is always 0 with ordered append. */
+                                       tmp = si_build_ds_ordered_op(ctx, "swap",
+                                                                    ordered_wave_id, last_strip_start,
+                                                                    1, true, false);
+                                       LLVMBuildStore(builder, tmp, ret);
+                               }
+                               ac_build_else(&ctx->ac, 12603);
+                               {
+                                       /* Just read the value from GDS. */
+                                       tmp = si_build_ds_ordered_op(ctx, "add",
+                                                                    ordered_wave_id, ctx->i32_0,
+                                                                    1, true, false);
+                                       LLVMBuildStore(builder, tmp, ret);
+                               }
+                               ac_build_endif(&ctx->ac, 12602);
+
+                               prev_wave_state = LLVMBuildLoad(builder, ret, "");
+                               /* Ignore the return value if this is the first wave. */
+                               prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
+                                                                 ctx->i32_0, prev_wave_state, "");
+                               si_exit_thread0_section(&section, &prev_wave_state);
+                               prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, "");
+
+                               /* If the strip start appears to be on thread 0 for the current primitive
+                                * (meaning the reset index is not present in this wave and might have
+                                * appeared in previous waves), use the value from GDS to determine
+                                * primitive orientation.
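+                                * (That single dword travels through the same ordered GDS slot as
+                                * the swap/add above, so each wave sees its predecessor's state in
+                                * launch order.)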
+ * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, + strip_start, ctx->i32_0, ""); + first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, + first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = + LLVMBuildXor(builder, first_is_odd, + LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); + + /* Determine the primitive orientation. + * Only swap the vertices that are not the provoking vertex. We need to keep + * the provoking vertex in place. + */ + if (key->opt.cs_provoking_vertex_first) { + LLVMValueRef index1 = index[1]; + LLVMValueRef index2 = index[2]; + index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); + index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); + } else { + LLVMValueRef index0 = index[0]; + LLVMValueRef index1 = index[1]; + index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); + index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); + } + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. */ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, + vp_scale, vp_translate, smallprim_precision, + &options); + + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = + ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, + &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. 
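+ * Only lane 0 issues the atomic; the start offset it gets back is then
+ * broadcast to the whole wave. Which counter is bumped depends on
+ * VERTEX_COUNTER_GDS_MODE: 0 = memory atomic, 1 = plain GDS atomic,
+ * 2 = ordered GDS append.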
*/
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, &section, thread_id);
+ {
+ if (VERTEX_COUNTER_GDS_MODE == 0) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+ vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+ LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), "");
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+ LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+
+ /* If the draw call was split into multiple subdraws, each using
+ * a separate draw packet, we need to start counting from 0 for
+ * the first compute wave of the subdraw.
+ *
+ * vertex_counter contains the primitive ID of the first thread
+ * in the first wave.
+ *
+ * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+ */
+ LLVMValueRef is_first_wave =
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ vertex_counter, "");
+
+ /* Store the primitive count for ordered append, not vertex count.
+ * The idea is to avoid GDS initialization via CP DMA. The shader
+ * effectively stores the first count using "swap".
+ *
+ * if (first_wave) {
+ * ds.ordered.swap(num_prims_accepted); // store the first primitive count
+ * previous = 0;
+ * } else {
+ * previous = ds.ordered.add(num_prims_accepted); // add the primitive count
+ * }
+ */
+ ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
+ num_prims_accepted, 0, true, true);
+ LLVMBuildStore(builder, ctx->i32_0, tmp_store);
+ }
+ ac_build_else(&ctx->ac, 12605);
+ {
+ LLVMBuildStore(builder,
+ si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
+ num_prims_accepted, 0,
+ true, true),
+ tmp_store);
+ }
+ ac_build_endif(&ctx->ac, 12604);
+
+ start = LLVMBuildLoad(builder, tmp_store, "");
+ }
+ }
+ si_exit_thread0_section(&section, &start);
+
+ /* Write the final vertex count to memory. An EOS/EOP event could do this,
+ * but those events are super slow and should be avoided if performance
+ * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+ * event like this.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ last_wave_prim_id, ""), 12606);
+ LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+ count = LLVMBuildMul(builder, count,
+ LLVMConstInt(ctx->i32, vertices_per_prim, 0), "");
+
+ /* GFX8 needs to disable caching, so that the CP can see the stored value.
+ * MTYPE=3 bypasses TC L2.
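+ * The buffer descriptor built by hand below exists only to set MTYPE;
+ * a plain pointer store would be cached in L2 and not visible to the CP.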
+ */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + vertex_count_addr, + LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->i32, 4, 0), + LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_MTYPE(3 /* uncached */), 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, + ctx->i32_0, 0, ac_glc | ac_slc, false); + } else { + LLVMBuildStore(builder, count, + si_expand_32bit_pointer(ctx, vertex_count_addr)); + } + ac_build_endif(&ctx->ac, 12606); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + ac_build_ifcc(&ctx->ac, accepted, 16607); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, + LLVMConstInt(ctx->i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. + */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. */ + LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); + LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); + + if (!ac_has_vec3_support(ctx->ac.chip_class, true)) + vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); + + ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, + vindex, ctx->i32_0, 3, + ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); + } + ac_build_endif(&ctx->ac, 16607); + + LLVMBuildRetVoid(builder); } /* Return false if the shader isn't ready. */ static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) + const struct pipe_draw_info *info, + bool primitive_restart) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. 
*/ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->shader.ps.cso; - - key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, - &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. */ + assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. + */ + if (primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = + rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. 
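+ * Culling both front and back faces rejects every primitive.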
*/ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = + sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = + sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, + &compiler_state, &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; } static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) { - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs.priv) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = - VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, - RADEON_FLAG_DRIVER_INTERNAL); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, - RADEON_FLAG_DRIVER_INTERNAL); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); - } - - if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs, - &sctx->gfx_cs, num_oa_counters > 0)) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = si_aligned_buffer_create( - sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : + VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, + RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, + 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, + num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = + si_aligned_buffer_create(sctx->b.screen, + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, + 2 * 1024 * 1024); + if (!sctx->index_ring) + return false; + } + return true; } static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) { - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; } enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draws, - unsigned num_draws, bool primitive_restart, - unsigned total_count) +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + bool primitive_restart) { - /* If the compute shader compilation isn't finished, this returns false. */ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = total_count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - unsigned vert_count_per_subdraw = 0; - - if (prim == PIPE_PRIM_TRIANGLES) - vert_count_per_subdraw = split_prims_draw_level * 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - vert_count_per_subdraw = split_prims_draw_level; - else - unreachable("shouldn't get here"); - - /* Split multi draws first. */ - if (num_draws > 1) { - unsigned count = 0; - unsigned first_draw = 0; - unsigned num_draws_split = 0; - - for (unsigned i = 0; i < num_draws; i++) { - if (count && count + draws[i].count > vert_count_per_subdraw) { - /* Submit previous draws. */ - sctx->b.draw_vbo(&sctx->b, info, NULL, draws + first_draw, num_draws_split); - count = 0; - first_draw = i; - num_draws_split = 0; - } - - if (draws[i].count > vert_count_per_subdraw) { - /* Submit just 1 draw. It will be split. 
*/ - sctx->b.draw_vbo(&sctx->b, info, NULL, draws + i, 1); - assert(count == 0); - assert(first_draw == i); - assert(num_draws_split == 0); - first_draw = i + 1; - continue; - } - - count += draws[i].count; - num_draws_split++; - } - return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT; - } - - /* Split single draws if splitting multi draws isn't enough. */ - struct pipe_draw_info split_draw = *info; - struct pipe_draw_start_count split_draw_range = draws[0]; - unsigned base_start = split_draw_range.start; - - split_draw.primitive_restart = primitive_restart; - - if (prim == PIPE_PRIM_TRIANGLES) { - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw, NULL, &split_draw_range, 1); - - if (start == 0 && primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL) * num_draws; - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 30; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; + /* If the compute shader compilation isn't finished, this returns false. 
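+ * In that case the draw falls back to the normal gfx path.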
*/ + if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && + num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + split_draw.primitive_restart = primitive_restart; + + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. */ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && + primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. */ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. 
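+ * The NOP makes the IB non-empty, so the flush below actually
+ * submits it and a larger IB can be allocated.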
+ */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; } void si_compute_signal_gfx(struct si_context *sctx) { - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, NULL, - sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. */ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : + EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + NULL, + sctx->compute_rewind_va | + ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; } /* Dispatch a primitive discard compute shader. 
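+ * The shader culls primitives on the compute queue, writes the
+ * surviving indices into the index ring, and the matching draw
+ * packets in the gfx IB consume them.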
*/ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned count, unsigned index_size, - unsigned base_vertex, uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) { - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - */ - if (sctx->chip_class >= GFX10) { - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | - S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); - radeon_end(); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM, - sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_begin(cs); - radeon_set_sh_reg( - cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. 
*/ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. */ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3, false); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - radeon_end(); - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, - RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, - 1, 1, WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT - : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT - : V_008F0C_IMG_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 - : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 - : V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = - S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? 
vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer - ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset - : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. */ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - radeon_end(); - } - } - - /* Set shader registers. 
*/ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit( - cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, - MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); - radeon_end(); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - - si_cp_wait_mem( - sctx, gfx_cs, - sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. 
*/ - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - radeon_end(); - - radeon_begin_again(cs); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0 - ? count_va - : VERTEX_COUNTER_GDS_MODE == 1 - ? gds_offset - : start_prim | gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(cull_info.small_prim_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. */ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - radeon_end(); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. 
*/ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer( - sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0, - SI_OP_CPDMA_SKIP_CHECK_CS_SPACE, SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, + sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + + /* This needs to be done at the beginning of IBs due to possible + * TTM buffer moves in the kernel. + * + * TODO: update for GFX10 + */ + si_emit_surface_sync(sctx, cs, + S_0085F0_TC_ACTION_ENA(1) | + S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | + S_0085F0_SH_ICACHE_ACTION_ENA(1) | + S_0085F0_SH_KCACHE_ACTION_ENA(1)); + + /* Restore the GDS prim restart counter if needed. */ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, + COPY_DATA_GDS, NULL, 4, + COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | + S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | + S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, + RADEON_USAGE_READ, RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + + sctx->last_ib_barrier_buf_offset, 1, 1, + WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. 
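+ * The buffer is suballocated from the per-IB index ring; each
+ * accepted primitive needs vertices_per_prim 32-bit indices.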
*/ + output_indexbuf_size = align(output_indexbuf_size, + sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. */ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, + si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, + (void**)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | + S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : + index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : + V_008F0C_BUF_DATA_FORMAT_32); + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | + S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + + /* Viewport state. + * This is needed by the small primitive culling, because it's done + * in screen space. + */ + float scale[2], translate[2]; + + scale[0] = sctx->viewports.states[0].scale[0]; + scale[1] = sctx->viewports.states[0].scale[1]; + translate[0] = sctx->viewports.states[0].translate[0]; + translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-scale[0] + translate[0] <= scale[0] + translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. + * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + scale[1] = -scale[1]; + translate[1] = -translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + for (unsigned i = 0; i < 2; i++) { + scale[i] *= num_samples; + translate[i] *= num_samples; + } + + desc[8] = fui(scale[0]); + desc[9] = fui(scale[1]); + desc[10] = fui(translate[0]); + desc[11] = fui(translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. 
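+ * The precision below matches the rasterizer quantization mode
+ * (1/4096th, 1/1024th or 1/256th of a pixel), scaled by the
+ * sample count.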
*/ + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. */ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. */ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. 
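+ * Emitted only when the shader changes; consecutive dispatches of
+ * the same shader reuse the registers.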
*/ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | + S_00B848_DX10_CLAMP(1)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | + S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sctx->screen->info, + WAVES_PER_TG, + MAX_WAVES_PER_SH, + THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. + */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem(sctx, gfx_cs, + sctx->compute_rewind_va | + (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, + WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. 
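+ * The first subdraw initializes all user SGPRs; later subdraws only
+ * update the three that change.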
*/ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(primitive_restart && + info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, + VERTEX_COUNTER_GDS_MODE == 0 ? count_va : + VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : + start_prim | + gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | + (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. */ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. + */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, + EOP_DATA_SEL_GDS, + NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), + SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. 
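+ * The CP DMA clear runs behind the dispatch, in parallel with the
+ * compute shaders, so it costs no extra time.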
*/ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer(sctx, cs, NULL, offset, + GDS_SIZE_UNORDERED - offset, + 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | + SI_CPDMA_SKIP_GFX_SYNC | + SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } } diff --git a/lib/mesa/src/gallium/drivers/virgl/Android.mk b/lib/mesa/src/gallium/drivers/virgl/Android.mk index a64828e90..c06c16558 100644 --- a/lib/mesa/src/gallium/drivers/virgl/Android.mk +++ b/lib/mesa/src/gallium/drivers/virgl/Android.mk @@ -30,7 +30,22 @@ LOCAL_SRC_FILES := \ LOCAL_MODULE := libmesa_pipe_virgl -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +intermediates := $(call local-generated-sources-dir) +LOCAL_GENERATED_SOURCES := $(intermediates)/virgl/virgl_driinfo.h + +GEN_DRIINFO_INPUTS := \ + $(MESA_TOP)/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h \ + $(LOCAL_PATH)/virgl_driinfo.h.in + +MERGE_DRIINFO := $(MESA_TOP)/src/util/merge_driinfo.py + +$(intermediates)/virgl/virgl_driinfo.h: $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) + +LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates) include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/gallium/targets/dri/Android.mk b/lib/mesa/src/gallium/targets/dri/Android.mk index 6ec4055f1..c7d564a23 100644 --- a/lib/mesa/src/gallium/targets/dri/Android.mk +++ b/lib/mesa/src/gallium/targets/dri/Android.mk @@ -42,9 +42,7 @@ LOCAL_LDFLAGS := \ LOCAL_SHARED_LIBRARIES := \ libdl \ libglapi \ - libz \ - liblog \ - libsync + libz # If Android version >=8 MESA should static link libexpat else should dynamic link ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) @@ -56,20 +54,9 @@ LOCAL_SHARED_LIBRARIES += \ endif LOCAL_STATIC_LIBRARIES += \ - libetnaviv_drm \ - libfreedreno_common \ libfreedreno_drm \ - libfreedreno_ir2 \ libfreedreno_ir3 \ - libfreedreno_perfcntrs \ - libmesa_gallium \ - libpanfrost_lib \ - libpanfrost_bifrost \ - libpanfrost_bifrost_disasm \ - libpanfrost_midgard \ - libpanfrost_midgard_disasm \ libpanfrost_shared \ - libpanfrost_util \ ifeq ($(USE_LIBBACKTRACE),true) LOCAL_SHARED_LIBRARIES += libbacktrace @@ -87,12 +74,11 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_nir \ libmesa_dri_common \ libmesa_megadriver_stub \ + libmesa_gallium \ libmesa_pipe_loader \ libmesa_util \ libmesa_loader -LOCAL_SHARED_LIBRARIES += libcutils - # sort GALLIUM_SHARED_LIBS to remove any duplicates LOCAL_SHARED_LIBRARIES += $(sort $(GALLIUM_SHARED_LIBS)) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk index 90f56e45b..0b8edf972 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Android.mk @@ -21,8 +21,6 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-ifeq ($(MESA_ENABLE_LLVM),true) - LOCAL_PATH := $(call my-dir) # get C_SOURCES @@ -48,5 +46,3 @@ ifneq ($(HAVE_GALLIUM_RADEONSI),) $(eval GALLIUM_LIBS += $(LOCAL_MODULE) $(LOCAL_STATIC_LIBRARIES)) $(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) endif - -endif # MESA_ENABLE_LLVM==true diff --git a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk index 31edabd68..32091bea0 100644 --- a/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/etnaviv/drm/Android.mk @@ -25,7 +25,7 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm +LOCAL_SHARED_LIBRARIES := libdrm_etnaviv LOCAL_MODULE := libmesa_winsys_etnaviv diff --git a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk index 669559583..09edab391 100644 --- a/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/freedreno/drm/Android.mk @@ -27,9 +27,6 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/src/freedreno/common - LOCAL_SHARED_LIBRARIES := libdrm_freedreno LOCAL_STATIC_LIBRARIES := libfreedreno_registers diff --git a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk index f3d9df79c..5e2500774 100644 --- a/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk +++ b/lib/mesa/src/gallium/winsys/virgl/drm/Android.mk @@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES) LOCAL_MODULE := libmesa_winsys_virgl -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio - LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk index 454d830d0..5b33f6771 100644 --- a/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk +++ b/lib/mesa/src/gallium/winsys/virgl/vtest/Android.mk @@ -29,8 +29,6 @@ LOCAL_SRC_FILES := $(C_SOURCES) LOCAL_MODULE := libmesa_winsys_virgl_vtest -LOCAL_C_INCLUDES := $(MESA_TOP)/src/virtio - LOCAL_STATIC_LIBRARIES := libmesa_winsys_virgl_common include $(GALLIUM_COMMON_MK) diff --git a/lib/mesa/src/intel/Android.common.mk b/lib/mesa/src/intel/Android.common.mk index 0e1118e65..79d9f1284 100644 --- a/lib/mesa/src/intel/Android.common.mk +++ b/lib/mesa/src/intel/Android.common.mk @@ -36,8 +36,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/mapi \ - $(MESA_TOP)/src/mesa \ - $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_git_sha1,,) + $(MESA_TOP)/src/mesa LOCAL_SHARED_LIBRARIES := libz liblog diff --git a/lib/mesa/src/intel/Android.dev.mk b/lib/mesa/src/intel/Android.dev.mk index 5c7ddd4d3..4f14b0362 100644 --- a/lib/mesa/src/intel/Android.dev.mk +++ b/lib/mesa/src/intel/Android.dev.mk @@ -29,12 +29,7 @@ LOCAL_MODULE := libmesa_intel_dev LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_STATIC_LIBRARIES := \ - libmesa_git_sha1 - -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/include \ - $(MESA_TOP)/src +LOCAL_C_INCLUDES := $(MESA_TOP)/include LOCAL_SRC_FILES := $(DEV_FILES) diff --git a/lib/mesa/src/intel/Android.genxml.mk b/lib/mesa/src/intel/Android.genxml.mk index e5548e2b7..8b867920c 100644 --- a/lib/mesa/src/intel/Android.genxml.mk +++ b/lib/mesa/src/intel/Android.genxml.mk @@ -96,21 +96,16 @@ $(intermediates)/genxml/gen9_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen9.xm $(intermediates)/genxml/gen9_pack.h: 
$(LOCAL_PATH)/genxml/gen9.xml $(LOCAL_PATH)/genxml/gen_pack_header.py $(call header-gen) +$(intermediates)/genxml/gen10_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py +$(intermediates)/genxml/gen10_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen10.xml +$(intermediates)/genxml/gen10_pack.h: $(LOCAL_PATH)/genxml/gen10.xml $(LOCAL_PATH)/genxml/gen_pack_header.py + $(call header-gen) + $(intermediates)/genxml/gen11_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py $(intermediates)/genxml/gen11_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen11.xml $(intermediates)/genxml/gen11_pack.h: $(LOCAL_PATH)/genxml/gen11.xml $(LOCAL_PATH)/genxml/gen_pack_header.py $(call header-gen) -$(intermediates)/genxml/gen12_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py -$(intermediates)/genxml/gen12_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen12.xml -$(intermediates)/genxml/gen12_pack.h: $(LOCAL_PATH)/genxml/gen12.xml $(LOCAL_PATH)/genxml/gen_pack_header.py - $(call header-gen) - -$(intermediates)/genxml/gen125_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py -$(intermediates)/genxml/gen125_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen125.xml -$(intermediates)/genxml/gen125_pack.h: $(LOCAL_PATH)/genxml/gen125.xml $(LOCAL_PATH)/genxml/gen_pack_header.py - $(call header-gen) - $(intermediates)/genxml/genX_xml.h: $(addprefix $(MESA_TOP)/src/intel/,$(GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" diff --git a/lib/mesa/src/intel/Android.isl.mk b/lib/mesa/src/intel/Android.isl.mk index e1ef62b73..07a64b8ed 100644 --- a/lib/mesa/src/intel/Android.isl.mk +++ b/lib/mesa/src/intel/Android.isl.mk @@ -25,20 +25,19 @@ # --------------------------------------- LIBISL_GENX_COMMON_INCLUDES := \ - $(MESA_TOP)/src/ \ - $(MESA_TOP)/src/gallium/include/ + $(MESA_TOP)/src/ # --------------------------------------- -# Build libmesa_isl_gfx4 +# Build libmesa_isl_gen4 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx4 +LOCAL_MODULE := libmesa_isl_gen4 -LOCAL_SRC_FILES := $(ISL_GFX4_FILES) +LOCAL_SRC_FILES := $(ISL_GEN4_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=40 +LOCAL_CFLAGS := -DGEN_VERSIONx10=40 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -48,16 +47,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx5 +# Build libmesa_isl_gen5 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx5 +LOCAL_MODULE := libmesa_isl_gen5 -LOCAL_SRC_FILES := $(ISL_GFX5_FILES) +LOCAL_SRC_FILES := $(ISL_GEN5_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=50 +LOCAL_CFLAGS := -DGEN_VERSIONx10=50 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -67,16 +66,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx6 +# Build libmesa_isl_gen6 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx6 +LOCAL_MODULE := libmesa_isl_gen6 -LOCAL_SRC_FILES := $(ISL_GFX6_FILES) +LOCAL_SRC_FILES := $(ISL_GEN6_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=60 +LOCAL_CFLAGS := -DGEN_VERSIONx10=60 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -86,16 +85,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build 
libmesa_isl_gfx7 +# Build libmesa_isl_gen7 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx7 +LOCAL_MODULE := libmesa_isl_gen7 -LOCAL_SRC_FILES := $(ISL_GFX7_FILES) +LOCAL_SRC_FILES := $(ISL_GEN7_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -105,16 +104,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx75 +# Build libmesa_isl_gen75 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx75 +LOCAL_MODULE := libmesa_isl_gen75 -LOCAL_SRC_FILES := $(ISL_GFX75_FILES) +LOCAL_SRC_FILES := $(ISL_GEN75_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -124,16 +123,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx8 +# Build libmesa_isl_gen8 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx8 +LOCAL_MODULE := libmesa_isl_gen8 -LOCAL_SRC_FILES := $(ISL_GFX8_FILES) +LOCAL_SRC_FILES := $(ISL_GEN8_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -143,16 +142,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx9 +# Build libmesa_isl_gen9 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx9 +LOCAL_MODULE := libmesa_isl_gen9 -LOCAL_SRC_FILES := $(ISL_GFX9_FILES) +LOCAL_SRC_FILES := $(ISL_GEN9_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -162,16 +161,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx11 +# Build libmesa_isl_gen10 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx11 +LOCAL_MODULE := libmesa_isl_gen10 -LOCAL_SRC_FILES := $(ISL_GFX11_FILES) +LOCAL_SRC_FILES := $(ISL_GEN10_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -181,35 +180,16 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_isl_gfx12 +# Build libmesa_isl_gen11 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_isl_gfx12 +LOCAL_MODULE := libmesa_isl_gen11 -LOCAL_SRC_FILES := $(ISL_GFX12_FILES) +LOCAL_SRC_FILES := $(ISL_GEN11_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 - -LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) - -LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml - -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - -# --------------------------------------- -# Build libmesa_isl_gfx125 -# --------------------------------------- - -include $(CLEAR_VARS) - -LOCAL_MODULE := libmesa_isl_gfx125 - -LOCAL_SRC_FILES := $(ISL_GFX125_FILES) - -LOCAL_CFLAGS := -DGFX_VERx10=125 +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) @@ -279,16 +259,15 @@ LOCAL_C_INCLUDES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := $(MESA_TOP)/src/intel LOCAL_WHOLE_STATIC_LIBRARIES := \ - libmesa_isl_gfx4 \ - libmesa_isl_gfx5 \ - libmesa_isl_gfx6 \ - libmesa_isl_gfx7 \ - 
libmesa_isl_gfx75 \ - libmesa_isl_gfx8 \ - libmesa_isl_gfx9 \ - libmesa_isl_gfx11 \ - libmesa_isl_gfx12 \ - libmesa_isl_gfx125 \ + libmesa_isl_gen4 \ + libmesa_isl_gen5 \ + libmesa_isl_gen6 \ + libmesa_isl_gen7 \ + libmesa_isl_gen75 \ + libmesa_isl_gen8 \ + libmesa_isl_gen9 \ + libmesa_isl_gen10 \ + libmesa_isl_gen11 \ libmesa_genxml \ libmesa_isl_tiled_memcpy diff --git a/lib/mesa/src/intel/Android.vulkan.mk b/lib/mesa/src/intel/Android.vulkan.mk index 81ced17e2..00eb49a38 100644 --- a/lib/mesa/src/intel/Android.vulkan.mk +++ b/lib/mesa/src/intel/Android.vulkan.mk @@ -23,7 +23,9 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) include $(LOCAL_PATH)/Makefile.sources -VK_ENTRYPOINTS_GEN_SCRIPT := $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py +ANV_ENTRYPOINTS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py +ANV_EXTENSIONS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_EXTENSIONS_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions.py VULKAN_API_XML := $(MESA_TOP)/src/vulkan/registry/vk.xml VULKAN_COMMON_INCLUDES := \ @@ -51,7 +53,6 @@ VULKAN_COMMON_HEADER_LIBRARIES := \ endif ANV_STATIC_LIBRARIES := \ - libmesa_vulkan_util \ libmesa_vulkan_common \ libmesa_genxml \ libmesa_nir @@ -63,15 +64,15 @@ ANV_SHARED_LIBRARIES += libnativewindow endif # -# libanv for gfx7 +# libanv for gen7 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx7 +LOCAL_MODULE := libmesa_anv_gen7 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX7_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_SRC_FILES := $(VULKAN_GEN7_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -84,15 +85,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx75 +# libanv for gen75 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx75 +LOCAL_MODULE := libmesa_anv_gen75 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX75_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_SRC_FILES := $(VULKAN_GEN75_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -105,15 +106,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx8 +# libanv for gen8 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx8 +LOCAL_MODULE := libmesa_anv_gen8 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX8_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_SRC_FILES := $(VULKAN_GEN8_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -126,15 +127,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx9 +# libanv for gen9 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx9 +LOCAL_MODULE := libmesa_anv_gen9 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX9_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_SRC_FILES := $(VULKAN_GEN9_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -147,15 +148,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx11 +# libanv for gen10 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx11 +LOCAL_MODULE := libmesa_anv_gen10 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX11_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_SRC_FILES := $(VULKAN_GEN10_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -168,15 +169,15 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# 
libanv for gfx12 +# libanv for gen11 # include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx12 +LOCAL_MODULE := libmesa_anv_gen11 LOCAL_MODULE_CLASS := STATIC_LIBRARIES -LOCAL_SRC_FILES := $(VULKAN_GFX12_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=120 +LOCAL_SRC_FILES := $(VULKAN_GEN11_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) @@ -189,28 +190,6 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # -# libanv for gfx125 -# - -include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_anv_gfx125 -LOCAL_MODULE_CLASS := STATIC_LIBRARIES - -LOCAL_SRC_FILES := $(VULKAN_GFX125_FILES) -LOCAL_CFLAGS := -DGFX_VERx10=125 - -LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) - -LOCAL_STATIC_LIBRARIES := $(ANV_STATIC_LIBRARIES) - -LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) -LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) - -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) - - -# # libmesa_vulkan_common # @@ -235,25 +214,39 @@ LOCAL_STATIC_LIBRARIES := \ libmesa_vulkan_util \ libmesa_util -LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(VULKAN_GENERATED_FILES)) +# The rule generates both C and H files, but due to some strange +# reason generating the files once leads to link-time issues. +# Work around create them here as well - we're safe from race +# conditions since they are stored in another location. -ANV_VK_ENTRYPOINTS_GEN_ARGS= \ - --proto --weak --prefix anv \ - --device-prefix gfx7 --device-prefix gfx75 \ - --device-prefix gfx8 --device-prefix gfx9 \ - --device-prefix gfx11 --device-prefix gfx12 \ - --device-prefix gfx125 +LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(VULKAN_GENERATED_FILES)) -$(intermediates)/vulkan/anv_entrypoints.c: $(VK_ENTRYPOINTS_GEN_SCRIPT) \ +$(intermediates)/vulkan/anv_entrypoints.c: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(MESA_PYTHON2) $(VK_ENTRYPOINTS_GEN_SCRIPT) \ + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ --xml $(VULKAN_API_XML) \ - $(ANV_VK_ENTRYPOINTS_GEN_ARGS) \ - --out-c $@ --out-h $(dir $@)/anv_entrypoints.h + --outdir $(dir $@) $(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/anv_entrypoints.c +$(intermediates)/vulkan/anv_extensions.c: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-c $@ + +$(intermediates)/vulkan/anv_extensions.h: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-h $@ + LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) @@ -290,19 +283,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_compiler \ libmesa_intel_common \ libmesa_intel_dev \ - libmesa_intel_perf \ libmesa_vulkan_common \ libmesa_vulkan_util \ - libmesa_anv_gfx7 \ - libmesa_anv_gfx75 \ - libmesa_anv_gfx8 \ - libmesa_anv_gfx9 \ - libmesa_anv_gfx11 \ - libmesa_anv_gfx12 \ - libmesa_anv_gfx125 \ + libmesa_anv_gen7 \ + libmesa_anv_gen75 \ + libmesa_anv_gen8 \ + libmesa_anv_gen9 \ + libmesa_anv_gen10 \ + libmesa_anv_gen11 \ libmesa_intel_compiler -LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog libcutils +LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) # If Android version >=8 MESA 
should static link libexpat else should dynamic link @@ -314,5 +305,9 @@ else libexpat endif +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES += libgrallocusage +endif + include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/lib/mesa/src/intel/dev/gen_debug.c b/lib/mesa/src/intel/dev/gen_debug.c index a99abe48e..a4823286d 100644 --- a/lib/mesa/src/intel/dev/gen_debug.c +++ b/lib/mesa/src/intel/dev/gen_debug.c @@ -29,17 +29,14 @@ * miscellaneous debugging code. */ -#include <stdio.h> #include <stdlib.h> -#include <string.h> #include "dev/gen_debug.h" -#include "git_sha1.h" #include "util/macros.h" #include "util/debug.h" #include "c11/threads.h" -uint64_t intel_debug = 0; +uint64_t INTEL_DEBUG = 0; static const struct debug_control debug_control[] = { { "tex", DEBUG_TEXTURE}, @@ -92,12 +89,6 @@ static const struct debug_control debug_control[] = { { "tcs8", DEBUG_TCS_EIGHT_PATCH }, { "bt", DEBUG_BT }, { "pc", DEBUG_PIPE_CONTROL }, - { "nofc", DEBUG_NO_FAST_CLEAR }, - { "no32", DEBUG_NO32 }, - { "shaders", DEBUG_WM | DEBUG_VS | DEBUG_TCS | - DEBUG_TES | DEBUG_GS | DEBUG_CS | - DEBUG_RT }, - { "rt", DEBUG_RT }, { NULL, 0 } }; @@ -111,21 +102,15 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage) [MESA_SHADER_GEOMETRY] = DEBUG_GS, [MESA_SHADER_FRAGMENT] = DEBUG_WM, [MESA_SHADER_COMPUTE] = DEBUG_CS, - - [MESA_SHADER_RAYGEN] = DEBUG_RT, - [MESA_SHADER_ANY_HIT] = DEBUG_RT, - [MESA_SHADER_CLOSEST_HIT] = DEBUG_RT, - [MESA_SHADER_MISS] = DEBUG_RT, - [MESA_SHADER_INTERSECTION] = DEBUG_RT, - [MESA_SHADER_CALLABLE] = DEBUG_RT, }; + STATIC_ASSERT(MESA_SHADER_STAGES == 6); return flags[stage]; } static void brw_process_intel_debug_variable_once(void) { - intel_debug = parse_debug_string(getenv("INTEL_DEBUG"), debug_control); + INTEL_DEBUG = parse_debug_string(getenv("INTEL_DEBUG"), debug_control); } void @@ -136,108 +121,3 @@ brw_process_intel_debug_variable(void) call_once(&process_intel_debug_variable_flag, brw_process_intel_debug_variable_once); } - -static uint64_t debug_identifier[4] = { - 0xffeeddccbbaa9988, - 0x7766554433221100, - 0xffeeddccbbaa9988, - 0x7766554433221100, -}; - -void * -intel_debug_identifier(void) -{ - return debug_identifier; -} - -uint32_t -intel_debug_identifier_size(void) -{ - return sizeof(debug_identifier); -} - -uint32_t -intel_debug_write_identifiers(void *_output, - uint32_t output_size, - const char *driver_name) -{ - void *output = _output, *output_end = _output + output_size; - - assert(output_size > intel_debug_identifier_size()); - - memcpy(output, intel_debug_identifier(), intel_debug_identifier_size()); - output += intel_debug_identifier_size(); - - for (uint32_t id = GEN_DEBUG_BLOCK_TYPE_DRIVER; id < GEN_DEBUG_BLOCK_TYPE_MAX; id++) { - switch (id) { - case GEN_DEBUG_BLOCK_TYPE_DRIVER: { - struct gen_debug_block_driver driver_desc = { - .base = { - .type = id, - }, - }; - int len = snprintf(output + sizeof(driver_desc), - output_end - (output + sizeof(driver_desc)), - "%s " PACKAGE_VERSION " build " MESA_GIT_SHA1, - driver_name); - driver_desc.base.length = sizeof(driver_desc) + len + 1; - memcpy(output, &driver_desc, sizeof(driver_desc)); - output += driver_desc.base.length; - break; - } - - case GEN_DEBUG_BLOCK_TYPE_FRAME: { - struct gen_debug_block_frame frame_desc = { - .base = { - .type = GEN_DEBUG_BLOCK_TYPE_FRAME, - .length = sizeof(frame_desc), - }, - }; - memcpy(output, &frame_desc, sizeof(frame_desc)); - output += sizeof(frame_desc); - break; - } - - default: - unreachable("Missing 
identifier write"); - } - - assert(output < output_end); - } - - struct gen_debug_block_base end = { - .type = GEN_DEBUG_BLOCK_TYPE_END, - .length = sizeof(end), - }; - memcpy(output, &end, sizeof(end)); - output += sizeof(end); - - assert(output < output_end); - - /* Return the how many bytes where written, so that the rest of the buffer - * can be used for other things. - */ - return output - _output; -} - -void * -intel_debug_get_identifier_block(void *_buffer, - uint32_t buffer_size, - enum gen_debug_block_type type) -{ - void *buffer = _buffer + intel_debug_identifier_size(), - *end_buffer = _buffer + buffer_size; - - while (buffer < end_buffer) { - struct gen_debug_block_base *item = buffer; - - if (item->type == type) - return item; - if (item->type == GEN_DEBUG_BLOCK_TYPE_END) - return NULL; - - buffer += item->length; - } - - return NULL; -} diff --git a/lib/mesa/src/intel/dev/gen_debug.h b/lib/mesa/src/intel/dev/gen_debug.h index efaea6f34..edd3f8a66 100644 --- a/lib/mesa/src/intel/dev/gen_debug.h +++ b/lib/mesa/src/intel/dev/gen_debug.h @@ -28,7 +28,6 @@ #include <stdint.h> #include "compiler/shader_enums.h" -#include "util/macros.h" #ifdef __cplusplus extern "C" { @@ -40,9 +39,7 @@ extern "C" { * list of debugging flags, as well as some macros for handling them. */ -extern uint64_t intel_debug; - -#define INTEL_DEBUG __builtin_expect(intel_debug, 0) +extern uint64_t INTEL_DEBUG; #define DEBUG_TEXTURE (1ull << 0) #define DEBUG_STATE (1ull << 1) @@ -90,9 +87,6 @@ extern uint64_t intel_debug; #define DEBUG_TCS_EIGHT_PATCH (1ull << 43) #define DEBUG_BT (1ull << 44) #define DEBUG_PIPE_CONTROL (1ull << 45) -#define DEBUG_NO_FAST_CLEAR (1ull << 46) -#define DEBUG_NO32 (1ull << 47) -#define DEBUG_RT (1ull << 48) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME @@ -101,7 +95,7 @@ extern uint64_t intel_debug; #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \ DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \ - DEBUG_TCS_EIGHT_PATCH | DEBUG_NO32) + DEBUG_TCS_EIGHT_PATCH) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" @@ -119,7 +113,7 @@ extern uint64_t intel_debug; #endif /* HAVE_ANDROID_PLATFORM */ #define DBG(...) do { \ - if (INTEL_DEBUG & FILE_DEBUG_FLAG) \ + if (unlikely(INTEL_DEBUG & FILE_DEBUG_FLAG)) \ dbg_printf(__VA_ARGS__); \ } while(0) @@ -127,50 +121,6 @@ extern uint64_t intel_debug_flag_for_shader_stage(gl_shader_stage stage); extern void brw_process_intel_debug_variable(void); -/* Below is a list of structure located in the identifier buffer. The driver - * can fill those in for debug purposes. 
- */ - -enum gen_debug_block_type { - /* End of the debug blocks */ - GEN_DEBUG_BLOCK_TYPE_END = 1, - - /* Driver identifier (struct gen_debug_block_driver) */ - GEN_DEBUG_BLOCK_TYPE_DRIVER, - - /* Frame identifier (struct gen_debug_block_frame) */ - GEN_DEBUG_BLOCK_TYPE_FRAME, - - /* Internal, never to be written out */ - GEN_DEBUG_BLOCK_TYPE_MAX, -}; - -struct gen_debug_block_base { - uint32_t type; /* enum gen_debug_block_type */ - uint32_t length; /* inclusive of this structure size */ -}; - -struct gen_debug_block_driver { - struct gen_debug_block_base base; - uint8_t description[]; -}; - -struct gen_debug_block_frame { - struct gen_debug_block_base base; - uint64_t frame_id; -}; - -extern void *intel_debug_identifier(void); -extern uint32_t intel_debug_identifier_size(void); - -extern uint32_t intel_debug_write_identifiers(void *output, - uint32_t output_size, - const char *driver_name); - -extern void *intel_debug_get_identifier_block(void *buffer, - uint32_t buffer_size, - enum gen_debug_block_type type); - #ifdef __cplusplus } #endif diff --git a/lib/mesa/src/intel/dev/gen_device_info_test.c b/lib/mesa/src/intel/dev/gen_device_info_test.c index 815213938..495772f18 100644 --- a/lib/mesa/src/intel/dev/gen_device_info_test.c +++ b/lib/mesa/src/intel/dev/gen_device_info_test.c @@ -13,9 +13,8 @@ main(int argc, char *argv[]) const char *name; } chipsets[] = { #undef CHIPSET -#define CHIPSET(id, family, family_str, str_name) { .pci_id = id, .name = str_name, }, +#define CHIPSET(id, family, str_name) { .pci_id = id, .name = str_name, }, #include "pci_ids/i965_pci_ids.h" -#include "pci_ids/iris_pci_ids.h" }; for (uint32_t i = 0; i < ARRAY_SIZE(chipsets); i++) { @@ -23,11 +22,11 @@ main(int argc, char *argv[]) assert(gen_get_device_info_from_pci_id(chipsets[i].pci_id, &devinfo)); - assert(devinfo.ver != 0); + assert(devinfo.gen != 0); + assert(devinfo.urb.size != 0); assert(devinfo.num_eu_per_subslice != 0); assert(devinfo.num_thread_per_eu != 0); assert(devinfo.timestamp_frequency != 0); - assert(devinfo.cs_prefetch_size > 0); } return 0; diff --git a/lib/mesa/src/intel/perf/gen_perf.c b/lib/mesa/src/intel/perf/gen_perf.c index 0d3c9a22e..6b10b9d53 100644 --- a/lib/mesa/src/intel/perf/gen_perf.c +++ b/lib/mesa/src/intel/perf/gen_perf.c @@ -29,44 +29,363 @@ #include <unistd.h> #include <errno.h> -#ifndef HAVE_DIRENT_D_TYPE -#include <limits.h> // PATH_MAX -#endif - #include <drm-uapi/i915_drm.h> -#include "common/intel_gem.h" - -#include "dev/gen_debug.h" -#include "dev/gen_device_info.h" - -#include "perf/gen_perf.h" -#include "perf/gen_perf_regs.h" +#include "common/gen_gem.h" +#include "gen_perf.h" #include "perf/gen_perf_mdapi.h" #include "perf/gen_perf_metrics.h" -#include "perf/gen_perf_private.h" +#include "dev/gen_debug.h" +#include "dev/gen_device_info.h" #include "util/bitscan.h" -#include "util/macros.h" -#include "util/mesa-sha1.h" #include "util/u_math.h" #define FILE_DEBUG_FLAG DEBUG_PERFMON +#define MI_RPC_BO_SIZE 4096 +#define MI_FREQ_START_OFFSET_BYTES (3072) +#define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2) +#define MI_FREQ_END_OFFSET_BYTES (3076) + +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) + +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK 
INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + +#define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8) +#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8) + +#define MAP_READ (1 << 0) +#define MAP_WRITE (1 << 1) #define OA_REPORT_INVALID_CTX_ID (0xffffffff) +/** + * Periodic OA samples are read() into these buffer structures via the + * i915 perf kernel interface and appended to the + * perf_ctx->sample_buffers linked list. When we process the + * results of an OA metrics query we need to consider all the periodic + * samples between the Begin and End MI_REPORT_PERF_COUNT command + * markers. + * + * 'Periodic' is a simplification as there are other automatic reports + * written by the hardware also buffered here. + * + * Considering three queries, A, B and C: + * + * Time ----> + * ________________A_________________ + * | | + * | ________B_________ _____C___________ + * | | | | | | + * + * And an illustration of sample buffers read over this time frame: + * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ] + * + * These nodes may hold samples for query A: + * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ] + * + * These nodes may hold samples for query B: + * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ] + * + * These nodes may hold samples for query C: + * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ] + * + * The illustration assumes we have an even distribution of periodic + * samples so all nodes have the same size plotted against time: + * + * Note, to simplify code, the list is never empty. + * + * With overlapping queries we can see that periodic OA reports may + * relate to multiple queries and care needs to be taken to keep + * track of sample buffers until there are no queries that might + * depend on their contents. + * + * We use a node ref counting system where a reference ensures that a + * node and all following nodes can't be freed/recycled until the + * reference drops to zero. + * + * E.g. with a ref of one here: + * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * These nodes could be freed or recycled ("reaped"): + * [ 0 ][ 0 ] + * + * These must be preserved until the leading ref drops to zero: + * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * When a query starts we take a reference on the current tail of + * the list, knowing that no already-buffered samples can possibly + * relate to the newly-started query. A pointer to this node is + * also saved in the query object's ->oa.samples_head. + * + * E.g. starting query A while there are two nodes in .sample_buffers: + * ________________A________ + * | + * + * [ 0 ][ 1 ] + * ^_______ Add a reference and store pointer to node in + * A->oa.samples_head + * + * Moving forward to when the B query starts with no new buffer nodes: + * (for reference, i915 perf reads() are only done when queries finish) + * ________________A_______ + * | ________B___ + * | | + * + * [ 0 ][ 2 ] + * ^_______ Add a reference and store pointer to + * node in B->oa.samples_head + * + * Once a query is finished, after an OA query has become 'Ready', + * once the End OA report has landed and after we have processed + * all the intermediate periodic samples then we drop the + * ->oa.samples_head reference we took at the start. 
+ * + * So when the B query has finished we have: + * ________________A________ + * | ______B___________ + * | | | + * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ] + * ^_______ Drop B->oa.samples_head reference + * + * We still can't free these due to the A->oa.samples_head ref: + * [ 1 ][ 0 ][ 0 ][ 0 ] + * + * When the A query finishes: (note there's a new ref for C's samples_head) + * ________________A_________________ + * | | + * | _____C_________ + * | | | + * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ] + * ^_______ Drop A->oa.samples_head reference + * + * And we can now reap these nodes up to the C->oa.samples_head: + * [ X ][ X ][ X ][ X ] + * keeping -> [ 1 ][ 0 ][ 0 ] + * + * We reap old sample buffers each time we finish processing an OA + * query by iterating the sample_buffers list from the head until we + * find a referenced node and stop. + * + * Reaped buffers move to a perfquery.free_sample_buffers list and + * when we come to read() we first look to recycle a buffer from the + * free_sample_buffers list before allocating a new buffer. + */ +struct oa_sample_buf { + struct exec_node link; + int refcount; + int len; + uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10]; + uint32_t last_timestamp; +}; + +/** + * gen representation of a performance query object. + * + * NB: We want to keep this structure relatively lean considering that + * applications may expect to allocate enough objects to be able to + * query around all draw calls in a frame. + */ +struct gen_perf_query_object { -#ifdef HAVE_DIRENT_D_TYPE - return entry->d_type == DT_DIR || entry->d_type == DT_LNK; -#else - struct stat st; - char path[PATH_MAX + 1]; - snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name); - lstat(path, &st); - return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode); -#endif + const struct gen_perf_query_info *queryinfo; + + /* See query->kind to know which state below is in use... */ + union { + struct { + + /** + * BO containing OA counter snapshots at query Begin/End time. + */ + void *bo; + + /** + * Address of the mapping of @bo + */ + void *map; + + /** + * The MI_REPORT_PERF_COUNT command lets us specify a unique + * ID that will be reflected in the resulting OA report + * that's written by the GPU. This is the ID we're expecting + * in the begin report and the end report should be + * @begin_report_id + 1. + */ + int begin_report_id; + + /** + * Reference the head of the brw->perfquery.sample_buffers + * list at the time that the query started (so we only need + * to look at nodes after this point when looking for samples + * related to this query) + * + * (See struct brw_oa_sample_buf description for more details) + */ + struct exec_node *samples_head; + + /** + * false while in the unaccumulated_elements list, and set to + * true when the final, end MI_RPC snapshot has been + * accumulated. + */ + bool results_accumulated; + + /** + * Frequency of the GT at begin and end of the query. + */ + uint64_t gt_frequency[2]; + + /** + * Accumulated OA results between begin and end of the query. + */ + struct gen_perf_query_result result; + } oa; + + struct { + /** + * BO containing starting and ending snapshots for the + * statistics counters. 
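A minimal sketch of the reaping walk described in the comment above: starting at the head, nodes are recycled until the first one still holding a reference (reap_sample_buffers is a hypothetical helper built on Mesa's exec_list API):

static void reap_sample_buffers(struct exec_list *sample_buffers,
                                struct exec_list *free_sample_buffers)
{
   foreach_list_typed_safe(struct oa_sample_buf, buf, link, sample_buffers) {
      if (buf->refcount > 0)
         break; /* this node and everything after it may still be needed */

      exec_node_remove(&buf->link);
      exec_list_push_tail(free_sample_buffers, &buf->link);
   }
}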
+ */ + void *bo; + } pipeline_stats; + }; +}; + +struct gen_perf_context { + struct gen_perf_config *perf; + + void * ctx; /* driver context (e.g. brw_context) */ + void * bufmgr; + const struct gen_device_info *devinfo; + + uint32_t hw_ctx; + int drm_fd; + + /* The i915 perf stream we open to setup + enable the OA counters */ + int oa_stream_fd; + + /* An i915 perf stream fd gives exclusive access to the OA unit that will + * report counter snapshots for a specific counter set/profile in a + * specific layout/format so we can only start OA queries that are + * compatible with the currently open fd... + */ + int current_oa_metrics_set_id; + int current_oa_format; + + /* List of buffers containing OA reports */ + struct exec_list sample_buffers; + + /* Cached list of empty sample buffers */ + struct exec_list free_sample_buffers; + + int n_active_oa_queries; + int n_active_pipeline_stats_queries; + + /* The number of queries depending on running OA counters which + * extends beyond brw_end_perf_query() since we need to wait until + * the last MI_RPC command has been parsed by the GPU. + * + * Accurate accounting is important here as emitting an + * MI_REPORT_PERF_COUNT command while the OA unit is disabled will + * effectively hang the gpu. + */ + int n_oa_users; + + /* To help catch a spurious problem with the hardware or perf + * forwarding samples, we emit each MI_REPORT_PERF_COUNT command + * with a unique ID that we can explicitly check for... + */ + int next_query_start_report_id; + + /** + * An array of queries whose results haven't yet been assembled + * based on the data in buffer objects. + * + * These may be active, or have already ended. However, the + * results have not been requested. + */ + struct gen_perf_query_object **unaccumulated; + int unaccumulated_elements; + int unaccumulated_array_size; + + /* The total number of query objects so we can relinquish + * our exclusive access to perf if the application deletes + * all of its objects. (NB: We only disable perf while + * there are no active queries) + */ + int n_query_instances; +}; + +const struct gen_perf_query_info* +gen_perf_query_info(const struct gen_perf_query_object *query) +{ + return query->queryinfo; +} + +struct gen_perf_context * +gen_perf_new_context(void *parent) +{ + struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); + if (! 
ctx) + fprintf(stderr, "%s: failed to alloc context\n", __func__); + return ctx; +} + +struct gen_perf_config * +gen_perf_config(struct gen_perf_context *ctx) +{ + return ctx->perf; +} + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) +{ + const struct gen_perf_query_info *query = + &perf_ctx->perf->queries[query_index]; + struct gen_perf_query_object *obj = + calloc(1, sizeof(struct gen_perf_query_object)); + + if (!obj) + return NULL; + + obj->queryinfo = query; + + perf_ctx->n_query_instances++; + return obj; +} + +int +gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query) +{ + assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); + + switch (query->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return perf_ctx->n_active_oa_queries; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + return perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } } static bool @@ -80,9 +399,6 @@ get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) perf->sysfs_dev_dir[0] = '\0'; - if (INTEL_DEBUG & DEBUG_NO_OACONFIG) - return true; - if (fstat(fd, &sb)) { DBG("Failed to stat DRM fd\n"); return false; @@ -111,7 +427,8 @@ get_sysfs_dev_dir(struct gen_perf_config *perf, int fd) } while ((drm_entry = readdir(drmdir))) { - if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) && + if ((drm_entry->d_type == DT_DIR || + drm_entry->d_type == DT_LNK) && strncmp(drm_entry->d_name, "card", 4) == 0) { len = snprintf(perf->sysfs_dev_dir, @@ -172,26 +489,41 @@ read_sysfs_drm_device_file_uint64(struct gen_perf_config *perf, return read_file_uint64(buf, value); } +static inline struct gen_perf_query_info * +append_query_info(struct gen_perf_config *perf, int max_counters) +{ + struct gen_perf_query_info *query; + + perf->queries = reralloc(perf, perf->queries, + struct gen_perf_query_info, + ++perf->n_queries); + query = &perf->queries[perf->n_queries - 1]; + memset(query, 0, sizeof(*query)); + + if (max_counters > 0) { + query->max_counters = max_counters; + query->counters = + rzalloc_array(perf, struct gen_perf_query_counter, max_counters); + } + + return query; +} + static void register_oa_config(struct gen_perf_config *perf, - const struct gen_device_info *devinfo, const struct gen_perf_query_info *query, uint64_t config_id) { - struct gen_perf_query_info *registered_query = - gen_perf_append_query_info(perf, 0); + struct gen_perf_query_info *registered_query = append_query_info(perf, 0); *registered_query = *query; - registered_query->oa_format = devinfo->ver >= 8 ? 
- I915_OA_FORMAT_A32u40_A4u32_B8_C8 : I915_OA_FORMAT_A45_B8_C8; registered_query->oa_metrics_set_id = config_id; DBG("metric set registered: id = %" PRIu64", guid = %s\n", registered_query->oa_metrics_set_id, query->guid); } static void -enumerate_sysfs_metrics(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) +enumerate_sysfs_metrics(struct gen_perf_config *perf) { DIR *metricsdir = NULL; struct dirent *metric_entry; @@ -212,7 +544,9 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, while ((metric_entry = readdir(metricsdir))) { struct hash_entry *entry; - if (!is_dir_or_link(metric_entry, buf) || + + if ((metric_entry->d_type != DT_DIR && + metric_entry->d_type != DT_LNK) || metric_entry->d_name[0] == '.') continue; @@ -221,13 +555,20 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, metric_entry->d_name); if (entry) { uint64_t id; - if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) { + + len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id", + perf->sysfs_dev_dir, metric_entry->d_name); + if (len < 0 || len >= sizeof(buf)) { + DBG("Failed to concatenate path to sysfs metric id file\n"); + continue; + } + + if (!read_file_uint64(buf, &id)) { DBG("Failed to read metric set id from %s: %m", buf); continue; } - register_oa_config(perf, devinfo, - (const struct gen_perf_query_info *)entry->data, id); + register_oa_config(perf, (const struct gen_perf_query_info *)entry->data, id); } else DBG("metric set not known by mesa (skipping)\n"); } @@ -235,133 +576,64 @@ enumerate_sysfs_metrics(struct gen_perf_config *perf, closedir(metricsdir); } -static void -add_all_metrics(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) -{ - hash_table_foreach(perf->oa_metrics_table, entry) { - const struct gen_perf_query_info *query = entry->data; - register_oa_config(perf, devinfo, query, 0); - } -} - static bool kernel_has_dynamic_config_support(struct gen_perf_config *perf, int fd) { uint64_t invalid_config_id = UINT64_MAX; - return intel_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, + return gen_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &invalid_config_id) < 0 && errno == ENOENT; } -static int -i915_query_items(struct gen_perf_config *perf, int fd, - struct drm_i915_query_item *items, uint32_t n_items) -{ - struct drm_i915_query q = { - .num_items = n_items, - .items_ptr = to_user_pointer(items), - }; - return intel_ioctl(fd, DRM_IOCTL_I915_QUERY, &q); -} - -static bool -i915_query_perf_config_supported(struct gen_perf_config *perf, int fd) -{ - struct drm_i915_query_item item = { - .query_id = DRM_I915_QUERY_PERF_CONFIG, - .flags = DRM_I915_QUERY_PERF_CONFIG_LIST, - }; - - return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0; -} - static bool -i915_query_perf_config_data(struct gen_perf_config *perf, - int fd, const char *guid, - struct drm_i915_perf_oa_config *config) -{ - struct { - struct drm_i915_query_perf_config query; - struct drm_i915_perf_oa_config config; - } item_data; - struct drm_i915_query_item item = { - .query_id = DRM_I915_QUERY_PERF_CONFIG, - .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, - .data_ptr = to_user_pointer(&item_data), - .length = sizeof(item_data), - }; - - memset(&item_data, 0, sizeof(item_data)); - memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid)); - memcpy(&item_data.config, config, sizeof(item_data.config)); - - if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0)) - return false; - - memcpy(config, &item_data.config, sizeof(item_data.config)); - - return 
true; -} - -bool -gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, - const char *guid, - uint64_t *metric_id) +load_metric_id(struct gen_perf_config *perf, const char *guid, + uint64_t *metric_id) { char config_path[280]; snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id", - perf_cfg->sysfs_dev_dir, guid); + perf->sysfs_dev_dir, guid); /* Don't recreate already loaded configs. */ return read_file_uint64(config_path, metric_id); } -static uint64_t -i915_add_config(struct gen_perf_config *perf, int fd, - const struct gen_perf_registers *config, - const char *guid) -{ - struct drm_i915_perf_oa_config i915_config = { 0, }; - - memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); - - i915_config.n_mux_regs = config->n_mux_regs; - i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); - - i915_config.n_boolean_regs = config->n_b_counter_regs; - i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); - - i915_config.n_flex_regs = config->n_flex_regs; - i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); - - int ret = intel_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config); - return ret > 0 ? ret : 0; -} - static void -init_oa_configs(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) +init_oa_configs(struct gen_perf_config *perf, int fd) { hash_table_foreach(perf->oa_metrics_table, entry) { const struct gen_perf_query_info *query = entry->data; + struct drm_i915_perf_oa_config config; uint64_t config_id; + int ret; - if (gen_perf_load_metric_id(perf, query->guid, &config_id)) { + if (load_metric_id(perf, query->guid, &config_id)) { DBG("metric set: %s (already loaded)\n", query->guid); - register_oa_config(perf, devinfo, query, config_id); + register_oa_config(perf, query, config_id); continue; } - int ret = i915_add_config(perf, fd, &query->config, query->guid); + memset(&config, 0, sizeof(config)); + + memcpy(config.uuid, query->guid, sizeof(config.uuid)); + + config.n_mux_regs = query->n_mux_regs; + config.mux_regs_ptr = (uintptr_t) query->mux_regs; + + config.n_boolean_regs = query->n_b_counter_regs; + config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs; + + config.n_flex_regs = query->n_flex_regs; + config.flex_regs_ptr = (uintptr_t) query->flex_regs; + + ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config); if (ret < 0) { DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n", query->name, query->guid, strerror(errno)); continue; } - register_oa_config(perf, devinfo, query, ret); + register_oa_config(perf, query, ret); DBG("metric set: %s (added)\n", query->guid); } } @@ -375,16 +647,16 @@ compute_topology_builtins(struct gen_perf_config *perf, for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) { perf->sys_vars.n_eu_sub_slices += - util_bitcount(devinfo->subslice_masks[i]); + __builtin_popcount(devinfo->subslice_masks[i]); } for (int i = 0; i < sizeof(devinfo->eu_masks); i++) - perf->sys_vars.n_eus += util_bitcount(devinfo->eu_masks[i]); + perf->sys_vars.n_eus += __builtin_popcount(devinfo->eu_masks[i]); perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu; - /* The subslice mask builtin contains bits for all slices. Prior to Gfx11 - * it had groups of 3bits for each slice, on Gfx11 it's 8bits for each + /* The subslice mask builtin contains bits for all slices. Prior to Gen11 + * it had groups of 3bits for each slice, on Gen11 it's 8bits for each * slice. 
* * Ideally equations would be updated to have a slice/subslice query @@ -392,7 +664,7 @@ compute_topology_builtins(struct gen_perf_config *perf, */ perf->sys_vars.subslice_mask = 0; - int bits_per_subslice = devinfo->ver == 11 ? 8 : 3; + int bits_per_subslice = devinfo->gen == 11 ? 8 : 3; for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) { for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) { @@ -407,23 +679,17 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev { uint64_t min_freq_mhz = 0, max_freq_mhz = 0; - if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) { - if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz)) - return false; + if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz)) + return false; - if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz)) - return false; - } else { - min_freq_mhz = 300; - max_freq_mhz = 1000; - } + if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz)) + return false; memset(&perf->sys_vars, 0, sizeof(perf->sys_vars)); perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000; perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000; perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency; perf->sys_vars.revision = devinfo->revision; - perf->sys_vars.query_mode = true; compute_topology_builtins(perf, devinfo); return true; @@ -464,270 +730,143 @@ get_register_queries_function(const struct gen_device_info *devinfo) if (devinfo->gt == 3) return gen_oa_register_queries_cflgt3; } - if (devinfo->ver == 11) { - if (devinfo->is_elkhartlake) - return gen_oa_register_queries_ehl; + if (devinfo->is_cannonlake) + return gen_oa_register_queries_cnl; + if (devinfo->gen == 11) return gen_oa_register_queries_icl; - } - if (devinfo->is_tigerlake) { - if (devinfo->gt == 1) - return gen_oa_register_queries_tglgt1; - if (devinfo->gt == 2) - return gen_oa_register_queries_tglgt2; - } - if (devinfo->is_rocketlake) - return gen_oa_register_queries_rkl; - if (devinfo->is_dg1) - return gen_oa_register_queries_dg1; - if (devinfo->is_alderlake) - return gen_oa_register_queries_adl; return NULL; } -static int -gen_perf_compare_counter_names(const void *v1, const void *v2) +static inline void +add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, + uint32_t numerator, uint32_t denominator, + const char *name, const char *description) { - const struct gen_perf_query_counter *c1 = v1; - const struct gen_perf_query_counter *c2 = v2; + struct gen_perf_query_counter *counter; + + assert(query->n_counters < query->max_counters); + + counter = &query->counters[query->n_counters]; + counter->name = name; + counter->desc = description; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; + counter->offset = sizeof(uint64_t) * query->n_counters; + counter->pipeline_stat.reg = reg; + counter->pipeline_stat.numerator = numerator; + counter->pipeline_stat.denominator = denominator; - return strcmp(c1->name, c2->name); + query->n_counters++; } -static void -sort_query(struct gen_perf_query_info *q) +static inline void +add_basic_stat_reg(struct gen_perf_query_info *query, + uint32_t reg, const char *name) { - qsort(q->counters, q->n_counters, sizeof(q->counters[0]), - gen_perf_compare_counter_names); + add_stat_reg(query, reg, 1, 1, name, name); } static void load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) + const struct 
gen_device_info *devinfo) { struct gen_perf_query_info *query = - gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); + append_query_info(perf_cfg, MAX_STAT_COUNTERS); query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; query->name = "Pipeline Statistics Registers"; - gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - - if (devinfo->ver == 6) { - gen_perf_query_add_stat_reg(query, GFX6_SO_PRIM_STORAGE_NEEDED, 1, 1, - "SO_PRIM_STORAGE_NEEDED", - "N geometry shader stream-out primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX6_SO_NUM_PRIMS_WRITTEN, 1, 1, - "SO_NUM_PRIMS_WRITTEN", - "N geometry shader stream-out primitives (written)"); + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + + if (devinfo->gen == 6) { + add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, + "SO_PRIM_STORAGE_NEEDED", + "N geometry shader stream-out primitives (total)"); + add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, + "SO_NUM_PRIMS_WRITTEN", + "N geometry shader stream-out primitives (written)"); } else { - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 0)", - "N stream-out (stream 0) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 1)", - "N stream-out (stream 1) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 2)", - "N stream-out (stream 2) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 3)", - "N stream-out (stream 3) primitives (total)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 0)", - "N stream-out (stream 0) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 1)", - "N stream-out (stream 1) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 2)", - "N stream-out (stream 2) primitives (written)"); - gen_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 3)", - "N stream-out (stream 3) primitives (written)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 0)", + "N stream-out (stream 0) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 1)", + "N stream-out (stream 1) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 2)", + "N stream-out (stream 2) primitives (total)"); + add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 3)", + "N stream-out (stream 3) primitives (total)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 0)", + "N stream-out (stream 0) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 
1)", + "N stream-out (stream 1) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 2)", + "N stream-out (stream 2) primitives (written)"); + add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 3)", + "N stream-out (stream 3) primitives (written)"); } - gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - - gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - - gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - - if (devinfo->is_haswell || devinfo->ver == 8) { - gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); } else { - gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); } - gen_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT, - "N z-pass fragments"); + add_basic_stat_reg(query, PS_DEPTH_COUNT, + "N z-pass fragments"); - if (devinfo->ver >= 7) { - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); } query->data_size = sizeof(uint64_t) * query->n_counters; - - sort_query(query); -} - -static int -i915_perf_version(int drm_fd) -{ - int tmp; - drm_i915_getparam_t gp = { - .param = I915_PARAM_PERF_REVISION, - .value = &tmp, - }; - - int ret = intel_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp); - - /* Return 0 if this getparam is not supported, the first version supported - * is 1. - */ - return ret < 0 ? 
0 : tmp; -} - -static void -i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu) -{ - struct drm_i915_gem_context_param arg = { - .param = I915_CONTEXT_PARAM_SSEU, - .size = sizeof(*sseu), - .value = to_user_pointer(sseu) - }; - - intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg); -} - -static inline int -compare_str_or_null(const char *s1, const char *s2) -{ - if (s1 == NULL && s2 == NULL) - return 0; - if (s1 == NULL) - return -1; - if (s2 == NULL) - return 1; - - return strcmp(s1, s2); -} - -static int -compare_counter_categories_and_names(const void *_c1, const void *_c2) -{ - const struct gen_perf_query_counter_info *c1 = (const struct gen_perf_query_counter_info *)_c1; - const struct gen_perf_query_counter_info *c2 = (const struct gen_perf_query_counter_info *)_c2; - - /* pipeline counters don't have an assigned category */ - int r = compare_str_or_null(c1->counter->category, c2->counter->category); - if (r) - return r; - - return strcmp(c1->counter->name, c2->counter->name); -} - -static void -build_unique_counter_list(struct gen_perf_config *perf) -{ - assert(perf->n_queries < 64); - - size_t max_counters = 0; - - for (int q = 0; q < perf->n_queries; q++) - max_counters += perf->queries[q].n_counters; - - /* - * Allocate big enough array to hold maximum possible number of counters. - * We can't alloc it small and realloc when needed because the hash table - * below contains pointers to this array. - */ - struct gen_perf_query_counter_info *counter_infos = - ralloc_array_size(perf, sizeof(counter_infos[0]), max_counters); - - perf->n_counters = 0; - - struct hash_table *counters_table = - _mesa_hash_table_create(perf, - _mesa_hash_string, - _mesa_key_string_equal); - struct hash_entry *entry; - for (int q = 0; q < perf->n_queries ; q++) { - struct gen_perf_query_info *query = &perf->queries[q]; - - for (int c = 0; c < query->n_counters; c++) { - struct gen_perf_query_counter *counter; - struct gen_perf_query_counter_info *counter_info; - - counter = &query->counters[c]; - entry = _mesa_hash_table_search(counters_table, counter->symbol_name); - - if (entry) { - counter_info = entry->data; - counter_info->query_mask |= BITFIELD64_BIT(q); - continue; - } - assert(perf->n_counters < max_counters); - - counter_info = &counter_infos[perf->n_counters++]; - counter_info->counter = counter; - counter_info->query_mask = BITFIELD64_BIT(q); - - counter_info->location.group_idx = q; - counter_info->location.counter_idx = c; - - _mesa_hash_table_insert(counters_table, counter->symbol_name, counter_info); - } - } - - _mesa_hash_table_destroy(counters_table, NULL); - - /* Now we can realloc counter_infos array because hash table doesn't exist. */ - perf->counter_infos = reralloc_array_size(perf, counter_infos, - sizeof(counter_infos[0]), perf->n_counters); - - qsort(perf->counter_infos, perf->n_counters, sizeof(perf->counter_infos[0]), - compare_counter_categories_and_names); } static bool -oa_metrics_available(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) +load_oa_metrics(struct gen_perf_config *perf, int fd, + const struct gen_device_info *devinfo) { perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); bool i915_perf_oa_available = false; struct stat sb; - perf->i915_query_supported = i915_query_perf_config_supported(perf, fd); - perf->i915_perf_version = i915_perf_version(fd); - - /* Record the default SSEU configuration. 
*/ - i915_get_sseu(fd, &perf->sseu); - /* The existence of this sysctl parameter implies the kernel supports * the i915 perf interface. */ if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) { - /* If _paranoid == 1 then on Gfx8+ we won't be able to access OA + /* If _paranoid == 1 then on Gen8+ we won't be able to access OA * metrics unless running as root. */ if (devinfo->is_haswell) @@ -740,26 +879,16 @@ oa_metrics_available(struct gen_perf_config *perf, int fd, if (paranoid == 0 || geteuid() == 0) i915_perf_oa_available = true; } - - perf->platform_supported = oa_register != NULL; } - return i915_perf_oa_available && - oa_register && - get_sysfs_dev_dir(perf, fd) && - init_oa_sys_vars(perf, devinfo); -} - -static void -load_oa_metrics(struct gen_perf_config *perf, int fd, - const struct gen_device_info *devinfo) -{ - int existing_queries = perf->n_queries; - - perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); + if (!i915_perf_oa_available || + !oa_register || + !get_sysfs_dev_dir(perf, fd) || + !init_oa_sys_vars(perf, devinfo)) + return false; perf->oa_metrics_table = - _mesa_hash_table_create(perf, _mesa_hash_string, + _mesa_hash_table_create(perf, _mesa_key_hash_string, _mesa_key_string_equal); /* Index all the metric sets mesa knows about before looking to see what @@ -767,188 +896,13 @@ load_oa_metrics(struct gen_perf_config *perf, int fd, */ oa_register(perf); - if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) { - if (kernel_has_dynamic_config_support(perf, fd)) - init_oa_configs(perf, fd, devinfo); - else - enumerate_sysfs_metrics(perf, devinfo); - } else { - add_all_metrics(perf, devinfo); - } - - /* sort counters in each individual group created by this function by name */ - for (int i = existing_queries; i < perf->n_queries; ++i) - sort_query(&perf->queries[i]); - - /* Select a fallback OA metric. Look for the TestOa metric or use the last - * one if no present (on HSW). - */ - for (int i = existing_queries; i < perf->n_queries; i++) { - if (perf->queries[i].symbol_name && - strcmp(perf->queries[i].symbol_name, "TestOa") == 0) { - perf->fallback_raw_oa_metric = perf->queries[i].oa_metrics_set_id; - break; - } - } - if (perf->fallback_raw_oa_metric == 0 && perf->n_queries > 0) - perf->fallback_raw_oa_metric = perf->queries[perf->n_queries - 1].oa_metrics_set_id; -} - -struct gen_perf_registers * -gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid) -{ - if (!perf_cfg->i915_query_supported) - return NULL; - - struct drm_i915_perf_oa_config i915_config = { 0, }; - if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) - return NULL; - - struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers); - config->n_flex_regs = i915_config.n_flex_regs; - config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs); - config->n_mux_regs = i915_config.n_mux_regs; - config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs); - config->n_b_counter_regs = i915_config.n_boolean_regs; - config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs); - - /* - * struct gen_perf_query_register_prog maps exactly to the tuple of - * (register offset, register value) returned by the i915. 
- */ - i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); - i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); - i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); - if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { - ralloc_free(config); - return NULL; - } - - return config; -} - -uint64_t -gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, - const struct gen_perf_registers *config, - const char *guid) -{ - if (guid) - return i915_add_config(perf_cfg, fd, config, guid); - - struct mesa_sha1 sha1_ctx; - _mesa_sha1_init(&sha1_ctx); - - if (config->flex_regs) { - _mesa_sha1_update(&sha1_ctx, config->flex_regs, - sizeof(config->flex_regs[0]) * - config->n_flex_regs); - } - if (config->mux_regs) { - _mesa_sha1_update(&sha1_ctx, config->mux_regs, - sizeof(config->mux_regs[0]) * - config->n_mux_regs); - } - if (config->b_counter_regs) { - _mesa_sha1_update(&sha1_ctx, config->b_counter_regs, - sizeof(config->b_counter_regs[0]) * - config->n_b_counter_regs); - } - - uint8_t hash[20]; - _mesa_sha1_final(&sha1_ctx, hash); - - char formatted_hash[41]; - _mesa_sha1_format(formatted_hash, hash); - - char generated_guid[37]; - snprintf(generated_guid, sizeof(generated_guid), - "%.8s-%.4s-%.4s-%.4s-%.12s", - &formatted_hash[0], &formatted_hash[8], - &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4], - &formatted_hash[8 + 4 + 4 + 4]); - - /* Check if already present. */ - uint64_t id; - if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id)) - return id; - - return i915_add_config(perf_cfg, fd, config, generated_guid); -} - -static uint64_t -get_passes_mask(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count) -{ - uint64_t queries_mask = 0; - - assert(perf->n_queries < 64); - - /* Compute the number of passes by going through all counters N times (with - * N the number of queries) to make sure we select the most constraining - * counters first and look at the more flexible ones (that could be - * obtained from multiple queries) later. That way we minimize the number - * of passes required. 
- */ - for (uint32_t q = 0; q < perf->n_queries; q++) { - for (uint32_t i = 0; i < counter_indices_count; i++) { - assert(counter_indices[i] < perf->n_counters); - - uint32_t idx = counter_indices[i]; - if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1)) - continue; - - if (queries_mask & perf->counter_infos[idx].query_mask) - continue; - - queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1); - } - } - - return queries_mask; -} - -uint32_t -gen_perf_get_n_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_query_info **pass_queries) -{ - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); - - if (pass_queries) { - uint32_t pass = 0; - for (uint32_t q = 0; q < perf->n_queries; q++) { - if ((1ULL << q) & queries_mask) - pass_queries[pass++] = &perf->queries[q]; - } - } - - return util_bitcount64(queries_mask); -} - -void -gen_perf_get_counters_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_counter_pass *counter_pass) -{ - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); - ASSERTED uint32_t n_passes = util_bitcount64(queries_mask); - - for (uint32_t i = 0; i < counter_indices_count; i++) { - assert(counter_indices[i] < perf->n_counters); - - uint32_t idx = counter_indices[i]; - counter_pass[i].counter = perf->counter_infos[idx].counter; - - uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1; - counter_pass[i].query = &perf->queries[query_idx]; + if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) && + kernel_has_dynamic_config_support(perf, fd)) + init_oa_configs(perf, fd); + else + enumerate_sysfs_metrics(perf); - uint32_t clear_bits = 63 - query_idx; - counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1; - assert(counter_pass[i].pass < n_passes); - } + return true; } /* Accumulate 32bits OA counters */ @@ -984,7 +938,7 @@ accumulate_uint40(int a_index, } static void -gfx8_read_report_clock_ratios(const uint32_t *report, +gen8_read_report_clock_ratios(const uint32_t *report, uint64_t *slice_freq_hz, uint64_t *unslice_freq_hz) { @@ -1012,94 +966,67 @@ gfx8_read_report_clock_ratios(const uint32_t *report, *unslice_freq_hz = unslice_freq * 16666667ULL; } -void -gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +static void +query_result_read_frequencies(struct gen_perf_query_result *result, + const struct gen_device_info *devinfo, + const uint32_t *start, + const uint32_t *end) { /* Slice/Unslice frequency is only available in the OA reports when the * "Disable OA reports due to clock ratio change" field in * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this * global register (see drivers/gpu/drm/i915/i915_perf.c) * - * Documentation says this should be available on Gfx9+ but experimentation - * shows that Gfx8 reports similar values, so we enable it there too. + * Documentation says this should be available on Gen9+ but experimentation + * shows that Gen8 reports similar values, so we enable it there too. 
*/ - if (devinfo->ver < 8) + if (devinfo->gen < 8) return; - gfx8_read_report_clock_ratios(start, + gen8_read_report_clock_ratios(start, &result->slice_frequency[0], &result->unslice_frequency[0]); - gfx8_read_report_clock_ratios(end, + gen8_read_report_clock_ratios(end, &result->slice_frequency[1], &result->unslice_frequency[1]); } -static inline bool -can_use_mi_rpc_bc_counters(const struct gen_device_info *devinfo) -{ - return devinfo->ver <= 11; -} - -void -gen_perf_query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +static void +query_result_accumulate(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint32_t *start, + const uint32_t *end) { - int i; + int i, idx = 0; if (result->hw_id == OA_REPORT_INVALID_CTX_ID && start[2] != OA_REPORT_INVALID_CTX_ID) result->hw_id = start[2]; - if (result->reports_accumulated == 0) - result->begin_timestamp = start[1]; result->reports_accumulated++; switch (query->oa_format) { case I915_OA_FORMAT_A32u40_A4u32_B8_C8: - accumulate_uint32(start + 1, end + 1, - result->accumulator + query->gpu_time_offset); /* timestamp */ - accumulate_uint32(start + 3, end + 3, - result->accumulator + query->gpu_clock_offset); /* clock */ + accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */ + accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */ /* 32x 40bit A counters... */ - for (i = 0; i < 32; i++) { - accumulate_uint40(i, start, end, - result->accumulator + query->a_offset + i); - } + for (i = 0; i < 32; i++) + accumulate_uint40(i, start, end, result->accumulator + idx++); /* 4x 32bit A counters... */ - for (i = 0; i < 4; i++) { - accumulate_uint32(start + 36 + i, end + 36 + i, - result->accumulator + query->a_offset + 32 + i); - } + for (i = 0; i < 4; i++) + accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++); - if (can_use_mi_rpc_bc_counters(devinfo)) { - /* 8x 32bit B counters */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 48 + i, end + 48 + i, - result->accumulator + query->b_offset + i); - } - - /* 8x 32bit C counters... */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 56 + i, end + 56 + i, - result->accumulator + query->c_offset + i); - } - } + /* 8x 32bit B counters + 8x 32bit C counters... 
*/ + for (i = 0; i < 16; i++) + accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++); break; case I915_OA_FORMAT_A45_B8_C8: accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */ - for (i = 0; i < 61; i++) { - accumulate_uint32(start + 3 + i, end + 3 + i, - result->accumulator + query->a_offset + i); - } + for (i = 0; i < 61; i++) + accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i); break; default: @@ -1108,287 +1035,1458 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result, } -#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) +static void +query_result_clear(struct gen_perf_query_result *result) +{ + memset(result, 0, sizeof(*result)); + result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ +} -void -gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t start, - const uint32_t end) +static void +register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) { - switch (devinfo->ver) { - case 7: - case 8: - result->gt_frequency[0] = GET_FIELD(start, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; - result->gt_frequency[1] = GET_FIELD(end, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + struct gen_perf_query_info *query = + append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Intel_Raw_Pipeline_Statistics_Query"; + + /* The order has to match mdapi_pipeline_metrics. */ + add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + if (devinfo->is_haswell || devinfo->gen == 8) { + add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + if (devinfo->gen >= 7) { + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + if (devinfo->gen >= 10) { + /* Reuse existing CS invocation register until we can expose this new + * one. 
+ */ + add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "Reserved1"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + +static void +fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, + const char *name, + uint32_t data_offset, + uint32_t data_size, + enum gen_perf_counter_data_type data_type) +{ + struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; + + assert(query->n_counters <= query->max_counters); + + counter->name = name; + counter->desc = "Raw counter value"; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = data_type; + counter->offset = data_offset; + + query->n_counters++; + + assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); +} + +#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ + fill_mdapi_perf_query_counter(query, #field_name, \ + (uint8_t *) &struct_name.field_name - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) +#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ + fill_mdapi_perf_query_counter(query, \ + ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ + (uint8_t *) &struct_name.field_name[idx] - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name[0]), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) + +static void +register_mdapi_oa_query(const struct gen_device_info *devinfo, + struct gen_perf_config *perf) +{ + struct gen_perf_query_info *query = NULL; + + /* MDAPI requires different structures for pretty much every generation + * (right now we have definitions for gen 7 to 11). + */ + if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) + return; + + switch (devinfo->gen) { + case 7: { + query = append_query_info(perf, 1 + 45 + 16 + 7); + query->oa_format = I915_OA_FORMAT_A45_B8_C8; + + struct gen7_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, ACounters, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NOACounters, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + break; + } + case 8: { + query = append_query_info(perf, 2 + 36 + 16 + 16); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen8_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, 
metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); break; + } case 9: - case 11: - case 12: - result->gt_frequency[0] = GET_FIELD(start, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; - result->gt_frequency[1] = GET_FIELD(end, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + case 10: + case 11: { + query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen9_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, UserCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); break; + } default: - unreachable("unexpected gen"); + unreachable("Unsupported gen"); + break; } - /* Put the numbers into Hz. 
*/ - result->gt_frequency[0] *= 1000000ULL; - result->gt_frequency[1] *= 1000000ULL; + query->kind = GEN_PERF_QUERY_TYPE_RAW; + query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; + query->guid = GEN_PERF_QUERY_GUID_MDAPI; + + { + /* Accumulation buffer offsets copied from an actual query... */ + const struct gen_perf_query_info *copy_query = + &perf->queries[0]; + + query->gpu_time_offset = copy_query->gpu_time_offset; + query->gpu_clock_offset = copy_query->gpu_clock_offset; + query->a_offset = copy_query->a_offset; + query->b_offset = copy_query->b_offset; + query->c_offset = copy_query->c_offset; + } } -void -gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint64_t *start, - const uint64_t *end) +static uint64_t +get_metric_id(struct gen_perf_config *perf, + const struct gen_perf_query_info *query) { - for (uint32_t i = 0; i < 2; i++) { - uint64_t v0 = start[i] & PERF_CNT_VALUE_MASK; - uint64_t v1 = end[i] & PERF_CNT_VALUE_MASK; + /* These queries are known never to change; their config ID was + * loaded upon the first query creation. No need to look them up again. + */ + if (query->kind == GEN_PERF_QUERY_TYPE_OA) + return query->oa_metrics_set_id; + + assert(query->kind == GEN_PERF_QUERY_TYPE_RAW); - result->accumulator[query->perfcnt_offset + i] = v0 > v1 ? - (PERF_CNT_VALUE_MASK + 1 + v1 - v0) : - (v1 - v0); + /* Raw queries can be reprogrammed by an external application/library. + * When a raw query is used for the first time its ID is set to a value != + * 0. When it stops being used the ID returns to 0. No need to reload the + * ID when it's already loaded. + */ + if (query->oa_metrics_set_id != 0) { + DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + return query->oa_metrics_set_id; } + + struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query; + if (!load_metric_id(perf, query->guid, + &raw_query->oa_metrics_set_id)) { + DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid); + raw_query->oa_metrics_set_id = 1ULL; + } else { + DBG("Raw query '%s' guid=%s loaded ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + } + return query->oa_metrics_set_id; } -static uint32_t -query_accumulator_offset(const struct gen_perf_query_info *query, - enum gen_perf_query_field_type type, - uint8_t index) +static struct oa_sample_buf * +get_free_sample_buf(struct gen_perf_context *perf_ctx) { - switch (type) { - case GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: - return query->perfcnt_offset + index; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: - return query->b_offset + index; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: - return query->c_offset + index; - default: - unreachable("Invalid register type"); - return 0; + struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); + struct oa_sample_buf *buf; + + if (node) + buf = exec_node_data(struct oa_sample_buf, node, link); + else { + buf = ralloc_size(perf_ctx->perf, sizeof(*buf)); + + exec_node_init(&buf->link); + buf->refcount = 0; + } + buf->len = 0; + + return buf; +} + +static void +reap_old_sample_buffers(struct gen_perf_context *perf_ctx) +{ + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + + /* Remove all old, unreferenced sample buffers walking forward from + * the head of the
list, except always leave at least one node in + * the list so we always have a node to reference when we Begin + * a new query. + */ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->sample_buffers) + { + if (buf->refcount == 0 && buf != tail_buf) { + exec_node_remove(&buf->link); + exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link); + } else + return; + } +} + +static void +free_sample_bufs(struct gen_perf_context *perf_ctx) +{ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->free_sample_buffers) + ralloc_free(buf); + + exec_list_make_empty(&perf_ctx->free_sample_buffers); +} + +/******************************************************************************/ + +/** + * Emit MI_STORE_REGISTER_MEM commands to capture all of the + * pipeline statistics for the performance query object. + */ +static void +snapshot_statistics_registers(void *context, + struct gen_perf_config *perf, + struct gen_perf_query_object *obj, + uint32_t offset_in_bytes) +{ + const struct gen_perf_query_info *query = obj->queryinfo; + const int n_counters = query->n_counters; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &query->counters[i]; + + assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); + + perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo, + counter->pipeline_stat.reg, + offset_in_bytes + i * sizeof(uint64_t)); + } +} + +static void +gen_perf_close(struct gen_perf_context *perfquery, + const struct gen_perf_query_info *query) +{ + if (perfquery->oa_stream_fd != -1) { + close(perfquery->oa_stream_fd); + perfquery->oa_stream_fd = -1; + } + if (query->kind == GEN_PERF_QUERY_TYPE_RAW) { + struct gen_perf_query_info *raw_query = + (struct gen_perf_query_info *) query; + raw_query->oa_metrics_set_id = 0; + } +} + +static bool +gen_perf_open(struct gen_perf_context *perf_ctx, + int metrics_set_id, + int report_format, + int period_exponent, + int drm_fd, + uint32_t ctx_id) +{ + uint64_t properties[] = { + /* Single context sampling */ + DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id, + + /* Include OA reports in samples */ + DRM_I915_PERF_PROP_SAMPLE_OA, true, + + /* OA unit configuration */ + DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id, + DRM_I915_PERF_PROP_OA_FORMAT, report_format, + DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, + }; + struct drm_i915_perf_open_param param = { + .flags = I915_PERF_FLAG_FD_CLOEXEC | + I915_PERF_FLAG_FD_NONBLOCK | + I915_PERF_FLAG_DISABLED, + .num_properties = ARRAY_SIZE(properties) / 2, + .properties_ptr = (uintptr_t) properties, + }; + int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + if (fd == -1) { + DBG("Error opening gen perf OA stream: %m\n"); + return false; + } + + perf_ctx->oa_stream_fd = fd; + + perf_ctx->current_oa_metrics_set_id = metrics_set_id; + perf_ctx->current_oa_format = report_format; + + return true; +} + +static bool +inc_n_users(struct gen_perf_context *perf_ctx) +{ + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) + { + return false; + } + ++perf_ctx->n_oa_users; + + return true; +} + +static void +dec_n_users(struct gen_perf_context *perf_ctx) +{ + /* Disabling the i915 perf stream will effectively disable the OA + * counters. Note it's important to be sure there are no outstanding + * MI_RPC commands at this point since they could stall the CS + * indefinitely once OACONTROL is disabled. 
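+    * The n_oa_users count ensures the stream is only disabled once the last outstanding query has dropped its reference.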
+ */ + --perf_ctx->n_oa_users; + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0) + { + DBG("WARNING: Error disabling gen perf stream: %m\n"); } } void -gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *start, - const void *end, - bool no_oa_accumulate) +gen_perf_init_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo, + int drm_fd) +{ + load_pipeline_statistic_metrics(perf_cfg, devinfo); + register_mdapi_statistic_query(perf_cfg, devinfo); + if (load_oa_metrics(perf_cfg, drm_fd, devinfo)) + register_mdapi_oa_query(devinfo, perf_cfg); +} + +void +gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd) +{ + perf_ctx->perf = perf_cfg; + perf_ctx->ctx = ctx; + perf_ctx->bufmgr = bufmgr; + perf_ctx->drm_fd = drm_fd; + perf_ctx->hw_ctx = hw_ctx; + perf_ctx->devinfo = devinfo; + + perf_ctx->unaccumulated = + ralloc_array(ctx, struct gen_perf_query_object *, 2); + perf_ctx->unaccumulated_elements = 0; + perf_ctx->unaccumulated_array_size = 2; + + exec_list_make_empty(&perf_ctx->sample_buffers); + exec_list_make_empty(&perf_ctx->free_sample_buffers); + + /* It's convenient to guarantee that this linked list of sample + * buffers is never empty so we add an empty head so when we + * Begin an OA query we can always take a reference on a buffer + * in this list. + */ + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); + + perf_ctx->oa_stream_fd = -1; + perf_ctx->next_query_start_report_id = 1000; +} + +/** + * Add a query to the global list of "unaccumulated queries." + * + * Queries are tracked here until all the associated OA reports have + * been accumulated via accumulate_oa_reports() after the end + * MI_REPORT_PERF_COUNT has landed in query->oa.bo. + */ +static void +add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) +{ + if (perf_ctx->unaccumulated_elements >= + perf_ctx->unaccumulated_array_size) + { + perf_ctx->unaccumulated_array_size *= 1.5; + perf_ctx->unaccumulated = + reralloc(perf_ctx->ctx, perf_ctx->unaccumulated, + struct gen_perf_query_object *, + perf_ctx->unaccumulated_array_size); + } + + perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj; +} + +bool +gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - struct gen_perf_query_field_layout *layout = &query->perf->query_layout; - - for (uint32_t r = 0; r < layout->n_fields; r++) { - struct gen_perf_query_field *field = &layout->fields[r]; - - if (field->type == GEN_PERF_QUERY_FIELD_TYPE_MI_RPC) { - gen_perf_query_result_read_frequencies(result, devinfo, - start + field->location, - end + field->location); - /* no_oa_accumulate=true is used when doing GL perf queries, we - * manually parse the OA reports from the OA buffer and substract - * unrelated deltas, so don't accumulate the begin/end reports here. 
+ struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + + /* XXX: We have to consider that the command parser unit that parses batch + * buffer commands and is used to capture begin/end counter snapshots isn't + * implicitly synchronized with what's currently running across other GPU + * units (such as the EUs running shaders) that the performance counters are + * associated with. + * + * The intention of performance queries is to measure the work associated + * with commands between the begin/end delimiters and so for that to be the + * case we need to explicitly synchronize the parsing of commands to capture + * Begin/End counter snapshots with what's running across other parts of the + * GPU. + * + * When the command parser reaches a Begin marker it effectively needs to + * drain everything currently running on the GPU until the hardware is idle + * before capturing the first snapshot of counters - otherwise the results + * would also be measuring the effects of earlier commands. + * + * When the command parser reaches an End marker it needs to stall until + * everything currently running on the GPU has finished before capturing the + * end snapshot - otherwise the results won't be a complete representation + * of the work. + * + * Theoretically there could be opportunities to minimize how much of the + * GPU pipeline is drained, or how long we stall, when we know what specific + * units the performance counters being queried relate to but we don't + * currently attempt to be clever here. + * + * Note: with our current simple approach, for back-to-back queries + * we will redundantly emit duplicate commands to synchronize the command + * streamer with the rest of the GPU pipeline, but we assume that in HW the + * second synchronization is effectively a NOOP. + * + * N.B. The final results are based on deltas of counters between (inside) + * Begin/End markers so even though the total wall clock time of the + * workload is stretched by larger pipeline bubbles the bubbles themselves + * are generally invisible to the query results. Whether that's a good or a + * bad thing depends on the use case. For a lower real-time impact while + * capturing metrics, periodic sampling may be a better choice than + * INTEL_performance_query. + * + * This is our Begin synchronization point to drain current work on the + * GPU before we capture our first counter snapshot... + */ + perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + + switch (queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: { + + /* Opening an i915 perf stream implies exclusive access to the OA unit + * which will generate counter reports for a specific counter set with a + * specific layout/format so we can't begin any OA-based queries that + * require a different counter set or format unless we get an opportunity + * to close the stream and open a new one... + */ + uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); + + if (perf_ctx->oa_stream_fd != -1 && + perf_ctx->current_oa_metrics_set_id != metric_id) { + + if (perf_ctx->n_oa_users != 0) { + DBG("WARNING: Begin failed, already using perf config=%i/%"PRIu64"\n", + perf_ctx->current_oa_metrics_set_id, metric_id); + return false; + } else + gen_perf_close(perf_ctx, queryinfo); + } + + /* If the OA counters aren't already on, enable them.
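+       * Opening the i915 perf stream below selects the metric set, report format and sampling period in a single step.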
*/ + if (perf_ctx->oa_stream_fd == -1) { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + /* The period_exponent gives a sampling period as follows: + * sample_period = timestamp_period * 2^(period_exponent + 1) + * + * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or + * ~83ns (GEN8/9). + * + * The counter overflow period is derived from the EuActive counter + * which reads a counter that increments by the number of clock + * cycles multiplied by the number of EUs. It can be calculated as: + * + * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2) + * + * (E.g. 40 EUs @ 1GHz = ~53ms) + * + * We select a sampling period shorter than that overflow period to + * ensure we cannot see more than 1 counter overflow, otherwise we + * could lose information. */ - if (!no_oa_accumulate) { - gen_perf_query_result_accumulate(result, query, devinfo, - start + field->location, - end + field->location); - } - } else { - uint64_t v0, v1; - - if (field->size == 4) { - v0 = *(const uint32_t *)(start + field->location); - v1 = *(const uint32_t *)(end + field->location); - } else { - assert(field->size == 8); - v0 = *(const uint64_t *)(start + field->location); - v1 = *(const uint64_t *)(end + field->location); + + int a_counter_in_bits = 32; + if (devinfo->gen >= 8) + a_counter_in_bits = 40; + + uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus * + /* drop 1GHz freq to have units in nanoseconds */ + 2); + + DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n", + overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus); + + int period_exponent = 0; + uint64_t prev_sample_period, next_sample_period; + for (int e = 0; e < 30; e++) { + prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency; + next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency; + + /* Take the previous sampling period, lower than the overflow + * period. + */ + if (prev_sample_period < overflow_period && + next_sample_period > overflow_period) + period_exponent = e + 1; } - if (field->mask) { - v0 = field->mask & v0; - v1 = field->mask & v1; + if (period_exponent == 0) { + DBG("WARNING: unable to find a sampling exponent\n"); + return false; } - /* RPSTAT is a bit of a special case because its begin/end values - * represent frequencies. We store it in a separate location. - */ - if (field->type == GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT) - gen_perf_query_result_read_gt_frequency(result, devinfo, v0, v1); - else - result->accumulator[query_accumulator_offset(query, field->type, field->index)] = v1 - v0; + DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent, + prev_sample_period / 1000000ul); + + if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format, + period_exponent, perf_ctx->drm_fd, + perf_ctx->hw_ctx)) + return false; + } else { + assert(perf_ctx->current_oa_metrics_set_id == metric_id && + perf_ctx->current_oa_format == queryinfo->oa_format); + } + + if (!inc_n_users(perf_ctx)) { + DBG("WARNING: Error enabling i915 perf stream: %m\n"); + return false; + } + + if (query->oa.bo) { + perf_cfg->vtbl.bo_unreference(query->oa.bo); + query->oa.bo = NULL; + } + + query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query OA MI_RPC bo", + MI_RPC_BO_SIZE); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed.
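+       * (the 0x80 fill pattern makes untouched report memory easy to spot)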
*/ + void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE); + memset(map, 0x80, MI_RPC_BO_SIZE); + perf_cfg->vtbl.bo_unmap(query->oa.bo); +#endif + + query->oa.begin_report_id = perf_ctx->next_query_start_report_id; + perf_ctx->next_query_start_report_id += 2; + + /* We flush the batchbuffer here to minimize the chances that MI_RPC + * delimiting commands end up in different batchbuffers. If that's the + * case, the measurement will include the time it takes for the kernel + * scheduler to load a new request into the hardware. This is manifested in + * tools like frameretrace by spikes in the "GPU Core Clocks" counter. + */ + perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); + + /* Take a starting OA counter snapshot. */ + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0, + query->oa.begin_report_id); + perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, + MI_FREQ_START_OFFSET_BYTES); + + ++perf_ctx->n_active_oa_queries; + + /* No already-buffered samples can possibly be associated with this query + * so create a marker within the list of sample buffers enabling us to + * easily ignore earlier samples when processing this query after + * completion. + */ + assert(!exec_list_is_empty(&perf_ctx->sample_buffers)); + query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers); + + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); + + /* This reference will ensure that future/following sample + * buffers (that may relate to this query) can't be freed until + * this drops to zero. + */ + buf->refcount++; + + query_result_clear(&query->oa.result); + query->oa.results_accumulated = false; + + add_to_unaccumulated_query_list(perf_ctx, query); + break; + } + + case GEN_PERF_QUERY_TYPE_PIPELINE: + if (query->pipeline_stats.bo) { + perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); + query->pipeline_stats.bo = NULL; } + + query->pipeline_stats.bo = + perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query pipeline stats bo", + STATS_BO_SIZE); + + /* Take starting snapshots. */ + snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); + + ++perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; } + + return true; } void -gen_perf_query_result_clear(struct gen_perf_query_result *result) +gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - memset(result, 0, sizeof(*result)); - result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* Ensure that the work associated with the queried commands will have + * finished before taking our query end counter readings. + * + * For more details see comment in brw_begin_perf_query for + * corresponding flush. + */ + perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + + /* NB: It's possible that the query will have already been marked + * as 'accumulated' if an error was seen while reading samples + * from perf. In this case we mustn't try and emit a closing + * MI_RPC command in case the OA unit has already been disabled + */ + if (!query->oa.results_accumulated) { + /* Take an ending OA counter snapshot. 
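+       * It lands at MI_RPC_BO_END_OFFSET_BYTES with report id begin_report_id + 1 so it can be told apart from the begin report.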
*/ + perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, + MI_FREQ_END_OFFSET_BYTES); + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, + MI_RPC_BO_END_OFFSET_BYTES, + query->oa.begin_report_id + 1); + } + + --perf_ctx->n_active_oa_queries; + + /* NB: even though the query has now ended, it can't be accumulated + * until the end MI_REPORT_PERF_COUNT snapshot has been written + * to query->oa.bo + */ + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, + STATS_BO_END_OFFSET_BYTES); + --perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } } -void -gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *data) +enum OaReadStatus { + OA_READ_STATUS_ERROR, + OA_READ_STATUS_UNFINISHED, + OA_READ_STATUS_FINISHED, +}; + +static enum OaReadStatus +read_oa_samples_until(struct gen_perf_context *perf_ctx, + uint32_t start_timestamp, + uint32_t end_timestamp) { - const struct gen_perf_query_field_layout *layout = &query->perf->query_layout; + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + uint32_t last_timestamp = + tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp; + + while (1) { + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + uint32_t offset; + int len; + + while ((len = read(perf_ctx->oa_stream_fd, buf->buf, + sizeof(buf->buf))) < 0 && errno == EINTR) + ; + + if (len <= 0) { + exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link); + + if (len < 0) { + if (errno == EAGAIN) { + return ((last_timestamp - start_timestamp) < INT32_MAX && + (last_timestamp - start_timestamp) >= + (end_timestamp - start_timestamp)) ? + OA_READ_STATUS_FINISHED : + OA_READ_STATUS_UNFINISHED; + } else { + DBG("Error reading i915 perf samples: %m\n"); + } + } else + DBG("Spurious EOF reading i915 perf samples\n"); + + return OA_READ_STATUS_ERROR; + } - for (uint32_t r = 0; r < layout->n_fields; r++) { - const struct gen_perf_query_field *field = &layout->fields[r]; - const uint32_t *value32 = data + field->location; + buf->len = len; + exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link); - switch (field->type) { - case GEN_PERF_QUERY_FIELD_TYPE_MI_RPC: - fprintf(stderr, "MI_RPC:\n"); - fprintf(stderr, " TS: 0x%08x\n", *(value32 + 1)); - fprintf(stderr, " CLK: 0x%08x\n", *(value32 + 3)); - break; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: - fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32); - break; - case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: - fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32); - break; - default: - break; + /* Go through the reports and update the last timestamp. */ + offset = 0; + while (offset < buf->len) { + const struct drm_i915_perf_record_header *header = + (const struct drm_i915_perf_record_header *) &buf->buf[offset]; + uint32_t *report = (uint32_t *) (header + 1); + + if (header->type == DRM_I915_PERF_RECORD_SAMPLE) + last_timestamp = report[1]; + + offset += header->size; } + + buf->last_timestamp = last_timestamp; } + + unreachable("not reached"); + return OA_READ_STATUS_ERROR; } -static int -gen_perf_compare_query_names(const void *v1, const void *v2) +/** + * Try to read all the reports until either the delimiting timestamp + * or an error arises. 
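+ * + * Returns false when the end timestamp hasn't been reached yet and the read should be retried; returns true once all reports are in or an error was recorded for accumulate_oa_reports() to handle.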
+ */ +static bool +read_oa_samples_for_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) { - const struct gen_perf_query_info *q1 = v1; - const struct gen_perf_query_info *q2 = v2; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* We need the MI_REPORT_PERF_COUNT to land before we can start + * accumulating. */ + assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo)); + + /* Map the BO once here and let accumulate_oa_reports() unmap + * it. */ + if (query->oa.map == NULL) + query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + return true; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + return true; + } + + /* Read the reports until the end timestamp. */ + switch (read_oa_samples_until(perf_ctx, start[1], end[1])) { + case OA_READ_STATUS_ERROR: + /* Fallthrough and let accumulate_oa_reports() deal with the + * error. */ + case OA_READ_STATUS_FINISHED: + return true; + case OA_READ_STATUS_UNFINISHED: + return false; + } - return strcmp(q1->name, q2->name); + unreachable("invalid read status"); + return false; } -static inline struct gen_perf_query_field * -add_query_register(struct gen_perf_query_field_layout *layout, - enum gen_perf_query_field_type type, - uint16_t offset, - uint16_t size, - uint8_t index) +void +gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) { - /* Align MI_RPC to 64bytes (HW requirement) & 64bit registers to 8bytes - * (shows up nicely in the debugger). + struct gen_perf_config *perf_cfg = perf_ctx->perf; + struct brw_bo *bo = NULL; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + bo = query->oa.bo; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + bo = query->pipeline_stats.bo; + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bo == NULL) + return; + + /* If the current batch references our results bo then we need to + * flush first... */ - if (type == GEN_PERF_QUERY_FIELD_TYPE_MI_RPC) - layout->size = align(layout->size, 64); - else if (size % 8 == 0) - layout->size = align(layout->size, 8); - - layout->fields[layout->n_fields++] = (struct gen_perf_query_field) { - .mmio_offset = offset, - .location = layout->size, - .type = type, - .index = index, - .size = size, - }; - layout->size += size; + if (perf_cfg->vtbl.batch_references(current_batch, bo)) + perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); + + perf_cfg->vtbl.bo_wait_rendering(bo); + + /* Due to a race condition between the OA unit signaling report + * availability and the report actually being written into memory, + * we need to wait for all the reports to come in before we can + * read them.
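+    * (hence the busy loop over read_oa_samples_for_query() below)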
+ */ + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA || + query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) { + while (!read_oa_samples_for_query(perf_ctx, query, current_batch)) + ; + } +} - return &layout->fields[layout->n_fields - 1]; +bool +gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return (query->oa.results_accumulated || + (query->oa.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo) && + read_oa_samples_for_query(perf_ctx, query, current_batch))); + case GEN_PERF_QUERY_TYPE_PIPELINE: + return (query->pipeline_stats.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) && + !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo)); + + default: + unreachable("Unknown query type"); + break; + } + + return false; } +/** + * Remove a query from the global list of unaccumulated queries once + * the OA reports associated with the query have been accumulated in + * accumulate_oa_reports(), or when discarding unwanted query results. + */ static void -gen_perf_init_query_fields(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) +drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) { - struct gen_perf_query_field_layout *layout = &perf_cfg->query_layout; + for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) { + if (perf_ctx->unaccumulated[i] == query) { + int last_elt = --perf_ctx->unaccumulated_elements; + + if (i == last_elt) + perf_ctx->unaccumulated[i] = NULL; + else { + perf_ctx->unaccumulated[i] = + perf_ctx->unaccumulated[last_elt]; + } + + break; + } + } - layout->n_fields = 0; + /* Drop our samples_head reference so that associated periodic + * sample data buffers can potentially be reaped if they aren't + * referenced by any other queries... + */ - /* MI_RPC requires a 64byte alignment. */ - layout->alignment = 64; + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); - layout->fields = rzalloc_array(perf_cfg, struct gen_perf_query_field, 5 + 16); + assert(buf->refcount > 0); + buf->refcount--; - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, - 0, 256, 0); + query->oa.samples_head = NULL; - if (devinfo->ver <= 11) { - struct gen_perf_query_field *field = - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - PERF_CNT_1_DW0, 8, 0); - field->mask = PERF_CNT_VALUE_MASK; + reap_old_sample_buffers(perf_ctx); +} - field = add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - PERF_CNT_2_DW0, 8, 1); - field->mask = PERF_CNT_VALUE_MASK; - } +/* In general, if we see anything spurious while accumulating results, + * we don't try to continue accumulating the current query hoping for + * the best; we scrap anything outstanding, and then hope for the + * best with new queries.
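+ * Each discarded query also drops its stream reference via dec_n_users().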
+ */ +static void +discard_all_queries(struct gen_perf_context *perf_ctx) +{ + while (perf_ctx->unaccumulated_elements) { + struct gen_perf_query_object *query = perf_ctx->unaccumulated[0]; - if (devinfo->ver == 8 && !devinfo->is_cherryview) { - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GFX7_RPSTAT1, 4, 0); + query->oa.results_accumulated = true; + drop_from_unaccumulated_query_list(perf_ctx, query); + + dec_n_users(perf_ctx); } +} - if (devinfo->ver >= 9) { - add_query_register(layout, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GFX9_RPSTAT0, 4, 0); +/* Looks for the validity bit of context ID (dword 2) of an OA report. */ +static bool +oa_report_ctx_id_valid(const struct gen_device_info *devinfo, + const uint32_t *report) +{ + assert(devinfo->gen >= 8); + if (devinfo->gen == 8) + return (report[0] & (1 << 25)) != 0; + return (report[0] & (1 << 16)) != 0; +} + +/** + * Accumulate raw OA counter values based on deltas between pairs of + * OA reports. + * + * Accumulation starts from the first report captured via + * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the + * last MI_RPC report requested by brw_end_perf_query(). Between these + * two reports there may also be some number of periodically sampled OA + * reports collected via the i915 perf interface - depending on the + * duration of the query. + * + * These periodic snapshots help ensure we handle counter overflow + * correctly by being frequent enough that we don't miss multiple + * overflows of a counter between snapshots. For Gen8+ the i915 perf + * snapshots provide the extra context-switch reports that let us + * subtract out the progress of counters associated with other + * contexts running on the system. + */ +static void +accumulate_oa_reports(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct exec_node *first_samples_node; + bool last_report_ctx_match = true; + int out_duration = 0; + + assert(query->oa.map != NULL); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + goto error; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + goto error; } - if (!can_use_mi_rpc_bc_counters(devinfo)) { - if (devinfo->ver >= 8 && devinfo->ver <= 11) { - for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) { - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, - GFX8_OA_PERF_B32(i), 4, i); - } - for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) { - add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, - GFX8_OA_PERF_C32(i), 4, i); + /* See if we have any periodic reports to accumulate too... */ + + /* N.B. The oa.samples_head was set when the query began and + * pointed to the tail of the perf_ctx->sample_buffers list at + * the time the query started. Since the buffer existed before the + * first MI_REPORT_PERF_COUNT command was emitted we therefore know + * that no data in this particular node's buffer can possibly be + * associated with the query - so skip ahead one...
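+    * (first_samples_node below therefore starts at samples_head->next)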
+/**
+ * Accumulate raw OA counter values based on deltas between pairs of
+ * OA reports.
+ *
+ * Accumulation starts from the first report captured via
+ * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
+ * last MI_RPC report requested by brw_end_perf_query(). Between these
+ * two reports there may also be some number of periodically sampled OA
+ * reports collected via the i915 perf interface - depending on the
+ * duration of the query.
+ *
+ * These periodic snapshots help to ensure we handle counter overflow
+ * correctly, by being frequent enough that we don't miss multiple
+ * overflows of a counter between snapshots. For Gen8+ the i915 perf
+ * snapshots provide the extra context-switch reports that let us
+ * subtract out the progress of counters associated with other
+ * contexts running on the system.
+ */
+static void
+accumulate_oa_reports(struct gen_perf_context *perf_ctx,
+                      struct gen_perf_query_object *query)
+{
+   const struct gen_device_info *devinfo = perf_ctx->devinfo;
+   uint32_t *start;
+   uint32_t *last;
+   uint32_t *end;
+   struct exec_node *first_samples_node;
+   bool last_report_ctx_match = true;
+   int out_duration = 0;
+
+   assert(query->oa.map != NULL);
+
+   start = last = query->oa.map;
+   end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
+   if (start[0] != query->oa.begin_report_id) {
+      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+      goto error;
+   }
+   if (end[0] != (query->oa.begin_report_id + 1)) {
+      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+      goto error;
    }

-   if (!can_use_mi_rpc_bc_counters(devinfo)) {
-      if (devinfo->ver >= 8 && devinfo->ver <= 11) {
-         for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
-                               GFX8_OA_PERF_B32(i), 4, i);
-         }
-         for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
-                               GFX8_OA_PERF_C32(i), 4, i);
+   /* See if we have any periodic reports to accumulate too... */
+
+   /* N.B. The oa.samples_head was set when the query began and
+    * pointed to the tail of the perf_ctx->sample_buffers list at
+    * the time the query started. Since the buffer existed before the
+    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
+    * that no data in this particular node's buffer can possibly be
+    * associated with the query - so skip ahead one...
+    */
+   first_samples_node = query->oa.samples_head->next;
+
+   foreach_list_typed_from(struct oa_sample_buf, buf, link,
+                           &perf_ctx->sample_buffers,
+                           first_samples_node)
+   {
+      int offset = 0;
+
+      while (offset < buf->len) {
+         const struct drm_i915_perf_record_header *header =
+            (const struct drm_i915_perf_record_header *)(buf->buf + offset);
+
+         assert(header->size != 0);
+         assert(header->size <= buf->len);
+
+         offset += header->size;
+
+         switch (header->type) {
+         case DRM_I915_PERF_RECORD_SAMPLE: {
+            uint32_t *report = (uint32_t *)(header + 1);
+            bool report_ctx_match = true;
+            bool add = true;
+
+            /* Ignore reports that come before the start marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - start[1]) > 5000000000) {
+               continue;
+            }
+
+            /* Ignore reports that come after the end marker.
+             * (Note: takes care to allow overflow of 32bit timestamps)
+             */
+            if (gen_device_info_timebase_scale(devinfo,
+                                               report[1] - end[1]) <= 5000000000) {
+               goto end;
+            }
+
+            /* For Gen8+, since the counters continue while other
+             * contexts are running, we need to discount any unrelated
+             * deltas. The hardware automatically generates a report
+             * on context switch, which gives us a new reference point
+             * to continue adding deltas from.
+             *
+             * For Haswell we can rely on the HW to stop the progress
+             * of OA counters while any other context is active.
+             */
+            if (devinfo->gen >= 8) {
+               /* Consider that the current report matches our context only if
+                * the report says its context ID is valid.
+                */
+               report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
+                                  report[2] == start[2];
+               if (report_ctx_match)
+                  out_duration = 0;
+               else
+                  out_duration++;
+
+               /* Only add the delta between <last, report> if the last report
+                * was clearly identified as our context, or if we have at most
+                * 1 report without a matching ID.
+                *
+                * The OA unit will sometimes label reports with an invalid
+                * context ID when i915 rewrites the execlist submit register
+                * with the same context as the one currently running. This
+                * happens when i915 wants to notify the HW of a ringbuffer
+                * tail register update. We have to consider this report as
+                * part of our context, as the 3d pipeline behind the OACS unit
+                * is still processing the operations started at the previous
+                * execlist submission.
+                */
+               add = last_report_ctx_match && out_duration < 2;
+            }
+
+            if (add) {
+               query_result_accumulate(&query->oa.result, query->queryinfo,
+                                       last, report);
+            }
+
+            last = report;
+            last_report_ctx_match = report_ctx_match;
+
+            break;
          }

-      } else if (devinfo->ver == 12) {
-         for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
-                               GFX12_OAG_PERF_B32(i), 4, i);
+
+         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
+            DBG("i915 perf: OA error: all reports lost\n");
+            goto error;
+         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
+            DBG("i915 perf: OA report lost\n");
+            break;
          }
-         for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
-            add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
-                               GFX12_OAG_PERF_C32(i), 4, i);
+      }
+   }
+
+end:
+
+   query_result_accumulate(&query->oa.result, query->queryinfo,
+                           last, end);
+
+   query->oa.results_accumulated = true;
+   drop_from_unaccumulated_query_list(perf_ctx, query);
+   dec_n_users(perf_ctx);
+
+   return;
+
+error:
+
+   discard_all_queries(perf_ctx);
+}
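
query_result_accumulate() above adds per-counter deltas between consecutive report pairs. The OA counters are free-running, so each delta is again an unsigned subtraction that wraps cleanly, which is exactly why the periodic samples must arrive before a counter can wrap twice. A hedged sketch of the 32-bit case (accumulate_uint32_sketch is a hypothetical name; the real accumulator also handles 40-bit A counters):

    /* Sketch: one 32-bit counter delta between two OA reports.  The
     * modulo-2^32 subtraction absorbs a single wrap; two wraps between
     * reports would be silently lost, hence the frequent snapshots.
     */
    static void
    accumulate_uint32_sketch(const uint32_t *report0, const uint32_t *report1,
                             uint64_t *accumulator)
    {
       *accumulator += (uint32_t)(*report1 - *report0);
    }
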
+
+void
+gen_perf_delete_query(struct gen_perf_context *perf_ctx,
+                      struct gen_perf_query_object *query)
+{
+   struct gen_perf_config *perf_cfg = perf_ctx->perf;
+
+   /* We can assume that the frontend waits for a query to complete
+    * before ever calling into here, so we don't have to worry about
+    * deleting an in-flight query object.
+    */
+   switch (query->queryinfo->kind) {
+   case GEN_PERF_QUERY_TYPE_OA:
+   case GEN_PERF_QUERY_TYPE_RAW:
+      if (query->oa.bo) {
+         if (!query->oa.results_accumulated) {
+            drop_from_unaccumulated_query_list(perf_ctx, query);
+            dec_n_users(perf_ctx);
          }
+
+         perf_cfg->vtbl.bo_unreference(query->oa.bo);
+         query->oa.bo = NULL;
+      }
+
+      query->oa.results_accumulated = false;
+      break;
+
+   case GEN_PERF_QUERY_TYPE_PIPELINE:
+      if (query->pipeline_stats.bo) {
+         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
+         query->pipeline_stats.bo = NULL;
       }
+      break;
+
+   default:
+      unreachable("Unknown query type");
+      break;
    }

-   /* Align the whole package to 64bytes so that 2 snapshots can be put
-    * together without extra alignment for the user.
+   /* As an indication that the INTEL_performance_query extension is no
+    * longer in use, it's a good time to free our cache of sample
+    * buffers and close any current i915-perf stream.
*/ - layout->size = align(layout->size, 64); + if (--perf_ctx->n_query_instances == 0) { + free_sample_bufs(perf_ctx); + gen_perf_close(perf_ctx, query->queryinfo); + } + + free(query); } -void -gen_perf_init_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo, - int drm_fd, - bool include_pipeline_statistics) +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +static void +read_gt_frequency(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) { - gen_perf_init_query_fields(perf_cfg, devinfo); + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)), + end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES)); - if (include_pipeline_statistics) { - load_pipeline_statistic_metrics(perf_cfg, devinfo); - gen_perf_register_mdapi_statistic_query(perf_cfg, devinfo); + switch (devinfo->gen) { + case 7: + case 8: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + break; + case 9: + case 10: + case 11: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + break; + default: + unreachable("unexpected gen"); } - bool oa_metrics = oa_metrics_available(perf_cfg, drm_fd, devinfo); - if (oa_metrics) - load_oa_metrics(perf_cfg, drm_fd, devinfo); + /* Put the numbers into Hz. */ + obj->oa.gt_frequency[0] *= 1000000ULL; + obj->oa.gt_frequency[1] *= 1000000ULL; +} - /* sort query groups by name */ - qsort(perf_cfg->queries, perf_cfg->n_queries, - sizeof(perf_cfg->queries[0]), gen_perf_compare_query_names); +static int +get_oa_counter_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + int written = 0; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t *out_uint64; + float *out_float; + size_t counter_size = gen_perf_query_counter_get_size(counter); + + if (counter_size) { + switch (counter->data_type) { + case GEN_PERF_COUNTER_DATA_TYPE_UINT64: + out_uint64 = (uint64_t *)(data + counter->offset); + *out_uint64 = + counter->oa_counter_read_uint64(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: + out_float = (float *)(data + counter->offset); + *out_float = + counter->oa_counter_read_float(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + default: + /* So far we aren't using uint32, double or bool32... 
*/ + unreachable("unexpected counter data type"); + } + written = counter->offset + counter_size; + } + } + + return written; +} + +static int +get_pipeline_stats_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) + +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + uint8_t *p = data; + + uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ); + uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t)); + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t value = end[i] - start[i]; + + if (counter->pipeline_stat.numerator != + counter->pipeline_stat.denominator) { + value *= counter->pipeline_stat.numerator; + value /= counter->pipeline_stat.denominator; + } + + *((uint64_t *)p) = value; + p += 8; + } - build_unique_counter_list(perf_cfg); + perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo); - if (oa_metrics) - gen_perf_register_mdapi_oa_query(perf_cfg, devinfo); + return p - data; +} + +void +gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + int written = 0; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (!query->oa.results_accumulated) { + read_gt_frequency(perf_ctx, query); + uint32_t *begin_report = query->oa.map; + uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + query_result_read_frequencies(&query->oa.result, + perf_ctx->devinfo, + begin_report, + end_report); + accumulate_oa_reports(perf_ctx, query); + assert(query->oa.results_accumulated); + + perf_cfg->vtbl.bo_unmap(query->oa.bo); + query->oa.map = NULL; + } + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) { + written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data); + } else { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size, + devinfo, &query->oa.result, + query->oa.gt_frequency[0], + query->oa.gt_frequency[1]); + } + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data); + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bytes_written) + *bytes_written = written; +} + +void +gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) +{ + DBG("Queries: (Open queries = %d, OA users = %d)\n", + perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); +} + +void +gen_perf_dump_query(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + void *current_batch) +{ + switch (obj->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + DBG("BO: %-4s OA data: %-10s %-15s\n", + obj->oa.bo ? "yes," : "no,", + gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", + obj->oa.results_accumulated ? "accumulated" : "not accumulated"); + break; + case GEN_PERF_QUERY_TYPE_PIPELINE: + DBG("BO: %-4s\n", + obj->pipeline_stats.bo ? 
"yes" : "no"); + break; + default: + unreachable("Unknown query type"); + break; + } } diff --git a/lib/mesa/src/intel/perf/gen_perf.h b/lib/mesa/src/intel/perf/gen_perf.h index 6b061c420..e33d9b0c9 100644 --- a/lib/mesa/src/intel/perf/gen_perf.h +++ b/lib/mesa/src/intel/perf/gen_perf.h @@ -25,7 +25,6 @@ #define GEN_PERF_H #include <stdio.h> -#include <stdbool.h> #include <stdint.h> #include <string.h> @@ -39,13 +38,23 @@ #include "compiler/glsl/list.h" #include "util/ralloc.h" -#include "drm-uapi/i915_drm.h" - struct gen_device_info; struct gen_perf_config; struct gen_perf_query_info; +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + enum gen_perf_counter_type { GEN_PERF_COUNTER_TYPE_EVENT, GEN_PERF_COUNTER_TYPE_DURATION_NORM, @@ -63,39 +72,6 @@ enum gen_perf_counter_data_type { GEN_PERF_COUNTER_DATA_TYPE_DOUBLE, }; -enum gen_perf_counter_units { - /* size */ - GEN_PERF_COUNTER_UNITS_BYTES, - - /* frequency */ - GEN_PERF_COUNTER_UNITS_HZ, - - /* time */ - GEN_PERF_COUNTER_UNITS_NS, - GEN_PERF_COUNTER_UNITS_US, - - /**/ - GEN_PERF_COUNTER_UNITS_PIXELS, - GEN_PERF_COUNTER_UNITS_TEXELS, - GEN_PERF_COUNTER_UNITS_THREADS, - GEN_PERF_COUNTER_UNITS_PERCENT, - - /* events */ - GEN_PERF_COUNTER_UNITS_MESSAGES, - GEN_PERF_COUNTER_UNITS_NUMBER, - GEN_PERF_COUNTER_UNITS_CYCLES, - GEN_PERF_COUNTER_UNITS_EVENTS, - GEN_PERF_COUNTER_UNITS_UTILIZATION, - - /**/ - GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES, - GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE, - - GEN_PERF_COUNTER_UNITS_MAX -}; - struct gen_pipeline_stat { uint32_t reg; uint32_t numerator; @@ -106,12 +82,23 @@ struct gen_pipeline_stat { * The largest OA formats we can use include: * For Haswell: * 1 timestamp, 45 A counters, 8 B counters and 8 C counters. - * For Gfx8+ + * For Gen8+ * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters - * - * Plus 2 PERF_CNT registers and 1 RPSTAT register. */ -#define MAX_OA_REPORT_COUNTERS (62 + 2 + 1) +#define MAX_OA_REPORT_COUNTERS 62 + +#define IA_VERTICES_COUNT 0x2310 +#define IA_PRIMITIVES_COUNT 0x2318 +#define VS_INVOCATION_COUNT 0x2320 +#define HS_INVOCATION_COUNT 0x2300 +#define DS_INVOCATION_COUNT 0x2308 +#define GS_INVOCATION_COUNT 0x2328 +#define GS_PRIMITIVES_COUNT 0x2330 +#define CL_INVOCATION_COUNT 0x2338 +#define CL_PRIMITIVES_COUNT 0x2340 +#define PS_INVOCATION_COUNT 0x2348 +#define CS_INVOCATION_COUNT 0x2290 +#define PS_DEPTH_COUNT 0x2350 /* * When currently allocate only one page for pipeline statistics queries. Here @@ -151,41 +138,23 @@ struct gen_perf_query_result { * query. */ uint64_t unslice_frequency[2]; - - /** - * Frequency of the whole GT at the begin and end of the query. - */ - uint64_t gt_frequency[2]; - - /** - * Timestamp of the query. - */ - uint64_t begin_timestamp; - - /** - * Whether the query was interrupted by another workload (aka preemption). 
- */ - bool query_disjoint; }; struct gen_perf_query_counter { const char *name; const char *desc; - const char *symbol_name; - const char *category; enum gen_perf_counter_type type; enum gen_perf_counter_data_type data_type; - enum gen_perf_counter_units units; uint64_t raw_max; size_t offset; union { uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const struct gen_perf_query_result *results); + const uint64_t *accumulator); float (*oa_counter_read_float)(struct gen_perf_config *perf, const struct gen_perf_query_info *query, - const struct gen_perf_query_result *results); + const uint64_t *accumulator); struct gen_pipeline_stat pipeline_stat; }; }; @@ -195,28 +164,13 @@ struct gen_perf_query_register_prog { uint32_t val; }; -/* Register programming for a given query */ -struct gen_perf_registers { - const struct gen_perf_query_register_prog *flex_regs; - uint32_t n_flex_regs; - - const struct gen_perf_query_register_prog *mux_regs; - uint32_t n_mux_regs; - - const struct gen_perf_query_register_prog *b_counter_regs; - uint32_t n_b_counter_regs; -}; - struct gen_perf_query_info { - struct gen_perf_config *perf; - enum gen_perf_query_type { GEN_PERF_QUERY_TYPE_OA, GEN_PERF_QUERY_TYPE_RAW, GEN_PERF_QUERY_TYPE_PIPELINE, } kind; const char *name; - const char *symbol_name; const char *guid; struct gen_perf_query_counter *counters; int n_counters; @@ -233,90 +187,22 @@ struct gen_perf_query_info { int a_offset; int b_offset; int c_offset; - int perfcnt_offset; - int rpstat_offset; - - struct gen_perf_registers config; -}; - -/* When not using the MI_RPC command, this structure describes the list of - * register offsets as well as their storage location so that they can be - * stored through a series of MI_SRM commands and accumulated with - * gen_perf_query_result_accumulate_snapshots(). - */ -struct gen_perf_query_field_layout { - /* Alignment for the layout */ - uint32_t alignment; - - /* Size of the whole layout */ - uint32_t size; - - uint32_t n_fields; - - struct gen_perf_query_field { - /* MMIO location of this register */ - uint16_t mmio_offset; - - /* Location of this register in the storage */ - uint16_t location; - - /* Type of register, for accumulation (see gen_perf_query_info:*_offset - * fields) - */ - enum gen_perf_query_field_type { - GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, - GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, - GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, - GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, - } type; - - /* Index of register in the given type (for instance A31 or B2, - * etc...) - */ - uint8_t index; - - /* 4, 8 or 256 */ - uint16_t size; - - /* If not 0, mask to apply to the register value. */ - uint64_t mask; - } *fields; -}; -struct gen_perf_query_counter_info { - struct gen_perf_query_counter *counter; + /* Register programming for a given query */ + struct gen_perf_query_register_prog *flex_regs; + uint32_t n_flex_regs; - uint64_t query_mask; + struct gen_perf_query_register_prog *mux_regs; + uint32_t n_mux_regs; - /** - * Each counter can be a part of many groups, each time at different index. - * This struct stores one of those locations. - */ - struct { - int group_idx; /* query/group number */ - int counter_idx; /* index inside of query/group */ - } location; + struct gen_perf_query_register_prog *b_counter_regs; + uint32_t n_b_counter_regs; }; struct gen_perf_config { - /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. 
*/ - bool i915_query_supported; - - /* Version of the i915-perf subsystem, refer to i915_drm.h. */ - int i915_perf_version; - - /* Powergating configuration for the running the query. */ - struct drm_i915_gem_context_param_sseu sseu; - struct gen_perf_query_info *queries; int n_queries; - struct gen_perf_query_counter_info *counter_infos; - int n_counters; - - struct gen_perf_query_field_layout query_layout; - /* Variables referenced in the XML meta data for OA performance * counters, e.g in the normalization equations. * @@ -333,7 +219,6 @@ struct gen_perf_config { uint64_t gt_min_freq; /** $GpuMinFrequency */ uint64_t gt_max_freq; /** $GpuMaxFrequency */ uint64_t revision; /** $SkuRevisionId */ - bool query_mode; /** $QueryMode */ } sys_vars; /* OA metric sets, indexed by GUID, as know by Mesa at build time, to @@ -342,17 +227,6 @@ struct gen_perf_config { */ struct hash_table *oa_metrics_table; - /* When MDAPI hasn't configured the metric we need to use by the time the - * query begins, this OA metric is used as a fallback. - */ - uint64_t fallback_raw_oa_metric; - - /* Whether we have support for this platform. If true && n_queries == 0, - * this means we will not be able to use i915-perf because of it is in - * paranoid mode. - */ - bool platform_supported; - /* Location of the device's sysfs entry. */ char sysfs_dev_dir[256]; @@ -364,96 +238,41 @@ struct gen_perf_config { bool (*batch_references)(void *batch, void *bo); void (*bo_wait_rendering)(void *bo); int (*bo_busy)(void *bo); - void (*emit_stall_at_pixel_scoreboard)(void *ctx); + void (*emit_mi_flush)(void *ctx); void (*emit_mi_report_perf_count)(void *ctx, void *bo, uint32_t offset_in_bytes, uint32_t report_id); void (*batchbuffer_flush)(void *ctx, const char *file, int line); - void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset); + void (*capture_frequency_stat_register)(void *ctx, void *bo, + uint32_t bo_offset); + void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset); } vtbl; }; -struct gen_perf_counter_pass { - struct gen_perf_query_info *query; - struct gen_perf_query_counter *counter; - uint32_t pass; -}; +struct gen_perf_query_object; +const struct gen_perf_query_info* gen_perf_query_info(const struct gen_perf_query_object *); + +struct gen_perf_context; +struct gen_perf_context *gen_perf_new_context(void *parent); void gen_perf_init_metrics(struct gen_perf_config *perf_cfg, const struct gen_device_info *devinfo, - int drm_fd, - bool include_pipeline_statistics); - -/** Query i915 for a metric id using guid. - */ -bool gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, - const char *guid, - uint64_t *metric_id); - -/** Load a configuation's content from i915 using a guid. - */ -struct gen_perf_registers *gen_perf_load_configuration(struct gen_perf_config *perf_cfg, - int fd, const char *guid); - -/** Store a configuration into i915 using guid and return a new metric id. - * - * If guid is NULL, then a generated one will be provided by hashing the - * content of the configuration. - */ -uint64_t gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, - const struct gen_perf_registers *config, - const char *guid); - -/** Read the slice/unslice frequency from 2 OA reports and store then into - * result. 
- */ -void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end); - -/** Store the GT frequency as reported by the RPSTAT register. - */ -void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t start, - const uint32_t end); - -/** Store PERFCNT registers values. - */ -void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint64_t *start, - const uint64_t *end); - -/** Accumulate the delta between 2 OA reports into result for a given query. - */ -void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end); - -/** Accumulate the delta between 2 snapshots of OA perf registers (layout - * should match description specified through gen_perf_query_register_layout). - */ -void gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *start, - const void *end, - bool no_oa_accumulate); + int drm_fd); +void gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd); -void gen_perf_query_result_clear(struct gen_perf_query_result *result); +struct gen_perf_config *gen_perf_config(struct gen_perf_context *ctx); -/** Debug helper printing out query data. - */ -void gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, - const struct gen_device_info *devinfo, - const void *data); +int gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query); static inline size_t gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter) @@ -481,33 +300,31 @@ gen_perf_new(void *ctx) return perf; } -/** Whether we have the ability to hold off preemption on a batch so we don't - * have to look at the OA buffer to subtract unrelated workloads off the - * values captured through MI_* commands. - */ -static inline bool -gen_perf_has_hold_preemption(const struct gen_perf_config *perf) -{ - return perf->i915_perf_version >= 3; -} - -/** Whether we have the ability to lock EU array power configuration for the - * duration of the performance recording. This is useful on Gfx11 where the HW - * architecture requires half the EU for particular workloads. 
- */ -static inline bool -gen_perf_has_global_sseu(const struct gen_perf_config *perf) -{ - return perf->i915_perf_version >= 4; -} - -uint32_t gen_perf_get_n_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_query_info **pass_queries); -void gen_perf_get_counters_passes(struct gen_perf_config *perf, - const uint32_t *counter_indices, - uint32_t counter_indices_count, - struct gen_perf_counter_pass *counter_pass); +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *, unsigned query_index); + + +bool gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +bool gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +void gen_perf_delete_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written); + +void gen_perf_dump_query_count(struct gen_perf_context *perf_ctx); +void gen_perf_dump_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj, + void *current_batch); #endif /* GEN_PERF_H */ diff --git a/lib/mesa/src/intel/perf/gen_perf_mdapi.c b/lib/mesa/src/intel/perf/gen_perf_mdapi.c index 5508baba5..38ca23088 100644 --- a/lib/mesa/src/intel/perf/gen_perf_mdapi.c +++ b/lib/mesa/src/intel/perf/gen_perf_mdapi.c @@ -23,23 +23,18 @@ #include "gen_perf.h" #include "gen_perf_mdapi.h" -#include "gen_perf_private.h" -#include "gen_perf_regs.h" #include "dev/gen_device_info.h" -#include <drm-uapi/i915_drm.h> - - int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_device_info *devinfo, - const struct gen_perf_query_info *query, - const struct gen_perf_query_result *result) + const struct gen_perf_query_result *result, + uint64_t freq_start, uint64_t freq_end) { - switch (devinfo->ver) { + switch (devinfo->gen) { case 7: { - struct gfx7_mdapi_metrics *mdapi_data = (struct gfx7_mdapi_metrics *) data; + struct gen7_mdapi_metrics *mdapi_data = (struct gen7_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -54,19 +49,15 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; - mdapi_data->SplitOccured = result->query_disjoint; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; return sizeof(*mdapi_data); } case 8: { - struct gfx8_mdapi_metrics *mdapi_data = (struct gfx8_mdapi_metrics *) data; + struct gen8_mdapi_metrics *mdapi_data = (struct gen8_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -78,29 +69,23 @@ 
gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->BeginTimestamp = - gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; - mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } case 9: - case 11: - case 12:{ - struct gfx9_mdapi_metrics *mdapi_data = (struct gfx9_mdapi_metrics *) data; + case 10: + case 11: { + struct gen9_mdapi_metrics *mdapi_data = (struct gen9_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) return 0; @@ -112,257 +97,20 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i]; } - mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0]; - mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1]; - mdapi_data->ReportId = result->hw_id; mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); - mdapi_data->BeginTimestamp = - gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; - mdapi_data->CoreFrequency = result->gt_frequency[1]; - mdapi_data->CoreFrequencyChanged = result->gt_frequency[1] != result->gt_frequency[0]; + mdapi_data->CoreFrequency = freq_end; + mdapi_data->CoreFrequencyChanged = freq_end != freq_start; mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; - mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } default: unreachable("unexpected gen"); } } - -void -gen_perf_register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) -{ - if (!(devinfo->ver >= 7 && devinfo->ver <= 12)) - return; - - struct gen_perf_query_info *query = - gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); - - query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; - query->name = "Intel_Raw_Pipeline_Statistics_Query"; - - /* The order has to match mdapi_pipeline_metrics. 
*/ - gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - if (devinfo->is_haswell || devinfo->ver == 8) { - gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); - } else { - gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); - } - gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - if (devinfo->ver >= 7) { - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); - } - - if (devinfo->ver >= 10) { - /* Reuse existing CS invocation register until we can expose this new - * one. - */ - gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "Reserved1"); - } - - query->data_size = sizeof(uint64_t) * query->n_counters; -} - -static void -fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, - const char *name, - uint32_t data_offset, - uint32_t data_size, - enum gen_perf_counter_data_type data_type) -{ - struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; - - assert(query->n_counters <= query->max_counters); - - counter->name = name; - counter->desc = "Raw counter value"; - counter->type = GEN_PERF_COUNTER_TYPE_RAW; - counter->data_type = data_type; - counter->offset = data_offset; - - query->n_counters++; - - assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); -} - -#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ - fill_mdapi_perf_query_counter(query, #field_name, \ - (uint8_t *) &struct_name.field_name - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) -#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ - fill_mdapi_perf_query_counter(query, \ - ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ - (uint8_t *) &struct_name.field_name[idx] - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name[0]), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) - -void -gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf, - const struct gen_device_info *devinfo) -{ - struct gen_perf_query_info *query = NULL; - - /* MDAPI requires different structures for pretty much every generation - * (right now we have definitions for gen 7 to 12). 
- */ - if (!(devinfo->ver >= 7 && devinfo->ver <= 12)) - return; - - switch (devinfo->ver) { - case 7: { - query = gen_perf_append_query_info(perf, 1 + 45 + 16 + 7); - query->oa_format = I915_OA_FORMAT_A45_B8_C8; - - struct gfx7_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, ACounters, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NOACounters, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 8: { - query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gfx8_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 9: - case 11: - case 12: { - query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gfx9_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < 
ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, UserCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); - break; - } - default: - unreachable("Unsupported gen"); - break; - } - - query->kind = GEN_PERF_QUERY_TYPE_RAW; - query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; - query->guid = GEN_PERF_QUERY_GUID_MDAPI; - - { - /* Accumulation buffer offsets copied from an actual query... */ - const struct gen_perf_query_info *copy_query = - &perf->queries[0]; - - query->gpu_time_offset = copy_query->gpu_time_offset; - query->gpu_clock_offset = copy_query->gpu_clock_offset; - query->a_offset = copy_query->a_offset; - query->b_offset = copy_query->b_offset; - query->c_offset = copy_query->c_offset; - query->perfcnt_offset = copy_query->perfcnt_offset; - } -} diff --git a/lib/mesa/src/intel/perf/gen_perf_mdapi.h b/lib/mesa/src/intel/perf/gen_perf_mdapi.h index 4e77e2beb..3c3aec2c6 100644 --- a/lib/mesa/src/intel/perf/gen_perf_mdapi.h +++ b/lib/mesa/src/intel/perf/gen_perf_mdapi.h @@ -26,8 +26,7 @@ #include <stdint.h> -#include "dev/gen_device_info.h" - +struct gen_device_info; struct gen_perf_query_result; /* Guid has to matches with MDAPI's. */ @@ -37,7 +36,7 @@ struct gen_perf_query_result; * Data format expected by MDAPI. 
*/ -struct gfx7_mdapi_metrics { +struct gen7_mdapi_metrics { uint64_t TotalTime; uint64_t ACounters[45]; @@ -55,7 +54,7 @@ struct gfx7_mdapi_metrics { #define GTDI_QUERY_BDW_METRICS_OA_COUNT 36 #define GTDI_QUERY_BDW_METRICS_OA_40b_COUNT 32 #define GTDI_QUERY_BDW_METRICS_NOA_COUNT 16 -struct gfx8_mdapi_metrics { +struct gen8_mdapi_metrics { uint64_t TotalTime; uint64_t GPUTicks; uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; @@ -81,7 +80,7 @@ struct gfx8_mdapi_metrics { #define GTDI_MAX_READ_REGS 16 -struct gfx9_mdapi_metrics { +struct gen9_mdapi_metrics { uint64_t TotalTime; uint64_t GPUTicks; uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; @@ -110,7 +109,8 @@ struct gfx9_mdapi_metrics { }; /* Add new definition */ -#define gfx11_mdapi_metrics gfx9_mdapi_metrics +#define gen10_mdapi_metrics gen9_mdapi_metrics +#define gen11_mdapi_metrics gen9_mdapi_metrics struct mdapi_pipeline_metrics { uint64_t IAVertices; @@ -124,37 +124,12 @@ struct mdapi_pipeline_metrics { uint64_t HSInvocations; uint64_t DSInvocations; uint64_t CSInvocations; - uint64_t Reserved1; /* Gfx10+ */ + uint64_t Reserved1; /* Gen10+ */ }; int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_device_info *devinfo, - const struct gen_perf_query_info *query, - const struct gen_perf_query_result *result); - -static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size, - const struct gen_device_info *devinfo, - uint64_t value) -{ - switch (devinfo->ver) { - case 8: { - if (data_size < sizeof(struct gfx8_mdapi_metrics)) - return; - struct gfx8_mdapi_metrics *mdapi_data = data; - mdapi_data->MarkerUser = value; - break; - } - case 9: - case 11: { - if (data_size < sizeof(struct gfx9_mdapi_metrics)) - return; - struct gfx9_mdapi_metrics *mdapi_data = data; - mdapi_data->MarkerUser = value; - break; - } - default: - break; - } -} + const struct gen_perf_query_result *result, + uint64_t freq_start, uint64_t freq_end); #endif /* GEN_PERF_MDAPI_H */ diff --git a/lib/mesa/src/loader/Android.mk b/lib/mesa/src/loader/Android.mk index 6aaaa1dac..ca9218846 100644 --- a/lib/mesa/src/loader/Android.mk +++ b/lib/mesa/src/loader/Android.mk @@ -35,10 +35,6 @@ LOCAL_SRC_FILES := \ LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH) -ifneq ($(HAVE_GALLIUM_IRIS),) -LOCAL_CFLAGS += -DPREFER_IRIS -endif - LOCAL_MODULE := libmesa_loader include $(MESA_COMMON_MK) diff --git a/lib/mesa/src/mesa/Android.gen.mk b/lib/mesa/src/mesa/Android.gen.mk index ae79a7cc0..ff4f5e4e4 100644 --- a/lib/mesa/src/mesa/Android.gen.mk +++ b/lib/mesa/src/mesa/Android.gen.mk @@ -36,17 +36,11 @@ sources := \ main/dispatch.h \ main/format_fallback.c \ main/format_pack.c \ + main/format_unpack.c \ main/format_info.h \ main/remap_helper.h \ main/get_hash.h \ - main/marshal_generated0.c \ - main/marshal_generated1.c \ - main/marshal_generated2.c \ - main/marshal_generated3.c \ - main/marshal_generated4.c \ - main/marshal_generated5.c \ - main/marshal_generated6.c \ - main/marshal_generated7.c \ + main/marshal_generated.c \ main/marshal_generated.h LOCAL_SRC_FILES := $(filter-out $(sources), $(LOCAL_SRC_FILES)) @@ -93,52 +87,10 @@ $(intermediates)/main/api_exec.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml $(intermediates)/main/api_exec.c: $(dispatch_deps) $(call es-gen) -$(intermediates)/main/marshal_generated0.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated0.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 0 -n 8 +$(intermediates)/main/marshal_generated.c: 
PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py +$(intermediates)/main/marshal_generated.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -$(intermediates)/main/marshal_generated0.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated1.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated1.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 1 -n 8 - -$(intermediates)/main/marshal_generated1.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated2.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated2.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 2 -n 8 - -$(intermediates)/main/marshal_generated2.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated3.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated3.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 3 -n 8 - -$(intermediates)/main/marshal_generated3.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated4.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated4.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 4 -n 8 - -$(intermediates)/main/marshal_generated4.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated5.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated5.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 5 -n 8 - -$(intermediates)/main/marshal_generated5.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated6.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated6.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 6 -n 8 - -$(intermediates)/main/marshal_generated6.c: $(dispatch_deps) - $(call es-gen) - -$(intermediates)/main/marshal_generated7.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal.py -$(intermediates)/main/marshal_generated7.c: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml -i 7 -n 8 - -$(intermediates)/main/marshal_generated7.c: $(dispatch_deps) +$(intermediates)/main/marshal_generated.c: $(dispatch_deps) $(call es-gen) $(intermediates)/main/marshal_generated.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(glapi)/gl_marshal_h.py @@ -187,3 +139,14 @@ $(intermediates)/main/format_pack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_ $(intermediates)/main/format_pack.c: PRIVATE_XML := $(intermediates)/main/format_pack.c: $(format_pack_deps) $(call es-gen, $<) + +FORMAT_UNPACK := $(LOCAL_PATH)/main/format_unpack.py +format_unpack_deps := \ + $(LOCAL_PATH)/main/formats.csv \ + $(LOCAL_PATH)/main/format_parser.py \ + $(FORMAT_UNPACK) + +$(intermediates)/main/format_unpack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_UNPACK) +$(intermediates)/main/format_unpack.c: PRIVATE_XML := +$(intermediates)/main/format_unpack.c: $(format_unpack_deps) + $(call es-gen, $<) diff --git a/lib/mesa/src/mesa/Android.libmesa_dricore.mk b/lib/mesa/src/mesa/Android.libmesa_dricore.mk index 8eb6aabe8..792117767 100644 --- a/lib/mesa/src/mesa/Android.libmesa_dricore.mk +++ b/lib/mesa/src/mesa/Android.libmesa_dricore.mk @@ -39,9 +39,11 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := \ $(MESA_FILES) +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 +endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := 
\ diff --git a/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk b/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk index 66b6ef13a..0d83cd5a9 100644 --- a/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk +++ b/lib/mesa/src/mesa/Android.libmesa_glsl_utils.mk @@ -43,6 +43,7 @@ LOCAL_C_INCLUDES := \ LOCAL_SRC_FILES := \ main/extensions_table.c \ + main/imports.c \ program/symbol_table.c \ program/dummy_errors.c @@ -67,6 +68,7 @@ LOCAL_C_INCLUDES := \ LOCAL_SRC_FILES := \ main/extensions_table.c \ + main/imports.c \ program/symbol_table.c \ program/dummy_errors.c diff --git a/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk b/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk index 16153a3c5..ddfd03059 100644 --- a/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk +++ b/lib/mesa/src/mesa/Android.libmesa_st_mesa.mk @@ -42,9 +42,11 @@ LOCAL_GENERATED_SOURCES := \ $(MESA_GEN_GLSL_H) \ $(MESA_GEN_NIR_H) +ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 +endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := \ diff --git a/lib/mesa/src/mesa/drivers/dri/Android.mk b/lib/mesa/src/mesa/drivers/dri/Android.mk index fb7d97f1a..60c8476a3 100644 --- a/lib/mesa/src/mesa/drivers/dri/Android.mk +++ b/lib/mesa/src/mesa/drivers/dri/Android.mk @@ -51,7 +51,6 @@ MESA_DRI_SHARED_LIBRARIES := \ libdl \ libglapi \ liblog \ - libsync \ libz # If Android version >=8 MESA should static link libexpat else should dynamic link diff --git a/lib/mesa/src/mesa/drivers/dri/i965/Android.mk b/lib/mesa/src/mesa/drivers/dri/i965/Android.mk index 816492581..29b46147f 100644 --- a/lib/mesa/src/mesa/drivers/dri/i965/Android.mk +++ b/lib/mesa/src/mesa/drivers/dri/i965/Android.mk @@ -40,201 +40,223 @@ I965_PERGEN_STATIC_LIBRARIES := \ libmesa_nir I965_PERGEN_LIBS := \ - libmesa_i965_gfx4 \ - libmesa_i965_gfx45 \ - libmesa_i965_gfx5 \ - libmesa_i965_gfx6 \ - libmesa_i965_gfx7 \ - libmesa_i965_gfx75 \ - libmesa_i965_gfx8 \ - libmesa_i965_gfx9 \ - libmesa_i965_gfx11 + libmesa_i965_gen4 \ + libmesa_i965_gen45 \ + libmesa_i965_gen5 \ + libmesa_i965_gen6 \ + libmesa_i965_gen7 \ + libmesa_i965_gen75 \ + libmesa_i965_gen8 \ + libmesa_i965_gen9 \ + libmesa_i965_gen10 \ + libmesa_i965_gen11 # --------------------------------------- -# Build libmesa_i965_gfx4 +# Build libmesa_i965_gen4 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx4 +LOCAL_MODULE := libmesa_i965_gen4 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx4_FILES) +LOCAL_SRC_FILES := $(i965_gen4_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=40 +LOCAL_CFLAGS := -DGEN_VERSIONx10=40 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx45 +# Build libmesa_i965_gen45 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx45 +LOCAL_MODULE := libmesa_i965_gen45 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx45_FILES) +LOCAL_SRC_FILES := $(i965_gen45_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=45 +LOCAL_CFLAGS := -DGEN_VERSIONx10=45 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx5 +# Build 
libmesa_i965_gen5 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx5 +LOCAL_MODULE := libmesa_i965_gen5 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx5_FILES) +LOCAL_SRC_FILES := $(i965_gen5_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=50 +LOCAL_CFLAGS := -DGEN_VERSIONx10=50 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx6 +# Build libmesa_i965_gen6 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx6 +LOCAL_MODULE := libmesa_i965_gen6 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx6_FILES) +LOCAL_SRC_FILES := $(i965_gen6_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=60 +LOCAL_CFLAGS := -DGEN_VERSIONx10=60 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx7 +# Build libmesa_i965_gen7 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx7 +LOCAL_MODULE := libmesa_i965_gen7 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx7_FILES) +LOCAL_SRC_FILES := $(i965_gen7_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=70 +LOCAL_CFLAGS := -DGEN_VERSIONx10=70 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx75 +# Build libmesa_i965_gen75 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx75 +LOCAL_MODULE := libmesa_i965_gen75 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx75_FILES) +LOCAL_SRC_FILES := $(i965_gen75_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=75 +LOCAL_CFLAGS := -DGEN_VERSIONx10=75 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx8 +# Build libmesa_i965_gen8 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx8 +LOCAL_MODULE := libmesa_i965_gen8 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx8_FILES) +LOCAL_SRC_FILES := $(i965_gen8_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=80 +LOCAL_CFLAGS := -DGEN_VERSIONx10=80 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # --------------------------------------- -# Build libmesa_i965_gfx9 +# Build libmesa_i965_gen9 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx9 +LOCAL_MODULE := libmesa_i965_gen9 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx9_FILES) +LOCAL_SRC_FILES := $(i965_gen9_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=90 +LOCAL_CFLAGS := -DGEN_VERSIONx10=90 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) # 
--------------------------------------- -# Build libmesa_i965_gfx11 +# Build libmesa_i965_gen10 # --------------------------------------- include $(CLEAR_VARS) -LOCAL_MODULE := libmesa_i965_gfx11 +LOCAL_MODULE := libmesa_i965_gen10 LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) -LOCAL_SRC_FILES := $(i965_gfx11_FILES) +LOCAL_SRC_FILES := $(i965_gen10_FILES) LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) -LOCAL_CFLAGS := -DGFX_VERx10=110 +LOCAL_CFLAGS := -DGEN_VERSIONx10=100 + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + +# --------------------------------------- +# Build libmesa_i965_gen11 +# --------------------------------------- + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_i965_gen11 + +LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES) + +LOCAL_SRC_FILES := $(i965_gen11_FILES) + +LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES) + +LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES) + +LOCAL_CFLAGS := -DGEN_VERSIONx10=110 include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/mesa/program/Android.mk b/lib/mesa/src/mesa/program/Android.mk index 6b4e19167..c6470e628 100644 --- a/lib/mesa/src/mesa/program/Android.mk +++ b/lib/mesa/src/mesa/program/Android.mk @@ -23,7 +23,7 @@ LOCAL_PATH := $(call my-dir) define local-l-to-c @mkdir -p $(dir $@) @echo "Mesa Lex: $(PRIVATE_MODULE) <= $<" - $(hide) $(MESA_LEX) -o$@ $< + $(hide) $(LEX) -o$@ $< endef define mesa_local-y-to-c-and-h diff --git a/lib/mesa/src/panfrost/Android.mk b/lib/mesa/src/panfrost/Android.mk index 0681651ab..9ab5ddf9f 100644 --- a/lib/mesa/src/panfrost/Android.mk +++ b/lib/mesa/src/panfrost/Android.mk @@ -25,8 +25,4 @@ LOCAL_PATH := $(call my-dir) include $(LOCAL_PATH)/Makefile.sources -include $(LOCAL_PATH)/Android.util.mk -include $(LOCAL_PATH)/Android.bifrost.mk -include $(LOCAL_PATH)/Android.lib.mk -include $(LOCAL_PATH)/Android.midgard.mk include $(LOCAL_PATH)/Android.shared.mk diff --git a/lib/mesa/src/panfrost/Android.shared.mk b/lib/mesa/src/panfrost/Android.shared.mk index 81024607e..6b921756e 100644 --- a/lib/mesa/src/panfrost/Android.shared.mk +++ b/lib/mesa/src/panfrost/Android.shared.mk @@ -33,7 +33,7 @@ LOCAL_SRC_FILES := \ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary/ \ - $(MESA_TOP)/src/gallium/include/ + $(MESA_TOP)/src/gallium/include/ \ LOCAL_STATIC_LIBRARIES := \ diff --git a/lib/mesa/src/panfrost/Makefile.sources b/lib/mesa/src/panfrost/Makefile.sources index 2f55e07db..3ab90f279 100644 --- a/lib/mesa/src/panfrost/Makefile.sources +++ b/lib/mesa/src/panfrost/Makefile.sources @@ -1,121 +1,3 @@ -bifrost_FILES := \ - bifrost/bifrost.h \ - bifrost/bifrost_compile.c \ - bifrost/bifrost_compile.h \ - bifrost/bi_layout.c \ - bifrost/bi_liveness.c \ - bifrost/bi_lower_swizzle.c \ - bifrost/bi_schedule.c \ - bifrost/bi_scoreboard.c \ - bifrost/bi_pack.c \ - bifrost/bi_print.c \ - bifrost/bi_print.h \ - bifrost/bi_ra.c \ - bifrost/bi_opt_copy_prop.c \ - bifrost/bi_opt_dce.c \ - bifrost/bi_opt_push_ubo.c \ - bifrost/bi_quirks.h \ - bifrost/bi_test_pack.c \ - bifrost/bir.c \ - bifrost/compiler.h \ - bifrost/cmdline.c - -bifrost_disasm_FILES := \ - bifrost/disassemble.c \ - bifrost/disassemble.h \ - bifrost/bi_print_common.c \ - bifrost/bi_print_common.h - -lib_FILES := \ - lib/decode_common.c \ - lib/decode.c \ - lib/pan_afbc.c \ - lib/pan_attributes.c \ - lib/pan_bo.c \ - lib/pan_bo.h \ - lib/pan_blend.c \ - lib/pan_blend.h \ - lib/pan_blitter.c \ - lib/pan_blitter.h \ 
-	lib/pan_cs.c \
-	lib/pan_cs.h \
-	lib/pan_device.h \
-	lib/pan_encoder.h \
-	lib/pan_format.c \
-	lib/pan_indirect_draw.c \
-	lib/pan_indirect_draw.h \
-	lib/pan_invocation.c \
-	lib/pan_pool.c \
-	lib/pan_pool.h \
-	lib/pan_props.c \
-	lib/pan_sampler.c \
-	lib/pan_samples.c \
-	lib/pan_shader.c \
-	lib/pan_shader.h \
-	lib/pan_scoreboard.c \
-	lib/pan_scoreboard.h \
-	lib/pan_tiler.c \
-	lib/pan_texture.c \
-	lib/pan_scratch.c \
-	lib/pan_util.h
-
-midgard_FILES := \
-	midgard/compiler.h \
-	midgard/disassemble.c \
-	midgard/disassemble.h \
-	midgard/helpers.h \
-	midgard/midgard_address.c \
-	midgard/midgard_compile.c \
-	midgard/midgard_compile.h \
-	midgard/midgard_derivatives.c \
-	midgard/midgard_emit.c \
-	midgard/midgard.h \
-	midgard/midgard_liveness.c \
-	midgard/midgard_nir_lower_helper_writes.c \
-	midgard/midgard_helper_invocations.c \
-	midgard/midgard_nir.h \
-	midgard/midgard_nir_lower_image_bitsize.c \
-	midgard/midgard_ops.c \
-	midgard/midgard_ops.h \
-	midgard/midgard_opt_copy_prop.c \
-	midgard/midgard_opt_dce.c \
-	midgard/midgard_opt_perspective.c \
-	midgard/midgard-parse.h \
-	midgard/midgard_print.c \
-	midgard/midgard_ra.c \
-	midgard/midgard_ra_pipeline.c \
-	midgard/midgard_schedule.c \
-	midgard/midgard_errata_lod.c \
-	midgard/mir.c \
-	midgard/mir_promote_uniforms.c \
-	midgard/mir_squeeze.c \
-	midgard/nir_fuse_io_16.c \
-
-midgard_disasm_FILES := \
-	midgard/disassemble.c \
-	midgard/disassemble.h \
-	midgard/midgard_ops.c \
-	midgard/midgard_ops.h \
-	midgard/midgard_print_constant.c
-
shared_FILES := \
-	shared/pan_minmax_cache.c \
	shared/pan_tiling.c \
-	shared/pan_minmax_cache.h \
-	shared/pan_tiling.h \
-
-util_FILES := \
-	util/lcra.c \
-	util/lcra.h \
-	util/nir_lower_blend.c \
-	util/nir_lower_blend.h \
-	util/nir_mod_helpers.c \
-	util/pan_ir.c \
-	util/pan_ir.h \
-	util/pan_liveness.c \
-	util/pan_lower_framebuffer.c \
-	util/pan_lower_helper_invocation.c \
-	util/pan_lower_sample_position.c \
-	util/pan_lower_writeout.c \
-	util/pan_lower_64bit_intrin.c \
-	util/pan_sysval.c \
+	shared/pan_tiling.h
diff --git a/lib/mesa/src/util/Android.mk b/lib/mesa/src/util/Android.mk
index 829699db6..6d770ca95 100644
--- a/lib/mesa/src/util/Android.mk
+++ b/lib/mesa/src/util/Android.mk
@@ -34,21 +34,12 @@ LOCAL_SRC_FILES := \
	$(MESA_UTIL_FILES) \
	$(XMLCONFIG_FILES)

-LOCAL_MODULE := libmesa_util
-
-LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-
-intermediates := $(call local-generated-sources-dir)
-
LOCAL_C_INCLUDES := \
	external/zlib \
	$(MESA_TOP)/src/mesa \
	$(MESA_TOP)/src/mapi \
	$(MESA_TOP)/src/gallium/include \
-	$(MESA_TOP)/src/gallium/auxiliary \
-	$(MESA_TOP)/src/util/format \
-	$(intermediates)/util/format \
-	$(intermediates)
+	$(MESA_TOP)/src/gallium/auxiliary

# If Android version >=8 MESA should static link libexpat else should dynamic link
ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
@@ -59,41 +50,70 @@ LOCAL_SHARED_LIBRARIES := \
	libexpat
endif

-LOCAL_SHARED_LIBRARIES += liblog libsync libcutils
+LOCAL_MODULE := libmesa_util

# Generated sources

-LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
-
-# Some sources do require "util/format/u_format_pack.h" generated header
-UTIL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(subst format/u_format_pack.h,util/format/u_format_pack.h,$(MESA_UTIL_GENERATED_FILES)))
-LOCAL_GENERATED_SOURCES := $(UTIL_GENERATED_SOURCES)
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES

-driconf_static_gen := $(LOCAL_PATH)/driconf_static.py
-driconf_static_deps := $(LOCAL_PATH)/00-mesa-defaults.conf
+intermediates := $(call local-generated-sources-dir)

-$(intermediates)/driconf_static.h: $(driconf_static_deps)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(driconf_static_gen) $^ $@
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)

-format_srgb_gen := $(LOCAL_PATH)/format_srgb.py
+UTIL_GENERATED_SOURCES := $(addprefix $(intermediates)/,$(MESA_UTIL_GENERATED_FILES))
+LOCAL_GENERATED_SOURCES := $(UTIL_GENERATED_SOURCES)

-$(intermediates)/format_srgb.c: $(format_srgb_gen)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(format_srgb_gen) $< > $@
+MESA_DRI_OPTIONS_H := $(intermediates)/xmlpool/options.h
+LOCAL_GENERATED_SOURCES += $(MESA_DRI_OPTIONS_H)

-u_format_gen := $(LOCAL_PATH)/format/u_format_table.py
-u_format_deps := $(LOCAL_PATH)/format/u_format.csv \
-	$(LOCAL_PATH)/format/u_format_pack.py \
-	$(LOCAL_PATH)/format/u_format_parse.py
+#
+# Generate options.h from gettext translations.
+#

-$(intermediates)/util/format/u_format_pack.h: $(u_format_deps)
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(u_format_gen) --header $< > $@
+MESA_DRI_OPTIONS_LANGS := de es nl fr sv
+POT := $(intermediates)/xmlpool.pot

-$(intermediates)/format/u_format_table.c: $(u_format_deps)
+$(POT): $(LOCAL_PATH)/xmlpool/t_options.h
	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $(u_format_gen) $< > $@
+	xgettext -L C --from-code utf-8 -o $@ $<
+
+$(MESA_DRI_OPTIONS_LANGS:%=$(intermediates)/xmlpool/%.po): $(intermediates)/xmlpool/%.po: $(LOCAL_PATH)/xmlpool/%.po $(POT)
+	lang=$(basename $(notdir $@)); \
+	mkdir -p $(dir $@); \
+	if [ -f $< ]; then \
+		msgmerge -o $@ $^; \
+	else \
+		msginit -i $(POT) \
+			-o $@ \
+			--locale=$$lang \
+			--no-translator; \
+		sed -i -e 's/charset=.*\\n/charset=UTF-8\\n/' $@; \
+	fi
+
+PRIVATE_SCRIPT := $(LOCAL_PATH)/xmlpool/gen_xmlpool.py
+PRIVATE_LOCALEDIR := $(intermediates)/xmlpool
+PRIVATE_TEMPLATE_HEADER := $(LOCAL_PATH)/xmlpool/t_options.h
+PRIVATE_MO_FILES := $(MESA_DRI_OPTIONS_LANGS:%=$(intermediates)/xmlpool/%.gmo)
+
+LOCAL_GENERATED_SOURCES += $(PRIVATE_MO_FILES)
+
+$(LOCAL_GENERATED_SOURCES): PRIVATE_PYTHON := $(MESA_PYTHON2)
+
+$(PRIVATE_MO_FILES): $(intermediates)/xmlpool/%.gmo: $(intermediates)/xmlpool/%.po
+	mkdir -p $(dir $@)
+	msgfmt -o $@ $<
+
+$(UTIL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@
+$(UTIL_GENERATED_SOURCES): $(intermediates)/%.c: $(LOCAL_PATH)/%.py
+	$(transform-generated-source)
+
+$(MESA_DRI_OPTIONS_H): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $< \
+		--template $(PRIVATE_TEMPLATE_HEADER) \
+		--output $@ \
+		--localedir $(PRIVATE_LOCALEDIR) \
+		--languages $(MESA_DRI_OPTIONS_LANGS)
+$(MESA_DRI_OPTIONS_H): $(PRIVATE_SCRIPT) $(PRIVATE_TEMPLATE_HEADER) $(PRIVATE_MO_FILES)
+	$(transform-generated-source)

include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/vulkan/Android.mk b/lib/mesa/src/vulkan/Android.mk
index 295c57d3c..71aa5e5f0 100644
--- a/lib/mesa/src/vulkan/Android.mk
+++ b/lib/mesa/src/vulkan/Android.mk
@@ -37,9 +37,7 @@ intermediates := $(call local-generated-sources-dir)

LOCAL_C_INCLUDES := \
	$(MESA_TOP)/include/vulkan \
-	$(MESA_TOP)/src/vulkan/util \
-	$(MESA_TOP)/src/gallium/include \
-	$(intermediates)/util \
+	$(MESA_TOP)/src/vulkan/util

ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
LOCAL_C_INCLUDES += \
@@ -56,7 +54,7 @@ LOCAL_SRC_FILES := $(VULKAN_UTIL_FILES) $(VULKAN_WSI_FILES)

vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml

-$(intermediates)/util/vk_enum_to_str.c: $(MESA_TOP)/src/vulkan/util/gen_enum_to_str.py \
+$(firstword $(LOCAL_GENERATED_SOURCES)): $(MESA_TOP)/src/vulkan/util/gen_enum_to_str.py \
		$(vulkan_api_xml)
	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
	@mkdir -p $(dir $@)
@@ -64,50 +62,7 @@ $(intermediates)/util/vk_enum_to_str.c: $(MESA_TOP)/src/vulkan/util/gen_enum_to_
		--xml $(vulkan_api_xml) \
		--outdir $(dir $@)

-$(intermediates)/util/vk_enum_to_str.h: $(intermediates)/util/vk_enum_to_str.c
-
-$(intermediates)/util/vk_common_entrypoints.c: $(MESA_TOP)/src/vulkan/util/vk_entrypoints_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--proto --weak --prefix vk_common \
-		--out-c $@ --out-h $(dir $@)/vk_common_entrypoints.h
-
-$(intermediates)/util/vk_common_entrypoints.h: $(intermediates)/util/vk_common_entrypoints.c
-
-$(intermediates)/util/vk_dispatch_table.c: $(MESA_TOP)/src/vulkan/util/vk_dispatch_table_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-c $@
-
-$(intermediates)/util/vk_dispatch_table.h: $(MESA_TOP)/src/vulkan/util/vk_dispatch_table_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-h $@
-
-$(intermediates)/util/vk_extensions.c: $(MESA_TOP)/src/vulkan/util/vk_extensions_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-c $@
-
-$(intermediates)/util/vk_extensions.h: $(MESA_TOP)/src/vulkan/util/vk_extensions_gen.py \
-		$(vulkan_api_xml)
-	@echo "target Generated: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	@mkdir -p $(dir $@)
-	$(hide) $(MESA_PYTHON2) $< \
-		--xml $(vulkan_api_xml) \
-		--out-h $@
+$(lastword $(LOCAL_GENERATED_SOURCES)): $(firstword $(LOCAL_GENERATED_SOURCES))

LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)/util