author     Jonathan Gray <jsg@cvs.openbsd.org>    2017-08-26 16:59:42 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2017-08-26 16:59:42 +0000
commit     81ece42815e80818f160cdd85fab57d65b56ad15 (patch)
tree       1059ff094da1aa50334115952fcb1cfcbda3acc6 /lib/mesa/src/gallium/drivers/radeon
parent     b0244145d5bb49623d58f6b5cab8143ada692b60 (diff)
Revert to Mesa 13.0.6 to hopefully address rendering issues a handful of
people have reported with xpdf/fvwm on ivy bridge with modesetting driver.
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeon')
22 files changed, 5757 insertions, 1886 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
index 13d8976de..a6fc145cb 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM
 
 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)
 
 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)
 
 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.in b/lib/mesa/src/gallium/drivers/radeon/Makefile.in
index f9faa3eef..d720beb87 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.in
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.in
@@ -54,18 +54,19 @@ target_triplet = @target@
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
 	$(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \
 	$(top_srcdir)/src/gallium/Automake.inc
-@HAVE_LIBDRM_TRUE@am__append_1 = \
-@HAVE_LIBDRM_TRUE@	$(LIBDRM_LIBS)
-
-@HAVE_DRISW_TRUE@am__append_2 = \
+@HAVE_DRISW_TRUE@am__append_1 = \
 @HAVE_DRISW_TRUE@	$(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la
 
-@HAVE_DRISW_KMS_TRUE@am__append_3 = \
+@HAVE_DRISW_KMS_TRUE@am__append_2 = \
 @HAVE_DRISW_KMS_TRUE@	$(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \
 @HAVE_DRISW_KMS_TRUE@	$(LIBDRM_LIBS)
 
-@HAVE_GALLIUM_LLVM_TRUE@am__append_4 = \
-@HAVE_GALLIUM_LLVM_TRUE@	$(LLVM_CFLAGS)
+@NEED_RADEON_LLVM_TRUE@am__append_3 = \
+@NEED_RADEON_LLVM_TRUE@	$(LLVM_CFLAGS) \
+@NEED_RADEON_LLVM_TRUE@	$(LIBELF_CFLAGS)
+
+@NEED_RADEON_LLVM_TRUE@am__append_4 = \
+@NEED_RADEON_LLVM_TRUE@	$(LLVM_C_FILES)
 subdir = src/gallium/drivers/radeon
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -86,16 +87,27 @@ CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
 am__DEPENDENCIES_1 =
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_DEPENDENCIES = \
-@HAVE_GALLIUM_LLVM_TRUE@	$(am__DEPENDENCIES_1) \
-@HAVE_GALLIUM_LLVM_TRUE@	$(am__DEPENDENCIES_1)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_DEPENDENCIES = \
+@NEED_RADEON_LLVM_TRUE@	$(am__DEPENDENCIES_1) \
+@NEED_RADEON_LLVM_TRUE@	$(am__DEPENDENCIES_1) \
+@NEED_RADEON_LLVM_TRUE@	$(am__DEPENDENCIES_1)
+am__libradeon_la_SOURCES_DIST = cayman_msaa.c r600_buffer_common.c \
+	r600_cs.h r600_gpu_load.c r600_perfcounter.c \
+	r600_pipe_common.c r600_pipe_common.h r600_query.c \
+	r600_query.h r600_streamout.c r600_test_dma.c r600_texture.c \
+	r600_viewport.c radeon_uvd.c radeon_uvd.h radeon_vce_40_2_2.c \
+	radeon_vce_50.c radeon_vce_52.c radeon_vce.c radeon_vce.h \
+	radeon_video.c radeon_video.h radeon_winsys.h \
+	radeon_elf_util.c radeon_elf_util.h
 am__objects_1 = cayman_msaa.lo r600_buffer_common.lo r600_gpu_load.lo \
 	r600_perfcounter.lo r600_pipe_common.lo r600_query.lo \
 	r600_streamout.lo r600_test_dma.lo r600_texture.lo \
 	r600_viewport.lo radeon_uvd.lo radeon_vce_40_2_2.lo \
 	radeon_vce_50.lo radeon_vce_52.lo radeon_vce.lo \
 	radeon_video.lo
-am_libradeon_la_OBJECTS = $(am__objects_1)
+am__objects_2 = radeon_elf_util.lo
+@NEED_RADEON_LLVM_TRUE@am__objects_3 = $(am__objects_2)
+am_libradeon_la_OBJECTS = $(am__objects_1) $(am__objects_3)
 libradeon_la_OBJECTS = $(am_libradeon_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -139,7 +151,7 @@ am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo " CCLD " $@;
 am__v_CCLD_1 = 
 SOURCES = $(libradeon_la_SOURCES)
-DIST_SOURCES = $(libradeon_la_SOURCES)
+DIST_SOURCES = $(am__libradeon_la_SOURCES_DIST)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -153,8 +165,6 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@
 AMDGPU_LIBS = @AMDGPU_LIBS@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-ANDROID_CFLAGS = @ANDROID_CFLAGS@
-ANDROID_LIBS = @ANDROID_LIBS@
 AR = @AR@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
@@ -185,6 +195,8 @@ DLLTOOL = @DLLTOOL@
 DLOPEN_LIBS = @DLOPEN_LIBS@
 DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@
 DRI2PROTO_LIBS = @DRI2PROTO_LIBS@
+DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@
+DRI3PROTO_LIBS = @DRI3PROTO_LIBS@
 DRIGL_CFLAGS = @DRIGL_CFLAGS@
 DRIGL_LIBS = @DRIGL_LIBS@
 DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@
@@ -197,11 +209,10 @@ ECHO_C = @ECHO_C@
 ECHO_N = @ECHO_N@
 ECHO_T = @ECHO_T@
 EGL_CFLAGS = @EGL_CFLAGS@
+EGL_CLIENT_APIS = @EGL_CLIENT_APIS@
 EGL_LIB_DEPS = @EGL_LIB_DEPS@
 EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@
 EGREP = @EGREP@
-ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@
-ETNAVIV_LIBS = @ETNAVIV_LIBS@
 EXEEXT = @EXEEXT@
 EXPAT_CFLAGS = @EXPAT_CFLAGS@
 EXPAT_LIBS = @EXPAT_LIBS@
@@ -249,27 +260,31 @@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@
 LIBDRM_LIBS = @LIBDRM_LIBS@
 LIBELF_CFLAGS = @LIBELF_CFLAGS@
 LIBELF_LIBS = @LIBELF_LIBS@
-LIBGLVND_DATADIR = @LIBGLVND_DATADIR@
 LIBOBJS = @LIBOBJS@
 LIBS = @LIBS@
-LIBSENSORS_LIBS = @LIBSENSORS_LIBS@
+LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@
+LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@
+LIBSHA1_LIBS = @LIBSHA1_LIBS@
 LIBTOOL = @LIBTOOL@
-LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@
-LIBUNWIND_LIBS = @LIBUNWIND_LIBS@
 LIB_DIR = @LIB_DIR@
 LIB_EXT = @LIB_EXT@
 LIPO = @LIPO@
+LLVM_BINDIR = @LLVM_BINDIR@
 LLVM_CFLAGS = @LLVM_CFLAGS@
 LLVM_CONFIG = @LLVM_CONFIG@
+LLVM_CPPFLAGS = @LLVM_CPPFLAGS@
 LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
 LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@
 LLVM_LDFLAGS = @LLVM_LDFLAGS@
+LLVM_LIBDIR = @LLVM_LIBDIR@
 LLVM_LIBS = @LLVM_LIBS@
+LLVM_VERSION = @LLVM_VERSION@
 LN_S = @LN_S@
 LTLIBOBJS = @LTLIBOBJS@
 MAINT = @MAINT@
 MAKEINFO = @MAKEINFO@
 MANIFEST_TOOL = @MANIFEST_TOOL@
+MESA_LLVM = @MESA_LLVM@
 MKDIR_P = @MKDIR_P@
 MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@
 MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@
@@ -290,6 +305,8 @@ OMX_LIBS = @OMX_LIBS@
 OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@
 OPENCL_LIBNAME = @OPENCL_LIBNAME@
 OPENCL_VERSION = @OPENCL_VERSION@
+OPENSSL_CFLAGS = @OPENSSL_CFLAGS@
+OPENSSL_LIBS = @OPENSSL_LIBS@
 OSMESA_LIB = @OSMESA_LIB@
 OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@
 OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@
@@ -309,6 +326,8 @@ PKG_CONFIG = @PKG_CONFIG@
 PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 POSIX_SHELL = @POSIX_SHELL@
+PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@
+PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@
 PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@
 PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@
 PTHREAD_CC = @PTHREAD_CC@
@@ -324,6 +343,8 @@ SED = @SED@
 SELINUX_CFLAGS = @SELINUX_CFLAGS@
 SELINUX_LIBS = @SELINUX_LIBS@
 SET_MAKE = @SET_MAKE@
+SHA1_CFLAGS = @SHA1_CFLAGS@
+SHA1_LIBS = @SHA1_LIBS@
 SHELL = @SHELL@
 SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@
 SIMPENROSE_LIBS = @SIMPENROSE_LIBS@
@@ -332,6 +353,7 @@ STRIP = @STRIP@
 SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@
 SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@
 SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@
+TIMESTAMP_CMD = @TIMESTAMP_CMD@
 VALGRIND_CFLAGS = @VALGRIND_CFLAGS@
 VALGRIND_LIBS = @VALGRIND_LIBS@
 VA_CFLAGS = @VA_CFLAGS@
@@ -347,6 +369,7 @@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@
 VDPAU_MAJOR = @VDPAU_MAJOR@
 VDPAU_MINOR = @VDPAU_MINOR@
 VERSION = @VERSION@
+VG_LIB_DEPS = @VG_LIB_DEPS@
 VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@
 VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@
 VL_CFLAGS = @VL_CFLAGS@
@@ -375,10 +398,9 @@ XVMC_LIBS = @XVMC_LIBS@
 XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@
 XVMC_MAJOR = @XVMC_MAJOR@
 XVMC_MINOR = @XVMC_MINOR@
+XXD = @XXD@
 YACC = @YACC@
 YFLAGS = @YFLAGS@
-ZLIB_CFLAGS = @ZLIB_CFLAGS@
-ZLIB_LIBS = @ZLIB_LIBS@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
 abs_top_builddir = @abs_top_builddir@
@@ -464,6 +486,10 @@ C_SOURCES := \
 	radeon_video.h \
 	radeon_winsys.h
 
+LLVM_C_FILES := \
+	radeon_elf_util.c \
+	radeon_elf_util.h
+
 GALLIUM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
@@ -511,8 +537,12 @@ GALLIUM_TARGET_CFLAGS = \
 	$(LIBDRM_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
 
-GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \
-	$(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1)
+GALLIUM_COMMON_LIB_DEPS = \
+	-lm \
+	$(CLOCK_LIB) \
+	$(PTHREAD_LIBS) \
+	$(DLOPEN_LIBS)
+
 GALLIUM_WINSYS_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/include \
@@ -524,20 +554,19 @@ GALLIUM_WINSYS_CFLAGS = \
 GALLIUM_PIPE_LOADER_WINSYS_LIBS = \
 	$(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \
 	$(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \
-	$(am__append_2) $(am__append_3)
+	$(am__append_1) $(am__append_2)
 AM_CFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(RADEON_CFLAGS) \
-	-Wstrict-overflow=0 $(am__append_4)
+	-Wstrict-overflow=0 $(am__append_3)
 
 # ^^ disable warnings about overflows (os_time_timeout)
 noinst_LTLIBRARIES = libradeon.la
-libradeon_la_SOURCES = \
-	$(C_SOURCES)
-
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LIBADD = \
-@HAVE_GALLIUM_LLVM_TRUE@	$(CLOCK_LIB) \
-@HAVE_GALLIUM_LLVM_TRUE@	$(LLVM_LIBS)
+libradeon_la_SOURCES = $(C_SOURCES) $(am__append_4)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_LIBADD = \
+@NEED_RADEON_LLVM_TRUE@	$(CLOCK_LIB) \
+@NEED_RADEON_LLVM_TRUE@	$(LLVM_LIBS) \
+@NEED_RADEON_LLVM_TRUE@	$(LIBELF_LIBS)
 
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LDFLAGS = \
-@HAVE_GALLIUM_LLVM_TRUE@	$(LLVM_LDFLAGS)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_LDFLAGS = \
+@NEED_RADEON_LLVM_TRUE@	$(LLVM_LDFLAGS)
 EXTRA_DIST = \
 	LLVM_REVISION.txt
@@ -607,6 +636,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_test_dma.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_texture.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_viewport.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_elf_util.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_uvd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce_40_2_2.Plo@am__quote@
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
index f63790c32..3e13dae3c 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
@@ -2,17 +2,21 @@ C_SOURCES := \
 	cayman_msaa.c \
 	r600_buffer_common.c \
 	r600_cs.h \
-	r600d_common.h \
 	r600_gpu_load.c \
+	r600_perfcounter.c \
 	r600_pipe_common.c \
 	r600_pipe_common.h \
 	r600_query.c \
+	r600_query.h \
 	r600_streamout.c \
+	r600_test_dma.c \
 	r600_texture.c \
+	r600_viewport.c \
 	radeon_uvd.c \
 	radeon_uvd.h \
 	radeon_vce_40_2_2.c \
 	radeon_vce_50.c \
+	radeon_vce_52.c \
 	radeon_vce.c \
 	radeon_vce.h \
 	radeon_video.c \
@@ -21,10 +25,4 @@ C_SOURCES := \
 
 LLVM_C_FILES := \
 	radeon_elf_util.c \
-	radeon_elf_util.h \
-	radeon_llvm_emit.c \
-	radeon_llvm_emit.h \
-	radeon_llvm.h \
-	radeon_llvm_util.c \
-	radeon_llvm_util.h \
-	radeon_setup_tgsi_llvm.c
+	radeon_elf_util.h
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
index 2d1058479..bbab58946 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -30,18 +30,18 @@
 #include <inttypes.h>
 #include <stdio.h>
 
-boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
-					struct radeon_winsys_cs_handle *buf,
-					enum radeon_bo_usage usage)
+bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
+				     struct pb_buffer *buf,
+				     enum radeon_bo_usage usage)
 {
-	if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) {
-		return TRUE;
+	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
+		return true;
 	}
-	if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) {
-		return TRUE;
+	if (radeon_emitted(ctx->dma.cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
+		return true;
 	}
-	return FALSE;
+	return false;
 }
 
 void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
@@ -52,7 +52,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	bool busy = false;
 
 	if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
-		return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+		return ctx->ws->buffer_map(resource->buf, NULL, usage);
 	}
 
 	if (!(usage & PIPE_TRANSFER_WRITE)) {
@@ -60,26 +60,25 @@
 		rusage = RADEON_USAGE_WRITE;
 	}
 
-	if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs,
-					     resource->cs_buf, rusage)) {
+	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.gfx.flush(ctx, 0, NULL);
+			ctx->gfx.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
-	if (ctx->rings.dma.cs &&
-	    ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs,
-					     resource->cs_buf, rusage)) {
+	if (radeon_emitted(ctx->dma.cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.dma.flush(ctx, 0, NULL);
+			ctx->dma.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
@@ -90,31 +89,33 @@
 		} else {
 			/* We will be wait for the GPU. Wait for any offloaded
 			 * CS flush to complete to avoid busy-waiting in the winsys. */
-			ctx->ws->cs_sync_flush(ctx->rings.gfx.cs);
-			if (ctx->rings.dma.cs)
-				ctx->ws->cs_sync_flush(ctx->rings.dma.cs);
+			ctx->ws->cs_sync_flush(ctx->gfx.cs);
+			if (ctx->dma.cs)
+				ctx->ws->cs_sync_flush(ctx->dma.cs);
 		}
 	}
 
 	/* Setting the CS to NULL will prevent doing checks we have done already. */
-	return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+	return ctx->ws->buffer_map(resource->buf, NULL, usage);
 }
 
-bool r600_init_resource(struct r600_common_screen *rscreen,
-			struct r600_resource *res,
-			unsigned size, unsigned alignment,
-			bool use_reusable_pool)
+void r600_init_resource_fields(struct r600_common_screen *rscreen,
+			       struct r600_resource *res,
+			       uint64_t size, unsigned alignment)
 {
 	struct r600_texture *rtex = (struct r600_texture*)res;
-	struct pb_buffer *old_buf, *new_buf;
-	enum radeon_bo_flag flags = 0;
+
+	res->bo_size = size;
+	res->bo_alignment = alignment;
+	res->flags = 0;
 
 	switch (res->b.b.usage) {
 	case PIPE_USAGE_STREAM:
-		flags = RADEON_FLAG_GTT_WC;
+		res->flags = RADEON_FLAG_GTT_WC;
 		/* fall through */
 	case PIPE_USAGE_STAGING:
-		/* Transfers are likely to occur more often with these resources. */
+		/* Transfers are likely to occur more often with these
+		 * resources. */
 		res->domains = RADEON_DOMAIN_GTT;
 		break;
 	case PIPE_USAGE_DYNAMIC:
@@ -124,52 +125,78 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		if (rscreen->info.drm_major == 2 &&
 		    rscreen->info.drm_minor < 40) {
 			res->domains = RADEON_DOMAIN_GTT;
-			flags |= RADEON_FLAG_GTT_WC;
+			res->flags |= RADEON_FLAG_GTT_WC;
 			break;
 		}
-		flags |= RADEON_FLAG_CPU_ACCESS;
+		res->flags |= RADEON_FLAG_CPU_ACCESS;
 		/* fall through */
 	case PIPE_USAGE_DEFAULT:
 	case PIPE_USAGE_IMMUTABLE:
 	default:
-		/* Not listing GTT here improves performance in some apps. */
+		/* Not listing GTT here improves performance in some
+		 * apps. */
 		res->domains = RADEON_DOMAIN_VRAM;
-		flags |= RADEON_FLAG_GTT_WC;
+		res->flags |= RADEON_FLAG_GTT_WC;
 		break;
 	}
 
 	if (res->b.b.target == PIPE_BUFFER &&
 	    res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
 			      PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
-		/* Use GTT for all persistent mappings with older kernels,
-		 * because they didn't always flush the HDP cache before CS
-		 * execution.
+		/* Use GTT for all persistent mappings with older
+		 * kernels, because they didn't always flush the HDP
+		 * cache before CS execution.
 		 *
-		 * Write-combined CPU mappings are fine, the kernel ensures all CPU
-		 * writes finish before the GPU executes a command stream.
+		 * Write-combined CPU mappings are fine, the kernel
+		 * ensures all CPU writes finish before the GPU
+		 * executes a command stream.
		 */
 		if (rscreen->info.drm_major == 2 &&
 		    rscreen->info.drm_minor < 40)
 			res->domains = RADEON_DOMAIN_GTT;
 		else if (res->domains & RADEON_DOMAIN_VRAM)
-			flags |= RADEON_FLAG_CPU_ACCESS;
+			res->flags |= RADEON_FLAG_CPU_ACCESS;
 	}
 
 	/* Tiled textures are unmappable. Always put them in VRAM. */
 	if (res->b.b.target != PIPE_BUFFER &&
 	    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
 		res->domains = RADEON_DOMAIN_VRAM;
-		flags &= ~RADEON_FLAG_CPU_ACCESS;
-		flags |= RADEON_FLAG_NO_CPU_ACCESS;
+		res->flags &= ~RADEON_FLAG_CPU_ACCESS;
+		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
+			 RADEON_FLAG_GTT_WC;
 	}
 
+	/* If VRAM is just stolen system memory, allow both VRAM and
+	 * GTT, whichever has free space. If a buffer is evicted from
+	 * VRAM to GTT, it will stay there.
+	 */
+	if (!rscreen->info.has_dedicated_vram &&
+	    res->domains == RADEON_DOMAIN_VRAM)
+		res->domains = RADEON_DOMAIN_VRAM_GTT;
+
 	if (rscreen->debug_flags & DBG_NO_WC)
-		flags &= ~RADEON_FLAG_GTT_WC;
+		res->flags &= ~RADEON_FLAG_GTT_WC;
+
+	/* Set expected VRAM and GART usage for the buffer. */
+	res->vram_usage = 0;
+	res->gart_usage = 0;
+
+	if (res->domains & RADEON_DOMAIN_VRAM)
+		res->vram_usage = size;
+	else if (res->domains & RADEON_DOMAIN_GTT)
+		res->gart_usage = size;
+}
+
+bool r600_alloc_resource(struct r600_common_screen *rscreen,
+			 struct r600_resource *res)
+{
+	struct pb_buffer *old_buf, *new_buf;
 
 	/* Allocate a new resource. */
-	new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
-					     use_reusable_pool,
-					     res->domains, flags);
+	new_buf = rscreen->ws->buffer_create(rscreen->ws, res->bo_size,
+					     res->bo_alignment,
+					     res->domains, res->flags);
 	if (!new_buf) {
 		return false;
 	}
@@ -179,11 +206,10 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	 * the same buffer where one of the contexts invalidates it while
 	 * the others are using it. */
 	old_buf = res->buf;
-	res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */
 	res->buf = new_buf; /* should be atomic */
 
-	if (rscreen->info.r600_virtual_address)
-		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->cs_buf);
+	if (rscreen->info.has_virtual_memory)
+		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
 	else
 		res->gpu_address = 0;
@@ -192,8 +218,9 @@
 	util_range_set_empty(&res->valid_buffer_range);
 	res->TC_L2_dirty = false;
 
+	/* Print debug information. */
 	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
-		fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n",
+		fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
 			res->gpu_address, res->gpu_address + res->buf->size,
 			res->buf->size);
 	}
@@ -210,6 +237,42 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
 	FREE(rbuffer);
 }
 
+static bool
+r600_invalidate_buffer(struct r600_common_context *rctx,
+		       struct r600_resource *rbuffer)
+{
+	/* Shared buffers can't be reallocated. */
+	if (rbuffer->is_shared)
+		return false;
+
+	/* In AMD_pinned_memory, the user pointer association only gets
+	 * broken when the buffer is explicitly re-allocated.
+	 */
+	if (rctx->ws->buffer_is_user_ptr(rbuffer->buf))
+		return false;
+
+	/* Check if mapping this buffer would cause waiting for the GPU. */
+	if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+	    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+		rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+	} else {
+		util_range_set_empty(&rbuffer->valid_buffer_range);
+	}
+
+	return true;
+}
+
+void r600_invalidate_resource(struct pipe_context *ctx,
+			      struct pipe_resource *resource)
+{
+	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+	struct r600_resource *rbuffer = r600_resource(resource);
+
+	/* We currently only do anyting here for buffers */
+	if (resource->target == PIPE_BUFFER)
+		(void)r600_invalidate_buffer(rctx, rbuffer);
+}
+
 static void *r600_buffer_get_transfer(struct pipe_context *ctx,
 				      struct pipe_resource *resource,
 				      unsigned level,
@@ -220,7 +283,7 @@
 				      unsigned offset)
 {
 	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
-	struct r600_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
+	struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers);
 
 	transfer->transfer.resource = resource;
 	transfer->transfer.level = level;
@@ -240,7 +303,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx,
 	bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4);
 
 	return rctx->screen->has_cp_dma ||
-	       (dword_aligned && (rctx->rings.dma.cs ||
+	       (dword_aligned && (rctx->dma.cs ||
 				  rctx->screen->has_streamout));
 }
 
@@ -263,6 +326,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
	 * in which case it can be mapped unsynchronized. */
 	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
 	    usage & PIPE_TRANSFER_WRITE &&
+	    !rbuffer->is_shared &&
 	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
 		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 	}
@@ -277,29 +341,31 @@
 	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
-		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
-		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
-			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+		if (r600_invalidate_buffer(rctx, rbuffer)) {
+			/* At this point, the buffer is always idle. */
+			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+		} else {
+			/* Fall back to a temporary buffer. */
+			usage |= PIPE_TRANSFER_DISCARD_RANGE;
 		}
-		/* At this point, the buffer is always idle. */
-		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 	}
-	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
-		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
-		 !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
-		 r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
+
+	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+		       PIPE_TRANSFER_PERSISTENT)) &&
+	    !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
+	    r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
 			struct r600_resource *staging = NULL;
 
 			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
-				       &offset, (struct pipe_resource**)&staging, (void**)&data);
+				       256, &offset, (struct pipe_resource**)&staging, (void**)&data);
 
 			if (staging) {
 				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
@@ -311,23 +377,29 @@
 				usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 			}
 		}
-	/* Using a staging buffer in GTT for larger reads is much faster. */
+	/* Use a staging buffer in cached GTT for reads. */
 	else if ((usage & PIPE_TRANSFER_READ) &&
-		 !(usage & PIPE_TRANSFER_WRITE) &&
-		 rbuffer->domains == RADEON_DOMAIN_VRAM &&
+		 !(usage & PIPE_TRANSFER_PERSISTENT) &&
+		 (rbuffer->domains & RADEON_DOMAIN_VRAM ||
+		  rbuffer->flags & RADEON_FLAG_GTT_WC) &&
 		 r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
 		struct r600_resource *staging;
 
 		staging = (struct r600_resource*) pipe_buffer_create(
-				ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING,
+				ctx->screen, 0, PIPE_USAGE_STAGING,
 				box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT));
 		if (staging) {
 			/* Copy the VRAM buffer to the staging buffer. */
 			rctx->dma_copy(ctx, &staging->b.b, 0,
 				       box->x % R600_MAP_BUFFER_ALIGNMENT,
-				       0, 0, resource, level, box);
+				       0, 0, resource, 0, box);
 
-			data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ);
+			data = r600_buffer_map_sync_with_rings(rctx, staging,
+							       usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+			if (!data) {
+				r600_resource_reference(&staging, NULL);
+				return NULL;
+			}
 			data += box->x % R600_MAP_BUFFER_ALIGNMENT;
 
 			return r600_buffer_get_transfer(ctx, resource, level, usage, box,
@@ -345,38 +417,81 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 					ptransfer, data, NULL, 0);
 }
 
-static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
-				       struct pipe_transfer *transfer)
+static void r600_buffer_do_flush_region(struct pipe_context *ctx,
+					struct pipe_transfer *transfer,
+					const struct pipe_box *box)
 {
-	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
 	struct r600_resource *rbuffer = r600_resource(transfer->resource);
 
 	if (rtransfer->staging) {
-		if (rtransfer->transfer.usage & PIPE_TRANSFER_WRITE) {
-			struct pipe_resource *dst, *src;
-			unsigned soffset, doffset, size;
-			struct pipe_box box;
+		struct pipe_resource *dst, *src;
+		unsigned soffset;
+		struct pipe_box dma_box;
 
-			dst = transfer->resource;
-			src = &rtransfer->staging->b.b;
-			size = transfer->box.width;
-			doffset = transfer->box.x;
-			soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
+		dst = transfer->resource;
+		src = &rtransfer->staging->b.b;
+		soffset = rtransfer->offset + box->x % R600_MAP_BUFFER_ALIGNMENT;
 
-			u_box_1d(soffset, size, &box);
+		u_box_1d(soffset, box->width, &dma_box);
 
-			/* Copy the staging buffer into the original one. */
-			rctx->dma_copy(ctx, dst, 0, doffset, 0, 0, src, 0, &box);
-		}
-		pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
+		/* Copy the staging buffer into the original one. */
+		ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box);
 	}
 
-	if (transfer->usage & PIPE_TRANSFER_WRITE) {
-		util_range_add(&rbuffer->valid_buffer_range, transfer->box.x,
-			       transfer->box.x + transfer->box.width);
+	util_range_add(&rbuffer->valid_buffer_range, box->x,
+		       box->x + box->width);
+}
+
+static void r600_buffer_flush_region(struct pipe_context *ctx,
+				     struct pipe_transfer *transfer,
+				     const struct pipe_box *rel_box)
+{
+	if (transfer->usage & (PIPE_TRANSFER_WRITE |
+			       PIPE_TRANSFER_FLUSH_EXPLICIT)) {
+		struct pipe_box box;
+
+		u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
+		r600_buffer_do_flush_region(ctx, transfer, &box);
 	}
-	util_slab_free(&rctx->pool_transfers, transfer);
+}
+
+static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
+				       struct pipe_transfer *transfer)
+{
+	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+
+	if (transfer->usage & PIPE_TRANSFER_WRITE &&
+	    !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+		r600_buffer_do_flush_region(ctx, transfer, &transfer->box);
+
+	if (rtransfer->staging)
+		r600_resource_reference(&rtransfer->staging, NULL);
+
+	slab_free(&rctx->pool_transfers, transfer);
+}
+
+void r600_buffer_subdata(struct pipe_context *ctx,
+			 struct pipe_resource *buffer,
+			 unsigned usage, unsigned offset,
+			 unsigned size, const void *data)
+{
+	struct pipe_transfer *transfer = NULL;
+	struct pipe_box box;
+	uint8_t *map = NULL;
+
+	u_box_1d(offset, size, &box);
+	map = r600_buffer_transfer_map(ctx, buffer, 0,
+				       PIPE_TRANSFER_WRITE |
+				       PIPE_TRANSFER_DISCARD_RANGE |
+				       usage,
+				       &box, &transfer);
+	if (!map)
+		return;
+
+	memcpy(map, data, size);
+	r600_buffer_transfer_unmap(ctx, transfer);
 }
 
 static const struct u_resource_vtbl r600_buffer_vtbl =
@@ -384,9 +499,8 @@
 	NULL,				/* get_handle */
 	r600_buffer_destroy,		/* resource_destroy */
 	r600_buffer_transfer_map,	/* transfer_map */
-	NULL,				/* transfer_flush_region */
+	r600_buffer_flush_region,	/* transfer_flush_region */
 	r600_buffer_transfer_unmap,	/* transfer_unmap */
-	NULL				/* transfer_inline_write */
 };
 
 static struct r600_resource *
 r600_alloc_buffer_struct(struct pipe_screen *screen,
@@ -398,11 +512,14 @@
 	rbuffer = MALLOC_STRUCT(r600_resource);
 
 	rbuffer->b.b = *templ;
+	rbuffer->b.b.next = NULL;
 	pipe_reference_init(&rbuffer->b.b.reference, 1);
 	rbuffer->b.b.screen = screen;
 	rbuffer->b.vtbl = &r600_buffer_vtbl;
 	rbuffer->buf = NULL;
+	rbuffer->bind_history = 0;
 	rbuffer->TC_L2_dirty = false;
+	rbuffer->is_shared = false;
 	util_range_init(&rbuffer->valid_buffer_range);
 	return rbuffer;
 }
@@ -414,13 +531,39 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ);
 
-	if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, TRUE)) {
+	r600_init_resource_fields(rscreen, rbuffer, templ->width0, alignment);
+
+	if (templ->bind & PIPE_BIND_SHARED)
+		rbuffer->flags |= RADEON_FLAG_HANDLE;
+
+	if (!r600_alloc_resource(rscreen, rbuffer)) {
 		FREE(rbuffer);
 		return NULL;
 	}
 	return &rbuffer->b.b;
 }
 
+struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen,
+						 unsigned bind,
+						 unsigned usage,
+						 unsigned size,
+						 unsigned alignment)
+{
+	struct pipe_resource buffer;
+
+	memset(&buffer, 0, sizeof buffer);
+	buffer.target = PIPE_BUFFER;
+	buffer.format = PIPE_FORMAT_R8_UNORM;
+	buffer.bind = bind;
+	buffer.usage = usage;
+	buffer.flags = 0;
+	buffer.width0 = size;
+	buffer.height0 = 1;
+	buffer.depth0 = 1;
+	buffer.array_size = 1;
+	return r600_buffer_create(screen, &buffer, alignment);
+}
+
 struct pipe_resource *
 r600_buffer_from_user_memory(struct pipe_screen *screen,
 			     const struct pipe_resource *templ,
@@ -440,11 +583,9 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
 		return NULL;
 	}
 
-	rbuffer->cs_buf = ws->buffer_get_cs_handle(rbuffer->buf);
-
-	if (rscreen->info.r600_virtual_address)
+	if (rscreen->info.has_virtual_memory)
 		rbuffer->gpu_address =
-			ws->buffer_get_virtual_address(rbuffer->cs_buf);
+			ws->buffer_get_virtual_address(rbuffer->buf);
 	else
 		rbuffer->gpu_address = 0;
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
index f3529a1fe..0c55fc2a2 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -28,7 +28,7 @@
 #include "util/u_memory.h"
 #include "r600_query.h"
 #include "r600_pipe_common.h"
-#include "r600d_common.h"
+#include "amd/common/r600d_common.h"
 
 /* Max counters per HW block */
 #define R600_QUERY_MAX_COUNTERS 16
@@ -84,8 +84,8 @@ struct r600_pc_group {
 struct r600_pc_counter {
 	unsigned base;
-	unsigned dwords;
-	unsigned stride;
+	unsigned qwords;
+	unsigned stride; /* in uint64s */
 };
 
 #define R600_PC_SHADERS_WINDOWING (1 << 31)
@@ -115,6 +115,14 @@ static void r600_pc_query_destroy(struct r600_common_context *ctx,
 	r600_query_hw_destroy(ctx, rquery);
 }
 
+static bool r600_pc_query_prepare_buffer(struct r600_common_context *ctx,
+					 struct r600_query_hw *hwquery,
+					 struct r600_resource *buffer)
+{
+	/* no-op */
+	return true;
+}
+
 static void r600_pc_query_emit_start(struct r600_common_context *ctx,
 				     struct r600_query_hw *hwquery,
 				     struct r600_resource *buffer, uint64_t va)
@@ -172,7 +180,7 @@ static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
 			pc->emit_read(ctx, block, group->num_counters, group->selectors,
 				      buffer, va);
-			va += 4 * group->num_counters;
+			va += sizeof(uint64_t) * group->num_counters;
 		} while (group->instance < 0 && ++instance < block->num_instances);
 	} while (++se < se_end);
 }
@@ -194,15 +202,15 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
 				     union pipe_query_result *result)
 {
 	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
-	uint32_t *results = buffer;
+	uint64_t *results = buffer;
 	unsigned i, j;
 
 	for (i = 0; i < query->num_counters; ++i) {
 		struct r600_pc_counter *counter = &query->counters[i];
 
-		for (j = 0; j < counter->dwords; ++j) {
+		for (j = 0; j < counter->qwords; ++j) {
 			uint32_t value = results[counter->base + j * counter->stride];
-			result->batch[i].u32 += value;
+			result->batch[i].u64 += value;
 		}
 	}
 }
@@ -215,6 +223,7 @@ static struct r600_query_ops batch_query_ops = {
 };
 
 static struct r600_query_hw_ops batch_query_hw_ops = {
+	.prepare_buffer = r600_pc_query_prepare_buffer,
 	.emit_start = r600_pc_query_emit_start,
 	.emit_stop = r600_pc_query_emit_stop,
 	.clear_result = r600_pc_query_clear_result,
@@ -310,7 +319,6 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 
 	query->b.b.ops = &batch_query_ops;
 	query->b.ops = &batch_query_hw_ops;
-	query->b.flags = R600_QUERY_HW_FLAG_TIMER;
 
 	query->num_counters = num_queries;
@@ -362,7 +370,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 			instances *= block->num_instances;
 
 		group->result_base = i;
-		query->b.result_size += 4 * instances * group->num_counters;
+		query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
 		i += instances * group->num_counters;
 
 		pc->get_size(block, group->num_counters, group->selectors,
@@ -402,11 +410,11 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 
 		counter->base = group->result_base + j;
 		counter->stride = group->num_counters;
 
-		counter->dwords = 1;
+		counter->qwords = 1;
 		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
-			counter->dwords = screen->info.max_se;
+			counter->qwords = screen->info.max_se;
 		if (group->instance < 0)
-			counter->dwords *= block->num_instances;
+			counter->qwords *= block->num_instances;
 	}
 
 	if (!r600_query_hw_init(rctx, &query->b))
@@ -419,8 +427,8 @@ error:
 	return NULL;
 }
 
-static boolean r600_init_block_names(struct r600_common_screen *screen,
-				     struct r600_perfcounter_block *block)
+static bool r600_init_block_names(struct r600_common_screen *screen,
+				  struct r600_perfcounter_block *block)
 {
 	unsigned i, j, k;
 	unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
@@ -453,7 +461,7 @@ static bool r600_init_block_names(struct r600_common_screen *screen,
 
 	block->group_names = MALLOC(block->num_groups * block->group_name_stride);
 	if (!block->group_names)
-		return FALSE;
+		return false;
 
 	groupname = block->group_names;
 	for (i = 0; i < groups_shader; ++i) {
@@ -488,7 +496,7 @@ static bool r600_init_block_names(struct r600_common_screen *screen,
 	block->selector_names = MALLOC(block->num_groups * block->num_selectors *
 				       block->selector_name_stride);
 	if (!block->selector_names)
-		return FALSE;
+		return false;
 
 	groupname = block->group_names;
 	p = block->selector_names;
@@ -500,7 +508,7 @@ static bool r600_init_block_names(struct r600_common_screen *screen,
 		groupname += block->group_name_stride;
 	}
 
-	return TRUE;
+	return true;
 }
 
 int r600_get_perfcounter_info(struct r600_common_screen *screen,
@@ -536,7 +544,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen,
 		info->name = block->selector_names + sub * block->selector_name_stride;
 		info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
 		info->max_value.u64 = 0;
-		info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+		info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
 		info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
 		info->group_id = base_gid + sub / block->num_selectors;
 		info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
@@ -578,17 +586,17 @@ void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
 		rscreen->perfcounters->cleanup(rscreen);
 }
 
-boolean r600_perfcounters_init(struct r600_perfcounters *pc,
-			       unsigned num_blocks)
+bool r600_perfcounters_init(struct r600_perfcounters *pc,
+			    unsigned num_blocks)
 {
 	pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
 	if (!pc->blocks)
-		return FALSE;
+		return false;
 
-	pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE);
-	pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE);
+	pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+	pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
 
-	return TRUE;
+	return true;
 }
 
 void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
index 495fda0a8..f62bbf2e0 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -27,23 +27,118 @@
 #include "r600_pipe_common.h"
 #include "r600_cs.h"
 #include "tgsi/tgsi_parse.h"
+#include "util/list.h"
 #include "util/u_draw_quad.h"
 #include "util/u_memory.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_upload_mgr.h"
+#include "os/os_time.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "radeon/radeon_video.h"
 #include <inttypes.h>
+#include <sys/utsname.h>
 
 #ifndef HAVE_LLVM
 #define HAVE_LLVM 0
 #endif
 
+struct r600_multi_fence {
+	struct pipe_reference reference;
+	struct pipe_fence_handle *gfx;
+	struct pipe_fence_handle *sdma;
+
+	/* If the context wasn't flushed at fence creation, this is non-NULL. */
+	struct {
+		struct r600_common_context *ctx;
+		unsigned ib_index;
+	} gfx_unflushed;
+};
+
+/*
+ * shader binary helpers.
+ */
+void radeon_shader_binary_init(struct radeon_shader_binary *b)
+{
+	memset(b, 0, sizeof(*b));
+}
+
+void radeon_shader_binary_clean(struct radeon_shader_binary *b)
+{
+	if (!b)
+		return;
+	FREE(b->code);
+	FREE(b->config);
+	FREE(b->rodata);
+	FREE(b->global_symbol_offsets);
+	FREE(b->relocs);
+	FREE(b->disasm_string);
+	FREE(b->llvm_ir_string);
+}
+
 /*
  * pipe_context
  */
+
+void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf,
+			  uint64_t va, uint32_t old_value, uint32_t new_value)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+	if (ctx->chip_class == CIK ||
+	    ctx->chip_class == VI) {
+		/* Two EOP events are required to make all engines go idle
+		 * (and optional cache flushes executed) before the timestamp
+		 * is written.
+		 */
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
+				EVENT_INDEX(5));
+		radeon_emit(cs, va);
+		radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
+		radeon_emit(cs, old_value); /* immediate data */
+		radeon_emit(cs, 0); /* unused */
+	}
+
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
+			EVENT_INDEX(5));
+	radeon_emit(cs, va);
+	radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
+	radeon_emit(cs, new_value); /* immediate data */
+	radeon_emit(cs, 0); /* unused */
+
+	r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+}
+
+unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen)
+{
+	unsigned dwords = 6;
+
+	if (screen->chip_class == CIK ||
+	    screen->chip_class == VI)
+		dwords *= 2;
+
+	if (!screen->info.has_virtual_memory)
+		dwords += 2;
+
+	return dwords;
+}
+
+void r600_gfx_wait_fence(struct r600_common_context *ctx,
+			 uint64_t va, uint32_t ref, uint32_t mask)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, ref); /* reference value */
+	radeon_emit(cs, mask); /* mask */
+	radeon_emit(cs, 4); /* poll interval */
+}
+
 void r600_draw_rectangle(struct blitter_context *blitter,
 			 int x1, int y1, int x2, int y2, float depth,
 			 enum blitter_attrib_type type,
@@ -77,7 +172,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	/* Upload vertices. The hw rectangle has only 3 vertices,
	 * I guess the 4th one is derived from the first 3.
	 * The vertex specification should match u_blitter's vertex element state. */
-	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
 	if (!buf)
 		return;
 
@@ -108,12 +203,89 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	pipe_resource_reference(&buf, NULL);
 }
 
-void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
+void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
+			 struct r600_resource *dst, struct r600_resource *src)
+{
+	uint64_t vram = 0, gtt = 0;
+
+	if (dst) {
+		vram += dst->vram_usage;
+		gtt += dst->gart_usage;
+	}
+	if (src) {
+		vram += src->vram_usage;
+		gtt += src->gart_usage;
+	}
+
+	/* Flush the GFX IB if DMA depends on it. */
+	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+	    ((dst &&
+	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
+					       RADEON_USAGE_READWRITE)) ||
+	     (src &&
+	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
+					       RADEON_USAGE_WRITE))))
+		ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
+	/* Flush if there's not enough space, or if the memory usage per IB
+	 * is too large.
+	 */
+	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
+	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
+		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
+	}
+
+	/* If GPUVM is not supported, the CS checker needs 2 entries
+	 * in the buffer list per packet, which has to be done manually.
+	 */
+	if (ctx->screen->info.has_virtual_memory) {
+		if (dst)
+			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
+						  RADEON_USAGE_WRITE,
+						  RADEON_PRIO_SDMA_BUFFER);
+		if (src)
+			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
+						  RADEON_USAGE_READ,
+						  RADEON_PRIO_SDMA_BUFFER);
+	}
+}
+
+/* This is required to prevent read-after-write hazards. */
+void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 {
-	/* Flush if there's not enough space. */
-	if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {
-		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);
+	struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+	/* done at the end of DMA calls, so increment this. */
+	rctx->num_dma_calls++;
+
+	/* IBs using too little memory are limited by the IB submission overhead.
+	 * IBs using too much memory are limited by the kernel/TTM overhead.
+	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+	 *
+	 * This heuristic makes sure that DMA requests are executed
+	 * very soon after the call is made and lowers memory usage.
+	 * It improves texture upload performance by keeping the DMA
+	 * engine busy while uploads are being submitted.
+	 */
+	if (cs->used_vram + cs->used_gart > 64 * 1024 * 1024) {
+		rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+		return;
+	}
+
+	r600_need_dma_space(rctx, 1, NULL, NULL);
+
+	if (!radeon_emitted(cs, 0)) /* empty queue */
+		return;
+
+	/* NOP waits for idle on Evergreen and later. */
+	if (rctx->chip_class >= CIK)
+		radeon_emit(cs, 0x00000000); /* NOP */
+	else if (rctx->chip_class >= EVERGREEN)
+		radeon_emit(cs, 0xf0000000); /* NOP */
+	else {
+		/* TODO: R600-R700 should use the FENCE packet.
+		 * CS checker support is required. */
 	}
 }
 
@@ -123,24 +295,9 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
-	/* Disable render condition. */
-	ctx->saved_render_cond = NULL;
-	ctx->saved_render_cond_cond = FALSE;
-	ctx->saved_render_cond_mode = 0;
-	if (ctx->current_render_cond) {
-		ctx->saved_render_cond = ctx->current_render_cond;
-		ctx->saved_render_cond_cond = ctx->current_render_cond_cond;
-		ctx->saved_render_cond_mode = ctx->current_render_cond_mode;
-		ctx->b.render_condition(&ctx->b, NULL, FALSE, 0);
-	}
-
 	/* suspend queries */
-	ctx->queries_suspended_for_flush = false;
-	if (ctx->num_cs_dw_nontimer_queries_suspend) {
-		r600_suspend_nontimer_queries(ctx);
-		r600_suspend_timer_queries(ctx);
-		ctx->queries_suspended_for_flush = true;
-	}
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		r600_suspend_queries(ctx);
 
 	ctx->streamout.suspended = false;
 	if (ctx->streamout.begin_emitted) {
@@ -157,48 +314,152 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	}
 
 	/* resume queries */
-	if (ctx->queries_suspended_for_flush) {
-		r600_resume_nontimer_queries(ctx);
-		r600_resume_timer_queries(ctx);
-	}
-
-	/* Re-enable render condition. */
-	if (ctx->saved_render_cond) {
-		ctx->b.render_condition(&ctx->b, ctx->saved_render_cond,
-					ctx->saved_render_cond_cond,
-					ctx->saved_render_cond_mode);
-	}
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		r600_resume_queries(ctx);
 }
 
 static void r600_flush_from_st(struct pipe_context *ctx,
			       struct pipe_fence_handle **fence,
			       unsigned flags)
 {
+	struct pipe_screen *screen = ctx->screen;
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	struct radeon_winsys *ws = rctx->ws;
 	unsigned rflags = 0;
+	struct pipe_fence_handle *gfx_fence = NULL;
+	struct pipe_fence_handle *sdma_fence = NULL;
+	bool deferred_fence = false;
 
 	if (flags & PIPE_FLUSH_END_OF_FRAME)
 		rflags |= RADEON_FLUSH_END_OF_FRAME;
+	if (flags & PIPE_FLUSH_DEFERRED)
+		rflags |= RADEON_FLUSH_ASYNC;
+
+	if (rctx->dma.cs) {
+		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
+	}
+
+	if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
+		if (fence)
+			ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
+		if (!(rflags & RADEON_FLUSH_ASYNC))
+			ws->cs_sync_flush(rctx->gfx.cs);
+	} else {
+		/* Instead of flushing, create a deferred fence. Constraints:
+		 * - The state tracker must allow a deferred flush.
+		 * - The state tracker must request a fence.
+		 * Thread safety in fence_finish must be ensured by the state tracker.
+		 */
+		if (flags & PIPE_FLUSH_DEFERRED && fence) {
+			gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
+			deferred_fence = true;
+		} else {
+			rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
+		}
+	}
+
+	/* Both engines can signal out of order, so we need to keep both fences. */
+	if (fence) {
+		struct r600_multi_fence *multi_fence =
+			CALLOC_STRUCT(r600_multi_fence);
+		if (!multi_fence)
+			return;
+
+		multi_fence->reference.count = 1;
+		/* If both fences are NULL, fence_finish will always return true. */
+		multi_fence->gfx = gfx_fence;
+		multi_fence->sdma = sdma_fence;
+
+		if (deferred_fence) {
+			multi_fence->gfx_unflushed.ctx = rctx;
+			multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
+		}
 
-	if (rctx->rings.dma.cs) {
-		rctx->rings.dma.flush(rctx, rflags, NULL);
+		screen->fence_reference(screen, fence, NULL);
+		*fence = (struct pipe_fence_handle*)multi_fence;
 	}
-	rctx->rings.gfx.flush(rctx, rflags, fence);
 }
 
 static void r600_flush_dma_ring(void *ctx, unsigned flags,
				struct pipe_fence_handle **fence)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
-
-	if (!cs->cdw) {
+	struct radeon_winsys_cs *cs = rctx->dma.cs;
+	struct radeon_saved_cs saved;
+	bool check_vm =
+		(rctx->screen->debug_flags & DBG_CHECK_VM) &&
+		rctx->check_vm_faults;
+
+	if (!radeon_emitted(cs, 0)) {
+		if (fence)
+			rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 		return;
 	}
 
-	rctx->rings.dma.flushing = true;
-	rctx->ws->cs_flush(cs, flags, fence, 0);
-	rctx->rings.dma.flushing = false;
+	if (check_vm)
+		radeon_save_cs(rctx->ws, cs, &saved);
+
+	rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
+	if (fence)
+		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
+
+	if (check_vm) {
+		/* Use conservative timeout 800ms, after which we won't wait any
+		 * longer and assume the GPU is hung.
+		 */
+		rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
+
+		rctx->check_vm_faults(rctx, &saved, RING_DMA);
+		radeon_clear_saved_cs(&saved);
+	}
+}
+
+/**
+ * Store a linearized copy of all chunks of \p cs together with the buffer
+ * list in \p saved.
+ */
+void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+		    struct radeon_saved_cs *saved)
+{
+	void *buf;
+	unsigned i;
+
+	/* Save the IB chunks. */
+	saved->num_dw = cs->prev_dw + cs->current.cdw;
+	saved->ib = MALLOC(4 * saved->num_dw);
+	if (!saved->ib)
+		goto oom;
+
+	buf = saved->ib;
+	for (i = 0; i < cs->num_prev; ++i) {
+		memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
+		buf += cs->prev[i].cdw;
+	}
+	memcpy(buf, cs->current.buf, cs->current.cdw * 4);
+
+	/* Save the buffer list. */
+	saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
+	saved->bo_list = CALLOC(saved->bo_count,
+				sizeof(saved->bo_list[0]));
+	if (!saved->bo_list) {
+		FREE(saved->ib);
+		goto oom;
+	}
+	ws->cs_get_buffer_list(cs, saved->bo_list);
+
+	return;
+
+oom:
+	fprintf(stderr, "%s: out of memory\n", __func__);
+	memset(saved, 0, sizeof(*saved));
+}
+
+void radeon_clear_saved_cs(struct radeon_saved_cs *saved)
+{
+	FREE(saved->ib);
+	FREE(saved->bo_list);
+
+	memset(saved, 0, sizeof(*saved));
 }
 
 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
@@ -214,31 +475,82 @@
 	return PIPE_UNKNOWN_CONTEXT_RESET;
 }
 
+static void r600_set_debug_callback(struct pipe_context *ctx,
+				    const struct pipe_debug_callback *cb)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+	if (cb)
+		rctx->debug = *cb;
+	else
+		memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
+static void r600_set_device_reset_callback(struct pipe_context *ctx,
+					   const struct pipe_device_reset_callback *cb)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+	if (cb)
+		rctx->device_reset_callback = *cb;
+	else
+		memset(&rctx->device_reset_callback, 0,
+		       sizeof(rctx->device_reset_callback));
+}
+
+bool r600_check_device_reset(struct r600_common_context *rctx)
+{
+	enum pipe_reset_status status;
+
+	if (!rctx->device_reset_callback.reset)
+		return false;
+
+	if (!rctx->b.get_device_reset_status)
+		return false;
+
+	status = rctx->b.get_device_reset_status(&rctx->b);
+	if (status == PIPE_NO_RESET)
+		return false;
+
+	rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
+	return true;
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
-			      struct r600_common_screen *rscreen)
+			      struct r600_common_screen *rscreen,
+			      unsigned context_flags)
 {
-	util_slab_create(&rctx->pool_transfers,
-			 sizeof(struct r600_transfer), 64,
-			 UTIL_SLAB_SINGLETHREADED);
+	slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
 
 	rctx->screen = rscreen;
 	rctx->ws = rscreen->ws;
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;
 
-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.num_render_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
 		rctx->max_db = 4;
 
+	rctx->b.invalidate_resource = r600_invalidate_resource;
 	rctx->b.transfer_map = u_transfer_map_vtbl;
-	rctx->b.transfer_flush_region = u_default_transfer_flush_region;
+	rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
 	rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
-	rctx->b.transfer_inline_write = u_default_transfer_inline_write;
-	rctx->b.memory_barrier = r600_memory_barrier;
+	rctx->b.texture_subdata = u_default_texture_subdata;
+	rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
+	rctx->b.set_debug_callback = r600_set_debug_callback;
+
+	/* evergreen_compute.c has a special codepath for global buffers.
+	 * Everything else can use the direct path.
+	 */
+	if ((rscreen->chip_class == EVERGREEN || rscreen->chip_class == CAYMAN) &&
+	    (context_flags & PIPE_CONTEXT_COMPUTE_ONLY))
+		rctx->b.buffer_subdata = u_default_buffer_subdata;
+	else
+		rctx->b.buffer_subdata = r600_buffer_subdata;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 		rctx->b.get_device_reset_status = r600_get_reset_status;
@@ -247,21 +559,23 @@ bool r600_common_context_init(struct r600_common_context *rctx,
				RADEON_GPU_RESET_COUNTER);
 	}
 
-	LIST_INITHEAD(&rctx->texture_buffers);
+	rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 
 	r600_init_context_texture_functions(rctx);
+	r600_init_viewport_functions(rctx);
 	r600_streamout_init(rctx);
 	r600_query_init(rctx);
 	cayman_init_msaa(&rctx->b);
 
-	rctx->allocator_so_filled_size = u_suballocator_create(&rctx->b, 4096, 4,
-							       0, PIPE_USAGE_DEFAULT, TRUE);
-	if (!rctx->allocator_so_filled_size)
+	rctx->allocator_zeroed_memory =
+		u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
+				      0, PIPE_USAGE_DEFAULT, true);
+	if (!rctx->allocator_zeroed_memory)
 		return false;
 
-	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
					 PIPE_BIND_INDEX_BUFFER |
-					 PIPE_BIND_CONSTANT_BUFFER);
+					 PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
 	if (!rctx->uploader)
 		return false;
 
@@ -269,11 +583,11 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->ctx)
 		return false;
 
-	if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
-							 r600_flush_dma_ring,
-							 rctx, NULL);
-		rctx->rings.dma.flush = r600_flush_dma_ring;
+	if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
+		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
+						   r600_flush_dma_ring,
+						   rctx);
+		rctx->dma.flush = r600_flush_dma_ring;
 	}
 
 	return true;
@@ -281,46 +595,41 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
-	if (rctx->rings.gfx.cs)
-		rctx->ws->cs_destroy(rctx->rings.gfx.cs);
-	if (rctx->rings.dma.cs)
-		rctx->ws->cs_destroy(rctx->rings.dma.cs);
-	if (rctx->ctx)
-		rctx->ws->ctx_destroy(rctx->ctx);
+	unsigned i,j;
 
-	if (rctx->uploader) {
-		u_upload_destroy(rctx->uploader);
-	}
+	/* Release DCC stats. */
+	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+		assert(!rctx->dcc_stats[i].query_active);
 
-	util_slab_destroy(&rctx->pool_transfers);
+		for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
+			if (rctx->dcc_stats[i].ps_stats[j])
+				rctx->b.destroy_query(&rctx->b,
+						      rctx->dcc_stats[i].ps_stats[j]);
 
-	if (rctx->allocator_so_filled_size) {
-		u_suballocator_destroy(rctx->allocator_so_filled_size);
+		r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
 	}
-}
 
-void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
-{
-	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct r600_resource *rr = (struct r600_resource *)r;
+	if (rctx->query_result_shader)
+		rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
 
-	if (r == NULL) {
-		return;
-	}
+	if (rctx->gfx.cs)
+		rctx->ws->cs_destroy(rctx->gfx.cs);
+	if (rctx->dma.cs)
+		rctx->ws->cs_destroy(rctx->dma.cs);
+	if (rctx->ctx)
+		rctx->ws->ctx_destroy(rctx->ctx);
 
-	/*
-	 * The idea is to compute a gross estimate of memory requirement of
-	 * each draw call. After each draw call, memory will be precisely
-	 * accounted. So the uncertainty is only on the current draw call.
-	 * In practice this gave very good estimate (+/- 10% of the target
-	 * memory limit).
-	 */
-	if (rr->domains & RADEON_DOMAIN_GTT) {
-		rctx->gtt += rr->buf->size;
+	if (rctx->uploader) {
+		u_upload_destroy(rctx->uploader);
 	}
-	if (rr->domains & RADEON_DOMAIN_VRAM) {
-		rctx->vram += rr->buf->size;
+
+	slab_destroy_child(&rctx->pool_transfers);
+
+	if (rctx->allocator_zeroed_memory) {
+		u_suballocator_destroy(rctx->allocator_zeroed_memory);
 	}
+	rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
+	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 }
 
 /*
@@ -330,10 +639,8 @@ void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resour
 static const struct debug_named_value common_debug_options[] = {
	/* logging */
 	{ "tex", DBG_TEX, "Print texture info" },
-	{ "texmip", DBG_TEXMIP, "Print texture info (mipmapped only)" },
 	{ "compute", DBG_COMPUTE, "Print compute info" },
 	{ "vm", DBG_VM, "Print virtual addresses when creating resources" },
-	{ "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" },
 	{ "info", DBG_INFO, "Print driver information" },
 
	/* shaders */
@@ -347,6 +654,10 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
 	{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
 	{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
+	{ "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" },
+	{ "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" },
+
+	{ "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." },
 
	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -359,6 +670,15 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." },
 	{ "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." },
 	{ "nowc", DBG_NO_WC, "Disable GTT write combining" },
+	{ "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
+	{ "nodcc", DBG_NO_DCC, "Disable DCC." },
+	{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
+	{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
+	{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
+	{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" },
+	{ "noce", DBG_NO_CE, "Disable the constant engine"},
+	{ "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" },
+	{ "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -415,6 +735,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_POLARIS10: return "AMD POLARIS10";
+	case CHIP_POLARIS11: return "AMD POLARIS11";
 	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
@@ -535,25 +857,30 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_KAVERI: return "kaveri";
 	case CHIP_HAWAII: return "hawaii";
 	case CHIP_MULLINS:
-#if HAVE_LLVM >= 0x0305
 		return "mullins";
-#else
-		return "kabini";
-#endif
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
-	case CHIP_FIJI: return "fiji";
 #if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
 	case CHIP_STONEY: return "carrizo";
 #else
+	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
+#if HAVE_LLVM <= 0x0308
+	case CHIP_POLARIS10: return "tonga";
+	case CHIP_POLARIS11: return "tonga";
+#else
+	case CHIP_POLARIS10: return "polaris10";
+	case CHIP_POLARIS11: return "polaris11";
+#endif
 	default: return "";
 	}
 }
 
 static int r600_get_compute_param(struct pipe_screen *screen,
+				  enum pipe_shader_ir ir_type,
				  enum pipe_compute_cap param,
				  void *ret)
 {
@@ -564,20 +891,19 @@
 	case PIPE_COMPUTE_CAP_IR_TARGET: {
 		const char *gpu;
 		const char *triple;
-		if (rscreen->family <= CHIP_ARUBA || HAVE_LLVM < 0x0306) {
+		if (rscreen->family <= CHIP_ARUBA) {
 			triple = "r600--";
 		} else {
-			triple = "amdgcn--";
+			if (HAVE_LLVM < 0x0400) {
+				triple = "amdgcn--";
+			} else {
+				triple = "amdgcn-mesa-mesa3d";
+			}
 		}
 		switch(rscreen->family) {
		/* Clang < 3.6 is missing Hainan in its list of
		 * GPUs, so we need to use the name of a similar GPU. */
-#if HAVE_LLVM < 0x0306
-		case CHIP_HAINAN:
-			gpu = "oland";
-			break;
-#endif
 		default:
 			gpu = r600_get_llvm_processor_name(rscreen->family);
 			break;
@@ -600,32 +926,51 @@
 			uint64_t *grid_size = ret;
 			grid_size[0] = 65535;
 			grid_size[1] = 65535;
-			grid_size[2] = 1;
+			grid_size[2] = 65535;
 		}
 		return 3 * sizeof(uint64_t) ;
 
 	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
 		if (ret) {
 			uint64_t *block_size = ret;
-			block_size[0] = 256;
-			block_size[1] = 256;
-			block_size[2] = 256;
+			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+			    ir_type == PIPE_SHADER_IR_TGSI) {
+				block_size[0] = 2048;
+				block_size[1] = 2048;
+				block_size[2] = 2048;
+			} else {
+				block_size[0] = 256;
+				block_size[1] = 256;
+				block_size[2] = 256;
+			}
 		}
 		return 3 * sizeof(uint64_t);
 
 	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
 		if (ret) {
 			uint64_t *max_threads_per_block = ret;
-			*max_threads_per_block = 256;
+			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+			    ir_type == PIPE_SHADER_IR_TGSI)
+				*max_threads_per_block = 2048;
+			else
+				*max_threads_per_block = 256;
 		}
 		return sizeof(uint64_t);
 
+	case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+		if (ret) {
+			uint32_t *address_bits = ret;
+			address_bits[0] = 32;
+			if (rscreen->chip_class >= SI)
+				address_bits[0] = 64;
+		}
+		return 1 * sizeof(uint32_t);
+
 	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
 		if (ret) {
 			uint64_t *max_global_size = ret;
 			uint64_t max_mem_alloc_size;
 
-			r600_get_compute_param(screen,
+			r600_get_compute_param(screen, ir_type,
				PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
				&max_mem_alloc_size);
 
@@ -636,8 +981,8 @@
			 * 4 * MAX_MEM_ALLOC_SIZE.
			 */
 			*max_global_size = MIN2(4 * max_mem_alloc_size,
-						rscreen->info.gart_size +
-						rscreen->info.vram_size);
+						MAX2(rscreen->info.gart_size,
+						     rscreen->info.vram_size));
 		}
 		return sizeof(uint64_t);
 
@@ -661,24 +1006,21 @@
 		if (ret) {
 			uint64_t *max_mem_alloc_size = ret;
 
-			/* XXX: The limit in older kernels is 256 MB. We
-			 * should add a query here for newer kernels.
- */ - *max_mem_alloc_size = 256 * 1024 * 1024; + *max_mem_alloc_size = rscreen->info.max_alloc_size; } return sizeof(uint64_t); case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: if (ret) { uint32_t *max_clock_frequency = ret; - *max_clock_frequency = rscreen->info.max_sclk; + *max_clock_frequency = rscreen->info.max_shader_clock; } return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: if (ret) { uint32_t *max_compute_units = ret; - *max_compute_units = rscreen->info.max_compute_units; + *max_compute_units = rscreen->info.num_good_compute_units; } return sizeof(uint32_t); @@ -696,6 +1038,16 @@ static int r600_get_compute_param(struct pipe_screen *screen, *subgroup_size = r600_wavefront_size(rscreen->family); } return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && + ir_type == PIPE_SHADER_IR_TGSI) + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + else + *max_variable_threads_per_block = 0; + } + return sizeof(uint64_t); } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); @@ -707,188 +1059,116 @@ static uint64_t r600_get_timestamp(struct pipe_screen *screen) struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) / - rscreen->info.r600_clock_crystal_freq; -} - -static int r600_get_driver_query_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) -{ - struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; - struct pipe_driver_query_info list[] = { - {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"draw-calls", R600_QUERY_DRAW_CALLS, {0}}, - {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}}, - {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"GPU-load", R600_QUERY_GPU_LOAD, {100}}, - {"temperature", R600_QUERY_GPU_TEMPERATURE, {100}}, - {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, - {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, - }; - unsigned num_queries; - - if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) - num_queries = Elements(list); - else if (rscreen->info.drm_major == 3) - num_queries = Elements(list) - 3; - else - num_queries = Elements(list) - 4; - - if (!info) - return num_queries; - - if (index >= num_queries) - return 0; - - *info = list[index]; - return 1; + rscreen->info.clock_crystal_freq; } static void r600_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) + struct 
pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - rws->fence_reference(ptr, fence); + struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst; + struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src; + + if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { + ws->fence_reference(&(*rdst)->gfx, NULL); + ws->fence_reference(&(*rdst)->sdma, NULL); + FREE(*rdst); + } + *rdst = rsrc; } static boolean r600_fence_finish(struct pipe_screen *screen, + struct pipe_context *ctx, struct pipe_fence_handle *fence, uint64_t timeout) { struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + struct r600_common_context *rctx = + ctx ? (struct r600_common_context*)ctx : NULL; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - return rws->fence_wait(rws, fence, timeout); -} + if (rfence->sdma) { + if (!rws->fence_wait(rws, rfence->sdma, timeout)) + return false; -static bool r600_interpret_tiling(struct r600_common_screen *rscreen, - uint32_t tiling_config) -{ - switch ((tiling_config & 0xe) >> 1) { - case 0: - rscreen->tiling_info.num_channels = 1; - break; - case 1: - rscreen->tiling_info.num_channels = 2; - break; - case 2: - rscreen->tiling_info.num_channels = 4; - break; - case 3: - rscreen->tiling_info.num_channels = 8; - break; - default: - return false; + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } } - switch ((tiling_config & 0x30) >> 4) { - case 0: - rscreen->tiling_info.num_banks = 4; - break; - case 1: - rscreen->tiling_info.num_banks = 8; - break; - default: - return false; + if (!rfence->gfx) + return true; - } - switch ((tiling_config & 0xc0) >> 6) { - case 0: - rscreen->tiling_info.group_bytes = 256; - break; - case 1: - rscreen->tiling_info.group_bytes = 512; - break; - default: - return false; - } - return true; -} + /* Flush the gfx IB if it hasn't been flushed yet. */ + if (rctx && + rfence->gfx_unflushed.ctx == rctx && + rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) { + rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL); + rfence->gfx_unflushed.ctx = NULL; -static bool evergreen_interpret_tiling(struct r600_common_screen *rscreen, - uint32_t tiling_config) -{ - switch (tiling_config & 0xf) { - case 0: - rscreen->tiling_info.num_channels = 1; - break; - case 1: - rscreen->tiling_info.num_channels = 2; - break; - case 2: - rscreen->tiling_info.num_channels = 4; - break; - case 3: - rscreen->tiling_info.num_channels = 8; - break; - default: - return false; - } + if (!timeout) + return false; - switch ((tiling_config & 0xf0) >> 4) { - case 0: - rscreen->tiling_info.num_banks = 4; - break; - case 1: - rscreen->tiling_info.num_banks = 8; - break; - case 2: - rscreen->tiling_info.num_banks = 16; - break; - default: - return false; + /* Recompute the timeout after all that. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? 
abs_timeout - time : 0; + } } - switch ((tiling_config & 0xf00) >> 8) { - case 0: - rscreen->tiling_info.group_bytes = 256; - break; - case 1: - rscreen->tiling_info.group_bytes = 512; - break; - default: - return false; - } - return true; + return rws->fence_wait(rws, rfence->gfx, timeout); } -static bool r600_init_tiling(struct r600_common_screen *rscreen) +static void r600_query_memory_info(struct pipe_screen *screen, + struct pipe_memory_info *info) { - uint32_t tiling_config = rscreen->info.r600_tiling_config; - - /* set default group bytes, overridden by tiling info ioctl */ - if (rscreen->chip_class <= R700) { - rscreen->tiling_info.group_bytes = 256; - } else { - rscreen->tiling_info.group_bytes = 512; - } - - if (!tiling_config) - return true; - - if (rscreen->chip_class <= R700) { - return r600_interpret_tiling(rscreen, tiling_config); - } else { - return evergreen_interpret_tiling(rscreen, tiling_config); - } + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct radeon_winsys *ws = rscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = rscreen->info.vram_size / 1024; + info->total_staging_memory = rscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024; + gtt_usage = ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? + info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? + info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = + ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = + ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. 
*/ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; } struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ) { if (templ->target == PIPE_BUFFER) { - return r600_buffer_create(screen, templ, 4096); + return r600_buffer_create(screen, templ, 256); } else { return r600_texture_create(screen, templ); } @@ -897,10 +1177,15 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, bool r600_common_screen_init(struct r600_common_screen *rscreen, struct radeon_winsys *ws) { - char llvm_string[32] = {}; + char llvm_string[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; ws->query_info(ws, &rscreen->info); + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), + " / %s", uname_data.release); + #if HAVE_LLVM snprintf(llvm_string, sizeof(llvm_string), ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, @@ -908,22 +1193,22 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, #endif snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), - "%s (DRM %i.%i.%i%s)", + "%s (DRM %i.%i.%i%s%s)", r600_get_chip_name(rscreen), rscreen->info.drm_major, rscreen->info.drm_minor, rscreen->info.drm_patchlevel, - llvm_string); + kernel_version, llvm_string); rscreen->b.get_name = r600_get_name; rscreen->b.get_vendor = r600_get_vendor; rscreen->b.get_device_vendor = r600_get_device_vendor; rscreen->b.get_compute_param = r600_get_compute_param; rscreen->b.get_paramf = r600_get_paramf; - rscreen->b.get_driver_query_info = r600_get_driver_query_info; rscreen->b.get_timestamp = r600_get_timestamp; rscreen->b.fence_finish = r600_fence_finish; rscreen->b.fence_reference = r600_fence_reference; rscreen->b.resource_destroy = u_resource_destroy_vtbl; rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory; + rscreen->b.query_memory_info = r600_query_memory_info; if (rscreen->info.has_uvd) { rscreen->b.get_video_param = rvid_get_video_param; @@ -934,109 +1219,115 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, } r600_init_screen_texture_functions(rscreen); + r600_init_screen_query_functions(rscreen); rscreen->ws = ws; rscreen->family = rscreen->info.family; rscreen->chip_class = rscreen->info.chip_class; rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0); - if (!r600_init_tiling(rscreen)) { - return false; + slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64); + + rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (rscreen->force_aniso >= 0) { + printf("radeon: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(rscreen->force_aniso)); } + util_format_s3tc_init(); pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); - if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) || - rscreen->info.drm_major == 3) && - (rscreen->debug_flags & DBG_TRACE_CS)) { - rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b, - PIPE_BIND_CUSTOM, - PIPE_USAGE_STAGING, - 4096); - if (rscreen->trace_bo) { - rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL, - PIPE_TRANSFER_UNSYNCHRONIZED); - } - } - if (rscreen->debug_flags & DBG_INFO) { printf("pci_id = 0x%x\n", rscreen->info.pci_id); - printf("family = %i\n", rscreen->info.family); + printf("family = %i (%s)\n", rscreen->info.family, + r600_get_chip_name(rscreen)); 
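The R600_TEX_ANISO handling a few lines up clamps the override to 16 and rounds it down to a power of two with util_logbase2 before reporting it. A minimal standalone sketch of that rounding, assuming x >= 1 — a plain bit scan stands in for Mesa's util_logbase2 so the example compiles on its own:

    #include <stdio.h>

    /* Round down to a power of two, e.g. 10 -> 8, 16 -> 16.
     * Mirrors "1 << util_logbase2(x)" from the hunk above. */
    static unsigned round_down_pow2(unsigned x)
    {
            unsigned log2 = 0;

            while (x >> (log2 + 1))
                    log2++;
            return 1u << log2;
    }

    int main(void)
    {
            printf("radeon: Forcing anisotropy filter to %ux\n",
                   round_down_pow2(10)); /* prints 8x */
            return 0;
    }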
printf("chip_class = %i\n", rscreen->info.chip_class); - printf("gart_size = %i MB\n", (int)(rscreen->info.gart_size >> 20)); - printf("vram_size = %i MB\n", (int)(rscreen->info.vram_size >> 20)); - printf("max_sclk = %i\n", rscreen->info.max_sclk); - printf("max_compute_units = %i\n", rscreen->info.max_compute_units); - printf("max_se = %i\n", rscreen->info.max_se); - printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se); - printf("drm = %i.%i.%i\n", rscreen->info.drm_major, - rscreen->info.drm_minor, rscreen->info.drm_patchlevel); + printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024)); + printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024)); + printf("max_alloc_size = %i MB\n", + (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024)); + printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory); + printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2); + printf("has_sdma = %i\n", rscreen->info.has_sdma); printf("has_uvd = %i\n", rscreen->info.has_uvd); + printf("me_fw_version = %i\n", rscreen->info.me_fw_version); + printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version); + printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version); printf("vce_fw_version = %i\n", rscreen->info.vce_fw_version); - printf("r600_num_backends = %i\n", rscreen->info.r600_num_backends); - printf("r600_clock_crystal_freq = %i\n", rscreen->info.r600_clock_crystal_freq); - printf("r600_tiling_config = 0x%x\n", rscreen->info.r600_tiling_config); - printf("r600_num_tile_pipes = %i\n", rscreen->info.r600_num_tile_pipes); - printf("r600_max_pipes = %i\n", rscreen->info.r600_max_pipes); - printf("r600_virtual_address = %i\n", rscreen->info.r600_virtual_address); - printf("r600_has_dma = %i\n", rscreen->info.r600_has_dma); - printf("r600_backend_map = %i\n", rscreen->info.r600_backend_map); - printf("r600_backend_map_valid = %i\n", rscreen->info.r600_backend_map_valid); - printf("si_tile_mode_array_valid = %i\n", rscreen->info.si_tile_mode_array_valid); - printf("cik_macrotile_mode_array_valid = %i\n", rscreen->info.cik_macrotile_mode_array_valid); + printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config); + printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq); + printf("drm = %i.%i.%i\n", rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel); + printf("has_userptr = %i\n", rscreen->info.has_userptr); + + printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes); + printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock); + printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units); + printf("max_se = %i\n", rscreen->info.max_se); + printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se); + + printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map); + printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid); + printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks); + printf("num_render_backends = %i\n", rscreen->info.num_render_backends); + printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes); + printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes); } return true; } void r600_destroy_common_screen(struct r600_common_screen *rscreen) { + r600_perfcounters_destroy(rscreen); r600_gpu_load_kill_thread(rscreen); pipe_mutex_destroy(rscreen->gpu_load_mutex); pipe_mutex_destroy(rscreen->aux_context_lock); 
rscreen->aux_context->destroy(rscreen->aux_context); - if (rscreen->trace_bo) - pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL); + slab_destroy_parent(&rscreen->pool_transfers); rscreen->ws->destroy(rscreen->ws); FREE(rscreen); } bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens) + unsigned processor) { - /* Compute shader don't have tgsi_tokens */ - if (!tokens) - return (rscreen->debug_flags & DBG_CS) != 0; - - switch (tgsi_get_processor_type(tokens)) { - case TGSI_PROCESSOR_VERTEX: + switch (processor) { + case PIPE_SHADER_VERTEX: return (rscreen->debug_flags & DBG_VS) != 0; - case TGSI_PROCESSOR_TESS_CTRL: + case PIPE_SHADER_TESS_CTRL: return (rscreen->debug_flags & DBG_TCS) != 0; - case TGSI_PROCESSOR_TESS_EVAL: + case PIPE_SHADER_TESS_EVAL: return (rscreen->debug_flags & DBG_TES) != 0; - case TGSI_PROCESSOR_GEOMETRY: + case PIPE_SHADER_GEOMETRY: return (rscreen->debug_flags & DBG_GS) != 0; - case TGSI_PROCESSOR_FRAGMENT: + case PIPE_SHADER_FRAGMENT: return (rscreen->debug_flags & DBG_PS) != 0; - case TGSI_PROCESSOR_COMPUTE: + case PIPE_SHADER_COMPUTE: return (rscreen->debug_flags & DBG_CS) != 0; default: return false; } } +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor) +{ + return (rscreen->debug_flags & DBG_CHECK_IR) || + r600_can_dump_shader(rscreen, processor); +} + void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer) + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher) { struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context; pipe_mutex_lock(rscreen->aux_context_lock); - rctx->clear_buffer(&rctx->b, dst, offset, size, value, is_framebuffer); + rctx->clear_buffer(&rctx->b, dst, offset, size, value, coher); rscreen->aux_context->flush(rscreen->aux_context, NULL, 0); pipe_mutex_unlock(rscreen->aux_context_lock); } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h index 29db1cc4e..86772c0af 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h @@ -39,31 +39,22 @@ #include "util/u_blitter.h" #include "util/list.h" #include "util/u_range.h" -#include "util/u_slab.h" +#include "util/slab.h" #include "util/u_suballoc.h" #include "util/u_transfer.h" +#define ATI_VENDOR_ID 0x1002 + #define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) - -#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3) -#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4) -#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5) -#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6) -#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define R600_QUERY_GPU_LOAD 
(PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) -#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) -#define R600_CONTEXT_PRIVATE_FLAG (1u << 1) +/* Pipeline & streamout query controls. */ +#define R600_CONTEXT_START_PIPELINE_STATS (1u << 1) +#define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2) +#define R600_CONTEXT_PRIVATE_FLAG (1u << 3) /* special primitive types */ #define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX @@ -71,10 +62,10 @@ /* Debug flags. */ /* logging */ #define DBG_TEX (1 << 0) -#define DBG_TEXMIP (1 << 1) +/* gap - reuse */ #define DBG_COMPUTE (1 << 2) #define DBG_VM (1 << 3) -#define DBG_TRACE_CS (1 << 4) +/* gap - reuse */ /* shader logging */ #define DBG_FS (1 << 5) #define DBG_VS (1 << 6) @@ -86,6 +77,10 @@ #define DBG_NO_IR (1 << 12) #define DBG_NO_TGSI (1 << 13) #define DBG_NO_ASM (1 << 14) +#define DBG_PREOPT_IR (1 << 15) +#define DBG_CHECK_IR (1 << 16) +/* gaps */ +#define DBG_TEST_DMA (1 << 20) /* Bits 21-31 are reserved for the r600g driver. */ /* features */ #define DBG_NO_ASYNC_DMA (1llu << 32) @@ -98,13 +93,40 @@ #define DBG_PRECOMPILE (1llu << 39) #define DBG_INFO (1llu << 40) #define DBG_NO_WC (1llu << 41) +#define DBG_CHECK_VM (1llu << 42) +#define DBG_NO_DCC (1llu << 43) +#define DBG_NO_DCC_CLEAR (1llu << 44) +#define DBG_NO_RB_PLUS (1llu << 45) +#define DBG_SI_SCHED (1llu << 46) +#define DBG_MONOLITHIC_SHADERS (1llu << 47) +#define DBG_NO_CE (1llu << 48) +#define DBG_UNSAFE_MATH (1llu << 49) +#define DBG_NO_DCC_FB (1llu << 50) #define R600_MAP_BUFFER_ALIGNMENT 64 +#define R600_MAX_VIEWPORTS 16 + +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 + +enum r600_coherency { + R600_COHERENCY_NONE, /* no cache flushes needed */ + R600_COHERENCY_SHADER, + R600_COHERENCY_CB_META, +}; + +#ifdef PIPE_ARCH_BIG_ENDIAN +#define R600_BIG_ENDIAN 1 +#else +#define R600_BIG_ENDIAN 0 +#endif struct r600_common_context; +struct r600_perfcounters; +struct tgsi_shader_info; +struct r600_qbo_state; struct radeon_shader_reloc { - char *name; + char name[32]; uint64_t offset; }; @@ -137,18 +159,31 @@ struct radeon_shader_binary { /** Disassembled shader in a string. */ char *disasm_string; + char *llvm_ir_string; }; +void radeon_shader_binary_init(struct radeon_shader_binary *b); +void radeon_shader_binary_clean(struct radeon_shader_binary *b); + +/* Only 32-bit buffer allocations are supported, gallium doesn't support more + * at the moment. + */ struct r600_resource { struct u_resource b; /* Winsys objects. */ struct pb_buffer *buf; - struct radeon_winsys_cs_handle *cs_buf; uint64_t gpu_address; + /* Memory usage if the buffer placement is optimal. */ + uint64_t vram_usage; + uint64_t gart_usage; - /* Resource state. */ + /* Resource properties. */ + uint64_t bo_size; + unsigned bo_alignment; enum radeon_bo_domain domains; + enum radeon_bo_flag flags; + unsigned bind_history; /* The buffer range which is initialized (with a write transfer, * streamout, DMA, or as a random access target). The rest of @@ -171,6 +206,10 @@ struct r600_resource { * use TC L2. */ bool TC_L2_dirty; + + /* Whether the resource has been exported via resource_get_handle. 
*/ + bool is_shared; + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct r600_transfer { @@ -180,51 +219,107 @@ struct r600_transfer { }; struct r600_fmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; - unsigned pitch; + unsigned pitch_in_pixels; unsigned bank_height; unsigned slice_tile_max; unsigned tile_mode_index; }; struct r600_cmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; unsigned slice_tile_max; unsigned base_address_reg; }; +struct r600_htile_info { + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; + unsigned alignment; +}; + struct r600_texture { struct r600_resource resource; - unsigned size; - unsigned pitch_override; + uint64_t size; + unsigned num_level0_transfers; + enum pipe_format db_render_format; bool is_depth; + bool db_compatible; + bool can_sample_z; + bool can_sample_s; unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */ + unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ struct r600_texture *flushed_depth_texture; - boolean is_flushing_texture; struct radeon_surf surface; /* Colorbuffer compression and fast clear. */ struct r600_fmask_info fmask; struct r600_cmask_info cmask; struct r600_resource *cmask_buffer; + uint64_t dcc_offset; /* 0 = disabled */ unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; + unsigned last_msaa_resolve_target_micro_mode; /* Depth buffer compression and fast clear. */ + struct r600_htile_info htile; struct r600_resource *htile_buffer; + bool tc_compatible_htile; bool depth_cleared; /* if it was cleared at least once */ float depth_clear_value; + bool stencil_cleared; /* if it was cleared at least once */ + uint8_t stencil_clear_value; bool non_disp_tiling; /* R600-Cayman only */ + + /* Whether the texture is a displayable back buffer and needs DCC + * decompression, which is expensive. Therefore, it's enabled only + * if statistics suggest that it will pay off and it's allocated + * separately. It can't be bound as a sampler by apps. Limited to + * target == 2D and last_level == 0. If enabled, dcc_offset contains + * the absolute GPUVM address, not the relative one. + */ + struct r600_resource *dcc_separate_buffer; + /* When DCC is temporarily disabled, the separate buffer is here. */ + struct r600_resource *last_dcc_separate_buffer; + /* We need to track DCC dirtiness, because st/dri usually calls + * flush_resource twice per frame (not a bug) and we don't wanna + * decompress DCC twice. Also, the dirty tracking must be done even + * if DCC isn't used, because it's required by the DCC usage analysis + * for a possible future enablement. + */ + bool separate_dcc_dirty; + /* Statistics gathering for the DCC enablement heuristic. */ + bool dcc_gather_statistics; + /* Estimate of how much this color buffer is written to in units of + * full-screen draws: ps_invocations / (width * height) + * Shader kills, late Z, and blending with trivial discards make it + * inaccurate (we need to count CB updates, not PS invocations). + */ + unsigned ps_draw_ratio; + /* The number of clears since the last DCC usage analysis. */ + unsigned num_slow_clears; + + /* Counter that should be non-zero if the texture is bound to a + * framebuffer. Implemented in radeonsi only. 
+ */ + uint32_t framebuffers_bound; }; struct r600_surface { struct pipe_surface base; + const struct radeon_surf_level *level_info; bool color_initialized; bool depth_initialized; @@ -232,6 +327,8 @@ struct r600_surface { /* Misc. color flags. */ bool alphatest_bypass; bool export_16bpc; + bool color_is_int8; + bool color_is_int10; /* Color registers. */ unsigned cb_color_info; @@ -247,6 +344,10 @@ struct r600_surface { unsigned cb_color_fmask_slice; /* EG and later */ unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */ unsigned cb_color_mask; /* R600 only */ + unsigned spi_shader_col_format; /* SI+, no blending, no alpha-to-coverage. */ + unsigned spi_shader_col_format_alpha; /* SI+, alpha-to-coverage */ + unsigned spi_shader_col_format_blend; /* SI+, blending without alpha. */ + unsigned spi_shader_col_format_blend_alpha; /* SI+, blending with alpha. */ struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */ struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */ @@ -263,13 +364,6 @@ struct r600_surface { unsigned db_htile_surface; unsigned db_htile_data_base; unsigned db_preload_control; /* EG and later */ - unsigned pa_su_poly_offset_db_fmt_cntl; -}; - -struct r600_tiling_info { - unsigned num_channels; - unsigned num_banks; - unsigned group_bytes; }; struct r600_common_screen { @@ -278,20 +372,20 @@ struct r600_common_screen { enum radeon_family family; enum chip_class chip_class; struct radeon_info info; - struct r600_tiling_info tiling_info; uint64_t debug_flags; bool has_cp_dma; bool has_streamout; + struct slab_parent_pool pool_transfers; + + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + /* Auxiliary context. Mainly used to initialize resources. * It must be locked prior to using and flushed before unlocking. */ struct pipe_context *aux_context; pipe_mutex aux_context_lock; - struct r600_resource *trace_bo; - uint32_t *trace_ptr; - unsigned cs_count; - /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -308,7 +402,49 @@ struct r600_common_screen { unsigned gpu_load_counter_idle; volatile unsigned gpu_load_stop_thread; /* bool */ - char renderer_string[64]; + char renderer_string[100]; + + /* Performance counters. */ + struct r600_perfcounters *perfcounters; + + /* If pipe_screen wants to re-emit the framebuffer state of all + * contexts, it should atomically increment this. Each context will + * compare this with its own last known value of the counter before + * drawing and re-emit the framebuffer state accordingly. + */ + unsigned dirty_fb_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. + */ + unsigned compressed_colortex_counter; + + /* Atomically increment this counter when an existing texture's + * backing buffer or tile mode parameters have changed that requires + * recomputation of shader descriptors. + */ + unsigned dirty_tex_descriptor_counter; + + struct { + /* Context flags to set so that all writes from earlier jobs + * in the CP are seen by L2 clients. + */ + unsigned cp_to_L2; + + /* Context flags to set so that all writes from earlier + * compute jobs are seen by L2 clients. 
+ */ + unsigned compute_to_L2; + } barrier_flags; + + void (*query_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); + + void (*apply_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -316,8 +452,7 @@ struct r600_common_screen { struct r600_atom { void (*emit)(struct r600_common_context *ctx, struct r600_atom *state); unsigned num_dw; - unsigned short id; /* used by r600 only */ - bool dirty; + unsigned short id; }; struct r600_so_target { @@ -358,16 +493,40 @@ struct r600_streamout { int num_prims_gen_queries; }; +struct r600_signed_scissor { + int minx; + int miny; + int maxx; + int maxy; +}; + +struct r600_scissors { + struct r600_atom atom; + unsigned dirty_mask; + struct pipe_scissor_state states[R600_MAX_VIEWPORTS]; +}; + +struct r600_viewports { + struct r600_atom atom; + unsigned dirty_mask; + unsigned depth_range_dirty_mask; + struct pipe_viewport_state states[R600_MAX_VIEWPORTS]; + struct r600_signed_scissor as_scissor[R600_MAX_VIEWPORTS]; +}; + struct r600_ring { struct radeon_winsys_cs *cs; - bool flushing; void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); }; -struct r600_rings { - struct r600_ring gfx; - struct r600_ring dma; +/* Saved CS data for debugging features. */ +struct radeon_saved_cs { + uint32_t *ib; + unsigned num_dw; + + struct radeon_bo_list_item *bo_list; + unsigned bo_count; }; struct r600_common_context { @@ -378,13 +537,20 @@ struct r600_common_context { struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; - struct r600_rings rings; + struct r600_ring gfx; + struct r600_ring dma; + struct pipe_fence_handle *last_gfx_fence; + struct pipe_fence_handle *last_sdma_fence; + unsigned num_gfx_cs_flushes; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; + unsigned last_dirty_fb_counter; + unsigned last_compressed_colortex_counter; + unsigned last_dirty_tex_descriptor_counter; struct u_upload_mgr *uploader; - struct u_suballocator *allocator_so_filled_size; - struct util_slab_mempool pool_transfers; + struct u_suballocator *allocator_zeroed_memory; + struct slab_child_pool pool_transfers; /* Current unaccounted memory usage. */ uint64_t vram; @@ -392,38 +558,43 @@ struct r600_common_context { /* States. */ struct r600_streamout streamout; + struct r600_scissors scissors; + struct r600_viewports viewports; + bool scissor_enabled; + bool clip_halfz; + bool vs_writes_viewport_index; + bool vs_disables_clipping_viewport; /* Additional context states. */ unsigned flags; /* flush flags */ /* Queries. */ - /* The list of active queries. Only one query of each type can be active. */ + /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; - /* Keep track of non-timer queries, because they should be suspended - * during context flushing. - * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits, - * but they should be suspended between IBs. */ - struct list_head active_nontimer_queries; - struct list_head active_timer_queries; - unsigned num_cs_dw_nontimer_queries_suspend; - unsigned num_cs_dw_timer_queries_suspend; - /* If queries have been suspended. */ - bool queries_suspended_for_flush; + int num_perfect_occlusion_queries; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; /* Additional hardware info. 
*/ unsigned backend_mask; unsigned max_db; /* for OQ */ /* Misc stats. */ unsigned num_draw_calls; + unsigned num_spill_draw_calls; + unsigned num_compute_calls; + unsigned num_spill_compute_calls; + unsigned num_dma_calls; + unsigned num_vs_flushes; + unsigned num_ps_flushes; + unsigned num_cs_flushes; + uint64_t num_alloc_tex_transfer_bytes; + unsigned last_tex_ps_draw_ratio; /* for query */ /* Render condition. */ - struct pipe_query *current_render_cond; - unsigned current_render_cond_mode; - boolean current_render_cond_cond; - boolean predicate_drawing; - /* For context flushing. */ - struct pipe_query *saved_render_cond; - boolean saved_render_cond_cond; - unsigned saved_render_cond_mode; + struct r600_atom render_cond_atom; + struct pipe_query *render_cond; + unsigned render_cond_mode; + bool render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ /* MSAA sample locations. * The first index is the sample index. @@ -434,10 +605,29 @@ struct r600_common_context { float sample_locations_8x[8][2]; float sample_locations_16x[16][2]; - /* The list of all texture buffer objects in this context. - * This list is walked when a buffer is invalidated/reallocated and - * the GPU addresses are updated. */ - struct list_head texture_buffers; + /* Statistics gathering for the DCC enablement heuristic. It can't be + * in r600_texture because r600_texture can be shared by multiple + * contexts. This is for back buffers only. We shouldn't get too many + * of those. + * + * X11 DRI3 rotates among a finite set of back buffers. They should + * all fit in this array. If they don't, separate DCC might never be + * enabled by DCC stat gathering. + */ + struct { + struct r600_texture *tex; + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ + struct pipe_query *ps_stats[3]; + /* If all slots are used and another slot is needed, + * the least recently used slot is evicted based on this. */ + int64_t last_use_timestamp; + bool query_active; + } dcc_stats[5]; + + struct pipe_debug_callback debug; + struct pipe_device_reset_callback device_reset_callback; + + void *query_result_shader; /* Copy one resource to another using async DMA. */ void (*dma_copy)(struct pipe_context *ctx, @@ -449,8 +639,8 @@ struct r600_common_context { const struct pipe_box *src_box); void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer); + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher); void (*blit_decompress_depth)(struct pipe_context *ctx, struct r600_texture *texture, @@ -459,6 +649,9 @@ struct r600_common_context { unsigned first_layer, unsigned last_layer, unsigned first_sample, unsigned last_sample); + void (*decompress_dcc)(struct pipe_context *ctx, + struct r600_texture *rtex); + /* Reallocate the buffer and update all resource bindings where * the buffer is bound, including all resource descriptors. */ void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf); @@ -466,34 +659,58 @@ struct r600_common_context { /* Enable or disable occlusion queries. */ void (*set_occlusion_query_state)(struct pipe_context *ctx, bool enable); + void (*save_qbo_state)(struct pipe_context *ctx, struct r600_qbo_state *st); + /* This ensures there is enough space in the command stream. 
*/ void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw, bool include_draw_vbo); void (*set_atom_dirty)(struct r600_common_context *ctx, struct r600_atom *atom, bool dirty); + + void (*check_vm_faults)(struct r600_common_context *ctx, + struct radeon_saved_cs *saved, + enum ring_type ring); }; /* r600_buffer.c */ -boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx, - struct radeon_winsys_cs_handle *buf, - enum radeon_bo_usage usage); +bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx, + struct pb_buffer *buf, + enum radeon_bo_usage usage); void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, struct r600_resource *resource, unsigned usage); -bool r600_init_resource(struct r600_common_screen *rscreen, - struct r600_resource *res, - unsigned size, unsigned alignment, - bool use_reusable_pool); +void r600_buffer_subdata(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned usage, unsigned offset, + unsigned size, const void *data); +void r600_init_resource_fields(struct r600_common_screen *rscreen, + struct r600_resource *res, + uint64_t size, unsigned alignment); +bool r600_alloc_resource(struct r600_common_screen *rscreen, + struct r600_resource *res); struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ, unsigned alignment); +struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen, + unsigned bind, + unsigned usage, + unsigned size, + unsigned alignment); struct pipe_resource * r600_buffer_from_user_memory(struct pipe_screen *screen, const struct pipe_resource *templ, void *user_memory); +void +r600_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource); /* r600_common_pipe.c */ +void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf, + uint64_t va, uint32_t old_value, uint32_t new_value); +unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen); +void r600_gfx_wait_fence(struct r600_common_context *ctx, + uint64_t va, uint32_t ref, uint32_t mask); void r600_draw_rectangle(struct blitter_context *blitter, int x1, int y1, int x2, int y2, float depth, enum blitter_attrib_type type, @@ -504,30 +721,40 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen); void r600_preflush_suspend_features(struct r600_common_context *ctx); void r600_postflush_resume_features(struct r600_common_context *ctx); bool r600_common_context_init(struct r600_common_context *rctx, - struct r600_common_screen *rscreen); + struct r600_common_screen *rscreen, + unsigned context_flags); void r600_common_context_cleanup(struct r600_common_context *rctx); -void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r); bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens); + unsigned processor); +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, + unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer); + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher); struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ); const char *r600_get_llvm_processor_name(enum radeon_family family); -void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw); +void r600_need_dma_space(struct 
r600_common_context *ctx, unsigned num_dw, + struct r600_resource *dst, struct r600_resource *src); +void r600_dma_emit_wait_idle(struct r600_common_context *rctx); +void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, + struct radeon_saved_cs *saved); +void radeon_clear_saved_cs(struct radeon_saved_cs *saved); +bool r600_check_device_reset(struct r600_common_context *rctx); /* r600_gpu_load.c */ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen); uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen); unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); +/* r600_perfcounters.c */ +void r600_perfcounters_destroy(struct r600_common_screen *rscreen); + /* r600_query.c */ +void r600_init_screen_query_functions(struct r600_common_screen *rscreen); void r600_query_init(struct r600_common_context *rctx); -void r600_suspend_nontimer_queries(struct r600_common_context *ctx); -void r600_resume_nontimer_queries(struct r600_common_context *ctx); -void r600_suspend_timer_queries(struct r600_common_context *ctx); -void r600_resume_timer_queries(struct r600_common_context *ctx); +void r600_suspend_queries(struct r600_common_context *ctx); +void r600_resume_queries(struct r600_common_context *ctx); void r600_query_init_backend_mask(struct r600_common_context *ctx); /* r600_streamout.c */ @@ -541,7 +768,17 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx, unsigned type, int diff); void r600_streamout_init(struct r600_common_context *rctx); +/* r600_test_dma.c */ +void r600_test_dma(struct r600_common_screen *rscreen); + /* r600_texture.c */ +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box); void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, struct r600_texture *rtex, unsigned nr_samples, @@ -552,21 +789,48 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, bool r600_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture, struct r600_texture **staging); +void r600_print_texture_info(struct r600_texture *rtex, FILE *f); struct pipe_resource *r600_texture_create(struct pipe_screen *screen, const struct pipe_resource *templ); +bool vi_dcc_formats_compatible(enum pipe_format format1, + enum pipe_format format2); +void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, + struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format); struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, unsigned width, unsigned height); -unsigned r600_translate_colorswap(enum pipe_format format); +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap); +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_dcc_clear_level(struct r600_common_context *rctx, + struct r600_texture *rtex, + unsigned level, unsigned clear_value); void evergreen_do_fast_color_clear(struct r600_common_context *rctx, struct pipe_framebuffer_state *fb, struct r600_atom *fb_state, - unsigned *buffers, + unsigned 
*buffers, unsigned *dirty_cbufs, const union pipe_color_union *color); +bool r600_texture_disable_dcc(struct r600_common_context *rctx, + struct r600_texture *rtex); void r600_init_screen_texture_functions(struct r600_common_screen *rscreen); void r600_init_context_texture_functions(struct r600_common_context *rctx); +/* r600_viewport.c */ +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor); +void r600_viewport_set_rast_deps(struct r600_common_context *rctx, + bool scissor_enable, bool clip_halfz); +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info); +void r600_init_viewport_functions(struct r600_common_context *rctx); + /* cayman_msaa.c */ extern const uint32_t eg_sample_locs_2x[4]; extern const unsigned eg_max_dist_2x; @@ -577,7 +841,8 @@ void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count, void cayman_init_msaa(struct pipe_context *ctx); void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples); void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, - int ps_iter_samples, int overrast_samples); + int ps_iter_samples, int overrast_samples, + unsigned sc_mode_cntl_1); /* Inline helpers. */ @@ -594,13 +859,57 @@ r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) (struct pipe_resource *)res); } +static inline void +r600_texture_reference(struct r600_texture **ptr, struct r600_texture *res) +{ + pipe_resource_reference((struct pipe_resource **)ptr, &res->resource.b.b); +} + +static inline void +r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_resource *res = (struct r600_resource *)r; + + if (res) { + /* Add memory usage for need_gfx_cs_space */ + rctx->vram += res->vram_usage; + rctx->gtt += res->gart_usage; + } +} + +static inline bool r600_get_strmout_en(struct r600_common_context *rctx) +{ + return rctx->streamout.streamout_enabled || + rctx->streamout.prims_gen_query_enabled; +} + +#define SQ_TEX_XY_FILTER_POINT 0x00 +#define SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define SQ_TEX_XY_FILTER_ANISO_POINT 0x02 +#define SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03 + +static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso) +{ + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_BILINEAR + : SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? 
SQ_TEX_XY_FILTER_ANISO_POINT + : SQ_TEX_XY_FILTER_POINT; +} + static inline unsigned r600_tex_aniso_filter(unsigned filter) { - if (filter <= 1) return 0; - if (filter <= 2) return 1; - if (filter <= 4) return 2; - if (filter <= 8) return 3; - /* else */ return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static inline unsigned r600_wavefront_size(enum radeon_family family) @@ -623,19 +932,38 @@ static inline unsigned r600_wavefront_size(enum radeon_family family) } } +static inline enum radeon_bo_priority +r600_get_sampler_view_priority(struct r600_resource *res) +{ + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; + + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + + return RADEON_PRIO_SAMPLER_TEXTURE; +} + +static inline bool +r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler) +{ + return (stencil_sampler && tex->can_sample_s) || + (!stencil_sampler && tex->can_sample_z); +} + #define COMPUTE_DBG(rscreen, fmt, args...) \ do { \ if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ } while (0); #define R600_ERR(fmt, args...) \ - fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args) + fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) /* For MSAA sample positions. */ #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ - (((s0x) & 0xf) | (((s0y) & 0xf) << 4) | \ - (((s1x) & 0xf) << 8) | (((s1y) & 0xf) << 12) | \ - (((s2x) & 0xf) << 16) | (((s2y) & 0xf) << 20) | \ - (((s3x) & 0xf) << 24) | (((s3y) & 0xf) << 28)) + (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \ + (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ + (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ + (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) #endif diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.c b/lib/mesa/src/gallium/drivers/radeon/r600_query.c index 65339bbb6..4b6767dd3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.c @@ -22,81 +22,317 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "r600_query.h" #include "r600_cs.h" #include "util/u_memory.h" +#include "util/u_upload_mgr.h" +#include "tgsi/tgsi_text.h" -struct r600_query_buffer { - /* The buffer where query results are stored. */ - struct r600_resource *buf; - /* Offset of the next free result after current query data */ - unsigned results_end; - /* If a query buffer is full, a new buffer is created and the old one - * is put in here. When we calculate the result, we sum up the samples - * from all buffers. */ - struct r600_query_buffer *previous; +struct r600_hw_query_params { + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; }; -struct r600_query { - /* The query buffer and how many results are in it. */ - struct r600_query_buffer buffer; - /* The type of query */ - unsigned type; - /* Size of the result in memory for both begin_query and end_query, - * this can be one or two numbers, or it could even be a size of a structure. */ - unsigned result_size; - /* The number of dwords for begin_query or end_query. */ - unsigned num_cs_dw; - /* linked list of queries */ - struct list_head list; - /* for custom non-GPU queries */ +/* Queries without buffer handling or suspend/resume. 
*/ +struct r600_query_sw { + struct r600_query b; + uint64_t begin_result; uint64_t end_result; /* Fence for GPU_FINISHED. */ struct pipe_fence_handle *fence; - /* For transform feedback: which stream the query is for */ - unsigned stream; }; - -static bool r600_is_timer_query(unsigned type) +static void r600_query_sw_destroy(struct r600_common_context *rctx, + struct r600_query *rquery) { - return type == PIPE_QUERY_TIME_ELAPSED || - type == PIPE_QUERY_TIMESTAMP; + struct pipe_screen *screen = rctx->b.screen; + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + screen->fence_reference(screen, &query->fence, NULL); + FREE(query); } -static bool r600_query_needs_begin(unsigned type) +static enum radeon_value_id winsys_id_from_type(unsigned type) { - return type != PIPE_QUERY_GPU_FINISHED && - type != PIPE_QUERY_TIMESTAMP; + switch (type) { + case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; + case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; + case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; + case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; + case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; + case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES; + case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; + case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; + case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; + case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; + case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; + case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; + case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; + default: unreachable("query type does not correspond to winsys id"); + } } -static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, unsigned type) +static bool r600_query_sw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) { - unsigned j, i, num_results, buf_size = 4096; - uint32_t *results; + struct r600_query_sw *query = (struct r600_query_sw *)rquery; - /* Non-GPU queries. 
*/ - switch (type) { + switch(query->b.type) { case PIPE_QUERY_TIMESTAMP_DISJOINT: case PIPE_QUERY_GPU_FINISHED: + break; case R600_QUERY_DRAW_CALLS: + query->begin_result = rctx->num_draw_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->begin_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->begin_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->begin_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->begin_result = rctx->num_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->begin_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->begin_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->begin_result = rctx->num_cs_flushes; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_GPU_TEMPERATURE: + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->begin_result = 0; + break; case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: + case R600_QUERY_NUM_CTX_FLUSHES: case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } + case R600_QUERY_GPU_LOAD: + query->begin_result = r600_gpu_load_begin(rctx->screen); + break; + case R600_QUERY_NUM_COMPILATIONS: + query->begin_result = p_atomic_read(&rctx->screen->num_compilations); + break; + case R600_QUERY_NUM_SHADERS_CREATED: + query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_begin: bad query type"); + } + + return true; +} + +static bool r600_query_sw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + switch(query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + break; + case PIPE_QUERY_GPU_FINISHED: + rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED); + break; + case R600_QUERY_DRAW_CALLS: + query->end_result = rctx->num_draw_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->end_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->end_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->end_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->end_result = rctx->num_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->end_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->end_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->end_result = rctx->num_cs_flushes; + break; + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_NUM_CTX_FLUSHES: + case 
R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } case R600_QUERY_GPU_LOAD: + query->end_result = r600_gpu_load_end(rctx->screen, + query->begin_result); + query->begin_result = 0; + break; case R600_QUERY_NUM_COMPILATIONS: + query->end_result = p_atomic_read(&rctx->screen->num_compilations); + break; case R600_QUERY_NUM_SHADERS_CREATED: + query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->end_result = rctx->last_tex_ps_draw_ratio; + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_end: bad query type"); + } + + return true; +} + +static bool r600_query_sw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* Convert from cycles per millisecond to cycles per second (Hz). */ + result->timestamp_disjoint.frequency = + (uint64_t)rctx->screen->info.clock_crystal_freq * 1000; + result->timestamp_disjoint.disjoint = false; + return true; + case PIPE_QUERY_GPU_FINISHED: { + struct pipe_screen *screen = rctx->b.screen; + result->b = screen->fence_finish(screen, &rctx->b, query->fence, + wait ? PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + + case R600_QUERY_GPIN_ASIC_ID: + result->u32 = 0; + return true; + case R600_QUERY_GPIN_NUM_SIMD: + result->u32 = rctx->screen->info.num_good_compute_units; + return true; + case R600_QUERY_GPIN_NUM_RB: + result->u32 = rctx->screen->info.num_render_backends; + return true; + case R600_QUERY_GPIN_NUM_SPI: + result->u32 = 1; /* all supported chips have one SPI per SE */ + return true; + case R600_QUERY_GPIN_NUM_SE: + result->u32 = rctx->screen->info.max_se; + return true; + } + + result->u64 = query->end_result - query->begin_result; + + switch (query->b.type) { + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_GPU_TEMPERATURE: + result->u64 /= 1000; + break; + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + result->u64 *= 1000000; + break; + } + + return true; +} + + +static struct r600_query_ops sw_query_ops = { + .destroy = r600_query_sw_destroy, + .begin = r600_query_sw_begin, + .end = r600_query_sw_end, + .get_result = r600_query_sw_get_result, + .get_result_resource = NULL +}; + +static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx, + unsigned query_type) +{ + struct r600_query_sw *query; + + query = CALLOC_STRUCT(r600_query_sw); + if (!query) return NULL; + + query->b.type = query_type; + query->b.ops = &sw_query_ops; + + return (struct pipe_query *)query; +} + +void r600_query_hw_destroy(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *prev = query->buffer.previous; + + /* Release all query buffers. 
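+	 *
+	 * Results accumulate in a singly-linked chain that grows at the
+	 * head: the newest buffer sits in query->buffer and older, full
+	 * ones hang off ->previous (a sketch of the layout built up by the
+	 * emit path below):
+	 *
+	 *	query->buffer --> previous --> previous --> NULL
+	 *	   (newest)        (older)      (oldest)
+	 *
+	 * The loop that follows simply walks that chain and drops each
+	 * buffer reference.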
*/ + while (prev) { + struct r600_query_buffer *qbuf = prev; + prev = prev->previous; + r600_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); } + r600_resource_reference(&query->buffer.buf, NULL); + FREE(rquery); +} + +static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + unsigned buf_size = MAX2(query->result_size, + ctx->screen->info.min_alloc_size); + /* Queries are normally read by the CPU after * being written by the gpu, hence staging is probably a good * usage pattern. @@ -104,15 +340,37 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c struct r600_resource *buf = (struct r600_resource*) pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buf_size); + if (!buf) + return NULL; - switch (type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); - memset(results, 0, buf_size); + if (!query->ops->prepare_buffer(ctx, query, buf)) { + r600_resource_reference(&buf, NULL); + return NULL; + } + + return buf; +} + +static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer) +{ + /* Callers ensure that the buffer is currently unused by the GPU. */ + uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + if (!results) + return false; + + memset(results, 0, buffer->b.b.width0); + + if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { + unsigned num_results; + unsigned i, j; /* Set top bits for unused backends. */ - num_results = buf_size / (16 * ctx->max_db); + num_results = buffer->b.b.width0 / query->result_size; for (j = 0; j < num_results; j++) { for (i = 0; i < ctx->max_db; i++) { if (!(ctx->backend_mask & (1<<i))) { @@ -122,22 +380,118 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c } results += 4 * ctx->max_db; } + } + + return true; +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); + +static struct r600_query_ops query_hw_ops = { + .destroy = r600_query_hw_destroy, + .begin = r600_query_hw_begin, + .end = r600_query_hw_end, + .get_result = r600_query_hw_get_result, + .get_result_resource = r600_query_hw_get_result_resource, +}; + +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_add_result(struct r600_common_context *ctx, + struct r600_query_hw *, void *buffer, + union pipe_query_result *result); +static void r600_query_hw_clear_result(struct r600_query_hw *, + union pipe_query_result *); + +static struct r600_query_hw_ops query_hw_default_hw_ops = { + .prepare_buffer = r600_query_hw_prepare_buffer, + .emit_start = r600_query_hw_do_emit_start, + .emit_stop = r600_query_hw_do_emit_stop, + .clear_result = r600_query_hw_clear_result, + .add_result = r600_query_hw_add_result, +}; + +bool r600_query_hw_init(struct r600_common_context *rctx, + struct r600_query_hw *query) +{ + 
query->buffer.buf = r600_new_query_buffer(rctx, query); + if (!query->buffer.buf) + return false; + + return true; +} + +static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, + unsigned query_type, + unsigned index) +{ + struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &query_hw_ops; + query->ops = &query_hw_default_hw_ops; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + query->result_size = 16 * rctx->max_db; + query->result_size += 16; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen); break; case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 24; + query->num_cs_dw_begin = 8; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + break; case PIPE_QUERY_TIMESTAMP: + query->result_size = 16; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + query->flags = R600_QUERY_HW_FLAG_NO_START; break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6; + query->stream = index; + break; case PIPE_QUERY_PIPELINE_STATISTICS: - results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); - memset(results, 0, buf_size); + /* 11 values on EG, 8 on R600. */ + query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16; + query->result_size += 8; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen); break; default: assert(0); + FREE(query); + return NULL; } - return buf; + + if (!r600_query_hw_init(rctx, query)) { + FREE(query); + return NULL; + } + + return (struct pipe_query *)query; } static void r600_update_occlusion_query_state(struct r600_common_context *rctx, @@ -146,20 +500,28 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE) { bool old_enable = rctx->num_occlusion_queries != 0; - bool enable; + bool old_perfect_enable = + rctx->num_perfect_occlusion_queries != 0; + bool enable, perfect_enable; rctx->num_occlusion_queries += diff; assert(rctx->num_occlusion_queries >= 0); + if (type == PIPE_QUERY_OCCLUSION_COUNTER) { + rctx->num_perfect_occlusion_queries += diff; + assert(rctx->num_perfect_occlusion_queries >= 0); + } + enable = rctx->num_occlusion_queries != 0; + perfect_enable = rctx->num_perfect_occlusion_queries != 0; - if (enable != old_enable) { + if (enable != old_enable || perfect_enable != old_perfect_enable) { rctx->set_occlusion_query_state(&rctx->b, enable); } } } -static unsigned event_type_for_stream(struct r600_query *query) +static unsigned event_type_for_stream(struct r600_query_hw *query) { switch (query->stream) { default: @@ -170,28 +532,14 @@ static unsigned event_type_for_stream(struct r600_query *query) } } -static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint64_t va; - - 
r600_update_occlusion_query_state(ctx, query->type, 1); - r600_update_prims_generated_query_state(ctx, query->type, 1); - ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw * 2, TRUE); - - /* Get a new query buffer if needed. */ - if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { - struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); - *qbuf = query->buffer; - query->buffer.buf = r600_new_query_buffer(ctx, query->type); - query->buffer.results_end = 0; - query->buffer.previous = qbuf; - } + struct radeon_winsys_cs *cs = ctx->gfx.cs; - /* emit begin query */ - va = query->buffer.buf->gpu_address + query->buffer.results_end; - - switch (query->type) { + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); @@ -210,7 +558,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q break; case PIPE_QUERY_TIME_ELAPSED: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); radeon_emit(cs, va); radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); radeon_emit(cs, 0); @@ -225,226 +573,210 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); - - if (r600_is_timer_query(query->type)) - ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw; - else - ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw; + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); } -static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) +static void r600_query_hw_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; uint64_t va; - /* The queries which need begin already called this in begin_query. */ - if (!r600_query_needs_begin(query->type)) { - ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw, FALSE); + if (!query->buffer.buf) + return; // previous buffer allocation failure + + r600_update_occlusion_query_state(ctx, query->b.type, 1); + r600_update_prims_generated_query_state(ctx, query->b.type, 1); + + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end, + true); + + /* Get a new query buffer if needed. 
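+	 * The buffer counts as full once the next result would overrun it,
+	 * i.e. roughly:
+	 *
+	 *	if (results_end + result_size > width0)
+	 *		push the current buffer onto ->previous and allocate
+	 *		a fresh one;
+	 *
+	 * (a restatement of the check below, not an additional code path).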
*/ + if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { + struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); + *qbuf = query->buffer; + query->buffer.results_end = 0; + query->buffer.previous = qbuf; + query->buffer.buf = r600_new_query_buffer(ctx, query); + if (!query->buffer.buf) + return; } - va = query->buffer.buf->gpu_address; + /* emit begin query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; - /* emit end query */ - switch (query->type) { + query->ops->emit_start(ctx, query, query->buffer.buf, va); + + ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end; +} + +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + uint64_t fence_va = 0; + + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - va += query->buffer.results_end + 8; + va += 8; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); + + fence_va = va + ctx->max_db * 16 - 8; break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - va += query->buffer.results_end + query->result_size/2; + va += query->result_size/2; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); break; case PIPE_QUERY_TIME_ELAPSED: - va += query->buffer.results_end + query->result_size/2; + va += 8; /* fall through */ case PIPE_QUERY_TIMESTAMP: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); radeon_emit(cs, va); radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); radeon_emit(cs, 0); radeon_emit(cs, 0); + + fence_va = va + 8; break; - case PIPE_QUERY_PIPELINE_STATISTICS: - va += query->buffer.results_end + query->result_size/2; + case PIPE_QUERY_PIPELINE_STATISTICS: { + unsigned sample_size = (query->result_size - 8) / 2; + + va += sample_size; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); + + fence_va = va + sample_size; break; + } default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); - query->buffer.results_end += query->result_size; + if (fence_va) + r600_gfx_write_fence(ctx, query->buffer.buf, fence_va, 0, 0x80000000); +} - if (r600_query_needs_begin(query->type)) { - if (r600_is_timer_query(query->type)) - ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw; - else - ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw; +static void r600_query_hw_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + uint64_t va; + + if (!query->buffer.buf) + return; // previous buffer allocation failure + + /* The queries which need begin already called this in begin_query. 
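+	 * As of this rework, PIPE_QUERY_TIMESTAMP is the only type that
+	 * r600_query_hw_create marks with R600_QUERY_HW_FLAG_NO_START, so
+	 * only it reserves its CS space here rather than in
+	 * r600_query_hw_emit_start, which NO_START queries never go through.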
*/ + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false); } - r600_update_occlusion_query_state(ctx, query->type, -1); - r600_update_prims_generated_query_state(ctx, query->type, -1); -} + /* emit end query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; -static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query, - int operation, bool flag_wait) -{ - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint32_t op = PRED_OP(operation); + query->ops->emit_stop(ctx, query, query->buffer.buf, va); - /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (ctx->current_render_cond_cond) - op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */ - else - op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */ + query->buffer.results_end += query->result_size; - if (operation == PREDICATION_OP_CLEAR) { - ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end; - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, 0); - radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR)); - } else { - struct r600_query_buffer *qbuf; - unsigned count; - /* Find how many results there are. */ - count = 0; - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - count += qbuf->results_end / query->result_size; - } - - ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL); - radeon_emit(cs, op | (((va + results_base) >> 32UL) & 0xFF)); - r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, - RADEON_PRIO_MIN); - results_base += query->result_size; - - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } - } - } + r600_update_occlusion_query_state(ctx, query->b.type, -1); + r600_update_prims_generated_query_state(ctx, query->b.type, -1); } -static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +static void r600_emit_query_predication(struct r600_common_context *ctx, + struct r600_atom *atom) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *query; - bool skip_allocation = false; + struct radeon_winsys_cs *cs = ctx->gfx.cs; + struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond; + struct r600_query_buffer *qbuf; + uint32_t op; + bool flag_wait; - query = CALLOC_STRUCT(r600_query); - if (query == NULL) - return NULL; + if (!query) + return; - query->type = query_type; + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - switch (query_type) { + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - query->result_size = 16 * rctx->max_db; - query->num_cs_dw = 6; - break; - break; - case PIPE_QUERY_TIME_ELAPSED: - query->result_size = 16; - query->num_cs_dw = 8; - break; - case PIPE_QUERY_TIMESTAMP: - query->result_size = 8; - query->num_cs_dw = 8; + op = 
PRED_OP(PREDICATION_OP_ZPASS); break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32; - query->num_cs_dw = 6; - query->stream = index; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - /* 11 values on EG, 8 on R600. */ - query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16; - query->num_cs_dw = 6; - break; - /* Non-GPU queries and queries not requiring a buffer. */ - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - case R600_QUERY_DRAW_CALLS: - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - case R600_QUERY_GPU_LOAD: - case R600_QUERY_NUM_COMPILATIONS: - case R600_QUERY_NUM_SHADERS_CREATED: - skip_allocation = true; + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); break; default: assert(0); - FREE(query); - return NULL; + return; } - if (!skip_allocation) { - query->buffer.buf = r600_new_query_buffer(rctx, query_type); - if (!query->buffer.buf) { - FREE(query); - return NULL; + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (ctx->render_cond_invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */ + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va + results_base); + radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); + r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); + results_base += query->result_size; + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; } } - return (struct pipe_query*)query; } -static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) { - struct r600_query *rquery = (struct r600_query*)query; - struct r600_query_buffer *prev = rquery->buffer.previous; + struct r600_common_context *rctx = (struct r600_common_context *)ctx; - /* Release all query buffers. 
*/ - while (prev) { - struct r600_query_buffer *qbuf = prev; - prev = prev->previous; - pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL); - FREE(qbuf); - } + if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || + query_type == PIPE_QUERY_GPU_FINISHED || + query_type >= PIPE_QUERY_DRIVER_SPECIFIC) + return r600_query_sw_create(ctx, query_type); - pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); - FREE(query); + return r600_query_hw_create(rctx, query_type, index); +} + +static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->destroy(rctx, rquery); } static boolean r600_begin_query(struct pipe_context *ctx, @@ -452,139 +784,141 @@ static boolean r600_begin_query(struct pipe_context *ctx, { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; - struct r600_query_buffer *prev = rquery->buffer.previous; - if (!r600_query_needs_begin(rquery->type)) { - assert(0); - return false; - } + return rquery->ops->begin(rctx, rquery); +} - /* Non-GPU queries. */ - switch (rquery->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - return true; - case R600_QUERY_DRAW_CALLS: - rquery->begin_result = rctx->num_draw_calls; - return true; - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - rquery->begin_result = 0; - return true; - case R600_QUERY_BUFFER_WAIT_TIME: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; - return true; - case R600_QUERY_NUM_CS_FLUSHES: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); - return true; - case R600_QUERY_NUM_BYTES_MOVED: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED); - return true; - case R600_QUERY_GPU_LOAD: - rquery->begin_result = r600_gpu_load_begin(rctx->screen); - return true; - case R600_QUERY_NUM_COMPILATIONS: - rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations); - return true; - case R600_QUERY_NUM_SHADERS_CREATED: - rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); - return true; - } +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query) +{ + struct r600_query_buffer *prev = query->buffer.previous; /* Discard the old query buffers. */ while (prev) { struct r600_query_buffer *qbuf = prev; prev = prev->previous; - pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL); + r600_resource_reference(&qbuf->buf, NULL); FREE(qbuf); } + query->buffer.results_end = 0; + query->buffer.previous = NULL; + /* Obtain a new buffer if the current one can't be mapped without a stall. 
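+	 * "Can't be mapped without a stall" means the GPU still references
+	 * the buffer in the current CS, or a zero-timeout buffer_wait fails
+	 * because the buffer is busy; reallocating is then cheaper than
+	 * blocking. An idle buffer is simply reused after prepare_buffer()
+	 * re-clears it.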
*/ - if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { - pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); - rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type); + if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { + r600_resource_reference(&query->buffer.buf, NULL); + query->buffer.buf = r600_new_query_buffer(rctx, query); + } else { + if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf)) + r600_resource_reference(&query->buffer.buf, NULL); } +} + +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; - rquery->buffer.results_end = 0; - rquery->buffer.previous = NULL; + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + assert(0); + return false; + } - r600_emit_query_begin(rctx, rquery); + if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES)) + r600_query_hw_reset_buffers(rctx, query); - if (r600_is_timer_query(rquery->type)) - LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries); - else - LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries); - return true; + r600_query_hw_emit_start(rctx, query); + if (!query->buffer.buf) + return false; + + LIST_ADDTAIL(&query->list, &rctx->active_queries); + return true; } -static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) +static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; - /* Non-GPU queries. 
*/ - switch (rquery->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - return; - case PIPE_QUERY_GPU_FINISHED: - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence); - return; - case R600_QUERY_DRAW_CALLS: - rquery->end_result = rctx->num_draw_calls; - return; - case R600_QUERY_REQUESTED_VRAM: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_VRAM_MEMORY); - return; - case R600_QUERY_REQUESTED_GTT: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY); - return; - case R600_QUERY_BUFFER_WAIT_TIME: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; - return; - case R600_QUERY_NUM_CS_FLUSHES: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); - return; - case R600_QUERY_NUM_BYTES_MOVED: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED); - return; - case R600_QUERY_VRAM_USAGE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_VRAM_USAGE); - return; - case R600_QUERY_GTT_USAGE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GTT_USAGE); - return; - case R600_QUERY_GPU_TEMPERATURE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GPU_TEMPERATURE) / 1000; - return; - case R600_QUERY_CURRENT_GPU_SCLK: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_SCLK) * 1000000; - return; - case R600_QUERY_CURRENT_GPU_MCLK: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_MCLK) * 1000000; - return; - case R600_QUERY_GPU_LOAD: - rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result); - return; - case R600_QUERY_NUM_COMPILATIONS: - rquery->end_result = p_atomic_read(&rctx->screen->num_compilations); - return; - case R600_QUERY_NUM_SHADERS_CREATED: - rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created); - return; - } + return rquery->ops->end(rctx, rquery); +} - r600_emit_query_end(rctx, rquery); +bool r600_query_hw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + + if (query->flags & R600_QUERY_HW_FLAG_NO_START) + r600_query_hw_reset_buffers(rctx, query); + + r600_query_hw_emit_stop(rctx, query); + + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + LIST_DELINIT(&query->list); + + if (!query->buffer.buf) + return false; + + return true; +} + +static void r600_get_hw_query_params(struct r600_common_context *rctx, + struct r600_query_hw *rquery, int index, + struct r600_hw_query_params *params) +{ + params->pair_stride = 0; + params->pair_count = 1; - if (r600_query_needs_begin(rquery->type)) - LIST_DELINIT(&rquery->list); + switch (rquery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = rctx->max_db * 16; + params->pair_stride = 16; + params->pair_count = rctx->max_db; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case 
PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + { + /* Offsets apply to EG+ */ + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("r600_get_hw_query_params unsupported"); + } } -static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index, +static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index, bool test_status_bit) { uint32_t *current_result = (uint32_t*)map; @@ -602,84 +936,34 @@ static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned return 0; } -static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, - struct r600_query *query, - struct r600_query_buffer *qbuf, - boolean wait, - union pipe_query_result *result) +static void r600_query_hw_add_result(struct r600_common_context *ctx, + struct r600_query_hw *query, + void *buffer, + union pipe_query_result *result) { - struct pipe_screen *screen = ctx->b.screen; - unsigned results_base = 0; - char *map; - - /* Non-GPU queries. */ - switch (query->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* Convert from cycles per millisecond to cycles per second (Hz). */ - result->timestamp_disjoint.frequency = - (uint64_t)ctx->screen->info.r600_clock_crystal_freq * 1000; - result->timestamp_disjoint.disjoint = FALSE; - return TRUE; - case PIPE_QUERY_GPU_FINISHED: - result->b = screen->fence_finish(screen, query->fence, - wait ? PIPE_TIMEOUT_INFINITE : 0); - return result->b; - case R600_QUERY_DRAW_CALLS: - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - case R600_QUERY_NUM_COMPILATIONS: - case R600_QUERY_NUM_SHADERS_CREATED: - result->u64 = query->end_result - query->begin_result; - return TRUE; - case R600_QUERY_GPU_LOAD: - result->u64 = query->end_result; - return TRUE; - } - - map = r600_buffer_map_sync_with_rings(ctx, qbuf->buf, - PIPE_TRANSFER_READ | - (wait ? 
0 : PIPE_TRANSFER_DONTBLOCK)); - if (!map) - return FALSE; - - /* count all results across all data blocks */ - switch (query->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - while (results_base != qbuf->results_end) { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->u64 += - r600_query_read_result(map + results_base, 0, 2, true); - results_base += 16; + r600_query_read_result(buffer + results_base, 0, 2, true); } break; - case PIPE_QUERY_OCCLUSION_PREDICATE: - while (results_base != qbuf->results_end) { + } + case PIPE_QUERY_OCCLUSION_PREDICATE: { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->b = result->b || - r600_query_read_result(map + results_base, 0, 2, true) != 0; - results_base += 16; + r600_query_read_result(buffer + results_base, 0, 2, true) != 0; } break; + } case PIPE_QUERY_TIME_ELAPSED: - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 0, 2, false); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 0, 2, false); break; case PIPE_QUERY_TIMESTAMP: - { - uint32_t *current_result = (uint32_t*)map; - result->u64 = (uint64_t)current_result[0] | - (uint64_t)current_result[1] << 32; + result->u64 = *(uint64_t*)buffer; break; - } case PIPE_QUERY_PRIMITIVES_EMITTED: /* SAMPLE_STREAMOUTSTATS stores this structure: * { @@ -687,84 +971,64 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, * u64 PrimitiveStorageNeeded; * } * We only need NumPrimitivesWritten here. */ - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 2, 6, true); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 2, 6, true); break; case PIPE_QUERY_PRIMITIVES_GENERATED: /* Here we read PrimitiveStorageNeeded. 
*/ - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_SO_STATISTICS: - while (results_base != qbuf->results_end) { - result->so_statistics.num_primitives_written += - r600_query_read_result(map + results_base, 2, 6, true); - result->so_statistics.primitives_storage_needed += - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->so_statistics.num_primitives_written += + r600_query_read_result(buffer, 2, 6, true); + result->so_statistics.primitives_storage_needed += + r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - while (results_base != qbuf->results_end) { - result->b = result->b || - r600_query_read_result(map + results_base, 2, 6, true) != - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->b = result->b || + r600_query_read_result(buffer, 2, 6, true) != + r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_PIPELINE_STATISTICS: if (ctx->chip_class >= EVERGREEN) { - while (results_base != qbuf->results_end) { - result->pipeline_statistics.ps_invocations += - r600_query_read_result(map + results_base, 0, 22, false); - result->pipeline_statistics.c_primitives += - r600_query_read_result(map + results_base, 2, 24, false); - result->pipeline_statistics.c_invocations += - r600_query_read_result(map + results_base, 4, 26, false); - result->pipeline_statistics.vs_invocations += - r600_query_read_result(map + results_base, 6, 28, false); - result->pipeline_statistics.gs_invocations += - r600_query_read_result(map + results_base, 8, 30, false); - result->pipeline_statistics.gs_primitives += - r600_query_read_result(map + results_base, 10, 32, false); - result->pipeline_statistics.ia_primitives += - r600_query_read_result(map + results_base, 12, 34, false); - result->pipeline_statistics.ia_vertices += - r600_query_read_result(map + results_base, 14, 36, false); - result->pipeline_statistics.hs_invocations += - r600_query_read_result(map + results_base, 16, 38, false); - result->pipeline_statistics.ds_invocations += - r600_query_read_result(map + results_base, 18, 40, false); - result->pipeline_statistics.cs_invocations += - r600_query_read_result(map + results_base, 20, 42, false); - results_base += query->result_size; - } + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 22, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 24, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 26, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 28, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 30, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 32, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 34, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 36, false); + result->pipeline_statistics.hs_invocations += + r600_query_read_result(buffer, 16, 38, false); + result->pipeline_statistics.ds_invocations += + r600_query_read_result(buffer, 18, 40, false); + result->pipeline_statistics.cs_invocations += + 
r600_query_read_result(buffer, 20, 42, false); } else { - while (results_base != qbuf->results_end) { - result->pipeline_statistics.ps_invocations += - r600_query_read_result(map + results_base, 0, 16, false); - result->pipeline_statistics.c_primitives += - r600_query_read_result(map + results_base, 2, 18, false); - result->pipeline_statistics.c_invocations += - r600_query_read_result(map + results_base, 4, 20, false); - result->pipeline_statistics.vs_invocations += - r600_query_read_result(map + results_base, 6, 22, false); - result->pipeline_statistics.gs_invocations += - r600_query_read_result(map + results_base, 8, 24, false); - result->pipeline_statistics.gs_primitives += - r600_query_read_result(map + results_base, 10, 26, false); - result->pipeline_statistics.ia_primitives += - r600_query_read_result(map + results_base, 12, 28, false); - result->pipeline_statistics.ia_vertices += - r600_query_read_result(map + results_base, 14, 30, false); - results_base += query->result_size; - } + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 16, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 18, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 20, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 22, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 24, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 26, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 28, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 30, false); } #if 0 /* for testing */ printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, " @@ -786,118 +1050,482 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, default: assert(0); } - - return TRUE; } static boolean r600_get_query_result(struct pipe_context *ctx, - struct pipe_query *query, - boolean wait, union pipe_query_result *result) + struct pipe_query *query, boolean wait, + union pipe_query_result *result) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; + + return rquery->ops->get_result(rctx, rquery, wait, result); +} + +static void r600_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + boolean wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index, + resource, offset); +} + +static void r600_query_hw_clear_result(struct r600_query_hw *query, + union pipe_query_result *result) +{ + util_query_clear_result(result, query->b.type); +} + +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, union pipe_query_result *result) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; struct r600_query_buffer *qbuf; - util_query_clear_result(result, rquery->type); + query->ops->clear_result(query, result); + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + void *map; - for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) { - if 
(!r600_get_query_buffer_result(rctx, rquery, qbuf, wait, result)) { - return FALSE; + map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, + PIPE_TRANSFER_READ | + (wait ? 0 : PIPE_TRANSFER_DONTBLOCK)); + if (!map) + return false; + + while (results_base != qbuf->results_end) { + query->ops->add_result(rctx, query, map + results_base, + result); + results_base += query->result_size; } } /* Convert the time to expected units. */ if (rquery->type == PIPE_QUERY_TIME_ELAPSED || rquery->type == PIPE_QUERY_TIMESTAMP) { - result->u64 = (1000000 * result->u64) / rctx->screen->info.r600_clock_crystal_freq; + result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq; } - return TRUE; + return true; } -static void r600_render_condition(struct pipe_context *ctx, - struct pipe_query *query, - boolean condition, - uint mode) +/* Create the compute shader that is used to collect the results. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = end_offset + * 0.y = result_stride + * 0.z = result_count + * 0.w = bit field: + * 1: read previously accumulated values + * 2: write accumulated values for chaining + * 4: write result available + * 8: convert result to boolean (0/1) + * 16: only read one dword and use that as result + * 32: apply timestamp conversion + * 64: store full 64 bits result + * 128: store signed 32 bits result + * 1.x = fence_offset + * 1.y = pair_stride + * 1.z = pair_count + * + * BUFFER[0] = query result buffer + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + */ +static void r600_create_query_result_shader(struct r600_common_context *rctx) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *rquery = (struct r600_query *)query; - bool wait_flag = false; - - rctx->current_render_cond = query; - rctx->current_render_cond_cond = condition; - rctx->current_render_cond_mode = mode; + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + + "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. 
*/ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. */ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. 
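+	 * The conversion baked in here mirrors the CPU path in
+	 * r600_query_hw_get_result:
+	 *
+	 *	ns = ticks * 1000000 / clock_crystal_freq	(freq in kHz)
+	 *
+	 * which is what IMM[3] = {1000000, 0, freq, 0} feeds into the
+	 * U64MUL/U64DIV pair above.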
+ */ + snprintf(text, sizeof(text), text_tmpl, + rctx->screen->info.clock_crystal_freq); - if (query == NULL) { - if (rctx->predicate_drawing) { - rctx->predicate_drawing = false; - r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false); - } + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); return; } - if (mode == PIPE_RENDER_COND_WAIT || - mode == PIPE_RENDER_COND_BY_REGION_WAIT) { - wait_flag = true; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); +} + +static void r600_restore_qbo_state(struct r600_common_context *rctx, + struct r600_qbo_state *st) +{ + rctx->b.bind_compute_state(&rctx->b, st->saved_compute); + + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *qbuf; + struct r600_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct r600_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct r600_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!rctx->query_result_shader) { + r600_create_query_result_shader(rctx); + if (!rctx->query_result_shader) + return; } - rctx->predicate_drawing = true; + if (query->buffer.previous) { + u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, + &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } - switch (rquery->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag); + rctx->save_qbo_state(&rctx->b, &saved_state); + + r600_get_hw_query_params(rctx, query, index >= 0 ? 
index : 0, &params);
+	consts.end_offset = params.end_offset - params.start_offset;
+	consts.fence_offset = params.fence_offset - params.start_offset;
+	consts.result_stride = query->result_size;
+	consts.pair_stride = params.pair_stride;
+	consts.pair_count = params.pair_count;
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
+
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	consts.config = 0;
+	if (index < 0)
+		consts.config |= 4;
+	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+	    query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
+		consts.config |= 8;
+	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
+		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
+		consts.config |= 32;
+
+	switch (result_type) {
+	case PIPE_QUERY_TYPE_U64:
+	case PIPE_QUERY_TYPE_I64:
+		consts.config |= 64;
 		break;
-	case PIPE_QUERY_PRIMITIVES_EMITTED:
-	case PIPE_QUERY_PRIMITIVES_GENERATED:
-	case PIPE_QUERY_SO_STATISTICS:
-	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-		r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+	case PIPE_QUERY_TYPE_I32:
+		consts.config |= 128;
+		break;
+	case PIPE_QUERY_TYPE_U32:
 		break;
-	default:
-		assert(0);
 	}
-}
-static void r600_suspend_queries(struct r600_common_context *ctx,
-				 struct list_head *query_list,
-				 unsigned *num_cs_dw_queries_suspend)
-{
-	struct r600_query *query;
+	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
+
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+			qbuf_prev = qbuf->previous;
+			consts.result_count = qbuf->results_end / query->result_size;
+			consts.config &= ~3;
+			if (qbuf != &query->buffer)
+				consts.config |= 1;
+			if (qbuf->previous)
+				consts.config |= 2;
+		} else {
+			/* Only read the last timestamp. */
+			qbuf_prev = NULL;
+			consts.result_count = 0;
+			consts.config |= 16;
+			params.start_offset += qbuf->results_end - query->result_size;
+		}
-	LIST_FOR_EACH_ENTRY(query, query_list, list) {
-		r600_emit_query_end(ctx, query);
+		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = params.start_offset;
+		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+		if (!qbuf->previous) {
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+
+			((struct r600_resource *)resource)->TC_L2_dirty = true;
+		}
+
+		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
+
+		if (wait && qbuf == &query->buffer) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP.
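+			 * The stop path wrote 0x80000000 into the fence
+			 * dword via r600_gfx_write_fence, so readiness here
+			 * is effectively the test
+			 *
+			 *	(*fence & 0x80000000) == 0x80000000
+			 *
+			 * evaluated at fence_offset of the last result.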
+ */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000); + } + + rctx->b.launch_grid(&rctx->b, &grid); + rctx->flags |= rctx->screen->barrier_flags.compute_to_L2; } - assert(*num_cs_dw_queries_suspend == 0); + + r600_restore_qbo_state(rctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } -void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +static void r600_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + boolean condition, + uint mode) { - r600_suspend_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query_hw *rquery = (struct r600_query_hw *)query; + struct r600_query_buffer *qbuf; + struct r600_atom *atom = &rctx->render_cond_atom; + + rctx->render_cond = query; + rctx->render_cond_invert = condition; + rctx->render_cond_mode = mode; + + /* Compute the size of SET_PREDICATION packets. */ + atom->num_dw = 0; + if (query) { + for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) + atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; + } + + rctx->set_atom_dirty(rctx, atom, query != NULL); } -void r600_suspend_timer_queries(struct r600_common_context *ctx) +void r600_suspend_queries(struct r600_common_context *ctx) { - r600_suspend_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); + struct r600_query_hw *query; + + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_stop(ctx, query); + } + assert(ctx->num_cs_dw_queries_suspend == 0); } static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, struct list_head *query_list) { - struct r600_query *query; + struct r600_query_hw *query; unsigned num_dw = 0; LIST_FOR_EACH_ENTRY(query, query_list, list) { /* begin + end */ - num_dw += query->num_cs_dw * 2; + num_dw += query->num_cs_dw_begin + query->num_cs_dw_end; /* Workaround for the fact that * num_cs_dw_nontimer_queries_suspend is incremented for every * resumed query, which raises the bar in need_cs_space for * queries about to be resumed. */ - num_dw += query->num_cs_dw; + num_dw += query->num_cs_dw_end; } /* primitives generated query */ num_dw += ctx->streamout.enable_atom.num_dw; @@ -907,48 +1535,34 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context * return num_dw; } -static void r600_resume_queries(struct r600_common_context *ctx, - struct list_head *query_list, - unsigned *num_cs_dw_queries_suspend) +void r600_resume_queries(struct r600_common_context *ctx) { - struct r600_query *query; - unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list); + struct r600_query_hw *query; + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries); - assert(*num_cs_dw_queries_suspend == 0); + assert(ctx->num_cs_dw_queries_suspend == 0); /* Check CS space here. Resuming must not be interrupted by flushes. 
*/ - ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE); + ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true); - LIST_FOR_EACH_ENTRY(query, query_list, list) { - r600_emit_query_begin(ctx, query); + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_start(ctx, query); } } -void r600_resume_nontimer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); -} - -void r600_resume_timer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); -} - /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; struct r600_resource *buffer; uint32_t *results; - unsigned num_backends = ctx->screen->info.r600_num_backends; + unsigned num_backends = ctx->screen->info.num_render_backends; unsigned i, mask = 0; /* if backend_map query is supported by the kernel */ - if (ctx->screen->info.r600_backend_map_valid) { - unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes; - unsigned backend_map = ctx->screen->info.r600_backend_map; + if (ctx->screen->info.r600_gb_backend_map_valid) { + unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes; + unsigned backend_map = ctx->screen->info.r600_gb_backend_map; unsigned item_width, item_mask; if (ctx->chip_class >= EVERGREEN) { @@ -959,7 +1573,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) item_mask = 0x3; } - while(num_tile_pipes--) { + while (num_tile_pipes--) { i = backend_map & item_mask; mask |= (1<<i); backend_map >>= item_width; @@ -990,7 +1604,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) radeon_emit(cs, buffer->gpu_address); radeon_emit(cs, buffer->gpu_address >> 32); - r600_emit_reloc(ctx, &ctx->rings.gfx, buffer, RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + r600_emit_reloc(ctx, &ctx->gfx, buffer, + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); /* analyze results */ results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ); @@ -1003,7 +1618,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) } } - pipe_resource_reference((struct pipe_resource**)&buffer, NULL); + r600_resource_reference(&buffer, NULL); if (mask != 0) { ctx->backend_mask = mask; @@ -1016,17 +1631,167 @@ err: return; } +#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ + { \ + .name = name_, \ + .query_type = R600_QUERY_##query_type_, \ + .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ + .group_id = group_id_ \ + } + +#define X(name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) + +#define XG(group_, name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_) + +static struct pipe_driver_query_info r600_driver_query_list[] = { + X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), + X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), + X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), + X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), + X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + 
X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), + X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), + X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), + X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE), + X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), + X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), + + /* GPIN queries are for the benefit of old versions of GPUPerfStudio, + * which use it as a fallback path to detect the GPU type. + * + * Note: The names of these queries are significant for GPUPerfStudio + * (and possibly their order as well). */ + XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), + XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), + XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), + XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), + XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. */ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), + X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), + X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), +}; + +#undef X +#undef XG +#undef XFULL + +static unsigned r600_get_num_queries(struct r600_common_screen *rscreen) +{ + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) + return ARRAY_SIZE(r600_driver_query_list); + else if (rscreen->info.drm_major == 3) + return ARRAY_SIZE(r600_driver_query_list) - 3; + else + return ARRAY_SIZE(r600_driver_query_list) - 4; +} + +static int r600_get_driver_query_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned num_queries = r600_get_num_queries(rscreen); + + if (!info) { + unsigned num_perfcounters = + r600_get_perfcounter_info(rscreen, 0, NULL); + + return num_queries + num_perfcounters; + } + + if (index >= num_queries) + return r600_get_perfcounter_info(rscreen, index - num_queries, info); + + *info = r600_driver_query_list[index]; + + switch (info->query_type) { + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_MAPPED_VRAM: + info->max_value.u64 = rscreen->info.vram_size; + break; + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_MAPPED_GTT: + info->max_value.u64 = rscreen->info.gart_size; + break; + case R600_QUERY_GPU_TEMPERATURE: + info->max_value.u64 = 125; + break; + } + + if (info->group_id != ~(unsigned)0 && rscreen->perfcounters) + info->group_id += rscreen->perfcounters->num_groups; + + return 1; +} + +/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware + * performance counter groups, so be careful when changing this and related + * functions. 
+ */ +static int r600_get_driver_query_group_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + unsigned num_pc_groups = 0; + + if (rscreen->perfcounters) + num_pc_groups = rscreen->perfcounters->num_groups; + + if (!info) + return num_pc_groups + R600_NUM_SW_QUERY_GROUPS; + + if (index < num_pc_groups) + return r600_get_perfcounter_group_info(rscreen, index, info); + + index -= num_pc_groups; + if (index >= R600_NUM_SW_QUERY_GROUPS) + return 0; + + info->name = "GPIN"; + info->max_active_queries = 5; + info->num_queries = 5; + return 1; +} + void r600_query_init(struct r600_common_context *rctx) { rctx->b.create_query = r600_create_query; + rctx->b.create_batch_query = r600_create_batch_query; rctx->b.destroy_query = r600_destroy_query; rctx->b.begin_query = r600_begin_query; rctx->b.end_query = r600_end_query; rctx->b.get_query_result = r600_get_query_result; + rctx->b.get_query_result_resource = r600_get_query_result_resource; + rctx->render_cond_atom.emit = r600_emit_query_predication; - if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0) + if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0) rctx->b.render_condition = r600_render_condition; - LIST_INITHEAD(&rctx->active_nontimer_queries); - LIST_INITHEAD(&rctx->active_timer_queries); + LIST_INITHEAD(&rctx->active_queries); +} + +void r600_init_screen_query_functions(struct r600_common_screen *rscreen) +{ + rscreen->b.get_driver_query_info = r600_get_driver_query_info; + rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info; } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.h b/lib/mesa/src/gallium/drivers/radeon/r600_query.h index 8b2c4e3fe..14c433d91 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.h @@ -29,10 +29,12 @@ #define R600_QUERY_H #include "pipe/p_defines.h" +#include "pipe/p_state.h" #include "util/list.h" struct pipe_context; struct pipe_query; +struct pipe_resource; struct r600_common_context; struct r600_common_screen; @@ -40,26 +42,40 @@ struct r600_query; struct r600_query_hw; struct r600_resource; -#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3) -#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4) -#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5) -#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6) -#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) -#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) -#define R600_QUERY_GPIN_ASIC_ID (PIPE_QUERY_DRIVER_SPECIFIC + 14) -#define R600_QUERY_GPIN_NUM_SIMD (PIPE_QUERY_DRIVER_SPECIFIC + 15) -#define R600_QUERY_GPIN_NUM_RB (PIPE_QUERY_DRIVER_SPECIFIC + 16) -#define R600_QUERY_GPIN_NUM_SPI (PIPE_QUERY_DRIVER_SPECIFIC + 17) -#define R600_QUERY_GPIN_NUM_SE 
(PIPE_QUERY_DRIVER_SPECIFIC + 18) -#define R600_QUERY_FIRST_PERFCOUNTER (PIPE_QUERY_DRIVER_SPECIFIC + 100) +enum { + R600_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, + R600_QUERY_SPILL_DRAW_CALLS, + R600_QUERY_COMPUTE_CALLS, + R600_QUERY_SPILL_COMPUTE_CALLS, + R600_QUERY_DMA_CALLS, + R600_QUERY_NUM_VS_FLUSHES, + R600_QUERY_NUM_PS_FLUSHES, + R600_QUERY_NUM_CS_FLUSHES, + R600_QUERY_REQUESTED_VRAM, + R600_QUERY_REQUESTED_GTT, + R600_QUERY_MAPPED_VRAM, + R600_QUERY_MAPPED_GTT, + R600_QUERY_BUFFER_WAIT_TIME, + R600_QUERY_NUM_CTX_FLUSHES, + R600_QUERY_NUM_BYTES_MOVED, + R600_QUERY_NUM_EVICTIONS, + R600_QUERY_VRAM_USAGE, + R600_QUERY_GTT_USAGE, + R600_QUERY_GPU_TEMPERATURE, + R600_QUERY_CURRENT_GPU_SCLK, + R600_QUERY_CURRENT_GPU_MCLK, + R600_QUERY_GPU_LOAD, + R600_QUERY_NUM_COMPILATIONS, + R600_QUERY_NUM_SHADERS_CREATED, + R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO, + R600_QUERY_GPIN_ASIC_ID, + R600_QUERY_GPIN_NUM_SIMD, + R600_QUERY_GPIN_NUM_RB, + R600_QUERY_GPIN_NUM_SPI, + R600_QUERY_GPIN_NUM_SE, + + R600_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, +}; enum { R600_QUERY_GROUP_GPIN = 0, @@ -68,11 +84,17 @@ enum { struct r600_query_ops { void (*destroy)(struct r600_common_context *, struct r600_query *); - boolean (*begin)(struct r600_common_context *, struct r600_query *); - void (*end)(struct r600_common_context *, struct r600_query *); - boolean (*get_result)(struct r600_common_context *, - struct r600_query *, boolean wait, - union pipe_query_result *result); + bool (*begin)(struct r600_common_context *, struct r600_query *); + bool (*end)(struct r600_common_context *, struct r600_query *); + bool (*get_result)(struct r600_common_context *, + struct r600_query *, bool wait, + union pipe_query_result *result); + void (*get_result_resource)(struct r600_common_context *, + struct r600_query *, bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); }; struct r600_query { @@ -84,12 +106,13 @@ struct r600_query { enum { R600_QUERY_HW_FLAG_NO_START = (1 << 0), - R600_QUERY_HW_FLAG_TIMER = (1 << 1), - R600_QUERY_HW_FLAG_PREDICATE = (1 << 2), + /* gap */ + /* whether begin_query doesn't clear the result */ + R600_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), }; struct r600_query_hw_ops { - void (*prepare_buffer)(struct r600_common_context *, + bool (*prepare_buffer)(struct r600_common_context *, struct r600_query_hw *, struct r600_resource *); void (*emit_start)(struct r600_common_context *, @@ -134,18 +157,18 @@ struct r600_query_hw { unsigned stream; }; -boolean r600_query_hw_init(struct r600_common_context *rctx, - struct r600_query_hw *query); +bool r600_query_hw_init(struct r600_common_context *rctx, + struct r600_query_hw *query); void r600_query_hw_destroy(struct r600_common_context *rctx, struct r600_query *rquery); -boolean r600_query_hw_begin(struct r600_common_context *rctx, - struct r600_query *rquery); -void r600_query_hw_end(struct r600_common_context *rctx, +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery); +bool r600_query_hw_end(struct r600_common_context *rctx, struct r600_query *rquery); -boolean r600_query_hw_get_result(struct r600_common_context *rctx, - struct r600_query *rquery, - boolean wait, - union pipe_query_result *result); +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result); /* Performance counters */ enum { @@ -227,8 +250,8 @@ struct r600_perfcounters { void 
(*cleanup)(struct r600_common_screen *); - boolean separate_se; - boolean separate_instance; + bool separate_se; + bool separate_instance; }; struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, @@ -242,12 +265,20 @@ int r600_get_perfcounter_group_info(struct r600_common_screen *, unsigned index, struct pipe_driver_query_group_info *info); -boolean r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks); +bool r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks); void r600_perfcounters_add_block(struct r600_common_screen *, struct r600_perfcounters *, const char *name, unsigned flags, unsigned counters, unsigned selectors, unsigned instances, void *data); void r600_perfcounters_do_destroy(struct r600_perfcounters *); +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query); + +struct r600_qbo_state { + void *saved_compute; + struct pipe_constant_buffer saved_const0; + struct pipe_shader_buffer saved_ssbo[3]; +}; #endif /* R600_QUERY_H */ diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c index 0853f636a..b5296aa56 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c @@ -46,7 +46,7 @@ r600_create_so_target(struct pipe_context *ctx, return NULL; } - u_suballocator_alloc(rctx->allocator_so_filled_size, 4, + u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4, &t->buf_filled_size_offset, (struct pipe_resource**)&t->buf_filled_size); if (!t->buf_filled_size) { @@ -70,7 +70,7 @@ static void r600_so_target_destroy(struct pipe_context *ctx, { struct r600_so_target *t = (struct r600_so_target*)target; pipe_resource_reference(&t->b.buffer, NULL); - pipe_resource_reference((struct pipe_resource**)&t->buf_filled_size, NULL); + r600_resource_reference(&t->buf_filled_size, NULL); FREE(t); } @@ -116,7 +116,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, { struct r600_common_context *rctx = (struct r600_common_context *)ctx; unsigned i; - unsigned append_bitmask = 0; + unsigned enabled_mask = 0, append_bitmask = 0; /* Stop streamout. */ if (rctx->streamout.num_targets && rctx->streamout.begin_emitted) { @@ -126,18 +126,19 @@ void r600_set_streamout_targets(struct pipe_context *ctx, /* Set the new targets. */ for (i = 0; i < num_targets; i++) { pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], targets[i]); + if (!targets[i]) + continue; + r600_context_add_resource_size(ctx, targets[i]->buffer); + enabled_mask |= 1 << i; if (offsets[i] == ((unsigned)-1)) - append_bitmask |= 1 << i; + append_bitmask |= 1 << i; } for (; i < rctx->streamout.num_targets; i++) { pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], NULL); } - rctx->streamout.enabled_mask = (num_targets >= 1 && targets[0] ? 1 : 0) | - (num_targets >= 2 && targets[1] ? 2 : 0) | - (num_targets >= 3 && targets[2] ? 4 : 0) | - (num_targets >= 4 && targets[3] ? 
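The loop introduced above replaces the hardcoded four-target expression being removed here; condensed, the mask building is simply (mirrors the code, shown for clarity):

unsigned enabled_mask = 0, append_bitmask = 0;

for (unsigned i = 0; i < num_targets; i++) {
	if (!targets[i])
		continue;
	enabled_mask |= 1u << i;           /* one bit per bound target */
	if (offsets[i] == (unsigned)-1)
		append_bitmask |= 1u << i; /* -1 means append to the filled size */
}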
8 : 0); + rctx->streamout.enabled_mask = enabled_mask; rctx->streamout.num_targets = num_targets; rctx->streamout.append_bitmask = append_bitmask; @@ -152,7 +153,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, static void r600_flush_vgt_streamout(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; unsigned reg_strmout_cntl; /* The register is at different places on different ASICs. */ @@ -165,9 +166,9 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) } if (rctx->chip_class >= CIK) { - cik_write_uconfig_reg(cs, reg_strmout_cntl, 0); + radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); } else { - r600_write_config_reg(cs, reg_strmout_cntl, 0); + radeon_set_config_reg(cs, reg_strmout_cntl, 0); } radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -184,7 +185,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; @@ -201,7 +202,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r /* SI binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader * through SGPRs what to do. */ - r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ @@ -210,14 +211,14 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); - r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), - RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); /* R7xx requires this packet after updating BUFFER_BASE. * Without this, R7xx locks up. */ @@ -226,8 +227,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, i); radeon_emit(cs, va >> 8); - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), - RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); } } @@ -244,8 +245,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, va); /* src address lo */ radeon_emit(cs, va >> 32); /* src address hi */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, - RADEON_USAGE_READ, RADEON_PRIO_MIN); + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } else { /* Start from the beginning. 
*/ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); @@ -267,7 +268,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r void r600_emit_streamout_end(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned i; uint64_t va; @@ -288,14 +289,14 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, - RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); /* Zero the buffer size. The counters (primitives generated, * primitives emitted) may be enabled even if there is no * buffer bound. This ensures that the primitives-emitted query * won't increment. */ - r600_write_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); t[i]->buf_filled_size_valid = true; } @@ -311,12 +312,6 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) * are no buffers bound. */ -static bool r600_get_strmout_en(struct r600_common_context *rctx) -{ - return rctx->streamout.streamout_enabled || - rctx->streamout.prims_gen_query_enabled; -} - static void r600_emit_streamout_enable(struct r600_common_context *rctx, struct r600_atom *atom) { @@ -336,8 +331,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx, S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); } - r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); - r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c index e9bd4a21f..27035c0fa 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c @@ -26,12 +26,82 @@ */ #include "r600_pipe_common.h" #include "r600_cs.h" +#include "r600_query.h" #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_pack_color.h" +#include "util/u_surface.h" +#include "os/os_time.h" #include <errno.h> #include <inttypes.h> +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex); +static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ); + + +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (!rctx->dma.cs) + return false; + + if (util_format_get_blocksizebits(rdst->resource.b.b.format) != + util_format_get_blocksizebits(rsrc->resource.b.b.format)) + return false; + + /* MSAA: Blits don't exist in the real world. 
*/ + if (rsrc->resource.b.b.nr_samples > 1 || + rdst->resource.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (rsrc->is_depth || rdst->is_depth) + return false; + + /* DCC as: + * src: Use the 3D path. DCC decompression is expensive. + * dst: Use the 3D path to compress the pixels with DCC. + */ + if ((rsrc->dcc_offset && rsrc->surface.level[src_level].dcc_enabled) || + (rdst->dcc_offset && rdst->surface.level[dst_level].dcc_enabled)) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (rdst->cmask.size && rdst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&rdst->resource.b.b, dst_level, + dstx, dsty, dstz, src_box->width, + src_box->height, src_box->depth)) + return false; + + r600_texture_discard_cmask(rctx->screen, rdst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (rsrc->cmask.size && rsrc->dirty_level_mask & (1 << src_level)) + rctx->b.flush_resource(&rctx->b, &rsrc->resource.b.b); + + assert(!(rsrc->dirty_level_mask & (1 << src_level))); + assert(!(rdst->dirty_level_mask & (1 << dst_level))); + + return true; +} + /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */ static void r600_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst, @@ -122,7 +192,8 @@ static int r600_init_surface(struct r600_common_screen *rscreen, struct radeon_surf *surface, const struct pipe_resource *ptex, unsigned array_mode, - bool is_flushed_depth) + bool is_flushed_depth, + bool tc_compatible_htile) { const struct util_format_description *desc = util_format_description(ptex->format); @@ -169,8 +240,9 @@ static int r600_init_surface(struct r600_common_screen *rscreen, surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE); surface->array_size = ptex->array_size; break; - case PIPE_TEXTURE_2D_ARRAY: case PIPE_TEXTURE_CUBE_ARRAY: /* cube array layout like 2d array */ + assert(ptex->array_size % 6 == 0); + case PIPE_TEXTURE_2D_ARRAY: surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE); surface->array_size = ptex->array_size; break; @@ -181,29 +253,55 @@ static int r600_init_surface(struct r600_common_screen *rscreen, default: return -EINVAL; } - if (ptex->bind & PIPE_BIND_SCANOUT) { - surface->flags |= RADEON_SURF_SCANOUT; - } if (!is_flushed_depth && is_depth) { surface->flags |= RADEON_SURF_ZBUFFER; + if (tc_compatible_htile && + array_mode == RADEON_SURF_MODE_2D) { + /* TC-compatible HTILE only supports Z32_FLOAT. + * Promote Z16 to Z32. DB->CB copies will convert + * the format for transfers. + */ + surface->bpe = 4; + surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; + } + if (is_stencil) { surface->flags |= RADEON_SURF_SBUFFER | RADEON_SURF_HAS_SBUFFER_MIPTREE; } } + if (rscreen->chip_class >= SI) { surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; } + + if (rscreen->chip_class >= VI && + (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC || + ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT)) + surface->flags |= RADEON_SURF_DISABLE_DCC; + + if (ptex->bind & PIPE_BIND_SCANOUT) { + /* This should catch bugs in gallium users setting incorrect flags. 
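Condensing the gatekeeper above: SDMA handles a copy only when block sizes match, neither surface is MSAA, depth, or DCC-compressed, and a dirty destination CMASK can be discarded because the copy overwrites the whole level. A hedged restatement with the checks reduced to booleans (names are illustrative):

#include <stdbool.h>

static bool sdma_copy_allowed(bool same_blocksize, bool any_msaa,
			      bool any_depth, bool any_dcc,
			      bool dst_cmask_dirty, bool covers_whole_level)
{
	if (!same_blocksize || any_msaa || any_depth || any_dcc)
		return false;
	/* A dirty destination CMASK is acceptable only when the whole
	 * level is overwritten, so the CMASK can simply be dropped. */
	if (dst_cmask_dirty && !covers_whole_level)
		return false;
	return true;
}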
*/ + assert(surface->nsamples == 1 && + surface->array_size == 1 && + surface->npix_z == 1 && + surface->last_level == 0 && + !(surface->flags & RADEON_SURF_Z_OR_SBUFFER)); + + surface->flags |= RADEON_SURF_SCANOUT; + } return 0; } static int r600_setup_surface(struct pipe_screen *screen, struct r600_texture *rtex, - unsigned pitch_in_bytes_override) + unsigned pitch_in_bytes_override, + unsigned offset) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned i; int r; r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface); @@ -220,39 +318,292 @@ static int r600_setup_surface(struct pipe_screen *screen, rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe; rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override; rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y; - if (rtex->surface.flags & RADEON_SURF_SBUFFER) { - rtex->surface.stencil_offset = - rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size; - } + } + + if (offset) { + for (i = 0; i < ARRAY_SIZE(rtex->surface.level); ++i) + rtex->surface.level[i].offset += offset; } return 0; } -static boolean r600_texture_get_handle(struct pipe_screen* screen, - struct pipe_resource *ptex, - struct winsys_handle *whandle) +static void r600_texture_init_metadata(struct r600_texture *rtex, + struct radeon_bo_metadata *metadata) { - struct r600_texture *rtex = (struct r600_texture*)ptex; - struct r600_resource *resource = &rtex->resource; struct radeon_surf *surface = &rtex->surface; + + memset(metadata, 0, sizeof(*metadata)); + metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->pipe_config = surface->pipe_config; + metadata->bankw = surface->bankw; + metadata->bankh = surface->bankh; + metadata->tile_split = surface->tile_split; + metadata->mtilea = surface->mtilea; + metadata->num_banks = surface->num_banks; + metadata->stride = surface->level[0].pitch_bytes; + metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; +} + +static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen) +{ + p_atomic_inc(&rscreen->dirty_fb_counter); +} + +static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + struct pipe_context *ctx = &rctx->b; + + if (ctx == rscreen->aux_context) + pipe_mutex_lock(rscreen->aux_context_lock); + + ctx->flush_resource(ctx, &rtex->resource.b.b); + ctx->flush(ctx, NULL, 0); + + if (ctx == rscreen->aux_context) + pipe_mutex_unlock(rscreen->aux_context_lock); +} + +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!rtex->cmask.size) + return; + + assert(rtex->resource.b.b.nr_samples <= 1); + + /* Disable CMASK. */ + memset(&rtex->cmask, 0, sizeof(rtex->cmask)); + rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8; + rtex->dirty_level_mask = 0; + + if (rscreen->chip_class >= SI) + rtex->cb_color_info &= ~SI_S_028C70_FAST_CLEAR(1); + else + rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1); + + if (rtex->cmask_buffer != &rtex->resource) + r600_resource_reference(&rtex->cmask_buffer, NULL); + + /* Notify all contexts about the change. 
*/ + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static bool r600_can_disable_dcc(struct r600_texture *rtex) +{ + /* We can't disable DCC if it can be written by another process. */ + return rtex->dcc_offset && + (!rtex->resource.is_shared || + !(rtex->resource.external_usage & PIPE_HANDLE_USAGE_WRITE)); +} + +static bool r600_texture_discard_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!r600_can_disable_dcc(rtex)) + return false; + + assert(rtex->dcc_separate_buffer == NULL); + + /* Disable DCC. */ + rtex->dcc_offset = 0; + + /* Notify all contexts about the change. */ + r600_dirty_all_framebuffer_states(rscreen); + return true; +} + +/** + * Disable DCC for the texture (first decompress, then discard metadata). + * + * There is an unresolved multi-context synchronization issue between + * screen::aux_context and the current context. If applications do this with + * multiple contexts, it's already undefined behavior for them and we don't + * have to worry about that. The scenario is: + * + * If context 1 disables DCC and context 2 has queued commands that write + * to the texture via CB with DCC enabled, and the order of operations is + * as follows: + * context 2 queues draw calls rendering to the texture, but doesn't flush + * context 1 disables DCC and flushes + * context 1 & 2 reset descriptors and FB state + * context 2 flushes (new compressed tiles written by the draw calls) + * context 1 & 2 read garbage, because DCC is disabled, yet there are + * compressed tiles + * + * \param rctx the current context if you have one, or rscreen->aux_context + * if you don't. + */ +bool r600_texture_disable_dcc(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + + if (!r600_can_disable_dcc(rtex)) + return false; + + if (&rctx->b == rscreen->aux_context) + pipe_mutex_lock(rscreen->aux_context_lock); + + /* Decompress DCC. */ + rctx->decompress_dcc(&rctx->b, rtex); + rctx->b.flush(&rctx->b, NULL, 0); + + if (&rctx->b == rscreen->aux_context) + pipe_mutex_unlock(rscreen->aux_context_lock); + + return r600_texture_discard_dcc(rscreen, rtex); +} + +static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx, + struct r600_texture *rtex, + bool invalidate_storage) +{ + struct pipe_screen *screen = rctx->b.screen; + struct r600_texture *new_tex; + struct pipe_resource templ = rtex->resource.b.b; + unsigned i; + + templ.bind |= PIPE_BIND_LINEAR; + + /* r600g doesn't react to dirty_tex_descriptor_counter */ + if (rctx->chip_class < SI) + return; + + if (rtex->resource.is_shared || + rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (r600_choose_tiling(rctx->screen, &templ) != + RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + + new_tex = (struct r600_texture*)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. */ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, + u_minify(templ.width0, i), u_minify(templ.height0, i), + util_max_layer(&templ, i) + 1, &box); + + rctx->dma_copy(&rctx->b, &new_tex->resource.b.b, i, 0, 0, 0, + &rtex->resource.b.b, i, &box); + } + } + + r600_texture_discard_cmask(rctx->screen, rtex); + r600_texture_discard_dcc(rctx->screen, rtex); + + /* Replace the structure fields of rtex. 
*/ + rtex->resource.b.b.bind = templ.bind; + pb_reference(&rtex->resource.buf, new_tex->resource.buf); + rtex->resource.gpu_address = new_tex->resource.gpu_address; + rtex->resource.vram_usage = new_tex->resource.vram_usage; + rtex->resource.gart_usage = new_tex->resource.gart_usage; + rtex->resource.bo_size = new_tex->resource.bo_size; + rtex->resource.bo_alignment = new_tex->resource.bo_alignment; + rtex->resource.domains = new_tex->resource.domains; + rtex->resource.flags = new_tex->resource.flags; + rtex->size = new_tex->size; + rtex->surface = new_tex->surface; + rtex->non_disp_tiling = new_tex->non_disp_tiling; + rtex->cb_color_info = new_tex->cb_color_info; + rtex->cmask = new_tex->cmask; /* needed even without CMASK */ + + assert(!rtex->htile_buffer); + assert(!rtex->cmask.size); + assert(!rtex->fmask.size); + assert(!rtex->dcc_offset); + assert(!rtex->is_depth); + + r600_texture_reference(&new_tex, NULL); + + r600_dirty_all_framebuffer_states(rctx->screen); + p_atomic_inc(&rctx->screen->dirty_tex_descriptor_counter); +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_context *ctx, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_common_context *rctx = (struct r600_common_context*) + (ctx ? ctx : rscreen->aux_context); + struct r600_resource *res = (struct r600_resource*)resource; + struct r600_texture *rtex = (struct r600_texture*)resource; + struct radeon_bo_metadata metadata; + bool update_metadata = false; + + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->target != PIPE_BUFFER && + (resource->nr_samples > 1 || rtex->is_depth)) + return false; + + if (resource->target != PIPE_BUFFER) { + /* Since shader image stores don't support DCC on VI, + * disable it for external clients that want write + * access. + */ + if (usage & PIPE_HANDLE_USAGE_WRITE && rtex->dcc_offset) { + if (r600_texture_disable_dcc(rctx, rtex)) + update_metadata = true; + } - rscreen->ws->buffer_set_tiling(resource->buf, - NULL, - surface->level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->pipe_config, - surface->bankw, surface->bankh, - surface->tile_split, - surface->stencil_tile_split, - surface->mtilea, surface->num_banks, - surface->level[0].pitch_bytes, - (surface->flags & RADEON_SURF_SCANOUT) != 0); - - return rscreen->ws->buffer_get_handle(resource->buf, - surface->level[0].pitch_bytes, whandle); + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + (rtex->cmask.size || rtex->dcc_offset)) { + /* Eliminate fast clear (both CMASK and DCC) */ + r600_eliminate_fast_color_clear(rctx, rtex); + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (rtex->cmask.size) + r600_texture_discard_cmask(rscreen, rtex); + } + + /* Set metadata. */ + if (!res->is_shared || update_metadata) { + r600_texture_init_metadata(rtex, &metadata); + if (rscreen->query_opaque_metadata) + rscreen->query_opaque_metadata(rscreen, rtex, + &metadata); + + rscreen->ws->buffer_set_metadata(res->buf, &metadata); + } + } + + if (res->is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. 
+ */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->is_shared = true; + res->external_usage = usage; + } + + return rscreen->ws->buffer_get_handle(res->buf, + rtex->surface.level[0].pitch_bytes, + rtex->surface.level[0].offset, + rtex->surface.level[0].slice_size, + whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -261,14 +612,15 @@ static void r600_texture_destroy(struct pipe_screen *screen, struct r600_texture *rtex = (struct r600_texture*)ptex; struct r600_resource *resource = &rtex->resource; - if (rtex->flushed_depth_texture) - pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL); + r600_texture_reference(&rtex->flushed_depth_texture, NULL); - pipe_resource_reference((struct pipe_resource**)&rtex->htile_buffer, NULL); + r600_resource_reference(&rtex->htile_buffer, NULL); if (rtex->cmask_buffer != &rtex->resource) { - pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); + r600_resource_reference(&rtex->cmask_buffer, NULL); } pb_reference(&resource->buf, NULL); + r600_resource_reference(&rtex->dcc_separate_buffer, NULL); + r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL); FREE(rtex); } @@ -335,7 +687,7 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, out->slice_tile_max -= 1; out->tile_mode_index = fmask.tiling_index[0]; - out->pitch = fmask.level[0].nblk_x; + out->pitch_in_pixels = fmask.level[0].nblk_x; out->bank_height = fmask.bankh; out->alignment = MAX2(256, fmask.bo_alignment); out->size = fmask.bo_size; @@ -347,7 +699,7 @@ static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen, r600_texture_get_fmask_info(rscreen, rtex, rtex->resource.b.b.nr_samples, &rtex->fmask); - rtex->fmask.offset = align(rtex->size, rtex->fmask.alignment); + rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment); rtex->size = rtex->fmask.offset + rtex->fmask.size; } @@ -360,8 +712,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height; unsigned element_bits = 4; unsigned cmask_cache_bits = 1024; - unsigned num_pipes = rscreen->tiling_info.num_channels; - unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes; + unsigned num_pipes = rscreen->info.num_tile_pipes; + unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes; unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements; @@ -379,6 +731,10 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, assert(macro_tile_width % 128 == 0); assert(macro_tile_height % 128 == 0); + out->pitch = pitch_elements; + out->height = height; + out->xalign = macro_tile_width; + out->yalign = macro_tile_height; out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1; out->alignment = MAX2(256, base_align); out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) * @@ -389,8 +745,8 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, struct r600_texture *rtex, struct r600_cmask_info *out) { - unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes; - unsigned num_pipes = rscreen->tiling_info.num_channels; + unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + unsigned num_pipes = 
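One subtlety in r600_texture_get_handle above: PIPE_HANDLE_USAGE_EXPLICIT_FLUSH stays set in external_usage only while every exporter requests it. The merge, restated as a helper (illustrative):

static unsigned merge_external_usage(unsigned current, unsigned usage)
{
	/* Accumulate every flag except EXPLICIT_FLUSH... */
	current |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
	/* ...and drop EXPLICIT_FLUSH as soon as one user omits it. */
	if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
		current &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
	return current;
}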
rscreen->info.num_tile_pipes; unsigned cl_width, cl_height; switch (num_pipes) { @@ -424,6 +780,10 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, /* Each element of CMASK is a nibble. */ unsigned slice_bytes = slice_elements / 2; + out->pitch = width; + out->height = height; + out->xalign = cl_width * 8; + out->yalign = cl_height * 8; out->slice_tile_max = (width * height) / (128*128); if (out->slice_tile_max) out->slice_tile_max -= 1; @@ -442,7 +802,7 @@ static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen, r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask); } - rtex->cmask.offset = align(rtex->size, rtex->cmask.alignment); + rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment); rtex->size = rtex->cmask.offset + rtex->cmask.size; if (rscreen->chip_class >= SI) @@ -466,8 +826,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen } rtex->cmask_buffer = (struct r600_resource *) - pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, rtex->cmask.size); + r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT, + rtex->cmask.size, + rtex->cmask.alignment); if (rtex->cmask_buffer == NULL) { rtex->cmask.size = 0; return; @@ -480,6 +841,8 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen rtex->cb_color_info |= SI_S_028C70_FAST_CLEAR(1); else rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); + + p_atomic_inc(&rscreen->compressed_colortex_counter); } static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, @@ -487,7 +850,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, { unsigned cl_width, cl_height, width, height; unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; - unsigned num_pipes = rscreen->tiling_info.num_channels; + unsigned num_pipes = rscreen->info.num_tile_pipes; if (rscreen->chip_class <= EVERGREEN && rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) @@ -505,6 +868,16 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) return 0; + /* Overalign HTILE on P2 configs to work around GPU hangs in + * piglit/depthstencil-render-miplevels 585. + * + * This has been confirmed to help Kabini & Stoney, where the hangs + * are always reproducible. I think I have seen the test hang + * on Carrizo too, though it was very rare there. 
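A worked example of the CMASK sizing above: one 4-bit element covers an 8x8 pixel block, so a cl-aligned 2048x2048 slice needs (2048 * 2048) / (8 * 8) = 65536 elements, i.e. 32768 bytes before base alignment:

unsigned width = 2048, height = 2048;                  /* already cl-aligned */
unsigned slice_elements = (width * height) / (8 * 8); /* 65536 elements */
unsigned slice_bytes = slice_elements / 2;             /* nibbles: 32768 bytes */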
+ */ + if (rscreen->chip_class >= CIK && num_pipes < 4) + num_pipes = 4; + switch (num_pipes) { case 1: cl_width = 32; @@ -537,9 +910,15 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, slice_elements = (width * height) / (8 * 8); slice_bytes = slice_elements * 4; - pipe_interleave_bytes = rscreen->tiling_info.group_bytes; + pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; + rtex->htile.pitch = width; + rtex->htile.height = height; + rtex->htile.xalign = cl_width * 8; + rtex->htile.yalign = cl_height * 8; + rtex->htile.alignment = base_align; + return (util_max_layer(&rtex->resource.b.b, 0) + 1) * align(slice_bytes, base_align); } @@ -547,21 +926,126 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, static void r600_texture_allocate_htile(struct r600_common_screen *rscreen, struct r600_texture *rtex) { - unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex); + uint64_t htile_size, alignment; + uint32_t clear_value; + + if (rtex->tc_compatible_htile) { + htile_size = rtex->surface.htile_size; + alignment = rtex->surface.htile_alignment; + clear_value = 0x0000030F; + } else { + htile_size = r600_texture_get_htile_size(rscreen, rtex); + alignment = rtex->htile.alignment; + clear_value = 0; + } if (!htile_size) return; rtex->htile_buffer = (struct r600_resource*) - pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, htile_size); + r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + htile_size, alignment); if (rtex->htile_buffer == NULL) { /* this is not a fatal error as we can still keep rendering * without htile buffer */ R600_ERR("Failed to create buffer object for htile buffer.\n"); } else { - r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0, - htile_size, 0, true); + r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, + 0, htile_size, clear_value, + R600_COHERENCY_NONE); + } +} + +void r600_print_texture_info(struct r600_texture *rtex, FILE *f) +{ + int i; + + fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + rtex->surface.npix_x, rtex->surface.npix_y, + rtex->surface.npix_z, rtex->surface.blk_w, + rtex->surface.blk_h, rtex->surface.blk_d, + rtex->surface.array_size, rtex->surface.last_level, + rtex->surface.bpe, rtex->surface.nsamples, + rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format)); + + fprintf(f, " Layout: size=%"PRIu64", alignment=%"PRIu64", bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + rtex->surface.bo_size, rtex->surface.bo_alignment, rtex->surface.bankw, + rtex->surface.bankh, rtex->surface.num_banks, rtex->surface.mtilea, + rtex->surface.tile_split, rtex->surface.pipe_config, + (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (rtex->fmask.size) + fprintf(f, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment, + rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height, + rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index); + + if (rtex->cmask.size) + fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, " + "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n", + rtex->cmask.offset, rtex->cmask.size, 
rtex->cmask.alignment, + rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign, + rtex->cmask.yalign, rtex->cmask.slice_tile_max); + + if (rtex->htile_buffer) + fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, " + "xalign=%u, yalign=%u, TC_compatible = %u\n", + rtex->htile_buffer->b.b.width0, + rtex->htile_buffer->buf->alignment, rtex->htile.pitch, + rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign, + rtex->tc_compatible_htile); + + if (rtex->dcc_offset) { + fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n", + rtex->dcc_offset, rtex->surface.dcc_size, + rtex->surface.dcc_alignment); + for (i = 0; i <= rtex->surface.last_level; i++) + fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", " + "fast_clear_size=%"PRIu64"\n", + i, rtex->surface.level[i].dcc_enabled, + rtex->surface.level[i].dcc_offset, + rtex->surface.level[i].dcc_fast_clear_size); + } + + for (i = 0; i <= rtex->surface.last_level; i++) + fprintf(f, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "nblk_z=%u, pitch_bytes=%u, mode=%u\n", + i, rtex->surface.level[i].offset, + rtex->surface.level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.level[i].nblk_x, + rtex->surface.level[i].nblk_y, + rtex->surface.level[i].nblk_z, + rtex->surface.level[i].pitch_bytes, + rtex->surface.level[i].mode); + + if (rtex->surface.flags & RADEON_SURF_SBUFFER) { + fprintf(f, " StencilLayout: tilesplit=%u\n", + rtex->surface.stencil_tile_split); + for (i = 0; i <= rtex->surface.last_level; i++) { + fprintf(f, " StencilLevel[%i]: offset=%"PRIu64", " + "slice_size=%"PRIu64", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "nblk_z=%u, pitch_bytes=%u, mode=%u\n", + i, rtex->surface.stencil_level[i].offset, + rtex->surface.stencil_level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.stencil_level[i].nblk_x, + rtex->surface.stencil_level[i].nblk_y, + rtex->surface.stencil_level[i].nblk_z, + rtex->surface.stencil_level[i].pitch_bytes, + rtex->surface.stencil_level[i].mode); + } } } @@ -570,6 +1054,7 @@ static struct r600_texture * r600_texture_create_object(struct pipe_screen *screen, const struct pipe_resource *base, unsigned pitch_in_bytes_override, + unsigned offset, struct pb_buffer *buf, struct radeon_surf *surface) { @@ -578,36 +1063,67 @@ r600_texture_create_object(struct pipe_screen *screen, struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; rtex = CALLOC_STRUCT(r600_texture); - if (rtex == NULL) + if (!rtex) return NULL; resource = &rtex->resource; resource->b.b = *base; + resource->b.b.next = NULL; resource->b.vtbl = &r600_texture_vtbl; pipe_reference_init(&resource->b.b.reference, 1); resource->b.b.screen = screen; - rtex->pitch_override = pitch_in_bytes_override; /* don't include stencil-only formats which we don't support for rendering */ rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format)); rtex->surface = *surface; - if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) { + if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) { FREE(rtex); return NULL; } + rtex->tc_compatible_htile = rtex->surface.htile_size != 0; + assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) == + 
rtex->tc_compatible_htile); + + /* TC-compatible HTILE only supports Z32_FLOAT. */ + if (rtex->tc_compatible_htile) + rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + else + rtex->db_render_format = base->format; + /* Tiled depth textures utilize the non-displayable tile order. * This must be done after r600_setup_surface. * Applies to R600-Cayman. */ rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D; + /* Applies to GCN. */ + rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode; + + /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers + * between frames, so the only thing that can enable separate DCC + * with DRI2 is multiple slow clears within a frame. + */ + rtex->ps_draw_ratio = 0; if (rtex->is_depth) { + if (base->flags & (R600_RESOURCE_FLAG_TRANSFER | + R600_RESOURCE_FLAG_FLUSHED_DEPTH) || + rscreen->chip_class >= EVERGREEN) { + rtex->can_sample_z = !rtex->surface.depth_adjusted; + rtex->can_sample_s = !rtex->surface.stencil_adjusted; + } else { + if (rtex->resource.b.b.nr_samples <= 1 && + (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || + rtex->resource.b.b.format == PIPE_FORMAT_Z32_FLOAT)) + rtex->can_sample_z = true; + } + if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER | - R600_RESOURCE_FLAG_FLUSHED_DEPTH)) && - !(rscreen->debug_flags & DBG_NO_HYPERZ)) { + R600_RESOURCE_FLAG_FLUSHED_DEPTH))) { + rtex->db_compatible = true; - r600_texture_allocate_htile(rscreen, rtex); + if (!(rscreen->debug_flags & DBG_NO_HYPERZ)) + r600_texture_allocate_htile(rscreen, rtex); } } else { if (base->nr_samples > 1) { @@ -621,27 +1137,56 @@ r600_texture_create_object(struct pipe_screen *screen, return NULL; } } + + /* Shared textures must always set up DCC here. + * If it's not present, it will be disabled by + * apply_opaque_metadata later. + */ + if (rtex->surface.dcc_size && + (buf || !(rscreen->debug_flags & DBG_NO_DCC)) && + !(rtex->surface.flags & RADEON_SURF_SCANOUT)) { + /* Reserve space for the DCC buffer. */ + rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment); + rtex->size = rtex->dcc_offset + rtex->surface.dcc_size; + } } /* Now create the backing buffer. */ if (!buf) { - if (!r600_init_resource(rscreen, resource, rtex->size, - rtex->surface.bo_alignment, TRUE)) { + r600_init_resource_fields(rscreen, resource, rtex->size, + rtex->surface.bo_alignment); + + resource->flags |= RADEON_FLAG_HANDLE; + + if (!r600_alloc_resource(rscreen, resource)) { FREE(rtex); return NULL; } } else { resource->buf = buf; - resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf); - resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->cs_buf); - resource->domains = rscreen->ws->buffer_get_initial_domain(resource->cs_buf); + resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = buf->size; + resource->bo_alignment = buf->alignment; + resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = buf->size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = buf->size; } if (rtex->cmask.size) { /* Initialize the cmask to 0xCC (= compressed state). */ r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b, rtex->cmask.offset, rtex->cmask.size, - 0xCCCCCCCC, true); + 0xCCCCCCCC, R600_COHERENCY_NONE); + } + + /* Initialize DCC only if the texture is not being imported. 
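The align64 calls above round a running size up to the next multiple of a power-of-two alignment before placing FMASK, CMASK, or DCC. The usual definition plus a worked value (Mesa's real helper lives in the util headers; this restatement is illustrative):

#include <stdint.h>

static uint64_t align64_sketch(uint64_t value, uint64_t alignment)
{
	/* alignment must be a power of two */
	return (value + alignment - 1) & ~(alignment - 1);
}
/* e.g. align64_sketch(1000000, 65536) == 1048576, so metadata placed
 * after 1000000 bytes of texture data starts at byte 1048576. */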
*/ + if (!buf && rtex->dcc_offset) { + r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, + rtex->dcc_offset, + rtex->surface.dcc_size, + 0xFFFFFFFF, R600_COHERENCY_NONE); } /* Initialize the CMASK base register value. */ @@ -656,50 +1201,12 @@ r600_texture_create_object(struct pipe_screen *screen, base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); } - if (rscreen->debug_flags & DBG_TEX || - (rtex->resource.b.b.last_level > 0 && rscreen->debug_flags & DBG_TEXMIP)) { - printf("Texture: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - rtex->surface.npix_x, rtex->surface.npix_y, - rtex->surface.npix_z, rtex->surface.blk_w, - rtex->surface.blk_h, rtex->surface.blk_d, - rtex->surface.array_size, rtex->surface.last_level, - rtex->surface.bpe, rtex->surface.nsamples, - rtex->surface.flags, util_format_short_name(base->format)); - for (int i = 0; i <= rtex->surface.last_level; i++) { - printf(" L %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.level[i].offset, - rtex->surface.level[i].slice_size, - u_minify(rtex->resource.b.b.width0, i), - u_minify(rtex->resource.b.b.height0, i), - u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.level[i].nblk_x, - rtex->surface.level[i].nblk_y, - rtex->surface.level[i].nblk_z, - rtex->surface.level[i].pitch_bytes, - rtex->surface.level[i].mode); - } - if (rtex->surface.flags & RADEON_SURF_SBUFFER) { - for (int i = 0; i <= rtex->surface.last_level; i++) { - printf(" S %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.stencil_level[i].offset, - rtex->surface.stencil_level[i].slice_size, - u_minify(rtex->resource.b.b.width0, i), - u_minify(rtex->resource.b.b.height0, i), - u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.stencil_level[i].nblk_x, - rtex->surface.stencil_level[i].nblk_y, - rtex->surface.stencil_level[i].nblk_z, - rtex->surface.stencil_level[i].pitch_bytes, - rtex->surface.stencil_level[i].mode); - } - } + if (rscreen->debug_flags & DBG_TEX) { + puts("Texture:"); + r600_print_texture_info(rtex, stdout); + fflush(stdout); } + return rtex; } @@ -725,13 +1232,12 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, force_tiling = true; /* Handle common candidates for the linear mode. - * Compressed textures must always be tiled. */ - if (!force_tiling && !util_format_is_compressed(templ->format)) { - /* Not everything can be linear, so we cannot enforce it - * for all textures. */ - if ((rscreen->debug_flags & DBG_NO_TILING) && - (!util_format_is_depth_or_stencil(templ->format) || - !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH))) + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && !util_format_is_compressed(templ->format) && + (!util_format_is_depth_or_stencil(templ->format) || + templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) { + if (rscreen->debug_flags & DBG_NO_TILING) return RADEON_SURF_MODE_LINEAR_ALIGNED; /* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. 
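The restructured condition above makes the tiling policy explicit: compressed textures and real depth/stencil (DB) surfaces are always tiled, and only the remaining cases may honor DBG_NO_TILING or the 422-format exception. As a predicate (illustrative):

#include <stdbool.h>

static bool may_choose_linear(bool force_tiling, bool is_compressed,
			      bool is_depth_stencil, bool is_flushed_depth)
{
	/* Flushed depth is sampled like a color surface, so it is exempt
	 * from the "DB surfaces must be tiled" rule. */
	return !force_tiling && !is_compressed &&
	       (!is_depth_stencil || is_flushed_depth);
}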
*/ @@ -773,11 +1279,20 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct radeon_surf surface = {0}; + bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH; + bool tc_compatible_htile = + rscreen->chip_class >= VI && + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && + !(rscreen->debug_flags & DBG_NO_HYPERZ) && + !is_flushed_depth && + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ + util_format_is_depth_or_stencil(templ->format); + int r; r = r600_init_surface(rscreen, &surface, templ, r600_choose_tiling(rscreen, templ), - templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH); + is_flushed_depth, tc_compatible_htile); if (r) { return NULL; } @@ -785,55 +1300,70 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, if (r) { return NULL; } - return (struct pipe_resource *)r600_texture_create_object(screen, templ, + return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0, 0, NULL, &surface); } static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; - unsigned stride = 0; + unsigned stride = 0, offset = 0; unsigned array_mode; - enum radeon_bo_layout micro, macro; struct radeon_surf surface; - bool scanout; int r; + struct radeon_bo_metadata metadata = {}; + struct r600_texture *rtex; /* Support only 2D textures without mipmaps */ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || templ->depth0 != 1 || templ->last_level != 0) return NULL; - buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride); + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset); if (!buf) return NULL; - rscreen->ws->buffer_get_tiling(buf, &micro, &macro, - &surface.bankw, &surface.bankh, - &surface.tile_split, - &surface.stencil_tile_split, - &surface.mtilea, &scanout); + rscreen->ws->buffer_get_metadata(buf, &metadata); + + surface.pipe_config = metadata.pipe_config; + surface.bankw = metadata.bankw; + surface.bankh = metadata.bankh; + surface.tile_split = metadata.tile_split; + surface.mtilea = metadata.mtilea; + surface.num_banks = metadata.num_banks; - if (macro == RADEON_LAYOUT_TILED) + if (metadata.macrotile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_2D; - else if (micro == RADEON_LAYOUT_TILED) + else if (metadata.microtile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_1D; else array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - r = r600_init_surface(rscreen, &surface, templ, array_mode, false); + r = r600_init_surface(rscreen, &surface, templ, array_mode, + false, false); if (r) { return NULL; } - if (scanout) + if (metadata.scanout) surface.flags |= RADEON_SURF_SCANOUT; - return (struct pipe_resource *)r600_texture_create_object(screen, templ, - stride, buf, &surface); + rtex = r600_texture_create_object(screen, templ, stride, + offset, buf, &surface); + if (!rtex) + return NULL; + + rtex->resource.is_shared = true; + rtex->resource.external_usage = usage; + + if (rscreen->apply_opaque_metadata) + rscreen->apply_opaque_metadata(rscreen, rtex, &metadata); + + return &rtex->resource.b.b; } bool r600_init_flushed_depth_texture(struct pipe_context *ctx, @@ -844,12 +1374,44 @@ bool 
r600_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource resource; struct r600_texture **flushed_depth_texture = staging ? staging : &rtex->flushed_depth_texture; + enum pipe_format pipe_format = texture->format; + + if (!staging) { + if (rtex->flushed_depth_texture) + return true; /* it's ready */ + + if (!rtex->can_sample_z && rtex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!rtex->can_sample_s && rtex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); - if (!staging && rtex->flushed_depth_texture) - return true; /* it's ready */ + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + } + memset(&resource, 0, sizeof(resource)); resource.target = texture->target; - resource.format = texture->format; + resource.format = pipe_format; resource.width0 = texture->width0; resource.height0 = texture->height0; resource.depth0 = texture->depth0; @@ -869,7 +1431,6 @@ bool r600_init_flushed_depth_texture(struct pipe_context *ctx, return false; } - (*flushed_depth_texture)->is_flushing_texture = TRUE; (*flushed_depth_texture)->non_disp_tiling = false; return true; } @@ -894,24 +1455,52 @@ static void r600_init_temp_resource_from_box(struct pipe_resource *res, res->flags = flags; /* We must set the correct texture target and dimensions for a 3D box. */ - if (box->depth > 1 && util_max_layer(orig, level) > 0) - res->target = orig->target; - else - res->target = PIPE_TEXTURE_2D; - - switch (res->target) { - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; res->array_size = box->depth; - break; - case PIPE_TEXTURE_3D: - res->depth0 = box->depth; - break; - default:; + } else { + res->target = PIPE_TEXTURE_2D; } } +static bool r600_can_invalidate_texture(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + unsigned transfer_usage, + const struct pipe_box *box) +{ + /* r600g doesn't react to dirty_tex_descriptor_counter */ + return rscreen->chip_class >= SI && + !rtex->resource.is_shared && + !(transfer_usage & PIPE_TRANSFER_READ) && + rtex->resource.b.b.last_level == 0 && + util_texrange_covers_whole_level(&rtex->resource.b.b, 0, + box->x, box->y, box->z, + box->width, box->height, + box->depth); +} + +static void r600_texture_invalidate_storage(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + + /* There is no point in discarding depth and tiled buffers. */ + assert(!rtex->is_depth); + assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED); + + /* Reallocate the buffer in the same pipe_resource. */ + r600_alloc_resource(rscreen, &rtex->resource); + + /* Initialize the CMASK base address (needed even without CMASK). 
*/ + rtex->cmask.base_address_reg = + (rtex->resource.gpu_address + rtex->cmask.offset) >> 8; + + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); + + rctx->num_alloc_tex_transfer_bytes += rtex->size; +} + static void *r600_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture, unsigned level, @@ -922,41 +1511,61 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_texture *rtex = (struct r600_texture*)texture; struct r600_transfer *trans; - boolean use_staging_texture = FALSE; struct r600_resource *buf; unsigned offset = 0; char *map; + bool use_staging_texture = false; - /* We cannot map a tiled texture directly because the data is - * in a different order, therefore we do detiling using a blit. - * - * Also, use a temporary in GTT memory for read transfers, as - * the CPU is much happier reading out of cached system memory - * than uncached VRAM. - */ - if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) { - use_staging_texture = TRUE; - } else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_MAP_DIRECTLY) && - (rtex->resource.domains == RADEON_DOMAIN_VRAM)) { - /* Untiled buffers in VRAM, which is slow for CPU reads */ - use_staging_texture = TRUE; - } else if (!(usage & PIPE_TRANSFER_READ) && - (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) { - /* Use a staging texture for uploads if the underlying BO is busy. */ - use_staging_texture = TRUE; - } + assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER)); - if (texture->flags & R600_RESOURCE_FLAG_TRANSFER) { - use_staging_texture = FALSE; - } + /* Depth textures use staging unconditionally. */ + if (!rtex->is_depth) { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!rctx->screen->info.has_dedicated_vram && + level == 0 && + box->width >= 4 && box->height >= 4 && + p_atomic_inc_return(&rtex->num_level0_transfers) == 10) { + bool can_invalidate = + r600_can_invalidate_texture(rctx->screen, rtex, + usage, box); + + r600_degrade_tile_mode_to_linear(rctx, rtex, + can_invalidate); + } - if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) { - return NULL; + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM is slow, always use the staging texture in + * this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = (rtex->resource.domains & + RADEON_DOMAIN_VRAM) != 0; + /* Write & linear only: */ + else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, + RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rtex->resource.buf, 0, + RADEON_USAGE_READWRITE)) { + /* It's busy. 
*/ + if (r600_can_invalidate_texture(rctx->screen, rtex, + usage, box)) + r600_texture_invalidate_storage(rctx, rtex); + else + use_staging_texture = true; + } } trans = CALLOC_STRUCT(r600_transfer); - if (trans == NULL) + if (!trans) return NULL; trans->transfer.resource = texture; trans->transfer.level = level; @@ -998,7 +1607,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box); rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth, 0, 0, 0, box->depth, 0, 0); - pipe_resource_reference((struct pipe_resource**)&temp, NULL); + pipe_resource_reference(&temp, NULL); } } else { @@ -1021,6 +1630,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes; trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size; trans->staging = (struct r600_resource*)staging_depth; + buf = trans->staging; } else if (use_staging_texture) { struct pipe_resource resource; struct r600_texture *staging; @@ -1032,7 +1642,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, /* Create the temporary texture. */ staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource); - if (staging == NULL) { + if (!staging) { R600_ERR("failed to create temporary texture to hold untiled copy\n"); FREE(trans); return NULL; @@ -1040,26 +1650,23 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, trans->staging = &staging->resource; trans->transfer.stride = staging->surface.level[0].pitch_bytes; trans->transfer.layer_stride = staging->surface.level[0].slice_size; - if (usage & PIPE_TRANSFER_READ) { + + if (usage & PIPE_TRANSFER_READ) r600_copy_to_staging_texture(ctx, trans); - } + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; } else { /* the resource is mapped directly */ trans->transfer.stride = rtex->surface.level[level].pitch_bytes; trans->transfer.layer_stride = rtex->surface.level[level].slice_size; offset = r600_texture_get_offset(rtex, level, box); - } - - if (trans->staging) { - buf = trans->staging; - if (!rtex->is_depth && !(usage & PIPE_TRANSFER_READ)) - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } else { buf = &rtex->resource; } if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) { - pipe_resource_reference((struct pipe_resource**)&trans->staging, NULL); + r600_resource_reference(&trans->staging, NULL); FREE(trans); return NULL; } @@ -1071,6 +1678,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, static void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer) { + struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct pipe_resource *texture = transfer->resource; struct r600_texture *rtex = (struct r600_texture*)texture; @@ -1086,8 +1694,28 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, } } - if (rtransfer->staging) - pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); + if (rtransfer->staging) { + rctx->num_alloc_tex_transfer_bytes += rtransfer->staging->buf->size; + r600_resource_reference(&rtransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. 
+ * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (rctx->num_alloc_tex_transfer_bytes > rctx->screen->info.gart_size / 4) { + rctx->gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + rctx->num_alloc_tex_transfer_bytes = 0; + } FREE(transfer); } @@ -1097,19 +1725,118 @@ static const struct u_resource_vtbl r600_texture_vtbl = NULL, /* get_handle */ r600_texture_destroy, /* resource_destroy */ r600_texture_transfer_map, /* transfer_map */ - NULL, /* transfer_flush_region */ + u_default_transfer_flush_region, /* transfer_flush_region */ r600_texture_transfer_unmap, /* transfer_unmap */ - NULL /* transfer_inline_write */ }; +/* DCC channel type categories within which formats can be reinterpreted + * while keeping the same DCC encoding. The swizzle must also match. */ +enum dcc_channel_type { + dcc_channel_float32, + dcc_channel_uint32, + dcc_channel_sint32, + dcc_channel_float16, + dcc_channel_uint16, + dcc_channel_sint16, + dcc_channel_uint_10_10_10_2, + dcc_channel_uint8, + dcc_channel_sint8, + dcc_channel_incompatible, +}; + +/* Return the type of DCC encoding. */ +static enum dcc_channel_type +vi_get_dcc_channel_type(const struct util_format_description *desc) +{ + int i; + + /* Find the first non-void channel. */ + for (i = 0; i < desc->nr_channels; i++) + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) + break; + if (i == desc->nr_channels) + return dcc_channel_incompatible; + + switch (desc->channel[i].size) { + case 32: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) + return dcc_channel_float32; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint32; + return dcc_channel_sint32; + case 16: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) + return dcc_channel_float16; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint16; + return dcc_channel_sint16; + case 10: + return dcc_channel_uint_10_10_10_2; + case 8: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint8; + return dcc_channel_sint8; + default: + return dcc_channel_incompatible; + } +} + +/* Return if it's allowed to reinterpret one format as another with DCC enabled. */ +bool vi_dcc_formats_compatible(enum pipe_format format1, + enum pipe_format format2) +{ + const struct util_format_description *desc1, *desc2; + enum dcc_channel_type type1, type2; + int i; + + if (format1 == format2) + return true; + + desc1 = util_format_description(format1); + desc2 = util_format_description(format2); + + if (desc1->nr_channels != desc2->nr_channels) + return false; + + /* Swizzles must be the same. 
*/ + for (i = 0; i < desc1->nr_channels; i++) + if (desc1->swizzle[i] <= PIPE_SWIZZLE_W && + desc2->swizzle[i] <= PIPE_SWIZZLE_W && + desc1->swizzle[i] != desc2->swizzle[i]) + return false; + + type1 = vi_get_dcc_channel_type(desc1); + type2 = vi_get_dcc_channel_type(desc2); + + return type1 != dcc_channel_incompatible && + type2 != dcc_channel_incompatible && + type1 == type2; +} + +void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, + struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format) +{ + struct r600_texture *rtex = (struct r600_texture *)tex; + + if (rtex->dcc_offset && + rtex->surface.level[level].dcc_enabled && + !vi_dcc_formats_compatible(tex->format, view_format)) + if (!r600_texture_disable_dcc(rctx, (struct r600_texture*)tex)) + rctx->decompress_dcc(&rctx->b, rtex); +} + struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, unsigned width, unsigned height) { + struct r600_common_context *rctx = (struct r600_common_context*)pipe; + struct r600_texture *rtex = (struct r600_texture*)texture; struct r600_surface *surface = CALLOC_STRUCT(r600_surface); - if (surface == NULL) + if (!surface) return NULL; assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); @@ -1122,6 +1849,13 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, surface->base.width = width; surface->base.height = height; surface->base.u = templ->u; + surface->level_info = &rtex->surface.level[templ->u.tex.level]; + + if (texture->target != PIPE_BUFFER) + vi_dcc_disable_if_incompatible_format(rctx, texture, + templ->u.tex.level, + templ->format); + return &surface->base; } @@ -1130,27 +1864,112 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe, const struct pipe_surface *templ) { unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc + = util_format_description(tex->format); + const struct util_format_description *templ_desc + = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. 
*/ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + } + } - return r600_create_surface_custom(pipe, tex, templ, - u_minify(tex->width0, level), - u_minify(tex->height0, level)); + return r600_create_surface_custom(pipe, tex, templ, width, height); } static void r600_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface) { struct r600_surface *surf = (struct r600_surface*)surface; - pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_fmask, NULL); - pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_cmask, NULL); + r600_resource_reference(&surf->cb_buffer_fmask, NULL); + r600_resource_reference(&surf->cb_buffer_cmask, NULL); pipe_resource_reference(&surface->texture, NULL); FREE(surface); } -unsigned r600_translate_colorswap(enum pipe_format format) +static void r600_clear_texture(struct pipe_context *pipe, + struct pipe_resource *tex, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct pipe_screen *screen = pipe->screen; + struct r600_texture *rtex = (struct r600_texture*)tex; + struct pipe_surface tmpl = {{0}}; + struct pipe_surface *sf; + const struct util_format_description *desc = + util_format_description(tex->format); + + tmpl.format = tex->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, tex, &tmpl); + if (!sf) + return; + + if (rtex->is_depth) { + unsigned clear; + float depth; + uint8_t stencil = 0; + + /* Depth is always present. */ + clear = PIPE_CLEAR_DEPTH; + desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + + if (rtex->surface.flags & RADEON_SURF_SBUFFER) { + clear |= PIPE_CLEAR_STENCIL; + desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + } + + pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, + box->x, box->y, + box->width, box->height, false); + } else { + union pipe_color_union color; + + /* pipe_color_union requires the full vec4 representation. 
*/ + if (util_format_is_pure_uint(tex->format)) + desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1); + else if (util_format_is_pure_sint(tex->format)) + desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1); + else + desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1); + + if (screen->is_format_supported(screen, tex->format, + tex->target, 0, + PIPE_BIND_RENDER_TARGET)) { + pipe->clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height, false); + } else { + /* Software fallback - just for R9G9B9E5_FLOAT */ + util_clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height); + } + } + pipe_surface_reference(&sf, NULL); +} + +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap) { const struct util_format_description *desc = util_format_description(format); -#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == UTIL_FORMAT_SWIZZLE_##swz) +#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ return V_0280A0_SWAP_STD; @@ -1173,7 +1992,8 @@ unsigned r600_translate_colorswap(enum pipe_format format) else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) - return V_0280A0_SWAP_STD_REV; /* YX__ */ + /* YX__ */ + return (do_endian_swap ? V_0280A0_SWAP_STD : V_0280A0_SWAP_STD_REV); else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) return V_0280A0_SWAP_ALT; /* X__Y */ else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) @@ -1181,25 +2001,269 @@ unsigned r600_translate_colorswap(enum pipe_format format) break; case 3: if (HAS_SWIZZLE(0,X)) - return V_0280A0_SWAP_STD; /* XYZ */ + return (do_endian_swap ? V_0280A0_SWAP_STD_REV : V_0280A0_SWAP_STD); else if (HAS_SWIZZLE(0,Z)) return V_0280A0_SWAP_STD_REV; /* ZYX */ break; case 4: /* check the middle channels, the 1st and 4th channel can be NONE */ - if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) + if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { return V_0280A0_SWAP_STD; /* XYZW */ - else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { return V_0280A0_SWAP_STD_REV; /* WZYX */ - else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) + } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { return V_0280A0_SWAP_ALT; /* ZYXW */ - else if (HAS_SWIZZLE(1,X) && HAS_SWIZZLE(2,Y)) - return V_0280A0_SWAP_ALT_REV; /* WXYZ */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { + /* YZWX */ + if (desc->is_array) + return V_0280A0_SWAP_ALT_REV; + else + return (do_endian_swap ? V_0280A0_SWAP_ALT : V_0280A0_SWAP_ALT_REV); + } break; } return ~0U; } +/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ + +static void vi_dcc_clean_up_context_slot(struct r600_common_context *rctx, + int slot) +{ + int i; + + if (rctx->dcc_stats[slot].query_active) + vi_separate_dcc_stop_query(&rctx->b, + rctx->dcc_stats[slot].tex); + + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[slot].ps_stats); i++) + if (rctx->dcc_stats[slot].ps_stats[i]) { + rctx->b.destroy_query(&rctx->b, + rctx->dcc_stats[slot].ps_stats[i]); + rctx->dcc_stats[slot].ps_stats[i] = NULL; + } + + r600_texture_reference(&rctx->dcc_stats[slot].tex, NULL); +} + +/** + * Return the per-context slot where DCC statistics queries for the texture live. + */ +static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + int i, empty_slot = -1; + + /* Remove zombie textures (textures kept alive by this array only). 
*/ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) + if (rctx->dcc_stats[i].tex && + rctx->dcc_stats[i].tex->resource.b.b.reference.count == 1) + vi_dcc_clean_up_context_slot(rctx, i); + + /* Find the texture. */ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) { + /* Return if found. */ + if (rctx->dcc_stats[i].tex == tex) { + rctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !rctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++) + if (rctx->dcc_stats[oldest_slot].last_use_timestamp > + rctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + vi_dcc_clean_up_context_slot(rctx, oldest_slot); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + r600_texture_reference(&rctx->dcc_stats[empty_slot].tex, tex); + rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; +} + +static struct pipe_query * +vi_create_resuming_pipestats_query(struct pipe_context *ctx) +{ + struct r600_query_hw *query = (struct r600_query_hw*) + ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0); + + query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query*)query; +} + +/** + * Called when binding a color buffer. + */ +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(!rctx->dcc_stats[i].query_active); + + if (!rctx->dcc_stats[i].ps_stats[0]) + rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx); + + /* begin or resume the query */ + ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = true; +} + +/** + * Called when unbinding a color buffer. + */ +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(rctx->dcc_stats[i].query_active); + assert(rctx->dcc_stats[i].ps_stats[0]); + + /* pause or end the query */ + ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = false; +} + +static bool vi_should_enable_separate_dcc(struct r600_texture *tex) +{ + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; +} + +/* Called by fast clear. */ +static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->resource.is_shared || + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->resource.b.b.target != PIPE_TEXTURE_2D || + tex->surface.last_level > 0 || + !tex->surface.dcc_size) + return; + + if (tex->dcc_offset) + return; /* already enabled */ + + /* Enable the DCC stat gathering. 
*/ + if (!tex->dcc_gather_statistics) { + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(&rctx->b, tex); + } + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.level[0].dcc_enabled); + assert(!tex->dcc_separate_buffer); + + r600_texture_discard_cmask(rctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = (struct r600_resource*) + r600_aligned_buffer_create(rctx->b.screen, 0, + PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, + tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ +} + +/** + * Called by pipe_context::flush_resource, the place where DCC decompression + * takes place. + */ +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct pipe_query *tmp; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + bool query_active = rctx->dcc_stats[i].query_active; + bool disable = false; + + if (rctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2], + true, &result); + r600_query_hw_reset_buffers(rctx, + (struct r600_query_hw*) + rctx->dcc_stats[i].ps_stats[2]); + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = + result.pipeline_statistics.ps_invocations / + (tex->resource.b.b.width0 * tex->resource.b.b.height0); + rctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; + + disable = tex->dcc_separate_buffer && + !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(ctx, tex); + + /* Move the queries in the queue by one. 
*/ + tmp = rctx->dcc_stats[i].ps_stats[2]; + rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1]; + rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0]; + rctx->dcc_stats[i].ps_stats[0] = tmp; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(ctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } +} + +/* FAST COLOR CLEAR */ + static void evergreen_set_clear_color(struct r600_texture *rtex, enum pipe_format surface_format, const union pipe_color_union *color) @@ -1208,7 +2272,16 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, memset(&uc, 0, sizeof(uc)); - if (util_format_is_pure_uint(surface_format)) { + if (util_format_get_blocksizebits(surface_format) == 128) { + /* DCC fast clear only: + * CLEAR_WORD0 = R = G = B + * CLEAR_WORD1 = A + */ + assert(color->ui[0] == color->ui[1] && + color->ui[0] == color->ui[2]); + uc.ui[0] = color->ui[0]; + uc.ui[1] = color->ui[3]; + } else if (util_format_is_pure_uint(surface_format)) { util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1); } else if (util_format_is_pure_sint(surface_format)) { util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1); @@ -1219,15 +2292,210 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t)); } +static bool vi_get_fast_clear_parameters(enum pipe_format surface_format, + const union pipe_color_union *color, + uint32_t* reset_value, + bool* clear_words_needed) +{ + bool values[4] = {}; + int i; + bool main_value = false; + bool extra_value = false; + int extra_channel; + const struct util_format_description *desc = util_format_description(surface_format); + + if (desc->block.bits == 128 && + (color->ui[0] != color->ui[1] || + color->ui[0] != color->ui[2])) + return false; + + *clear_words_needed = true; + *reset_value = 0x20202020U; + + /* If we want to clear without needing a fast clear eliminate step, we + * can set each channel to 0 or 1 (or 0/max for integer formats). We + * have two sets of flags, one for the last or first channel(extra) and + * one for the other channels(main). + */ + + if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT || + surface_format == PIPE_FORMAT_B5G6R5_UNORM || + surface_format == PIPE_FORMAT_B5G6R5_SRGB) { + extra_channel = -1; + } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) { + if(r600_translate_colorswap(surface_format, false) <= 1) + extra_channel = desc->nr_channels - 1; + else + extra_channel = 0; + } else + return true; + + for (i = 0; i < 4; ++i) { + int index = desc->swizzle[i] - PIPE_SWIZZLE_X; + + if (desc->swizzle[i] < PIPE_SWIZZLE_X || + desc->swizzle[i] > PIPE_SWIZZLE_W) + continue; + + if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { + /* Use the maximum value for clamping the clear color. */ + int max = u_bit_consecutive(0, desc->channel[i].size - 1); + + values[i] = color->i[i] != 0; + if (color->i[i] != 0 && MIN2(color->i[i], max) != max) + return true; + } else if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { + /* Use the maximum value for clamping the clear color. 
*/ + unsigned max = u_bit_consecutive(0, desc->channel[i].size); + + values[i] = color->ui[i] != 0U; + if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) + return true; + } else { + values[i] = color->f[i] != 0.0F; + if (color->f[i] != 0.0F && color->f[i] != 1.0F) + return true; + } + + if (index == extra_channel) + extra_value = values[i]; + else + main_value = values[i]; + } + + for (int i = 0; i < 4; ++i) + if (values[i] != main_value && + desc->swizzle[i] - PIPE_SWIZZLE_X != extra_channel && + desc->swizzle[i] >= PIPE_SWIZZLE_X && + desc->swizzle[i] <= PIPE_SWIZZLE_W) + return true; + + *clear_words_needed = false; + if (main_value) + *reset_value |= 0x80808080U; + + if (extra_value) + *reset_value |= 0x40404040U; + return true; +} + +void vi_dcc_clear_level(struct r600_common_context *rctx, + struct r600_texture *rtex, + unsigned level, unsigned clear_value) +{ + struct pipe_resource *dcc_buffer; + uint64_t dcc_offset; + + assert(rtex->dcc_offset && rtex->surface.level[level].dcc_enabled); + + if (rtex->dcc_separate_buffer) { + dcc_buffer = &rtex->dcc_separate_buffer->b.b; + dcc_offset = 0; + } else { + dcc_buffer = &rtex->resource.b.b; + dcc_offset = rtex->dcc_offset; + } + + dcc_offset += rtex->surface.level[level].dcc_offset; + + rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset, + rtex->surface.level[level].dcc_fast_clear_size, + clear_value, R600_COHERENCY_CB_META); +} + +/* Set the same micro tile mode as the destination of the last MSAA resolve. + * This allows hitting the MSAA resolve fast path, which requires that both + * src and dst micro tile modes match. + */ +static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (rtex->resource.is_shared || + rtex->surface.nsamples <= 1 || + rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode) + return; + + assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_2D); + assert(rtex->surface.last_level == 0); + + /* These magic numbers were copied from addrlib. It doesn't use any + * definitions for them either. They are all 2D_TILED_THIN1 modes with + * different bpp and micro tile mode. 
+ */ + if (rscreen->chip_class >= CIK) { + switch (rtex->last_msaa_resolve_target_micro_mode) { + case 0: /* displayable */ + rtex->surface.tiling_index[0] = 10; + break; + case 1: /* thin */ + rtex->surface.tiling_index[0] = 14; + break; + case 3: /* rotated */ + rtex->surface.tiling_index[0] = 28; + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } else { /* SI */ + switch (rtex->last_msaa_resolve_target_micro_mode) { + case 0: /* displayable */ + switch (rtex->surface.bpe) { + case 1: + rtex->surface.tiling_index[0] = 10; + break; + case 2: + rtex->surface.tiling_index[0] = 11; + break; + default: /* 4, 8 */ + rtex->surface.tiling_index[0] = 12; + break; + } + break; + case 1: /* thin */ + switch (rtex->surface.bpe) { + case 1: + rtex->surface.tiling_index[0] = 14; + break; + case 2: + rtex->surface.tiling_index[0] = 15; + break; + case 4: + rtex->surface.tiling_index[0] = 16; + break; + default: /* 8, 16 */ + rtex->surface.tiling_index[0] = 17; + break; + } + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } + + rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode; + + p_atomic_inc(&rscreen->dirty_fb_counter); + p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); +} + void evergreen_do_fast_color_clear(struct r600_common_context *rctx, struct pipe_framebuffer_state *fb, struct r600_atom *fb_state, - unsigned *buffers, + unsigned *buffers, unsigned *dirty_cbufs, const union pipe_color_union *color) { int i; - if (rctx->current_render_cond) + /* This function is broken in BE, so just disable this path for now */ +#ifdef PIPE_ARCH_BIG_ENDIAN + return; +#endif + + if (rctx->render_cond) return; for (i = 0; i < fb->nr_cbufs; i++) { @@ -1243,11 +2511,6 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, tex = (struct r600_texture *)fb->cbufs[i]->texture; - /* 128-bit formats are unusupported */ - if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) { - continue; - } - /* the clear is allowed if all layers are bound */ if (fb->cbufs[i]->u.tex.first_layer != 0 || fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->resource.b.b, 0)) { @@ -1264,6 +2527,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } + /* shared textures can't use fast clear without an explicit flush, + * because there is no way to communicate the clear color among + * all clients + */ + if (tex->resource.is_shared && + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && rctx->chip_class >= CIK && @@ -1272,18 +2543,72 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } - /* ensure CMASK is enabled */ - r600_texture_alloc_cmask_separate(rctx->screen, tex); - if (tex->cmask.size == 0) { - continue; + /* Fast clear is the most appropriate place to enable DCC for + * displayable surfaces. + */ + if (rctx->chip_class >= VI && + !(rctx->screen->debug_flags & DBG_NO_DCC_FB)) { + vi_separate_dcc_try_enable(rctx, tex); + + /* Stoney can't do a CMASK-based clear, so all clears are + * considered to be hypothetically slow clears, which + * is weighed when determining to enable separate DCC. + */ + if (tex->dcc_gather_statistics && + rctx->family == CHIP_STONEY) + tex->num_slow_clears++; + } + + /* Try to clear DCC first, otherwise try CMASK. 
*/ + if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) { + uint32_t reset_value; + bool clear_words_needed; + + if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR) + continue; + + if (!vi_get_fast_clear_parameters(fb->cbufs[i]->format, + color, &reset_value, + &clear_words_needed)) + continue; + + vi_dcc_clear_level(rctx, tex, 0, reset_value); + + if (clear_words_needed) + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + tex->separate_dcc_dirty = true; + } else { + /* 128-bit formats are unusupported */ + if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) { + continue; + } + + /* Stoney/RB+ doesn't work with CMASK fast clear. */ + if (rctx->family == CHIP_STONEY) + continue; + + /* ensure CMASK is enabled */ + r600_texture_alloc_cmask_separate(rctx->screen, tex); + if (tex->cmask.size == 0) { + continue; + } + + /* Do the fast clear. */ + rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b, + tex->cmask.offset, tex->cmask.size, 0, + R600_COHERENCY_CB_META); + + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; } - /* Do the fast clear. */ + /* We can change the micro tile mode before a full clear. */ + if (rctx->screen->chip_class >= SI) + si_set_optimal_micro_tile_mode(rctx->screen, tex); + evergreen_set_clear_color(tex, fb->cbufs[i]->format, color); - rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b, - tex->cmask.offset, tex->cmask.size, 0, true); - tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + if (dirty_cbufs) + *dirty_cbufs |= 1 << i; rctx->set_atom_dirty(rctx, fb_state, true); *buffers &= ~clear_bit; } @@ -1299,4 +2624,5 @@ void r600_init_context_texture_functions(struct r600_common_context *rctx) { rctx->b.create_surface = r600_create_surface; rctx->b.surface_destroy = r600_surface_destroy; + rctx->b.clear_texture = r600_clear_texture; } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c new file mode 100644 index 000000000..8aaa85d02 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c @@ -0,0 +1,197 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: Tom Stellard <thomas.stellard@amd.com> + * + */ + +#include "radeon_elf_util.h" +#include "r600_pipe_common.h" + +#include "util/u_memory.h" + +#include <gelf.h> +#include <libelf.h> +#include <stdio.h> + +static void parse_symbol_table(Elf_Data *symbol_table_data, + const GElf_Shdr *symbol_table_header, + struct radeon_shader_binary *binary) +{ + GElf_Sym symbol; + unsigned i = 0; + unsigned symbol_count = + symbol_table_header->sh_size / symbol_table_header->sh_entsize; + + /* We are over allocating this list, because symbol_count gives the + * total number of symbols, and we will only be filling the list + * with offsets of global symbols. The memory savings from + * allocating the correct size of this list will be small, and + * I don't think it is worth the cost of pre-computing the number + * of global symbols. + */ + binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t)); + + while (gelf_getsym(symbol_table_data, i++, &symbol)) { + unsigned i; + if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL || + symbol.st_shndx == 0 /* Undefined symbol */) { + continue; + } + + binary->global_symbol_offsets[binary->global_symbol_count] = + symbol.st_value; + + /* Sort the list using bubble sort. This list will usually + * be small. */ + for (i = binary->global_symbol_count; i > 0; --i) { + uint64_t lhs = binary->global_symbol_offsets[i - 1]; + uint64_t rhs = binary->global_symbol_offsets[i]; + if (lhs < rhs) { + break; + } + binary->global_symbol_offsets[i] = lhs; + binary->global_symbol_offsets[i - 1] = rhs; + } + ++binary->global_symbol_count; + } +} + +static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, + unsigned symbol_sh_link, + struct radeon_shader_binary *binary) +{ + unsigned i; + + if (!relocs || !symbols || !binary->reloc_count) { + return; + } + binary->relocs = CALLOC(binary->reloc_count, + sizeof(struct radeon_shader_reloc)); + for (i = 0; i < binary->reloc_count; i++) { + GElf_Sym symbol; + GElf_Rel rel; + char *symbol_name; + struct radeon_shader_reloc *reloc = &binary->relocs[i]; + + gelf_getrel(relocs, i, &rel); + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol); + symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name); + + reloc->offset = rel.r_offset; + strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1); + reloc->name[sizeof(reloc->name)-1] = 0; + } +} + +void radeon_elf_read(const char *elf_data, unsigned elf_size, + struct radeon_shader_binary *binary) +{ + char *elf_buffer; + Elf *elf; + Elf_Scn *section = NULL; + Elf_Data *symbols = NULL, *relocs = NULL; + size_t section_str_index; + unsigned symbol_sh_link = 0; + + /* One of the libelf implementations + * (http://www.mr511.de/software/english.htm) requires calling + * elf_version() before elf_memory(). 
+ */ + elf_version(EV_CURRENT); + elf_buffer = MALLOC(elf_size); + memcpy(elf_buffer, elf_data, elf_size); + + elf = elf_memory(elf_buffer, elf_size); + + elf_getshdrstrndx(elf, &section_str_index); + + while ((section = elf_nextscn(elf, section))) { + const char *name; + Elf_Data *section_data = NULL; + GElf_Shdr section_header; + if (gelf_getshdr(section, &section_header) != &section_header) { + fprintf(stderr, "Failed to read ELF section header\n"); + return; + } + name = elf_strptr(elf, section_str_index, section_header.sh_name); + if (!strcmp(name, ".text")) { + section_data = elf_getdata(section, section_data); + binary->code_size = section_data->d_size; + binary->code = MALLOC(binary->code_size * sizeof(unsigned char)); + memcpy(binary->code, section_data->d_buf, binary->code_size); + } else if (!strcmp(name, ".AMDGPU.config")) { + section_data = elf_getdata(section, section_data); + binary->config_size = section_data->d_size; + binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); + memcpy(binary->config, section_data->d_buf, binary->config_size); + } else if (!strcmp(name, ".AMDGPU.disasm")) { + /* Always read disassembly if it's available. */ + section_data = elf_getdata(section, section_data); + binary->disasm_string = strndup(section_data->d_buf, + section_data->d_size); + } else if (!strncmp(name, ".rodata", 7)) { + section_data = elf_getdata(section, section_data); + binary->rodata_size = section_data->d_size; + binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char)); + memcpy(binary->rodata, section_data->d_buf, binary->rodata_size); + } else if (!strncmp(name, ".symtab", 7)) { + symbols = elf_getdata(section, section_data); + symbol_sh_link = section_header.sh_link; + parse_symbol_table(symbols, &section_header, binary); + } else if (!strcmp(name, ".rel.text")) { + relocs = elf_getdata(section, section_data); + binary->reloc_count = section_header.sh_size / + section_header.sh_entsize; + } + } + + parse_relocs(elf, relocs, symbols, symbol_sh_link, binary); + + if (elf){ + elf_end(elf); + } + FREE(elf_buffer); + + /* Cache the config size per symbol */ + if (binary->global_symbol_count) { + binary->config_size_per_symbol = + binary->config_size / binary->global_symbol_count; + } else { + binary->global_symbol_count = 1; + binary->config_size_per_symbol = binary->config_size; + } +} + +const unsigned char *radeon_shader_binary_config_start( + const struct radeon_shader_binary *binary, + uint64_t symbol_offset) +{ + unsigned i; + for (i = 0; i < binary->global_symbol_count; ++i) { + if (binary->global_symbol_offsets[i] == symbol_offset) { + unsigned offset = i * binary->config_size_per_symbol; + return binary->config + offset; + } + } + return binary->config; +} diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h new file mode 100644 index 000000000..c2af9e0df --- /dev/null +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h @@ -0,0 +1,50 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard <thomas.stellard@amd.com> + * + */ + +#ifndef RADEON_ELF_UTIL_H +#define RADEON_ELF_UTIL_H + +#include <stdint.h> + +struct radeon_shader_binary; +struct radeon_shader_reloc; + +/* + * Parse the elf binary stored in \p elf_data and create a + * radeon_shader_binary object. + */ +void radeon_elf_read(const char *elf_data, unsigned elf_size, + struct radeon_shader_binary *binary); + +/** + * @returns A pointer to the start of the configuration information for + * the function starting at \p symbol_offset of the binary. + */ +const unsigned char *radeon_shader_binary_config_start( + const struct radeon_shader_binary *binary, + uint64_t symbol_offset); + +#endif /* RADEON_ELF_UTIL_H */ diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c index 55c216aa5..fb1491a28 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c @@ -57,7 +57,9 @@ #define FB_BUFFER_OFFSET 0x1000 #define FB_BUFFER_SIZE 2048 +#define FB_BUFFER_SIZE_TONGA (2048 * 64) #define IT_SCALING_TABLE_SIZE 992 +#define UVD_SESSION_CONTEXT_SIZE (128 * 1024) /* UVD decoder representation */ struct ruvd_decoder { @@ -78,6 +80,7 @@ struct ruvd_decoder { struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS]; struct ruvd_msg *msg; uint32_t *fb; + unsigned fb_size; uint8_t *it; struct rvid_buffer bs_buffers[NUM_BUFFERS]; @@ -87,38 +90,40 @@ struct ruvd_decoder { struct rvid_buffer dpb; bool use_legacy; struct rvid_buffer ctx; + struct rvid_buffer sessionctx; }; /* flush IB to the hardware */ -static void flush(struct ruvd_decoder *dec) +static int flush(struct ruvd_decoder *dec, unsigned flags) { - dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0); + return dec->ws->cs_flush(dec->cs, flags, NULL); } /* add a new set register command to the IB */ static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val) { - uint32_t *pm4 = dec->cs->buf; - pm4[dec->cs->cdw++] = RUVD_PKT0(reg >> 2, 0); - pm4[dec->cs->cdw++] = val; + radeon_emit(dec->cs, RUVD_PKT0(reg >> 2, 0)); + radeon_emit(dec->cs, val); } /* send a command to the VCPU through the GPCOM registers */ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, - struct radeon_winsys_cs_handle* cs_buf, uint32_t off, + struct pb_buffer* buf, uint32_t off, enum radeon_bo_usage usage, enum radeon_bo_domain domain) { int 
reloc_idx; - reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain, - RADEON_PRIO_MIN); + reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, + RADEON_PRIO_UVD); if (!dec->use_legacy) { uint64_t addr; - addr = dec->ws->buffer_get_virtual_address(cs_buf); + addr = dec->ws->buffer_get_virtual_address(buf); addr = addr + off; set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr); set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32); } else { + off += dec->ws->buffer_get_reloc_offset(buf); set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); } @@ -142,13 +147,13 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* and map it for CPU access */ - ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE); + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE); /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); if (have_it(dec)) - dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE); + dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size); } /* unmap and send a message command to the VCPU */ @@ -164,13 +169,19 @@ static void send_msg_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* unmap the buffer */ - dec->ws->buffer_unmap(buf->res->cs_buf); + dec->ws->buffer_unmap(buf->res->buf); dec->msg = NULL; dec->fb = NULL; dec->it = NULL; + + if (dec->sessionctx.res) + send_cmd(dec, RUVD_CMD_SESSION_CONTEXT_BUFFER, + dec->sessionctx.res->buf, 0, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); + /* and send it to the hardware */ - send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0, + send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); } @@ -207,7 +218,61 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) } } -static unsigned calc_ctx_size(struct ruvd_decoder *dec) +static unsigned calc_ctx_size_h264_perf(struct ruvd_decoder *dec) +{ + unsigned width_in_mb, height_in_mb, ctx_size; + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + unsigned max_references = dec->base.max_references + 1; + + // picture width & height in 16 pixel units + width_in_mb = width / VL_MACROBLOCK_WIDTH; + height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2); + + if (!dec->use_legacy) { + unsigned fs_in_mb = width_in_mb * height_in_mb; + unsigned num_dpb_buffer; + switch(dec->base.level) { + case 30: + num_dpb_buffer = 8100 / fs_in_mb; + break; + case 31: + num_dpb_buffer = 18000 / fs_in_mb; + break; + case 32: + num_dpb_buffer = 20480 / fs_in_mb; + break; + case 41: + num_dpb_buffer = 32768 / fs_in_mb; + break; + case 42: + num_dpb_buffer = 34816 / fs_in_mb; + break; + case 50: + num_dpb_buffer = 110400 / fs_in_mb; + break; + case 51: + num_dpb_buffer = 184320 / fs_in_mb; + break; + default: + num_dpb_buffer = 184320 / fs_in_mb; + break; + } + num_dpb_buffer++; + max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); + ctx_size = max_references * align(width_in_mb * height_in_mb * 192, 256); + } else { + // the firmware seems to always assume a minimum of ref frames + max_references = MAX2(NUM_H264_REFS, max_references); + // macroblock context buffer + ctx_size = align(width_in_mb * height_in_mb * max_references * 192, 256); + } + + return ctx_size; +} + +static unsigned 
calc_ctx_size_h265_main(struct ruvd_decoder *dec) { unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); @@ -224,6 +289,39 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec) return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; } +static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) +{ + unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; + unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; + unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3)); + log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + + width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + + num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); + context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); + max_mb_address = (unsigned) ceil(height * 8 / 2048.0); + + cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; + db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); + + return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; +} + /* calculate size of reference picture buffer */ static unsigned calc_dpb_size(struct ruvd_decoder *dec) { @@ -282,17 +380,23 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) num_dpb_buffer++; max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); dpb_size = image_size * max_references; - dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); - dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + if ((dec->stream_type != RUVD_CODEC_H264_PERF) || + (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) { + dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); + dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + } } else { // the firmware seems to allways assume a minimum of ref frames max_references = MAX2(NUM_H264_REFS, max_references); // reference picture buffer dpb_size = image_size * max_references; - // macroblock context buffer - dpb_size += width_in_mb * height_in_mb * max_references * 192; - // IT surface buffer - dpb_size += width_in_mb * height_in_mb * 32; + if ((dec->stream_type != RUVD_CODEC_H264_PERF) || + (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) { + // macroblock context buffer + dpb_size += width_in_mb * height_in_mb * max_references * 192; + // IT surface buffer + dpb_size += width_in_mb * height_in_mb * 32; + } } break; } @@ -305,7 +409,10 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); - dpb_size = align((width * height * 
3) / 2, 256) * max_references; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + dpb_size = align((width * height * 9) / 4, 256) * max_references; + else + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -402,6 +509,9 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; switch (dec->base.chroma_format) { + case PIPE_VIDEO_CHROMA_FORMAT_NONE: + /* TODO: assert? */ + break; case PIPE_VIDEO_CHROMA_FORMAT_400: result.chroma_format = 0; break; @@ -478,6 +588,8 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO) result.sps_info_flags |= 1 << 9; + if (pic->UseRefPicList == true) + result.sps_info_flags |= 1 << 10; result.chroma_format = pic->pps->sps->chroma_format_idc; result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; @@ -586,6 +698,20 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); + for (i = 0 ; i < 2 ; i++) { + for (int j = 0 ; j < 15 ; j++) + result.direct_reflist[i][j] = pic->RefPicList[i][j]; + } + + if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) && + (target->buffer_format == PIPE_FORMAT_NV12)) { + result.p010_mode = 0; + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } + /* TODO result.highestTid; result.isNonRef; @@ -811,7 +937,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) dec->msg->stream_handle = dec->stream_handle; send_msg_buf(dec); - flush(dec); + flush(dec, 0); dec->ws->cs_destroy(dec->cs); @@ -821,8 +947,8 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) } rvid_destroy_buffer(&dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) - rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); FREE(dec); } @@ -845,7 +971,7 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder, dec->bs_size = 0; dec->bs_ptr = dec->ws->buffer_map( - dec->bs_buffers[dec->cur_buffer].res->cs_buf, + dec->bs_buffers[dec->cur_buffer].res->buf, dec->cs, PIPE_TRANSFER_WRITE); } @@ -885,13 +1011,13 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder, unsigned new_size = dec->bs_size + sizes[i]; if (new_size > buf->res->buf->size) { - dec->ws->buffer_unmap(buf->res->cs_buf); + dec->ws->buffer_unmap(buf->res->buf); if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) { RVID_ERR("Can't resize bitstream buffer!"); return; } - dec->bs_ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, + dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE); if (!dec->bs_ptr) return; @@ -913,7 +1039,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, struct pipe_picture_desc *picture) { struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; - struct radeon_winsys_cs_handle *dt; + struct pb_buffer *dt; struct rvid_buffer *msg_fb_it_buf, *bs_buf; unsigned bs_size; @@ -927,7 +1053,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, bs_size = align(dec->bs_size, 128); memset(dec->bs_ptr, 0, bs_size - dec->bs_size); - 
dec->ws->buffer_unmap(bs_buf->res->cs_buf); + dec->ws->buffer_unmap(bs_buf->res->buf); map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); @@ -948,9 +1074,15 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; - dec->msg->body.decode.db_pitch = dec->base.width; + dec->msg->body.decode.db_pitch = align(dec->base.width, 16); + + if (dec->stream_type == RUVD_CODEC_H264_PERF && + ((struct r600_common_screen*)dec->screen)->family >= CHIP_POLARIS10) + dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); + if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY) + dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2; switch (u_reduce_video_profile(picture->profile)) { case PIPE_VIDEO_FORMAT_MPEG4_AVC: @@ -959,6 +1091,20 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, case PIPE_VIDEO_FORMAT_HEVC: dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + if (dec->ctx.res == NULL) { + unsigned ctx_size; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); + else + ctx_size = calc_ctx_size_h265_main(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated context buffer.\n"); + } + rvid_clear_buffer(decoder->context, &dec->ctx); + } + + if (dec->ctx.res) + dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; break; case PIPE_VIDEO_FORMAT_VC1: @@ -982,28 +1128,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.extension_support = 0x1; /* set at least the feedback buffer size */ - dec->fb[0] = FB_BUFFER_SIZE; + dec->fb[0] = dec->fb_size; send_msg_buf(dec); - send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0, + send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); - if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) { - send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0, + if (dec->ctx.res) + send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); - } - send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0, RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM); - send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf, FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); if (have_it(dec)) - send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf, - FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); + send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf, + FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); set_reg(dec, RUVD_ENGINE_CNTL, 1); - flush(dec); + flush(dec, RADEON_FLUSH_ASYNC); next_buffer(dec); } @@ -1028,7 +1173,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, unsigned bs_buf_size; struct radeon_info info; struct ruvd_decoder *dec; - int i; + int r, i; ws->query_info(ws, &info); @@ -1039,6 +1184,9 @@ struct pipe_video_codec 
*ruvd_create_decoder(struct pipe_context *context, /* fall through */ case PIPE_VIDEO_FORMAT_MPEG4: + width = align(width, VL_MACROBLOCK_WIDTH); + height = align(height, VL_MACROBLOCK_HEIGHT); + break; case PIPE_VIDEO_FORMAT_MPEG4_AVC: width = align(width, VL_MACROBLOCK_WIDTH); height = align(height, VL_MACROBLOCK_HEIGHT); @@ -1055,7 +1203,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, return NULL; if (info.drm_major < 3) - dec->use_legacy = TRUE; + dec->use_legacy = true; dec->base = *templ; dec->base.context = context; @@ -1074,15 +1222,17 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL); + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL); if (!dec->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; } - bs_buf_size = width * height * 512 / (16 * 16); + dec->fb_size = (info.family == CHIP_TONGA) ? FB_BUFFER_SIZE_TONGA : + FB_BUFFER_SIZE; + bs_buf_size = width * height * (512 / (16 * 16)); for (i = 0; i < NUM_BUFFERS; ++i) { - unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; + unsigned msg_fb_it_size = FB_BUFFER_OFFSET + dec->fb_size; STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET); if (have_it(dec)) msg_fb_it_size += IT_SCALING_TABLE_SIZE; @@ -1111,8 +1261,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { - unsigned ctx_size = calc_ctx_size(dec); + if (dec->stream_type == RUVD_CODEC_H264_PERF && info.family >= CHIP_POLARIS10) { + unsigned ctx_size = calc_ctx_size_h264_perf(dec); if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't allocated context buffer.\n"); goto error; @@ -1120,6 +1270,16 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->ctx); } + if (info.family >= CHIP_POLARIS10 && info.drm_minor >= 3) { + if (!rvid_create_buffer(dec->screen, &dec->sessionctx, + UVD_SESSION_CONTEXT_SIZE, + PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated session ctx.\n"); + goto error; + } + rvid_clear_buffer(context, &dec->sessionctx); + } + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; @@ -1129,7 +1289,10 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->msg->body.create.height_in_samples = dec->base.height; dec->msg->body.create.dpb_size = dpb_size; send_msg_buf(dec); - flush(dec); + r = flush(dec, 0); + if (r) + goto error; + next_buffer(dec); return &dec->base; @@ -1143,8 +1306,8 @@ error: } rvid_destroy_buffer(&dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) - rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); FREE(dec); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h index 452fbd608..e3f8504d8 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h @@ -38,13 +38,13 @@ #include "vl/vl_video_buffer.h" /* UVD uses PM4 packet type 0 and 2 */ -#define RUVD_PKT_TYPE_S(x) (((x) & 0x3) << 30) +#define RUVD_PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30) #define RUVD_PKT_TYPE_G(x) (((x) >> 
30) & 0x3) #define RUVD_PKT_TYPE_C 0x3FFFFFFF -#define RUVD_PKT_COUNT_S(x) (((x) & 0x3FFF) << 16) +#define RUVD_PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16) #define RUVD_PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) #define RUVD_PKT_COUNT_C 0xC000FFFF -#define RUVD_PKT0_BASE_INDEX_S(x) (((x) & 0xFFFF) << 0) +#define RUVD_PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0) #define RUVD_PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) #define RUVD_PKT0_BASE_INDEX_C 0xFFFF0000 #define RUVD_PKT0(index, count) (RUVD_PKT_TYPE_S(0) | RUVD_PKT0_BASE_INDEX_S(index) | RUVD_PKT_COUNT_S(count)) @@ -61,6 +61,7 @@ #define RUVD_CMD_DPB_BUFFER 0x00000001 #define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002 #define RUVD_CMD_FEEDBACK_BUFFER 0x00000003 +#define RUVD_CMD_SESSION_CONTEXT_BUFFER 0x00000005 #define RUVD_CMD_BITSTREAM_BUFFER 0x00000100 #define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204 #define RUVD_CMD_CONTEXT_BUFFER 0x00000206 @@ -233,6 +234,15 @@ struct ruvd_h265 { uint8_t highestTid; uint8_t isNonRef; + + uint8_t p010_mode; + uint8_t msb_mode; + uint8_t luma_10to8; + uint8_t chroma_10to8; + uint8_t sclr_luma10to8; + uint8_t sclr_chroma10to8; + + uint8_t direct_reflist[2][15]; }; struct ruvd_vc1 { @@ -385,7 +395,10 @@ struct ruvd_msg { uint32_t dt_chroma_top_offset; uint32_t dt_chroma_bottom_offset; uint32_t dt_surf_tile_config; - uint32_t dt_reserved[3]; + uint32_t dt_uv_surf_tile_config; + // re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney + uint32_t dt_wa_chroma_top_offset; + uint32_t dt_wa_chroma_bottom_offset; uint32_t reserved[16]; @@ -409,7 +422,7 @@ struct ruvd_msg { }; /* driver dependent callback */ -typedef struct radeon_winsys_cs_handle* (*ruvd_set_dtb) +typedef struct pb_buffer* (*ruvd_set_dtb) (struct ruvd_msg* msg, struct vl_video_buffer *vb); /* create an UVD decode */ diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c index 7eab974a3..ef93e46c1 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c @@ -49,13 +49,16 @@ #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) +#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8)) +#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8)) +#define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) /** * flush commands to the hardware */ static void flush(struct rvce_encoder *enc) { - enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0); + enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL); enc->task_info_idx = 0; enc->bs_idx = 0; } @@ -63,7 +66,7 @@ static void flush(struct rvce_encoder *enc) #if 0 static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb) { - uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); unsigned i = 0; fprintf(stderr, "\n"); fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]); @@ -82,7 +85,7 @@ static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb) fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]); fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]); fprintf(stderr, "\n"); - enc->ws->buffer_unmap(fb->res->cs_buf); + enc->ws->buffer_unmap(fb->res->buf); } #endif @@ -265,6 +268,7 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder, enc->pic.quant_b_frames != pic->quant_b_frames; 
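/* Sketch (not part of this patch): the FW_* macros added above pack a VCE
 * firmware version as (major << 24) | (minor << 16) | (rev << 8), so
 * FW_52_8_3 evaluates to 0x34080300.  Unpacking is plain bit arithmetic;
 * the helper below is illustrative and does not exist in the driver: */
static inline void vce_fw_unpack(uint32_t fw, unsigned *major,
                                 unsigned *minor, unsigned *rev)
{
	*major = (fw >> 24) & 0xff; /* 52 for FW_52_8_3 */
	*minor = (fw >> 16) & 0xff; /* 8 */
	*rev   = (fw >> 8) & 0xff;  /* 3 */
}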
enc->pic = *pic; + get_pic_param(enc, pic); enc->get_buffer(vid_buf->resources[0], &enc->handle, &enc->luma); enc->get_buffer(vid_buf->resources[1], NULL, &enc->chroma); @@ -311,7 +315,7 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder, RVID_ERR("Can't create feedback buffer.\n"); return; } - if (!enc->cs->cdw) + if (!radeon_emitted(enc->cs, 0)) enc->session(enc); enc->encode(enc); enc->feedback(enc); @@ -345,7 +349,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, struct rvid_buffer *fb = feedback; if (size) { - uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); if (ptr[1]) { *size = ptr[4] - ptr[9]; @@ -353,7 +357,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, *size = 0; } - enc->ws->buffer_unmap(fb->res->cs_buf); + enc->ws->buffer_unmap(fb->res->buf); } //dump_feedback(enc, fb); rvid_destroy_buffer(fb); @@ -403,9 +407,12 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (rscreen->info.drm_major == 3) enc->use_vm = true; - if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) + if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) || + rscreen->info.drm_major == 3) enc->use_vui = true; - if (rscreen->info.family >= CHIP_TONGA) + if (rscreen->info.family >= CHIP_TONGA && + rscreen->info.family != CHIP_STONEY && + rscreen->info.family != CHIP_POLARIS11) enc->dual_pipe = true; /* TODO enable B frame with dual instance */ if ((rscreen->info.family >= CHIP_TONGA) && @@ -426,7 +433,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL); + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc); if (!enc->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -448,7 +455,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf); cpb_size = align(tmp_surf->level[0].pitch_bytes, 128); - cpb_size = cpb_size * align(tmp_surf->npix_y, 16); + cpb_size = cpb_size * align(tmp_surf->npix_y, 32); cpb_size = cpb_size * 3 / 2; cpb_size = cpb_size * enc->cpb_num; if (enc->dual_pipe) @@ -469,6 +476,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, switch (rscreen->info.vce_fw_version) { case FW_40_2_2: radeon_vce_40_2_2_init(enc); + get_pic_param = radeon_vce_40_2_2_get_param; break; case FW_50_0_1: @@ -476,6 +484,14 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, case FW_50_10_2: case FW_50_17_3: radeon_vce_50_init(enc); + get_pic_param = radeon_vce_50_get_param; + break; + + case FW_52_0_3: + case FW_52_4_3: + case FW_52_8_3: + radeon_vce_52_init(enc); + get_pic_param = radeon_vce_52_get_param; break; default: @@ -500,23 +516,32 @@ error: */ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) { - return rscreen->info.vce_fw_version == FW_40_2_2 || - rscreen->info.vce_fw_version == FW_50_0_1 || - rscreen->info.vce_fw_version == FW_50_1_2 || - rscreen->info.vce_fw_version == FW_50_10_2 || - rscreen->info.vce_fw_version == FW_50_17_3; + switch (rscreen->info.vce_fw_version) { + case FW_40_2_2: + case FW_50_0_1: + case FW_50_1_2: + case FW_50_10_2: + case FW_50_17_3: + case FW_52_0_3: + case FW_52_4_3: + case FW_52_8_3: + 
return true; + default: + return false; + } } /** * Add the buffer as relocation to the current command submission */ -void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, +void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domain, signed offset) { int reloc_idx; - reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN); + reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, RADEON_PRIO_VCE); if (enc->use_vm) { uint64_t addr; addr = enc->ws->buffer_get_virtual_address(buf); @@ -524,6 +549,7 @@ void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *b RVCE_CS(addr >> 32); RVCE_CS(addr); } else { + offset += enc->ws->buffer_get_reloc_offset(buf); RVCE_CS(reloc_idx * 4); RVCE_CS(offset); } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index c00565904..fe15ded39 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -59,11 +59,11 @@ static void task_info(struct rvce_encoder *enc, uint32_t op, RVCE_BEGIN(0x00000002); // task info if (op == 0x3) { if (enc->task_info_idx) { - uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3; + uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3; // Update offsetOfNextTaskInfo - enc->cs->buf[enc->task_info_idx] = offs; + enc->cs->current.buf[enc->task_info_idx] = offs; } - enc->task_info_idx = enc->cs->cdw; + enc->task_info_idx = enc->cs->current.cdw; } RVCE_CS(0xffffffff); // offsetOfNextTaskInfo RVCE_CS(op); // taskOperation @@ -77,7 +77,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t op, static void feedback(struct rvce_encoder *enc) { RVCE_BEGIN(0x05000005); // feedback buffer - RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo + RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo RVCE_CS(0x00000001); // feedbackRingSize RVCE_END(); } @@ -303,7 +303,7 @@ static void encode(struct rvce_encoder *enc) enc->task_info(enc, 0x00000003, 0, 0, 0); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo + RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo RVCE_END(); RVCE_BEGIN(0x05000004); // video bitstream buffer @@ -431,6 +431,10 @@ static void destroy(struct rvce_encoder *enc) RVCE_END(); } +void radeon_vce_40_2_2_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ +} + void radeon_vce_40_2_2_init(struct rvce_encoder *enc) { enc->session = session; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c index afdab18c0..262e13ba9 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c @@ -95,7 +95,7 @@ static void encode(struct rvce_encoder *enc) enc->task_info(enc, 0x00000003, dep, 0, bs_idx); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo + RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo RVCE_END(); bs_offset = -(signed)(bs_idx * enc->bs_size); @@ -233,6 +233,10 @@ static void encode(struct rvce_encoder *enc) 
RVCE_END(); } +void radeon_vce_50_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ +} + void radeon_vce_50_init(struct rvce_encoder *enc) { radeon_vce_40_2_2_init(enc); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c index 3894eea31..5db01fe52 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c @@ -40,27 +40,152 @@ static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 }; +static void get_rate_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.rc.rc_method = pic->rate_ctrl.rate_ctrl_method; + enc->enc_pic.rc.target_bitrate = pic->rate_ctrl.target_bitrate; + enc->enc_pic.rc.peak_bitrate = pic->rate_ctrl.peak_bitrate; + enc->enc_pic.rc.quant_i_frames = pic->quant_i_frames; + enc->enc_pic.rc.quant_p_frames = pic->quant_p_frames; + enc->enc_pic.rc.quant_b_frames = pic->quant_b_frames; + enc->enc_pic.rc.gop_size = pic->gop_size; + enc->enc_pic.rc.frame_rate_num = pic->rate_ctrl.frame_rate_num; + enc->enc_pic.rc.frame_rate_den = pic->rate_ctrl.frame_rate_den; + enc->enc_pic.rc.max_qp = 51; + enc->enc_pic.rc.vbv_buffer_size = pic->rate_ctrl.vbv_buffer_size; + enc->enc_pic.rc.vbv_buf_lv = pic->rate_ctrl.vbv_buf_lv; + enc->enc_pic.rc.fill_data_enable = pic->rate_ctrl.fill_data_enable; + enc->enc_pic.rc.enforce_hrd = pic->rate_ctrl.enforce_hrd; + enc->enc_pic.rc.target_bits_picture = pic->rate_ctrl.target_bits_picture; + enc->enc_pic.rc.peak_bits_picture_integer = pic->rate_ctrl.peak_bits_picture_integer; + enc->enc_pic.rc.peak_bits_picture_fraction = pic->rate_ctrl.peak_bits_picture_fraction; +} + +static void get_motion_estimation_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.me.motion_est_quarter_pixel = pic->motion_est.motion_est_quarter_pixel; + enc->enc_pic.me.enc_disable_sub_mode = pic->motion_est.enc_disable_sub_mode; + enc->enc_pic.me.lsmvert = pic->motion_est.lsmvert; + enc->enc_pic.me.enc_en_ime_overw_dis_subm = pic->motion_est.enc_en_ime_overw_dis_subm; + enc->enc_pic.me.enc_ime_overw_dis_subm_no = pic->motion_est.enc_ime_overw_dis_subm_no; + enc->enc_pic.me.enc_ime2_search_range_x = pic->motion_est.enc_ime2_search_range_x; + enc->enc_pic.me.enc_ime2_search_range_y = pic->motion_est.enc_ime2_search_range_y; + enc->enc_pic.me.enc_ime_decimation_search = 0x00000001; + enc->enc_pic.me.motion_est_half_pixel = 0x00000001; + enc->enc_pic.me.enc_search_range_x = 0x00000010; + enc->enc_pic.me.enc_search_range_y = 0x00000010; + enc->enc_pic.me.enc_search1_range_x = 0x00000010; + enc->enc_pic.me.enc_search1_range_y = 0x00000010; +} + +static void get_pic_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + unsigned encNumMBsPerSlice; + encNumMBsPerSlice = align(enc->base.width, 16) / 16; + encNumMBsPerSlice *= align(enc->base.height, 16) / 16; + enc->enc_pic.pc.enc_crop_right_offset = (align(enc->base.width, 16) - enc->base.width) >> 1; + enc->enc_pic.pc.enc_crop_bottom_offset = (align(enc->base.height, 16) - enc->base.height) >> 1; + enc->enc_pic.pc.enc_num_mbs_per_slice = encNumMBsPerSlice; + enc->enc_pic.pc.enc_b_pic_pattern = MAX2(enc->base.max_references, 1) - 1; + enc->enc_pic.pc.enc_number_of_reference_frames = MIN2(enc->base.max_references, 2); + enc->enc_pic.pc.enc_max_num_ref_frames = enc->base.max_references + 1; + enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001; + 
enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001; + enc->enc_pic.pc.enc_cabac_enable = pic->pic_ctrl.enc_cabac_enable; + enc->enc_pic.pc.enc_constraint_set_flags = pic->pic_ctrl.enc_constraint_set_flags; + enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001; + enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001; +} + +static void get_task_info_param(struct rvce_encoder *enc) +{ + enc->enc_pic.ti.offset_of_next_task_info = 0xffffffff; +} + +static void get_feedback_buffer_param(struct rvce_encoder *enc) +{ + enc->enc_pic.fb.feedback_ring_size = 0x00000001; +} + +static void get_config_ext_param(struct rvce_encoder *enc) +{ + enc->enc_pic.ce.enc_enable_perf_logging = 0x00000003; +} + +static void get_vui_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.enable_vui = pic->enable_vui; + enc->enc_pic.vui.video_format = 0x00000005; + enc->enc_pic.vui.color_prim = 0x00000002; + enc->enc_pic.vui.transfer_char = 0x00000002; + enc->enc_pic.vui.matrix_coef = 0x00000002; + enc->enc_pic.vui.timing_info_present_flag = 0x00000001; + enc->enc_pic.vui.num_units_in_tick = pic->rate_ctrl.frame_rate_den; + enc->enc_pic.vui.time_scale = pic->rate_ctrl.frame_rate_num * 2; + enc->enc_pic.vui.fixed_frame_rate_flag = 0x00000001; + enc->enc_pic.vui.bit_rate_scale = 0x00000004; + enc->enc_pic.vui.cpb_size_scale = 0x00000006; + enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.cpb_removal_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.dpb_output_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.time_offset_length = 0x00000018; + enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag = 0x00000001; + enc->enc_pic.vui.max_bytes_per_pic_denom = 0x00000002; + enc->enc_pic.vui.max_bits_per_mb_denom = 0x00000001; + enc->enc_pic.vui.log2_max_mv_length_hori = 0x00000010; + enc->enc_pic.vui.log2_max_mv_length_vert = 0x00000010; + enc->enc_pic.vui.num_reorder_frames = 0x00000003; + enc->enc_pic.vui.max_dec_frame_buffering = 0x00000003; +} + +void radeon_vce_52_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + get_rate_control_param(enc, pic); + get_motion_estimation_param(enc, pic); + get_pic_control_param(enc, pic); + get_task_info_param(enc); + get_feedback_buffer_param(enc); + get_vui_param(enc, pic); + get_config_ext_param(enc); + + enc->enc_pic.picture_type = pic->picture_type; + enc->enc_pic.frame_num = pic->frame_num; + enc->enc_pic.frame_num_cnt = pic->frame_num_cnt; + enc->enc_pic.p_remain = pic->p_remain; + enc->enc_pic.i_remain = pic->i_remain; + enc->enc_pic.gop_cnt = pic->gop_cnt; + enc->enc_pic.pic_order_cnt = pic->pic_order_cnt; + enc->enc_pic.ref_idx_l0 = pic->ref_idx_l0; + enc->enc_pic.ref_idx_l1 = pic->ref_idx_l1; + enc->enc_pic.not_referenced = pic->not_referenced; + if (enc->dual_inst) + enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x00000201; + else + enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x01000201; + enc->enc_pic.is_idr = pic->is_idr; +} + static void create(struct rvce_encoder *enc) { enc->task_info(enc, 0x00000000, 0, 0, 0); RVCE_BEGIN(0x01000001); // create cmd - RVCE_CS(0x00000000); // encUseCircularBuffer + RVCE_CS(enc->enc_pic.ec.enc_use_circular_buffer); RVCE_CS(profiles[enc->base.profile - PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE]); // encProfile RVCE_CS(enc->base.level); // encLevel - RVCE_CS(0x00000000); // encPicStructRestriction + RVCE_CS(enc->enc_pic.ec.enc_pic_struct_restriction); RVCE_CS(enc->base.width); // encImageWidth 
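/* Worked example (not part of this patch, values hypothetical): the
 * get_vui_param() helper above encodes timing with the usual H.264 VUI
 * convention, frame rate = time_scale / (2 * num_units_in_tick).  For a
 * 29.97 fps stream with frame_rate_num = 30000 and frame_rate_den = 1001:
 *
 *    num_units_in_tick = frame_rate_den     = 1001
 *    time_scale        = frame_rate_num * 2 = 60000
 *    60000 / (2 * 1001) = 29.97 frames per second
 */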
RVCE_CS(enc->base.height); // encImageHeight RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw - RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO + RVCE_CS(enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants); - RVCE_CS(0x00000000); // encPreEncodeContextBufferOffset - RVCE_CS(0x00000000); // encPreEncodeInputLumaBufferOffset - RVCE_CS(0x00000000); // encPreEncodeInputChromaBufferOffs - RVCE_CS(0x00000000); // encPreEncodeMode|ChromaFlag|VBAQMode|SceneChangeSensitivity + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_context_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_luma_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_chroma_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_mode_chromaflag_vbaqmode_scenechangesensitivity); RVCE_END(); } @@ -73,7 +198,7 @@ static void encode(struct rvce_encoder *enc) if (enc->dual_inst) { if (bs_idx == 0) dep = 1; - else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + else if (enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) dep = 0; else dep = 2; @@ -107,13 +232,13 @@ static void encode(struct rvce_encoder *enc) } RVCE_BEGIN(0x03000001); // encode - RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders - RVCE_CS(0x00000000); // pictureStructure + RVCE_CS(enc->enc_pic.frame_num ? 0x0 : 0x11); // insertHeaders + RVCE_CS(enc->enc_pic.eo.picture_structure); RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize - RVCE_CS(0x00000000); // forceRefreshMap - RVCE_CS(0x00000000); // insertAUD - RVCE_CS(0x00000000); // endOfSequence - RVCE_CS(0x00000000); // endOfStream + RVCE_CS(enc->enc_pic.eo.force_refresh_map); + RVCE_CS(enc->enc_pic.eo.insert_aud); + RVCE_CS(enc->enc_pic.eo.end_of_sequence); + RVCE_CS(enc->enc_pic.eo.end_of_stream); RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, @@ -122,121 +247,396 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch if (enc->dual_pipe) - RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00000000; else - RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) - RVCE_CS(0x00000000); // encInputPicTileConfig - RVCE_CS(enc->pic.picture_type); // encPicType - RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag - RVCE_CS(0x00000000); // encIdrPicId - RVCE_CS(0x00000000); // encMGSKeyPic - RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag - RVCE_CS(0x00000000); // encTemporalLayerIndex - RVCE_CS(0x00000000); // num_ref_idx_active_override_flag - RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1 - RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1 - - i = enc->pic.frame_num - enc->pic.ref_idx_l0; - if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) { - RVCE_CS(0x00000001); // encRefListModificationOp - RVCE_CS(i - 1); // encRefListModificationNum + enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00010000; + RVCE_CS(enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload); + 
RVCE_CS(enc->enc_pic.eo.enc_input_pic_tile_config); + RVCE_CS(enc->enc_pic.picture_type); // encPicType + RVCE_CS(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag + if ((enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) && (enc->enc_pic.eo.enc_idr_pic_id !=0)) + enc->enc_pic.eo.enc_idr_pic_id = enc->enc_pic.idr_pic_id - 1; + else + enc->enc_pic.eo.enc_idr_pic_id = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_idr_pic_id); + RVCE_CS(enc->enc_pic.eo.enc_mgs_key_pic); + RVCE_CS(!enc->enc_pic.not_referenced); + RVCE_CS(enc->enc_pic.eo.enc_temporal_layer_index); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_active_override_flag); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_l0_active_minus1); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_l1_active_minus1); + + i = enc->enc_pic.frame_num - enc->enc_pic.ref_idx_l0; + if (i > 1 && enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) { + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000001; + enc->enc_pic.eo.enc_ref_list_modification_num = i - 1; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } else { - RVCE_CS(0x00000000); // encRefListModificationOp - RVCE_CS(0x00000000); // encRefListModificationNum + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000; + enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } for (i = 0; i < 3; ++i) { - RVCE_CS(0x00000000); // encRefListModificationOp - RVCE_CS(0x00000000); // encRefListModificationNum + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000; + enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } for (i = 0; i < 4; ++i) { - RVCE_CS(0x00000000); // encDecodedPictureMarkingOp - RVCE_CS(0x00000000); // encDecodedPictureMarkingNum - RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx - RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp - RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_op); + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_num); + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_idx); + RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_op); + RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_num); } // encReferencePictureL0[0] RVCE_CS(0x00000000); // pictureStructure - if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || - enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || + enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l0 = l0_slot(enc); rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset); - RVCE_CS(l0->picture_type); // encPicType - RVCE_CS(l0->frame_num); // frameNumber - RVCE_CS(l0->pic_order_cnt); // pictureOrderCount - RVCE_CS(luma_offset); // lumaOffset - RVCE_CS(chroma_offset); // chromaOffset + RVCE_CS(l0->picture_type); + RVCE_CS(l0->frame_num); + RVCE_CS(l0->pic_order_cnt); + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); } else { - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l0_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l0_frame_number = 0x00000000; 
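/* Sketch (not part of this patch, numbers assumed): the 0xffffffff
 * luma/chroma offsets used here appear to mark an unused reference slot.
 * For a used slot the CPB holds NV12 data, so a slot spans
 * pitch * aligned_height * 3 / 2 bytes, matching the cpb_size arithmetic
 * earlier in radeon_vce.c; e.g. for a hypothetical pitch of 2048 and an
 * aligned height of 1088:
 *
 *    luma plane  = 2048 * 1088      = 2228224 bytes
 *    chroma      = luma size / 2    = 1114112 bytes, following the luma
 *    slot total  = 2228224 * 3 / 2  = 3342336 bytes
 */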
+ enc->enc_pic.eo.l0_picture_order_count = 0x00000000; + enc->enc_pic.eo.l0_luma_offset = 0xffffffff; + enc->enc_pic.eo.l0_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l0_frame_number); + RVCE_CS(enc->enc_pic.eo.l0_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l0_luma_offset); + RVCE_CS(enc->enc_pic.eo.l0_chroma_offset); } // encReferencePictureL0[1] - RVCE_CS(0x00000000); // pictureStructure - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l0_picture_structure = 0x00000000; + enc->enc_pic.eo.l0_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l0_frame_number = 0x00000000; + enc->enc_pic.eo.l0_picture_order_count = 0x00000000; + enc->enc_pic.eo.l0_luma_offset = 0xffffffff; + enc->enc_pic.eo.l0_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l0_picture_structure); + RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l0_frame_number); + RVCE_CS(enc->enc_pic.eo.l0_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l0_luma_offset); + RVCE_CS(enc->enc_pic.eo.l0_chroma_offset); // encReferencePictureL1[0] RVCE_CS(0x00000000); // pictureStructure - if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l1 = l1_slot(enc); rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset); - RVCE_CS(l1->picture_type); // encPicType - RVCE_CS(l1->frame_num); // frameNumber - RVCE_CS(l1->pic_order_cnt); // pictureOrderCount - RVCE_CS(luma_offset); // lumaOffset - RVCE_CS(chroma_offset); // chromaOffset + RVCE_CS(l1->picture_type); + RVCE_CS(l1->frame_num); + RVCE_CS(l1->pic_order_cnt); + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); } else { - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l1_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l1_frame_number = 0x00000000; + enc->enc_pic.eo.l1_picture_order_count = 0x00000000; + enc->enc_pic.eo.l1_luma_offset = 0xffffffff; + enc->enc_pic.eo.l1_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l1_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l1_frame_number); + RVCE_CS(enc->enc_pic.eo.l1_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l1_luma_offset); + RVCE_CS(enc->enc_pic.eo.l1_chroma_offset); } rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset); - RVCE_CS(luma_offset); // encReconstructedLumaOffset - RVCE_CS(chroma_offset); // encReconstructedChromaOffset - RVCE_CS(0x00000000); // encColocBufferOffset - RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset - RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset - RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset - RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset - RVCE_CS(0x00000000); // pictureCount - RVCE_CS(enc->pic.frame_num); // frameNumber - RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount - RVCE_CS(0x00000000); // numIPicRemainInRCGOP - RVCE_CS(0x00000000); // numPPicRemainInRCGOP - RVCE_CS(0x00000000); // numBPicRemainInRCGOP - RVCE_CS(0x00000000); // numIRPicRemainInRCGOP - RVCE_CS(0x00000000); // enableIntraRefresh - - RVCE_CS(0x00000000); // aq_variance_en - RVCE_CS(0x00000000); // aq_block_size - RVCE_CS(0x00000000); // aq_mb_variance_sel - 
RVCE_CS(0x00000000); // aq_frame_variance_sel - RVCE_CS(0x00000000); // aq_param_a - RVCE_CS(0x00000000); // aq_param_b - RVCE_CS(0x00000000); // aq_param_c - RVCE_CS(0x00000000); // aq_param_d - RVCE_CS(0x00000000); // aq_param_e - - RVCE_CS(0x00000000); // contextInSFB + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); + RVCE_CS(enc->enc_pic.eo.enc_coloc_buffer_offset); + RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_luma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_chroma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_luma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_chroma_offset); + RVCE_CS(enc->enc_pic.frame_num_cnt-1); + RVCE_CS(enc->enc_pic.frame_num); + RVCE_CS(enc->enc_pic.pic_order_cnt); + RVCE_CS(enc->enc_pic.i_remain); + RVCE_CS(enc->enc_pic.p_remain); + RVCE_CS(enc->enc_pic.eo.num_b_pic_remain_in_rcgop); + RVCE_CS(enc->enc_pic.eo.num_ir_pic_remain_in_rcgop); + RVCE_CS(enc->enc_pic.eo.enable_intra_refresh); + + RVCE_CS(enc->enc_pic.eo.aq_variance_en); + RVCE_CS(enc->enc_pic.eo.aq_block_size); + RVCE_CS(enc->enc_pic.eo.aq_mb_variance_sel); + RVCE_CS(enc->enc_pic.eo.aq_frame_variance_sel); + RVCE_CS(enc->enc_pic.eo.aq_param_a); + RVCE_CS(enc->enc_pic.eo.aq_param_b); + RVCE_CS(enc->enc_pic.eo.aq_param_c); + RVCE_CS(enc->enc_pic.eo.aq_param_d); + RVCE_CS(enc->enc_pic.eo.aq_param_e); + + RVCE_CS(enc->enc_pic.eo.context_in_sfb); RVCE_END(); } -void radeon_vce_52_init(struct rvce_encoder *enc) +static void rate_control(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000005); // rate control + RVCE_CS(enc->enc_pic.rc.rc_method); + RVCE_CS(enc->enc_pic.rc.target_bitrate); + RVCE_CS(enc->enc_pic.rc.peak_bitrate); + RVCE_CS(enc->enc_pic.rc.frame_rate_num); + RVCE_CS(enc->enc_pic.rc.gop_size); + RVCE_CS(enc->enc_pic.rc.quant_i_frames); + RVCE_CS(enc->enc_pic.rc.quant_p_frames); + RVCE_CS(enc->enc_pic.rc.quant_b_frames); + RVCE_CS(enc->enc_pic.rc.vbv_buffer_size); + RVCE_CS(enc->enc_pic.rc.frame_rate_den); + RVCE_CS(enc->enc_pic.rc.vbv_buf_lv); + RVCE_CS(enc->enc_pic.rc.max_au_size); + RVCE_CS(enc->enc_pic.rc.qp_initial_mode); + RVCE_CS(enc->enc_pic.rc.target_bits_picture); + RVCE_CS(enc->enc_pic.rc.peak_bits_picture_integer); + RVCE_CS(enc->enc_pic.rc.peak_bits_picture_fraction); + RVCE_CS(enc->enc_pic.rc.min_qp); + RVCE_CS(enc->enc_pic.rc.max_qp); + RVCE_CS(enc->enc_pic.rc.skip_frame_enable); + RVCE_CS(enc->enc_pic.rc.fill_data_enable); + RVCE_CS(enc->enc_pic.rc.enforce_hrd); + RVCE_CS(enc->enc_pic.rc.b_pics_delta_qp); + RVCE_CS(enc->enc_pic.rc.ref_b_pics_delta_qp); + RVCE_CS(enc->enc_pic.rc.rc_reinit_disable); + RVCE_CS(enc->enc_pic.rc.enc_lcvbr_init_qp_flag); + RVCE_CS(enc->enc_pic.rc.lcvbrsatd_based_nonlinear_bit_budget_flag); + RVCE_END(); +} + +static void config(struct rvce_encoder *enc) { - radeon_vce_50_init(enc); + enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0); + enc->rate_control(enc); + enc->config_extension(enc); + enc->motion_estimation(enc); + enc->rdo(enc); + if (enc->use_vui) + enc->vui(enc); + enc->pic_control(enc); +} + +static void config_extension(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000001); // config extension + RVCE_CS(enc->enc_pic.ce.enc_enable_perf_logging); + RVCE_END(); +} +static void destroy(struct rvce_encoder *enc) +{ + enc->task_info(enc, 0x00000001, 0, 0, 0); + + RVCE_BEGIN(0x02000001); // destroy + RVCE_END(); +} + +static void feedback(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x05000005); // feedback buffer + RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 
0x0); // feedbackRingAddressHi/Lo + RVCE_CS(enc->enc_pic.fb.feedback_ring_size); + RVCE_END(); +} + +static void motion_estimation(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000007); // motion estimation + RVCE_CS(enc->enc_pic.me.enc_ime_decimation_search); + RVCE_CS(enc->enc_pic.me.motion_est_half_pixel); + RVCE_CS(enc->enc_pic.me.motion_est_quarter_pixel); + RVCE_CS(enc->enc_pic.me.disable_favor_pmv_point); + RVCE_CS(enc->enc_pic.me.force_zero_point_center); + RVCE_CS(enc->enc_pic.me.lsmvert); + RVCE_CS(enc->enc_pic.me.enc_search_range_x); + RVCE_CS(enc->enc_pic.me.enc_search_range_y); + RVCE_CS(enc->enc_pic.me.enc_search1_range_x); + RVCE_CS(enc->enc_pic.me.enc_search1_range_y); + RVCE_CS(enc->enc_pic.me.disable_16x16_frame1); + RVCE_CS(enc->enc_pic.me.disable_satd); + RVCE_CS(enc->enc_pic.me.enable_amd); + RVCE_CS(enc->enc_pic.me.enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.enc_ime_skip_x); + RVCE_CS(enc->enc_pic.me.enc_ime_skip_y); + RVCE_CS(enc->enc_pic.me.enc_en_ime_overw_dis_subm); + RVCE_CS(enc->enc_pic.me.enc_ime_overw_dis_subm_no); + RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_x); + RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_y); + RVCE_CS(enc->enc_pic.me.parallel_mode_speedup_enable); + RVCE_CS(enc->enc_pic.me.fme0_enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.fme1_enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.ime_sw_speedup_enable); + RVCE_END(); +} + +static void pic_control(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000002); // pic control + RVCE_CS(enc->enc_pic.pc.enc_use_constrained_intra_pred); + RVCE_CS(enc->enc_pic.pc.enc_cabac_enable); + RVCE_CS(enc->enc_pic.pc.enc_cabac_idc); + RVCE_CS(enc->enc_pic.pc.enc_loop_filter_disable); + RVCE_CS(enc->enc_pic.pc.enc_lf_beta_offset); + RVCE_CS(enc->enc_pic.pc.enc_lf_alpha_c0_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_left_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_right_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_top_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_bottom_offset); + RVCE_CS(enc->enc_pic.pc.enc_num_mbs_per_slice); + RVCE_CS(enc->enc_pic.pc.enc_intra_refresh_num_mbs_per_slot); + RVCE_CS(enc->enc_pic.pc.enc_force_intra_refresh); + RVCE_CS(enc->enc_pic.pc.enc_force_imb_period); + RVCE_CS(enc->enc_pic.pc.enc_pic_order_cnt_type); + RVCE_CS(enc->enc_pic.pc.log2_max_pic_order_cnt_lsb_minus4); + RVCE_CS(enc->enc_pic.pc.enc_sps_id); + RVCE_CS(enc->enc_pic.pc.enc_pps_id); + RVCE_CS(enc->enc_pic.pc.enc_constraint_set_flags); + RVCE_CS(enc->enc_pic.pc.enc_b_pic_pattern); + RVCE_CS(enc->enc_pic.pc.weight_pred_mode_b_picture); + RVCE_CS(enc->enc_pic.pc.enc_number_of_reference_frames); + RVCE_CS(enc->enc_pic.pc.enc_max_num_ref_frames); + RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l0); + RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l1); + RVCE_CS(enc->enc_pic.pc.enc_slice_mode); + RVCE_CS(enc->enc_pic.pc.enc_max_slice_size); + RVCE_END(); +} + +static void rdo(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000008); // rdo + RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_i_frame); + RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_p_frame); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv_1); + RVCE_CS(enc->enc_pic.rdo.enc_16x16_cost_adj); + 
RVCE_CS(enc->enc_pic.rdo.enc_skip_cost_adj); + RVCE_CS(enc->enc_pic.rdo.enc_force_16x16_skip); + RVCE_CS(enc->enc_pic.rdo.enc_disable_threshold_calc_a); + RVCE_CS(enc->enc_pic.rdo.enc_luma_coeff_cost); + RVCE_CS(enc->enc_pic.rdo.enc_luma_mb_coeff_cost); + RVCE_CS(enc->enc_pic.rdo.enc_chroma_coeff_cost); + RVCE_END(); +} + +static void session(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x00000001); // session cmd + RVCE_CS(enc->stream_handle); + RVCE_END(); +} + +static void task_info(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, uint32_t ring_idx) +{ + RVCE_BEGIN(0x00000002); // task info + if (op == 0x3) { + if (enc->task_info_idx) { + uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3; + // Update offsetOfNextTaskInfo + enc->cs->current.buf[enc->task_info_idx] = offs; + } + enc->task_info_idx = enc->cs->current.cdw; + } + enc->enc_pic.ti.task_operation = op; + enc->enc_pic.ti.reference_picture_dependency = dep; + enc->enc_pic.ti.feedback_index = fb_idx; + enc->enc_pic.ti.video_bitstream_ring_index = ring_idx; + RVCE_CS(enc->enc_pic.ti.offset_of_next_task_info); + RVCE_CS(enc->enc_pic.ti.task_operation); + RVCE_CS(enc->enc_pic.ti.reference_picture_dependency); + RVCE_CS(enc->enc_pic.ti.collocate_flag_dependency); + RVCE_CS(enc->enc_pic.ti.feedback_index); + RVCE_CS(enc->enc_pic.ti.video_bitstream_ring_index); + RVCE_END(); +} + +static void vui(struct rvce_encoder *enc) +{ + int i; + + if (!enc->enc_pic.enable_vui) + return; + + RVCE_BEGIN(0x04000009); // vui + RVCE_CS(enc->enc_pic.vui.aspect_ratio_info_present_flag); + RVCE_CS(enc->enc_pic.vui.aspect_ratio_idc); + RVCE_CS(enc->enc_pic.vui.sar_width); + RVCE_CS(enc->enc_pic.vui.sar_height); + RVCE_CS(enc->enc_pic.vui.overscan_info_present_flag); + RVCE_CS(enc->enc_pic.vui.overscan_Approp_flag); + RVCE_CS(enc->enc_pic.vui.video_signal_type_present_flag); + RVCE_CS(enc->enc_pic.vui.video_format); + RVCE_CS(enc->enc_pic.vui.video_full_range_flag); + RVCE_CS(enc->enc_pic.vui.color_description_present_flag); + RVCE_CS(enc->enc_pic.vui.color_prim); + RVCE_CS(enc->enc_pic.vui.transfer_char); + RVCE_CS(enc->enc_pic.vui.matrix_coef); + RVCE_CS(enc->enc_pic.vui.chroma_loc_info_present_flag); + RVCE_CS(enc->enc_pic.vui.chroma_loc_top); + RVCE_CS(enc->enc_pic.vui.chroma_loc_bottom); + RVCE_CS(enc->enc_pic.vui.timing_info_present_flag); + RVCE_CS(enc->enc_pic.vui.num_units_in_tick); + RVCE_CS(enc->enc_pic.vui.time_scale); + RVCE_CS(enc->enc_pic.vui.fixed_frame_rate_flag); + RVCE_CS(enc->enc_pic.vui.nal_hrd_parameters_present_flag); + RVCE_CS(enc->enc_pic.vui.cpb_cnt_minus1); + RVCE_CS(enc->enc_pic.vui.bit_rate_scale); + RVCE_CS(enc->enc_pic.vui.cpb_size_scale); + for (i = 0; i < 32; i++) { + RVCE_CS(enc->enc_pic.vui.bit_rate_value_minus); + RVCE_CS(enc->enc_pic.vui.cpb_size_value_minus); + RVCE_CS(enc->enc_pic.vui.cbr_flag); + } + RVCE_CS(enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.cpb_removal_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.dpb_output_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.time_offset_length); + RVCE_CS(enc->enc_pic.vui.low_delay_hrd_flag); + RVCE_CS(enc->enc_pic.vui.pic_struct_present_flag); + RVCE_CS(enc->enc_pic.vui.bitstream_restriction_present_flag); + RVCE_CS(enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag); + RVCE_CS(enc->enc_pic.vui.max_bytes_per_pic_denom); + RVCE_CS(enc->enc_pic.vui.max_bits_per_mb_denom); + RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_hori); + RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_vert); + 
RVCE_CS(enc->enc_pic.vui.num_reorder_frames); + RVCE_CS(enc->enc_pic.vui.max_dec_frame_buffering); + RVCE_END(); +} + +void radeon_vce_52_init(struct rvce_encoder *enc) +{ + enc->session = session; + enc->task_info = task_info; enc->create = create; + enc->feedback = feedback; + enc->rate_control = rate_control; + enc->config_extension = config_extension; + enc->pic_control = pic_control; + enc->motion_estimation = motion_estimation; + enc->rdo = rdo; + enc->vui = vui; + enc->config = config; enc->encode = encode; + enc->destroy = destroy; } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c index f56c6cf6c..de8e11cd8 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c @@ -43,6 +43,8 @@ #include "radeon_video.h" #include "radeon_vce.h" +#define UVD_FW_1_66_16 ((1 << 24) | (66 << 16) | (16 << 8)) + /* generate an stream handle */ unsigned rvid_alloc_stream_handle() { @@ -64,8 +66,14 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, { memset(buffer, 0, sizeof(*buffer)); buffer->usage = usage; + + /* Hardware buffer placement restrictions require the kernel to be + * able to move buffers around individually, so request a + * non-sub-allocated buffer. + */ buffer->res = (struct r600_resource *) - pipe_buffer_create(screen, PIPE_BIND_CUSTOM, usage, size); + pipe_buffer_create(screen, PIPE_BIND_CUSTOM | PIPE_BIND_SHARED, + usage, size); return buffer->res != NULL; } @@ -73,7 +81,7 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, /* destroy a buffer */ void rvid_destroy_buffer(struct rvid_buffer *buffer) { - pipe_resource_reference((struct pipe_resource **)&buffer->res, NULL); + r600_resource_reference(&buffer->res, NULL); } /* reallocate a buffer, preserving its content */ @@ -89,11 +97,11 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) goto error; - src = ws->buffer_map(old_buf.res->cs_buf, cs, PIPE_TRANSFER_READ); + src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); if (!src) goto error; - dst = ws->buffer_map(new_buf->res->cs_buf, cs, PIPE_TRANSFER_WRITE); + dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); if (!dst) goto error; @@ -103,14 +111,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, dst += bytes; memset(dst, 0, new_size); } - ws->buffer_unmap(new_buf->res->cs_buf); - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(new_buf->res->buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(&old_buf); return true; error: if (src) - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(new_buf); *new_buf = old_buf; return false; @@ -122,7 +130,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) struct r600_common_context *rctx = (struct r600_common_context*)context; rctx->clear_buffer(context, &buffer->res->b.b, 0, buffer->res->buf->size, - 0, false); + 0, R600_COHERENCY_NONE); context->flush(context, NULL, 0); } @@ -130,7 +138,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) * join surfaces into the same buffer with identical tiling params * sumup their sizes and replace the backend buffers with a single bo */ -void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, +void 
rvid_join_surfaces(struct radeon_winsys* ws, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]) { @@ -165,7 +173,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, /* adjust the texture layer offsets */ off = align(off, surfaces[i]->bo_alignment); - for (j = 0; j < Elements(surfaces[i]->level); ++j) + for (j = 0; j < ARRAY_SIZE(surfaces[i]->level); ++j) surfaces[i]->level[j].offset += off; off += surfaces[i]->bo_size; } @@ -185,7 +193,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, /* TODO: 2D tiling workaround */ alignment *= 2; - pb = ws->buffer_create(ws, size, alignment, bind, RADEON_DOMAIN_VRAM, 0); + pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, 0); if (!pb) return; @@ -206,30 +214,33 @@ int rvid_get_video_param(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; enum pipe_video_format codec = u_reduce_video_profile(profile); + struct radeon_info info; + + rscreen->ws->query_info(rscreen->ws, &info); if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { switch (param) { case PIPE_VIDEO_CAP_SUPPORTED: return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && rvce_is_fw_version_supported(rscreen); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: + case PIPE_VIDEO_CAP_MAX_HEIGHT: return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_STACKED_FRAMES: + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: return (rscreen->family < CHIP_TONGA) ? 1 : 2; - default: - return 0; + default: + return 0; } } @@ -237,18 +248,27 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_SUPPORTED: switch (codec) { case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; case PIPE_VIDEO_FORMAT_MPEG4: + /* no support for MPEG4 on older hw */ + return rscreen->family >= CHIP_PALM; case PIPE_VIDEO_FORMAT_MPEG4_AVC: - if (rscreen->family < CHIP_PALM) - /* no support for MPEG4 */ - return codec != PIPE_VIDEO_FORMAT_MPEG4; + if ((rscreen->family == CHIP_POLARIS10 || + rscreen->family == CHIP_POLARIS11) && + info.uvd_fw_version < UVD_FW_1_66_16 ) { + RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); + return false; + } return true; case PIPE_VIDEO_FORMAT_VC1: return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ - return rscreen->family >= CHIP_CARRIZO && - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + if (rscreen->family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (rscreen->family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } @@ -257,7 +277,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 
2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; + return (rscreen->family < CHIP_TONGA) ? 1152 : 4096; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -294,8 +314,9 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: - return 41; + return (rscreen->family < CHIP_TONGA) ? 41 : 52; case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: return 186; default: return 0; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h index c9ee67f07..39305b4fd 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h @@ -66,7 +66,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) /* join surfaces into the same buffer with identical tiling params sumup their sizes and replace the backend buffers with a single bo */ -void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, +void rvid_join_surfaces(struct radeon_winsys* ws, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h index f9a7f878f..8946209d3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h @@ -26,25 +26,12 @@ /* The public winsys interface header for the radeon driver. */ -/* R300 features in DRM. - * - * 2.6.0: - * - Hyper-Z - * - GB_Z_PEQ_CONFIG on rv350->r4xx - * - R500 FG_ALPHA_VALUE - * - * 2.8.0: - * - R500 US_FORMAT regs - * - R500 ARGB2101010 colorbuffer - * - CMask and AA regs - * - R16F/RG16F - */ - #include "pipebuffer/pb_buffer.h" +#include "amd/common/amd_family.h" + #define RADEON_FLUSH_ASYNC (1 << 0) -#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */ -#define RADEON_FLUSH_END_OF_FRAME (1 << 2) +#define RADEON_FLUSH_END_OF_FRAME (1 << 1) /* Tiling flags. */ enum radeon_bo_layout { @@ -65,94 +52,18 @@ enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_GTT_WC = (1 << 0), RADEON_FLAG_CPU_ACCESS = (1 << 1), RADEON_FLAG_NO_CPU_ACCESS = (1 << 2), + RADEON_FLAG_HANDLE = (1 << 3), /* the buffer most not be suballocated */ }; enum radeon_bo_usage { /* bitfield */ RADEON_USAGE_READ = 2, RADEON_USAGE_WRITE = 4, - RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE -}; + RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE, -enum radeon_family { - CHIP_UNKNOWN = 0, - CHIP_R300, /* R3xx-based cores. */ - CHIP_R350, - CHIP_RV350, - CHIP_RV370, - CHIP_RV380, - CHIP_RS400, - CHIP_RC410, - CHIP_RS480, - CHIP_R420, /* R4xx-based cores. */ - CHIP_R423, - CHIP_R430, - CHIP_R480, - CHIP_R481, - CHIP_RV410, - CHIP_RS600, - CHIP_RS690, - CHIP_RS740, - CHIP_RV515, /* R5xx-based cores. 
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
index f9a7f878f..8946209d3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
@@ -26,25 +26,12 @@
 
 /* The public winsys interface header for the radeon driver. */
 
-/* R300 features in DRM.
- *
- * 2.6.0:
- * - Hyper-Z
- * - GB_Z_PEQ_CONFIG on rv350->r4xx
- * - R500 FG_ALPHA_VALUE
- *
- * 2.8.0:
- * - R500 US_FORMAT regs
- * - R500 ARGB2101010 colorbuffer
- * - CMask and AA regs
- * - R16F/RG16F
- */
-
 #include "pipebuffer/pb_buffer.h"
 
+#include "amd/common/amd_family.h"
+
 #define RADEON_FLUSH_ASYNC             (1 << 0)
-#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */
-#define RADEON_FLUSH_END_OF_FRAME      (1 << 2)
+#define RADEON_FLUSH_END_OF_FRAME      (1 << 1)
 
 /* Tiling flags. */
 enum radeon_bo_layout {
@@ -65,94 +52,18 @@ enum radeon_bo_flag { /* bitfield */
    RADEON_FLAG_GTT_WC =        (1 << 0),
    RADEON_FLAG_CPU_ACCESS =    (1 << 1),
    RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
+   RADEON_FLAG_HANDLE =        (1 << 3), /* the buffer must not be suballocated */
 };
 
 enum radeon_bo_usage { /* bitfield */
    RADEON_USAGE_READ = 2,
    RADEON_USAGE_WRITE = 4,
-   RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE
-};
+   RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
 
-enum radeon_family {
-   CHIP_UNKNOWN = 0,
-   CHIP_R300, /* R3xx-based cores. */
-   CHIP_R350,
-   CHIP_RV350,
-   CHIP_RV370,
-   CHIP_RV380,
-   CHIP_RS400,
-   CHIP_RC410,
-   CHIP_RS480,
-   CHIP_R420, /* R4xx-based cores. */
-   CHIP_R423,
-   CHIP_R430,
-   CHIP_R480,
-   CHIP_R481,
-   CHIP_RV410,
-   CHIP_RS600,
-   CHIP_RS690,
-   CHIP_RS740,
-   CHIP_RV515, /* R5xx-based cores. */
-   CHIP_R520,
-   CHIP_RV530,
-   CHIP_R580,
-   CHIP_RV560,
-   CHIP_RV570,
-   CHIP_R600,
-   CHIP_RV610,
-   CHIP_RV630,
-   CHIP_RV670,
-   CHIP_RV620,
-   CHIP_RV635,
-   CHIP_RS780,
-   CHIP_RS880,
-   CHIP_RV770,
-   CHIP_RV730,
-   CHIP_RV710,
-   CHIP_RV740,
-   CHIP_CEDAR,
-   CHIP_REDWOOD,
-   CHIP_JUNIPER,
-   CHIP_CYPRESS,
-   CHIP_HEMLOCK,
-   CHIP_PALM,
-   CHIP_SUMO,
-   CHIP_SUMO2,
-   CHIP_BARTS,
-   CHIP_TURKS,
-   CHIP_CAICOS,
-   CHIP_CAYMAN,
-   CHIP_ARUBA,
-   CHIP_TAHITI,
-   CHIP_PITCAIRN,
-   CHIP_VERDE,
-   CHIP_OLAND,
-   CHIP_HAINAN,
-   CHIP_BONAIRE,
-   CHIP_KAVERI,
-   CHIP_KABINI,
-   CHIP_HAWAII,
-   CHIP_MULLINS,
-   CHIP_TONGA,
-   CHIP_ICELAND,
-   CHIP_CARRIZO,
-   CHIP_FIJI,
-   CHIP_STONEY,
-   CHIP_LAST,
-};
-
-enum chip_class {
-   CLASS_UNKNOWN = 0,
-   R300,
-   R400,
-   R500,
-   R600,
-   R700,
-   EVERGREEN,
-   CAYMAN,
-   SI,
-   CIK,
-   VI,
+
+   /* The winsys ensures that the CS submission will be scheduled after
+    * previously flushed CSs referencing this BO in a conflicting way.
+    */
+   RADEON_USAGE_SYNCHRONIZED = 8
 };
 
 enum ring_type {
@@ -167,10 +78,13 @@
 enum radeon_value_id {
    RADEON_REQUESTED_VRAM_MEMORY,
    RADEON_REQUESTED_GTT_MEMORY,
+   RADEON_MAPPED_VRAM,
+   RADEON_MAPPED_GTT,
    RADEON_BUFFER_WAIT_TIME_NS,
    RADEON_TIMESTAMP,
    RADEON_NUM_CS_FLUSHES,
    RADEON_NUM_BYTES_MOVED,
+   RADEON_NUM_EVICTIONS,
    RADEON_VRAM_USAGE,
    RADEON_GTT_USAGE,
    RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
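The new RADEON_MAPPED_VRAM/RADEON_MAPPED_GTT and RADEON_NUM_EVICTIONS values extend the existing query_value() hook (its signature appears further down in this header), so memory pressure becomes observable without any new entry point. A hedged sketch of how a HUD-style counter might read one of them (wrapper name is illustrative):

    static uint64_t query_num_evictions(struct radeon_winsys *ws)
    {
        /* Eviction count as reported by the kernel via the winsys. */
        return ws->query_value(ws, RADEON_NUM_EVICTIONS);
    }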
@@ -179,73 +93,161 @@
    RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
 };
 
+/* Each group of four has the same priority. */
 enum radeon_bo_priority {
-   RADEON_PRIO_MIN,
-   RADEON_PRIO_SHADER_DATA, /* shader code, resource descriptors */
-   RADEON_PRIO_SHADER_BUFFER_RO, /* read-only */
-   RADEON_PRIO_SHADER_TEXTURE_RO, /* read-only */
-   RADEON_PRIO_SHADER_RESOURCE_RW, /* buffers, textures, streamout, GS rings, RATs; read/write */
-   RADEON_PRIO_COLOR_BUFFER,
-   RADEON_PRIO_DEPTH_BUFFER,
-   RADEON_PRIO_SHADER_TEXTURE_MSAA,
-   RADEON_PRIO_COLOR_BUFFER_MSAA,
-   RADEON_PRIO_DEPTH_BUFFER_MSAA,
-   RADEON_PRIO_COLOR_META,
-   RADEON_PRIO_DEPTH_META,
-   RADEON_PRIO_MAX /* must be <= 15 */
+   RADEON_PRIO_FENCE = 0,
+   RADEON_PRIO_TRACE,
+   RADEON_PRIO_SO_FILLED_SIZE,
+   RADEON_PRIO_QUERY,
+
+   RADEON_PRIO_IB1 = 4, /* main IB submitted to the kernel */
+   RADEON_PRIO_IB2, /* IB executed with INDIRECT_BUFFER */
+   RADEON_PRIO_DRAW_INDIRECT,
+   RADEON_PRIO_INDEX_BUFFER,
+
+   RADEON_PRIO_VCE = 8,
+   RADEON_PRIO_UVD,
+   RADEON_PRIO_SDMA_BUFFER,
+   RADEON_PRIO_SDMA_TEXTURE,
+
+   RADEON_PRIO_CP_DMA = 12,
+
+   RADEON_PRIO_CONST_BUFFER = 16,
+   RADEON_PRIO_DESCRIPTORS,
+   RADEON_PRIO_BORDER_COLORS,
+
+   RADEON_PRIO_SAMPLER_BUFFER = 20,
+   RADEON_PRIO_VERTEX_BUFFER,
+
+   RADEON_PRIO_SHADER_RW_BUFFER = 24,
+   RADEON_PRIO_COMPUTE_GLOBAL,
+
+   RADEON_PRIO_SAMPLER_TEXTURE = 28,
+   RADEON_PRIO_SHADER_RW_IMAGE,
+
+   RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 32,
+
+   RADEON_PRIO_COLOR_BUFFER = 36,
+
+   RADEON_PRIO_DEPTH_BUFFER = 40,
+
+   RADEON_PRIO_COLOR_BUFFER_MSAA = 44,
+
+   RADEON_PRIO_DEPTH_BUFFER_MSAA = 48,
+
+   RADEON_PRIO_CMASK = 52,
+   RADEON_PRIO_DCC,
+   RADEON_PRIO_HTILE,
+   RADEON_PRIO_SHADER_BINARY, /* the hw can't hide instruction cache misses */
+
+   RADEON_PRIO_SHADER_RINGS = 56,
+
+   RADEON_PRIO_SCRATCH_BUFFER = 60,
+   /* 63 is the maximum value */
 };
 
 struct winsys_handle;
-struct radeon_winsys_cs_handle;
 struct radeon_winsys_ctx;
 
+struct radeon_winsys_cs_chunk {
+   unsigned cdw;    /* Number of used dwords. */
+   unsigned max_dw; /* Maximum number of dwords. */
+   uint32_t *buf;   /* The base pointer of the chunk. */
+};
+
 struct radeon_winsys_cs {
-   unsigned cdw;    /* Number of used dwords. */
-   unsigned max_dw; /* Maximum number of dwords. */
-   uint32_t *buf;   /* The command buffer. */
-   enum ring_type ring_type;
+   struct radeon_winsys_cs_chunk current;
+   struct radeon_winsys_cs_chunk *prev;
+   unsigned num_prev; /* Number of previous chunks. */
+   unsigned max_prev; /* Space in array pointed to by prev. */
+   unsigned prev_dw; /* Total number of dwords in previous chunks. */
+
+   /* Memory usage of the buffer list. These are always 0 for CE and preamble
+    * IBs. */
+   uint64_t used_vram;
+   uint64_t used_gart;
 };
 
 struct radeon_info {
+   /* PCI info: domain:bus:dev:func */
+   uint32_t pci_domain;
+   uint32_t pci_bus;
+   uint32_t pci_dev;
+   uint32_t pci_func;
+
+   /* Device info. */
    uint32_t pci_id;
    enum radeon_family family;
    enum chip_class chip_class;
+   uint32_t gart_page_size;
    uint64_t gart_size;
    uint64_t vram_size;
-   uint32_t max_sclk;
-   uint32_t max_compute_units;
-   uint32_t max_se;
-   uint32_t max_sh_per_se;
+   uint64_t max_alloc_size;
+   uint32_t min_alloc_size;
+   bool has_dedicated_vram;
+   bool has_virtual_memory;
+   bool gfx_ib_pad_with_type2;
+   bool has_sdma;
+   bool has_uvd;
+   uint32_t uvd_fw_version;
+   uint32_t vce_fw_version;
+   uint32_t me_fw_version;
+   uint32_t pfp_fw_version;
+   uint32_t ce_fw_version;
+   uint32_t vce_harvest_config;
+   uint32_t clock_crystal_freq;
 
+   /* Kernel info. */
    uint32_t drm_major; /* version */
    uint32_t drm_minor;
    uint32_t drm_patchlevel;
+   bool has_userptr;
 
-   boolean has_uvd;
-   uint32_t vce_fw_version;
-   boolean has_userptr;
+   /* Shader cores. */
+   uint32_t r600_max_quad_pipes; /* wave size / 16 */
+   uint32_t max_shader_clock;
+   uint32_t num_good_compute_units;
+   uint32_t max_se; /* shader engines */
+   uint32_t max_sh_per_se; /* shader arrays per shader engine */
 
+   /* Render backends (color + depth blocks). */
    uint32_t r300_num_gb_pipes;
    uint32_t r300_num_z_pipes;
-
-   uint32_t r600_num_backends;
-   uint32_t r600_clock_crystal_freq;
-   uint32_t r600_tiling_config;
-   uint32_t r600_num_tile_pipes;
-   uint32_t r600_max_pipes;
-   boolean r600_virtual_address;
-   boolean r600_has_dma;
-
-   uint32_t r600_backend_map;
-   boolean r600_backend_map_valid;
-
-   boolean si_tile_mode_array_valid;
+   uint32_t r600_gb_backend_map; /* R600 harvest config */
+   bool r600_gb_backend_map_valid;
+   uint32_t r600_num_banks;
+   uint32_t num_render_backends;
+   uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
+   uint32_t pipe_interleave_bytes;
+   uint32_t enabled_rb_mask; /* GCN harvest config */
+
+   /* Tile modes. */
    uint32_t si_tile_mode_array[32];
-   uint32_t si_backend_enabled_mask;
-
-   boolean cik_macrotile_mode_array_valid;
    uint32_t cik_macrotile_mode_array[16];
-   uint32_t vce_harvest_config;
+};
+
+/* Tiling info for display code, DRI sharing, and other data. */
+struct radeon_bo_metadata {
+   /* Tiling flags describing the texture layout for display code
+    * and DRI sharing.
+    */
+   enum radeon_bo_layout   microtile;
+   enum radeon_bo_layout   macrotile;
+   unsigned                pipe_config;
+   unsigned                bankw;
+   unsigned                bankh;
+   unsigned                tile_split;
+   unsigned                mtilea;
+   unsigned                num_banks;
+   unsigned                stride;
+   bool                    scanout;
+
+   /* Additional metadata associated with the buffer, in bytes.
+    * The maximum size is 64 * 4. This is opaque for the winsys & kernel.
+    * Supported by amdgpu only.
+    */
+   uint32_t                size_metadata;
+   uint32_t                metadata[64];
 };
 
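The monolithic dword array of the old radeon_winsys_cs becomes a current chunk plus an array of retired chunks, which is what makes IB chaining possible. The total size of a CS is therefore prev_dw plus the dwords of the current chunk, exactly the quantity that radeon_emitted() at the end of this header tests; a one-line helper makes the accounting explicit (helper name is illustrative):

    static inline unsigned cs_total_dw(struct radeon_winsys_cs *cs)
    {
        /* Dwords in retired chunks + dwords in the chunk being written. */
        return cs->prev_dw + cs->current.cdw;
    }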
 enum radeon_feature_id {
@@ -265,7 +267,6 @@
 #define RADEON_SURF_TYPE_2D_ARRAY               5
 #define RADEON_SURF_MODE_MASK                   0xFF
 #define RADEON_SURF_MODE_SHIFT                  8
-#define RADEON_SURF_MODE_LINEAR                 0
 #define RADEON_SURF_MODE_LINEAR_ALIGNED         1
 #define RADEON_SURF_MODE_1D                     2
 #define RADEON_SURF_MODE_2D                     3
@@ -276,6 +277,8 @@
 #define RADEON_SURF_HAS_SBUFFER_MIPTREE         (1 << 19)
 #define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
 #define RADEON_SURF_FMASK                       (1 << 21)
+#define RADEON_SURF_DISABLE_DCC                 (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
 
 #define RADEON_SURF_GET(v, field)   (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
 #define RADEON_SURF_SET(v, field)   (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
@@ -292,6 +295,9 @@ struct radeon_surf_level {
    uint32_t                    nblk_z;
    uint32_t                    pitch_bytes;
    uint32_t                    mode;
+   uint64_t                    dcc_offset;
+   uint64_t                    dcc_fast_clear_size;
+   bool                        dcc_enabled;
 };
 
 struct radeon_surf {
@@ -320,13 +326,34 @@
    uint32_t                    mtilea;
    uint32_t                    tile_split;
    uint32_t                    stencil_tile_split;
-   uint64_t                    stencil_offset;
    struct radeon_surf_level    level[RADEON_SURF_MAX_LEVEL];
    struct radeon_surf_level    stencil_level[RADEON_SURF_MAX_LEVEL];
    uint32_t                    tiling_index[RADEON_SURF_MAX_LEVEL];
    uint32_t                    stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
    uint32_t                    pipe_config;
    uint32_t                    num_banks;
+   uint32_t                    macro_tile_index;
+   uint32_t                    micro_tile_mode; /* displayable, thin, depth, rotated */
+
+   /* Whether the depth miptree or stencil miptree as used by the DB are
+    * adjusted from their TC compatible form to ensure depth/stencil
+    * compatibility. If either is true, the corresponding plane cannot be
+    * sampled from.
+    */
+   bool                        depth_adjusted;
+   bool                        stencil_adjusted;
+
+   uint64_t                    dcc_size;
+   uint64_t                    dcc_alignment;
+   /* TC-compatible HTILE only. */
+   uint64_t                    htile_size;
+   uint64_t                    htile_alignment;
+};
+
+struct radeon_bo_list_item {
+   uint64_t bo_size;
+   uint64_t vm_address;
+   uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
 };
 
 struct radeon_winsys {
@@ -378,15 +405,11 @@
     * \return          The created buffer object.
     */
    struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws,
-                                      unsigned size,
+                                      uint64_t size,
                                       unsigned alignment,
-                                      boolean use_reusable_pool,
                                       enum radeon_bo_domain domain,
                                       enum radeon_bo_flag flags);
 
-   struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)(
-         struct pb_buffer *buf);
-
    /**
    * Map the entire data store of a buffer object into the client's address
    * space.
@@ -396,7 +419,7 @@
    * \param usage     A bitmask of the PIPE_TRANSFER_* flags.
    * \return          The pointer at the beginning of the buffer.
    */
-   void *(*buffer_map)(struct radeon_winsys_cs_handle *buf,
+   void *(*buffer_map)(struct pb_buffer *buf,
                        struct radeon_winsys_cs *cs,
                        enum pipe_transfer_usage usage);
 
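buffer_map() now takes the pb_buffer itself instead of a separate CS handle, and buffer sizes grow to uint64_t. A hedged usage sketch (PIPE_TRANSFER_WRITE is the standard gallium transfer flag referenced in the doc comment above; helper name is illustrative):

    static void *map_for_write(struct radeon_winsys *ws,
                               struct pb_buffer *buf,
                               struct radeon_winsys_cs *cs)
    {
        /* Passing the CS lets the winsys flush or synchronize if the
         * buffer is still referenced by queued work. */
        return ws->buffer_map(buf, cs, PIPE_TRANSFER_WRITE);
    }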
@@ -405,7 +428,7 @@
    *
    * \param buf       A winsys buffer object to unmap.
    */
-   void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf);
+   void (*buffer_unmap)(struct pb_buffer *buf);
 
    /**
    * Wait for the buffer and return true if the buffer is not used
    *
@@ -419,45 +442,24 @@
                        enum radeon_bo_usage usage);
 
    /**
-    * Return tiling flags describing a memory layout of a buffer object.
+    * Return buffer metadata.
+    * (tiling info for display code, DRI sharing, and other data)
    *
    * \param buf       A winsys buffer object to get the flags from.
-    * \param macrotile A pointer to the return value of the microtile flag.
-    * \param microtile A pointer to the return value of the macrotile flag.
-    *
-    * \note microtile and macrotile are not bitmasks!
+    * \param md        Metadata
    */
-   void (*buffer_get_tiling)(struct pb_buffer *buf,
-                             enum radeon_bo_layout *microtile,
-                             enum radeon_bo_layout *macrotile,
-                             unsigned *bankw, unsigned *bankh,
-                             unsigned *tile_split,
-                             unsigned *stencil_tile_split,
-                             unsigned *mtilea,
-                             bool *scanout);
+   void (*buffer_get_metadata)(struct pb_buffer *buf,
+                               struct radeon_bo_metadata *md);
 
    /**
-    * Set tiling flags describing a memory layout of a buffer object.
+    * Set buffer metadata.
+    * (tiling info for display code, DRI sharing, and other data)
    *
    * \param buf       A winsys buffer object to set the flags for.
-    * \param cs        A command stream to flush if the buffer is referenced by it.
-    * \param macrotile A macrotile flag.
-    * \param microtile A microtile flag.
-    * \param stride    A stride of the buffer in bytes, for texturing.
-    *
-    * \note microtile and macrotile are not bitmasks!
+    * \param md        Metadata
    */
-   void (*buffer_set_tiling)(struct pb_buffer *buf,
-                             struct radeon_winsys_cs *rcs,
-                             enum radeon_bo_layout microtile,
-                             enum radeon_bo_layout macrotile,
-                             unsigned pipe_config,
-                             unsigned bankw, unsigned bankh,
-                             unsigned tile_split,
-                             unsigned stencil_tile_split,
-                             unsigned mtilea, unsigned num_banks,
-                             unsigned stride,
-                             bool scanout);
+   void (*buffer_set_metadata)(struct pb_buffer *buf,
+                               struct radeon_bo_metadata *md);
 
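The two tiling accessors with their dozen positional arguments collapse into get/set of a single radeon_bo_metadata struct. A sketch of a caller preparing a linear scanout buffer for sharing (field values are illustrative; RADEON_LAYOUT_LINEAR is assumed from the radeon_bo_layout enum declared earlier in this header):

    static void set_linear_scanout_md(struct radeon_winsys *ws,
                                      struct pb_buffer *buf, unsigned stride)
    {
        struct radeon_bo_metadata md = {0};

        md.microtile = RADEON_LAYOUT_LINEAR;
        md.macrotile = RADEON_LAYOUT_LINEAR;
        md.stride = stride;
        md.scanout = true;
        ws->buffer_set_metadata(buf, &md);
    }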
    /**
    * Get a winsys buffer from a winsys handle. The internal structure
@@ -470,7 +472,7 @@
    */
    struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws,
                                            struct winsys_handle *whandle,
-                                           unsigned *stride);
+                                           unsigned *stride, unsigned *offset);
 
    /**
    * Get a winsys buffer from a user pointer. The resulting buffer can't
@@ -481,7 +483,15 @@
    * \param Size      Size in bytes for the new buffer.
    */
    struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws,
-                                        void *pointer, unsigned size);
+                                        void *pointer, uint64_t size);
+
+   /**
+    * Whether the buffer was created from a user pointer.
+    *
+    * \param buf       A winsys buffer object
+    * \return          whether \p buf was created via buffer_from_ptr
+    */
+   bool (*buffer_is_user_ptr)(struct pb_buffer *buf);
 
    /**
    * Get a winsys handle from a winsys buffer. The internal structure
@@ -490,24 +500,40 @@
    * \param buf       A winsys buffer object to get the handle from.
    * \param whandle   A winsys handle pointer.
    * \param stride    A stride of the buffer in bytes, for texturing.
-    * \return          TRUE on success.
+    * \return          true on success.
    */
-   boolean (*buffer_get_handle)(struct pb_buffer *buf,
-                                unsigned stride,
-                                struct winsys_handle *whandle);
+   bool (*buffer_get_handle)(struct pb_buffer *buf,
+                             unsigned stride, unsigned offset,
+                             unsigned slice_size,
+                             struct winsys_handle *whandle);
 
    /**
    * Return the virtual address of a buffer.
    *
+    * When virtual memory is not in use, this is the offset relative to the
+    * relocation base (non-zero for sub-allocated buffers).
+    *
    * \param buf       A winsys buffer object
    * \return          virtual address
    */
-   uint64_t (*buffer_get_virtual_address)(struct radeon_winsys_cs_handle *buf);
+   uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf);
+
+   /**
+    * Return the offset of this buffer relative to the relocation base.
+    * This is only non-zero for sub-allocated buffers.
+    *
+    * This is only supported in the radeon winsys, since amdgpu uses virtual
+    * addresses in submissions even for the video engines.
+    *
+    * \param buf       A winsys buffer object
+    * \return          the offset for relocations
+    */
+   unsigned (*buffer_get_reloc_offset)(struct pb_buffer *buf);
 
    /**
    * Query the initial placement of the buffer from the kernel driver.
    */
-   enum radeon_bo_domain (*buffer_get_initial_domain)(struct radeon_winsys_cs_handle *buf);
+   enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf);
 
    /**************************************************************************
    * Command submission.
@@ -539,15 +565,43 @@
    * \param ring_type The ring type (GFX, DMA, UVD)
    * \param flush     Flush callback function associated with the command stream.
    * \param user      User pointer that will be passed to the flush callback.
-    * \param trace_buf Trace buffer when tracing is enabled
    */
    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                          enum ring_type ring_type,
                                          void (*flush)(void *ctx, unsigned flags,
                                                        struct pipe_fence_handle **fence),
-                                         void *flush_ctx,
-                                         struct radeon_winsys_cs_handle *trace_buf);
+                                         void *flush_ctx);
+
+   /**
+    * Add a constant engine IB to a graphics CS. This makes the graphics CS
+    * from "cs_create" a group of two IBs that share a buffer list and are
+    * flushed together.
+    *
+    * The returned constant CS is only a stream for writing packets to the new
+    * IB. Calling other winsys functions with it is not allowed, not even
+    * "cs_destroy".
+    *
+    * In order to add buffers and check memory usage, use the graphics CS.
+    * In order to flush it, use the graphics CS, which will flush both IBs.
+    * Destroying the graphics CS will destroy both of them.
+    *
+    * \param cs        The graphics CS from "cs_create" that will hold the buffer
+    *                  list and will be used for flushing.
+    */
+   struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs);
+
+   /**
+    * Add a constant engine preamble IB to a graphics CS. This adds an extra IB
+    * in a similar manner to cs_add_const_ib. It should always be called after
+    * cs_add_const_ib.
+    *
+    * The returned IB is a constant engine IB that only gets flushed if the
+    * context changed.
+    *
+    * \param cs        The graphics CS from "cs_create" that will hold the buffer
+    *                  list and will be used for flushing.
+    */
+   struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct radeon_winsys_cs *cs);
 
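How the two constant engine hooks are meant to be wired up, as a hedged sketch (RING_GFX is assumed from the ring_type enum above; the flush callback comes from the driver, and a real driver would keep the returned CE pointers for packet emission):

    static struct radeon_winsys_cs *
    create_gfx_cs_with_ce(struct radeon_winsys *ws, struct radeon_winsys_ctx *ctx,
                          void (*flush)(void *, unsigned, struct pipe_fence_handle **),
                          void *flush_ctx)
    {
        struct radeon_winsys_cs *gfx = ws->cs_create(ctx, RING_GFX, flush, flush_ctx);

        /* Both CE IBs share the gfx buffer list, flush with it, and are
         * destroyed by cs_destroy(gfx). Order matters: preamble second. */
        ws->cs_add_const_ib(gfx);
        ws->cs_add_const_preamble_ib(gfx);
        return gfx;
    }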
    /**
    * Destroy a command stream.
    *
@@ -556,19 +610,18 @@
    void (*cs_destroy)(struct radeon_winsys_cs *cs);
 
    /**
-    * Add a new buffer relocation. Every relocation must first be added
-    * before it can be written.
+    * Add a buffer. Each buffer used by a CS must be added using this function.
    *
-    * \param cs  A command stream to add buffer for validation against.
-    * \param buf A winsys buffer to validate.
+    * \param cs      Command stream
+    * \param buf     Buffer
    * \param usage   Whether the buffer is used for read and/or write.
    * \param domain  Bitmask of the RADEON_DOMAIN_* flags.
    * \param priority  A higher number means a greater chance of being
    *                  placed in the requested domain. 15 is the maximum.
-    * \return Relocation index.
+    * \return Buffer index.
    */
-   unsigned (*cs_add_reloc)(struct radeon_winsys_cs *cs,
-                            struct radeon_winsys_cs_handle *buf,
+   unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs,
+                             struct pb_buffer *buf,
                             enum radeon_bo_usage usage,
                             enum radeon_bo_domain domain,
                             enum radeon_bo_priority priority);
 
@@ -576,32 +629,47 @@ struct radeon_winsys {
    /**
    * Return the index of an already-added buffer.
    *
+    * Not supported on amdgpu. Drivers with GPUVM should not care about
+    * buffer indices.
+    *
    * \param cs        Command stream
    * \param buf       Buffer
    * \return          The buffer index, or -1 if the buffer has not been added.
    */
-   int (*cs_get_reloc)(struct radeon_winsys_cs *cs,
-                       struct radeon_winsys_cs_handle *buf);
+   int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs,
+                           struct pb_buffer *buf);
 
    /**
-    * Return TRUE if there is enough memory in VRAM and GTT for the relocs
-    * added so far. If the validation fails, all the relocations which have
+    * Return true if there is enough memory in VRAM and GTT for the buffers
+    * added so far. If the validation fails, all buffers which have
    * been added since the last call of cs_validate will be removed and
-    * the CS will be flushed (provided there are still any relocations).
+    * the CS will be flushed (provided there are still any buffers).
    *
    * \param cs        A command stream to validate.
    */
-   boolean (*cs_validate)(struct radeon_winsys_cs *cs);
+   bool (*cs_validate)(struct radeon_winsys_cs *cs);
 
    /**
-    * Return TRUE if there is enough memory in VRAM and GTT for the relocs
-    * added so far.
+    * Check whether the given number of dwords is available in the IB.
+    * Optionally chain a new chunk of the IB if necessary and supported.
    *
-    * \param cs        A command stream to validate.
-    * \param vram      VRAM memory size pending to be use
-    * \param gtt       GTT memory size pending to be use
+    * \param cs        A command stream.
+    * \param dw        Number of CS dwords requested by the caller.
    */
-   boolean (*cs_memory_below_limit)(struct radeon_winsys_cs *cs, uint64_t vram, uint64_t gtt);
+   bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw);
+
+   /**
+    * Return the buffer list.
+    *
+    * This is the buffer list as passed to the kernel, i.e. it only contains
+    * the parent buffers of sub-allocated buffers.
+    *
+    * \param cs    Command stream
+    * \param list  Returned buffer list. Set to NULL to query the count only.
+    * \return      The buffer count.
+    */
+   unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs,
+                                  struct radeon_bo_list_item *list);
 
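cs_memory_below_limit's vram/gtt bookkeeping gives way to a dword-granularity space check that can transparently chain a new chunk. A hedged emission pattern using only hooks from this header (the dword values are placeholders, not a real packet; radeon_emit is the inline helper at the end of the file):

    static void emit_two_dwords(struct radeon_winsys *ws,
                                struct radeon_winsys_cs *cs)
    {
        /* Ask for room first; on winsyses that support chaining this may
         * start a new chunk instead of failing. */
        if (!ws->cs_check_space(cs, 2))
            return;

        radeon_emit(cs, 0); /* placeholder dwords */
        radeon_emit(cs, 0);
    }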
    /**
    * Flush a command stream.
    *
@@ -610,22 +678,29 @@
    * \param flags,    RADEON_FLUSH_ASYNC or 0.
    * \param fence     Pointer to a fence. If non-NULL, a fence is inserted
    *                  after the CS and is returned through this parameter.
-    * \param cs_trace_id A unique identifier of the cs, used for tracing.
+    * \return Negative POSIX error code or 0 for success.
+    *         Asynchronous submissions never return an error.
    */
-   void (*cs_flush)(struct radeon_winsys_cs *cs,
-                    unsigned flags,
-                    struct pipe_fence_handle **fence,
-                    uint32_t cs_trace_id);
+   int (*cs_flush)(struct radeon_winsys_cs *cs,
+                   unsigned flags,
+                   struct pipe_fence_handle **fence);
 
    /**
-    * Return TRUE if a buffer is referenced by a command stream.
+    * Create a fence before the CS is flushed.
+    * The user must flush manually to complete the initialization of the fence.
+    * The fence must not be used before the flush.
+    */
+   struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
+
+   /**
+    * Return true if a buffer is referenced by a command stream.
    *
    * \param cs        A command stream.
    * \param buf       A winsys buffer.
    */
-   boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
-                                      struct radeon_winsys_cs_handle *buf,
-                                      enum radeon_bo_usage usage);
+   bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
+                                   struct pb_buffer *buf,
+                                   enum radeon_bo_usage usage);
 
    /**
    * Request access to a feature for a command stream.
    *
@@ -634,9 +709,9 @@
    * \param cs        A command stream.
    * \param fid       Feature ID, one of RADEON_FID_*
    * \param enable    Whether to enable or disable the feature.
    */
-   boolean (*cs_request_feature)(struct radeon_winsys_cs *cs,
-                                 enum radeon_feature_id fid,
-                                 boolean enable);
+   bool (*cs_request_feature)(struct radeon_winsys_cs *cs,
+                              enum radeon_feature_id fid,
+                              bool enable);
    /**
    * Make sure all asynchronous flushes of the cs have completed
    *
@@ -681,21 +756,25 @@ struct radeon_winsys {
    uint64_t (*query_value)(struct radeon_winsys *ws,
                            enum radeon_value_id value);
 
-   void (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
+   bool (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
                           unsigned num_registers, uint32_t *out);
 };
 
+static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
+{
+   return cs && (cs->prev_dw + cs->current.cdw > num_dw);
+}
+
 static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
 {
-   cs->buf[cs->cdw++] = value;
+   cs->current.buf[cs->current.cdw++] = value;
 }
 
 static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
                                      const uint32_t *values, unsigned count)
 {
-   memcpy(cs->buf+cs->cdw, values, count * 4);
-   cs->cdw += count;
+   memcpy(cs->current.buf + cs->current.cdw, values, count * 4);
+   cs->current.cdw += count;
 }
 
 #endif
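cs_flush() reporting a POSIX error code and cs_get_next_fence() together enable a fence-before-flush pattern; a hedged sketch of the intended use, built only from hooks shown above (error and fence lifetime handling elided):

    static struct pipe_fence_handle *
    flush_and_get_fence(struct radeon_winsys *ws, struct radeon_winsys_cs *cs)
    {
        /* The fence is created now but only becomes valid once the
         * manual flush below has completed. */
        struct pipe_fence_handle *fence = ws->cs_get_next_fence(cs);

        if (ws->cs_flush(cs, 0, NULL) < 0)
            return NULL;
        return fence;
    }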