author     Jonathan Gray <jsg@cvs.openbsd.org>    2017-08-14 09:45:54 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2017-08-14 09:45:54 +0000
commit     4c58069f5013f0a621503525f7d5193bfe9976b3 (patch)
tree       bd8f8a08b889e9a8b99c9de01ae12459d527ea6d /lib/mesa/src/gallium/drivers/radeon
parent     5caa025e6b62d0456faad86c89f239a14d1eaadb (diff)
Import Mesa 17.1.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeon')
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/Makefile.am           |  11
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/Makefile.sources      |   4
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c  |  92
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c       | 206
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c    |  20
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c    | 410
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h    | 208
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_query.c          | 359
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_query.h          |  42
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_streamout.c      |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c       |  67
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/r600_texture.c        | 915
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c          | 166
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h          |  12
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vce.c          |  36
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c   |  16
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c       |  10
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c       |  42
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_video.c        |  59
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_video.h        |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h       | 266
21 files changed, 1930 insertions(+), 1015 deletions(-)
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
index a6fc145cb..2be6af4b1 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
@@ -13,19 +13,14 @@ noinst_LTLIBRARIES = libradeon.la
libradeon_la_SOURCES = \
$(C_SOURCES)
-if NEED_RADEON_LLVM
+if HAVE_GALLIUM_LLVM
AM_CFLAGS += \
- $(LLVM_CFLAGS) \
- $(LIBELF_CFLAGS)
-
-libradeon_la_SOURCES += \
- $(LLVM_C_FILES)
+ $(LLVM_CFLAGS)
libradeon_la_LIBADD = \
$(CLOCK_LIB) \
- $(LLVM_LIBS) \
- $(LIBELF_LIBS)
+ $(LLVM_LIBS)
libradeon_la_LDFLAGS = \
$(LLVM_LDFLAGS)
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
index 3e13dae3c..9dd4e1a88 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
@@ -22,7 +22,3 @@ C_SOURCES := \
radeon_video.c \
radeon_video.h \
radeon_winsys.h
-
-LLVM_C_FILES := \
- radeon_elf_util.c \
- radeon_elf_util.h
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
index bbab58946..b2289e26f 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -51,6 +51,8 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
bool busy = false;
+ assert(!(resource->flags & RADEON_FLAG_SPARSE));
+
if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
return ctx->ws->buffer_map(resource->buf, NULL, usage);
}
@@ -159,8 +161,8 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
}
/* Tiled textures are unmappable. Always put them in VRAM. */
- if (res->b.b.target != PIPE_BUFFER &&
- rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
+ if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
+ res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
res->domains = RADEON_DOMAIN_VRAM;
res->flags &= ~RADEON_FLAG_CPU_ACCESS;
res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
@@ -170,8 +172,12 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
/* If VRAM is just stolen system memory, allow both VRAM and
* GTT, whichever has free space. If a buffer is evicted from
* VRAM to GTT, it will stay there.
+ *
+ * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
+ * placements even with a low amount of stolen VRAM.
*/
if (!rscreen->info.has_dedicated_vram &&
+ (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
res->domains == RADEON_DOMAIN_VRAM)
res->domains = RADEON_DOMAIN_VRAM_GTT;
@@ -245,6 +251,10 @@ r600_invalidate_buffer(struct r600_common_context *rctx,
if (rbuffer->is_shared)
return false;
+ /* Sparse buffers can't be reallocated. */
+ if (rbuffer->flags & RADEON_FLAG_SPARSE)
+ return false;
+
/* In AMD_pinned_memory, the user pointer association only gets
* broken when the buffer is explicitly re-allocated.
*/
@@ -275,7 +285,6 @@ void r600_invalidate_resource(struct pipe_context *ctx,
static void *r600_buffer_get_transfer(struct pipe_context *ctx,
struct pipe_resource *resource,
- unsigned level,
unsigned usage,
const struct pipe_box *box,
struct pipe_transfer **ptransfer,
@@ -285,8 +294,9 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx,
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers);
- transfer->transfer.resource = resource;
- transfer->transfer.level = level;
+ transfer->transfer.resource = NULL;
+ pipe_resource_reference(&transfer->transfer.resource, resource);
+ transfer->transfer.level = 0;
transfer->transfer.usage = usage;
transfer->transfer.box = *box;
transfer->transfer.stride = 0;
@@ -317,11 +327,25 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
{
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
- struct r600_resource *rbuffer = r600_resource(resource);
- uint8_t *data;
+ struct r600_resource *rbuffer = r600_resource(resource);
+ uint8_t *data;
assert(box->x + box->width <= resource->width0);
+ /* From GL_AMD_pinned_memory issues:
+ *
+ * 4) Is glMapBuffer on a shared buffer guaranteed to return the
+ * same system address which was specified at creation time?
+ *
+ * RESOLVED: NO. The GL implementation might return a different
+ * virtual mapping of that memory, although the same physical
+ * page will be used.
+ *
+ * So don't ever use staging buffers.
+ */
+ if (rscreen->ws->buffer_is_user_ptr(rbuffer->buf))
+ usage |= PIPE_TRANSFER_PERSISTENT;
+
/* See if the buffer range being mapped has never been initialized,
* in which case it can be mapped unsynchronized. */
if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
@@ -351,26 +375,34 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
}
if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
- !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_PERSISTENT)) &&
!(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
- r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
+ ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_PERSISTENT)) &&
+ r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) ||
+ (rbuffer->flags & RADEON_FLAG_SPARSE))) {
assert(usage & PIPE_TRANSFER_WRITE);
- /* Check if mapping this buffer would cause waiting for the GPU. */
- if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+ /* Check if mapping this buffer would cause waiting for the GPU.
+ */
+ if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+ r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
!rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
/* Do a wait-free write-only transfer using a temporary buffer. */
unsigned offset;
struct r600_resource *staging = NULL;
- u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
- 256, &offset, (struct pipe_resource**)&staging, (void**)&data);
+ u_upload_alloc(ctx->stream_uploader, 0,
+ box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
+ rctx->screen->info.tcc_cache_line_size,
+ &offset, (struct pipe_resource**)&staging,
+ (void**)&data);
if (staging) {
data += box->x % R600_MAP_BUFFER_ALIGNMENT;
- return r600_buffer_get_transfer(ctx, resource, level, usage, box,
+ return r600_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, staging, offset);
+ } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
}
} else {
/* At this point, the buffer is always idle (we checked it above). */
@@ -378,11 +410,12 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
}
}
/* Use a staging buffer in cached GTT for reads. */
- else if ((usage & PIPE_TRANSFER_READ) &&
- !(usage & PIPE_TRANSFER_PERSISTENT) &&
- (rbuffer->domains & RADEON_DOMAIN_VRAM ||
- rbuffer->flags & RADEON_FLAG_GTT_WC) &&
- r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
+ else if (((usage & PIPE_TRANSFER_READ) &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ (rbuffer->domains & RADEON_DOMAIN_VRAM ||
+ rbuffer->flags & RADEON_FLAG_GTT_WC) &&
+ r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) ||
+ (rbuffer->flags & RADEON_FLAG_SPARSE)) {
struct r600_resource *staging;
staging = (struct r600_resource*) pipe_buffer_create(
@@ -402,8 +435,10 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
}
data += box->x % R600_MAP_BUFFER_ALIGNMENT;
- return r600_buffer_get_transfer(ctx, resource, level, usage, box,
+ return r600_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, staging, 0);
+ } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
}
}
@@ -413,7 +448,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
}
data += box->x;
- return r600_buffer_get_transfer(ctx, resource, level, usage, box,
+ return r600_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, NULL, 0);
}
@@ -469,6 +504,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
if (rtransfer->staging)
r600_resource_reference(&rtransfer->staging, NULL);
+ pipe_resource_reference(&transfer->resource, NULL);
slab_free(&rctx->pool_transfers, transfer);
}
@@ -535,6 +571,8 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
if (templ->bind & PIPE_BIND_SHARED)
rbuffer->flags |= RADEON_FLAG_HANDLE;
+ if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ rbuffer->flags |= RADEON_FLAG_SPARSE;
if (!r600_alloc_resource(rscreen, rbuffer)) {
FREE(rbuffer);
@@ -544,7 +582,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
}
struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen,
- unsigned bind,
+ unsigned flags,
unsigned usage,
unsigned size,
unsigned alignment)
@@ -554,9 +592,9 @@ struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen,
memset(&buffer, 0, sizeof buffer);
buffer.target = PIPE_BUFFER;
buffer.format = PIPE_FORMAT_R8_UNORM;
- buffer.bind = bind;
+ buffer.bind = 0;
buffer.usage = usage;
- buffer.flags = 0;
+ buffer.flags = flags;
buffer.width0 = size;
buffer.height0 = 1;
buffer.depth0 = 1;
@@ -574,6 +612,7 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ);
rbuffer->domains = RADEON_DOMAIN_GTT;
+ rbuffer->flags = 0;
util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0);
/* Convert a user pointer to a buffer. */
@@ -589,5 +628,8 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
else
rbuffer->gpu_address = 0;
+ rbuffer->vram_usage = 0;
+ rbuffer->gart_usage = templ->width0;
+
return &rbuffer->b.b;
}
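
[Editor's note: the r600_buffer_transfer_map hunks above layer three policies: user-pointer buffers are forced persistent per the GL_AMD_pinned_memory resolution quoted in the comment, sparse buffers must always take a staging path, and writes that would stall the GPU go through a temporary upload buffer. A minimal C sketch of the combined decision follows; the flag and parameter names are hypothetical stand-ins for the gallium/winsys ones, not the driver's API.]

#include <stdbool.h>

/* Hypothetical flags standing in for PIPE_TRANSFER_* / RADEON_FLAG_*. */
enum {
        XFER_READ           = 1 << 0,
        XFER_UNSYNCHRONIZED = 1 << 1,
        XFER_PERSISTENT     = 1 << 2,
        XFER_DISCARD_RANGE  = 1 << 3,
};

enum map_path { MAP_DIRECT, MAP_STAGING_WRITE, MAP_STAGING_READ };

/* Sketch of the decision implemented piecewise above. User-pointer
 * buffers are forced persistent first, so they can never be staged:
 * GL_AMD_pinned_memory only guarantees the same physical pages, not
 * the same virtual mapping, and a staging copy would break that. */
static enum map_path pick_map_path(unsigned usage, bool is_user_ptr,
                                   bool is_sparse, bool would_stall_gpu,
                                   bool in_vram_or_wc)
{
        if (is_user_ptr)
                usage |= XFER_PERSISTENT;

        if (!is_sparse && (usage & XFER_UNSYNCHRONIZED))
                return MAP_DIRECT;

        /* Wait-free write path via a temporary upload buffer. */
        if ((usage & XFER_DISCARD_RANGE) && (would_stall_gpu || is_sparse))
                return MAP_STAGING_WRITE;

        /* Reads from VRAM or write-combined GTT use a cached-GTT staging
         * copy; sparse buffers always need one of the staging paths. */
        if (((usage & XFER_READ) && !(usage & XFER_PERSISTENT) &&
             in_vram_or_wc) || is_sparse)
                return MAP_STAGING_READ;

        return MAP_DIRECT;
}
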
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c b/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c
index a653834b3..3b45545b7 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c
@@ -35,6 +35,7 @@
*/
#include "r600_pipe_common.h"
+#include "r600_query.h"
#include "os/os_time.h"
/* For good accuracy at 1000 fps or lower. This will be inaccurate for higher
@@ -42,17 +43,97 @@
#define SAMPLES_PER_SEC 10000
#define GRBM_STATUS 0x8010
+#define TA_BUSY(x) (((x) >> 14) & 0x1)
+#define GDS_BUSY(x) (((x) >> 15) & 0x1)
+#define VGT_BUSY(x) (((x) >> 17) & 0x1)
+#define IA_BUSY(x) (((x) >> 19) & 0x1)
+#define SX_BUSY(x) (((x) >> 20) & 0x1)
+#define WD_BUSY(x) (((x) >> 21) & 0x1)
+#define SPI_BUSY(x) (((x) >> 22) & 0x1)
+#define BCI_BUSY(x) (((x) >> 23) & 0x1)
+#define SC_BUSY(x) (((x) >> 24) & 0x1)
+#define PA_BUSY(x) (((x) >> 25) & 0x1)
+#define DB_BUSY(x) (((x) >> 26) & 0x1)
+#define CP_BUSY(x) (((x) >> 29) & 0x1)
+#define CB_BUSY(x) (((x) >> 30) & 0x1)
#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)
-static bool r600_is_gpu_busy(struct r600_common_screen *rscreen)
+#define SRBM_STATUS2 0x0e4c
+#define SDMA_BUSY(x) (((x) >> 5) & 0x1)
+
+#define CP_STAT 0x8680
+#define PFP_BUSY(x) (((x) >> 15) & 0x1)
+#define MEQ_BUSY(x) (((x) >> 16) & 0x1)
+#define ME_BUSY(x) (((x) >> 17) & 0x1)
+#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
+#define DMA_BUSY(x) (((x) >> 22) & 0x1)
+#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1)
+#define CE_BUSY(x) (((x) >> 26) & 0x1)
+
+#define IDENTITY(x) x
+
+#define UPDATE_COUNTER(field, mask) \
+ do { \
+ if (mask(value)) \
+ p_atomic_inc(&counters->named.field.busy); \
+ else \
+ p_atomic_inc(&counters->named.field.idle); \
+ } while (0)
+
+static void r600_update_mmio_counters(struct r600_common_screen *rscreen,
+ union r600_mmio_counters *counters)
{
uint32_t value = 0;
+ bool gui_busy, sdma_busy = false;
+ /* GRBM_STATUS */
rscreen->ws->read_registers(rscreen->ws, GRBM_STATUS, 1, &value);
- return GUI_ACTIVE(value);
+
+ UPDATE_COUNTER(ta, TA_BUSY);
+ UPDATE_COUNTER(gds, GDS_BUSY);
+ UPDATE_COUNTER(vgt, VGT_BUSY);
+ UPDATE_COUNTER(ia, IA_BUSY);
+ UPDATE_COUNTER(sx, SX_BUSY);
+ UPDATE_COUNTER(wd, WD_BUSY);
+ UPDATE_COUNTER(spi, SPI_BUSY);
+ UPDATE_COUNTER(bci, BCI_BUSY);
+ UPDATE_COUNTER(sc, SC_BUSY);
+ UPDATE_COUNTER(pa, PA_BUSY);
+ UPDATE_COUNTER(db, DB_BUSY);
+ UPDATE_COUNTER(cp, CP_BUSY);
+ UPDATE_COUNTER(cb, CB_BUSY);
+ UPDATE_COUNTER(gui, GUI_ACTIVE);
+ gui_busy = GUI_ACTIVE(value);
+
+ if (rscreen->chip_class >= CIK) {
+ /* SRBM_STATUS2 */
+ rscreen->ws->read_registers(rscreen->ws, SRBM_STATUS2, 1, &value);
+
+ UPDATE_COUNTER(sdma, SDMA_BUSY);
+ sdma_busy = SDMA_BUSY(value);
+ }
+
+ if (rscreen->chip_class >= VI) {
+ /* CP_STAT */
+ rscreen->ws->read_registers(rscreen->ws, CP_STAT, 1, &value);
+
+ UPDATE_COUNTER(pfp, PFP_BUSY);
+ UPDATE_COUNTER(meq, MEQ_BUSY);
+ UPDATE_COUNTER(me, ME_BUSY);
+ UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
+ UPDATE_COUNTER(dma, DMA_BUSY);
+ UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
+ UPDATE_COUNTER(ce, CE_BUSY);
+ }
+
+ value = gui_busy || sdma_busy;
+ UPDATE_COUNTER(gpu, IDENTITY);
}
-static PIPE_THREAD_ROUTINE(r600_gpu_load_thread, param)
+#undef UPDATE_COUNTER
+
+static int
+r600_gpu_load_thread(void *param)
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)param;
const int period_us = 1000000 / SAMPLES_PER_SEC;
@@ -77,10 +158,7 @@ static PIPE_THREAD_ROUTINE(r600_gpu_load_thread, param)
last_time = cur_time;
/* Update the counters. */
- if (r600_is_gpu_busy(rscreen))
- p_atomic_inc(&rscreen->gpu_load_counter_busy);
- else
- p_atomic_inc(&rscreen->gpu_load_counter_idle);
+ r600_update_mmio_counters(rscreen, &rscreen->mmio_counters);
}
p_atomic_dec(&rscreen->gpu_load_stop_thread);
return 0;
@@ -92,50 +170,118 @@ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen)
return;
p_atomic_inc(&rscreen->gpu_load_stop_thread);
- pipe_thread_wait(rscreen->gpu_load_thread);
+ thrd_join(rscreen->gpu_load_thread, NULL);
rscreen->gpu_load_thread = 0;
}
-static uint64_t r600_gpu_load_read_counter(struct r600_common_screen *rscreen)
+static uint64_t r600_read_mmio_counter(struct r600_common_screen *rscreen,
+ unsigned busy_index)
{
/* Start the thread if needed. */
if (!rscreen->gpu_load_thread) {
- pipe_mutex_lock(rscreen->gpu_load_mutex);
+ mtx_lock(&rscreen->gpu_load_mutex);
/* Check again inside the mutex. */
if (!rscreen->gpu_load_thread)
rscreen->gpu_load_thread =
- pipe_thread_create(r600_gpu_load_thread, rscreen);
- pipe_mutex_unlock(rscreen->gpu_load_mutex);
+ u_thread_create(r600_gpu_load_thread, rscreen);
+ mtx_unlock(&rscreen->gpu_load_mutex);
}
- /* The busy counter is in the lower 32 bits.
- * The idle counter is in the upper 32 bits. */
- return p_atomic_read(&rscreen->gpu_load_counter_busy) |
- ((uint64_t)p_atomic_read(&rscreen->gpu_load_counter_idle) << 32);
-}
+ unsigned busy = p_atomic_read(&rscreen->mmio_counters.array[busy_index]);
+ unsigned idle = p_atomic_read(&rscreen->mmio_counters.array[busy_index + 1]);
-/**
- * Just return the counters.
- */
-uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen)
-{
- return r600_gpu_load_read_counter(rscreen);
+ return busy | ((uint64_t)idle << 32);
}
-unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin)
+static unsigned r600_end_mmio_counter(struct r600_common_screen *rscreen,
+ uint64_t begin, unsigned busy_index)
{
- uint64_t end = r600_gpu_load_read_counter(rscreen);
+ uint64_t end = r600_read_mmio_counter(rscreen, busy_index);
unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
unsigned idle = (end >> 32) - (begin >> 32);
- /* Calculate the GPU load.
+ /* Calculate the % of time the busy counter was being incremented.
*
- * If no counters have been incremented, return the current load.
+ * If no counters were incremented, return the current counter status.
* It's for the case when the load is queried faster than
* the counters are updated.
*/
- if (idle || busy)
+ if (idle || busy) {
return busy*100 / (busy + idle);
- else
- return r600_is_gpu_busy(rscreen) ? 100 : 0;
+ } else {
+ union r600_mmio_counters counters;
+
+ memset(&counters, 0, sizeof(counters));
+ r600_update_mmio_counters(rscreen, &counters);
+ return counters.array[busy_index] ? 100 : 0;
+ }
+}
+
+#define BUSY_INDEX(rscreen, field) (&rscreen->mmio_counters.named.field.busy - \
+ rscreen->mmio_counters.array)
+
+static unsigned busy_index_from_type(struct r600_common_screen *rscreen,
+ unsigned type)
+{
+ switch (type) {
+ case R600_QUERY_GPU_LOAD:
+ return BUSY_INDEX(rscreen, gpu);
+ case R600_QUERY_GPU_SHADERS_BUSY:
+ return BUSY_INDEX(rscreen, spi);
+ case R600_QUERY_GPU_TA_BUSY:
+ return BUSY_INDEX(rscreen, ta);
+ case R600_QUERY_GPU_GDS_BUSY:
+ return BUSY_INDEX(rscreen, gds);
+ case R600_QUERY_GPU_VGT_BUSY:
+ return BUSY_INDEX(rscreen, vgt);
+ case R600_QUERY_GPU_IA_BUSY:
+ return BUSY_INDEX(rscreen, ia);
+ case R600_QUERY_GPU_SX_BUSY:
+ return BUSY_INDEX(rscreen, sx);
+ case R600_QUERY_GPU_WD_BUSY:
+ return BUSY_INDEX(rscreen, wd);
+ case R600_QUERY_GPU_BCI_BUSY:
+ return BUSY_INDEX(rscreen, bci);
+ case R600_QUERY_GPU_SC_BUSY:
+ return BUSY_INDEX(rscreen, sc);
+ case R600_QUERY_GPU_PA_BUSY:
+ return BUSY_INDEX(rscreen, pa);
+ case R600_QUERY_GPU_DB_BUSY:
+ return BUSY_INDEX(rscreen, db);
+ case R600_QUERY_GPU_CP_BUSY:
+ return BUSY_INDEX(rscreen, cp);
+ case R600_QUERY_GPU_CB_BUSY:
+ return BUSY_INDEX(rscreen, cb);
+ case R600_QUERY_GPU_SDMA_BUSY:
+ return BUSY_INDEX(rscreen, sdma);
+ case R600_QUERY_GPU_PFP_BUSY:
+ return BUSY_INDEX(rscreen, pfp);
+ case R600_QUERY_GPU_MEQ_BUSY:
+ return BUSY_INDEX(rscreen, meq);
+ case R600_QUERY_GPU_ME_BUSY:
+ return BUSY_INDEX(rscreen, me);
+ case R600_QUERY_GPU_SURF_SYNC_BUSY:
+ return BUSY_INDEX(rscreen, surf_sync);
+ case R600_QUERY_GPU_DMA_BUSY:
+ return BUSY_INDEX(rscreen, dma);
+ case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
+ return BUSY_INDEX(rscreen, scratch_ram);
+ case R600_QUERY_GPU_CE_BUSY:
+ return BUSY_INDEX(rscreen, ce);
+ default:
+ unreachable("invalid query type");
+ }
+}
+
+uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type)
+{
+ unsigned busy_index = busy_index_from_type(rscreen, type);
+ return r600_read_mmio_counter(rscreen, busy_index);
+}
+
+unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type,
+ uint64_t begin)
+{
+ unsigned busy_index = busy_index_from_type(rscreen, type);
+ return r600_end_mmio_counter(rscreen, begin, busy_index);
}
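
[Editor's note: each sampled register bit above feeds a busy/idle counter pair, and readers consume a pair as one packed 64-bit snapshot: busy in the low 32 bits, idle in the high 32 bits, load = busy*100 / (busy + idle). A self-contained sketch of that arithmetic follows; the function names are illustrative, not the driver's.]

#include <stdint.h>

/* Pack a (busy, idle) sample pair the way r600_read_mmio_counter
 * returns it: busy in the low 32 bits, idle in the high 32 bits. */
static uint64_t pack_counters(uint32_t busy, uint32_t idle)
{
        return (uint64_t)busy | ((uint64_t)idle << 32);
}

/* Load over a [begin, end] window, as in r600_end_mmio_counter. The
 * unsigned 32-bit subtraction yields the right delta even if a counter
 * wraps between the two snapshots. The driver additionally falls back
 * to a fresh register sample when both deltas are zero (query issued
 * faster than the 10000 Hz sampling thread); this sketch returns 0. */
static unsigned load_percent(uint64_t begin, uint64_t end)
{
        uint32_t busy = (uint32_t)end - (uint32_t)begin;
        uint32_t idle = (uint32_t)(end >> 32) - (uint32_t)(begin >> 32);

        return (busy || idle) ? busy * 100 / (busy + idle) : 0;
}
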
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
index 0c55fc2a2..48f609bcb 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -99,7 +99,7 @@ struct r600_query_pc {
struct r600_pc_group *groups;
};
-static void r600_pc_query_destroy(struct r600_common_context *ctx,
+static void r600_pc_query_destroy(struct r600_common_screen *rscreen,
struct r600_query *rquery)
{
struct r600_query_pc *query = (struct r600_query_pc *)rquery;
@@ -112,10 +112,10 @@ static void r600_pc_query_destroy(struct r600_common_context *ctx,
FREE(query->counters);
- r600_query_hw_destroy(ctx, rquery);
+ r600_query_hw_destroy(rscreen, rquery);
}
-static bool r600_pc_query_prepare_buffer(struct r600_common_context *ctx,
+static bool r600_pc_query_prepare_buffer(struct r600_common_screen *screen,
struct r600_query_hw *hwquery,
struct r600_resource *buffer)
{
@@ -196,7 +196,7 @@ static void r600_pc_query_clear_result(struct r600_query_hw *hwquery,
memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
}
-static void r600_pc_query_add_result(struct r600_common_context *ctx,
+static void r600_pc_query_add_result(struct r600_common_screen *rscreen,
struct r600_query_hw *hwquery,
void *buffer,
union pipe_query_result *result)
@@ -301,8 +301,8 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
unsigned num_queries,
unsigned *query_types)
{
- struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_common_screen *screen = rctx->screen;
+ struct r600_common_screen *screen =
+ (struct r600_common_screen *)ctx->screen;
struct r600_perfcounters *pc = screen->perfcounters;
struct r600_perfcounter_block *block;
struct r600_pc_group *group;
@@ -365,7 +365,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
unsigned instances = 1;
if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
- instances = rctx->screen->info.max_se;
+ instances = screen->info.max_se;
if (group->instance < 0)
instances *= block->num_instances;
@@ -417,13 +417,13 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
counter->qwords *= block->num_instances;
}
- if (!r600_query_hw_init(rctx, &query->b))
+ if (!r600_query_hw_init(screen, &query->b))
goto error;
return (struct pipe_query *)query;
error:
- r600_pc_query_destroy(rctx, &query->b.b);
+ r600_pc_query_destroy(screen, &query->b.b);
return NULL;
}
@@ -545,7 +545,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen,
info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
info->max_value.u64 = 0;
info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
- info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
info->group_id = base_gid + sub / block->num_selectors;
info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups)
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
index f62bbf2e0..2019ecdd5 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -43,6 +43,14 @@
#define HAVE_LLVM 0
#endif
+#if HAVE_LLVM
+#include <llvm-c/TargetMachine.h>
+#endif
+
+#ifndef MESA_LLVM_VERSION_PATCH
+#define MESA_LLVM_VERSION_PATCH 0
+#endif
+
struct r600_multi_fence {
struct pipe_reference reference;
struct pipe_fence_handle *gfx;
@@ -58,12 +66,12 @@ struct r600_multi_fence {
/*
* shader binary helpers.
*/
-void radeon_shader_binary_init(struct radeon_shader_binary *b)
+void radeon_shader_binary_init(struct ac_shader_binary *b)
{
memset(b, 0, sizeof(*b));
}
-void radeon_shader_binary_clean(struct radeon_shader_binary *b)
+void radeon_shader_binary_clean(struct ac_shader_binary *b)
{
if (!b)
return;
@@ -80,35 +88,63 @@ void radeon_shader_binary_clean(struct radeon_shader_binary *b)
* pipe_context
*/
-void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf,
- uint64_t va, uint32_t old_value, uint32_t new_value)
+/**
+ * Write an EOP event.
+ *
+ * \param event EVENT_TYPE_*
+ * \param event_flags Optional cache flush flags (TC)
+ * \param data_sel 1 = fence, 3 = timestamp
+ * \param buf Buffer
+ * \param va GPU address
+ * \param old_value Previous fence value (for a bug workaround)
+ * \param new_value Fence value to write for this event.
+ */
+void r600_gfx_write_event_eop(struct r600_common_context *ctx,
+ unsigned event, unsigned event_flags,
+ unsigned data_sel,
+ struct r600_resource *buf, uint64_t va,
+ uint32_t old_fence, uint32_t new_fence)
{
struct radeon_winsys_cs *cs = ctx->gfx.cs;
+ unsigned op = EVENT_TYPE(event) |
+ EVENT_INDEX(5) |
+ event_flags;
+
+ if (ctx->chip_class >= GFX9) {
+ radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, EOP_DATA_SEL(data_sel));
+ radeon_emit(cs, va); /* address lo */
+ radeon_emit(cs, va >> 32); /* address hi */
+ radeon_emit(cs, new_fence); /* immediate data lo */
+ radeon_emit(cs, 0); /* immediate data hi */
+ radeon_emit(cs, 0); /* unused */
+ } else {
+ if (ctx->chip_class == CIK ||
+ ctx->chip_class == VI) {
+ /* Two EOP events are required to make all engines go idle
+ * (and optional cache flushes executed) before the timestamp
+ * is written.
+ */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+ radeon_emit(cs, old_fence); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+ }
- if (ctx->chip_class == CIK ||
- ctx->chip_class == VI) {
- /* Two EOP events are required to make all engines go idle
- * (and optional cache flushes executed) before the timestamp
- * is written.
- */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
- EVENT_INDEX(5));
+ radeon_emit(cs, op);
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
- radeon_emit(cs, old_value); /* immediate data */
+ radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+ radeon_emit(cs, new_fence); /* immediate data */
radeon_emit(cs, 0); /* unused */
}
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
- EVENT_INDEX(5));
- radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
- radeon_emit(cs, new_value); /* immediate data */
- radeon_emit(cs, 0); /* unused */
-
- r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ if (buf)
+ r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
}
unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen)
@@ -172,7 +208,9 @@ void r600_draw_rectangle(struct blitter_context *blitter,
/* Upload vertices. The hw rectangle has only 3 vertices,
* I guess the 4th one is derived from the first 3.
* The vertex specification should match u_blitter's vertex element state. */
- u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
+ u_upload_alloc(rctx->b.stream_uploader, 0, sizeof(float) * 24,
+ rctx->screen->info.tcc_cache_line_size,
+ &offset, &buf, (void**)&vb);
if (!buf)
return;
@@ -203,10 +241,26 @@ void r600_draw_rectangle(struct blitter_context *blitter,
pipe_resource_reference(&buf, NULL);
}
+static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
+{
+ struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+ /* NOP waits for idle on Evergreen and later. */
+ if (rctx->chip_class >= CIK)
+ radeon_emit(cs, 0x00000000); /* NOP */
+ else if (rctx->chip_class >= EVERGREEN)
+ radeon_emit(cs, 0xf0000000); /* NOP */
+ else {
+ /* TODO: R600-R700 should use the FENCE packet.
+ * CS checker support is required. */
+ }
+}
+
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
struct r600_resource *dst, struct r600_resource *src)
{
- uint64_t vram = 0, gtt = 0;
+ uint64_t vram = ctx->dma.cs->used_vram;
+ uint64_t gtt = ctx->dma.cs->used_gart;
if (dst) {
vram += dst->vram_usage;
@@ -229,13 +283,35 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
/* Flush if there's not enough space, or if the memory usage per IB
* is too large.
+ *
+ * IBs using too little memory are limited by the IB submission overhead.
+ * IBs using too much memory are limited by the kernel/TTM overhead.
+ * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+ *
+ * This heuristic makes sure that DMA requests are executed
+ * very soon after the call is made and lowers memory usage.
+ * It improves texture upload performance by keeping the DMA
+ * engine busy while uploads are being submitted.
*/
+ num_dw++; /* for emit_wait_idle below */
if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
+ ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
!radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
}
+ /* Wait for idle if either buffer has been used in the IB before to
+ * prevent read-after-write hazards.
+ */
+ if ((dst &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
+ RADEON_USAGE_READWRITE)) ||
+ (src &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
+ RADEON_USAGE_WRITE)))
+ r600_dma_emit_wait_idle(ctx);
+
/* If GPUVM is not supported, the CS checker needs 2 entries
* in the buffer list per packet, which has to be done manually.
*/
@@ -249,44 +325,9 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
RADEON_USAGE_READ,
RADEON_PRIO_SDMA_BUFFER);
}
-}
-
-/* This is required to prevent read-after-write hazards. */
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
-{
- struct radeon_winsys_cs *cs = rctx->dma.cs;
-
- /* done at the end of DMA calls, so increment this. */
- rctx->num_dma_calls++;
-
- /* IBs using too little memory are limited by the IB submission overhead.
- * IBs using too much memory are limited by the kernel/TTM overhead.
- * Too long IBs create CPU-GPU pipeline bubbles and add latency.
- *
- * This heuristic makes sure that DMA requests are executed
- * very soon after the call is made and lowers memory usage.
- * It improves texture upload performance by keeping the DMA
- * engine busy while uploads are being submitted.
- */
- if (cs->used_vram + cs->used_gart > 64 * 1024 * 1024) {
- rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
- return;
- }
-
- r600_need_dma_space(rctx, 1, NULL, NULL);
-
- if (!radeon_emitted(cs, 0)) /* empty queue */
- return;
- /* NOP waits for idle on Evergreen and later. */
- if (rctx->chip_class >= CIK)
- radeon_emit(cs, 0x00000000); /* NOP */
- else if (rctx->chip_class >= EVERGREEN)
- radeon_emit(cs, 0xf0000000); /* NOP */
- else {
- /* TODO: R600-R700 should use the FENCE packet.
- * CS checker support is required. */
- }
+ /* this function is called before all DMA calls, so increment this. */
+ ctx->num_dma_calls++;
}
static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
@@ -325,24 +366,22 @@ static void r600_flush_from_st(struct pipe_context *ctx,
struct pipe_screen *screen = ctx->screen;
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct radeon_winsys *ws = rctx->ws;
- unsigned rflags = 0;
struct pipe_fence_handle *gfx_fence = NULL;
struct pipe_fence_handle *sdma_fence = NULL;
bool deferred_fence = false;
+ unsigned rflags = RADEON_FLUSH_ASYNC;
if (flags & PIPE_FLUSH_END_OF_FRAME)
rflags |= RADEON_FLUSH_END_OF_FRAME;
- if (flags & PIPE_FLUSH_DEFERRED)
- rflags |= RADEON_FLUSH_ASYNC;
- if (rctx->dma.cs) {
+ /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
+ if (rctx->dma.cs)
rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
- }
if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
if (fence)
ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
- if (!(rflags & RADEON_FLUSH_ASYNC))
+ if (!(flags & PIPE_FLUSH_DEFERRED))
ws->cs_sync_flush(rctx->gfx.cs);
} else {
/* Instead of flushing, create a deferred fence. Constraints:
@@ -378,6 +417,12 @@ static void r600_flush_from_st(struct pipe_context *ctx,
screen->fence_reference(screen, fence, NULL);
*fence = (struct pipe_fence_handle*)multi_fence;
}
+
+ if (!(flags & PIPE_FLUSH_DEFERRED)) {
+ if (rctx->dma.cs)
+ ws->cs_sync_flush(rctx->dma.cs);
+ ws->cs_sync_flush(rctx->gfx.cs);
+ }
}
static void r600_flush_dma_ring(void *ctx, unsigned flags,
@@ -516,6 +561,50 @@ bool r600_check_device_reset(struct r600_common_context *rctx)
return true;
}
+static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ uint64_t offset, uint64_t size,
+ unsigned value)
+{
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+ rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE);
+}
+
+static bool r600_resource_commit(struct pipe_context *pctx,
+ struct pipe_resource *resource,
+ unsigned level, struct pipe_box *box,
+ bool commit)
+{
+ struct r600_common_context *ctx = (struct r600_common_context *)pctx;
+ struct r600_resource *res = r600_resource(resource);
+
+ /*
+ * Since buffer commitment changes cannot be pipelined, we need to
+ * (a) flush any pending commands that refer to the buffer we're about
+ * to change, and
+ * (b) wait for threaded submit to finish, including those that were
+ * triggered by some other, earlier operation.
+ */
+ if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
+ res->buf, RADEON_USAGE_READWRITE)) {
+ ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ }
+ if (radeon_emitted(ctx->dma.cs, 0) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
+ res->buf, RADEON_USAGE_READWRITE)) {
+ ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ }
+
+ ctx->ws->cs_sync_flush(ctx->dma.cs);
+ ctx->ws->cs_sync_flush(ctx->gfx.cs);
+
+ assert(resource->target == PIPE_BUFFER);
+
+ return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+}
+
bool r600_common_context_init(struct r600_common_context *rctx,
struct r600_common_screen *rscreen,
unsigned context_flags)
@@ -527,14 +616,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
rctx->family = rscreen->family;
rctx->chip_class = rscreen->chip_class;
- if (rscreen->chip_class >= CIK)
- rctx->max_db = MAX2(8, rscreen->info.num_render_backends);
- else if (rscreen->chip_class >= EVERGREEN)
- rctx->max_db = 8;
- else
- rctx->max_db = 4;
-
rctx->b.invalidate_resource = r600_invalidate_resource;
+ rctx->b.resource_commit = r600_resource_commit;
rctx->b.transfer_map = u_transfer_map_vtbl;
rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
@@ -542,6 +625,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
rctx->b.memory_barrier = r600_memory_barrier;
rctx->b.flush = r600_flush_from_st;
rctx->b.set_debug_callback = r600_set_debug_callback;
+ rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback;
/* evergreen_compute.c has a special codepath for global buffers.
* Everything else can use the direct path.
@@ -569,14 +653,18 @@ bool r600_common_context_init(struct r600_common_context *rctx,
rctx->allocator_zeroed_memory =
u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
- 0, PIPE_USAGE_DEFAULT, true);
+ 0, PIPE_USAGE_DEFAULT, 0, true);
if (!rctx->allocator_zeroed_memory)
return false;
- rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
- PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
- if (!rctx->uploader)
+ rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
+ 0, PIPE_USAGE_STREAM);
+ if (!rctx->b.stream_uploader)
+ return false;
+
+ rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
+ 0, PIPE_USAGE_DEFAULT);
+ if (!rctx->b.const_uploader)
return false;
rctx->ctx = rctx->ws->ctx_create(rctx->ws);
@@ -619,9 +707,10 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
if (rctx->ctx)
rctx->ws->ctx_destroy(rctx->ctx);
- if (rctx->uploader) {
- u_upload_destroy(rctx->uploader);
- }
+ if (rctx->b.stream_uploader)
+ u_upload_destroy(rctx->b.stream_uploader);
+ if (rctx->b.const_uploader)
+ u_upload_destroy(rctx->b.const_uploader);
slab_destroy_child(&rctx->pool_transfers);
@@ -656,8 +745,12 @@ static const struct debug_named_value common_debug_options[] = {
{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
{ "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" },
{ "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" },
+ { "nooptvariant", DBG_NO_OPT_VARIANT, "Disable compiling optimized shader variants." },
{ "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." },
+ { "testvmfaultcp", DBG_TEST_VMFAULT_CP, "Invoke a CP VM fault test and exit." },
+ { "testvmfaultsdma", DBG_TEST_VMFAULT_SDMA, "Invoke a SDMA VM fault test and exit." },
+ { "testvmfaultshader", DBG_TEST_VMFAULT_SHADER, "Invoke a shader VM fault test and exit." },
/* features */
{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -673,7 +766,7 @@ static const struct debug_named_value common_debug_options[] = {
{ "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
{ "nodcc", DBG_NO_DCC, "Disable DCC." },
{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
- { "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
+ { "norbplus", DBG_NO_RB_PLUS, "Disable RB+." },
{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" },
{ "noce", DBG_NO_CE, "Disable the constant engine"},
@@ -737,11 +830,54 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
case CHIP_FIJI: return "AMD FIJI";
case CHIP_POLARIS10: return "AMD POLARIS10";
case CHIP_POLARIS11: return "AMD POLARIS11";
+ case CHIP_POLARIS12: return "AMD POLARIS12";
case CHIP_STONEY: return "AMD STONEY";
+ case CHIP_VEGA10: return "AMD VEGA10";
+ case CHIP_RAVEN: return "AMD RAVEN";
default: return "AMD unknown";
}
}
+static void r600_disk_cache_create(struct r600_common_screen *rscreen)
+{
+ /* Don't use the cache if shader dumping is enabled. */
+ if (rscreen->debug_flags &
+ (DBG_FS | DBG_VS | DBG_TCS | DBG_TES | DBG_GS | DBG_PS | DBG_CS))
+ return;
+
+ uint32_t mesa_timestamp;
+ if (disk_cache_get_function_timestamp(r600_disk_cache_create,
+ &mesa_timestamp)) {
+ char *timestamp_str;
+ int res = -1;
+ if (rscreen->chip_class < SI) {
+ res = asprintf(&timestamp_str, "%u",mesa_timestamp);
+ }
+#if HAVE_LLVM
+ else {
+ uint32_t llvm_timestamp;
+ if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo,
+ &llvm_timestamp)) {
+ res = asprintf(&timestamp_str, "%u_%u",
+ mesa_timestamp, llvm_timestamp);
+ }
+ }
+#endif
+ if (res != -1) {
+ rscreen->disk_shader_cache =
+ disk_cache_create(r600_get_chip_name(rscreen),
+ timestamp_str);
+ free(timestamp_str);
+ }
+ }
+}
+
+static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+ struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
+ return rscreen->disk_shader_cache;
+}
+
static const char* r600_get_name(struct pipe_screen* pscreen)
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
@@ -861,24 +997,45 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
case CHIP_TONGA: return "tonga";
case CHIP_ICELAND: return "iceland";
case CHIP_CARRIZO: return "carrizo";
-#if HAVE_LLVM <= 0x0307
- case CHIP_FIJI: return "tonga";
- case CHIP_STONEY: return "carrizo";
-#else
- case CHIP_FIJI: return "fiji";
- case CHIP_STONEY: return "stoney";
-#endif
-#if HAVE_LLVM <= 0x0308
- case CHIP_POLARIS10: return "tonga";
- case CHIP_POLARIS11: return "tonga";
-#else
- case CHIP_POLARIS10: return "polaris10";
- case CHIP_POLARIS11: return "polaris11";
-#endif
- default: return "";
+ case CHIP_FIJI:
+ return "fiji";
+ case CHIP_STONEY:
+ return "stoney";
+ case CHIP_POLARIS10:
+ return HAVE_LLVM >= 0x0309 ? "polaris10" : "carrizo";
+ case CHIP_POLARIS11:
+ case CHIP_POLARIS12: /* same as polaris11 */
+ return HAVE_LLVM >= 0x0309 ? "polaris11" : "carrizo";
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ return "gfx900";
+ default:
+ return "";
}
}
+static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
+ enum pipe_shader_ir ir_type)
+{
+ if (ir_type != PIPE_SHADER_IR_TGSI)
+ return 256;
+
+ if (HAVE_LLVM < 0x309)
+ return 256;
+
+ /* Only 16 waves per thread-group on gfx9. */
+ if (screen->chip_class >= GFX9)
+ return 1024;
+
+ /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
+ * round number.
+ */
+ if (screen->chip_class >= SI)
+ return 2048;
+
+ return 256;
+}
+
static int r600_get_compute_param(struct pipe_screen *screen,
enum pipe_shader_ir ir_type,
enum pipe_compute_cap param,
@@ -933,27 +1090,17 @@ static int r600_get_compute_param(struct pipe_screen *screen,
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
if (ret) {
uint64_t *block_size = ret;
- if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
- ir_type == PIPE_SHADER_IR_TGSI) {
- block_size[0] = 2048;
- block_size[1] = 2048;
- block_size[2] = 2048;
- } else {
- block_size[0] = 256;
- block_size[1] = 256;
- block_size[2] = 256;
- }
+ unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type);
+ block_size[0] = threads_per_block;
+ block_size[1] = threads_per_block;
+ block_size[2] = threads_per_block;
}
return 3 * sizeof(uint64_t);
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
if (ret) {
uint64_t *max_threads_per_block = ret;
- if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
- ir_type == PIPE_SHADER_IR_TGSI)
- *max_threads_per_block = 2048;
- else
- *max_threads_per_block = 256;
+ *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type);
}
return sizeof(uint64_t);
case PIPE_COMPUTE_CAP_ADDRESS_BITS:
@@ -1186,11 +1333,11 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
snprintf(kernel_version, sizeof(kernel_version),
" / %s", uname_data.release);
-#if HAVE_LLVM
- snprintf(llvm_string, sizeof(llvm_string),
- ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
- HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-#endif
+ if (HAVE_LLVM > 0) {
+ snprintf(llvm_string, sizeof(llvm_string),
+ ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+ HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
+ }
snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
"%s (DRM %i.%i.%i%s%s)",
@@ -1201,6 +1348,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
rscreen->b.get_name = r600_get_name;
rscreen->b.get_vendor = r600_get_vendor;
rscreen->b.get_device_vendor = r600_get_device_vendor;
+ rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache;
rscreen->b.get_compute_param = r600_get_compute_param;
rscreen->b.get_paramf = r600_get_paramf;
rscreen->b.get_timestamp = r600_get_timestamp;
@@ -1225,6 +1373,10 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
rscreen->family = rscreen->info.family;
rscreen->chip_class = rscreen->info.chip_class;
rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
+ rscreen->has_rbplus = false;
+ rscreen->rbplus_allowed = false;
+
+ r600_disk_cache_create(rscreen);
slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);
@@ -1236,8 +1388,8 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
}
util_format_s3tc_init();
- pipe_mutex_init(rscreen->aux_context_lock);
- pipe_mutex_init(rscreen->gpu_load_mutex);
+ (void) mtx_init(&rscreen->aux_context_lock, mtx_plain);
+ (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain);
if (rscreen->debug_flags & DBG_INFO) {
printf("pci_id = 0x%x\n", rscreen->info.pci_id);
@@ -1246,6 +1398,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
printf("chip_class = %i\n", rscreen->info.chip_class);
printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
+ printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024));
printf("max_alloc_size = %i MB\n",
(int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
@@ -1274,6 +1427,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
+ printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask);
}
return true;
}
@@ -1283,12 +1437,13 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
r600_perfcounters_destroy(rscreen);
r600_gpu_load_kill_thread(rscreen);
- pipe_mutex_destroy(rscreen->gpu_load_mutex);
- pipe_mutex_destroy(rscreen->aux_context_lock);
+ mtx_destroy(&rscreen->gpu_load_mutex);
+ mtx_destroy(&rscreen->aux_context_lock);
rscreen->aux_context->destroy(rscreen->aux_context);
slab_destroy_parent(&rscreen->pool_transfers);
+ disk_cache_destroy(rscreen->disk_shader_cache);
rscreen->ws->destroy(rscreen->ws);
FREE(rscreen);
}
@@ -1321,13 +1476,12 @@ bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned proce
}
void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned value,
- enum r600_coherency coher)
+ uint64_t offset, uint64_t size, unsigned value)
{
struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
- pipe_mutex_lock(rscreen->aux_context_lock);
- rctx->clear_buffer(&rctx->b, dst, offset, size, value, coher);
+ mtx_lock(&rscreen->aux_context_lock);
+ rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value);
rscreen->aux_context->flush(rscreen->aux_context, NULL, 0);
- pipe_mutex_unlock(rscreen->aux_context_lock);
+ mtx_unlock(&rscreen->aux_context_lock);
}
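
[Editor's note: the hunks above fold the old r600_dma_emit_wait_idle behavior into r600_need_dma_space: the SDMA IB is flushed once its memory usage passes 64 MiB, keeping submissions short so the DMA engine stays busy while further uploads are recorded, and a wait-idle NOP is emitted when a buffer was already touched earlier in the same IB. A hedged sketch of the two predicates follows, with invented names; the driver performs these checks inline.]

#include <stdbool.h>
#include <stdint.h>

#define DMA_IB_MEMORY_LIMIT (64ull * 1024 * 1024)

/* Flush early: IBs using too little memory are limited by submission
 * overhead, IBs using too much by kernel/TTM overhead, and overly long
 * IBs add CPU-GPU pipeline bubbles and latency. */
static bool dma_ib_should_flush(bool has_cs_space, bool below_winsys_limit,
                                uint64_t ib_used_vram, uint64_t ib_used_gart)
{
        return !has_cs_space ||
               !below_winsys_limit ||
               ib_used_vram + ib_used_gart > DMA_IB_MEMORY_LIMIT;
}

/* Read-after-write hazard: wait for idle when the destination was
 * referenced at all, or the source was written, earlier in this IB. */
static bool dma_needs_wait_idle(bool dst_referenced_rw,
                                bool src_referenced_write)
{
        return dst_referenced_rw || src_referenced_write;
}
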
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
index 86772c0af..bd542e500 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -34,8 +34,11 @@
#include <stdio.h>
+#include "amd/common/ac_binary.h"
+
#include "radeon/radeon_winsys.h"
+#include "util/disk_cache.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_range.h"
@@ -49,6 +52,7 @@
#define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
#define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
#define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
+#define R600_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
#define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0)
/* Pipeline & streamout query controls. */
@@ -79,6 +83,7 @@
#define DBG_NO_ASM (1 << 14)
#define DBG_PREOPT_IR (1 << 15)
#define DBG_CHECK_IR (1 << 16)
+#define DBG_NO_OPT_VARIANT (1 << 17)
/* gaps */
#define DBG_TEST_DMA (1 << 20)
/* Bits 21-31 are reserved for the r600g driver. */
@@ -102,6 +107,9 @@
#define DBG_NO_CE (1llu << 48)
#define DBG_UNSAFE_MATH (1llu << 49)
#define DBG_NO_DCC_FB (1llu << 50)
+#define DBG_TEST_VMFAULT_CP (1llu << 51)
+#define DBG_TEST_VMFAULT_SDMA (1llu << 52)
+#define DBG_TEST_VMFAULT_SHADER (1llu << 53)
#define R600_MAP_BUFFER_ALIGNMENT 64
#define R600_MAX_VIEWPORTS 16
@@ -125,45 +133,8 @@ struct r600_perfcounters;
struct tgsi_shader_info;
struct r600_qbo_state;
-struct radeon_shader_reloc {
- char name[32];
- uint64_t offset;
-};
-
-struct radeon_shader_binary {
- /** Shader code */
- unsigned char *code;
- unsigned code_size;
-
- /** Config/Context register state that accompanies this shader.
- * This is a stream of dword pairs. First dword contains the
- * register address, the second dword contains the value.*/
- unsigned char *config;
- unsigned config_size;
-
- /** The number of bytes of config information for each global symbol.
- */
- unsigned config_size_per_symbol;
-
- /** Constant data accessed by the shader. This will be uploaded
- * into a constant buffer. */
- unsigned char *rodata;
- unsigned rodata_size;
-
- /** List of symbol offsets for the shader */
- uint64_t *global_symbol_offsets;
- unsigned global_symbol_count;
-
- struct radeon_shader_reloc *relocs;
- unsigned reloc_count;
-
- /** Disassembled shader in a string. */
- char *disasm_string;
- char *llvm_ir_string;
-};
-
-void radeon_shader_binary_init(struct radeon_shader_binary *b);
-void radeon_shader_binary_clean(struct radeon_shader_binary *b);
+void radeon_shader_binary_init(struct ac_shader_binary *b);
+void radeon_shader_binary_clean(struct ac_shader_binary *b);
/* Only 32-bit buffer allocations are supported, gallium doesn't support more
* at the moment.
@@ -232,20 +203,8 @@ struct r600_cmask_info {
uint64_t offset;
uint64_t size;
unsigned alignment;
- unsigned pitch;
- unsigned height;
- unsigned xalign;
- unsigned yalign;
unsigned slice_tile_max;
- unsigned base_address_reg;
-};
-
-struct r600_htile_info {
- unsigned pitch;
- unsigned height;
- unsigned xalign;
- unsigned yalign;
- unsigned alignment;
+ uint64_t base_address_reg;
};
struct r600_texture {
@@ -273,7 +232,6 @@ struct r600_texture {
unsigned last_msaa_resolve_target_micro_mode;
/* Depth buffer compression and fast clear. */
- struct r600_htile_info htile;
struct r600_resource *htile_buffer;
bool tc_compatible_htile;
bool depth_cleared; /* if it was cleared at least once */
@@ -319,7 +277,10 @@ struct r600_texture {
struct r600_surface {
struct pipe_surface base;
- const struct radeon_surf_level *level_info;
+
+ /* These can vary with block-compressed textures. */
+ unsigned width0;
+ unsigned height0;
bool color_initialized;
bool depth_initialized;
@@ -329,6 +290,7 @@ struct r600_surface {
bool export_16bpc;
bool color_is_int8;
bool color_is_int10;
+ bool dcc_incompatible;
/* Color registers. */
unsigned cb_color_info;
@@ -339,6 +301,7 @@ struct r600_surface {
unsigned cb_color_pitch; /* EG and later */
unsigned cb_color_slice; /* EG and later */
unsigned cb_color_attrib; /* EG and later */
+ unsigned cb_color_attrib2; /* GFX9 and later */
unsigned cb_dcc_control; /* VI and later */
unsigned cb_color_fmask; /* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */
unsigned cb_color_fmask_slice; /* EG and later */
@@ -352,20 +315,63 @@ struct r600_surface {
struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */
/* DB registers. */
+ uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE (EG and later) or DB_DEPTH_BASE (r600) */
+ uint64_t db_stencil_base; /* EG and later */
+ uint64_t db_htile_data_base;
unsigned db_depth_info; /* R600 only, then SI and later */
unsigned db_z_info; /* EG and later */
- unsigned db_depth_base; /* DB_Z_READ/WRITE_BASE (EG and later) or DB_DEPTH_BASE (r600) */
+ unsigned db_z_info2; /* GFX9+ */
unsigned db_depth_view;
unsigned db_depth_size;
unsigned db_depth_slice; /* EG and later */
- unsigned db_stencil_base; /* EG and later */
unsigned db_stencil_info; /* EG and later */
+ unsigned db_stencil_info2; /* GFX9+ */
unsigned db_prefetch_limit; /* R600 only */
unsigned db_htile_surface;
- unsigned db_htile_data_base;
unsigned db_preload_control; /* EG and later */
};
+struct r600_mmio_counter {
+ unsigned busy;
+ unsigned idle;
+};
+
+union r600_mmio_counters {
+ struct {
+ /* For global GPU load including SDMA. */
+ struct r600_mmio_counter gpu;
+
+ /* GRBM_STATUS */
+ struct r600_mmio_counter spi;
+ struct r600_mmio_counter gui;
+ struct r600_mmio_counter ta;
+ struct r600_mmio_counter gds;
+ struct r600_mmio_counter vgt;
+ struct r600_mmio_counter ia;
+ struct r600_mmio_counter sx;
+ struct r600_mmio_counter wd;
+ struct r600_mmio_counter bci;
+ struct r600_mmio_counter sc;
+ struct r600_mmio_counter pa;
+ struct r600_mmio_counter db;
+ struct r600_mmio_counter cp;
+ struct r600_mmio_counter cb;
+
+ /* SRBM_STATUS2 */
+ struct r600_mmio_counter sdma;
+
+ /* CP_STAT */
+ struct r600_mmio_counter pfp;
+ struct r600_mmio_counter meq;
+ struct r600_mmio_counter me;
+ struct r600_mmio_counter surf_sync;
+ struct r600_mmio_counter dma;
+ struct r600_mmio_counter scratch_ram;
+ struct r600_mmio_counter ce;
+ } named;
+ unsigned array[0];
+};
+
struct r600_common_screen {
struct pipe_screen b;
struct radeon_winsys *ws;
@@ -375,6 +381,10 @@ struct r600_common_screen {
uint64_t debug_flags;
bool has_cp_dma;
bool has_streamout;
+ bool has_rbplus; /* if RB+ registers exist */
+ bool rbplus_allowed; /* if RB+ is allowed */
+
+ struct disk_cache *disk_shader_cache;
struct slab_parent_pool pool_transfers;
@@ -384,7 +394,7 @@ struct r600_common_screen {
/* Auxiliary context. Mainly used to initialize resources.
* It must be locked prior to using and flushed before unlocking. */
struct pipe_context *aux_context;
- pipe_mutex aux_context_lock;
+ mtx_t aux_context_lock;
/* This must be in the screen, because UE4 uses one context for
* compilation and another one for rendering.
@@ -394,12 +404,12 @@ struct r600_common_screen {
* are loading shaders on demand. This is a monotonic counter.
*/
unsigned num_shaders_created;
+ unsigned num_shader_cache_hits;
/* GPU load thread. */
- pipe_mutex gpu_load_mutex;
- pipe_thread gpu_load_thread;
- unsigned gpu_load_counter_busy;
- unsigned gpu_load_counter_idle;
+ mtx_t gpu_load_mutex;
+ thrd_t gpu_load_thread;
+ union r600_mmio_counters mmio_counters;
volatile unsigned gpu_load_stop_thread; /* bool */
char renderer_string[100];
@@ -407,12 +417,14 @@ struct r600_common_screen {
/* Performance counters. */
struct r600_perfcounters *perfcounters;
- /* If pipe_screen wants to re-emit the framebuffer state of all
- * contexts, it should atomically increment this. Each context will
- * compare this with its own last known value of the counter before
- * drawing and re-emit the framebuffer state accordingly.
+ /* If pipe_screen wants to recompute and re-emit the framebuffer,
+ * sampler, and image states of all contexts, it should atomically
+ * increment this.
+ *
+ * Each context will compare this with its own last known value of
+ * the counter before drawing and re-emit the states accordingly.
*/
- unsigned dirty_fb_counter;
+ unsigned dirty_tex_counter;
/* Atomically increment this counter when an existing texture's
* metadata is enabled or disabled in a way that requires changing
@@ -420,12 +432,6 @@ struct r600_common_screen {
*/
unsigned compressed_colortex_counter;
- /* Atomically increment this counter when an existing texture's
- * backing buffer or tile mode parameters have changed that requires
- * recomputation of shader descriptors.
- */
- unsigned dirty_tex_descriptor_counter;
-
struct {
/* Context flags to set so that all writes from earlier jobs
* in the CP are seen by L2 clients.
@@ -480,7 +486,7 @@ struct r600_streamout {
/* External state which comes from the vertex shader,
* it must be set explicitly when binding a shader. */
- unsigned *stride_in_dw;
+ uint16_t *stride_in_dw;
unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
@@ -544,11 +550,9 @@ struct r600_common_context {
unsigned num_gfx_cs_flushes;
unsigned initial_gfx_cs_size;
unsigned gpu_reset_counter;
- unsigned last_dirty_fb_counter;
+ unsigned last_dirty_tex_counter;
unsigned last_compressed_colortex_counter;
- unsigned last_dirty_tex_descriptor_counter;
- struct u_upload_mgr *uploader;
struct u_suballocator *allocator_zeroed_memory;
struct slab_child_pool pool_transfers;
@@ -574,18 +578,19 @@ struct r600_common_context {
int num_perfect_occlusion_queries;
struct list_head active_queries;
unsigned num_cs_dw_queries_suspend;
- /* Additional hardware info. */
- unsigned backend_mask;
- unsigned max_db; /* for OQ */
/* Misc stats. */
unsigned num_draw_calls;
unsigned num_spill_draw_calls;
unsigned num_compute_calls;
unsigned num_spill_compute_calls;
unsigned num_dma_calls;
+ unsigned num_cp_dma_calls;
unsigned num_vs_flushes;
unsigned num_ps_flushes;
unsigned num_cs_flushes;
+ unsigned num_fb_cache_flushes;
+ unsigned num_L2_invalidates;
+ unsigned num_L2_writebacks;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
@@ -638,6 +643,9 @@ struct r600_common_context {
unsigned src_level,
const struct pipe_box *src_box);
+ void (*dma_clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, unsigned value);
+
void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, unsigned value,
enum r600_coherency coher);
@@ -693,7 +701,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
const struct pipe_resource *templ,
unsigned alignment);
struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen,
- unsigned bind,
+ unsigned flags,
unsigned usage,
unsigned size,
unsigned alignment);
@@ -706,8 +714,11 @@ r600_invalidate_resource(struct pipe_context *ctx,
struct pipe_resource *resource);
/* r600_common_pipe.c */
-void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf,
- uint64_t va, uint32_t old_value, uint32_t new_value);
+void r600_gfx_write_event_eop(struct r600_common_context *ctx,
+ unsigned event, unsigned event_flags,
+ unsigned data_sel,
+ struct r600_resource *buf, uint64_t va,
+ uint32_t old_fence, uint32_t new_fence);
unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen);
void r600_gfx_wait_fence(struct r600_common_context *ctx,
uint64_t va, uint32_t ref, uint32_t mask);
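r600_gfx_write_event_eop() generalizes the old fence-only helper to arbitrary end-of-pipe events. Judging by the two call sites in r600_query.c below, data_sel picks the payload: 1 writes the 32-bit new_fence value, 3 writes a 64-bit GPU timestamp. The query code uses both forms:

    /* completion fence: write 0x80000000 once prior work drains */
    r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
                             query->buffer.buf, fence_va, 0, 0x80000000);

    /* raw timestamp: no buffer reference, no fence values */
    r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 3,
                             NULL, va, 0, 0);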
@@ -729,14 +740,12 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
bool r600_extra_shader_checks(struct r600_common_screen *rscreen,
unsigned processor);
void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned value,
- enum r600_coherency coher);
+ uint64_t offset, uint64_t size, unsigned value);
struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
const struct pipe_resource *templ);
const char *r600_get_llvm_processor_name(enum radeon_family family);
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
struct r600_resource *dst, struct r600_resource *src);
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx);
void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
struct radeon_saved_cs *saved);
void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
@@ -744,8 +753,9 @@ bool r600_check_device_reset(struct r600_common_context *rctx);
/* r600_gpu_load.c */
void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
-uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
-unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
+uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type);
+unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type,
+ uint64_t begin);
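These replace the GPU-load-specific begin/end pair; the type argument selects which mmio_counters entry to sample, so one mechanism serves every GPU-*-busy query. Usage as in r600_query_sw_begin()/_end() below (reading the result as a percentage is inferred from the old GPU-load query, not stated here):

    query->begin_result = r600_begin_counter(rctx->screen, query->b.type);
    /* ... the workload being measured runs ... */
    query->end_result = r600_end_counter(rctx->screen, query->b.type,
                                         query->begin_result);
    /* end_result: how busy (percent) the block was over the interval */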
/* r600_perfcounters.c */
void r600_perfcounters_destroy(struct r600_common_screen *rscreen);
@@ -755,7 +765,7 @@ void r600_init_screen_query_functions(struct r600_common_screen *rscreen);
void r600_query_init(struct r600_common_context *rctx);
void r600_suspend_queries(struct r600_common_context *ctx);
void r600_resume_queries(struct r600_common_context *ctx);
-void r600_query_init_backend_mask(struct r600_common_context *ctx);
+void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen);
/* r600_streamout.c */
void r600_streamout_buffers_dirty(struct r600_common_context *rctx);
@@ -789,18 +799,23 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
struct pipe_resource *texture,
struct r600_texture **staging);
-void r600_print_texture_info(struct r600_texture *rtex, FILE *f);
+void r600_print_texture_info(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex, FILE *f);
struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
const struct pipe_resource *templ);
bool vi_dcc_formats_compatible(enum pipe_format format1,
enum pipe_format format2);
-void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx,
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format);
+void vi_disable_dcc_if_incompatible_format(struct r600_common_context *rctx,
struct pipe_resource *tex,
unsigned level,
enum pipe_format view_format);
struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
struct pipe_resource *texture,
const struct pipe_surface *templ,
+ unsigned width0, unsigned height0,
unsigned width, unsigned height);
unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
void vi_separate_dcc_start_query(struct pipe_context *ctx,
@@ -951,6 +966,12 @@ r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler)
(!stencil_sampler && tex->can_sample_z);
}
+static inline bool
+vi_dcc_enabled(struct r600_texture *tex, unsigned level)
+{
+ return tex->dcc_offset && level < tex->surface.num_dcc_levels;
+}
+
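vi_dcc_enabled() centralizes a check that callers previously open-coded against dcc_offset and the per-level dcc_enabled flag. The DMA-blit early-out in r600_texture.c (see the r600_prepare_for_dma_blit hunk below) now reads:

    if (vi_dcc_enabled(rsrc, src_level) ||
        vi_dcc_enabled(rdst, dst_level))
        return false; /* DCC surfaces must take the 3D path */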
#define COMPUTE_DBG(rscreen, fmt, args...) \
do { \
if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \
@@ -966,4 +987,9 @@ r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler)
(((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
(((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
+static inline int S_FIXED(float value, unsigned frac_bits)
+{
+ return value * (1 << frac_bits);
+}
+
#endif
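S_FIXED() converts a float to fixed point with frac_bits fractional bits, i.e. it truncates value * 2^frac_bits toward zero. A few sanity checks:

    S_FIXED(1.0f, 8)  == 256  /* 1.0 in x.8 fixed point */
    S_FIXED(0.5f, 4)  ==   8  /* 0.5 * 16 */
    S_FIXED(2.75f, 2) ==  11  /* 2.75 * 4, truncated */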
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.c b/lib/mesa/src/gallium/drivers/radeon/r600_query.c
index 4b6767dd3..7764871aa 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_query.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.c
@@ -26,7 +26,7 @@
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
-
+#include "os/os_time.h"
#include "tgsi/tgsi_text.h"
struct r600_hw_query_params {
@@ -43,17 +43,20 @@ struct r600_query_sw {
uint64_t begin_result;
uint64_t end_result;
+
+ uint64_t begin_time;
+ uint64_t end_time;
+
/* Fence for GPU_FINISHED. */
struct pipe_fence_handle *fence;
};
-static void r600_query_sw_destroy(struct r600_common_context *rctx,
+static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
struct r600_query *rquery)
{
- struct pipe_screen *screen = rctx->b.screen;
struct r600_query_sw *query = (struct r600_query_sw *)rquery;
- screen->fence_reference(screen, &query->fence, NULL);
+ rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
FREE(query);
}
@@ -65,14 +68,18 @@ static enum radeon_value_id winsys_id_from_type(unsigned type)
case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
- case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES;
+ case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
+ case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
+ case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+ case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+ case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
default: unreachable("query type does not correspond to winsys id");
}
}
@@ -81,6 +88,7 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
struct r600_query *rquery)
{
struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+ enum radeon_value_id ws_id;
switch(query->b.type) {
case PIPE_QUERY_TIMESTAMP_DISJOINT:
@@ -101,6 +109,9 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
case R600_QUERY_DMA_CALLS:
query->begin_result = rctx->num_dma_calls;
break;
+ case R600_QUERY_CP_DMA_CALLS:
+ query->begin_result = rctx->num_cp_dma_calls;
+ break;
case R600_QUERY_NUM_VS_FLUSHES:
query->begin_result = rctx->num_vs_flushes;
break;
@@ -110,28 +121,67 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
case R600_QUERY_NUM_CS_FLUSHES:
query->begin_result = rctx->num_cs_flushes;
break;
+ case R600_QUERY_NUM_FB_CACHE_FLUSHES:
+ query->begin_result = rctx->num_fb_cache_flushes;
+ break;
+ case R600_QUERY_NUM_L2_INVALIDATES:
+ query->begin_result = rctx->num_L2_invalidates;
+ break;
+ case R600_QUERY_NUM_L2_WRITEBACKS:
+ query->begin_result = rctx->num_L2_writebacks;
+ break;
case R600_QUERY_REQUESTED_VRAM:
case R600_QUERY_REQUESTED_GTT:
case R600_QUERY_MAPPED_VRAM:
case R600_QUERY_MAPPED_GTT:
case R600_QUERY_VRAM_USAGE:
+ case R600_QUERY_VRAM_VIS_USAGE:
case R600_QUERY_GTT_USAGE:
case R600_QUERY_GPU_TEMPERATURE:
case R600_QUERY_CURRENT_GPU_SCLK:
case R600_QUERY_CURRENT_GPU_MCLK:
case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ case R600_QUERY_NUM_MAPPED_BUFFERS:
query->begin_result = 0;
break;
case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CTX_FLUSHES:
+ case R600_QUERY_NUM_GFX_IBS:
+ case R600_QUERY_NUM_SDMA_IBS:
case R600_QUERY_NUM_BYTES_MOVED:
case R600_QUERY_NUM_EVICTIONS: {
enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
break;
}
+ case R600_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
+ query->begin_time = os_time_get_nano();
+ break;
case R600_QUERY_GPU_LOAD:
- query->begin_result = r600_gpu_load_begin(rctx->screen);
+ case R600_QUERY_GPU_SHADERS_BUSY:
+ case R600_QUERY_GPU_TA_BUSY:
+ case R600_QUERY_GPU_GDS_BUSY:
+ case R600_QUERY_GPU_VGT_BUSY:
+ case R600_QUERY_GPU_IA_BUSY:
+ case R600_QUERY_GPU_SX_BUSY:
+ case R600_QUERY_GPU_WD_BUSY:
+ case R600_QUERY_GPU_BCI_BUSY:
+ case R600_QUERY_GPU_SC_BUSY:
+ case R600_QUERY_GPU_PA_BUSY:
+ case R600_QUERY_GPU_DB_BUSY:
+ case R600_QUERY_GPU_CP_BUSY:
+ case R600_QUERY_GPU_CB_BUSY:
+ case R600_QUERY_GPU_SDMA_BUSY:
+ case R600_QUERY_GPU_PFP_BUSY:
+ case R600_QUERY_GPU_MEQ_BUSY:
+ case R600_QUERY_GPU_ME_BUSY:
+ case R600_QUERY_GPU_SURF_SYNC_BUSY:
+ case R600_QUERY_GPU_DMA_BUSY:
+ case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
+ case R600_QUERY_GPU_CE_BUSY:
+ query->begin_result = r600_begin_counter(rctx->screen,
+ query->b.type);
break;
case R600_QUERY_NUM_COMPILATIONS:
query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
@@ -139,6 +189,10 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx,
case R600_QUERY_NUM_SHADERS_CREATED:
query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
break;
+ case R600_QUERY_NUM_SHADER_CACHE_HITS:
+ query->begin_result =
+ p_atomic_read(&rctx->screen->num_shader_cache_hits);
+ break;
case R600_QUERY_GPIN_ASIC_ID:
case R600_QUERY_GPIN_NUM_SIMD:
case R600_QUERY_GPIN_NUM_RB:
@@ -156,6 +210,7 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
struct r600_query *rquery)
{
struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+ enum radeon_value_id ws_id;
switch(query->b.type) {
case PIPE_QUERY_TIMESTAMP_DISJOINT:
@@ -178,6 +233,9 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
case R600_QUERY_DMA_CALLS:
query->end_result = rctx->num_dma_calls;
break;
+ case R600_QUERY_CP_DMA_CALLS:
+ query->end_result = rctx->num_cp_dma_calls;
+ break;
case R600_QUERY_NUM_VS_FLUSHES:
query->end_result = rctx->num_vs_flushes;
break;
@@ -187,26 +245,65 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
case R600_QUERY_NUM_CS_FLUSHES:
query->end_result = rctx->num_cs_flushes;
break;
+ case R600_QUERY_NUM_FB_CACHE_FLUSHES:
+ query->end_result = rctx->num_fb_cache_flushes;
+ break;
+ case R600_QUERY_NUM_L2_INVALIDATES:
+ query->end_result = rctx->num_L2_invalidates;
+ break;
+ case R600_QUERY_NUM_L2_WRITEBACKS:
+ query->end_result = rctx->num_L2_writebacks;
+ break;
case R600_QUERY_REQUESTED_VRAM:
case R600_QUERY_REQUESTED_GTT:
case R600_QUERY_MAPPED_VRAM:
case R600_QUERY_MAPPED_GTT:
case R600_QUERY_VRAM_USAGE:
+ case R600_QUERY_VRAM_VIS_USAGE:
case R600_QUERY_GTT_USAGE:
case R600_QUERY_GPU_TEMPERATURE:
case R600_QUERY_CURRENT_GPU_SCLK:
case R600_QUERY_CURRENT_GPU_MCLK:
case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CTX_FLUSHES:
+ case R600_QUERY_NUM_MAPPED_BUFFERS:
+ case R600_QUERY_NUM_GFX_IBS:
+ case R600_QUERY_NUM_SDMA_IBS:
case R600_QUERY_NUM_BYTES_MOVED:
case R600_QUERY_NUM_EVICTIONS: {
enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
break;
}
+ case R600_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+ query->end_time = os_time_get_nano();
+ break;
case R600_QUERY_GPU_LOAD:
- query->end_result = r600_gpu_load_end(rctx->screen,
- query->begin_result);
+ case R600_QUERY_GPU_SHADERS_BUSY:
+ case R600_QUERY_GPU_TA_BUSY:
+ case R600_QUERY_GPU_GDS_BUSY:
+ case R600_QUERY_GPU_VGT_BUSY:
+ case R600_QUERY_GPU_IA_BUSY:
+ case R600_QUERY_GPU_SX_BUSY:
+ case R600_QUERY_GPU_WD_BUSY:
+ case R600_QUERY_GPU_BCI_BUSY:
+ case R600_QUERY_GPU_SC_BUSY:
+ case R600_QUERY_GPU_PA_BUSY:
+ case R600_QUERY_GPU_DB_BUSY:
+ case R600_QUERY_GPU_CP_BUSY:
+ case R600_QUERY_GPU_CB_BUSY:
+ case R600_QUERY_GPU_SDMA_BUSY:
+ case R600_QUERY_GPU_PFP_BUSY:
+ case R600_QUERY_GPU_MEQ_BUSY:
+ case R600_QUERY_GPU_ME_BUSY:
+ case R600_QUERY_GPU_SURF_SYNC_BUSY:
+ case R600_QUERY_GPU_DMA_BUSY:
+ case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
+ case R600_QUERY_GPU_CE_BUSY:
+ query->end_result = r600_end_counter(rctx->screen,
+ query->b.type,
+ query->begin_result);
query->begin_result = 0;
break;
case R600_QUERY_NUM_COMPILATIONS:
@@ -218,6 +315,10 @@ static bool r600_query_sw_end(struct r600_common_context *rctx,
case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
query->end_result = rctx->last_tex_ps_draw_ratio;
break;
+ case R600_QUERY_NUM_SHADER_CACHE_HITS:
+ query->end_result =
+ p_atomic_read(&rctx->screen->num_shader_cache_hits);
+ break;
case R600_QUERY_GPIN_ASIC_ID:
case R600_QUERY_GPIN_NUM_SIMD:
case R600_QUERY_GPIN_NUM_RB:
@@ -252,6 +353,10 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx,
return result->b;
}
+ case R600_QUERY_CS_THREAD_BUSY:
+ result->u64 = (query->end_result - query->begin_result) * 100 /
+ (query->end_time - query->begin_time);
+ return true;
case R600_QUERY_GPIN_ASIC_ID:
result->u32 = 0;
return true;
@@ -294,8 +399,7 @@ static struct r600_query_ops sw_query_ops = {
.get_result_resource = NULL
};
-static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
- unsigned query_type)
+static struct pipe_query *r600_query_sw_create(unsigned query_type)
{
struct r600_query_sw *query;
@@ -309,7 +413,7 @@ static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
return (struct pipe_query *)query;
}
-void r600_query_hw_destroy(struct r600_common_context *rctx,
+void r600_query_hw_destroy(struct r600_common_screen *rscreen,
struct r600_query *rquery)
{
struct r600_query_hw *query = (struct r600_query_hw *)rquery;
@@ -327,23 +431,23 @@ void r600_query_hw_destroy(struct r600_common_context *rctx,
FREE(rquery);
}
-static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
+static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
struct r600_query_hw *query)
{
unsigned buf_size = MAX2(query->result_size,
- ctx->screen->info.min_alloc_size);
+ rscreen->info.min_alloc_size);
/* Queries are normally read by the CPU after
* being written by the GPU, hence staging is probably a good
* usage pattern.
*/
struct r600_resource *buf = (struct r600_resource*)
- pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM,
+ pipe_buffer_create(&rscreen->b, 0,
PIPE_USAGE_STAGING, buf_size);
if (!buf)
return NULL;
- if (!query->ops->prepare_buffer(ctx, query, buf)) {
+ if (!query->ops->prepare_buffer(rscreen, query, buf)) {
r600_resource_reference(&buf, NULL);
return NULL;
}
@@ -351,14 +455,14 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
return buf;
}
-static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
+static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
struct r600_query_hw *query,
struct r600_resource *buffer)
{
/* Callers ensure that the buffer is currently unused by the GPU. */
- uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
- PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED);
+ uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
if (!results)
return false;
@@ -366,19 +470,21 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ unsigned max_rbs = rscreen->info.num_render_backends;
+ unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
unsigned num_results;
unsigned i, j;
/* Set top bits for unused backends. */
num_results = buffer->b.b.width0 / query->result_size;
for (j = 0; j < num_results; j++) {
- for (i = 0; i < ctx->max_db; i++) {
- if (!(ctx->backend_mask & (1<<i))) {
+ for (i = 0; i < max_rbs; i++) {
+ if (!(enabled_rb_mask & (1<<i))) {
results[(i * 4)+1] = 0x80000000;
results[(i * 4)+3] = 0x80000000;
}
}
- results += 4 * ctx->max_db;
+ results += 4 * max_rbs;
}
}
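Presetting bit 31 in the begin and end slots of every disabled render backend matters because r600_query_read_result() (its tail is visible further down) only counts a begin/end pair whose high bits are both set; an RB the GPU never writes must therefore look complete and contribute zero. A sketch of that readiness test, assuming each RB's slot holds {begin_lo, begin_hi, end_lo, end_hi}:

    uint64_t start = (uint64_t)map[0] | (uint64_t)map[1] << 32;
    uint64_t end   = (uint64_t)map[2] | (uint64_t)map[3] << 32;

    if ((start & 0x8000000000000000ull) && (end & 0x8000000000000000ull))
        result += end - start; /* preset slots yield exactly 0 */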
@@ -409,7 +515,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
struct r600_query_hw *query,
struct r600_resource *buffer,
uint64_t va);
-static void r600_query_hw_add_result(struct r600_common_context *ctx,
+static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
struct r600_query_hw *, void *buffer,
union pipe_query_result *result);
static void r600_query_hw_clear_result(struct r600_query_hw *,
@@ -423,17 +529,17 @@ static struct r600_query_hw_ops query_hw_default_hw_ops = {
.add_result = r600_query_hw_add_result,
};
-bool r600_query_hw_init(struct r600_common_context *rctx,
+bool r600_query_hw_init(struct r600_common_screen *rscreen,
struct r600_query_hw *query)
{
- query->buffer.buf = r600_new_query_buffer(rctx, query);
+ query->buffer.buf = r600_new_query_buffer(rscreen, query);
if (!query->buffer.buf)
return false;
return true;
}
-static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
+static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
unsigned query_type,
unsigned index)
{
@@ -448,19 +554,19 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
switch (query_type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- query->result_size = 16 * rctx->max_db;
+ query->result_size = 16 * rscreen->info.num_render_backends;
query->result_size += 16; /* for the fence + alignment */
query->num_cs_dw_begin = 6;
- query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
+ query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
break;
case PIPE_QUERY_TIME_ELAPSED:
query->result_size = 24;
query->num_cs_dw_begin = 8;
- query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+ query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
break;
case PIPE_QUERY_TIMESTAMP:
query->result_size = 16;
- query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+ query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
query->flags = R600_QUERY_HW_FLAG_NO_START;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -475,10 +581,10 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
/* 11 values on EG, 8 on R600. */
- query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
+ query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
query->result_size += 8; /* for the fence + alignment */
query->num_cs_dw_begin = 6;
- query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
+ query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
break;
default:
assert(0);
@@ -486,7 +592,7 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
return NULL;
}
- if (!r600_query_hw_init(rctx, query)) {
+ if (!r600_query_hw_init(rscreen, query)) {
FREE(query);
return NULL;
}
@@ -545,7 +651,7 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -554,21 +660,17 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
break;
case PIPE_QUERY_TIME_ELAPSED:
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
- radeon_emit(cs, va);
- radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
+ r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
+ 0, 3, NULL, va, 0, 0);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
break;
default:
assert(0);
@@ -597,7 +699,7 @@ static void r600_query_hw_emit_start(struct r600_common_context *ctx,
*qbuf = query->buffer;
query->buffer.results_end = 0;
query->buffer.previous = qbuf;
- query->buffer.buf = r600_new_query_buffer(ctx, query);
+ query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
if (!query->buffer.buf)
return;
}
@@ -625,9 +727,9 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
- fence_va = va + ctx->max_db * 16 - 8;
+ fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -637,19 +739,14 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
break;
case PIPE_QUERY_TIME_ELAPSED:
va += 8;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
- radeon_emit(cs, va);
- radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
-
+ r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
+ 0, 3, NULL, va, 0, 0);
fence_va = va + 8;
break;
case PIPE_QUERY_PIPELINE_STATISTICS: {
@@ -659,7 +756,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
- radeon_emit(cs, (va >> 32) & 0xFFFF);
+ radeon_emit(cs, va >> 32);
fence_va = va + sample_size;
break;
@@ -671,7 +768,8 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
RADEON_PRIO_QUERY);
if (fence_va)
- r600_gfx_write_fence(ctx, query->buffer.buf, fence_va, 0, 0x80000000);
+ r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
+ query->buffer.buf, fence_va, 0, 0x80000000);
}
static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
@@ -743,12 +841,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
/* emit predicate packets for all data blocks */
for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
unsigned results_base = 0;
- uint64_t va = qbuf->buf->gpu_address;
+ uint64_t va_base = qbuf->buf->gpu_address;
while (results_base < qbuf->results_end) {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
- radeon_emit(cs, va + results_base);
- radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
+ uint64_t va = va_base + results_base;
+
+ if (ctx->chip_class >= GFX9) {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, op | ((va >> 32) & 0xFF));
+ }
r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
RADEON_PRIO_QUERY);
results_base += query->result_size;
@@ -761,14 +868,15 @@ static void r600_emit_query_predication(struct r600_common_context *ctx,
static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
- struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_common_screen *rscreen =
+ (struct r600_common_screen *)ctx->screen;
if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
query_type == PIPE_QUERY_GPU_FINISHED ||
query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
- return r600_query_sw_create(ctx, query_type);
+ return r600_query_sw_create(query_type);
- return r600_query_hw_create(rctx, query_type, index);
+ return r600_query_hw_create(rscreen, query_type, index);
}
static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
@@ -776,7 +884,7 @@ static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *quer
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
- rquery->ops->destroy(rctx, rquery);
+ rquery->ops->destroy(rctx->screen, rquery);
}
static boolean r600_begin_query(struct pipe_context *ctx,
@@ -808,9 +916,9 @@ void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
!rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
r600_resource_reference(&query->buffer.buf, NULL);
- query->buffer.buf = r600_new_query_buffer(rctx, query);
+ query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
} else {
- if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf))
+ if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
r600_resource_reference(&query->buffer.buf, NULL);
}
}
@@ -867,6 +975,8 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
struct r600_query_hw *rquery, int index,
struct r600_hw_query_params *params)
{
+ unsigned max_rbs = rctx->screen->info.num_render_backends;
+
params->pair_stride = 0;
params->pair_count = 1;
@@ -875,9 +985,9 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
case PIPE_QUERY_OCCLUSION_PREDICATE:
params->start_offset = 0;
params->end_offset = 8;
- params->fence_offset = rctx->max_db * 16;
+ params->fence_offset = max_rbs * 16;
params->pair_stride = 16;
- params->pair_count = rctx->max_db;
+ params->pair_count = max_rbs;
break;
case PIPE_QUERY_TIME_ELAPSED:
params->start_offset = 0;
@@ -936,14 +1046,16 @@ static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned
return 0;
}
-static void r600_query_hw_add_result(struct r600_common_context *ctx,
+static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
struct r600_query_hw *query,
void *buffer,
union pipe_query_result *result)
{
+ unsigned max_rbs = rscreen->info.num_render_backends;
+
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER: {
- for (unsigned i = 0; i < ctx->max_db; ++i) {
+ for (unsigned i = 0; i < max_rbs; ++i) {
unsigned results_base = i * 16;
result->u64 +=
r600_query_read_result(buffer + results_base, 0, 2, true);
@@ -951,7 +1063,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx,
break;
}
case PIPE_QUERY_OCCLUSION_PREDICATE: {
- for (unsigned i = 0; i < ctx->max_db; ++i) {
+ for (unsigned i = 0; i < max_rbs; ++i) {
unsigned results_base = i * 16;
result->b = result->b ||
r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
@@ -989,7 +1101,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx,
r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
- if (ctx->chip_class >= EVERGREEN) {
+ if (rscreen->chip_class >= EVERGREEN) {
result->pipeline_statistics.ps_invocations +=
r600_query_read_result(buffer, 0, 22, false);
result->pipeline_statistics.c_primitives +=
@@ -1087,6 +1199,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
struct r600_query *rquery,
bool wait, union pipe_query_result *result)
{
+ struct r600_common_screen *rscreen = rctx->screen;
struct r600_query_hw *query = (struct r600_query_hw *)rquery;
struct r600_query_buffer *qbuf;
@@ -1103,7 +1216,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
return false;
while (results_base != qbuf->results_end) {
- query->ops->add_result(rctx, query, map + results_base,
+ query->ops->add_result(rscreen, query, map + results_base,
result);
results_base += query->result_size;
}
@@ -1112,7 +1225,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
/* Convert the time to expected units. */
if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
rquery->type == PIPE_QUERY_TIMESTAMP) {
- result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
+ result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
}
return true;
}
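The conversion assumes clock_crystal_freq is reported in kHz: ns = ticks * 10^6 / freq_kHz, since ticks / (freq_kHz * 1000) is seconds and scaling to nanoseconds contributes the factor of 10^9. For a 27000 kHz reference clock:

    /* 27,000,000 ticks at 27 MHz -> exactly one second */
    uint64_t ns = (1000000 * 27000000ull) / 27000; /* == 1000000000 */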
@@ -1170,6 +1283,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx)
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+ "IMM[4] UINT32 {0, 0, 0, 0}\n"
"AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
"UIF TEMP[5]\n"
@@ -1269,7 +1383,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx)
/* Convert to boolean */
"AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
"UIF TEMP[4]\n"
- "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
+ "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
"MOV TEMP[0].y, IMM[0].xxxx\n"
"ENDIF\n"
@@ -1479,7 +1593,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
static void r600_render_condition(struct pipe_context *ctx,
struct pipe_query *query,
boolean condition,
- uint mode)
+ enum pipe_render_cond_flag mode)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query_hw *rquery = (struct r600_query_hw *)query;
@@ -1550,19 +1664,23 @@ void r600_resume_queries(struct r600_common_context *ctx)
}
}
-/* Get backends mask */
-void r600_query_init_backend_mask(struct r600_common_context *ctx)
+/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
+void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
{
+ struct r600_common_context *ctx =
+ (struct r600_common_context*)rscreen->aux_context;
struct radeon_winsys_cs *cs = ctx->gfx.cs;
struct r600_resource *buffer;
uint32_t *results;
- unsigned num_backends = ctx->screen->info.num_render_backends;
unsigned i, mask = 0;
+ unsigned max_rbs = ctx->screen->info.num_render_backends;
+
+ assert(rscreen->chip_class <= CAYMAN);
/* if backend_map query is supported by the kernel */
- if (ctx->screen->info.r600_gb_backend_map_valid) {
- unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
- unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
+ if (rscreen->info.r600_gb_backend_map_valid) {
+ unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
+ unsigned backend_map = rscreen->info.r600_gb_backend_map;
unsigned item_width, item_mask;
if (ctx->chip_class >= EVERGREEN) {
@@ -1579,7 +1697,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
backend_map >>= item_width;
}
if (mask != 0) {
- ctx->backend_mask = mask;
+ rscreen->info.enabled_rb_mask = mask;
return;
}
}
@@ -1588,15 +1706,15 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
/* create buffer for event data */
buffer = (struct r600_resource*)
- pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM,
- PIPE_USAGE_STAGING, ctx->max_db*16);
+ pipe_buffer_create(ctx->b.screen, 0,
+ PIPE_USAGE_STAGING, max_rbs * 16);
if (!buffer)
- goto err;
+ return;
/* initialize buffer with zeroes */
results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
if (results) {
- memset(results, 0, ctx->max_db * 4 * 4);
+ memset(results, 0, max_rbs * 4 * 4);
/* emit EVENT_WRITE for ZPASS_DONE */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1610,7 +1728,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
/* analyze results */
results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
if (results) {
- for(i = 0; i < ctx->max_db; i++) {
+ for(i = 0; i < max_rbs; i++) {
/* at least the highest bit will be set if the backend is used */
if (results[i*4 + 1])
mask |= (1<<i);
@@ -1620,15 +1738,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
r600_resource_reference(&buffer, NULL);
- if (mask != 0) {
- ctx->backend_mask = mask;
- return;
- }
-
-err:
- /* fallback to old method - set num_backends lower bits to 1 */
- ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
- return;
+ if (mask)
+ rscreen->info.enabled_rb_mask = mask;
}
#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
@@ -1649,23 +1760,32 @@ err:
static struct pipe_driver_query_info r600_driver_query_list[] = {
X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+ X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
+ X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
+ X("num-fb-cache-flushes", NUM_FB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
+ X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
+ X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
- X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE),
+ X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
+ X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
+ X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+ X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
@@ -1680,12 +1800,34 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
- /* The following queries must be at the end of the list because their
- * availability is adjusted dynamically based on the DRM version. */
- X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+
+ /* The following queries must be at the end of the list because their
+ * availability is adjusted dynamically based on the DRM version. */
+ X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+ X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
+ X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
+ X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
+ X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
+ X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
+ X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
+ X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
+ X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
+ X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
+ X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
+ X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
+ X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
+ X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
+ X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
+ X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
+ X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
+ X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
+ X("GPU-dma-busy", GPU_DMA_BUSY, UINT64, AVERAGE),
+ X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
+ X("GPU-ce-busy", GPU_CE_BUSY, UINT64, AVERAGE),
};
#undef X
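Each X()/XG()/XFULL() entry above expands to one pipe_driver_query_info initializer; the XFULL definition is cut off at the hunk boundary, so the expansion below is an assumption based on gallium's pipe_driver_query_info fields, with X() presumably passing ~0 as the group id (the group_id != ~(unsigned)0 test in r600_get_driver_query_info suggests as much):

    #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
        { \
            .name = name_, \
            .query_type = R600_QUERY_##query_type_, \
            .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
            .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
            .group_id = group_id_ \
        }

so X("draw-calls", DRAW_CALLS, UINT64, AVERAGE) becomes a query named "draw-calls" backed by R600_QUERY_DRAW_CALLS.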
@@ -1696,10 +1838,14 @@ static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
{
if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
return ARRAY_SIZE(r600_driver_query_list);
- else if (rscreen->info.drm_major == 3)
- return ARRAY_SIZE(r600_driver_query_list) - 3;
+ else if (rscreen->info.drm_major == 3) {
+ if (rscreen->chip_class >= VI)
+ return ARRAY_SIZE(r600_driver_query_list);
+ else
+ return ARRAY_SIZE(r600_driver_query_list) - 7;
+ }
else
- return ARRAY_SIZE(r600_driver_query_list) - 4;
+ return ARRAY_SIZE(r600_driver_query_list) - 25;
}
static int r600_get_driver_query_info(struct pipe_screen *screen,
@@ -1735,6 +1881,9 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
case R600_QUERY_GPU_TEMPERATURE:
info->max_value.u64 = 125;
break;
+ case R600_QUERY_VRAM_VIS_USAGE:
+ info->max_value.u64 = rscreen->info.vram_vis_size;
+ break;
}
if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.h b/lib/mesa/src/gallium/drivers/radeon/r600_query.h
index 14c433d91..b9ab44ca3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_query.h
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.h
@@ -48,26 +48,56 @@ enum {
R600_QUERY_COMPUTE_CALLS,
R600_QUERY_SPILL_COMPUTE_CALLS,
R600_QUERY_DMA_CALLS,
+ R600_QUERY_CP_DMA_CALLS,
R600_QUERY_NUM_VS_FLUSHES,
R600_QUERY_NUM_PS_FLUSHES,
R600_QUERY_NUM_CS_FLUSHES,
+ R600_QUERY_NUM_FB_CACHE_FLUSHES,
+ R600_QUERY_NUM_L2_INVALIDATES,
+ R600_QUERY_NUM_L2_WRITEBACKS,
+ R600_QUERY_CS_THREAD_BUSY,
R600_QUERY_REQUESTED_VRAM,
R600_QUERY_REQUESTED_GTT,
R600_QUERY_MAPPED_VRAM,
R600_QUERY_MAPPED_GTT,
R600_QUERY_BUFFER_WAIT_TIME,
- R600_QUERY_NUM_CTX_FLUSHES,
+ R600_QUERY_NUM_MAPPED_BUFFERS,
+ R600_QUERY_NUM_GFX_IBS,
+ R600_QUERY_NUM_SDMA_IBS,
R600_QUERY_NUM_BYTES_MOVED,
R600_QUERY_NUM_EVICTIONS,
R600_QUERY_VRAM_USAGE,
+ R600_QUERY_VRAM_VIS_USAGE,
R600_QUERY_GTT_USAGE,
R600_QUERY_GPU_TEMPERATURE,
R600_QUERY_CURRENT_GPU_SCLK,
R600_QUERY_CURRENT_GPU_MCLK,
R600_QUERY_GPU_LOAD,
+ R600_QUERY_GPU_SHADERS_BUSY,
+ R600_QUERY_GPU_TA_BUSY,
+ R600_QUERY_GPU_GDS_BUSY,
+ R600_QUERY_GPU_VGT_BUSY,
+ R600_QUERY_GPU_IA_BUSY,
+ R600_QUERY_GPU_SX_BUSY,
+ R600_QUERY_GPU_WD_BUSY,
+ R600_QUERY_GPU_BCI_BUSY,
+ R600_QUERY_GPU_SC_BUSY,
+ R600_QUERY_GPU_PA_BUSY,
+ R600_QUERY_GPU_DB_BUSY,
+ R600_QUERY_GPU_CP_BUSY,
+ R600_QUERY_GPU_CB_BUSY,
+ R600_QUERY_GPU_SDMA_BUSY,
+ R600_QUERY_GPU_PFP_BUSY,
+ R600_QUERY_GPU_MEQ_BUSY,
+ R600_QUERY_GPU_ME_BUSY,
+ R600_QUERY_GPU_SURF_SYNC_BUSY,
+ R600_QUERY_GPU_DMA_BUSY,
+ R600_QUERY_GPU_SCRATCH_RAM_BUSY,
+ R600_QUERY_GPU_CE_BUSY,
R600_QUERY_NUM_COMPILATIONS,
R600_QUERY_NUM_SHADERS_CREATED,
R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+ R600_QUERY_NUM_SHADER_CACHE_HITS,
R600_QUERY_GPIN_ASIC_ID,
R600_QUERY_GPIN_NUM_SIMD,
R600_QUERY_GPIN_NUM_RB,
@@ -83,7 +113,7 @@ enum {
};
struct r600_query_ops {
- void (*destroy)(struct r600_common_context *, struct r600_query *);
+ void (*destroy)(struct r600_common_screen *, struct r600_query *);
bool (*begin)(struct r600_common_context *, struct r600_query *);
bool (*end)(struct r600_common_context *, struct r600_query *);
bool (*get_result)(struct r600_common_context *,
@@ -112,7 +142,7 @@ enum {
};
struct r600_query_hw_ops {
- bool (*prepare_buffer)(struct r600_common_context *,
+ bool (*prepare_buffer)(struct r600_common_screen *,
struct r600_query_hw *,
struct r600_resource *);
void (*emit_start)(struct r600_common_context *,
@@ -122,7 +152,7 @@ struct r600_query_hw_ops {
struct r600_query_hw *,
struct r600_resource *buffer, uint64_t va);
void (*clear_result)(struct r600_query_hw *, union pipe_query_result *);
- void (*add_result)(struct r600_common_context *ctx,
+ void (*add_result)(struct r600_common_screen *screen,
struct r600_query_hw *, void *buffer,
union pipe_query_result *result);
};
@@ -157,9 +187,9 @@ struct r600_query_hw {
unsigned stream;
};
-bool r600_query_hw_init(struct r600_common_context *rctx,
+bool r600_query_hw_init(struct r600_common_screen *rscreen,
struct r600_query_hw *query);
-void r600_query_hw_destroy(struct r600_common_context *rctx,
+void r600_query_hw_destroy(struct r600_common_screen *rscreen,
struct r600_query *rquery);
bool r600_query_hw_begin(struct r600_common_context *rctx,
struct r600_query *rquery);
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
index b5296aa56..a18089a3b 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
@@ -187,7 +187,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
{
struct radeon_winsys_cs *cs = rctx->gfx.cs;
struct r600_so_target **t = rctx->streamout.targets;
- unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
+ uint16_t *stride_in_dw = rctx->streamout.stride_in_dw;
unsigned i, update_flags = 0;
r600_flush_vgt_streamout(rctx);
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c b/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c
index 1e60f6aff..9e1ff9e5f 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c
@@ -26,26 +26,10 @@
#include "r600_pipe_common.h"
#include "util/u_surface.h"
+#include "util/rand_xor.h"
static uint64_t seed_xorshift128plus[2];
-/* Super fast random number generator.
- *
- * This rand_xorshift128plus function by Sebastiano Vigna belongs
- * to the public domain.
- */
-static uint64_t rand_xorshift128plus(void)
-{
- uint64_t *s = seed_xorshift128plus;
-
- uint64_t s1 = s[0];
- const uint64_t s0 = s[1];
- s[0] = s0;
- s1 ^= s1 << 23;
- s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
- return s[1] + s0;
-}
-
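The xorshift128+ generator removed here moved verbatim into util/rand_xor so other code can share it; this file keeps only the 128-bit seed. Typical use of the new helpers, matching the calls in r600_test_dma() below (false selects the fixed, reproducible seed rather than a randomised one):

    uint64_t seed[2];

    s_rand_xorshift128plus(seed, false); /* deterministic seed */
    uint64_t r = rand_xorshift128plus(seed); /* next 64 random bits */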
#define RAND_NUM_SIZE 8
/* The GPU blits are emulated on the CPU using these CPU textures. */
@@ -91,8 +75,10 @@ static void set_random_pixels(struct pipe_context *ctx,
assert(t->stride % RAND_NUM_SIZE == 0);
assert(cpu->stride % RAND_NUM_SIZE == 0);
- for (x = 0; x < size; x++)
- *ptr++ = *ptr_cpu++ = rand_xorshift128plus();
+ for (x = 0; x < size; x++) {
+ *ptr++ = *ptr_cpu++ =
+ rand_xorshift128plus(seed_xorshift128plus);
+ }
}
}
@@ -149,18 +135,24 @@ static enum pipe_format get_format_from_bpp(int bpp)
}
}
-static const char *array_mode_to_string(unsigned mode)
+static const char *array_mode_to_string(struct r600_common_screen *rscreen,
+ struct radeon_surf *surf)
{
- switch (mode) {
- case RADEON_SURF_MODE_LINEAR_ALIGNED:
- return "LINEAR_ALIGNED";
- case RADEON_SURF_MODE_1D:
- return "1D_TILED_THIN1";
- case RADEON_SURF_MODE_2D:
- return "2D_TILED_THIN1";
- default:
- assert(0);
+ if (rscreen->chip_class >= GFX9) {
+ /* TODO */
return " UNKNOWN";
+ } else {
+ switch (surf->u.legacy.level[0].mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED:
+ return "LINEAR_ALIGNED";
+ case RADEON_SURF_MODE_1D:
+ return "1D_TILED_THIN1";
+ case RADEON_SURF_MODE_2D:
+ return "2D_TILED_THIN1";
+ default:
+ assert(0);
+ return " UNKNOWN";
+ }
}
}
@@ -197,8 +189,7 @@ void r600_test_dma(struct r600_common_screen *rscreen)
/* the seed for random test parameters */
srand(0x9b47d95b);
/* the seed for random pixel data */
- seed_xorshift128plus[0] = 0x3bffb83978e24f88;
- seed_xorshift128plus[1] = 0x9238d5d56c71cd35;
+ s_rand_xorshift128plus(seed_xorshift128plus, false);
iterations = 1000000000; /* just kill it when you are bored */
num_partial_copies = 30;
@@ -292,16 +283,16 @@ void r600_test_dma(struct r600_common_screen *rscreen)
printf("%4u: dst = (%5u x %5u x %u, %s), "
" src = (%5u x %5u x %u, %s), bpp = %2u, ",
i, tdst.width0, tdst.height0, tdst.array_size,
- array_mode_to_string(rdst->surface.level[0].mode),
+ array_mode_to_string(rscreen, &rdst->surface),
tsrc.width0, tsrc.height0, tsrc.array_size,
- array_mode_to_string(rsrc->surface.level[0].mode), bpp);
+ array_mode_to_string(rscreen, &rsrc->surface), bpp);
fflush(stdout);
/* set src pixels */
set_random_pixels(ctx, src, &src_cpu);
/* clear dst pixels */
- rctx->clear_buffer(ctx, dst, 0, rdst->surface.bo_size, 0, true);
+ rctx->clear_buffer(ctx, dst, 0, rdst->surface.surf_size, 0, true);
memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
/* preparation */
@@ -331,8 +322,8 @@ void r600_test_dma(struct r600_common_screen *rscreen)
dstz = rand() % (tdst.array_size - depth + 1);
/* special code path to hit the tiled partial copies */
- if (rsrc->surface.level[0].mode >= RADEON_SURF_MODE_1D &&
- rdst->surface.level[0].mode >= RADEON_SURF_MODE_1D &&
+ if (!rsrc->surface.is_linear &&
+ !rdst->surface.is_linear &&
rand() & 1) {
if (max_width < 8 || max_height < 8)
continue;
@@ -359,8 +350,8 @@ void r600_test_dma(struct r600_common_screen *rscreen)
}
/* special code path to hit out-of-bounds reads in L2T */
- if (rsrc->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
- rdst->surface.level[0].mode >= RADEON_SURF_MODE_1D &&
+ if (rsrc->surface.is_linear &&
+ !rdst->surface.is_linear &&
rand() % 4 == 0) {
srcx = 0;
srcy = 0;
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
index 27035c0fa..4b2082523 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
@@ -37,8 +37,9 @@
static void r600_texture_discard_cmask(struct r600_common_screen *rscreen,
struct r600_texture *rtex);
-static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
- const struct pipe_resource *templ);
+static enum radeon_surf_mode
+r600_choose_tiling(struct r600_common_screen *rscreen,
+ const struct pipe_resource *templ);
bool r600_prepare_for_dma_blit(struct r600_common_context *rctx,
@@ -52,8 +53,7 @@ bool r600_prepare_for_dma_blit(struct r600_common_context *rctx,
if (!rctx->dma.cs)
return false;
- if (util_format_get_blocksizebits(rdst->resource.b.b.format) !=
- util_format_get_blocksizebits(rsrc->resource.b.b.format))
+ if (rdst->surface.bpe != rsrc->surface.bpe)
return false;
/* MSAA: Blits don't exist in the real world. */
@@ -72,8 +72,8 @@ bool r600_prepare_for_dma_blit(struct r600_common_context *rctx,
* src: Use the 3D path. DCC decompression is expensive.
* dst: Use the 3D path to compress the pixels with DCC.
*/
- if ((rsrc->dcc_offset && rsrc->surface.level[src_level].dcc_enabled) ||
- (rdst->dcc_offset && rdst->surface.level[dst_level].dcc_enabled))
+ if (vi_dcc_enabled(rsrc, src_level) ||
+ vi_dcc_enabled(rdst, dst_level))
return false;
/* CMASK as:
@@ -177,179 +177,170 @@ static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600
src, 0, &sbox);
}
-static unsigned r600_texture_get_offset(struct r600_texture *rtex, unsigned level,
- const struct pipe_box *box)
+static unsigned r600_texture_get_offset(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex, unsigned level,
+ const struct pipe_box *box,
+ unsigned *stride,
+ unsigned *layer_stride)
{
- enum pipe_format format = rtex->resource.b.b.format;
+ if (rscreen->chip_class >= GFX9) {
+ *stride = rtex->surface.u.gfx9.surf_pitch * rtex->surface.bpe;
+ *layer_stride = rtex->surface.u.gfx9.surf_slice_size;
+
+ if (!box)
+ return 0;
+
+ /* Each texture is an array of slices. Each slice is an array
+ * of mipmap levels. */
+ return box->z * rtex->surface.u.gfx9.surf_slice_size +
+ rtex->surface.u.gfx9.offset[level] +
+ (box->y / rtex->surface.blk_h *
+ rtex->surface.u.gfx9.surf_pitch +
+ box->x / rtex->surface.blk_w) * rtex->surface.bpe;
+ } else {
+ *stride = rtex->surface.u.legacy.level[level].nblk_x *
+ rtex->surface.bpe;
+ *layer_stride = rtex->surface.u.legacy.level[level].slice_size;
- return rtex->surface.level[level].offset +
- box->z * rtex->surface.level[level].slice_size +
- box->y / util_format_get_blockheight(format) * rtex->surface.level[level].pitch_bytes +
- box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+ if (!box)
+ return rtex->surface.u.legacy.level[level].offset;
+
+ /* Each texture is an array of mipmap levels. Each level is
+ * an array of slices. */
+ return rtex->surface.u.legacy.level[level].offset +
+ box->z * rtex->surface.u.legacy.level[level].slice_size +
+ (box->y / rtex->surface.blk_h *
+ rtex->surface.u.legacy.level[level].nblk_x +
+ box->x / rtex->surface.blk_w) * rtex->surface.bpe;
+ }
}
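The two branches differ because GFX9 stores a texture slice-major (all mip levels of one slice are contiguous, hence z * surf_slice_size is added before the per-level offset), while the legacy layout is level-major (all slices of one level are contiguous). A worked legacy example, assuming bpe = 4, blk_w = blk_h = 1, nblk_x = 256 and a level offset of 0:

    /* texel (x=16, y=8, z=0) at that level:
     * 0 + 0 * slice_size + (8 * 256 + 16) * 4 = 8256 bytes in */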
static int r600_init_surface(struct r600_common_screen *rscreen,
struct radeon_surf *surface,
const struct pipe_resource *ptex,
- unsigned array_mode,
+ enum radeon_surf_mode array_mode,
+ unsigned pitch_in_bytes_override,
+ unsigned offset,
+ bool is_imported,
+ bool is_scanout,
bool is_flushed_depth,
bool tc_compatible_htile)
{
const struct util_format_description *desc =
util_format_description(ptex->format);
bool is_depth, is_stencil;
+ int r;
+ unsigned i, bpe, flags = 0;
is_depth = util_format_has_depth(desc);
is_stencil = util_format_has_stencil(desc);
- surface->npix_x = ptex->width0;
- surface->npix_y = ptex->height0;
- surface->npix_z = ptex->depth0;
- surface->blk_w = util_format_get_blockwidth(ptex->format);
- surface->blk_h = util_format_get_blockheight(ptex->format);
- surface->blk_d = 1;
- surface->array_size = 1;
- surface->last_level = ptex->last_level;
-
if (rscreen->chip_class >= EVERGREEN && !is_flushed_depth &&
ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
- surface->bpe = 4; /* stencil is allocated separately on evergreen */
+ bpe = 4; /* stencil is allocated separately on evergreen */
} else {
- surface->bpe = util_format_get_blocksize(ptex->format);
+ bpe = util_format_get_blocksize(ptex->format);
/* align byte per element on dword */
- if (surface->bpe == 3) {
- surface->bpe = 4;
+ if (bpe == 3) {
+ bpe = 4;
}
}
- surface->nsamples = ptex->nr_samples ? ptex->nr_samples : 1;
- surface->flags = RADEON_SURF_SET(array_mode, MODE);
-
- switch (ptex->target) {
- case PIPE_TEXTURE_1D:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D, TYPE);
- break;
- case PIPE_TEXTURE_RECT:
- case PIPE_TEXTURE_2D:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D, TYPE);
- break;
- case PIPE_TEXTURE_3D:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_3D, TYPE);
- break;
- case PIPE_TEXTURE_1D_ARRAY:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE);
- surface->array_size = ptex->array_size;
- break;
- case PIPE_TEXTURE_CUBE_ARRAY: /* cube array layout like 2d array */
- assert(ptex->array_size % 6 == 0);
- case PIPE_TEXTURE_2D_ARRAY:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE);
- surface->array_size = ptex->array_size;
- break;
- case PIPE_TEXTURE_CUBE:
- surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_CUBEMAP, TYPE);
- break;
- case PIPE_BUFFER:
- default:
- return -EINVAL;
- }
-
if (!is_flushed_depth && is_depth) {
- surface->flags |= RADEON_SURF_ZBUFFER;
+ flags |= RADEON_SURF_ZBUFFER;
if (tc_compatible_htile &&
- array_mode == RADEON_SURF_MODE_2D) {
+ (rscreen->chip_class >= GFX9 ||
+ array_mode == RADEON_SURF_MODE_2D)) {
/* TC-compatible HTILE only supports Z32_FLOAT.
- * Promote Z16 to Z32. DB->CB copies will convert
+ * GFX9 also supports Z16_UNORM.
+ * On VI, promote Z16 to Z32. DB->CB copies will convert
* the format for transfers.
*/
- surface->bpe = 4;
- surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
- }
+ if (rscreen->chip_class == VI)
+ bpe = 4;
- if (is_stencil) {
- surface->flags |= RADEON_SURF_SBUFFER |
- RADEON_SURF_HAS_SBUFFER_MIPTREE;
+ flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
}
- }
- if (rscreen->chip_class >= SI) {
- surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
+ if (is_stencil)
+ flags |= RADEON_SURF_SBUFFER;
}
if (rscreen->chip_class >= VI &&
(ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC ||
ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT))
- surface->flags |= RADEON_SURF_DISABLE_DCC;
+ flags |= RADEON_SURF_DISABLE_DCC;
- if (ptex->bind & PIPE_BIND_SCANOUT) {
+ if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) {
/* This should catch bugs in gallium users setting incorrect flags. */
- assert(surface->nsamples == 1 &&
- surface->array_size == 1 &&
- surface->npix_z == 1 &&
- surface->last_level == 0 &&
- !(surface->flags & RADEON_SURF_Z_OR_SBUFFER));
+ assert(ptex->nr_samples <= 1 &&
+ ptex->array_size == 1 &&
+ ptex->depth0 == 1 &&
+ ptex->last_level == 0 &&
+ !(flags & RADEON_SURF_Z_OR_SBUFFER));
- surface->flags |= RADEON_SURF_SCANOUT;
+ flags |= RADEON_SURF_SCANOUT;
}
- return 0;
-}
-static int r600_setup_surface(struct pipe_screen *screen,
- struct r600_texture *rtex,
- unsigned pitch_in_bytes_override,
- unsigned offset)
-{
- struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
- unsigned i;
- int r;
+ if (is_imported)
+ flags |= RADEON_SURF_IMPORTED;
+ if (!(ptex->flags & R600_RESOURCE_FLAG_FORCE_TILING))
+ flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
- r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface);
+ r = rscreen->ws->surface_init(rscreen->ws, ptex, flags, bpe,
+ array_mode, surface);
if (r) {
return r;
}
- rtex->size = rtex->surface.bo_size;
-
- if (pitch_in_bytes_override && pitch_in_bytes_override != rtex->surface.level[0].pitch_bytes) {
- /* old ddx on evergreen over estimate alignment for 1d, only 1 level
- * for those
- */
- rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe;
- rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override;
- rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y;
- }
+ if (rscreen->chip_class >= GFX9) {
+ assert(!pitch_in_bytes_override ||
+ pitch_in_bytes_override == surface->u.gfx9.surf_pitch * bpe);
+ surface->u.gfx9.surf_offset = offset;
+ } else {
+ if (pitch_in_bytes_override &&
+ pitch_in_bytes_override != surface->u.legacy.level[0].nblk_x * bpe) {
+ /* The old DDX on Evergreen overestimates alignment for 1D;
+ * only one level exists for those surfaces.
+ */
+ surface->u.legacy.level[0].nblk_x = pitch_in_bytes_override / bpe;
+ surface->u.legacy.level[0].slice_size = pitch_in_bytes_override *
+ surface->u.legacy.level[0].nblk_y;
+ }
- if (offset) {
- for (i = 0; i < ARRAY_SIZE(rtex->surface.level); ++i)
- rtex->surface.level[i].offset += offset;
+ if (offset) {
+ for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
+ surface->u.legacy.level[i].offset += offset;
+ }
}
return 0;
}
-static void r600_texture_init_metadata(struct r600_texture *rtex,
+static void r600_texture_init_metadata(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex,
struct radeon_bo_metadata *metadata)
{
struct radeon_surf *surface = &rtex->surface;
memset(metadata, 0, sizeof(*metadata));
- metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
- metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
- metadata->pipe_config = surface->pipe_config;
- metadata->bankw = surface->bankw;
- metadata->bankh = surface->bankh;
- metadata->tile_split = surface->tile_split;
- metadata->mtilea = surface->mtilea;
- metadata->num_banks = surface->num_banks;
- metadata->stride = surface->level[0].pitch_bytes;
- metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
-}
-static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen)
-{
- p_atomic_inc(&rscreen->dirty_fb_counter);
+ if (rscreen->chip_class >= GFX9) {
+ metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+ } else {
+ metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config;
+ metadata->u.legacy.bankw = surface->u.legacy.bankw;
+ metadata->u.legacy.bankh = surface->u.legacy.bankh;
+ metadata->u.legacy.tile_split = surface->u.legacy.tile_split;
+ metadata->u.legacy.mtilea = surface->u.legacy.mtilea;
+ metadata->u.legacy.num_banks = surface->u.legacy.num_banks;
+ metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+ metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+ }
}
static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx,
@@ -359,13 +350,13 @@ static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx,
struct pipe_context *ctx = &rctx->b;
if (ctx == rscreen->aux_context)
- pipe_mutex_lock(rscreen->aux_context_lock);
+ mtx_lock(&rscreen->aux_context_lock);
ctx->flush_resource(ctx, &rtex->resource.b.b);
ctx->flush(ctx, NULL, 0);
if (ctx == rscreen->aux_context)
- pipe_mutex_unlock(rscreen->aux_context_lock);
+ mtx_unlock(&rscreen->aux_context_lock);
}
static void r600_texture_discard_cmask(struct r600_common_screen *rscreen,
@@ -390,7 +381,7 @@ static void r600_texture_discard_cmask(struct r600_common_screen *rscreen,
r600_resource_reference(&rtex->cmask_buffer, NULL);
/* Notify all contexts about the change. */
- r600_dirty_all_framebuffer_states(rscreen);
+ p_atomic_inc(&rscreen->dirty_tex_counter);
p_atomic_inc(&rscreen->compressed_colortex_counter);
}
@@ -414,7 +405,7 @@ static bool r600_texture_discard_dcc(struct r600_common_screen *rscreen,
rtex->dcc_offset = 0;
/* Notify all contexts about the change. */
- r600_dirty_all_framebuffer_states(rscreen);
+ p_atomic_inc(&rscreen->dirty_tex_counter);
return true;
}
@@ -448,14 +439,14 @@ bool r600_texture_disable_dcc(struct r600_common_context *rctx,
return false;
if (&rctx->b == rscreen->aux_context)
- pipe_mutex_lock(rscreen->aux_context_lock);
+ mtx_lock(&rscreen->aux_context_lock);
/* Decompress DCC. */
rctx->decompress_dcc(&rctx->b, rtex);
rctx->b.flush(&rctx->b, NULL, 0);
if (&rctx->b == rscreen->aux_context)
- pipe_mutex_unlock(rscreen->aux_context_lock);
+ mtx_unlock(&rscreen->aux_context_lock);
return r600_texture_discard_dcc(rscreen, rtex);
}
@@ -476,7 +467,7 @@ static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx,
return;
if (rtex->resource.is_shared ||
- rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED)
+ rtex->surface.is_linear)
return;
/* This fails with MSAA, depth, and compressed textures. */
@@ -529,8 +520,7 @@ static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx,
r600_texture_reference(&new_tex, NULL);
- r600_dirty_all_framebuffer_states(rctx->screen);
- p_atomic_inc(&rctx->screen->dirty_tex_descriptor_counter);
+ p_atomic_inc(&rctx->screen->dirty_tex_counter);
}
static boolean r600_texture_get_handle(struct pipe_screen* screen,
@@ -546,6 +536,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
struct r600_texture *rtex = (struct r600_texture*)resource;
struct radeon_bo_metadata metadata;
bool update_metadata = false;
+ unsigned stride, offset, slice_size;
/* This is not supported now, but it might be required for OpenCL
* interop in the future.
@@ -578,7 +569,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
/* Set metadata. */
if (!res->is_shared || update_metadata) {
- r600_texture_init_metadata(rtex, &metadata);
+ r600_texture_init_metadata(rscreen, rtex, &metadata);
if (rscreen->query_opaque_metadata)
rscreen->query_opaque_metadata(rscreen, rtex,
&metadata);
@@ -599,11 +590,25 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
res->external_usage = usage;
}
- return rscreen->ws->buffer_get_handle(res->buf,
- rtex->surface.level[0].pitch_bytes,
- rtex->surface.level[0].offset,
- rtex->surface.level[0].slice_size,
- whandle);
+ if (res->b.b.target == PIPE_BUFFER) {
+ offset = 0;
+ stride = 0;
+ slice_size = 0;
+ } else {
+ if (rscreen->chip_class >= GFX9) {
+ offset = rtex->surface.u.gfx9.surf_offset;
+ stride = rtex->surface.u.gfx9.surf_pitch *
+ rtex->surface.bpe;
+ slice_size = rtex->surface.u.gfx9.surf_slice_size;
+ } else {
+ offset = rtex->surface.u.legacy.level[0].offset;
+ stride = rtex->surface.u.legacy.level[0].nblk_x *
+ rtex->surface.bpe;
+ slice_size = rtex->surface.u.legacy.level[0].slice_size;
+ }
+ }
+ return rscreen->ws->buffer_get_handle(res->buf, stride, offset,
+ slice_size, whandle);
}
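
Everything the winsys needs from the export above is three numbers. A simplified sketch of the selection, assuming an abridged surface layout (struct and names hypothetical):

#include <stdint.h>

/* Hypothetical, abridged model of the handle-export parameters above:
 * buffers export zeros, GFX9 uses the unified surface fields, older
 * chips use level[0] of the legacy layout. */
struct fake_surf {
	unsigned bpe;
	struct { uint64_t surf_offset, surf_slice_size; unsigned surf_pitch; } gfx9;
	struct { uint64_t offset, slice_size; unsigned nblk_x; } legacy;
};

static void get_export_params(int is_buffer, int is_gfx9,
                              const struct fake_surf *s, unsigned *stride,
                              uint64_t *offset, uint64_t *slice_size)
{
	if (is_buffer) {
		*stride = 0;
		*offset = 0;
		*slice_size = 0;
	} else if (is_gfx9) {
		*stride = s->gfx9.surf_pitch * s->bpe; /* pitch is in elements */
		*offset = s->gfx9.surf_offset;
		*slice_size = s->gfx9.surf_slice_size;
	} else {
		*stride = s->legacy.nblk_x * s->bpe;   /* blocks * bytes per block */
		*offset = s->legacy.offset;
		*slice_size = s->legacy.slice_size;
	}
}
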
static void r600_texture_destroy(struct pipe_screen *screen,
@@ -633,35 +638,39 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen,
struct r600_fmask_info *out)
{
/* FMASK is allocated like an ordinary texture. */
- struct radeon_surf fmask = rtex->surface;
+ struct pipe_resource templ = rtex->resource.b.b;
+ struct radeon_surf fmask = {};
+ unsigned flags, bpe;
memset(out, 0, sizeof(*out));
- fmask.bo_alignment = 0;
- fmask.bo_size = 0;
- fmask.nsamples = 1;
- fmask.flags |= RADEON_SURF_FMASK;
+ if (rscreen->chip_class >= GFX9) {
+ out->alignment = rtex->surface.u.gfx9.fmask_alignment;
+ out->size = rtex->surface.u.gfx9.fmask_size;
+ return;
+ }
- /* Force 2D tiling if it wasn't set. This may occur when creating
- * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample
- * destination buffer must have an FMASK too. */
- fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE);
- fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE);
+ templ.nr_samples = 1;
+ flags = rtex->surface.flags | RADEON_SURF_FMASK;
- if (rscreen->chip_class >= SI) {
- fmask.flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
+ if (rscreen->chip_class <= CAYMAN) {
+ /* Use the same parameters and tile mode. */
+ fmask.u.legacy.bankw = rtex->surface.u.legacy.bankw;
+ fmask.u.legacy.bankh = rtex->surface.u.legacy.bankh;
+ fmask.u.legacy.mtilea = rtex->surface.u.legacy.mtilea;
+ fmask.u.legacy.tile_split = rtex->surface.u.legacy.tile_split;
+
+ if (nr_samples <= 4)
+ fmask.u.legacy.bankh = 4;
}
switch (nr_samples) {
case 2:
case 4:
- fmask.bpe = 1;
- if (rscreen->chip_class <= CAYMAN) {
- fmask.bankh = 4;
- }
+ bpe = 1;
break;
case 8:
- fmask.bpe = 4;
+ bpe = 4;
break;
default:
R600_ERR("Invalid sample count for FMASK allocation.\n");
@@ -672,25 +681,26 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen,
* This can be fixed by writing a separate FMASK allocator specifically
* for R600-R700 asics. */
if (rscreen->chip_class <= R700) {
- fmask.bpe *= 2;
+ bpe *= 2;
}
- if (rscreen->ws->surface_init(rscreen->ws, &fmask)) {
+ if (rscreen->ws->surface_init(rscreen->ws, &templ, flags, bpe,
+ RADEON_SURF_MODE_2D, &fmask)) {
R600_ERR("Got error in surface_init while allocating FMASK.\n");
return;
}
- assert(fmask.level[0].mode == RADEON_SURF_MODE_2D);
+ assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
- out->slice_tile_max = (fmask.level[0].nblk_x * fmask.level[0].nblk_y) / 64;
+ out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
if (out->slice_tile_max)
out->slice_tile_max -= 1;
- out->tile_mode_index = fmask.tiling_index[0];
- out->pitch_in_pixels = fmask.level[0].nblk_x;
- out->bank_height = fmask.bankh;
- out->alignment = MAX2(256, fmask.bo_alignment);
- out->size = fmask.bo_size;
+ out->tile_mode_index = fmask.u.legacy.tiling_index[0];
+ out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
+ out->bank_height = fmask.u.legacy.bankh;
+ out->alignment = MAX2(256, fmask.surf_alignment);
+ out->size = fmask.surf_size;
}
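
The sample-count switch above fixes the FMASK element size at 1 byte for 2x/4x and 4 bytes for 8x, doubled on R600-R700 by the workaround noted above. A compact restatement (hypothetical helper, not the driver's API):

/* Hypothetical helper mirroring the FMASK bpe selection above;
 * returns 0 for unsupported sample counts. */
static unsigned fmask_bpe(unsigned nr_samples, int is_r600_r700)
{
	unsigned bpe;

	switch (nr_samples) {
	case 2:
	case 4: bpe = 1; break;
	case 8: bpe = 4; break;
	default: return 0;
	}
	if (is_r600_r700)
		bpe *= 2; /* the R600-R700 doubling workaround above */
	return bpe;       /* e.g. 4x MSAA on an R700 -> 2 bytes */
}
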
static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen,
@@ -721,8 +731,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned macro_tile_width = util_next_power_of_two(sqrt_pixels_per_macro_tile);
unsigned macro_tile_height = pixels_per_macro_tile / macro_tile_width;
- unsigned pitch_elements = align(rtex->surface.npix_x, macro_tile_width);
- unsigned height = align(rtex->surface.npix_y, macro_tile_height);
+ unsigned pitch_elements = align(rtex->resource.b.b.width0, macro_tile_width);
+ unsigned height = align(rtex->resource.b.b.height0, macro_tile_height);
unsigned base_align = num_pipes * pipe_interleave_bytes;
unsigned slice_bytes =
@@ -731,10 +741,6 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
assert(macro_tile_width % 128 == 0);
assert(macro_tile_height % 128 == 0);
- out->pitch = pitch_elements;
- out->height = height;
- out->xalign = macro_tile_width;
- out->yalign = macro_tile_height;
out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1;
out->alignment = MAX2(256, base_align);
out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) *
@@ -749,6 +755,12 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned num_pipes = rscreen->info.num_tile_pipes;
unsigned cl_width, cl_height;
+ if (rscreen->chip_class >= GFX9) {
+ out->alignment = rtex->surface.u.gfx9.cmask_alignment;
+ out->size = rtex->surface.u.gfx9.cmask_size;
+ return;
+ }
+
switch (num_pipes) {
case 2:
cl_width = 32;
@@ -773,17 +785,13 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned base_align = num_pipes * pipe_interleave_bytes;
- unsigned width = align(rtex->surface.npix_x, cl_width*8);
- unsigned height = align(rtex->surface.npix_y, cl_height*8);
+ unsigned width = align(rtex->resource.b.b.width0, cl_width*8);
+ unsigned height = align(rtex->resource.b.b.height0, cl_height*8);
unsigned slice_elements = (width * height) / (8*8);
/* Each element of CMASK is a nibble. */
unsigned slice_bytes = slice_elements / 2;
- out->pitch = width;
- out->height = height;
- out->xalign = cl_width * 8;
- out->yalign = cl_height * 8;
out->slice_tile_max = (width * height) / (128*128);
if (out->slice_tile_max)
out->slice_tile_max -= 1;
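
Since each CMASK element is a nibble covering an 8x8 pixel tile, the slice size above works out to width*height/128 bytes once the surface is aligned. A worked check of that arithmetic:

#include <stdio.h>

int main(void)
{
	/* assumes width/height already aligned to cl_width*8 x cl_height*8 */
	unsigned width = 1024, height = 1024;

	unsigned slice_elements = (width * height) / (8 * 8); /* one per 8x8 tile */
	unsigned slice_bytes = slice_elements / 2;            /* nibble elements */
	unsigned slice_tile_max = (width * height) / (128 * 128) - 1;

	printf("%u elements, %u bytes, slice_tile_max=%u\n",
	       slice_elements, slice_bytes, slice_tile_max); /* 16384, 8192, 63 */
	return 0;
}
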
@@ -826,7 +834,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
}
rtex->cmask_buffer = (struct r600_resource *)
- r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
+ r600_aligned_buffer_create(&rscreen->b,
+ R600_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
rtex->cmask.size,
rtex->cmask.alignment);
if (rtex->cmask_buffer == NULL) {
@@ -845,28 +855,32 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
p_atomic_inc(&rscreen->compressed_colortex_counter);
}
-static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
- struct r600_texture *rtex)
+static void r600_texture_get_htile_size(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex)
{
unsigned cl_width, cl_height, width, height;
unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
unsigned num_pipes = rscreen->info.num_tile_pipes;
+ assert(rscreen->chip_class <= VI);
+
+ rtex->surface.htile_size = 0;
+
if (rscreen->chip_class <= EVERGREEN &&
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26)
- return 0;
+ return;
/* HW bug on R6xx. */
if (rscreen->chip_class == R600 &&
- (rtex->surface.level[0].npix_x > 7680 ||
- rtex->surface.level[0].npix_y > 7680))
- return 0;
+ (rtex->resource.b.b.width0 > 7680 ||
+ rtex->resource.b.b.height0 > 7680))
+ return;
/* HTILE is broken with 1D tiling on old kernels and CIK. */
if (rscreen->chip_class >= CIK &&
- rtex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
+ rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
- return 0;
+ return;
/* Overalign HTILE on P2 configs to work around GPU hangs in
* piglit/depthstencil-render-miplevels 585.
@@ -901,11 +915,11 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
break;
default:
assert(0);
- return 0;
+ return;
}
- width = align(rtex->surface.npix_x, cl_width * 8);
- height = align(rtex->surface.npix_y, cl_height * 8);
+ width = align(rtex->resource.b.b.width0, cl_width * 8);
+ height = align(rtex->resource.b.b.height0, cl_height * 8);
slice_elements = (width * height) / (8 * 8);
slice_bytes = slice_elements * 4;
@@ -913,69 +927,122 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
base_align = num_pipes * pipe_interleave_bytes;
- rtex->htile.pitch = width;
- rtex->htile.height = height;
- rtex->htile.xalign = cl_width * 8;
- rtex->htile.yalign = cl_height * 8;
- rtex->htile.alignment = base_align;
-
- return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
+ rtex->surface.htile_alignment = base_align;
+ rtex->surface.htile_size =
+ (util_max_layer(&rtex->resource.b.b, 0) + 1) *
align(slice_bytes, base_align);
}
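
HTILE stores 4 bytes per 8x8 pixel tile, and each slice is padded to num_pipes * pipe_interleave_bytes. A worked example of the sizing above; the cl_width/cl_height pair here is a hypothetical stand-in for the per-pipe-config constants chosen by the switch:

#include <stdio.h>

static unsigned align_u(unsigned v, unsigned a) { return (v + a - 1) / a * a; }

int main(void)
{
	unsigned cl_width = 32, cl_height = 32;        /* assumed pipe config */
	unsigned num_pipes = 4, pipe_interleave = 256;
	unsigned layers = 1;

	unsigned width  = align_u(1920, cl_width * 8);  /* -> 2048 */
	unsigned height = align_u(1080, cl_height * 8); /* -> 1280 */
	unsigned slice_bytes = (width * height) / (8 * 8) * 4;
	unsigned base_align = num_pipes * pipe_interleave;
	unsigned htile_size = layers * align_u(slice_bytes, base_align);

	printf("htile_size=%u\n", htile_size);          /* 163840 bytes */
	return 0;
}
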
static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
struct r600_texture *rtex)
{
- uint64_t htile_size, alignment;
uint32_t clear_value;
- if (rtex->tc_compatible_htile) {
- htile_size = rtex->surface.htile_size;
- alignment = rtex->surface.htile_alignment;
+ if (rscreen->chip_class >= GFX9 || rtex->tc_compatible_htile) {
clear_value = 0x0000030F;
} else {
- htile_size = r600_texture_get_htile_size(rscreen, rtex);
- alignment = rtex->htile.alignment;
+ r600_texture_get_htile_size(rscreen, rtex);
clear_value = 0;
}
- if (!htile_size)
+ if (!rtex->surface.htile_size)
return;
rtex->htile_buffer = (struct r600_resource*)
- r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
- PIPE_USAGE_DEFAULT,
- htile_size, alignment);
+ r600_aligned_buffer_create(&rscreen->b,
+ R600_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
+ rtex->surface.htile_size,
+ rtex->surface.htile_alignment);
if (rtex->htile_buffer == NULL) {
/* This is not a fatal error: we can keep rendering
* without the HTILE buffer. */
R600_ERR("Failed to create buffer object for htile buffer.\n");
} else {
r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
- 0, htile_size, clear_value,
- R600_COHERENCY_NONE);
+ 0, rtex->surface.htile_size,
+ clear_value);
}
}
-void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
+void r600_print_texture_info(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex, FILE *f)
{
int i;
+ /* Common parameters. */
fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
- "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
+ "blk_h=%u, array_size=%u, last_level=%u, "
"bpe=%u, nsamples=%u, flags=0x%x, %s\n",
- rtex->surface.npix_x, rtex->surface.npix_y,
- rtex->surface.npix_z, rtex->surface.blk_w,
- rtex->surface.blk_h, rtex->surface.blk_d,
- rtex->surface.array_size, rtex->surface.last_level,
- rtex->surface.bpe, rtex->surface.nsamples,
+ rtex->resource.b.b.width0, rtex->resource.b.b.height0,
+ rtex->resource.b.b.depth0, rtex->surface.blk_w,
+ rtex->surface.blk_h,
+ rtex->resource.b.b.array_size, rtex->resource.b.b.last_level,
+ rtex->surface.bpe, rtex->resource.b.b.nr_samples,
rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format));
- fprintf(f, " Layout: size=%"PRIu64", alignment=%"PRIu64", bankw=%u, "
+ if (rscreen->chip_class >= GFX9) {
+ fprintf(f, " Surf: size=%"PRIu64", slice_size=%"PRIu64", "
+ "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
+ rtex->surface.surf_size,
+ rtex->surface.u.gfx9.surf_slice_size,
+ rtex->surface.surf_alignment,
+ rtex->surface.u.gfx9.surf.swizzle_mode,
+ rtex->surface.u.gfx9.surf.epitch,
+ rtex->surface.u.gfx9.surf_pitch);
+
+ if (rtex->fmask.size) {
+ fprintf(f, " FMASK: offset=%"PRIu64", size=%"PRIu64", "
+ "alignment=%u, swmode=%u, epitch=%u\n",
+ rtex->fmask.offset,
+ rtex->surface.u.gfx9.fmask_size,
+ rtex->surface.u.gfx9.fmask_alignment,
+ rtex->surface.u.gfx9.fmask.swizzle_mode,
+ rtex->surface.u.gfx9.fmask.epitch);
+ }
+
+ if (rtex->cmask.size) {
+ fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", "
+ "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
+ rtex->cmask.offset,
+ rtex->surface.u.gfx9.cmask_size,
+ rtex->surface.u.gfx9.cmask_alignment,
+ rtex->surface.u.gfx9.cmask.rb_aligned,
+ rtex->surface.u.gfx9.cmask.pipe_aligned);
+ }
+
+ if (rtex->htile_buffer) {
+ fprintf(f, " HTile: size=%u, alignment=%u, "
+ "rb_aligned=%u, pipe_aligned=%u\n",
+ rtex->htile_buffer->b.b.width0,
+ rtex->htile_buffer->buf->alignment,
+ rtex->surface.u.gfx9.htile.rb_aligned,
+ rtex->surface.u.gfx9.htile.pipe_aligned);
+ }
+
+ if (rtex->dcc_offset) {
+ fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", "
+ "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
+ rtex->dcc_offset, rtex->surface.dcc_size,
+ rtex->surface.dcc_alignment,
+ rtex->surface.u.gfx9.dcc_pitch_max,
+ rtex->surface.num_dcc_levels);
+ }
+
+ if (rtex->surface.u.gfx9.stencil_offset) {
+ fprintf(f, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
+ rtex->surface.u.gfx9.stencil_offset,
+ rtex->surface.u.gfx9.stencil.swizzle_mode,
+ rtex->surface.u.gfx9.stencil.epitch);
+ }
+ return;
+ }
+
+ fprintf(f, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
"bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
- rtex->surface.bo_size, rtex->surface.bo_alignment, rtex->surface.bankw,
- rtex->surface.bankh, rtex->surface.num_banks, rtex->surface.mtilea,
- rtex->surface.tile_split, rtex->surface.pipe_config,
+ rtex->surface.surf_size, rtex->surface.surf_alignment, rtex->surface.u.legacy.bankw,
+ rtex->surface.u.legacy.bankh, rtex->surface.u.legacy.num_banks, rtex->surface.u.legacy.mtilea,
+ rtex->surface.u.legacy.tile_split, rtex->surface.u.legacy.pipe_config,
(rtex->surface.flags & RADEON_SURF_SCANOUT) != 0);
if (rtex->fmask.size)
@@ -986,65 +1053,60 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index);
if (rtex->cmask.size)
- fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, "
- "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
+ fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, "
+ "slice_tile_max=%u\n",
rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
- rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign,
- rtex->cmask.yalign, rtex->cmask.slice_tile_max);
+ rtex->cmask.slice_tile_max);
if (rtex->htile_buffer)
- fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
- "xalign=%u, yalign=%u, TC_compatible = %u\n",
+ fprintf(f, " HTile: size=%u, alignment=%u, TC_compatible = %u\n",
rtex->htile_buffer->b.b.width0,
- rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
- rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
+ rtex->htile_buffer->buf->alignment,
rtex->tc_compatible_htile);
if (rtex->dcc_offset) {
- fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
+ fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%u\n",
rtex->dcc_offset, rtex->surface.dcc_size,
rtex->surface.dcc_alignment);
- for (i = 0; i <= rtex->surface.last_level; i++)
+ for (i = 0; i <= rtex->resource.b.b.last_level; i++)
fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", "
"fast_clear_size=%"PRIu64"\n",
- i, rtex->surface.level[i].dcc_enabled,
- rtex->surface.level[i].dcc_offset,
- rtex->surface.level[i].dcc_fast_clear_size);
+ i, i < rtex->surface.num_dcc_levels,
+ rtex->surface.u.legacy.level[i].dcc_offset,
+ rtex->surface.u.legacy.level[i].dcc_fast_clear_size);
}
- for (i = 0; i <= rtex->surface.last_level; i++)
+ for (i = 0; i <= rtex->resource.b.b.last_level; i++)
fprintf(f, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
"npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
- i, rtex->surface.level[i].offset,
- rtex->surface.level[i].slice_size,
+ "mode=%u, tiling_index = %u\n",
+ i, rtex->surface.u.legacy.level[i].offset,
+ rtex->surface.u.legacy.level[i].slice_size,
u_minify(rtex->resource.b.b.width0, i),
u_minify(rtex->resource.b.b.height0, i),
u_minify(rtex->resource.b.b.depth0, i),
- rtex->surface.level[i].nblk_x,
- rtex->surface.level[i].nblk_y,
- rtex->surface.level[i].nblk_z,
- rtex->surface.level[i].pitch_bytes,
- rtex->surface.level[i].mode);
+ rtex->surface.u.legacy.level[i].nblk_x,
+ rtex->surface.u.legacy.level[i].nblk_y,
+ rtex->surface.u.legacy.level[i].mode,
+ rtex->surface.u.legacy.tiling_index[i]);
if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
fprintf(f, " StencilLayout: tilesplit=%u\n",
- rtex->surface.stencil_tile_split);
- for (i = 0; i <= rtex->surface.last_level; i++) {
+ rtex->surface.u.legacy.stencil_tile_split);
+ for (i = 0; i <= rtex->resource.b.b.last_level; i++) {
fprintf(f, " StencilLevel[%i]: offset=%"PRIu64", "
"slice_size=%"PRIu64", npix_x=%u, "
"npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
- i, rtex->surface.stencil_level[i].offset,
- rtex->surface.stencil_level[i].slice_size,
+ "mode=%u, tiling_index = %u\n",
+ i, rtex->surface.u.legacy.stencil_level[i].offset,
+ rtex->surface.u.legacy.stencil_level[i].slice_size,
u_minify(rtex->resource.b.b.width0, i),
u_minify(rtex->resource.b.b.height0, i),
u_minify(rtex->resource.b.b.depth0, i),
- rtex->surface.stencil_level[i].nblk_x,
- rtex->surface.stencil_level[i].nblk_y,
- rtex->surface.stencil_level[i].nblk_z,
- rtex->surface.stencil_level[i].pitch_bytes,
- rtex->surface.stencil_level[i].mode);
+ rtex->surface.u.legacy.stencil_level[i].nblk_x,
+ rtex->surface.u.legacy.stencil_level[i].nblk_y,
+ rtex->surface.u.legacy.stencil_level[i].mode,
+ rtex->surface.u.legacy.stencil_tiling_index[i]);
}
}
}
@@ -1053,8 +1115,6 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
static struct r600_texture *
r600_texture_create_object(struct pipe_screen *screen,
const struct pipe_resource *base,
- unsigned pitch_in_bytes_override,
- unsigned offset,
struct pb_buffer *buf,
struct radeon_surf *surface)
{
@@ -1077,25 +1137,29 @@ r600_texture_create_object(struct pipe_screen *screen,
rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
rtex->surface = *surface;
- if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
- FREE(rtex);
- return NULL;
- }
+ rtex->size = rtex->surface.surf_size;
- rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
- assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
- rtex->tc_compatible_htile);
+ rtex->tc_compatible_htile = rtex->surface.htile_size != 0 &&
+ (rtex->surface.flags &
+ RADEON_SURF_TC_COMPATIBLE_HTILE);
- /* TC-compatible HTILE only supports Z32_FLOAT. */
- if (rtex->tc_compatible_htile)
- rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
- else
+ /* TC-compatible HTILE:
+ * - VI only supports Z32_FLOAT.
+ * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
+ if (rtex->tc_compatible_htile) {
+ if (rscreen->chip_class >= GFX9 &&
+ base->format == PIPE_FORMAT_Z16_UNORM)
+ rtex->db_render_format = base->format;
+ else
+ rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+ } else {
rtex->db_render_format = base->format;
+ }
/* Tiled depth textures utilize the non-displayable tile order.
* This must be done after r600_setup_surface.
* Applies to R600-Cayman. */
- rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
+ rtex->non_disp_tiling = rtex->is_depth && rtex->surface.u.legacy.level[0].mode >= RADEON_SURF_MODE_1D;
/* Applies to GCN. */
rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
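
The TC-compatible HTILE format choice above condenses to a small decision table; a minimal sketch with a stand-in enum (PIPE_FORMAT values reduced to what the branch needs, names hypothetical):

/* Hypothetical condensed form of the DB render format choice above:
 * TC-compatible HTILE restricts the depth format to Z32_FLOAT on VI,
 * while GFX9 additionally allows Z16_UNORM. */
enum fmt { FMT_Z16_UNORM, FMT_Z32_FLOAT, FMT_OTHER };

static enum fmt choose_db_render_format(int tc_compatible_htile,
                                        int is_gfx9, enum fmt base)
{
	if (!tc_compatible_htile)
		return base;              /* no restriction without TC HTILE */
	if (is_gfx9 && base == FMT_Z16_UNORM)
		return base;              /* GFX9 also supports Z16 */
	return FMT_Z32_FLOAT;             /* the only VI-compatible choice */
}
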
@@ -1109,8 +1173,13 @@ r600_texture_create_object(struct pipe_screen *screen,
if (base->flags & (R600_RESOURCE_FLAG_TRANSFER |
R600_RESOURCE_FLAG_FLUSHED_DEPTH) ||
rscreen->chip_class >= EVERGREEN) {
- rtex->can_sample_z = !rtex->surface.depth_adjusted;
- rtex->can_sample_s = !rtex->surface.stencil_adjusted;
+ if (rscreen->chip_class >= GFX9) {
+ rtex->can_sample_z = true;
+ rtex->can_sample_s = true;
+ } else {
+ rtex->can_sample_z = !rtex->surface.u.legacy.depth_adjusted;
+ rtex->can_sample_s = !rtex->surface.u.legacy.stencil_adjusted;
+ }
} else {
if (rtex->resource.b.b.nr_samples <= 1 &&
(rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM ||
@@ -1154,7 +1223,7 @@ r600_texture_create_object(struct pipe_screen *screen,
/* Now create the backing buffer. */
if (!buf) {
r600_init_resource_fields(rscreen, resource, rtex->size,
- rtex->surface.bo_alignment);
+ rtex->surface.surf_alignment);
resource->flags |= RADEON_FLAG_HANDLE;
@@ -1178,7 +1247,7 @@ r600_texture_create_object(struct pipe_screen *screen,
/* Initialize the cmask to 0xCC (= compressed state). */
r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b,
rtex->cmask.offset, rtex->cmask.size,
- 0xCCCCCCCC, R600_COHERENCY_NONE);
+ 0xCCCCCCCC);
}
/* Initialize DCC only if the texture is not being imported. */
@@ -1186,7 +1255,7 @@ r600_texture_create_object(struct pipe_screen *screen,
r600_screen_clear_buffer(rscreen, &rtex->resource.b.b,
rtex->dcc_offset,
rtex->surface.dcc_size,
- 0xFFFFFFFF, R600_COHERENCY_NONE);
+ 0xFFFFFFFF);
}
/* Initialize the CMASK base register value. */
@@ -1203,15 +1272,16 @@ r600_texture_create_object(struct pipe_screen *screen,
if (rscreen->debug_flags & DBG_TEX) {
puts("Texture:");
- r600_print_texture_info(rtex, stdout);
+ r600_print_texture_info(rscreen, rtex, stdout);
fflush(stdout);
}
return rtex;
}
-static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
- const struct pipe_resource *templ)
+static enum radeon_surf_mode
+r600_choose_tiling(struct r600_common_screen *rscreen,
+ const struct pipe_resource *templ)
{
const struct util_format_description *desc = util_format_description(templ->format);
bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING;
@@ -1256,7 +1326,9 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
/* Textures with a very small height are recommended to be linear. */
if (templ->target == PIPE_TEXTURE_1D ||
templ->target == PIPE_TEXTURE_1D_ARRAY ||
- templ->height0 <= 4)
+ /* Only very thin and long 2D textures should benefit from
+ * linear_aligned. */
+ (templ->width0 > 8 && templ->height0 <= 2))
return RADEON_SURF_MODE_LINEAR_ALIGNED;
/* Textures likely to be mapped often. */
@@ -1291,17 +1363,15 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
int r;
r = r600_init_surface(rscreen, &surface, templ,
- r600_choose_tiling(rscreen, templ),
- is_flushed_depth, tc_compatible_htile);
+ r600_choose_tiling(rscreen, templ), 0, 0,
+ false, false, is_flushed_depth,
+ tc_compatible_htile);
if (r) {
return NULL;
}
- r = rscreen->ws->surface_best(rscreen->ws, &surface);
- if (r) {
- return NULL;
- }
- return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
- 0, NULL, &surface);
+
+ return (struct pipe_resource *)
+ r600_texture_create_object(screen, templ, NULL, &surface);
}
static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
@@ -1317,6 +1387,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
int r;
struct radeon_bo_metadata metadata = {};
struct r600_texture *rtex;
+ bool is_scanout;
/* Support only 2D textures without mipmaps */
if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
@@ -1329,31 +1400,39 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
rscreen->ws->buffer_get_metadata(buf, &metadata);
- surface.pipe_config = metadata.pipe_config;
- surface.bankw = metadata.bankw;
- surface.bankh = metadata.bankh;
- surface.tile_split = metadata.tile_split;
- surface.mtilea = metadata.mtilea;
- surface.num_banks = metadata.num_banks;
-
- if (metadata.macrotile == RADEON_LAYOUT_TILED)
- array_mode = RADEON_SURF_MODE_2D;
- else if (metadata.microtile == RADEON_LAYOUT_TILED)
- array_mode = RADEON_SURF_MODE_1D;
- else
- array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+ if (rscreen->chip_class >= GFX9) {
+ if (metadata.u.gfx9.swizzle_mode > 0)
+ array_mode = RADEON_SURF_MODE_2D;
+ else
+ array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ is_scanout = metadata.u.gfx9.swizzle_mode == 0 ||
+ metadata.u.gfx9.swizzle_mode % 4 == 2;
+ } else {
+ surface.u.legacy.pipe_config = metadata.u.legacy.pipe_config;
+ surface.u.legacy.bankw = metadata.u.legacy.bankw;
+ surface.u.legacy.bankh = metadata.u.legacy.bankh;
+ surface.u.legacy.tile_split = metadata.u.legacy.tile_split;
+ surface.u.legacy.mtilea = metadata.u.legacy.mtilea;
+ surface.u.legacy.num_banks = metadata.u.legacy.num_banks;
+
+ if (metadata.u.legacy.macrotile == RADEON_LAYOUT_TILED)
+ array_mode = RADEON_SURF_MODE_2D;
+ else if (metadata.u.legacy.microtile == RADEON_LAYOUT_TILED)
+ array_mode = RADEON_SURF_MODE_1D;
+ else
+ array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ is_scanout = metadata.u.legacy.scanout;
+ }
- r = r600_init_surface(rscreen, &surface, templ, array_mode,
- false, false);
+ r = r600_init_surface(rscreen, &surface, templ, array_mode, stride,
+ offset, true, is_scanout, false, false);
if (r) {
return NULL;
}
- if (metadata.scanout)
- surface.flags |= RADEON_SURF_SCANOUT;
-
- rtex = r600_texture_create_object(screen, templ, stride,
- offset, buf, &surface);
+ rtex = r600_texture_create_object(screen, templ, buf, &surface);
if (!rtex)
return NULL;
@@ -1363,6 +1442,11 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
if (rscreen->apply_opaque_metadata)
rscreen->apply_opaque_metadata(rscreen, rtex, &metadata);
+ /* Validate that addrlib arrived at the same surface parameters. */
+ if (rscreen->chip_class >= GFX9) {
+ assert(metadata.u.gfx9.swizzle_mode == surface.u.gfx9.surf.swizzle_mode);
+ }
+
return &rtex->resource.b.b;
}
@@ -1486,7 +1570,7 @@ static void r600_texture_invalidate_storage(struct r600_common_context *rctx,
/* There is no point in discarding depth and tiled buffers. */
assert(!rtex->is_depth);
- assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED);
+ assert(rtex->surface.is_linear);
/* Reallocate the buffer in the same pipe_resource. */
r600_alloc_resource(rscreen, &rtex->resource);
@@ -1495,8 +1579,7 @@ static void r600_texture_invalidate_storage(struct r600_common_context *rctx,
rtex->cmask.base_address_reg =
(rtex->resource.gpu_address + rtex->cmask.offset) >> 8;
- r600_dirty_all_framebuffer_states(rscreen);
- p_atomic_inc(&rscreen->dirty_tex_descriptor_counter);
+ p_atomic_inc(&rscreen->dirty_tex_counter);
rctx->num_alloc_tex_transfer_bytes += rtex->size;
}
@@ -1517,6 +1600,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
bool use_staging_texture = false;
assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER));
+ assert(box->width && box->height && box->depth);
/* Depth textures use staging unconditionally. */
if (!rtex->is_depth) {
@@ -1539,17 +1623,18 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
/* Tiled textures need to be converted into a linear texture for CPU
* access. The staging texture is always linear and is placed in GART.
*
- * Reading from VRAM is slow, always use the staging texture in
- * this case.
+ * Reading from VRAM or GTT WC is slow; always use the staging
+ * texture in this case.
*
* Use the staging texture for uploads if the underlying BO
* is busy.
*/
- if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D)
+ if (!rtex->surface.is_linear)
use_staging_texture = true;
else if (usage & PIPE_TRANSFER_READ)
- use_staging_texture = (rtex->resource.domains &
- RADEON_DOMAIN_VRAM) != 0;
+ use_staging_texture =
+ rtex->resource.domains & RADEON_DOMAIN_VRAM ||
+ rtex->resource.flags & RADEON_FLAG_GTT_WC;
/* Write & linear only: */
else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf,
RADEON_USAGE_READWRITE) ||
@@ -1567,7 +1652,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
trans = CALLOC_STRUCT(r600_transfer);
if (!trans)
return NULL;
- trans->transfer.resource = texture;
+ pipe_resource_reference(&trans->transfer.resource, texture);
trans->transfer.level = level;
trans->transfer.usage = usage;
trans->transfer.box = *box;
@@ -1609,8 +1694,12 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
0, 0, 0, box->depth, 0, 0);
pipe_resource_reference(&temp, NULL);
}
- }
- else {
+
+ /* Just get the strides. */
+ r600_texture_get_offset(rctx->screen, staging_depth, level, NULL,
+ &trans->transfer.stride,
+ &trans->transfer.layer_stride);
+ } else {
/* XXX: only readback the rectangle which is being mapped? */
/* XXX: when discard is true, no need to read back from depth texture */
if (!r600_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
@@ -1624,11 +1713,12 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
box->z, box->z + box->depth - 1,
0, 0);
- offset = r600_texture_get_offset(staging_depth, level, box);
+ offset = r600_texture_get_offset(rctx->screen, staging_depth,
+ level, box,
+ &trans->transfer.stride,
+ &trans->transfer.layer_stride);
}
- trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes;
- trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size;
trans->staging = (struct r600_resource*)staging_depth;
buf = trans->staging;
} else if (use_staging_texture) {
@@ -1648,8 +1738,11 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
return NULL;
}
trans->staging = &staging->resource;
- trans->transfer.stride = staging->surface.level[0].pitch_bytes;
- trans->transfer.layer_stride = staging->surface.level[0].slice_size;
+
+ /* Just get the strides. */
+ r600_texture_get_offset(rctx->screen, staging, 0, NULL,
+ &trans->transfer.stride,
+ &trans->transfer.layer_stride);
if (usage & PIPE_TRANSFER_READ)
r600_copy_to_staging_texture(ctx, trans);
@@ -1659,9 +1752,9 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
buf = trans->staging;
} else {
/* the resource is mapped directly */
- trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
- trans->transfer.layer_stride = rtex->surface.level[level].slice_size;
- offset = r600_texture_get_offset(rtex, level, box);
+ offset = r600_texture_get_offset(rctx->screen, rtex, level, box,
+ &trans->transfer.stride,
+ &trans->transfer.layer_stride);
buf = &rtex->resource;
}
@@ -1717,6 +1810,7 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx,
rctx->num_alloc_tex_transfer_bytes = 0;
}
+ pipe_resource_reference(&transfer->resource, NULL);
FREE(transfer);
}
@@ -1813,15 +1907,26 @@ bool vi_dcc_formats_compatible(enum pipe_format format1,
type1 == type2;
}
-void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx,
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format)
+{
+ struct r600_texture *rtex = (struct r600_texture *)tex;
+
+ return vi_dcc_enabled(rtex, level) &&
+ !vi_dcc_formats_compatible(tex->format, view_format);
+}
+
+/* This can't be merged with the above function, because
+ * vi_dcc_formats_compatible should be called only when DCC is enabled. */
+void vi_disable_dcc_if_incompatible_format(struct r600_common_context *rctx,
struct pipe_resource *tex,
unsigned level,
enum pipe_format view_format)
{
struct r600_texture *rtex = (struct r600_texture *)tex;
- if (rtex->dcc_offset &&
- rtex->surface.level[level].dcc_enabled &&
+ if (vi_dcc_enabled(rtex, level) &&
!vi_dcc_formats_compatible(tex->format, view_format))
if (!r600_texture_disable_dcc(rctx, (struct r600_texture*)tex))
rctx->decompress_dcc(&rctx->b, rtex);
@@ -1830,10 +1935,9 @@ void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx,
struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
struct pipe_resource *texture,
const struct pipe_surface *templ,
+ unsigned width0, unsigned height0,
unsigned width, unsigned height)
{
- struct r600_common_context *rctx = (struct r600_common_context*)pipe;
- struct r600_texture *rtex = (struct r600_texture*)texture;
struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
if (!surface)
@@ -1849,13 +1953,14 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
surface->base.width = width;
surface->base.height = height;
surface->base.u = templ->u;
- surface->level_info = &rtex->surface.level[templ->u.tex.level];
- if (texture->target != PIPE_BUFFER)
- vi_dcc_disable_if_incompatible_format(rctx, texture,
- templ->u.tex.level,
- templ->format);
+ surface->width0 = width0;
+ surface->height0 = height0;
+ surface->dcc_incompatible =
+ texture->target != PIPE_BUFFER &&
+ vi_dcc_formats_are_incompatible(texture, templ->u.tex.level,
+ templ->format);
return &surface->base;
}
@@ -1866,6 +1971,8 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
unsigned level = templ->u.tex.level;
unsigned width = u_minify(tex->width0, level);
unsigned height = u_minify(tex->height0, level);
+ unsigned width0 = tex->width0;
+ unsigned height0 = tex->height0;
if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
const struct util_format_description *tex_desc
@@ -1884,10 +1991,15 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
width = nblks_x * templ_desc->block.width;
height = nblks_y * templ_desc->block.height;
+
+ width0 = util_format_get_nblocksx(tex->format, width0);
+ height0 = util_format_get_nblocksy(tex->format, height0);
}
}
- return r600_create_surface_custom(pipe, tex, templ, width, height);
+ return r600_create_surface_custom(pipe, tex, templ,
+ width0, height0,
+ width, height);
}
static void r600_surface_destroy(struct pipe_context *pipe,
@@ -2157,7 +2269,7 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
if (!tex->resource.is_shared ||
!(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
tex->resource.b.b.target != PIPE_TEXTURE_2D ||
- tex->surface.last_level > 0 ||
+ tex->resource.b.b.last_level > 0 ||
!tex->surface.dcc_size)
return;
@@ -2173,7 +2285,7 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
if (!vi_should_enable_separate_dcc(tex))
return; /* stats show that DCC decompression is too expensive */
- assert(tex->surface.level[0].dcc_enabled);
+ assert(tex->surface.num_dcc_levels);
assert(!tex->dcc_separate_buffer);
r600_texture_discard_cmask(rctx->screen, tex);
@@ -2186,7 +2298,8 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
tex->last_dcc_separate_buffer = NULL;
} else {
tex->dcc_separate_buffer = (struct r600_resource*)
- r600_aligned_buffer_create(rctx->b.screen, 0,
+ r600_aligned_buffer_create(rctx->b.screen,
+ R600_RESOURCE_FLAG_UNMAPPABLE,
PIPE_USAGE_DEFAULT,
tex->surface.dcc_size,
tex->surface.dcc_alignment);
@@ -2272,7 +2385,7 @@ static void evergreen_set_clear_color(struct r600_texture *rtex,
memset(&uc, 0, sizeof(uc));
- if (util_format_get_blocksizebits(surface_format) == 128) {
+ if (rtex->surface.bpe == 16) {
/* DCC fast clear only:
* CLEAR_WORD0 = R = G = B
* CLEAR_WORD1 = A
@@ -2386,9 +2499,9 @@ void vi_dcc_clear_level(struct r600_common_context *rctx,
unsigned level, unsigned clear_value)
{
struct pipe_resource *dcc_buffer;
- uint64_t dcc_offset;
+ uint64_t dcc_offset, clear_size;
- assert(rtex->dcc_offset && rtex->surface.level[level].dcc_enabled);
+ assert(vi_dcc_enabled(rtex, level));
if (rtex->dcc_separate_buffer) {
dcc_buffer = &rtex->dcc_separate_buffer->b.b;
@@ -2398,10 +2511,18 @@ void vi_dcc_clear_level(struct r600_common_context *rctx,
dcc_offset = rtex->dcc_offset;
}
- dcc_offset += rtex->surface.level[level].dcc_offset;
+ if (rctx->chip_class >= GFX9) {
+ /* Mipmap level clears aren't implemented. */
+ assert(rtex->resource.b.b.last_level == 0);
+ /* MSAA needs a different clear size. */
+ assert(rtex->resource.b.b.nr_samples <= 1);
+ clear_size = rtex->surface.dcc_size;
+ } else {
+ dcc_offset += rtex->surface.u.legacy.level[level].dcc_offset;
+ clear_size = rtex->surface.u.legacy.level[level].dcc_fast_clear_size;
+ }
- rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset,
- rtex->surface.level[level].dcc_fast_clear_size,
+ rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset, clear_size,
clear_value, R600_COHERENCY_CB_META);
}
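
Condensing the branch above: GFX9 clears the entire DCC surface (per-level clears are unimplemented there), while older chips clear one level's fast-clear window. A hedged sketch with abridged struct fields (names hypothetical):

#include <stdint.h>

/* Hypothetical sketch of the DCC clear-range selection above. */
struct dcc_level_info { uint64_t dcc_offset, dcc_fast_clear_size; };

static void dcc_clear_range(int is_gfx9, uint64_t base_dcc_offset,
                            uint64_t dcc_size,
                            const struct dcc_level_info *level,
                            uint64_t *offset, uint64_t *size)
{
	if (is_gfx9) {
		*offset = base_dcc_offset; /* whole-surface clear */
		*size = dcc_size;
	} else {
		*offset = base_dcc_offset + level->dcc_offset;
		*size = level->dcc_fast_clear_size;
	}
}
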
@@ -2413,27 +2534,59 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen,
struct r600_texture *rtex)
{
if (rtex->resource.is_shared ||
- rtex->surface.nsamples <= 1 ||
+ rtex->resource.b.b.nr_samples <= 1 ||
rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode)
return;
- assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_2D);
- assert(rtex->surface.last_level == 0);
+ assert(rscreen->chip_class >= GFX9 ||
+ rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+ assert(rtex->resource.b.b.last_level == 0);
+
+ if (rscreen->chip_class >= GFX9) {
+ /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
+ assert(rtex->surface.u.gfx9.surf.swizzle_mode >= 4);
+
+ /* If you do swizzle_mode % 4, you'll get:
+ * 0 = Depth
+ * 1 = Standard
+ * 2 = Displayable
+ * 3 = Rotated
+ *
+ * Depth-sample order isn't allowed here.
+ */
+ assert(rtex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
- /* These magic numbers were copied from addrlib. It doesn't use any
- * definitions for them either. They are all 2D_TILED_THIN1 modes with
- * different bpp and micro tile mode.
- */
- if (rscreen->chip_class >= CIK) {
switch (rtex->last_msaa_resolve_target_micro_mode) {
- case 0: /* displayable */
- rtex->surface.tiling_index[0] = 10;
+ case RADEON_MICRO_MODE_DISPLAY:
+ rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ rtex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
break;
- case 1: /* thin */
- rtex->surface.tiling_index[0] = 14;
+ case RADEON_MICRO_MODE_THIN:
+ rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ rtex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
break;
- case 3: /* rotated */
- rtex->surface.tiling_index[0] = 28;
+ case RADEON_MICRO_MODE_ROTATED:
+ rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ rtex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+ break;
+ default: /* depth */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else if (rscreen->chip_class >= CIK) {
+ /* These magic numbers were copied from addrlib. It doesn't use
+ * any definitions for them either. They are all 2D_TILED_THIN1
+ * modes with different bpp and micro tile mode.
+ */
+ switch (rtex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ rtex->surface.u.legacy.tiling_index[0] = 10;
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ rtex->surface.u.legacy.tiling_index[0] = 14;
+ break;
+ case RADEON_MICRO_MODE_ROTATED:
+ rtex->surface.u.legacy.tiling_index[0] = 28;
break;
default: /* depth, thick */
assert(!"unexpected micro mode");
@@ -2441,32 +2594,32 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen,
}
} else { /* SI */
switch (rtex->last_msaa_resolve_target_micro_mode) {
- case 0: /* displayable */
+ case RADEON_MICRO_MODE_DISPLAY:
switch (rtex->surface.bpe) {
case 1:
- rtex->surface.tiling_index[0] = 10;
+ rtex->surface.u.legacy.tiling_index[0] = 10;
break;
case 2:
- rtex->surface.tiling_index[0] = 11;
+ rtex->surface.u.legacy.tiling_index[0] = 11;
break;
default: /* 4, 8 */
- rtex->surface.tiling_index[0] = 12;
+ rtex->surface.u.legacy.tiling_index[0] = 12;
break;
}
break;
- case 1: /* thin */
+ case RADEON_MICRO_MODE_THIN:
switch (rtex->surface.bpe) {
case 1:
- rtex->surface.tiling_index[0] = 14;
+ rtex->surface.u.legacy.tiling_index[0] = 14;
break;
case 2:
- rtex->surface.tiling_index[0] = 15;
+ rtex->surface.u.legacy.tiling_index[0] = 15;
break;
case 4:
- rtex->surface.tiling_index[0] = 16;
+ rtex->surface.u.legacy.tiling_index[0] = 16;
break;
default: /* 8, 16 */
- rtex->surface.tiling_index[0] = 17;
+ rtex->surface.u.legacy.tiling_index[0] = 17;
break;
}
break;
@@ -2478,8 +2631,7 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen,
rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode;
- p_atomic_inc(&rscreen->dirty_fb_counter);
- p_atomic_inc(&rscreen->dirty_tex_descriptor_counter);
+ p_atomic_inc(&rscreen->dirty_tex_counter);
}
void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
@@ -2523,7 +2675,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
}
/* only supported on tiled surfaces */
- if (tex->surface.level[0].mode < RADEON_SURF_MODE_1D) {
+ if (tex->surface.is_linear) {
continue;
}
@@ -2536,8 +2688,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
continue;
/* fast color clear with 1D tiling doesn't work on old kernels and CIK */
- if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
- rctx->chip_class >= CIK &&
+ if (rctx->chip_class == CIK &&
+ tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
rctx->screen->info.drm_major == 2 &&
rctx->screen->info.drm_minor < 38) {
continue;
@@ -2550,9 +2702,10 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
!(rctx->screen->debug_flags & DBG_NO_DCC_FB)) {
vi_separate_dcc_try_enable(rctx, tex);
- /* Stoney can't do a CMASK-based clear, so all clears are
- * considered to be hypothetically slow clears, which
- * is weighed when determining to enable separate DCC.
+ /* A CMASK-only clear isn't supported with RB+ on Stoney,
+ * so all clears are considered hypothetically slow
+ * clears, which is weighed when determining whether to
+ * enable separate DCC.
*/
if (tex->dcc_gather_statistics &&
rctx->family == CHIP_STONEY)
@@ -2560,10 +2713,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
}
/* Try to clear DCC first, otherwise try CMASK. */
- if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
+ if (vi_dcc_enabled(tex, 0)) {
uint32_t reset_value;
bool clear_words_needed;
+ /* TODO: fix DCC clear */
+ if (rctx->chip_class >= GFX9)
+ continue;
+
if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR)
continue;
@@ -2574,16 +2731,23 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
vi_dcc_clear_level(rctx, tex, 0, reset_value);
- if (clear_words_needed)
- tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+ unsigned level_bit = 1 << fb->cbufs[i]->u.tex.level;
+ if (clear_words_needed) {
+ bool need_compressed_update = !tex->dirty_level_mask;
+
+ tex->dirty_level_mask |= level_bit;
+
+ if (need_compressed_update)
+ p_atomic_inc(&rctx->screen->compressed_colortex_counter);
+ }
tex->separate_dcc_dirty = true;
} else {
/* 128-bit formats are unsupported */
- if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) {
+ if (tex->surface.bpe > 8) {
continue;
}
- /* Stoney/RB+ doesn't work with CMASK fast clear. */
+ /* RB+ doesn't work with CMASK fast clear on Stoney. */
if (rctx->family == CHIP_STONEY)
continue;
@@ -2598,7 +2762,12 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
tex->cmask.offset, tex->cmask.size, 0,
R600_COHERENCY_CB_META);
+ bool need_compressed_update = !tex->dirty_level_mask;
+
tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+
+ if (need_compressed_update)
+ p_atomic_inc(&rctx->screen->compressed_colortex_counter);
}
/* We can change the micro tile mode before a full clear. */
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
index fb1491a28..d5352d9de 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
@@ -91,6 +91,12 @@ struct ruvd_decoder {
bool use_legacy;
struct rvid_buffer ctx;
struct rvid_buffer sessionctx;
+ struct {
+ unsigned data0;
+ unsigned data1;
+ unsigned cmd;
+ unsigned cntl;
+ } reg;
};
/* flush IB to the hardware */
@@ -120,14 +126,14 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
uint64_t addr;
addr = dec->ws->buffer_get_virtual_address(buf);
addr = addr + off;
- set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
- set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
+ set_reg(dec, dec->reg.data0, addr);
+ set_reg(dec, dec->reg.data1, addr >> 32);
} else {
off += dec->ws->buffer_get_reloc_offset(buf);
set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
}
- set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1);
+ set_reg(dec, dec->reg.cmd, cmd << 1);
}
/* does the codec need an IT buffer? */
@@ -151,6 +157,8 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
/* calc buffer offsets */
dec->msg = (struct ruvd_msg *)ptr;
+ memset(dec->msg, 0, sizeof(*dec->msg));
+
dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
if (have_it(dec))
dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size);
@@ -322,6 +330,14 @@ static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_
return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size;
}
+static unsigned get_db_pitch_alignment(struct ruvd_decoder *dec)
+{
+ if (((struct r600_common_screen*)dec->screen)->family < CHIP_VEGA10)
+ return 16;
+ else
+ return 32;
+}
+
/* calculate size of reference picture buffer */
static unsigned calc_dpb_size(struct ruvd_decoder *dec)
{
@@ -335,7 +351,7 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
unsigned max_references = dec->base.max_references + 1;
// aligned size of a single frame
- image_size = width * height;
+ image_size = align(width, get_db_pitch_alignment(dec)) * height;
image_size += image_size / 2;
image_size = align(image_size, 1024);
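
The image size above is plain NV12 math, a full-resolution luma plane plus a half-size interleaved chroma plane, with the row width now rounded up to the DB pitch alignment (16 before Vega10, 32 from Vega10 on). A worked example:

#include <stdio.h>

static unsigned align_u(unsigned v, unsigned a) { return (v + a - 1) / a * a; }

int main(void)
{
	unsigned width = 1920, height = 1088;
	unsigned db_pitch_align = 32;                  /* Vega10+; 16 before */

	unsigned image_size = align_u(width, db_pitch_align) * height; /* luma */
	image_size += image_size / 2;                  /* NV12 chroma plane */
	image_size = align_u(image_size, 1024);

	printf("image_size=%u\n", image_size);         /* 3133440 bytes */
	return 0;
}
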
@@ -410,9 +426,9 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
width = align (width, 16);
height = align (height, 16);
if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
- dpb_size = align((width * height * 9) / 4, 256) * max_references;
+ dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 9) / 4, 256) * max_references;
else
- dpb_size = align((width * height * 3) / 2, 256) * max_references;
+ dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 3) / 2, 256) * max_references;
break;
case PIPE_VIDEO_FORMAT_VC1:
@@ -478,6 +494,7 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
memset(&result, 0, sizeof(result));
switch (pic->base.profile) {
case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE:
result.profile = RUVD_H264_PROFILE_BASELINE;
break;
@@ -703,13 +720,16 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video
result.direct_reflist[i][j] = pic->RefPicList[i][j];
}
- if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) &&
- (target->buffer_format == PIPE_FORMAT_NV12)) {
- result.p010_mode = 0;
- result.luma_10to8 = 5;
- result.chroma_10to8 = 5;
- result.sclr_luma10to8 = 4;
- result.sclr_chroma10to8 = 4;
+ if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) {
+ if (target->buffer_format == PIPE_FORMAT_P016) {
+ result.p010_mode = 1;
+ result.msb_mode = 1;
+ } else {
+ result.luma_10to8 = 5;
+ result.chroma_10to8 = 5;
+ result.sclr_luma10to8 = 4;
+ result.sclr_chroma10to8 = 4;
+ }
}
/* TODO
@@ -931,7 +951,6 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
assert(decoder);
map_msg_fb_it_buf(dec);
- memset(dec->msg, 0, sizeof(*dec->msg));
dec->msg->size = sizeof(*dec->msg);
dec->msg->msg_type = RUVD_MSG_DESTROY;
dec->msg->stream_handle = dec->stream_handle;
@@ -1074,7 +1093,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
dec->msg->body.decode.bsd_size = bs_size;
- dec->msg->body.decode.db_pitch = align(dec->base.width, 16);
+ dec->msg->body.decode.db_pitch = align(dec->base.width, get_db_pitch_alignment(dec));
if (dec->stream_type == RUVD_CODEC_H264_PERF &&
((struct r600_common_screen*)dec->screen)->family >= CHIP_POLARIS10)
@@ -1146,7 +1165,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
if (have_it(dec))
send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf,
FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
- set_reg(dec, RUVD_ENGINE_CNTL, 1);
+ set_reg(dec, dec->reg.cntl, 1);
flush(dec, RADEON_FLUSH_ASYNC);
next_buffer(dec);
@@ -1280,6 +1299,18 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
rvid_clear_buffer(context, &dec->sessionctx);
}
+ if (info.family >= CHIP_VEGA10) {
+ dec->reg.data0 = RUVD_GPCOM_VCPU_DATA0_SOC15;
+ dec->reg.data1 = RUVD_GPCOM_VCPU_DATA1_SOC15;
+ dec->reg.cmd = RUVD_GPCOM_VCPU_CMD_SOC15;
+ dec->reg.cntl = RUVD_ENGINE_CNTL_SOC15;
+ } else {
+ dec->reg.data0 = RUVD_GPCOM_VCPU_DATA0;
+ dec->reg.data1 = RUVD_GPCOM_VCPU_DATA1;
+ dec->reg.cmd = RUVD_GPCOM_VCPU_CMD;
+ dec->reg.cntl = RUVD_ENGINE_CNTL;
+ }
+
map_msg_fb_it_buf(dec);
dec->msg->size = sizeof(*dec->msg);
dec->msg->msg_type = RUVD_MSG_CREATE;
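
With the register block chosen once at creation, every later command write goes through the stored offsets instead of hard-coded pre-Vega values. A minimal standalone sketch of the pattern; set_reg() is stubbed out here, and only the SOC15 offsets are taken from the radeon_uvd.h defines:

#include <stdio.h>

/* Hypothetical standalone model of the register indirection above:
 * MMIO offsets are picked once per ASIC family and every write goes
 * through the stored block. */
struct reg_block { unsigned data0, data1, cmd, cntl; };

static void set_reg(unsigned reg, unsigned val)
{
	printf("write 0x%05x = 0x%08x\n", reg, val); /* stands in for the IB write */
}

int main(void)
{
	/* SOC15 (Vega10+) offsets from radeon_uvd.h; pre-Vega parts use
	 * the legacy 0xEFxx block instead. */
	struct reg_block reg = { 0x20710, 0x20714, 0x2070c, 0x20718 };
	unsigned long long addr = 0x1234567890ULL;   /* example GPU address */

	set_reg(reg.data0, (unsigned)addr);          /* low 32 address bits */
	set_reg(reg.data1, (unsigned)(addr >> 32));  /* high 32 address bits */
	set_reg(reg.cmd, 1 << 1);                    /* command, shifted as in send_cmd */
	return 0;
}
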
@@ -1315,10 +1346,20 @@ error:
}
/* calculate top/bottom offset */
-static unsigned texture_offset(struct radeon_surf *surface, unsigned layer)
+static unsigned texture_offset(struct radeon_surf *surface, unsigned layer,
+ enum ruvd_surface_type type)
{
- return surface->level[0].offset +
- layer * surface->level[0].slice_size;
+ switch (type) {
+ default:
+ case RUVD_SURFACE_TYPE_LEGACY:
+ return surface->u.legacy.level[0].offset +
+ layer * surface->u.legacy.level[0].slice_size;
+ break;
+ case RUVD_SURFACE_TYPE_GFX9:
+ return surface->u.gfx9.surf_offset +
+ layer * surface->u.gfx9.surf_slice_size;
+ break;
+ }
}
/* hw encode the aspect of macro tiles */
@@ -1351,42 +1392,63 @@ static unsigned bank_wh(unsigned bankwh)
* fill decoding target field from the luma and chroma surfaces
*/
void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma,
- struct radeon_surf *chroma)
+ struct radeon_surf *chroma, enum ruvd_surface_type type)
{
- msg->body.decode.dt_pitch = luma->level[0].pitch_bytes;
- switch (luma->level[0].mode) {
- case RADEON_SURF_MODE_LINEAR_ALIGNED:
- msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR;
- msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR;
- break;
- case RADEON_SURF_MODE_1D:
- msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8;
- msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_1D_THIN;
- break;
- case RADEON_SURF_MODE_2D:
- msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8;
- msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_2D_THIN;
- break;
+ switch (type) {
default:
- assert(0);
- break;
- }
+ case RUVD_SURFACE_TYPE_LEGACY:
+ msg->body.decode.dt_pitch = luma->u.legacy.level[0].nblk_x;
+ switch (luma->u.legacy.level[0].mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED:
+ msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR;
+ msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR;
+ break;
+ case RADEON_SURF_MODE_1D:
+ msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8;
+ msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_1D_THIN;
+ break;
+ case RADEON_SURF_MODE_2D:
+ msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8;
+ msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_2D_THIN;
+ break;
+ default:
+ assert(0);
+ break;
+ }
- msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0);
- msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0);
- if (msg->body.decode.dt_field_mode) {
- msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1);
- msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1);
- } else {
- msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset;
- msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset;
- }
+ msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0, type);
+ msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0, type);
+ if (msg->body.decode.dt_field_mode) {
+ msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1, type);
+ msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1, type);
+ } else {
+ msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset;
+ msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset;
+ }
- assert(luma->bankw == chroma->bankw);
- assert(luma->bankh == chroma->bankh);
- assert(luma->mtilea == chroma->mtilea);
+ assert(luma->u.legacy.bankw == chroma->u.legacy.bankw);
+ assert(luma->u.legacy.bankh == chroma->u.legacy.bankh);
+ assert(luma->u.legacy.mtilea == chroma->u.legacy.mtilea);
- msg->body.decode.dt_surf_tile_config |= RUVD_BANK_WIDTH(bank_wh(luma->bankw));
- msg->body.decode.dt_surf_tile_config |= RUVD_BANK_HEIGHT(bank_wh(luma->bankh));
- msg->body.decode.dt_surf_tile_config |= RUVD_MACRO_TILE_ASPECT_RATIO(macro_tile_aspect(luma->mtilea));
+ msg->body.decode.dt_surf_tile_config |= RUVD_BANK_WIDTH(bank_wh(luma->u.legacy.bankw));
+ msg->body.decode.dt_surf_tile_config |= RUVD_BANK_HEIGHT(bank_wh(luma->u.legacy.bankh));
+ msg->body.decode.dt_surf_tile_config |= RUVD_MACRO_TILE_ASPECT_RATIO(macro_tile_aspect(luma->u.legacy.mtilea));
+ break;
+ case RUVD_SURFACE_TYPE_GFX9:
+ msg->body.decode.dt_pitch = luma->u.gfx9.surf_pitch * luma->bpe;
+ /* SWIZZLE LINEAR MODE */
+ msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR;
+ msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR;
+ msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0, type);
+ msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0, type);
+ if (msg->body.decode.dt_field_mode) {
+ msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1, type);
+ msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1, type);
+ } else {
+ msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset;
+ msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset;
+ }
+ msg->body.decode.dt_surf_tile_config = 0;
+ break;
+ }
}
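
For reference, the two branches above derive the decode-target pitch from different layout descriptions. A minimal standalone sketch, using stand-in structs rather than the real Mesa types (field names match the patch):

    #include <stdint.h>

    /* stand-in types; in Mesa these live in radeon_winsys.h */
    struct legacy_level { uint16_t nblk_x; };     /* blocks per row */
    struct gfx9_layout  { uint16_t surf_pitch; }; /* blocks per row */

    /* pre-GFX9 path: the pitch is taken straight from level 0 blocks */
    static uint32_t dt_pitch_legacy(struct legacy_level lvl0)
    {
        return lvl0.nblk_x;
    }

    /* GFX9 path: surf_pitch is in blocks, scaled to bytes by bpe */
    static uint32_t dt_pitch_gfx9(struct gfx9_layout g, uint32_t bpe)
    {
        return (uint32_t)g.surf_pitch * bpe;
    }
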
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
index e3f8504d8..0c3797e22 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
@@ -56,6 +56,11 @@
#define RUVD_GPCOM_VCPU_DATA1 0xEF14
#define RUVD_ENGINE_CNTL 0xEF18
+#define RUVD_GPCOM_VCPU_CMD_SOC15 0x2070c
+#define RUVD_GPCOM_VCPU_DATA0_SOC15 0x20710
+#define RUVD_GPCOM_VCPU_DATA1_SOC15 0x20714
+#define RUVD_ENGINE_CNTL_SOC15 0x20718
+
/* UVD commands to VCPU */
#define RUVD_CMD_MSG_BUFFER 0x00000000
#define RUVD_CMD_DPB_BUFFER 0x00000001
@@ -111,6 +116,11 @@
#define RUVD_VC1_PROFILE_MAIN 0x00000001
#define RUVD_VC1_PROFILE_ADVANCED 0x00000002
+enum ruvd_surface_type {
+ RUVD_SURFACE_TYPE_LEGACY = 0,
+ RUVD_SURFACE_TYPE_GFX9
+};
+
struct ruvd_mvc_element {
uint16_t viewOrderIndex;
uint16_t viewId;
@@ -432,5 +442,5 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
/* fill decoding target field from the luma and chroma surfaces */
void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma,
- struct radeon_surf *chroma);
+ struct radeon_surf *chroma, enum ruvd_surface_type type);
#endif
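
The *_SOC15 defines added above mirror the legacy UVD mailbox registers at the MMIO offsets used by SOC15 (GFX9-era) parts. A hedged selector sketch; the boolean predicate is an assumption for illustration, not part of this patch:

    #include <stdbool.h>
    #include <stdint.h>

    static uint32_t ruvd_engine_cntl_reg(bool is_soc15)
    {
        /* same mailbox register, relocated on SOC15 register maps */
        return is_soc15 ? 0x20718  /* RUVD_ENGINE_CNTL_SOC15 */
                        : 0xEF18;  /* RUVD_ENGINE_CNTL       */
    }
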
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
index ef93e46c1..70c1e60f5 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
@@ -52,6 +52,7 @@
#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))
#define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8))
+#define FW_53_19_4 ((53 << 24) | (19 << 16) | (4 << 8))
/**
* flush commands to the hardware
@@ -178,14 +179,15 @@ static unsigned get_cpb_num(struct rvce_encoder *enc)
case 41:
dpb = 32768;
break;
- default:
case 42:
dpb = 34816;
break;
case 50:
dpb = 110400;
break;
+ default:
case 51:
+ case 52:
dpb = 184320;
break;
}
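
Two things worth unpacking in this hunk: the FW_x_y_z macros pack (major, minor, revision) into one comparable word, and the dpb values correspond to the H.264 MaxDpbMbs limits per level. A worked check:

    #include <assert.h>

    #define FW_VER(maj, min, rev) (((maj) << 24) | ((min) << 16) | ((rev) << 8))

    int main(void)
    {
        /* FW_53_19_4 as defined above */
        assert(FW_VER(53, 19, 4) == 0x35130400);

        /* level 5.1: dpb = 184320 macroblocks; a 1920x1088-aligned frame
         * is 120 * 68 = 8160 MBs, so at most 22 frames fit before any
         * driver-side clamp on the CPB count. */
        assert(184320 / (120 * 68) == 22);
        return 0;
    }
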
@@ -223,9 +225,17 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
signed *luma_offset, signed *chroma_offset)
{
- unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
- unsigned vpitch = align(enc->luma->npix_y, 16);
- unsigned fsize = pitch * (vpitch + vpitch / 2);
+ struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen;
+ unsigned pitch, vpitch, fsize;
+
+ if (rscreen->chip_class < GFX9) {
+ pitch = align(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe, 128);
+ vpitch = align(enc->luma->u.legacy.level[0].nblk_y, 16);
+ } else {
+ pitch = align(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe, 256);
+ vpitch = align(enc->luma->u.gfx9.surf_height, 16);
+ }
+ fsize = pitch * (vpitch + vpitch / 2);
*luma_offset = slot->index * fsize;
*chroma_offset = *luma_offset + pitch * vpitch;
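
Worked numbers for the pre-GFX9 branch above, assuming an NV12 1080p surface (luma bpe = 1; values illustrative):

    #include <assert.h>

    #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned pitch  = ALIGN(1920 * 1, 128);        /* 1920 */
        unsigned vpitch = ALIGN(1080, 16);             /* 1088 */
        unsigned fsize  = pitch * (vpitch + vpitch / 2);

        assert(fsize == 3133440);  /* ~2.99 MiB per CPB slot */
        /* slot N: luma at N * fsize, chroma at N * fsize + pitch * vpitch */
        return 0;
    }
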
@@ -412,7 +422,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
enc->use_vui = true;
if (rscreen->info.family >= CHIP_TONGA &&
rscreen->info.family != CHIP_STONEY &&
- rscreen->info.family != CHIP_POLARIS11)
+ rscreen->info.family != CHIP_POLARIS11 &&
+ rscreen->info.family != CHIP_POLARIS12)
enc->dual_pipe = true;
/* TODO enable B frame with dual instance */
if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -454,8 +465,14 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
goto error;
get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf);
- cpb_size = align(tmp_surf->level[0].pitch_bytes, 128);
- cpb_size = cpb_size * align(tmp_surf->npix_y, 32);
+
+ cpb_size = (rscreen->chip_class < GFX9) ?
+ align(tmp_surf->u.legacy.level[0].nblk_x * tmp_surf->bpe, 128) *
+ align(tmp_surf->u.legacy.level[0].nblk_y, 32) :
+
+ align(tmp_surf->u.gfx9.surf_pitch * tmp_surf->bpe, 256) *
+ align(tmp_surf->u.gfx9.surf_height, 32);
+
cpb_size = cpb_size * 3 / 2;
cpb_size = cpb_size * enc->cpb_num;
if (enc->dual_pipe)
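
Continuing the same 1080p numbers through the cpb_size computation above (pre-GFX9 branch; cpb_num = 16 is an assumed example value):

    /* align(1920 * 1, 128) * align(1080, 32) = 1920 * 1088 = 2088960
     * * 3 / 2 (NV12: luma + half-size chroma) = 3133440 bytes per slot
     * * cpb_num (e.g. 16)                     = 50135040 bytes (~47.8 MiB),
     * before the dual-pipe allowance added by the branch just above
     * (its body lies outside this hunk). */
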
@@ -493,6 +510,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
radeon_vce_52_init(enc);
get_pic_param = radeon_vce_52_get_param;
break;
+ case FW_53_19_4:
+ radeon_vce_52_init(enc);
+ get_pic_param = radeon_vce_52_get_param;
+ break;
default:
goto error;
@@ -525,6 +546,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
case FW_52_0_3:
case FW_52_4_3:
case FW_52_8_3:
+ case FW_53_19_4:
return true;
default:
return false;
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index fe15ded39..b9afd089a 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -94,9 +94,9 @@ static void create(struct rvce_encoder *enc)
RVCE_CS(0x00000000); // encPicStructRestriction
RVCE_CS(enc->base.width); // encImageWidth
RVCE_CS(enc->base.height); // encImageHeight
- RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
- RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
- RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
+ RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encRefPicLumaPitch
+ RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encRefPicChromaPitch
+ RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16) / 8); // encRefYHeightInQw
RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO
RVCE_END();
}
@@ -320,12 +320,12 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(0x00000000); // endOfSequence
RVCE_CS(0x00000000); // endOfStream
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+ enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
- RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
- RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
- RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+ enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+ RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode
RVCE_CS(0x00000000); // encInputPicTileConfig
RVCE_CS(enc->pic.picture_type); // encPicType
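
The substitutions in this hunk (and the matching one in radeon_vce_50.c below) are mechanical: the removed per-level pitch_bytes and surface-level npix_y are recomputed from the new legacy layout fields. In short:

    /* old field                 new expression
     * level[0].pitch_bytes  ->  u.legacy.level[0].nblk_x * surf->bpe
     * npix_y                ->  u.legacy.level[0].nblk_y
     * (for linear 1x1-block formats the row counts are numerically equal) */
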
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
index 262e13ba9..0d1181451 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -127,12 +127,12 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(0x00000000); // endOfSequence
RVCE_CS(0x00000000); // endOfStream
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+ enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
- RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
- RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
- RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+ enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+ RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
if (enc->dual_pipe)
RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
else
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
index 5db01fe52..36cf48047 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
@@ -167,6 +167,7 @@ void radeon_vce_52_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_pict
static void create(struct rvce_encoder *enc)
{
+ struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen;
enc->task_info(enc, 0x00000000, 0, 0, 0);
RVCE_BEGIN(0x01000001); // create cmd
@@ -177,9 +178,17 @@ static void create(struct rvce_encoder *enc)
RVCE_CS(enc->enc_pic.ec.enc_pic_struct_restriction);
RVCE_CS(enc->base.width); // encImageWidth
RVCE_CS(enc->base.height); // encImageHeight
- RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
- RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
- RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
+
+ if (rscreen->chip_class < GFX9) {
+ RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encRefPicLumaPitch
+ RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encRefPicChromaPitch
+ RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16) / 8); // encRefYHeightInQw
+ } else {
+ RVCE_CS(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe); // encRefPicLumaPitch
+ RVCE_CS(enc->chroma->u.gfx9.surf_pitch * enc->chroma->bpe); // encRefPicChromaPitch
+ RVCE_CS(align(enc->luma->u.gfx9.surf_height, 16) / 8); // encRefYHeightInQw
+ }
+
RVCE_CS(enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants);
RVCE_CS(enc->enc_pic.ec.enc_pre_encode_context_buffer_offset);
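
VCE 5.2 is the first of these interface files that must serve both generations, hence the chip_class branch. A hedged helper that would centralize the choice; the name and the bool parameter are illustrative, assuming the radeon_surf layout shown in radeon_winsys.h below:

    static unsigned rvce_pic_pitch(const struct radeon_surf *s, bool gfx9)
    {
        /* both paths yield a byte pitch: blocks per row times bpe */
        return gfx9 ? s->u.gfx9.surf_pitch * s->bpe
                    : s->u.legacy.level[0].nblk_x * s->bpe;
    }
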
@@ -191,6 +200,7 @@ static void create(struct rvce_encoder *enc)
static void encode(struct rvce_encoder *enc)
{
+ struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen;
signed luma_offset, chroma_offset, bs_offset;
unsigned dep, bs_idx = enc->bs_idx++;
int i;
@@ -239,13 +249,25 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(enc->enc_pic.eo.insert_aud);
RVCE_CS(enc->enc_pic.eo.end_of_sequence);
RVCE_CS(enc->enc_pic.eo.end_of_stream);
- RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
- RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
- enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
- RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
- RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
- RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+
+ if (rscreen->chip_class < GFX9) {
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+ RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
+ } else {
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->luma->u.gfx9.surf_offset); // inputPictureLumaAddressHi/Lo
+ RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+ enc->chroma->u.gfx9.surf_offset); // inputPictureChromaAddressHi/Lo
+ RVCE_CS(align(enc->luma->u.gfx9.surf_height, 16)); // encInputFrameYPitch
+ RVCE_CS(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe); // encInputPicLumaPitch
+ RVCE_CS(enc->chroma->u.gfx9.surf_pitch * enc->chroma->bpe); // encInputPicChromaPitch
+ }
+
if (enc->dual_pipe)
enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00000000;
else
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
index de8e11cd8..c7ad7f7a3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
@@ -72,7 +72,7 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer,
* non-sub-allocated buffer.
*/
buffer->res = (struct r600_resource *)
- pipe_buffer_create(screen, PIPE_BIND_CUSTOM | PIPE_BIND_SHARED,
+ pipe_buffer_create(screen, PIPE_BIND_SHARED,
usage, size);
return buffer->res != NULL;
@@ -129,8 +129,8 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
{
struct r600_common_context *rctx = (struct r600_common_context*)context;
- rctx->clear_buffer(context, &buffer->res->b.b, 0, buffer->res->buf->size,
- 0, R600_COHERENCY_NONE);
+ rctx->dma_clear_buffer(context, &buffer->res->b.b, 0,
+ buffer->res->buf->size, 0);
context->flush(context, NULL, 0);
}
@@ -138,26 +138,31 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
* join surfaces into the same buffer with identical tiling params
 * sum up their sizes and replace the backend buffers with a single bo
*/
-void rvid_join_surfaces(struct radeon_winsys* ws,
+void rvid_join_surfaces(struct r600_common_context *rctx,
struct pb_buffer** buffers[VL_NUM_COMPONENTS],
struct radeon_surf *surfaces[VL_NUM_COMPONENTS])
{
+ struct radeon_winsys* ws;
unsigned best_tiling, best_wh, off;
unsigned size, alignment;
struct pb_buffer *pb;
unsigned i, j;
+ ws = rctx->ws;
+
for (i = 0, best_tiling = 0, best_wh = ~0; i < VL_NUM_COMPONENTS; ++i) {
unsigned wh;
if (!surfaces[i])
continue;
- /* choose the smallest bank w/h for now */
- wh = surfaces[i]->bankw * surfaces[i]->bankh;
- if (wh < best_wh) {
- best_wh = wh;
- best_tiling = i;
+ if (rctx->chip_class < GFX9) {
+ /* choose the smallest bank w/h for now */
+ wh = surfaces[i]->u.legacy.bankw * surfaces[i]->u.legacy.bankh;
+ if (wh < best_wh) {
+ best_wh = wh;
+ best_tiling = i;
+ }
}
}
@@ -165,17 +170,22 @@ void rvid_join_surfaces(struct radeon_winsys* ws,
if (!surfaces[i])
continue;
- /* copy the tiling parameters */
- surfaces[i]->bankw = surfaces[best_tiling]->bankw;
- surfaces[i]->bankh = surfaces[best_tiling]->bankh;
- surfaces[i]->mtilea = surfaces[best_tiling]->mtilea;
- surfaces[i]->tile_split = surfaces[best_tiling]->tile_split;
-
/* adjust the texture layer offsets */
- off = align(off, surfaces[i]->bo_alignment);
- for (j = 0; j < ARRAY_SIZE(surfaces[i]->level); ++j)
- surfaces[i]->level[j].offset += off;
- off += surfaces[i]->bo_size;
+ off = align(off, surfaces[i]->surf_alignment);
+
+ if (rctx->chip_class < GFX9) {
+ /* copy the tiling parameters */
+ surfaces[i]->u.legacy.bankw = surfaces[best_tiling]->u.legacy.bankw;
+ surfaces[i]->u.legacy.bankh = surfaces[best_tiling]->u.legacy.bankh;
+ surfaces[i]->u.legacy.mtilea = surfaces[best_tiling]->u.legacy.mtilea;
+ surfaces[i]->u.legacy.tile_split = surfaces[best_tiling]->u.legacy.tile_split;
+
+ for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j)
+ surfaces[i]->u.legacy.level[j].offset += off;
+ } else
+ surfaces[i]->u.gfx9.surf_offset += off;
+
+ off += surfaces[i]->surf_size;
}
for (i = 0, size = 0, alignment = 0; i < VL_NUM_COMPONENTS; ++i) {
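
The placement loop above packs each component at its next surf_alignment boundary and rebases the per-level (legacy) or whole-surface (GFX9) offsets by the running total; the truncated loop that follows sums the final size. A small worked packing, with illustrative sizes:

    /* two components, 4 KiB alignment (values are examples only) */
    unsigned off = 0;
    off  = ALIGN(off, 4096);   /* luma lands at 0x000000   */
    off += 0x301000;           /* luma surf_size           */
    off  = ALIGN(off, 4096);   /* chroma lands at 0x301000 */
    off += 0x180000;           /* shared bo covers 0x481000 bytes */
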
@@ -279,7 +289,11 @@ int rvid_get_video_param(struct pipe_screen *screen,
case PIPE_VIDEO_CAP_MAX_HEIGHT:
return (rscreen->family < CHIP_TONGA) ? 1152 : 4096;
case PIPE_VIDEO_CAP_PREFERED_FORMAT:
- return PIPE_FORMAT_NV12;
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ return PIPE_FORMAT_P016;
+ else
+ return PIPE_FORMAT_NV12;
+
case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
if (rscreen->family < CHIP_PALM) {
@@ -331,6 +345,11 @@ boolean rvid_is_format_supported(struct pipe_screen *screen,
enum pipe_video_profile profile,
enum pipe_video_entrypoint entrypoint)
{
+ /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ return (format == PIPE_FORMAT_NV12) ||
+ (format == PIPE_FORMAT_P016);
+
/* we can only handle this one with UVD */
if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
return format == PIPE_FORMAT_NV12;
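
P016 is the two-plane 4:2:0 layout with 16 bits per component (10-bit HEVC samples sit in the high bits), so for HEVC Main 10 both it and NV12 remain acceptable. The new check restated as a predicate, name illustrative:

    static bool hevc10_format_ok(enum pipe_format f)
    {
        return f == PIPE_FORMAT_NV12 || f == PIPE_FORMAT_P016;
    }
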
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
index 39305b4fd..3347c4ebc 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
@@ -66,7 +66,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
/* join surfaces into the same buffer with identical tiling params
   sum up their sizes and replace the backend buffers with a single bo */
-void rvid_join_surfaces(struct radeon_winsys* ws,
+void rvid_join_surfaces(struct r600_common_context *rctx,
struct pb_buffer** buffers[VL_NUM_COMPONENTS],
struct radeon_surf *surfaces[VL_NUM_COMPONENTS]);
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
index 8946209d3..2e287c67e 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
@@ -52,7 +52,8 @@ enum radeon_bo_flag { /* bitfield */
RADEON_FLAG_GTT_WC = (1 << 0),
RADEON_FLAG_CPU_ACCESS = (1 << 1),
RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
- RADEON_FLAG_HANDLE = (1 << 3), /* the buffer most not be suballocated */
+ RADEON_FLAG_HANDLE = (1 << 3), /* the buffer must not be suballocated */
+ RADEON_FLAG_SPARSE = (1 << 4),
};
enum radeon_bo_usage { /* bitfield */
@@ -66,6 +67,8 @@ enum radeon_bo_usage { /* bitfield */
RADEON_USAGE_SYNCHRONIZED = 8
};
+#define RADEON_SPARSE_PAGE_SIZE (64 * 1024)
+
enum ring_type {
RING_GFX = 0,
RING_COMPUTE,
@@ -81,16 +84,20 @@ enum radeon_value_id {
RADEON_MAPPED_VRAM,
RADEON_MAPPED_GTT,
RADEON_BUFFER_WAIT_TIME_NS,
+ RADEON_NUM_MAPPED_BUFFERS,
RADEON_TIMESTAMP,
- RADEON_NUM_CS_FLUSHES,
+ RADEON_NUM_GFX_IBS,
+ RADEON_NUM_SDMA_IBS,
RADEON_NUM_BYTES_MOVED,
RADEON_NUM_EVICTIONS,
RADEON_VRAM_USAGE,
+ RADEON_VRAM_VIS_USAGE,
RADEON_GTT_USAGE,
RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
RADEON_CURRENT_SCLK,
RADEON_CURRENT_MCLK,
RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
+ RADEON_CS_THREAD_TIME,
};
/* Each group of four has the same priority. */
@@ -182,6 +189,7 @@ struct radeon_info {
uint32_t gart_page_size;
uint64_t gart_size;
uint64_t vram_size;
+ uint64_t vram_vis_size;
uint64_t max_alloc_size;
uint32_t min_alloc_size;
bool has_dedicated_vram;
@@ -196,6 +204,7 @@ struct radeon_info {
uint32_t ce_fw_version;
uint32_t vce_harvest_config;
uint32_t clock_crystal_freq;
+ uint32_t tcc_cache_line_size;
/* Kernel info. */
uint32_t drm_major; /* version */
@@ -231,16 +240,25 @@ struct radeon_bo_metadata {
/* Tiling flags describing the texture layout for display code
* and DRI sharing.
*/
- enum radeon_bo_layout microtile;
- enum radeon_bo_layout macrotile;
- unsigned pipe_config;
- unsigned bankw;
- unsigned bankh;
- unsigned tile_split;
- unsigned mtilea;
- unsigned num_banks;
- unsigned stride;
- bool scanout;
+ union {
+ struct {
+ enum radeon_bo_layout microtile;
+ enum radeon_bo_layout macrotile;
+ unsigned pipe_config;
+ unsigned bankw;
+ unsigned bankh;
+ unsigned tile_split;
+ unsigned mtilea;
+ unsigned num_banks;
+ unsigned stride;
+ bool scanout;
+ } legacy;
+
+ struct {
+ /* surface flags */
+ unsigned swizzle_mode:5;
+ } gfx9;
+ } u;
/* Additional metadata associated with the buffer, in bytes.
* The maximum size is 64 * 4. This is opaque for the winsys & kernel.
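
Because the tiling words now live in a per-generation union, code that fills the metadata has to branch on the target generation; a hedged sketch (the bool parameter is assumed, not part of this header):

    static void fill_md(struct radeon_bo_metadata *md, bool gfx9,
                        unsigned swizzle_mode, unsigned stride)
    {
        if (gfx9)
            md->u.gfx9.swizzle_mode = swizzle_mode;
        else
            md->u.legacy.stride = stride; /* plus the other legacy words */
    }
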
@@ -255,99 +273,151 @@ enum radeon_feature_id {
RADEON_FID_R300_CMASK_ACCESS,
};
-#define RADEON_SURF_MAX_LEVEL 32
-
-#define RADEON_SURF_TYPE_MASK 0xFF
-#define RADEON_SURF_TYPE_SHIFT 0
-#define RADEON_SURF_TYPE_1D 0
-#define RADEON_SURF_TYPE_2D 1
-#define RADEON_SURF_TYPE_3D 2
-#define RADEON_SURF_TYPE_CUBEMAP 3
-#define RADEON_SURF_TYPE_1D_ARRAY 4
-#define RADEON_SURF_TYPE_2D_ARRAY 5
-#define RADEON_SURF_MODE_MASK 0xFF
-#define RADEON_SURF_MODE_SHIFT 8
-#define RADEON_SURF_MODE_LINEAR_ALIGNED 1
-#define RADEON_SURF_MODE_1D 2
-#define RADEON_SURF_MODE_2D 3
+#define RADEON_SURF_MAX_LEVELS 15
+
+enum radeon_surf_mode {
+ RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
+ RADEON_SURF_MODE_1D = 2,
+ RADEON_SURF_MODE_2D = 3,
+};
+
+/* These are defined exactly like GB_TILE_MODEn.MICRO_TILE_MODE_NEW. */
+enum radeon_micro_mode {
+ RADEON_MICRO_MODE_DISPLAY = 0,
+ RADEON_MICRO_MODE_THIN = 1,
+ RADEON_MICRO_MODE_DEPTH = 2,
+ RADEON_MICRO_MODE_ROTATED = 3,
+};
+
+/* the first 16 bits are reserved for libdrm_radeon, don't use them */
#define RADEON_SURF_SCANOUT (1 << 16)
#define RADEON_SURF_ZBUFFER (1 << 17)
#define RADEON_SURF_SBUFFER (1 << 18)
#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
-#define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19)
-#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20)
+/* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
#define RADEON_SURF_FMASK (1 << 21)
#define RADEON_SURF_DISABLE_DCC (1 << 22)
#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23)
+#define RADEON_SURF_IMPORTED (1 << 24)
+#define RADEON_SURF_OPTIMIZE_FOR_SPACE (1 << 25)
-#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
-#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
-#define RADEON_SURF_CLR(v, field) ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT))
-
-struct radeon_surf_level {
+struct legacy_surf_level {
uint64_t offset;
uint64_t slice_size;
- uint32_t npix_x;
- uint32_t npix_y;
- uint32_t npix_z;
- uint32_t nblk_x;
- uint32_t nblk_y;
- uint32_t nblk_z;
- uint32_t pitch_bytes;
- uint32_t mode;
uint64_t dcc_offset;
uint64_t dcc_fast_clear_size;
- bool dcc_enabled;
+ uint16_t nblk_x;
+ uint16_t nblk_y;
+ enum radeon_surf_mode mode;
};
-struct radeon_surf {
- /* These are inputs to the calculator. */
- uint32_t npix_x;
- uint32_t npix_y;
- uint32_t npix_z;
- uint32_t blk_w;
- uint32_t blk_h;
- uint32_t blk_d;
- uint32_t array_size;
- uint32_t last_level;
- uint32_t bpe;
- uint32_t nsamples;
- uint32_t flags;
-
- /* These are return values. Some of them can be set by the caller, but
- * they will be treated as hints (e.g. bankw, bankh) and might be
- * changed by the calculator.
- */
- uint64_t bo_size;
- uint64_t bo_alignment;
- /* This applies to EG and later. */
- uint32_t bankw;
- uint32_t bankh;
- uint32_t mtilea;
- uint32_t tile_split;
- uint32_t stencil_tile_split;
- struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL];
- struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL];
- uint32_t tiling_index[RADEON_SURF_MAX_LEVEL];
- uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
- uint32_t pipe_config;
- uint32_t num_banks;
- uint32_t macro_tile_index;
- uint32_t micro_tile_mode; /* displayable, thin, depth, rotated */
+struct legacy_surf_layout {
+ unsigned bankw:4; /* max 8 */
+ unsigned bankh:4; /* max 8 */
+ unsigned mtilea:4; /* max 8 */
+ unsigned tile_split:13; /* max 4K */
+ unsigned stencil_tile_split:13; /* max 4K */
+ unsigned pipe_config:5; /* max 17 */
+ unsigned num_banks:5; /* max 16 */
+ unsigned macro_tile_index:4; /* max 15 */
/* Whether the depth miptree or stencil miptree as used by the DB is
 * adjusted from its TC compatible form to ensure depth/stencil
* compatibility. If either is true, the corresponding plane cannot be
* sampled from.
*/
- bool depth_adjusted;
- bool stencil_adjusted;
+ unsigned depth_adjusted:1;
+ unsigned stencil_adjusted:1;
+
+ struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS];
+ struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS];
+ uint8_t tiling_index[RADEON_SURF_MAX_LEVELS];
+ uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+};
+
+/* Same as addrlib - AddrResourceType. */
+enum gfx9_resource_type {
+ RADEON_RESOURCE_1D = 0,
+ RADEON_RESOURCE_2D,
+ RADEON_RESOURCE_3D,
+};
+
+struct gfx9_surf_flags {
+ uint16_t swizzle_mode; /* tile mode */
+ uint16_t epitch; /* (pitch - 1) or (height - 1) */
+};
+
+struct gfx9_surf_meta_flags {
+ unsigned rb_aligned:1; /* optimal for RBs */
+ unsigned pipe_aligned:1; /* optimal for TC */
+};
+
+struct gfx9_surf_layout {
+ struct gfx9_surf_flags surf; /* color or depth surface */
+ struct gfx9_surf_flags fmask; /* not added to surf_size */
+ struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */
+
+ struct gfx9_surf_meta_flags dcc; /* metadata of color */
+ struct gfx9_surf_meta_flags htile; /* metadata of depth and stencil */
+ struct gfx9_surf_meta_flags cmask; /* metadata of fmask */
+
+ enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
+ uint64_t surf_offset; /* 0 unless imported with an offset */
+ /* The size of the 2D plane containing all mipmap levels. */
+ uint64_t surf_slice_size;
+ uint16_t surf_pitch; /* in blocks */
+ uint16_t surf_height;
+ /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
+ uint32_t offset[RADEON_SURF_MAX_LEVELS];
+ uint16_t dcc_pitch_max; /* (mip chain pitch - 1) */
+
+ uint64_t stencil_offset; /* separate stencil */
+ uint64_t fmask_size;
+ uint64_t cmask_size;
+
+ uint32_t fmask_alignment;
+ uint32_t cmask_alignment;
+};
+
+struct radeon_surf {
+ /* Format properties. */
+ unsigned blk_w:4;
+ unsigned blk_h:4;
+ unsigned bpe:5;
+ /* Number of mipmap levels where DCC is enabled starting from level 0.
+ * Non-zero levels may be disabled due to alignment constraints, but not
+ * the first level.
+ */
+ unsigned num_dcc_levels:4;
+ unsigned is_linear:1;
+ /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
+ unsigned micro_tile_mode:3;
+ uint32_t flags;
+
+ /* These are return values. Some of them can be set by the caller, but
+ * they will be treated as hints (e.g. bankw, bankh) and might be
+ * changed by the calculator.
+ */
+ uint64_t surf_size;
uint64_t dcc_size;
- uint64_t dcc_alignment;
- /* TC-compatible HTILE only. */
uint64_t htile_size;
- uint64_t htile_alignment;
+
+ uint32_t surf_alignment;
+ uint32_t dcc_alignment;
+ uint32_t htile_alignment;
+
+ union {
+ /* R600-VI return values.
+ *
+ * Some of them can be set by the caller if certain parameters are
+ * desirable. The allocator will try to obey them.
+ */
+ struct legacy_surf_layout legacy;
+
+ /* GFX9+ return values. */
+ struct gfx9_surf_layout gfx9;
+ } u;
};
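
Note that the union carries no discriminator: whether u.legacy or u.gfx9 is live is implied by the GPU generation the surface was computed for, so readers key off externally tracked chip info. An illustrative accessor:

    /* 'gfx9' must come from the screen/winsys info, not from the surface */
    static uint64_t surf_level0_offset(const struct radeon_surf *s, bool gfx9)
    {
        return gfx9 ? s->u.gfx9.surf_offset
                    : s->u.legacy.level[0].offset;
    }
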
struct radeon_bo_list_item {
@@ -508,6 +578,20 @@ struct radeon_winsys {
struct winsys_handle *whandle);
/**
+ * Change the commitment of a (64KB-page aligned) region of the given
+ * sparse buffer.
+ *
+ * \warning There is no automatic synchronization with command submission.
+ *
+ * \note Only implemented by the amdgpu winsys.
+ *
+ * \return false on out of memory or other failure, true on success.
+ */
+ bool (*buffer_commit)(struct pb_buffer *buf,
+ uint64_t offset, uint64_t size,
+ bool commit);
+
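
A hedged usage fragment for the new hook: commit ranges should be aligned to RADEON_SPARSE_PAGE_SIZE (64 KiB, defined earlier in this header), and per the \warning the caller synchronizes against in-flight submission itself. The surrounding variables (ws, buf, off, len) are assumptions:

    /* back [off, off + len) of a sparse buffer with real pages */
    uint64_t page  = RADEON_SPARSE_PAGE_SIZE;
    uint64_t start = off & ~(page - 1);                     /* round down */
    uint64_t end   = (off + len + page - 1) & ~(page - 1);  /* round up  */

    if (!ws->buffer_commit(buf, start, end - start, true)) {
        /* false means out of memory or failure; flush and retry, or bail */
    }
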
+ /**
* Return the virtual address of a buffer.
*
* When virtual memory is not in use, this is the offset relative to the
@@ -739,18 +823,16 @@ struct radeon_winsys {
* Initialize surface
*
* \param ws The winsys this function is called from.
- * \param surf Surface structure ptr
+ * \param tex Input texture description
+ * \param flags Bitmask of RADEON_SURF_* flags
+ * \param bpe Bytes per pixel; it can be different for Z buffers.
+ * \param mode Preferred tile mode (linear, 1D, or 2D).
+ * \param surf Output structure
*/
int (*surface_init)(struct radeon_winsys *ws,
- struct radeon_surf *surf);
-
- /**
- * Find best values for a surface
- *
- * \param ws The winsys this function is called from.
- * \param surf Surface structure ptr
- */
- int (*surface_best)(struct radeon_winsys *ws,
+ const struct pipe_resource *tex,
+ unsigned flags, unsigned bpe,
+ enum radeon_surf_mode mode,
struct radeon_surf *surf);
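
A hedged call sketch for the reworked entry point; the resource template and bpe are illustrative, and 0 is assumed to mean success:

    struct radeon_surf surf = {0};

    /* 'templ' describes the texture (width, height, samples, ...) */
    if (ws->surface_init(ws, templ, RADEON_SURF_SCANOUT, 4 /* RGBA8 bpe */,
                         RADEON_SURF_MODE_2D, &surf) == 0) {
        /* surf.surf_size and surf.surf_alignment size the allocation;
         * layout details land in surf.u.legacy or surf.u.gfx9 */
    }
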
uint64_t (*query_value)(struct radeon_winsys *ws,