Import Mesa 18.3.2

author: Jonathan Gray <jsg@cvs.openbsd.org> 2019-01-29 11:08:07 +0000
committer: Jonathan Gray <jsg@cvs.openbsd.org> 2019-01-29 11:08:07 +0000
commit: 6b139c2063623e9310025247cd966490b9aa57ea (patch)
tree: 375acfd898ca3d721250aa17291bbb90a8d7250a /lib/mesa/src/gallium/drivers/radeonsi
parent: cce99579dcfb1d54c54cff65573be3430e77f2c5 (diff)
23 files changed, 11910 insertions, 526 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
index 7f57b4ea8..8c5078c13 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
@@ -3,6 +3,7 @@ DRI_CONF_SECTION_PERFORMANCE
     DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
     DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
     DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+    DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
 DRI_CONF_SECTION_END
 
 DRI_CONF_SECTION_DEBUG
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/meson.build b/lib/mesa/src/gallium/drivers/radeonsi/meson.build
new file mode 100644
index 000000000..ac8ed949e
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/meson.build
@@ -0,0 +1,120 @@
+# Copyright © 2017 Dylan Baker
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Source and header files of the radeonsi driver, passed to the
+# 'radeonsi' static_library() below. The list also pulls in files that
+# live in the sibling ../radeon directory.
+files_libradeonsi = files(
+  'cik_sdma.c',
+  'driinfo_radeonsi.h',
+  'si_blit.c',
+  'si_buffer.c',
+  'si_build_pm4.h',
+  'si_clear.c',
+  'si_compute.c',
+  'si_compute.h',
+  'si_compute_blit.c',
+  'si_cp_dma.c',
+  'si_debug.c',
+  'si_descriptors.c',
+  'si_dma.c',
+  'si_dma_cs.c',
+  'si_fence.c',
+  'si_get.c',
+  'si_gfx_cs.c',
+  'si_gpu_load.c',
+  'si_perfcounter.c',
+  'si_pipe.c',
+  'si_pipe.h',
+  'si_pm4.c',
+  'si_pm4.h',
+  'si_public.h',
+  'si_query.c',
+  'si_query.h',
+  'si_shader.c',
+  'si_shader.h',
+  'si_shader_internal.h',
+  'si_shader_nir.c',
+  'si_shader_tgsi_alu.c',
+  'si_shader_tgsi_mem.c',
+  'si_shader_tgsi_setup.c',
+  'si_shaderlib_tgsi.c',
+  'si_state.c',
+  'si_state.h',
+  'si_state_binning.c',
+  'si_state_draw.c',
+  'si_state_msaa.c',
+  'si_state_shaders.c',
+  'si_state_streamout.c',
+  'si_state_viewport.c',
+  'si_test_dma.c',
+  'si_test_dma_perf.c',
+  'si_texture.c',
+  'si_uvd.c',
+  '../radeon/r600_perfcounter.c',
+  '../radeon/radeon_uvd.c',
+  '../radeon/radeon_uvd.h',
+  '../radeon/radeon_vcn_enc_1_2.c',
+  '../radeon/radeon_vcn_enc.c',
+  '../radeon/radeon_vcn_enc.h',
+  '../radeon/radeon_vcn_dec_jpeg.c',
+  '../radeon/radeon_vcn_dec.c',
+  '../radeon/radeon_vcn_dec.h',
+  '../radeon/radeon_uvd_enc_1_1.c',
+  '../radeon/radeon_uvd_enc.c',
+  '../radeon/radeon_uvd_enc.h',
+  '../radeon/radeon_vce_40_2_2.c',
+  '../radeon/radeon_vce_50.c',
+  '../radeon/radeon_vce_52.c',
+  '../radeon/radeon_vce.c',
+  '../radeon/radeon_vce.h',
+  '../radeon/radeon_video.c',
+  '../radeon/radeon_video.h',
+  '../radeon/radeon_winsys.h',
+)
+
+# Generate si_driinfo.h: run merge_driinfo.py (first input) with the common
+# gallium driinfo header and the radeonsi-specific one as arguments.
+# 'capture : true' writes the script's stdout to the output file.
+si_driinfo_h = custom_target(
+  'si_driinfo.h',
+  input : files(
+    '../../../util/merge_driinfo.py',
+    '../../auxiliary/pipe-loader/driinfo_gallium.h', 'driinfo_radeonsi.h'
+  ),
+  output : 'si_driinfo.h',
+  command : [prog_python, '@INPUT@'],
+  capture : true,
+)
+
+# Static library with the driver code. Besides the file list above it takes
+# the generated si_driinfo.h and sid_tables_h (defined outside this file)
+# as sources, so they are produced before the library is compiled.
+libradeonsi = static_library(
+  'radeonsi',
+  [files_libradeonsi, si_driinfo_h, sid_tables_h],
+  include_directories : [
+    inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common,
+    inc_gallium_drivers,
+  ],
+  c_args : ['-Wstrict-overflow=0', c_vis_args],
+  cpp_args : [cpp_vis_args],
+  dependencies : [dep_llvm, dep_clock, dep_libdrm_radeon, idep_nir_headers],
+)
+
+# Dependency object for targets that embed the radeonsi driver: it adds the
+# -DGALLIUM_RADEONSI define, the generated si_driinfo.h, and links
+# libradeonsi together with the radeon/amdgpu winsys libraries and
+# libamd_common.
+driver_radeonsi = declare_dependency(
+  compile_args : '-DGALLIUM_RADEONSI',
+  sources : si_driinfo_h,
+  link_with : [
+    libradeonsi, libradeonwinsys, libamdgpuwinsys, libamd_common,
+  ],
+  dependencies : idep_nir,
+)
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c b/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c
new file mode 100644
index 000000000..c7260e06c
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_transfer.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+/* Return true if "buf" is referenced with the given usage by the gfx
+ * command stream, or by the DMA command stream when something has been
+ * emitted into it.
+ */
+bool si_rings_is_buffer_referenced(struct si_context *sctx,
+				   struct pb_buffer *buf,
+				   enum radeon_bo_usage usage)
+{
+	/* The gfx CS is always queried. */
+	if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage))
+		return true;
+
+	/* The DMA CS is only queried when it is non-empty. */
+	return radeon_emitted(sctx->dma_cs, 0) &&
+	       sctx->ws->cs_is_buffer_referenced(sctx->dma_cs, buf, usage);
+}
+
+/* Map a buffer for CPU access after synchronizing with the gfx and DMA
+ * command streams of this context.
+ *
+ * Unsynchronized maps go straight to the winsys. Otherwise every command
+ * stream that references the buffer with a conflicting usage is flushed
+ * first. With PIPE_TRANSFER_DONTBLOCK, NULL is returned instead of waiting
+ * whenever the buffer turns out to be busy.
+ *
+ * Returns the pointer returned by the winsys buffer_map, or NULL as
+ * described above.
+ */
+void *si_buffer_map_sync_with_rings(struct si_context *sctx,
+				    struct r600_resource *resource,
+				    unsigned usage)
+{
+	const bool dont_block = (usage & PIPE_TRANSFER_DONTBLOCK) != 0;
+	/* A read-only map only has to wait for the last write. */
+	enum radeon_bo_usage rusage = (usage & PIPE_TRANSFER_WRITE) ?
+				      RADEON_USAGE_READWRITE : RADEON_USAGE_WRITE;
+	bool busy = false;
+
+	assert(!(resource->flags & RADEON_FLAG_SPARSE));
+
+	if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
+		return sctx->ws->buffer_map(resource->buf, NULL, usage);
+
+	if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
+	    sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
+						resource->buf, rusage)) {
+		/* The gfx flush is identical whether or not we may block. */
+		si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+		if (dont_block)
+			return NULL;
+		busy = true;
+	}
+
+	if (radeon_emitted(sctx->dma_cs, 0) &&
+	    sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
+						resource->buf, rusage)) {
+		if (dont_block) {
+			si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+			return NULL;
+		}
+		si_flush_dma_cs(sctx, 0, NULL);
+		busy = true;
+	}
+
+	if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
+		if (dont_block)
+			return NULL;
+
+		/* We are going to wait for the GPU. Wait for any offloaded
+		 * CS flush to complete to avoid busy-waiting in the winsys. */
+		sctx->ws->cs_sync_flush(sctx->gfx_cs);
+		if (sctx->dma_cs)
+			sctx->ws->cs_sync_flush(sctx->dma_cs);
+	}
+
+	/* Setting the CS to NULL will prevent doing checks we have done already. */
+	return sctx->ws->buffer_map(resource->buf, NULL, usage);
+}
+
+/* Fill in the allocation parameters of "res" without allocating anything.
+ *
+ * Sets bo_size/bo_alignment from the arguments and derives the memory
+ * domains, the RADEON_FLAG_* allocation flags, the expected VRAM/GART usage
+ * and the forced-staging-upload counters from the pipe_resource template
+ * already stored in res->b.b (usage, target, flags, bind) and from the
+ * screen's info and debug flags.
+ */
+void si_init_resource_fields(struct si_screen *sscreen,
+			     struct r600_resource *res,
+			     uint64_t size, unsigned alignment)
+{
+	/* Only dereferenced below when the target is not PIPE_BUFFER. */
+	struct si_texture *tex = (struct si_texture*)res;
+
+	res->bo_size = size;
+	res->bo_alignment = alignment;
+	res->flags = 0;
+	res->texture_handle_allocated = false;
+	res->image_handle_allocated = false;
+
+	/* Pick the initial domain and write-combining flag from the usage hint. */
+	switch (res->b.b.usage) {
+	case PIPE_USAGE_STREAM:
+		res->flags = RADEON_FLAG_GTT_WC;
+		/* fall through */
+	case PIPE_USAGE_STAGING:
+		/* Transfers are likely to occur more often with these
+		 * resources. */
+		res->domains = RADEON_DOMAIN_GTT;
+		break;
+	case PIPE_USAGE_DYNAMIC:
+		/* Older kernels didn't always flush the HDP cache before
+		 * CS execution
+		 */
+		if (!sscreen->info.kernel_flushes_hdp_before_ib) {
+			res->domains = RADEON_DOMAIN_GTT;
+			res->flags |= RADEON_FLAG_GTT_WC;
+			break;
+		}
+		/* fall through */
+	case PIPE_USAGE_DEFAULT:
+	case PIPE_USAGE_IMMUTABLE:
+	default:
+		/* Not listing GTT here improves performance in some
+		 * apps. */
+		res->domains = RADEON_DOMAIN_VRAM;
+		res->flags |= RADEON_FLAG_GTT_WC;
+		break;
+	}
+
+	if (res->b.b.target == PIPE_BUFFER &&
+	    res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
+		/* Use GTT for all persistent mappings with older
+		 * kernels, because they didn't always flush the HDP
+		 * cache before CS execution.
+		 *
+		 * Write-combined CPU mappings are fine, the kernel
+		 * ensures all CPU writes finish before the GPU
+		 * executes a command stream.
+		 *
+		 * radeon doesn't have good BO move throttling, so put all
+		 * persistent buffers into GTT to prevent VRAM CPU page faults.
+		 */
+		if (!sscreen->info.kernel_flushes_hdp_before_ib ||
+		    sscreen->info.drm_major == 2)
+			res->domains = RADEON_DOMAIN_GTT;
+	}
+
+	/* Tiled textures are unmappable. Always put them in VRAM. */
+	if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
+	    res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
+		res->domains = RADEON_DOMAIN_VRAM;
+		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
+			 RADEON_FLAG_GTT_WC;
+	}
+
+	/* Displayable and shareable surfaces are not suballocated. */
+	if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
+		res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
+	else
+		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
+	/* The NO_WC debug flag overrides every write-combining decision above. */
+	if (sscreen->debug_flags & DBG(NO_WC))
+		res->flags &= ~RADEON_FLAG_GTT_WC;
+
+	if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
+		res->flags |= RADEON_FLAG_READ_ONLY;
+
+	if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
+		res->flags |= RADEON_FLAG_32BIT;
+
+	/* Set expected VRAM and GART usage for the buffer. */
+	res->vram_usage = 0;
+	res->gart_usage = 0;
+	res->max_forced_staging_uploads = 0;
+	res->b.max_forced_staging_uploads = 0;
+
+	if (res->domains & RADEON_DOMAIN_VRAM) {
+		res->vram_usage = size;
+
+		/* Both counters become 1 when the GPU has dedicated VRAM and
+		 * the resource is at least a quarter of vram_vis_size,
+		 * 0 otherwise. */
+		res->max_forced_staging_uploads =
+		res->b.max_forced_staging_uploads =
+			sscreen->info.has_dedicated_vram &&
+			size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
+	} else if (res->domains & RADEON_DOMAIN_GTT) {
+		res->gart_usage = size;
+	}
+}
+
+/* Allocate a winsys buffer for "res" from its bo_size, bo_alignment,
+ * domains and flags, and install it as res->buf.
+ *
+ * On success the previously attached buffer (if any) is unreferenced, the
+ * GPU address is refreshed, and the valid range and TC_L2_dirty are reset.
+ *
+ * Returns false, leaving "res" untouched, if the winsys allocation fails.
+ */
+bool si_alloc_resource(struct si_screen *sscreen,
+		       struct r600_resource *res)
+{
+	struct radeon_winsys *ws = sscreen->ws;
+	struct pb_buffer *fresh, *stale;
+
+	fresh = ws->buffer_create(ws, res->bo_size, res->bo_alignment,
+				  res->domains, res->flags);
+	if (!fresh)
+		return false;
+
+	/* Swap the pointer so that res->buf never becomes NULL if it was
+	 * non-NULL before. This should prevent crashes with multiple
+	 * contexts using the same buffer where one of the contexts
+	 * invalidates it while the others are using it. */
+	stale = res->buf;
+	res->buf = fresh; /* should be atomic */
+	res->gpu_address = ws->buffer_get_virtual_address(res->buf);
+
+	if (res->flags & RADEON_FLAG_32BIT) {
+		/* Both ends of the buffer must have the expected high
+		 * 32 address bits. */
+		uint64_t first = res->gpu_address;
+		uint64_t final = first + res->bo_size - 1;
+		(void)first;
+		(void)final;
+
+		assert((first >> 32) == sscreen->info.address32_hi);
+		assert((final >> 32) == sscreen->info.address32_hi);
+	}
+
+	pb_reference(&stale, NULL);
+
+	util_range_set_empty(&res->valid_buffer_range);
+	res->TC_L2_dirty = false;
+
+	/* Debug output for buffers when the VM debug flag is set. */
+	if ((sscreen->debug_flags & DBG(VM)) &&
+	    res->b.b.target == PIPE_BUFFER) {
+		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
+			res->gpu_address, res->gpu_address + res->buf->size,
+			res->buf->size);
+	}
+	return true;
+}
+
+/* resource_destroy entry of the buffer vtable: tear down the
+ * threaded-context state and the valid range, drop the reference to the
+ * winsys buffer and free the r600_resource itself.
+ */
+static void si_buffer_destroy(struct pipe_screen *screen,
+			      struct pipe_resource *buf)
+{
+	struct r600_resource *res = r600_resource(buf);
+
+	threaded_resource_deinit(buf);
+	util_range_destroy(&res->valid_buffer_range);
+	pb_reference(&res->buf, NULL);
+	FREE(res);
+}
+
+/* Reallocate the buffer and update all resource bindings where the buffer is
+ * bound.
+ *
+ * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
+ * idle by discarding its contents.
+ *
+ * Returns true if the buffer is idle afterwards, either because it already
+ * was or because its storage was replaced. Returns false if the buffer
+ * can't be reallocated (shared, sparse, user pointer) or if allocating the
+ * replacement storage failed; the buffer is then unchanged and may still be
+ * busy.
+ */
+static bool
+si_invalidate_buffer(struct si_context *sctx,
+		     struct r600_resource *rbuffer)
+{
+	/* Shared buffers can't be reallocated. */
+	if (rbuffer->b.is_shared)
+		return false;
+
+	/* Sparse buffers can't be reallocated. */
+	if (rbuffer->flags & RADEON_FLAG_SPARSE)
+		return false;
+
+	/* In AMD_pinned_memory, the user pointer association only gets
+	 * broken when the buffer is explicitly re-allocated.
+	 */
+	if (rbuffer->b.is_user_ptr)
+		return false;
+
+	/* Check if mapping this buffer would cause waiting for the GPU. */
+	if (si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+	    !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+		uint64_t old_va = rbuffer->gpu_address;
+
+		/* Reallocate the buffer in the same pipe_resource. If that
+		 * fails, the old, busy storage is still attached, so report
+		 * failure instead of letting the caller treat the buffer as
+		 * idle. */
+		if (!si_alloc_resource(sctx->screen, rbuffer))
+			return false;
+
+		si_rebind_buffer(sctx, &rbuffer->b.b, old_va);
+	} else {
+		/* Already idle: just forget the previous contents. */
+		util_range_set_empty(&rbuffer->valid_buffer_range);
+	}
+
+	return true;
+}
+
+/* Make "dst" use the storage of "src".
+ *
+ * "dst" takes a reference on src's winsys buffer and copies its GPU
+ * address, bind flags, allocation flags and forced-staging-upload
+ * counters. si_rebind_buffer() is then called with the GPU address that
+ * "dst" had before. The asserts require both resources to agree on size,
+ * alignment, domains and memory usage.
+ */
+void si_replace_buffer_storage(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 struct pipe_resource *src)
+{
+	struct r600_resource *to = r600_resource(dst);
+	struct r600_resource *from = r600_resource(src);
+	/* Saved before it is overwritten; needed for the rebind below. */
+	const uint64_t prev_va = to->gpu_address;
+
+	pb_reference(&to->buf, from->buf);
+	to->gpu_address = from->gpu_address;
+	to->flags = from->flags;
+	to->b.b.bind = from->b.b.bind;
+	to->max_forced_staging_uploads = from->max_forced_staging_uploads;
+	to->b.max_forced_staging_uploads = from->b.max_forced_staging_uploads;
+
+	assert(to->bo_size == from->bo_size);
+	assert(to->bo_alignment == from->bo_alignment);
+	assert(to->domains == from->domains);
+	assert(to->vram_usage == from->vram_usage);
+	assert(to->gart_usage == from->gart_usage);
+
+	si_rebind_buffer((struct si_context*)ctx, dst, prev_va);
+}
+
+/* invalidate_resource context hook. Only buffers are handled; every other
+ * resource target is left alone.
+ */
+static void si_invalidate_resource(struct pipe_context *ctx,
+				   struct pipe_resource *resource)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct r600_resource *res = r600_resource(resource);
+
+	/* We currently only do anything here for buffers. */
+	if (resource->target != PIPE_BUFFER)
+		return;
+
+	/* The result is deliberately ignored. */
+	(void)si_invalidate_buffer(sctx, res);
+}
+
+/* Allocate and fill a si_transfer describing a mapping of "resource".
+ *
+ * The object comes from the unsync slab pool when
+ * TC_TRANSFER_MAP_THREADED_UNSYNC is set in "usage" and from the regular
+ * pool otherwise. It takes a reference on "resource" and records "usage",
+ * "box", "staging" and "offset" as given.
+ *
+ * The transfer is returned through "ptransfer"; the return value is
+ * "data", passed through unchanged.
+ */
+static void *si_buffer_get_transfer(struct pipe_context *ctx,
+				    struct pipe_resource *resource,
+				    unsigned usage,
+				    const struct pipe_box *box,
+				    struct pipe_transfer **ptransfer,
+				    void *data, struct r600_resource *staging,
+				    unsigned offset)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct si_transfer *xfer =
+		slab_alloc((usage & TC_TRANSFER_MAP_THREADED_UNSYNC) ?
+			   &sctx->pool_transfers_unsync :
+			   &sctx->pool_transfers);
+
+	/* Start from NULL, then take the reference on the mapped resource. */
+	xfer->b.b.resource = NULL;
+	pipe_resource_reference(&xfer->b.b.resource, resource);
+
+	/* Generic pipe_transfer fields; buffers have a single level and
+	 * no strides. */
+	xfer->b.b.usage = usage;
+	xfer->b.b.box = *box;
+	xfer->b.b.level = 0;
+	xfer->b.b.stride = 0;
+	xfer->b.b.layer_stride = 0;
+	xfer->b.staging = NULL;
+
+	/* Driver-private fields. */
+	xfer->staging = staging;
+	xfer->offset = offset;
+
+	*ptransfer = &xfer->b.b;
+	return data;
+}
+
+/* transfer_map entry of the buffer vtable: map the range "box" of a buffer
+ * for CPU access. Returns the CPU pointer for box->x, or NULL on failure;
+ * the transfer object is returned through "ptransfer". "level" is not used.
+ *
+ * "usage" is adjusted in several steps before mapping:
+ *  - user-pointer buffers are always mapped persistently (no staging),
+ *  - a write to a range that doesn't intersect valid_buffer_range becomes
+ *    unsynchronized,
+ *  - DISCARD_RANGE covering the whole buffer becomes DISCARD_WHOLE_RESOURCE,
+ *  - DISCARD_WHOLE_RESOURCE tries si_invalidate_buffer() and falls back to
+ *    DISCARD_RANGE if that fails.
+ *
+ * Depending on the result, the mapping is backed by a temporary upload
+ * buffer (wait-free writes), by a staging buffer filled with dma_copy
+ * (reads), or by the buffer itself via si_buffer_map_sync_with_rings().
+ */
+static void *si_buffer_transfer_map(struct pipe_context *ctx,
+				    struct pipe_resource *resource,
+				    unsigned level,
+				    unsigned usage,
+				    const struct pipe_box *box,
+				    struct pipe_transfer **ptransfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct r600_resource *rbuffer = r600_resource(resource);
+	uint8_t *data;
+
+	assert(box->x + box->width <= resource->width0);
+
+	/* From GL_AMD_pinned_memory issues:
+	 *
+	 *     4) Is glMapBuffer on a shared buffer guaranteed to return the
+	 *        same system address which was specified at creation time?
+	 *
+	 *        RESOLVED: NO. The GL implementation might return a different
+	 *        virtual mapping of that memory, although the same physical
+	 *        page will be used.
+	 *
+	 * So don't ever use staging buffers.
+	 */
+	if (rbuffer->b.is_user_ptr)
+		usage |= PIPE_TRANSFER_PERSISTENT;
+
+	/* See if the buffer range being mapped has never been initialized,
+	 * in which case it can be mapped unsynchronized. */
+	if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+		       TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
+	    usage & PIPE_TRANSFER_WRITE &&
+	    !rbuffer->b.is_shared &&
+	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
+		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+	}
+
+	/* If discarding the entire range, discard the whole resource instead. */
+	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+	    box->x == 0 && box->width == resource->width0) {
+		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+	}
+
+	/* If a buffer in VRAM is too large and the range is discarded, don't
+	 * map it directly. This makes sure that the buffer stays in VRAM.
+	 */
+	bool force_discard_range = false;
+	if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+		     PIPE_TRANSFER_DISCARD_RANGE) &&
+	    !(usage & PIPE_TRANSFER_PERSISTENT) &&
+	    /* Try not to decrement the counter if it's not positive. Still racy,
+	     * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+	    rbuffer->max_forced_staging_uploads > 0 &&
+	    p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
+		usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+			   PIPE_TRANSFER_UNSYNCHRONIZED);
+		usage |= PIPE_TRANSFER_DISCARD_RANGE;
+		force_discard_range = true;
+	}
+
+	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
+	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+		       TC_TRANSFER_MAP_NO_INVALIDATE))) {
+		assert(usage & PIPE_TRANSFER_WRITE);
+
+		if (si_invalidate_buffer(sctx, rbuffer)) {
+			/* At this point, the buffer is always idle. */
+			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+		} else {
+			/* Fall back to a temporary buffer. */
+			usage |= PIPE_TRANSFER_DISCARD_RANGE;
+		}
+	}
+
+	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+			 PIPE_TRANSFER_PERSISTENT))) ||
+	     (rbuffer->flags & RADEON_FLAG_SPARSE))) {
+		assert(usage & PIPE_TRANSFER_WRITE);
+
+		/* Check if mapping this buffer would cause waiting for the GPU.
+		 */
+		if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+		    force_discard_range ||
+		    si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+		    !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+			/* Do a wait-free write-only transfer using a temporary buffer. */
+			unsigned offset;
+			struct r600_resource *staging = NULL;
+
+			/* The allocation is padded by box->x modulo the map
+			 * alignment; the returned pointer is advanced by the
+			 * same amount below. */
+			u_upload_alloc(ctx->stream_uploader, 0,
+                                       box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
+				       sctx->screen->info.tcc_cache_line_size,
+				       &offset, (struct pipe_resource**)&staging,
+                                       (void**)&data);
+
+			if (staging) {
+				data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+				return si_buffer_get_transfer(ctx, resource, usage, box,
+								ptransfer, data, staging, offset);
+			} else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+				return NULL;
+			}
+			/* Non-sparse buffer without a temporary buffer: fall
+			 * through to the direct, synchronized map at the end. */
+		} else {
+			/* At this point, the buffer is always idle (we checked it above). */
+			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+		}
+	}
+	/* Use a staging buffer in cached GTT for reads. */
+	else if (((usage & PIPE_TRANSFER_READ) &&
+		  !(usage & PIPE_TRANSFER_PERSISTENT) &&
+		  (rbuffer->domains & RADEON_DOMAIN_VRAM ||
+		   rbuffer->flags & RADEON_FLAG_GTT_WC)) ||
+		 (rbuffer->flags & RADEON_FLAG_SPARSE)) {
+		struct r600_resource *staging;
+
+		assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
+		staging = r600_resource(pipe_buffer_create(
+				ctx->screen, 0, PIPE_USAGE_STAGING,
+				box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
+		if (staging) {
+			/* Copy the VRAM buffer to the staging buffer. */
+			sctx->dma_copy(ctx, &staging->b.b, 0,
+				       box->x % SI_MAP_BUFFER_ALIGNMENT,
+				       0, 0, resource, 0, box);
+
+			data = si_buffer_map_sync_with_rings(sctx, staging,
+							     usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+			if (!data) {
+				r600_resource_reference(&staging, NULL);
+				return NULL;
+			}
+			data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+
+			return si_buffer_get_transfer(ctx, resource, usage, box,
+							ptransfer, data, staging, 0);
+		} else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+			return NULL;
+		}
+	}
+
+	/* Direct map of the buffer itself. */
+	data = si_buffer_map_sync_with_rings(sctx, rbuffer, usage);
+	if (!data) {
+		return NULL;
+	}
+	data += box->x;
+
+	return si_buffer_get_transfer(ctx, resource, usage, box,
+					ptransfer, data, NULL, 0);
+}
+
+/* Commit a written range of a mapped buffer.
+ *
+ * If the transfer is backed by a staging buffer, the range is copied from
+ * it into the real buffer. In every case the range is added to the buffer's
+ * valid range. "box" holds the absolute x offset within the buffer.
+ */
+static void si_buffer_do_flush_region(struct pipe_context *ctx,
+				      struct pipe_transfer *transfer,
+				      const struct pipe_box *box)
+{
+	struct si_transfer *xfer = (struct si_transfer*)transfer;
+	struct r600_resource *target = r600_resource(transfer->resource);
+	struct r600_resource *staging = xfer->staging;
+
+	if (staging) {
+		/* Copy the staging buffer into the original one. */
+		si_copy_buffer((struct si_context*)ctx, transfer->resource,
+			       &staging->b.b, box->x,
+			       xfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
+			       box->width);
+	}
+
+	util_range_add(&target->valid_buffer_range,
+		       box->x, box->x + box->width);
+}
+
+/* transfer_flush_region entry of the buffer vtable.
+ *
+ * Only acts on transfers mapped with both PIPE_TRANSFER_WRITE and
+ * PIPE_TRANSFER_FLUSH_EXPLICIT. "rel_box" is relative to the start of the
+ * mapped range and is converted to an absolute buffer range before it is
+ * flushed.
+ */
+static void si_buffer_flush_region(struct pipe_context *ctx,
+				   struct pipe_transfer *transfer,
+				   const struct pipe_box *rel_box)
+{
+	const unsigned required_usage = PIPE_TRANSFER_WRITE |
+					PIPE_TRANSFER_FLUSH_EXPLICIT;
+	struct pipe_box abs_box;
+
+	if ((transfer->usage & required_usage) != required_usage)
+		return;
+
+	u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &abs_box);
+	si_buffer_do_flush_region(ctx, transfer, &abs_box);
+}
+
+/* transfer_unmap entry of the buffer vtable.
+ *
+ * Write transfers without PIPE_TRANSFER_FLUSH_EXPLICIT are flushed in full
+ * here. Afterwards the staging buffer and the mapped resource are
+ * unreferenced and the transfer object is returned to the slab pool.
+ */
+static void si_buffer_transfer_unmap(struct pipe_context *ctx,
+				     struct pipe_transfer *transfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct si_transfer *xfer = (struct si_transfer*)transfer;
+	const bool is_write = (transfer->usage & PIPE_TRANSFER_WRITE) != 0;
+	const bool explicit_flush =
+		(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT) != 0;
+
+	/* With explicit flushing the caller has flushed what it wrote. */
+	if (is_write && !explicit_flush)
+		si_buffer_do_flush_region(ctx, transfer, &transfer->box);
+
+	r600_resource_reference(&xfer->staging, NULL);
+	assert(xfer->b.staging == NULL); /* for threaded context only */
+	pipe_resource_reference(&transfer->resource, NULL);
+
+	/* Don't use pool_transfers_unsync. We are always in the driver
+	 * thread. */
+	slab_free(&sctx->pool_transfers, transfer);
+}
+
+/* buffer_subdata context hook: copy "size" bytes from "data" into "buffer"
+ * at "offset".
+ *
+ * The range is mapped for writing with PIPE_TRANSFER_DISCARD_RANGE in
+ * addition to the caller's "usage" flags. If the map fails, nothing is
+ * written and no error is reported.
+ */
+static void si_buffer_subdata(struct pipe_context *ctx,
+			      struct pipe_resource *buffer,
+			      unsigned usage, unsigned offset,
+			      unsigned size, const void *data)
+{
+	struct pipe_transfer *xfer = NULL;
+	struct pipe_box range;
+	uint8_t *dst;
+
+	u_box_1d(offset, size, &range);
+	dst = si_buffer_transfer_map(ctx, buffer, 0,
+				     PIPE_TRANSFER_WRITE |
+				     PIPE_TRANSFER_DISCARD_RANGE |
+				     usage,
+				     &range, &xfer);
+	if (dst) {
+		memcpy(dst, data, size);
+		si_buffer_transfer_unmap(ctx, xfer);
+	}
+}
+
+/* Resource vtable installed on every buffer by si_alloc_buffer_struct().
+ * Entries are positional; buffers provide no get_handle implementation.
+ */
+static const struct u_resource_vtbl si_buffer_vtbl =
+{
+	NULL,				/* get_handle */
+	si_buffer_destroy,		/* resource_destroy */
+	si_buffer_transfer_map,	/* transfer_map */
+	si_buffer_flush_region,	/* transfer_flush_region */
+	si_buffer_transfer_unmap,	/* transfer_unmap */
+};
+
+/* Allocate an r600_resource for a buffer and initialize the parts that do
+ * not depend on the backing storage.
+ *
+ * The pipe_resource is copied from "templ" with a reference count of 1,
+ * the buffer vtable and threaded-context state are set up, and the valid
+ * range is initialized. No winsys buffer is attached yet (buf == NULL).
+ *
+ * Returns NULL if the structure can't be allocated.
+ */
+static struct r600_resource *
+si_alloc_buffer_struct(struct pipe_screen *screen,
+		       const struct pipe_resource *templ)
+{
+	struct r600_resource *rbuffer;
+
+	rbuffer = MALLOC_STRUCT(r600_resource);
+	/* Out of memory: nothing has been initialized, so just report it. */
+	if (!rbuffer)
+		return NULL;
+
+	rbuffer->b.b = *templ;
+	rbuffer->b.b.next = NULL;
+	pipe_reference_init(&rbuffer->b.b.reference, 1);
+	rbuffer->b.b.screen = screen;
+
+	rbuffer->b.vtbl = &si_buffer_vtbl;
+	threaded_resource_init(&rbuffer->b.b);
+
+	rbuffer->buf = NULL;
+	rbuffer->bind_history = 0;
+	rbuffer->TC_L2_dirty = false;
+	util_range_init(&rbuffer->valid_buffer_range);
+	return rbuffer;
+}
+
+/* Create a buffer resource described by "templ" whose storage is allocated
+ * with the given alignment.
+ *
+ * Sparse buffers (PIPE_RESOURCE_FLAG_SPARSE) are additionally marked
+ * unmappable and get RADEON_FLAG_SPARSE.
+ *
+ * Returns NULL if either the resource structure or its storage can't be
+ * allocated.
+ */
+static struct pipe_resource *si_buffer_create(struct pipe_screen *screen,
+					      const struct pipe_resource *templ,
+					      unsigned alignment)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+
+	if (!rbuffer)
+		return NULL;
+
+	/* Must be set before si_init_resource_fields, which reads it. */
+	if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+		rbuffer->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
+
+	si_init_resource_fields(sscreen, rbuffer, templ->width0, alignment);
+
+	/* Must be set after si_init_resource_fields, which resets flags. */
+	if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+		rbuffer->flags |= RADEON_FLAG_SPARSE;
+
+	if (!si_alloc_resource(sscreen, rbuffer)) {
+		/* Undo what si_alloc_buffer_struct set up before freeing the
+		 * structure, so that the threaded-resource state and the
+		 * valid range are not leaked. */
+		threaded_resource_deinit(&rbuffer->b.b);
+		util_range_destroy(&rbuffer->valid_buffer_range);
+		FREE(rbuffer);
+		return NULL;
+	}
+	return &rbuffer->b.b;
+}
+
+/* Create a PIPE_BUFFER of "size" bytes with the given resource flags,
+ * usage hint and storage alignment. The bind flags are left at 0.
+ *
+ * Returns the result of si_buffer_create().
+ */
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
+						 unsigned flags, unsigned usage,
+						 unsigned size, unsigned alignment)
+{
+	struct pipe_resource templ;
+
+	memset(&templ, 0, sizeof(templ));
+
+	/* What kind of resource this is. */
+	templ.target = PIPE_BUFFER;
+	templ.format = PIPE_FORMAT_R8_UNORM;
+	templ.usage = usage;
+	templ.flags = flags;
+	templ.bind = 0;
+
+	/* Dimensions: width0 holds the size; the rest are 1. */
+	templ.width0 = size;
+	templ.height0 = 1;
+	templ.depth0 = 1;
+	templ.array_size = 1;
+
+	return si_buffer_create(screen, &templ, alignment);
+}
+
+/* Same as pipe_aligned_buffer_create(), but the result is converted to
+ * struct r600_resource with r600_resource().
+ */
+struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+					       unsigned flags, unsigned usage,
+					       unsigned size, unsigned alignment)
+{
+	struct pipe_resource *pres =
+		pipe_aligned_buffer_create(screen, flags, usage,
+					   size, alignment);
+
+	return r600_resource(pres);
+}
+
+/* resource_from_user_memory screen hook: wrap "user_memory"
+ * (templ->width0 bytes) in a buffer resource placed in the GTT domain.
+ *
+ * The whole range is marked valid in both the driver's and the
+ * threaded-context valid range.
+ *
+ * Returns NULL if the resource structure can't be allocated or if the
+ * winsys can't create a buffer from the pointer.
+ */
+static struct pipe_resource *
+si_buffer_from_user_memory(struct pipe_screen *screen,
+			   const struct pipe_resource *templ,
+			   void *user_memory)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct radeon_winsys *ws = sscreen->ws;
+	struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+
+	if (!rbuffer)
+		return NULL;
+
+	rbuffer->domains = RADEON_DOMAIN_GTT;
+	rbuffer->flags = 0;
+	rbuffer->b.is_user_ptr = true;
+	util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0);
+	util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0);
+
+	/* Convert a user pointer to a buffer. */
+	rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
+	if (!rbuffer->buf) {
+		/* Undo what si_alloc_buffer_struct set up before freeing the
+		 * structure, so that the threaded-resource state and the
+		 * valid range are not leaked. */
+		threaded_resource_deinit(&rbuffer->b.b);
+		util_range_destroy(&rbuffer->valid_buffer_range);
+		FREE(rbuffer);
+		return NULL;
+	}
+
+	rbuffer->gpu_address = ws->buffer_get_virtual_address(rbuffer->buf);
+	rbuffer->vram_usage = 0;
+	rbuffer->gart_usage = templ->width0;
+
+	return &rbuffer->b.b;
+}
+
+/* resource_create screen hook: buffers are created by si_buffer_create()
+ * with an alignment of 256, everything else by si_texture_create().
+ */
+static struct pipe_resource *si_resource_create(struct pipe_screen *screen,
+						const struct pipe_resource *templ)
+{
+	if (templ->target != PIPE_BUFFER)
+		return si_texture_create(screen, templ);
+
+	return si_buffer_create(screen, templ, 256);
+}
+
+/* resource_commit context hook: commit or decommit the range described by
+ * box->x / box->width of a buffer ("level" is not used).
+ *
+ * Pending work that references the buffer is flushed and all threaded
+ * submissions are waited for before the winsys is asked to change the
+ * commitment. Returns the result of the winsys buffer_commit call.
+ */
+static bool si_resource_commit(struct pipe_context *pctx,
+			       struct pipe_resource *resource,
+			       unsigned level, struct pipe_box *box,
+			       bool commit)
+{
+	struct si_context *ctx = (struct si_context *)pctx;
+	struct r600_resource *res = r600_resource(resource);
+
+	/*
+	 * Since buffer commitment changes cannot be pipelined, we need to
+	 * (a) flush any pending commands that refer to the buffer we're about
+	 *     to change, and
+	 * (b) wait for threaded submit to finish, including those that were
+	 *     triggered by some other, earlier operation.
+	 */
+	if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
+					       res->buf, RADEON_USAGE_READWRITE)) {
+		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+	}
+	if (radeon_emitted(ctx->dma_cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
+					       res->buf, RADEON_USAGE_READWRITE)) {
+		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+	}
+
+	/* The DMA command stream is optional: si_buffer_map_sync_with_rings
+	 * guards the same call with a NULL check, so do the same here
+	 * instead of handing a NULL CS to the winsys. */
+	if (ctx->dma_cs)
+		ctx->ws->cs_sync_flush(ctx->dma_cs);
+	ctx->ws->cs_sync_flush(ctx->gfx_cs);
+
+	assert(resource->target == PIPE_BUFFER);
+
+	return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+}
+
+/* Install the screen-level resource hooks implemented in this file. */
+void si_init_screen_buffer_functions(struct si_screen *sscreen)
+{
+	/* Creation paths. */
+	sscreen->b.resource_create = si_resource_create;
+	sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
+
+	/* Destruction is dispatched through the per-resource vtable. */
+	sscreen->b.resource_destroy = u_resource_destroy_vtbl;
+}
+
+/* Install the context-level resource and transfer hooks. */
+void si_init_buffer_functions(struct si_context *sctx)
+{
+	/* Transfers are dispatched through the per-resource vtable. */
+	sctx->b.transfer_map = u_transfer_map_vtbl;
+	sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
+	sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
+
+	/* Sub-data uploads. */
+	sctx->b.buffer_subdata = si_buffer_subdata;
+	sctx->b.texture_subdata = u_default_texture_subdata;
+
+	/* Invalidation and sparse commitment. */
+	sctx->b.invalidate_resource = si_invalidate_resource;
+	sctx->b.resource_commit = si_resource_commit;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h b/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h
new file mode 100644
index 000000000..796adda09
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * This file contains helpers for writing commands to command streams.
+ */
+
+#ifndef SI_BUILD_PM4_H
+#define SI_BUILD_PM4_H
+
+#include "si_pipe.h"
+#include "sid.h"
+
+static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{	/* Emit the 2-dword SET_CONFIG_REG header; radeon_set_config_reg emits the value after it. */
+	assert(reg < SI_CONTEXT_REG_OFFSET); /* NOTE(review): only the upper bound is asserted */
+	assert(cs->current.cdw + 2 + num <= cs->current.max_dw); /* header + num values must fit */
+	radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
+	radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); /* dword offset from SI_CONFIG_REG_OFFSET */
+}
+
+static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{	/* Write one config register: a header for a single value, then the value. */
+	radeon_set_config_reg_seq(cs, reg, 1);
+	radeon_emit(cs, value);
+}
+
+static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{	/* Emit the 2-dword SET_CONTEXT_REG header; callers in this file emit "num" values after it. */
+	assert(reg >= SI_CONTEXT_REG_OFFSET); /* only the lower bound is asserted */
+	assert(cs->current.cdw + 2 + num <= cs->current.max_dw); /* header + num values must fit */
+	radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
+	radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); /* dword offset from SI_CONTEXT_REG_OFFSET */
+}
+
+static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{	/* Write one context register: a header for a single value, then the value. */
+	radeon_set_context_reg_seq(cs, reg, 1);
+	radeon_emit(cs, value);
+}
+
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
+					      unsigned reg, unsigned idx,
+					      unsigned value)
+{	/* Single context register write with "idx" OR'ed into the offset dword at bit 28. */
+	assert(reg >= SI_CONTEXT_REG_OFFSET);
+	assert(cs->current.cdw + 3 <= cs->current.max_dw); /* 2 header dwords + 1 value */
+	radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
+	radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
+	radeon_emit(cs, value);
+}
+
+static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{	/* Emit the 2-dword SET_SH_REG header; radeon_set_sh_reg emits the value after it. */
+	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); /* reg must lie in the SH range */
+	assert(cs->current.cdw + 2 + num <= cs->current.max_dw); /* header + num values must fit */
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
+	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); /* dword offset from SI_SH_REG_OFFSET */
+}
+
+static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{	/* Write one SH register: a header for a single value, then the value. */
+	radeon_set_sh_reg_seq(cs, reg, 1);
+	radeon_emit(cs, value);
+}
+
+static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{	/* Emit the 2-dword SET_UCONFIG_REG header; radeon_set_uconfig_reg emits the value after it. */
+	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); /* reg must lie in the UCONFIG range */
+	assert(cs->current.cdw + 2 + num <= cs->current.max_dw); /* header + num values must fit */
+	radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
+	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); /* dword offset from CIK_UCONFIG_REG_OFFSET */
+}
+
+static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{	/* Write one UCONFIG register: a header for a single value, then the value. */
+	radeon_set_uconfig_reg_seq(cs, reg, 1);
+	radeon_emit(cs, value);
+}
+
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
+					      unsigned reg, unsigned idx,
+					      unsigned value)
+{	/* Single UCONFIG register write with "idx" OR'ed into the offset dword at bit 28. */
+	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+	assert(cs->current.cdw + 3 <= cs->current.max_dw); /* 2 header dwords + 1 value */
+	radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0));
+	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
+	radeon_emit(cs, value);
+}
+
+/* Emit PKT3_SET_CONTEXT_REG only if "reg" has no cached value yet or its cached value differs. */
+static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
+					      enum si_tracked_reg reg, unsigned value)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || /* bit "reg" clear: nothing cached */
+	    sctx->tracked_regs.reg_value[reg] != value) {
+		radeon_set_context_reg(cs, offset, value);
+
+		sctx->tracked_regs.reg_saved |= 0x1ull << reg; /* remember what was just emitted */
+		sctx->tracked_regs.reg_value[reg] = value;
+	}
+}
+
+/**
+ * Set 2 consecutive registers (tracked as reg, reg+1) if either has no cached value or differs.
+ * @param offset        starting register offset
+ * @param value1        is written to first register
+ * @param value2        is written to second register
+ */
+static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || /* both "saved" bits must be set */
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2) {
+		radeon_set_context_reg_seq(cs, offset, 2); /* both values are re-emitted together */
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_saved |= 0x3ull << reg;
+	}
+}
+
+/**
+ * Set 3 consecutive registers (tracked as reg..reg+2) if any has no cached value or differs.
+ */
+static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2, unsigned value3)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || /* all three "saved" bits must be set */
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2 ||
+	    sctx->tracked_regs.reg_value[reg+2] != value3) {
+		radeon_set_context_reg_seq(cs, offset, 3); /* all values are re-emitted together */
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+		radeon_emit(cs, value3);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_value[reg+2] = value3;
+		sctx->tracked_regs.reg_saved |= 0x7ull << reg;
+	}
+}
+
+/**
+ * Set 4 consecutive registers (tracked as reg..reg+3) if any has no cached value or differs.
+ */
+static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2, unsigned value3,
+					       unsigned value4)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || /* all four "saved" bits must be set */
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2 ||
+	    sctx->tracked_regs.reg_value[reg+2] != value3 ||
+	    sctx->tracked_regs.reg_value[reg+3] != value4) {
+		radeon_set_context_reg_seq(cs, offset, 4); /* all values are re-emitted together */
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+		radeon_emit(cs, value3);
+		radeon_emit(cs, value4);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_value[reg+2] = value3;
+		sctx->tracked_regs.reg_value[reg+3] = value4;
+		sctx->tracked_regs.reg_saved |= 0xfull << reg;
+	}
+}
+
+/**
+ * Set "num" consecutive context registers starting at "offset" if any new
+ * value differs from its cached copy in "saved_val"; on a mismatch all of
+ * them are re-emitted and the cache is refreshed. No-op when num is 0.
+ */
+static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
+					       unsigned *value, unsigned *saved_val,
+					       unsigned num)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+	const size_t size = sizeof(uint32_t) * num;
+
+	/* One memcmp replaces the signed-index scan over an unsigned count. */
+	if (memcmp(saved_val, value, size) == 0)
+		return;
+
+	radeon_set_context_reg_seq(cs, offset, num);
+	for (unsigned i = 0; i < num; i++)
+		radeon_emit(cs, value[i]);
+	memcpy(saved_val, value, size);
+}
+
+#endif
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c b/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c
new file mode 100644
index 000000000..8aa3355af
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+
+#include "util/u_format.h"
+#include "util/u_pack_color.h"
+#include "util/u_surface.h"
+
+enum {
+	SI_CLEAR         = SI_SAVE_FRAGMENT_STATE, /* si_blitter_begin flags used by si_clear */
+	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, /* used by the per-surface clears */
+};
+
+static void si_alloc_separate_cmask(struct si_screen *sscreen,
+				    struct si_texture *tex)
+{	/* Allocate tex->cmask_buffer on first use; on failure it stays NULL and nothing else changes. */
+	if (tex->cmask_buffer || !tex->surface.cmask_size) /* already allocated, or cmask_size is 0 */
+                return;
+
+	tex->cmask_buffer =
+		si_aligned_buffer_create(&sscreen->b,
+					 SI_RESOURCE_FLAG_UNMAPPABLE,
+					 PIPE_USAGE_DEFAULT,
+					 tex->surface.cmask_size,
+					 tex->surface.cmask_alignment);
+	if (tex->cmask_buffer == NULL)
+		return;
+
+	tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8; /* address in 256-byte units */
+	tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+
+	p_atomic_inc(&sscreen->compressed_colortex_counter);
+}
+
+static bool si_set_clear_color(struct si_texture *tex,
+			       enum pipe_format surface_format,
+			       const union pipe_color_union *color)
+{	/* Pack "color" for surface_format and cache it in tex; returns true only if the cached value changed. */
+	union util_color uc;
+
+	memset(&uc, 0, sizeof(uc));
+
+	if (tex->surface.bpe == 16) {
+		/* DCC fast clear only:
+		 *   CLEAR_WORD0 = R = G = B
+		 *   CLEAR_WORD1 = A
+		 */
+		assert(color->ui[0] == color->ui[1] &&
+		       color->ui[0] == color->ui[2]);
+		uc.ui[0] = color->ui[0];
+		uc.ui[1] = color->ui[3];
+	} else if (util_format_is_pure_uint(surface_format)) {
+		util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1);
+	} else if (util_format_is_pure_sint(surface_format)) {
+		util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1);
+	} else {
+		util_pack_color(color->f, surface_format, &uc);
+	}
+
+	if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0) /* only 2 dwords are compared and stored */
+		return false;
+
+	memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+	return true;
+}
+
+/** Linearize and convert luminance/intensity to red. */
+enum pipe_format si_simplify_cb_format(enum pipe_format format)
+{
+	format = util_format_linear(format);
+	format = util_format_luminance_to_red(format);
+	return util_format_intensity_to_red(format);
+}
+
+bool vi_alpha_is_on_msb(enum pipe_format format)
+{	/* True for 3-channel formats, else when si_translate_colorswap() of the simplified format is <= 1. */
+	format = si_simplify_cb_format(format);
+
+	/* Formats with 3 channels can't have alpha. */
+	if (util_format_description(format)->nr_channels == 3)
+		return true; /* same as xxxA; is any value OK here? */
+
+	return si_translate_colorswap(format, false) <= 1;
+}
+
+static bool vi_get_fast_clear_parameters(enum pipe_format base_format,
+					 enum pipe_format surface_format,
+					 const union pipe_color_union *color,
+					 uint32_t* clear_value,
+					 bool *eliminate_needed)
+{	/* Returns false if the clear is unsupported; else fills *clear_value and *eliminate_needed. */
+	/* If we want to clear without needing a fast clear eliminate step, we
+	 * can set color and alpha independently to 0 or 1 (or 0/max for integer
+	 * formats).
+	 */
+	bool values[4] = {}; /* whether to clear to 0 or 1 */
+	bool color_value = false; /* clear color to 0 or 1 */
+	bool alpha_value = false; /* clear alpha to 0 or 1 */
+	int alpha_channel; /* index of the alpha component */
+	bool has_color = false;
+	bool has_alpha = false;
+
+	const struct util_format_description *desc =
+		util_format_description(si_simplify_cb_format(surface_format));
+
+	/* 128-bit fast clear with different R,G,B values is unsupported. */
+	if (desc->block.bits == 128 &&
+	    (color->ui[0] != color->ui[1] ||
+	     color->ui[0] != color->ui[2]))
+		return false;
+
+	*eliminate_needed = true; /* default for every early "return true" below */
+	*clear_value = 0x20202020U; /* use CB clear color registers */
+
+	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+		return true; /* need ELIMINATE_FAST_CLEAR */
+
+	bool base_alpha_is_on_msb = vi_alpha_is_on_msb(base_format);
+	bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(surface_format);
+
+	/* Formats with 3 channels can't have alpha. */
+	if (desc->nr_channels == 3)
+		alpha_channel = -1;
+	else if (surf_alpha_is_on_msb)
+		alpha_channel = desc->nr_channels - 1;
+	else
+		alpha_channel = 0;
+
+	for (int i = 0; i < 4; ++i) {
+		if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
+			continue;
+
+		if (desc->channel[i].pure_integer &&
+		    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+			/* Use the maximum value for clamping the clear color. */
+			int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+			values[i] = color->i[i] != 0;
+			if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
+				return true; /* need ELIMINATE_FAST_CLEAR */
+		} else if (desc->channel[i].pure_integer &&
+			   desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+			/* Use the maximum value for clamping the clear color. */
+			unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+			values[i] = color->ui[i] != 0U;
+			if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
+				return true; /* need ELIMINATE_FAST_CLEAR */
+		} else {
+			values[i] = color->f[i] != 0.0F;
+			if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+				return true; /* need ELIMINATE_FAST_CLEAR */
+		}
+
+		if (desc->swizzle[i] == alpha_channel) {
+			alpha_value = values[i];
+			has_alpha = true;
+		} else {
+			color_value = values[i];
+			has_color = true;
+		}
+	}
+
+	/* If alpha isn't present, make it the same as color, and vice versa. */
+	if (!has_alpha)
+		alpha_value = color_value;
+	else if (!has_color)
+		color_value = alpha_value;
+
+	if (color_value != alpha_value &&
+	    base_alpha_is_on_msb != surf_alpha_is_on_msb)
+		return true; /* require ELIMINATE_FAST_CLEAR */
+
+	/* Check if all color values are equal if they are present. */
+	for (int i = 0; i < 4; ++i) {
+		if (desc->swizzle[i] <= PIPE_SWIZZLE_W &&
+		    desc->swizzle[i] != alpha_channel &&
+		    values[i] != color_value)
+			return true; /* require ELIMINATE_FAST_CLEAR */
+	}
+
+	/* This doesn't need ELIMINATE_FAST_CLEAR.
+	 * CB uses both the DCC clear codes and the CB clear color registers,
+	 * so they must match.
+	 */
+	*eliminate_needed = false;
+
+	if (color_value)
+		*clear_value |= 0x80808080U;
+	if (alpha_value)
+		*clear_value |= 0x40404040U;
+	return true;
+}
+
+void vi_dcc_clear_level(struct si_context *sctx,
+			struct si_texture *tex,
+			unsigned level, unsigned clear_value)
+{	/* Fill the DCC data of "level" with the 4-byte clear_value via si_clear_buffer. */
+	struct pipe_resource *dcc_buffer;
+	uint64_t dcc_offset, clear_size;
+
+	assert(vi_dcc_enabled(tex, level));
+
+	if (tex->dcc_separate_buffer) {
+		dcc_buffer = &tex->dcc_separate_buffer->b.b;
+		dcc_offset = 0;
+	} else {
+		dcc_buffer = &tex->buffer.b.b;
+		dcc_offset = tex->dcc_offset;
+	}
+
+	if (sctx->chip_class >= GFX9) {
+		/* Mipmap level clears aren't implemented. */
+		assert(tex->buffer.b.b.last_level == 0);
+		/* 4x and 8x MSAA needs a sophisticated compute shader for
+		 * the clear. See AMDVLK. */
+		assert(tex->buffer.b.b.nr_storage_samples <= 2);
+		clear_size = tex->surface.dcc_size; /* the whole DCC surface, regardless of "level" */
+	} else {
+		unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
+
+		/* If this is 0, fast clear isn't possible. (can occur with MSAA) */
+		assert(tex->surface.u.legacy.level[level].dcc_fast_clear_size);
+		/* Layered 4x and 8x MSAA DCC fast clears need to clear
+		 * dcc_fast_clear_size bytes for each layer. A compute shader
+		 * would be more efficient than separate per-layer clear operations.
+		 */
+		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
+
+		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
+		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
+			     num_layers;
+	}
+
+	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
+			&clear_value, 4, SI_COHERENCY_CB_META);
+}
+
+/* Set the same micro tile mode as the destination of the last MSAA resolve.
+ * This allows hitting the MSAA resolve fast path, which requires that both
+ * src and dst micro tile modes match.
+ */
+static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
+					   struct si_texture *tex)
+{
+	if (tex->buffer.b.is_shared ||
+	    tex->buffer.b.b.nr_samples <= 1 ||
+	    tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
+		return; /* shared, not MSAA, or the mode already matches */
+
+	assert(sscreen->info.chip_class >= GFX9 ||
+	       tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+	assert(tex->buffer.b.b.last_level == 0);
+
+	if (sscreen->info.chip_class >= GFX9) {
+		/* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
+		assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
+
+		/* If you do swizzle_mode % 4, you'll get:
+		 *   0 = Depth
+		 *   1 = Standard,
+		 *   2 = Displayable
+		 *   3 = Rotated
+		 *
+		 * Depth-sample order isn't allowed:
+		 */
+		assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
+
+		switch (tex->last_msaa_resolve_target_micro_mode) {
+		case RADEON_MICRO_MODE_DISPLAY:
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
+			break;
+		case RADEON_MICRO_MODE_THIN:
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
+			break;
+		case RADEON_MICRO_MODE_ROTATED:
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+			break;
+		default: /* depth */
+			assert(!"unexpected micro mode");
+			return;
+		}
+	} else if (sscreen->info.chip_class >= CIK) {
+		/* These magic numbers were copied from addrlib. It doesn't use
+		 * any definitions for them either. They are all 2D_TILED_THIN1
+		 * modes with different bpp and micro tile mode.
+		 */
+		switch (tex->last_msaa_resolve_target_micro_mode) {
+		case RADEON_MICRO_MODE_DISPLAY:
+			tex->surface.u.legacy.tiling_index[0] = 10;
+			break;
+		case RADEON_MICRO_MODE_THIN:
+			tex->surface.u.legacy.tiling_index[0] = 14;
+			break;
+		case RADEON_MICRO_MODE_ROTATED:
+			tex->surface.u.legacy.tiling_index[0] = 28;
+			break;
+		default: /* depth, thick */
+			assert(!"unexpected micro mode");
+			return;
+		}
+	} else { /* SI */
+		switch (tex->last_msaa_resolve_target_micro_mode) {
+		case RADEON_MICRO_MODE_DISPLAY:
+			switch (tex->surface.bpe) {
+			case 1:
+                            tex->surface.u.legacy.tiling_index[0] = 10;
+                            break;
+			case 2:
+                            tex->surface.u.legacy.tiling_index[0] = 11;
+                            break;
+			default: /* 4, 8 */
+                            tex->surface.u.legacy.tiling_index[0] = 12;
+                            break;
+			}
+			break;
+		case RADEON_MICRO_MODE_THIN:
+			switch (tex->surface.bpe) {
+			case 1:
+                                tex->surface.u.legacy.tiling_index[0] = 14;
+                                break;
+			case 2:
+                                tex->surface.u.legacy.tiling_index[0] = 15;
+                                break;
+			case 4:
+                                tex->surface.u.legacy.tiling_index[0] = 16;
+                                break;
+			default: /* 8, 16 */
+                                tex->surface.u.legacy.tiling_index[0] = 17;
+                                break;
+			}
+			break;
+		default: /* depth, thick */
+			assert(!"unexpected micro mode");
+			return;
+		}
+	}
+
+	tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode; /* reached only after a mode was applied */
+
+	p_atomic_inc(&sscreen->dirty_tex_counter);
+}
+
+static void si_do_fast_color_clear(struct si_context *sctx,
+				   unsigned *buffers,
+				   const union pipe_color_union *color)
+{	/* Fast-clear each eligible colorbuffer selected in *buffers and remove its bit from *buffers. */
+	struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+	int i;
+
+	/* This function is broken in BE, so just disable this path for now */
+#ifdef PIPE_ARCH_BIG_ENDIAN
+	return;
+#endif
+
+	if (sctx->render_cond)
+		return;
+
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct si_texture *tex;
+		unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
+
+		if (!fb->cbufs[i])
+			continue;
+
+		/* if this colorbuffer is not being cleared */
+		if (!(*buffers & clear_bit))
+			continue;
+
+		unsigned level = fb->cbufs[i]->u.tex.level;
+		if (level > 0) /* only level 0 is fast-cleared here */
+			continue;
+
+		tex = (struct si_texture *)fb->cbufs[i]->texture;
+
+		/* TODO: GFX9: Implement DCC fast clear for level 0 of
+		 * mipmapped textures. Mipmapped DCC has to clear a rectangular
+		 * area of DCC for level 0 (because the whole miptree is
+		 * organized in a 2D plane).
+		 */
+		if (sctx->chip_class >= GFX9 &&
+		    tex->buffer.b.b.last_level > 0)
+			continue;
+
+		/* the clear is allowed if all layers are bound */
+		if (fb->cbufs[i]->u.tex.first_layer != 0 ||
+		    fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
+			continue;
+		}
+
+		/* only supported on tiled surfaces */
+		if (tex->surface.is_linear) {
+			continue;
+		}
+
+		/* shared textures can't use fast clear without an explicit flush,
+		 * because there is no way to communicate the clear color among
+		 * all clients
+		 */
+		if (tex->buffer.b.is_shared &&
+		    !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+			continue;
+
+		if (sctx->chip_class <= VI &&
+		    tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+		    !sctx->screen->info.htile_cmask_support_1d_tiling)
+			continue;
+
+		/* Use a slow clear for small surfaces where the cost of
+		 * the eliminate pass can be higher than the benefit of fast
+		 * clear. The closed driver does this, but the numbers may differ.
+		 *
+		 * This helps on both dGPUs and APUs, even small APUs like Mullins.
+		 */
+		bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
+				 tex->buffer.b.b.width0 *
+				 tex->buffer.b.b.height0 <= 512 * 512;
+		bool eliminate_needed = false;
+		bool fmask_decompress_needed = false;
+
+		/* Fast clear is the most appropriate place to enable DCC for
+		 * displayable surfaces.
+		 */
+		if (sctx->family == CHIP_STONEY && !too_small) {
+			vi_separate_dcc_try_enable(sctx, tex);
+
+			/* RB+ isn't supported with a CMASK clear only on Stoney,
+			 * so all clears are considered to be hypothetically slow
+			 * clears, which is weighed when determining whether to
+			 * enable separate DCC.
+			 */
+			if (tex->dcc_gather_statistics) /* only for Stoney */
+				tex->num_slow_clears++;
+		}
+
+		/* Try to clear DCC first, otherwise try CMASK. */
+		if (vi_dcc_enabled(tex, 0)) {
+			uint32_t reset_value;
+
+			if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
+				continue;
+
+			/* This can happen with mipmapping or MSAA. */
+			if (sctx->chip_class == VI &&
+			    !tex->surface.u.legacy.level[level].dcc_fast_clear_size)
+				continue;
+
+			if (!vi_get_fast_clear_parameters(tex->buffer.b.b.format,
+							  fb->cbufs[i]->format,
+							  color, &reset_value,
+							  &eliminate_needed))
+				continue;
+
+			if (eliminate_needed && too_small)
+				continue;
+
+			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
+			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
+				/* TODO: This doesn't work with MSAA. */
+				if (eliminate_needed)
+					continue;
+
+				uint32_t clear_value = 0xCCCCCCCC;
+				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
+						tex->cmask_offset, tex->surface.cmask_size,
+						&clear_value, 4, SI_COHERENCY_CB_META);
+				fmask_decompress_needed = true;
+			}
+
+			vi_dcc_clear_level(sctx, tex, 0, reset_value);
+			tex->separate_dcc_dirty = true;
+		} else {
+			if (too_small)
+				continue;
+
+			/* 128-bit formats are unsupported */
+			if (tex->surface.bpe > 8) {
+				continue;
+			}
+
+			/* RB+ doesn't work with CMASK fast clear on Stoney. */
+			if (sctx->family == CHIP_STONEY)
+				continue;
+
+			/* ensure CMASK is enabled */
+			si_alloc_separate_cmask(sctx->screen, tex);
+			if (!tex->cmask_buffer) /* allocation failed or no CMASK: leave it to the slow clear */
+				continue;
+
+			/* Do the fast clear. */
+			uint32_t clear_value = 0;
+			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
+					tex->cmask_offset, tex->surface.cmask_size,
+					&clear_value, 4, SI_COHERENCY_CB_META);
+			eliminate_needed = true;
+		}
+
+		if ((eliminate_needed || fmask_decompress_needed) &&
+		    !(tex->dirty_level_mask & (1 << level))) {
+			tex->dirty_level_mask |= 1 << level;
+			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
+		}
+
+		/* We can change the micro tile mode before a full clear. */
+		si_set_optimal_micro_tile_mode(sctx->screen, tex);
+
+		*buffers &= ~clear_bit; /* this buffer no longer needs the slow clear */
+
+		if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
+			sctx->framebuffer.dirty_cbufs |= 1 << i;
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+		}
+	}
+}
+
+static void si_clear(struct pipe_context *ctx, unsigned buffers,
+		     const union pipe_color_union *color,
+		     double depth, unsigned stencil)
+{	/* Clear hook: try the fast clears first, then clear whatever is left in "buffers" with the blitter. */
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+	struct pipe_surface *zsbuf = fb->zsbuf;
+	struct si_texture *zstex =
+		zsbuf ? (struct si_texture*)zsbuf->texture : NULL;
+
+	if (buffers & PIPE_CLEAR_COLOR) {
+		si_do_fast_color_clear(sctx, &buffers, color);
+		if (!buffers)
+			return; /* all buffers have been fast cleared */
+
+		/* These buffers cannot use fast clear, make sure to disable expansion. */
+		for (unsigned i = 0; i < fb->nr_cbufs; i++) {
+			struct si_texture *tex;
+
+			/* If not clearing this buffer, skip. */
+			if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
+				continue;
+
+			tex = (struct si_texture *)fb->cbufs[i]->texture;
+			if (tex->surface.fmask_size == 0)
+				tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
+		}
+	}
+
+	if (zstex &&
+	    si_htile_enabled(zstex, zsbuf->u.tex.level) &&
+	    zsbuf->u.tex.first_layer == 0 &&
+	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
+		/* TC-compatible HTILE only supports depth clears to 0 or 1. */
+		if (buffers & PIPE_CLEAR_DEPTH &&
+		    (!zstex->tc_compatible_htile ||
+		     depth == 0 || depth == 1)) {
+			/* Need to disable EXPCLEAR temporarily if clearing to a new
+			 * value (compared as float, like the DB_DEPTH_CLEAR update). */
+			if (!zstex->depth_cleared || zstex->depth_clear_value != (float)depth) {
+				sctx->db_depth_disable_expclear = true;
+			}
+
+			if (zstex->depth_clear_value != (float)depth) {
+				/* Update DB_DEPTH_CLEAR. */
+				zstex->depth_clear_value = depth;
+				sctx->framebuffer.dirty_zsbuf = true;
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+			}
+			sctx->db_depth_clear = true;
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+		}
+
+		/* TC-compatible HTILE only supports stencil clears to 0. */
+		if (buffers & PIPE_CLEAR_STENCIL &&
+		    (!zstex->tc_compatible_htile || stencil == 0)) {
+			stencil &= 0xff;
+
+			/* Need to disable EXPCLEAR temporarily if clearing
+			 * to a new value. */
+			if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
+				sctx->db_stencil_disable_expclear = true;
+			}
+
+			if (zstex->stencil_clear_value != (uint8_t)stencil) {
+				/* Update DB_STENCIL_CLEAR. */
+				zstex->stencil_clear_value = stencil;
+				sctx->framebuffer.dirty_zsbuf = true;
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+			}
+			sctx->db_stencil_clear = true;
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+		}
+
+		/* TODO: Find out what's wrong here. Fast depth clear leads to
+		 * corruption in ARK: Survival Evolved, but that may just be
+		 * a coincidence and the root cause is elsewhere.
+		 *
+		 * The corruption can be fixed by putting the DB flush before
+		 * or after the depth clear. (surprisingly)
+		 *
+		 * https://bugs.freedesktop.org/show_bug.cgi?id=102955 (apitrace)
+		 *
+		 * This hack decreases back-to-back ClearDepth performance.
+		 */
+		if ((sctx->db_depth_clear || sctx->db_stencil_clear) &&
+		    sctx->screen->clear_db_cache_before_clear)
+			sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+	}
+
+	si_blitter_begin(sctx, SI_CLEAR);
+	util_blitter_clear(sctx->blitter, fb->width, fb->height,
+			   util_framebuffer_get_num_layers(fb),
+			   buffers, color, depth, stencil);
+	si_blitter_end(sctx);
+
+	if (sctx->db_depth_clear) { /* set only in the zstex branch above, so zstex is non-NULL */
+		sctx->db_depth_clear = false;
+		sctx->db_depth_disable_expclear = false;
+		zstex->depth_cleared = true;
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+	}
+
+	if (sctx->db_stencil_clear) { /* likewise set only when zstex is non-NULL */
+		sctx->db_stencil_clear = false;
+		sctx->db_stencil_disable_expclear = false;
+		zstex->stencil_cleared = true;
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+	}
+}
+
+static void si_clear_render_target(struct pipe_context *ctx,
+				   struct pipe_surface *dst,
+				   const union pipe_color_union *color,
+				   unsigned dstx, unsigned dsty,
+				   unsigned width, unsigned height,
+				   bool render_condition_enabled)
+{	/* Clear a width x height rectangle of "dst" at (dstx, dsty) to "color" through the blitter. */
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	si_blitter_begin(sctx, SI_CLEAR_SURFACE |
+			 (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); /* disable the render condition unless requested */
+	util_blitter_clear_render_target(sctx->blitter, dst, color,
+					 dstx, dsty, width, height);
+	si_blitter_end(sctx);
+}
+
+static void si_clear_depth_stencil(struct pipe_context *ctx,
+				   struct pipe_surface *dst,
+				   unsigned clear_flags,
+				   double depth,
+				   unsigned stencil,
+				   unsigned dstx, unsigned dsty,
+				   unsigned width, unsigned height,
+				   bool render_condition_enabled)
+{	/* Clear depth and/or stencil (per clear_flags) in a rectangle of "dst" through the blitter. */
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	si_blitter_begin(sctx, SI_CLEAR_SURFACE |
+			 (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); /* disable the render condition unless requested */
+	util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil,
+					 dstx, dsty, width, height);
+	si_blitter_end(sctx);
+}
+
+static void si_clear_texture(struct pipe_context *pipe,
+			     struct pipe_resource *tex,
+			     unsigned level,
+			     const struct pipe_box *box,
+			     const void *data)
+{	/* Clear "box" of mip "level" to the single texel at "data", using a temporary surface. */
+	struct pipe_screen *screen = pipe->screen;
+	struct si_texture *stex = (struct si_texture*)tex;
+	struct pipe_surface tmpl = {{0}};
+	struct pipe_surface *sf;
+	const struct util_format_description *desc =
+		util_format_description(tex->format);
+
+	tmpl.format = tex->format;
+	tmpl.u.tex.first_layer = box->z;
+	tmpl.u.tex.last_layer = box->z + box->depth - 1; /* inclusive last layer */
+	tmpl.u.tex.level = level;
+	sf = pipe->create_surface(pipe, tex, &tmpl);
+	if (!sf) /* surface creation failed: silently skip the clear */
+		return;
+
+	if (stex->is_depth) {
+		unsigned clear;
+		float depth;
+		uint8_t stencil = 0;
+
+		/* Depth is always present. */
+		clear = PIPE_CLEAR_DEPTH;
+		desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+		if (stex->surface.has_stencil) {
+			clear |= PIPE_CLEAR_STENCIL;
+			desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+		}
+
+		si_clear_depth_stencil(pipe, sf, clear, depth, stencil,
+				       box->x, box->y,
+				       box->width, box->height, false);
+	} else {
+		union pipe_color_union color;
+
+		/* pipe_color_union requires the full vec4 representation. */
+		if (util_format_is_pure_uint(tex->format))
+			desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1);
+		else if (util_format_is_pure_sint(tex->format))
+			desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1);
+		else
+			desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
+
+		if (screen->is_format_supported(screen, tex->format,
+						tex->target, 0, 0,
+						PIPE_BIND_RENDER_TARGET)) {
+			si_clear_render_target(pipe, sf, &color,
+					       box->x, box->y,
+					       box->width, box->height, false);
+		} else {
+			/* Software fallback - just for R9G9B9E5_FLOAT */
+			util_clear_render_target(pipe, sf, &color,
+						 box->x, box->y,
+						 box->width, box->height);
+		}
+	}
+	pipe_surface_reference(&sf, NULL); /* drop the temporary surface */
+}
+
+void si_init_clear_functions(struct si_context *sctx)
+{	/* Install this file's clear callbacks on sctx->b. */
+	sctx->b.clear = si_clear;
+	sctx->b.clear_render_target = si_clear_render_target;
+	sctx->b.clear_depth_stencil = si_clear_depth_stencil;
+	sctx->b.clear_texture = si_clear_texture;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c
new file mode 100644
index 000000000..20e4f591f
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+
+/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
+ * and L2_STREAM for src.
+ */
+static enum si_cache_policy get_cache_policy(struct si_context *sctx,
+					     enum si_coherency coher,
+					     uint64_t size)
+{
+	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+					  coher == SI_COHERENCY_CP)) ||
+	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+
+	return L2_BYPASS;
+}
+
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy)
+{
+	switch (coher) {
+	default:
+	case SI_COHERENCY_NONE:
+	case SI_COHERENCY_CP:
+		return 0;
+	case SI_COHERENCY_SHADER:
+		return SI_CONTEXT_INV_SMEM_L1 |
+		       SI_CONTEXT_INV_VMEM_L1 |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+	case SI_COHERENCY_CB_META:
+		return SI_CONTEXT_FLUSH_AND_INV_CB;
+	}
+}
+
+static void si_compute_do_clear_or_copy(struct si_context *sctx,
+					struct pipe_resource *dst,
+					unsigned dst_offset,
+					struct pipe_resource *src,
+					unsigned src_offset,
+					unsigned size,
+					const uint32_t *clear_value,
+					unsigned clear_value_size,
+					enum si_coherency coher)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	assert(src_offset % 4 == 0);
+	assert(dst_offset % 4 == 0);
+	assert(size % 4 == 0);
+
+	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+	assert(!src || src_offset + size <= src->width0);
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_shader_buffer saved_sb[2] = {};
+	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+	/* The memory accesses are coalesced, meaning that the 1st instruction writes
+	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+	 * writes the 2nd contiguous block of data, etc.
+	 */
+	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
+					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
+	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+	unsigned dwords_per_wave = dwords_per_thread * 64;
+
+	unsigned num_dwords = size / 4;
+	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+	struct pipe_grid_info info = {};
+	info.block[0] = MIN2(64, num_instructions);
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+
+	struct pipe_shader_buffer sb[2] = {};
+	sb[0].buffer = dst;
+	sb[0].buffer_offset = dst_offset;
+	sb[0].buffer_size = size;
+
+	if (src) {
+		sb[1].buffer = src;
+		sb[1].buffer_offset = src_offset;
+		sb[1].buffer_size = size;
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+	} else {
+		assert(clear_value_size >= 4 &&
+		       clear_value_size <= 16 &&
+		       util_is_power_of_two_or_zero(clear_value_size));
+
+		for (unsigned i = 0; i < 4; i++)
+			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+
+	if (cache_policy != L2_BYPASS)
+		r600_resource(dst)->TC_L2_dirty = true;
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+}
+
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher)
+{
+	if (!size)
+		return;
+
+	unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+	assert(offset % clear_alignment == 0);
+	assert(size % clear_alignment == 0);
+	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+	/* Reduce a large clear value size if possible. */
+	if (clear_value_size > 4) {
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (unsigned i = 1; i < clear_value_size / 4; i++) {
+			if (clear_value[0] != clear_value[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+		}
+		if (clear_dword_duplicated)
+			clear_value_size = 4;
+	}
+
+	/* Expand a small clear value size. */
+	uint32_t tmp_clear_value;
+	if (clear_value_size <= 2) {
+		if (clear_value_size == 1) {
+			tmp_clear_value = *(uint8_t*)clear_value;
+			tmp_clear_value |= (tmp_clear_value << 8) |
+					   (tmp_clear_value << 16) |
+					   (tmp_clear_value << 24);
+		} else {
+			tmp_clear_value = *(uint16_t*)clear_value;
+			tmp_clear_value |= tmp_clear_value << 16;
+		}
+		clear_value = &tmp_clear_value;
+		clear_value_size = 4;
+	}
+
+	/* Use transform feedback for 12-byte clears. */
+	/* TODO: Use compute. */
+	if (clear_value_size == 12) {
+		union pipe_color_union streamout_clear_value;
+
+		memcpy(&streamout_clear_value, clear_value, clear_value_size);
+		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
+		util_blitter_clear_buffer(sctx->blitter, dst, offset,
+					  size, clear_value_size / 4,
+					  &streamout_clear_value);
+		si_blitter_end(sctx);
+		return;
+	}
+
+	uint64_t aligned_size = size & ~3ull;
+	if (aligned_size >= 4) {
+		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
+		 * use CP DMA clears on those chips, because we can't be certain
+		 * about buffer placements.
+		 */
+		if (clear_value_size > 4 ||
+		    (clear_value_size == 4 &&
+		     offset % 4 == 0 &&
+		     (size > 32*1024 || sctx->chip_class <= VI))) {
+			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
+						    aligned_size, clear_value,
+						    clear_value_size, coher);
+		} else {
+			assert(clear_value_size == 4);
+			si_cp_dma_clear_buffer(sctx, dst, offset,
+					       aligned_size, *clear_value, coher,
+					       get_cache_policy(sctx, coher, size));
+		}
+
+		offset += aligned_size;
+		size -= aligned_size;
+	}
+
+	/* Handle non-dword alignment. */
+	if (size) {
+		assert(dst);
+		assert(dst->target == PIPE_BUFFER);
+		assert(size < 4);
+
+		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+	}
+}
+
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value,
+				 int clear_value_size)
+{
+	enum si_coherency coher;
+
+	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
+		coher = SI_COHERENCY_CP;
+	else
+		coher = SI_COHERENCY_SHADER;
+
+	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
+			clear_value_size, coher);
+}
+
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
+{
+	if (!size)
+		return;
+
+	enum si_coherency coher = SI_COHERENCY_SHADER;
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+	/* Only use compute for VRAM copies on dGPUs. */
+	if (sctx->screen->info.has_dedicated_vram &&
+	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+	    size > 32 * 1024 &&
+	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
+		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
+					    size, NULL, 0, coher);
+	} else {
+		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+				      0, coher, cache_policy);
+	}
+}
+
+void si_init_compute_blit_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_pipe_clear_buffer;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c b/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c
new file mode 100644
index 000000000..ffa2f5ae6
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+
+static void si_dma_emit_wait_idle(struct si_context *sctx)
+{
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
+
+	/* NOP waits for idle. */
+	if (sctx->chip_class >= CIK)
+		radeon_emit(cs, 0x00000000); /* NOP */
+	else
+		radeon_emit(cs, 0xf0000000); /* NOP */
+}
+
+void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
+			   uint64_t offset)
+{
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
+	uint64_t va = dst->gpu_address + offset;
+
+	if (sctx->chip_class == SI) {
+		unreachable("SI DMA doesn't support the timestamp packet.");
+		return;
+	}
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&dst->valid_buffer_range, offset, offset + 8);
+
+	assert(va % 8 == 0);
+
+	si_need_dma_space(sctx, 4, dst, NULL);
+	si_dma_emit_wait_idle(sctx);
+
+	radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
+					SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
+					0));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+}
+
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			  uint64_t offset, uint64_t size, unsigned clear_value)
+{
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
+	unsigned i, ncopy, csize;
+	struct r600_resource *rdst = r600_resource(dst);
+
+	assert(offset % 4 == 0);
+	assert(size);
+	assert(size % 4 == 0);
+
+	if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+		sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
+		return;
+	}
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
+
+	offset += rdst->gpu_address;
+
+	if (sctx->chip_class == SI) {
+		/* the same maximum size as for copying */
+		ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+		si_need_dma_space(sctx, ncopy * 4, rdst, NULL);
+
+		for (i = 0; i < ncopy; i++) {
+			csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+			radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
+						      csize / 4));
+			radeon_emit(cs, offset);
+			radeon_emit(cs, clear_value);
+			radeon_emit(cs, (offset >> 32) << 16);
+			offset += csize;
+			size -= csize;
+		}
+		return;
+	}
+
+	/* The following code is for CI, VI, Vega/Raven, etc. */
+	/* the same maximum size as for copying */
+	ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+	si_need_dma_space(sctx, ncopy * 5, rdst, NULL);
+
+	for (i = 0; i < ncopy; i++) {
+		csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+		radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
+						0x8000 /* dword copy */));
+		radeon_emit(cs, offset);
+		radeon_emit(cs, offset >> 32);
+		radeon_emit(cs, clear_value);
+		radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
+		offset += csize;
+		size -= csize;
+	}
+}
+
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
+		       struct r600_resource *dst, struct r600_resource *src)
+{
+	uint64_t vram = ctx->dma_cs->used_vram;
+	uint64_t gtt = ctx->dma_cs->used_gart;
+
+	if (dst) {
+		vram += dst->vram_usage;
+		gtt += dst->gart_usage;
+	}
+	if (src) {
+		vram += src->vram_usage;
+		gtt += src->gart_usage;
+	}
+
+	/* Flush the GFX IB if DMA depends on it. */
+	if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+	    ((dst &&
+	      ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
+						 RADEON_USAGE_READWRITE)) ||
+	     (src &&
+	      ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
+						 RADEON_USAGE_WRITE))))
+		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+	/* Flush if there's not enough space, or if the memory usage per IB
+	 * is too large.
+	 *
+	 * IBs using too little memory are limited by the IB submission overhead.
+	 * IBs using too much memory are limited by the kernel/TTM overhead.
+	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+	 *
+	 * This heuristic makes sure that DMA requests are executed
+	 * very soon after the call is made and lowers memory usage.
+	 * It improves texture upload performance by keeping the DMA
+	 * engine busy while uploads are being submitted.
+	 */
+	num_dw++; /* for emit_wait_idle below */
+	if (!ctx->ws->cs_check_space(ctx->dma_cs, num_dw) ||
+	    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+		assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
+	}
+
+	/* Wait for idle if either buffer has been used in the IB before to
+	 * prevent read-after-write hazards.
+	 */
+	if ((dst &&
+	     ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
+						RADEON_USAGE_READWRITE)) ||
+	    (src &&
+	     ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
+						RADEON_USAGE_WRITE)))
+		si_dma_emit_wait_idle(ctx);
+
+	if (dst) {
+		radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
+					  RADEON_USAGE_WRITE, 0);
+	}
+	if (src) {
+		radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
+					  RADEON_USAGE_READ, 0);
+	}
+
+	/* this function is called before all DMA calls, so increment this. */
+	ctx->num_dma_calls++;
+}
+
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
+		     struct pipe_fence_handle **fence)
+{
+	struct radeon_cmdbuf *cs = ctx->dma_cs;
+	struct radeon_saved_cs saved;
+	bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
+
+	if (!radeon_emitted(cs, 0)) {
+		if (fence)
+			ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+		return;
+	}
+
+	if (check_vm)
+		si_save_cs(ctx->ws, cs, &saved, true);
+
+	ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
+	if (fence)
+		ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+
+	if (check_vm) {
+		/* Use conservative timeout 800ms, after which we won't wait any
+		 * longer and assume the GPU is hung.
+		 */
+		ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);
+
+		si_check_vm_faults(ctx, &saved, RING_DMA);
+		si_clear_saved_cs(&saved);
+	}
+}
+
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
+			    uint64_t offset, uint64_t size, unsigned value)
+{
+	struct si_context *ctx = (struct si_context*)sscreen->aux_context;
+
+	mtx_lock(&sscreen->aux_context_lock);
+	si_sdma_clear_buffer(ctx, dst, offset, size, value);
+	sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+	mtx_unlock(&sscreen->aux_context_lock);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h b/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
index 532151125..38f5c3dc7 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
@@ -16,9 +16,11 @@ DRI_CONF_BEGIN
       DRI_CONF_MESA_NO_ERROR("false")
       DRI_CONF_DISABLE_EXT_BUFFER_AGE("false")
       DRI_CONF_DISABLE_OML_SYNC_CONTROL("false")
+      DRI_CONF_DISABLE_SGI_VIDEO_SYNC("false")
       DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
       DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
       DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+      DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_PP_CELSHADE(0)
@@ -32,18 +34,22 @@ DRI_CONF_BEGIN
       DRI_CONF_FORCE_GLSL_EXTENSIONS_WARN("false")
       DRI_CONF_DISABLE_GLSL_LINE_CONTINUATIONS("false")
       DRI_CONF_DISABLE_BLEND_FUNC_EXTENDED("false")
-      DRI_CONF_DISABLE_SHADER_BIT_ENCODING("false")
       DRI_CONF_FORCE_GLSL_VERSION(0)
       DRI_CONF_ALLOW_GLSL_EXTENSION_DIRECTIVE_MIDSHADER("false")
+      DRI_CONF_ALLOW_GLSL_BUILTIN_CONST_EXPRESSION("false")
+      DRI_CONF_ALLOW_GLSL_RELAXED_ES("false")
       DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION("false")
       DRI_CONF_ALLOW_GLSL_CROSS_STAGE_INTERPOLATION_MISMATCH("false")
       DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION("false")
       DRI_CONF_FORCE_GLSL_ABS_SQRT("false")
       DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD("false")
+      DRI_CONF_ALLOW_GLSL_LAYOUT_QUALIFIER_ON_FUNCTION_PARAMETERS("false")
+      DRI_CONF_FORCE_COMPAT_PROFILE("false")
       DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false")
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_MISCELLANEOUS
       DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false")
       DRI_CONF_GLSL_ZERO_INIT("false")
+      DRI_CONF_ALLOW_RGB10_CONFIGS("true")
    DRI_CONF_SECTION_END
 DRI_CONF_END
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c b/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c
new file mode 100644
index 000000000..3f22ee31a
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c
@@ -0,0 +1,656 @@
+/*
+ * Copyright 2013-2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <libsync.h>
+
+#include "util/os_time.h"
+#include "util/u_memory.h"
+#include "util/u_queue.h"
+#include "util/u_upload_mgr.h"
+
+#include "si_build_pm4.h"
+
+struct si_fine_fence {
+	struct r600_resource *buf;
+	unsigned offset;
+};
+
+struct si_multi_fence {
+	struct pipe_reference reference;
+	struct pipe_fence_handle *gfx;
+	struct pipe_fence_handle *sdma;
+	struct tc_unflushed_batch_token *tc_token;
+	struct util_queue_fence ready;
+
+	/* If the context wasn't flushed at fence creation, this is non-NULL. */
+	struct {
+		struct si_context *ctx;
+		unsigned ib_index;
+	} gfx_unflushed;
+
+	struct si_fine_fence fine;
+};
+
+/**
+ * Write an EOP event.
+ *
+ * \param event		EVENT_TYPE_*
+ * \param event_flags	Optional cache flush flags (TC)
+ * \param dst_sel       MEM or TC_L2
+ * \param int_sel       NONE or SEND_DATA_AFTER_WR_CONFIRM
+ * \param data_sel	DISCARD, VALUE_32BIT, TIMESTAMP, or GDS
+ * \param buf		Buffer
+ * \param va		GPU address
+ * \param new_fence	Fence value to write for this event.
+ * \param query_type	PIPE_QUERY_*; on GFX9, non-occlusion types get a ZPASS_DONE first.
+ */
+void si_cp_release_mem(struct si_context *ctx,
+		       unsigned event, unsigned event_flags,
+		       unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+		       struct r600_resource *buf, uint64_t va,
+		       uint32_t new_fence, unsigned query_type)
+{
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
+	unsigned op = EVENT_TYPE(event) |
+		      EVENT_INDEX(event == V_028A90_CS_DONE ||
+				  event == V_028A90_PS_DONE ? 6 : 5) |
+		      event_flags;
+	unsigned sel = EOP_DST_SEL(dst_sel) |
+		       EOP_INT_SEL(int_sel) |
+		       EOP_DATA_SEL(data_sel);
+
+	if (ctx->chip_class >= GFX9) {
+		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+		 * counters) must immediately precede every timestamp event to
+		 * prevent a GPU hang on GFX9.
+		 *
+		 * Occlusion queries don't need to do it here, because they
+		 * always do ZPASS_DONE before the timestamp.
+		 */
+		if (ctx->chip_class == GFX9 &&
+		    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
+		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
+		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+			struct r600_resource *scratch = ctx->eop_bug_scratch;
+
+			assert(16 * ctx->screen->info.num_render_backends <=
+			       scratch->b.b.width0);
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+			radeon_emit(cs, scratch->gpu_address);
+			radeon_emit(cs, scratch->gpu_address >> 32);
+
+			radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
+						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+		}
+
+		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
+		radeon_emit(cs, op);
+		radeon_emit(cs, sel);
+		radeon_emit(cs, va);		/* address lo */
+		radeon_emit(cs, va >> 32);	/* address hi */
+		radeon_emit(cs, new_fence);	/* immediate data lo */
+		radeon_emit(cs, 0); /* immediate data hi */
+		radeon_emit(cs, 0); /* unused */
+	} else {
+		if (ctx->chip_class == CIK ||
+		    ctx->chip_class == VI) {
+			struct r600_resource *scratch = ctx->eop_bug_scratch;
+			uint64_t scratch_va = scratch->gpu_address; /* distinct name: must not shadow the 'va' parameter */
+
+			/* Two EOP events are required to make all engines go idle
+			 * (and optional cache flushes executed) before the timestamp
+			 * is written.
+			 */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+			radeon_emit(cs, op);
+			radeon_emit(cs, scratch_va);
+			radeon_emit(cs, ((scratch_va >> 32) & 0xffff) | sel);
+			radeon_emit(cs, 0); /* immediate data */
+			radeon_emit(cs, 0); /* unused */
+
+			radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
+						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+		}
+
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+		radeon_emit(cs, op);
+		radeon_emit(cs, va);
+		radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+		radeon_emit(cs, new_fence); /* immediate data */
+		radeon_emit(cs, 0); /* unused */
+	}
+
+	if (buf) {
+		radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE,
+					  RADEON_PRIO_QUERY);
+	}
+}
+
+unsigned si_cp_write_fence_dwords(struct si_screen *screen)
+{
+	unsigned dwords = 6;
+
+	if (screen->info.chip_class == CIK ||
+	    screen->info.chip_class == VI)
+		dwords *= 2;
+
+	return dwords;
+}
+
+void si_cp_wait_mem(struct si_context *ctx,
+		    uint64_t va, uint32_t ref, uint32_t mask, unsigned flags)
+{
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1) | flags);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, ref); /* reference value */
+	radeon_emit(cs, mask); /* mask */
+	radeon_emit(cs, 4); /* poll interval */
+}
+
+static void si_add_fence_dependency(struct si_context *sctx,
+				    struct pipe_fence_handle *fence)
+{
+	struct radeon_winsys *ws = sctx->ws;
+
+	if (sctx->dma_cs)
+		ws->cs_add_fence_dependency(sctx->dma_cs, fence);
+	ws->cs_add_fence_dependency(sctx->gfx_cs, fence);
+}
+
+static void si_add_syncobj_signal(struct si_context *sctx,
+				  struct pipe_fence_handle *fence)
+{
+	sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
+}
+
+static void si_fence_reference(struct pipe_screen *screen,
+			       struct pipe_fence_handle **dst,
+			       struct pipe_fence_handle *src)
+{
+	struct radeon_winsys *ws = ((struct si_screen*)screen)->ws;
+	struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
+	struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
+
+	if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { /* true: tear down the fence *dst pointed to */
+		ws->fence_reference(&(*rdst)->gfx, NULL);
+		ws->fence_reference(&(*rdst)->sdma, NULL);
+		tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
+		r600_resource_reference(&(*rdst)->fine.buf, NULL);
+		FREE(*rdst);
+	}
+	*rdst = rsrc;
+}
+
+static struct si_multi_fence *si_create_multi_fence(void)
+{
+	struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+	if (!fence)
+		return NULL; /* allocation failed */
+
+	pipe_reference_init(&fence->reference, 1); /* reference initialised with the value 1 */
+	util_queue_fence_init(&fence->ready);
+
+	return fence;
+}
+
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+					  struct tc_unflushed_batch_token *tc_token)
+{
+	struct si_multi_fence *fence = si_create_multi_fence();
+	if (!fence)
+		return NULL;
+
+	util_queue_fence_reset(&fence->ready);
+	tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
+
+	return (struct pipe_fence_handle *)fence;
+}
+
+static bool si_fine_fence_signaled(struct radeon_winsys *rws,
+				   const struct si_fine_fence *fine)
+{
+	char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
+							  PIPE_TRANSFER_UNSYNCHRONIZED);
+	if (!map)
+		return false;
+
+	uint32_t *fence = (uint32_t*)(map + fine->offset);
+	return *fence != 0;
+}
+
+static void si_fine_fence_set(struct si_context *ctx,
+			      struct si_fine_fence *fine,
+			      unsigned flags)
+{
+	uint32_t *fence_ptr;
+
+	assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
+
+	/* Allocate the fence dword from the cached GTT allocator. */
+	u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4,
+		       &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
+	if (!fine->buf)
+		return;
+
+	*fence_ptr = 0;
+
+	uint64_t fence_va = fine->buf->gpu_address + fine->offset;
+
+	radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+	if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
+		struct radeon_cmdbuf *cs = ctx->gfx_cs;
+		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+			S_370_WR_CONFIRM(1) |
+			S_370_ENGINE_SEL(V_370_PFP));
+		radeon_emit(cs, fence_va);
+		radeon_emit(cs, fence_va >> 32);
+		radeon_emit(cs, 0x80000000);
+	} else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
+		si_cp_release_mem(ctx,
+				  V_028A90_BOTTOM_OF_PIPE_TS, 0,
+				  EOP_DST_SEL_MEM,
+				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+				  EOP_DATA_SEL_VALUE_32BIT,
+				  NULL, fence_va, 0x80000000,
+				  PIPE_QUERY_GPU_FINISHED);
+	} else {
+		assert(false);
+	}
+}
+
+static boolean si_fence_finish(struct pipe_screen *screen,
+			       struct pipe_context *ctx,
+			       struct pipe_fence_handle *fence,
+			       uint64_t timeout)
+{
+	struct radeon_winsys *rws = ((struct si_screen*)screen)->ws;
+	struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+	struct si_context *sctx;
+	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+	ctx = threaded_context_unwrap_sync(ctx);
+	sctx = (struct si_context*)(ctx ? ctx : NULL);
+
+	if (!util_queue_fence_is_signalled(&rfence->ready)) {
+		if (rfence->tc_token) {
+			/* Ensure that si_flush_from_st will be called for
+			 * this fence, but only if we're in the API thread
+			 * where the context is current.
+			 *
+			 * Note that the batch containing the flush may already
+			 * be in flight in the driver thread, so the fence
+			 * may not be ready yet when this call returns.
+			 */
+			threaded_context_flush(ctx, rfence->tc_token,
+					       timeout == 0);
+		}
+
+		if (!timeout)
+			return false;
+
+		if (timeout == PIPE_TIMEOUT_INFINITE) {
+			util_queue_fence_wait(&rfence->ready);
+		} else {
+			if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout))
+				return false;
+		}
+
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
+		}
+	}
+
+	if (rfence->sdma) {
+		if (!rws->fence_wait(rws, rfence->sdma, timeout))
+			return false;
+
+		/* Recompute the timeout after waiting. */
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
+		}
+	}
+
+	if (!rfence->gfx)
+		return true;
+
+	if (rfence->fine.buf &&
+	    si_fine_fence_signaled(rws, &rfence->fine)) {
+		rws->fence_reference(&rfence->gfx, NULL);
+		r600_resource_reference(&rfence->fine.buf, NULL);
+		return true;
+	}
+
+	/* Flush the gfx IB if it hasn't been flushed yet. */
+	if (sctx && rfence->gfx_unflushed.ctx == sctx &&
+	    rfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
+		/* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
+		 * spec says:
+		 *
+		 *    "If the sync object being blocked upon will not be
+		 *     signaled in finite time (for example, by an associated
+		 *     fence command issued previously, but not yet flushed to
+		 *     the graphics pipeline), then ClientWaitSync may hang
+		 *     forever. To help prevent this behavior, if
+		 *     ClientWaitSync is called and all of the following are
+		 *     true:
+		 *
+		 *     * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
+		 *     * sync is unsignaled when ClientWaitSync is called,
+		 *     * and the calls to ClientWaitSync and FenceSync were
+		 *       issued from the same context,
+		 *
+		 *     then the GL will behave as if the equivalent of Flush
+		 *     were inserted immediately after the creation of sync."
+		 *
+		 * This means we need to flush for such fences even when we're
+		 * not going to wait.
+		 */
+		si_flush_gfx_cs(sctx,
+				(timeout ? 0 : PIPE_FLUSH_ASYNC) |
+				 RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+				NULL);
+		rfence->gfx_unflushed.ctx = NULL;
+
+		if (!timeout)
+			return false;
+
+		/* Recompute the timeout after all that. */
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
+		}
+	}
+
+	if (rws->fence_wait(rws, rfence->gfx, timeout))
+		return true;
+
+	/* Re-check in case the GPU is slow or hangs, but the commands before
+	 * the fine-grained fence have completed. */
+	if (rfence->fine.buf &&
+	    si_fine_fence_signaled(rws, &rfence->fine))
+		return true;
+
+	return false;
+}
+
+/* Import a fence from a file descriptor.
+ *
+ * On success *pfence receives the new fence. It is left NULL when the
+ * fence cannot be allocated, the screen lacks the capability required by
+ * @type, or the winsys import returns no fence.
+ */
+static void si_create_fence_fd(struct pipe_context *ctx,
+			       struct pipe_fence_handle **pfence, int fd,
+			       enum pipe_fd_type type)
+{
+	struct si_screen *screen = (struct si_screen*)ctx->screen;
+	struct radeon_winsys *winsys = screen->ws;
+	struct si_multi_fence *imported;
+
+	/* Report failure unless the import below succeeds. */
+	*pfence = NULL;
+
+	imported = si_create_multi_fence();
+	if (!imported)
+		return;
+
+	switch (type) {
+	case PIPE_FD_TYPE_NATIVE_SYNC:
+		if (screen->info.has_fence_to_handle)
+			imported->gfx = winsys->fence_import_sync_file(winsys, fd);
+		break;
+	case PIPE_FD_TYPE_SYNCOBJ:
+		if (screen->info.has_syncobj)
+			imported->gfx = winsys->fence_import_syncobj(winsys, fd);
+		break;
+	default:
+		unreachable("bad fence fd type when importing");
+	}
+
+	/* No gfx fence means nothing was imported; drop the empty wrapper. */
+	if (!imported->gfx) {
+		FREE(imported);
+		return;
+	}
+	*pfence = (struct pipe_fence_handle*)imported;
+}
+
+/* Export the fence as a sync-file fd; returns -1 on any failure. */
+static int si_fence_get_fd(struct pipe_screen *screen,
+			   struct pipe_fence_handle *fence)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct radeon_winsys *ws = sscreen->ws;
+	struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+	int gfx_fd = -1, sdma_fd = -1;
+
+	if (!sscreen->info.has_fence_to_handle)
+		return -1;
+
+	util_queue_fence_wait(&rfence->ready);
+	/* Deferred fences aren't supported. */
+	assert(!rfence->gfx_unflushed.ctx);
+	if (rfence->gfx_unflushed.ctx)
+		return -1;
+
+	if (rfence->sdma) {
+		sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
+		if (sdma_fd == -1)
+			return -1;
+	}
+	if (rfence->gfx) {
+		gfx_fd = ws->fence_export_sync_file(ws, rfence->gfx);
+		if (gfx_fd == -1) {
+			if (sdma_fd != -1)
+				close(sdma_fd);
+			return -1;
+		}
+	}
+
+	/* No FDs at this point means there are no fences either. */
+	if (sdma_fd == -1 && gfx_fd == -1)
+		return ws->export_signalled_sync_file(ws);
+	if (sdma_fd == -1 || gfx_fd == -1)
+		return sdma_fd == -1 ? gfx_fd : sdma_fd;
+
+	/* Merge both into gfx_fd; a gfx-only fd would drop the SDMA wait. */
+	if (sync_accumulate("radeonsi", &gfx_fd, sdma_fd) != 0) {
+		close(gfx_fd);
+		gfx_fd = -1;
+	}
+	close(sdma_fd);
+	return gfx_fd;
+}
+
+/* Flush the DMA and gfx command streams, optionally returning a fence.
+ * With PIPE_FLUSH_DEFERRED the gfx IB may stay unflushed and *fence is
+ * then a deferred fence (gfx_unflushed records the owning context). */
+static void si_flush_from_st(struct pipe_context *ctx,
+			     struct pipe_fence_handle **fence,
+			     unsigned flags)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct radeon_winsys *ws = sctx->ws;
+	struct pipe_fence_handle *gfx_fence = NULL;
+	struct pipe_fence_handle *sdma_fence = NULL;
+	bool deferred_fence = false;
+	struct si_fine_fence fine = {};
+	/* Always flush asynchronously; forward only the end-of-frame bit. */
+	unsigned rflags = PIPE_FLUSH_ASYNC | (flags & PIPE_FLUSH_END_OF_FRAME);
+
+	if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
+		assert(flags & PIPE_FLUSH_DEFERRED);
+		assert(fence);
+		si_fine_fence_set(sctx, &fine, flags);
+	}
+
+	/* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
+	if (sctx->dma_cs)
+		si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
+
+	if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
+		if (fence)
+			ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
+		if (!(flags & PIPE_FLUSH_DEFERRED))
+			ws->cs_sync_flush(sctx->gfx_cs);
+	} else {
+		/* Instead of flushing, create a deferred fence. Constraints:
+		 * - The state tracker must allow a deferred flush.
+		 * - The state tracker must request a fence.
+		 * - fence_get_fd is not allowed.
+		 * Thread safety in fence_finish must be ensured by the state tracker.
+		 */
+		if ((flags & PIPE_FLUSH_DEFERRED) && fence &&
+		    !(flags & PIPE_FLUSH_FENCE_FD)) {
+			gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
+			deferred_fence = true;
+		} else {
+			si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
+		}
+	}
+
+	/* Both engines can signal out of order, so we need to keep both fences. */
+	if (fence) {
+		struct si_multi_fence *multi_fence;
+
+		if (flags & TC_FLUSH_ASYNC) {
+			multi_fence = (struct si_multi_fence *)*fence;
+			assert(multi_fence);
+		} else {
+			multi_fence = si_create_multi_fence();
+			if (!multi_fence) {
+				ws->fence_reference(&sdma_fence, NULL);
+				ws->fence_reference(&gfx_fence, NULL);
+				/* No fence will own the fine-fence buffer; drop it. */
+				r600_resource_reference(&fine.buf, NULL);
+				goto finish;
+			}
+
+			ctx->screen->fence_reference(ctx->screen, fence, NULL);
+			*fence = (struct pipe_fence_handle*)multi_fence;
+		}
+
+		/* If both fences are NULL, fence_finish will always return true. */
+		multi_fence->gfx = gfx_fence;
+		multi_fence->sdma = sdma_fence;
+
+		if (deferred_fence) {
+			multi_fence->gfx_unflushed.ctx = sctx;
+			multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+		}
+
+		multi_fence->fine = fine;
+		fine.buf = NULL;
+
+		if (flags & TC_FLUSH_ASYNC) {
+			util_queue_fence_signal(&multi_fence->ready);
+			tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+		}
+	}
+	assert(!fine.buf);
+finish:
+	if (!(flags & PIPE_FLUSH_DEFERRED)) {
+		if (sctx->dma_cs)
+			ws->cs_sync_flush(sctx->dma_cs);
+		ws->cs_sync_flush(sctx->gfx_cs);
+	}
+}
+
+/* Make this context signal @fence: register the fence's syncobjs with
+ * si_add_syncobj_signal(), then flush (the comment below explains why).
+ */
+static void si_fence_server_signal(struct pipe_context *ctx,
+				   struct pipe_fence_handle *fence)
+{
+	struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	/* We should have at least one syncobj to signal */
+	assert(rfence->sdma || rfence->gfx);
+
+	if (rfence->sdma)
+		si_add_syncobj_signal(sctx, rfence->sdma);
+	if (rfence->gfx)
+		si_add_syncobj_signal(sctx, rfence->gfx);
+
+	/* The spec does not require a flush here, but a syncobj signal is
+	 * not a command in the command stream: it fires when the submission
+	 * it is associated with finishes executing.
+	 *
+	 * Flush now so that work emitted after this call cannot be executed
+	 * before the signal operation.
+	 */
+	si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+}
+
+/* Make this context wait for @fence before running later commands. */
+static void si_fence_server_sync(struct pipe_context *ctx,
+				 struct pipe_fence_handle *fence)
+{
+	struct si_multi_fence *mfence = (struct si_multi_fence *)fence;
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	util_queue_fence_wait(&mfence->ready);
+
+	/* Unflushed fences from the same context are no-ops. */
+	if (mfence->gfx_unflushed.ctx != NULL &&
+	    mfence->gfx_unflushed.ctx == sctx)
+		return;
+
+	/* Commands that are still unflushed when the dependency is added
+	 * would not start before the fence signals either, so flush them
+	 * out first.
+	 */
+	si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+
+	if (mfence->sdma)
+		si_add_fence_dependency(sctx, mfence->sdma);
+	if (mfence->gfx)
+		si_add_fence_dependency(sctx, mfence->gfx);
+}
+
+void si_init_fence_functions(struct si_context *ctx)
+{
+	ctx->b.create_fence_fd = si_create_fence_fd;	/* import from an fd */
+	ctx->b.fence_server_signal = si_fence_server_signal;
+	ctx->b.fence_server_sync = si_fence_server_sync;
+	ctx->b.flush = si_flush_from_st;		/* flush, may return a fence */
+}
+
+void si_init_screen_fence_functions(struct si_screen *screen)
+{
+	screen->b.fence_get_fd = si_fence_get_fd;	/* sync-file export */
+	screen->b.fence_reference = si_fence_reference;
+	screen->b.fence_finish = si_fence_finish;	/* wait with timeout */
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_get.c b/lib/mesa/src/gallium/drivers/radeonsi/si_get.c
new file mode 100644
index 000000000..91f38329d
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_get.c
@@ -0,0 +1,1004 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "radeon/radeon_video.h"
+#include "radeon/radeon_vce.h"
+#include "radeon/radeon_uvd_enc.h"
+#include "ac_llvm_util.h"
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
+#include "util/u_screen.h"
+#include "util/u_video.h"
+#include "compiler/nir/nir.h"
+
+#include <sys/utsname.h>
+
+static const char *si_get_vendor(struct pipe_screen *pscreen)
+{
+	/* Keep this string as it is: returning "Advanced Micro Devices, Inc."
+	 * breaks games such as Alien Isolation.
+	 */
+	return "X.Org";
+}
+
+static const char *si_get_device_vendor(struct pipe_screen *pscreen)
+{
+	return "AMD"; /* fixed string; pscreen is not consulted */
+}
+
+static const char *si_get_marketing_name(struct radeon_winsys *ws)
+{
+	/* get_chip_name is optional; a winsys that does not provide it
+	 * has no marketing name to report. */
+	return ws->get_chip_name ? ws->get_chip_name(ws) : NULL;
+}
+
+/* Integer/boolean capability query for enum pipe_cap.
+ * Caps are grouped by the value they report; anything not listed falls
+ * back to u_pipe_screen_get_param_defaults().
+ */
+static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+	struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+	switch (param) {
+	/* Boolean caps that are always supported. */
+	case PIPE_CAP_ACCELERATED:
+	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+	case PIPE_CAP_POINT_SPRITE:
+	case PIPE_CAP_OCCLUSION_QUERY:
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+	case PIPE_CAP_TEXTURE_SWIZZLE:
+	case PIPE_CAP_DEPTH_CLIP_DISABLE:
+	case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
+	case PIPE_CAP_SHADER_STENCIL_EXPORT:
+	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+	case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+	case PIPE_CAP_SM3:
+	case PIPE_CAP_SEAMLESS_CUBE_MAP:
+	case PIPE_CAP_PRIMITIVE_RESTART:
+	case PIPE_CAP_CONDITIONAL_RENDER:
+	case PIPE_CAP_TEXTURE_BARRIER:
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+	case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+	case PIPE_CAP_START_INSTANCE:
+	case PIPE_CAP_NPOT_TEXTURES:
+	case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+	case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
+	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+	case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+	case PIPE_CAP_TGSI_INSTANCEID:
+	case PIPE_CAP_COMPUTE:
+	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+	case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+	case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+	case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+	case PIPE_CAP_CUBE_MAP_ARRAY:
+	case PIPE_CAP_SAMPLE_SHADING:
+	case PIPE_CAP_DRAW_INDIRECT:
+	case PIPE_CAP_CLIP_HALFZ:
+	case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+	case PIPE_CAP_TGSI_TEXCOORD:
+	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_SHAREABLE_SHADERS:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
+	case PIPE_CAP_SAMPLER_VIEW_TARGET:
+	case PIPE_CAP_TEXTURE_QUERY_LOD:
+	case PIPE_CAP_TEXTURE_GATHER_SM5:
+	case PIPE_CAP_TGSI_TXQS:
+	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_INVALIDATE_BUFFER:
+	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+	case PIPE_CAP_QUERY_BUFFER_OBJECT:
+	case PIPE_CAP_QUERY_MEMORY_INFO:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+	case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+	case PIPE_CAP_GENERATE_MIPMAP:
+	case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
+	case PIPE_CAP_STRING_MARKER:
+	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_CULL_DISTANCE:
+	case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+	case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+	case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+	case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+	case PIPE_CAP_DOUBLES:
+	case PIPE_CAP_TGSI_TEX_TXF_LZ:
+	case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+	case PIPE_CAP_BINDLESS_TEXTURE:
+	case PIPE_CAP_QUERY_TIMESTAMP:
+	case PIPE_CAP_QUERY_TIME_ELAPSED:
+	case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
+	case PIPE_CAP_QUERY_SO_OVERFLOW:
+	case PIPE_CAP_MEMOBJ:
+	case PIPE_CAP_LOAD_CONSTBUF:
+	case PIPE_CAP_INT64:
+	case PIPE_CAP_INT64_DIVMOD:
+	case PIPE_CAP_TGSI_CLOCK:
+	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+	case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
+	case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+	case PIPE_CAP_TGSI_BALLOT:
+	case PIPE_CAP_TGSI_VOTE:
+	case PIPE_CAP_TGSI_FS_FBFETCH:
+		return 1;
+
+	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+		return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
+
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+		return sscreen->info.has_gpu_reset_status_query ||
+		       sscreen->info.has_gpu_reset_counter_query;
+
+	case PIPE_CAP_TEXTURE_MULTISAMPLE:
+		return sscreen->info.has_2d_tiling;
+
+	case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+		return SI_MAP_BUFFER_ALIGNMENT;
+
+	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+	case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+	case PIPE_CAP_MAX_VERTEX_STREAMS:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+	case PIPE_CAP_MAX_WINDOW_RECTANGLES:
+		return 4;
+
+	case PIPE_CAP_GLSL_FEATURE_LEVEL:
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+		return sscreen->info.has_indirect_compute_dispatch ? 450 : 420;
+
+	case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+		/* Optimal number for good TexSubImage performance on Polaris10. */
+		return 64 * 1024 * 1024;
+
+	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+	case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+		return MIN2(sscreen->info.max_alloc_size, INT_MAX);
+
+	case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+	case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+	case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+		return !sscreen->info.has_unaligned_shader_loads;
+
+	case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+		return sscreen->info.has_sparse_vm_mappings ?
+				RADEON_SPARSE_PAGE_SIZE : 0;
+
+	case PIPE_CAP_PACKED_UNIFORMS:
+		return (sscreen->debug_flags & DBG(NIR)) ? 1 : 0;
+
+	/* Caps that are not supported. */
+	case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+	case PIPE_CAP_USER_VERTEX_BUFFERS:
+	case PIPE_CAP_FAKE_SW_MSAA:
+	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
+	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
+	case PIPE_CAP_TGSI_MUL_ZERO_WINS:
+	case PIPE_CAP_UMA:
+	case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+	case PIPE_CAP_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_TILE_RASTER_ORDER:
+	case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+	case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+	case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
+		return 0;
+
+	case PIPE_CAP_FENCE_SIGNAL:
+		return sscreen->info.has_syncobj;
+
+	case PIPE_CAP_CONSTBUF0_FLAGS:
+		return SI_RESOURCE_FLAG_32BIT;
+
+	case PIPE_CAP_NATIVE_FENCE_FD:
+		return sscreen->info.has_fence_to_handle;
+
+	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+		return sscreen->has_draw_indirect_multi;
+
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+		return 30;
+
+	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+		return sscreen->info.chip_class <= VI ?
+			PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
+
+	/* Stream output limits. */
+	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+		return 32*4;
+
+	/* Geometry shader output limits. */
+	case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+		return 1024;
+	case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+		return 4095;
+	case PIPE_CAP_MAX_GS_INVOCATIONS:
+		/* The closed driver exposes 127, but 125 is the greatest
+		 * number that works. */
+		return 125;
+
+	case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+		return 2048;
+
+	/* Texture limits. */
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 15; /* 16384 */
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		/* textures support 8192, but layered rendering supports 2048 */
+		return 12;
+	case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+		/* textures support 8192, but layered rendering supports 2048 */
+		return 2048;
+
+	/* Viewport and render target limits. */
+	case PIPE_CAP_MAX_VIEWPORTS:
+		return SI_MAX_VIEWPORTS;
+	case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
+	case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 8;
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+		return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
+
+	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+	case PIPE_CAP_MIN_TEXEL_OFFSET:
+		return -32;
+
+	case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+	case PIPE_CAP_MAX_TEXEL_OFFSET:
+		return 31;
+
+	case PIPE_CAP_ENDIANNESS:
+		return PIPE_ENDIAN_LITTLE;
+
+	case PIPE_CAP_VENDOR_ID:
+		return ATI_VENDOR_ID;
+	case PIPE_CAP_DEVICE_ID:
+		return sscreen->info.pci_id;
+	case PIPE_CAP_VIDEO_MEMORY:
+		return sscreen->info.vram_size >> 20;
+	case PIPE_CAP_PCI_GROUP:
+		return sscreen->info.pci_domain;
+	case PIPE_CAP_PCI_BUS:
+		return sscreen->info.pci_bus;
+	case PIPE_CAP_PCI_DEVICE:
+		return sscreen->info.pci_dev;
+	case PIPE_CAP_PCI_FUNCTION:
+		return sscreen->info.pci_func;
+
+	default:
+		return u_pipe_screen_get_param_defaults(pscreen, param);
+	}
+}
+
+/* Float capability query; caps that are not listed report 0. */
+static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param)
+{
+	switch (param) {
+	case PIPE_CAPF_MAX_LINE_WIDTH:
+	case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+		/* The limit depends on the quant mode, but how exactly the
+		 * two interact is unknown. */
+		return 2048;
+	case PIPE_CAPF_MAX_POINT_WIDTH:
+	case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+		return SI_MAX_POINT_SIZE;
+	case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+		return 16.0f;
+	case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+		return 0.0f;
+	}
+	return 0.0f;
+}
+
+/* Per-stage shader capability query.
+ * Graphics stages share one table; compute answers a couple of caps on
+ * its own and otherwise uses the same table. Unknown stages report 0.
+ */
+static int si_get_shader_param(struct pipe_screen* pscreen,
+			       enum pipe_shader_type shader,
+			       enum pipe_shader_cap param)
+{
+	struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+	/* Stage filter; compute handles a few caps before the shared table. */
+	switch (shader) {
+	case PIPE_SHADER_FRAGMENT:
+	case PIPE_SHADER_VERTEX:
+	case PIPE_SHADER_GEOMETRY:
+	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
+		break;
+	case PIPE_SHADER_COMPUTE:
+		switch (param) {
+		case PIPE_SHADER_CAP_SUPPORTED_IRS: {
+			int irs = 1 << PIPE_SHADER_IR_NATIVE;
+
+			if (sscreen->info.has_indirect_compute_dispatch)
+				irs |= 1 << PIPE_SHADER_IR_TGSI;
+
+			return irs;
+		}
+
+		case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
+			uint64_t max_alloc;
+			pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_TGSI,
+				PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+				&max_alloc);
+			return MIN2(max_alloc, INT_MAX);
+		}
+		default:
+			/* Compute needs no special value for this cap, so
+			 * fall through to the table shared with the other
+			 * shader types. */
+			break;
+		}
+		break;
+	default:
+		return 0;
+	}
+
+	switch (param) {
+	/* Shader limits. */
+	case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+	case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+	case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+	case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+	case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+		return 16384;
+	case PIPE_SHADER_CAP_MAX_INPUTS:
+		return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
+	case PIPE_SHADER_CAP_MAX_OUTPUTS:
+		return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
+	case PIPE_SHADER_CAP_MAX_TEMPS:
+		return 256; /* Max native temporaries. */
+	case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+		return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
+	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+		return SI_NUM_CONST_BUFFERS;
+	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+		return SI_NUM_SAMPLERS;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return SI_NUM_SHADER_BUFFERS;
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+		return SI_NUM_IMAGES;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return (sscreen->debug_flags & DBG(NIR)) ? 0 : 32;
+	case PIPE_SHADER_CAP_PREFERRED_IR:
+		return (sscreen->debug_flags & DBG(NIR)) ?
+			PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+		return 4;
+
+	/* Boolean features that are always supported. */
+	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+	case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+	case PIPE_SHADER_CAP_INTEGERS:
+	case PIPE_SHADER_CAP_INT64_ATOMICS:
+	case PIPE_SHADER_CAP_FP16:
+	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+	case PIPE_SHADER_CAP_SCALAR_ISA:
+		return 1;
+
+	case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+		/* TODO: Indirect indexing of GS inputs is unimplemented. */
+		if (shader == PIPE_SHADER_GEOMETRY)
+			return 0;
+
+		if (shader == PIPE_SHADER_VERTEX &&
+		    !sscreen->llvm_has_working_vgpr_indexing)
+			return 0;
+
+		/* Doing indirect indexing on GFX9 with LLVM 6.0 hangs.
+		 * This means we don't support INTERP instructions with
+		 * indirect indexing on inputs.
+		 */
+		if (shader == PIPE_SHADER_FRAGMENT &&
+		    !sscreen->llvm_has_working_vgpr_indexing &&
+		    HAVE_LLVM < 0x0700)
+			return 0;
+
+		/* TCS and TES load inputs directly from LDS or offchip
+		 * memory, so indirect indexing is always supported.
+		 * PS has to support indirect indexing, because we can't
+		 * lower that to TEMPs for INTERP instructions.
+		 */
+		return 1;
+
+	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+		return sscreen->llvm_has_working_vgpr_indexing ||
+		       /* TCS stores outputs directly to memory. */
+		       shader == PIPE_SHADER_TESS_CTRL;
+
+	/* Boolean features that are not supported. */
+	case PIPE_SHADER_CAP_SUBROUTINES:
+	case PIPE_SHADER_CAP_SUPPORTED_IRS:
+	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+		return 0;
+	}
+	return 0;
+}
+
+/* NIR compiler options handed out by si_get_compiler_options(). */
+static const struct nir_shader_compiler_options nir_options = {
+	.native_integers = true,
+	.max_unroll_iterations = 32,
+	.lower_scmp = true,
+	.lower_flrp32 = true, .lower_flrp64 = true,
+	.lower_fpow = true,
+	.lower_fsat = true,
+	.lower_fdiv = true,
+	.lower_sub = true,
+	.lower_ffma = true,
+	.lower_extract_byte = true,
+	.lower_extract_word = true,
+	.lower_pack_snorm_2x16 = true,
+	.lower_pack_snorm_4x8 = true,
+	.lower_pack_unorm_2x16 = true,
+	.lower_pack_unorm_4x8 = true,
+	.lower_unpack_snorm_2x16 = true,
+	.lower_unpack_snorm_4x8 = true,
+	.lower_unpack_unorm_2x16 = true,
+	.lower_unpack_unorm_4x8 = true,
+};
+
+/* Only NIR has compiler options here; @shader does not affect them. */
+static const void *si_get_compiler_options(struct pipe_screen *screen,
+					   enum pipe_shader_ir ir,
+					   enum pipe_shader_type shader)
+{
+	assert(ir == PIPE_SHADER_IR_NIR);
+	return &nir_options;
+}
+
+static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
+{
+	ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); /* pscreen is unused */
+}
+
+static void si_get_device_uuid(struct pipe_screen *pscreen, char *uuid)
+{
+	struct si_screen *screen = (struct si_screen *)pscreen;
+	/* The UUID is computed from the screen's device info. */
+	ac_compute_device_uuid(&screen->info, uuid, PIPE_UUID_SIZE);
+}
+
+static const char* si_get_name(struct pipe_screen *pscreen)
+{
+	struct si_screen *screen = (struct si_screen*)pscreen;
+	/* Hand out the renderer string kept in the screen. */
+	return screen->renderer_string;
+}
+
+/* Video capability query variant that relies on the generic vl_* helpers. */
+static int si_get_video_param_no_decode(struct pipe_screen *screen,
+					enum pipe_video_profile profile,
+					enum pipe_video_entrypoint entrypoint,
+					enum pipe_video_cap param)
+{
+	switch (param) {
+	case PIPE_VIDEO_CAP_SUPPORTED:
+		return vl_profile_supported(screen, profile, entrypoint);
+	case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+		return 1;
+	case PIPE_VIDEO_CAP_MAX_WIDTH:
+	case PIPE_VIDEO_CAP_MAX_HEIGHT:
+		return vl_video_buffer_max_size(screen);
+	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+		return PIPE_FORMAT_NV12;
+	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+		return false;
+	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+		return true;
+	case PIPE_VIDEO_CAP_MAX_LEVEL:
+		return vl_level_supported(screen, profile);
+	default:
+		return 0;
+	}
+}
+
+/* Video capability query for the hardware encode and decode paths.
+ * Encode caps are answered first; every other entrypoint is treated as
+ * decode. Unknown caps report 0.
+ */
+static int si_get_video_param(struct pipe_screen *screen,
+			      enum pipe_video_profile profile,
+			      enum pipe_video_entrypoint entrypoint,
+			      enum pipe_video_cap param)
+{
+	struct si_screen *sscreen = (struct si_screen *)screen;
+	enum pipe_video_format codec = u_reduce_video_profile(profile);
+
+	/* Encoder caps (VCE firmware or UVD encode). */
+	if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+		switch (param) {
+		case PIPE_VIDEO_CAP_SUPPORTED:
+			return (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+				(si_vce_is_fw_version_supported(sscreen) ||
+				 sscreen->info.family == CHIP_RAVEN ||
+				 sscreen->info.family == CHIP_RAVEN2)) ||
+				(profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
+				(sscreen->info.family == CHIP_RAVEN ||
+				 sscreen->info.family == CHIP_RAVEN2 ||
+				 si_radeon_uvd_enc_supported(sscreen)));
+		case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+			return 1;
+		case PIPE_VIDEO_CAP_MAX_WIDTH:
+			return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+		case PIPE_VIDEO_CAP_MAX_HEIGHT:
+			return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
+		case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+			return PIPE_FORMAT_NV12;
+		case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+		case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+			return false;
+		case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+			return true;
+		case PIPE_VIDEO_CAP_STACKED_FRAMES:
+			return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
+		default:
+			return 0;
+		}
+	}
+
+	/* Decoder caps. */
+	switch (param) {
+	case PIPE_VIDEO_CAP_SUPPORTED:
+		switch (codec) {
+		case PIPE_VIDEO_FORMAT_MPEG12:
+			return profile != PIPE_VIDEO_PROFILE_MPEG1;
+		case PIPE_VIDEO_FORMAT_MPEG4:
+			return 1;
+		case PIPE_VIDEO_FORMAT_MPEG4_AVC:
+			if ((sscreen->info.family == CHIP_POLARIS10 ||
+			     sscreen->info.family == CHIP_POLARIS11) &&
+			    sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) {
+				RVID_ERR("POLARIS10/11 firmware version need to be updated.\n");
+				return false;
+			}
+			return true;
+		case PIPE_VIDEO_FORMAT_VC1:
+			return true;
+		case PIPE_VIDEO_FORMAT_HEVC:
+			/* Carrizo only supports HEVC Main */
+			if (sscreen->info.family >= CHIP_STONEY)
+				return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
+					profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
+			if (sscreen->info.family >= CHIP_CARRIZO)
+				return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
+			return false;
+		case PIPE_VIDEO_FORMAT_JPEG:
+			if (sscreen->info.family == CHIP_RAVEN ||
+			    sscreen->info.family == CHIP_RAVEN2)
+				return true;
+			if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
+				return false;
+			if (!(sscreen->info.drm_major == 3 && sscreen->info.drm_minor >= 19)) {
+				RVID_ERR("No MJPEG support for the kernel version\n");
+				return false;
+			}
+			return true;
+		case PIPE_VIDEO_FORMAT_VP9:
+			return sscreen->info.family >= CHIP_RAVEN;
+		default:
+			return false;
+		}
+	case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+		return 1;
+	case PIPE_VIDEO_CAP_MAX_WIDTH:
+		return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+	case PIPE_VIDEO_CAP_MAX_HEIGHT:
+		return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
+	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+		if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 ||
+		    profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+			return PIPE_FORMAT_P016;
+		return PIPE_FORMAT_NV12;
+	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+		switch (codec) {
+		case PIPE_VIDEO_FORMAT_HEVC:
+			/* The firmware doesn't support interlaced HEVC. */
+		case PIPE_VIDEO_FORMAT_JPEG:
+		case PIPE_VIDEO_FORMAT_VP9:
+			return false;
+		default:
+			return true;
+		}
+	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+		return true;
+	case PIPE_VIDEO_CAP_MAX_LEVEL:
+		switch (profile) {
+		case PIPE_VIDEO_PROFILE_MPEG1:
+			return 0;
+		case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+		case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+			return 3;
+		case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+			return 3;
+		case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+			return 5;
+		case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+			return 1;
+		case PIPE_VIDEO_PROFILE_VC1_MAIN:
+			return 2;
+		case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+			return 4;
+		case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+		case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+		case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+			return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
+		case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+		case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+			return 186;
+		default:
+			return 0;
+		}
+	default:
+		return 0;
+	}
+}
+
+static boolean si_vid_is_format_supported(struct pipe_screen *screen,
+					  enum pipe_format format,
+					  enum pipe_video_profile profile,
+					  enum pipe_video_entrypoint entrypoint)
+{
+	switch (profile) {
+	case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+		/* HEVC 10 bit decoding should use P016 instead of NV12 if possible */
+		return format == PIPE_FORMAT_NV12 || format == PIPE_FORMAT_P016;
+	case PIPE_VIDEO_PROFILE_UNKNOWN:
+		return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
+	default:
+		/* we can only handle this one with UVD */
+		return format == PIPE_FORMAT_NV12;
+	}
+}
+
+static unsigned get_max_threads_per_block(struct si_screen *screen,
+					  enum pipe_shader_ir ir_type)
+{
+	if (ir_type == PIPE_SHADER_IR_NATIVE)
+		return 256;
+
+	/* Only 16 waves per thread-group on gfx9 (16 * 64 threads = 1024). */
+	if (screen->info.chip_class >= GFX9)
+		return 1024;
+
+	/* Up to 40 waves (40 * 64 = 2560 threads) per thread-group on
+	 * GCN < gfx9. Expose a nice round number instead.
+	 */
+	return 2048;
+}
+
+static int si_get_compute_param(struct pipe_screen *screen,
+				enum pipe_shader_ir ir_type,
+				enum pipe_compute_cap param,
+				void *ret)
+{
+	struct si_screen *sscreen = (struct si_screen *)screen;
+	/* Each cap: write the value to ret when non-NULL, return its size in bytes. */
+	//TODO: select these params by asic
+	switch (param) {
+	case PIPE_COMPUTE_CAP_IR_TARGET: {
+		const char *gpu, *triple;
+
+		triple = "amdgcn-mesa-mesa3d";
+		gpu = ac_get_llvm_processor_name(sscreen->info.family);
+		if (ret) {
+			sprintf(ret, "%s-%s", gpu, triple);
+		}
+		/* +2 for dash and terminating NUL byte */
+		return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
+	}
+	case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+		if (ret) {
+			uint64_t *grid_dimension = ret;
+			grid_dimension[0] = 3;
+		}
+		return 1 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+		if (ret) {
+			uint64_t *grid_size = ret;
+			grid_size[0] = 65535;
+			grid_size[1] = 65535;
+			grid_size[2] = 65535;
+		}
+		return 3 * sizeof(uint64_t) ;
+
+	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+		if (ret) {
+			uint64_t *block_size = ret;
+			unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+			block_size[0] = threads_per_block;
+			block_size[1] = threads_per_block;
+			block_size[2] = threads_per_block;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+		if (ret) {
+			uint64_t *max_threads_per_block = ret;
+			*max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+		}
+		return sizeof(uint64_t);
+	case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+		if (ret) {
+			uint32_t *address_bits = ret;
+			address_bits[0] = 64;
+		}
+		return 1 * sizeof(uint32_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+		if (ret) {
+			uint64_t *max_global_size = ret;
+			uint64_t max_mem_alloc_size;
+
+			si_get_compute_param(screen, ir_type,
+				PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+				&max_mem_alloc_size);
+
+			/* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
+			 * 1/4 of the MAX_GLOBAL_SIZE.  Since the
+			 * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
+			 * make sure we never report more than
+			 * 4 * MAX_MEM_ALLOC_SIZE.
+			 */
+			*max_global_size = MIN2(4 * max_mem_alloc_size,
+						MAX2(sscreen->info.gart_size,
+						     sscreen->info.vram_size));
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+		if (ret) {
+			uint64_t *max_local_size = ret;
+			/* Value reported by the closed source driver. */
+			*max_local_size = 32768;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+		if (ret) {
+			uint64_t *max_input_size = ret;
+			/* Value reported by the closed source driver. */
+			*max_input_size = 1024;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+		if (ret) {
+			uint64_t *max_mem_alloc_size = ret;
+
+			*max_mem_alloc_size = sscreen->info.max_alloc_size;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+		if (ret) {
+			uint32_t *max_clock_frequency = ret;
+			*max_clock_frequency = sscreen->info.max_shader_clock;
+		}
+		return sizeof(uint32_t);
+
+	case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+		if (ret) {
+			uint32_t *max_compute_units = ret;
+			*max_compute_units = sscreen->info.num_good_compute_units;
+		}
+		return sizeof(uint32_t);
+
+	case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+		if (ret) {
+			uint32_t *images_supported = ret;
+			*images_supported = 0;
+		}
+		return sizeof(uint32_t);
+	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+		break; /* unused; ends up in the "unknown PIPE_COMPUTE_CAP" report below */
+	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+		if (ret) {
+			uint32_t *subgroup_size = ret;
+			*subgroup_size = 64;
+		}
+		return sizeof(uint32_t);
+	case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+		if (ret) {
+			uint64_t *max_variable_threads_per_block = ret;
+			if (ir_type == PIPE_SHADER_IR_NATIVE)
+				*max_variable_threads_per_block = 0;
+			else
+				*max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+		}
+		return sizeof(uint64_t);
+	}
+
+        fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+        return 0;
+}
+
+static uint64_t si_get_timestamp(struct pipe_screen *screen)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	/* Raw RADEON_TIMESTAMP value * 1000000 / clock_crystal_freq. */
+	return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
+			sscreen->info.clock_crystal_freq;
+}
+
+static void si_query_memory_info(struct pipe_screen *screen,
+				 struct pipe_memory_info *info)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct radeon_winsys *ws = sscreen->ws;
+	unsigned vram_usage, gtt_usage;
+	/* Totals are the screen's VRAM and GART sizes divided by 1024. */
+	info->total_device_memory = sscreen->info.vram_size / 1024;
+	info->total_staging_memory = sscreen->info.gart_size / 1024;
+
+	/* The real TTM memory usage is somewhat random, because:
+	 *
+	 * 1) TTM delays freeing memory, because it can only free it after
+	 *    fences expire.
+	 *
+	 * 2) The memory usage can be really low if big VRAM evictions are
+	 *    taking place, but the real usage is well above the size of VRAM.
+	 *
+	 * Instead, return statistics of this process.
+	 */
+	vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
+	gtt_usage =  ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
+
+	info->avail_device_memory =
+		vram_usage <= info->total_device_memory ?
+				info->total_device_memory - vram_usage : 0;
+	info->avail_staging_memory =
+		gtt_usage <= info->total_staging_memory ?
+				info->total_staging_memory - gtt_usage : 0;
+
+	info->device_memory_evicted =
+		ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+
+	if (sscreen->info.drm_major == 3 && sscreen->info.drm_minor >= 4)
+		info->nr_device_memory_evictions =
+			ws->query_value(ws, RADEON_NUM_EVICTIONS);
+	else
+		/* Just return the number of evicted 64KB pages. */
+		info->nr_device_memory_evictions = info->device_memory_evicted / 64;
+}
+
+static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+	struct si_screen *sscreen = (struct si_screen*)pscreen;
+	/* Hand out the cache pointer stored in the screen. */
+	return sscreen->disk_shader_cache;
+}
+
+static void si_init_renderer_string(struct si_screen *sscreen)
+{
+	struct radeon_winsys *ws = sscreen->ws;
+	char first_name[256], second_name[32] = {}, kernel_version[128] = {};
+	struct utsname uname_data;
+	/* Result: "<name> ([<chip>, ]DRM x.y.z[, <kernel release>], LLVM x.y.z)". */
+	const char *marketing_name = si_get_marketing_name(ws);
+
+	if (marketing_name) {
+		snprintf(first_name, sizeof(first_name), "%s", marketing_name);
+		snprintf(second_name, sizeof(second_name), "%s, ",
+			 sscreen->info.name);
+	} else {
+		snprintf(first_name, sizeof(first_name), "AMD %s",
+			 sscreen->info.name);
+	}
+	/* The kernel release is appended only if uname() succeeds. */
+	if (uname(&uname_data) == 0)
+		snprintf(kernel_version, sizeof(kernel_version),
+			 ", %s", uname_data.release);
+
+	snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
+		 "%s (%sDRM %i.%i.%i%s, LLVM %i.%i.%i)",
+		 first_name, second_name, sscreen->info.drm_major,
+		 sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
+		 kernel_version,
+		 (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff,
+		 MESA_LLVM_VERSION_PATCH);
+}
+
+void si_init_screen_get_functions(struct si_screen *sscreen)
+{
+	sscreen->b.get_name = si_get_name;
+	sscreen->b.get_vendor = si_get_vendor;
+	sscreen->b.get_device_vendor = si_get_device_vendor;
+	sscreen->b.get_param = si_get_param;
+	sscreen->b.get_paramf = si_get_paramf;
+	sscreen->b.get_compute_param = si_get_compute_param;
+	sscreen->b.get_timestamp = si_get_timestamp;
+	sscreen->b.get_shader_param = si_get_shader_param;
+	sscreen->b.get_compiler_options = si_get_compiler_options;
+	sscreen->b.get_device_uuid = si_get_device_uuid;
+	sscreen->b.get_driver_uuid = si_get_driver_uuid;
+	sscreen->b.query_memory_info = si_query_memory_info;
+	sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
+	/* The video hooks depend on whether the device has hardware decode. */
+	if (sscreen->info.has_hw_decode) {
+		sscreen->b.get_video_param = si_get_video_param;
+		sscreen->b.is_video_format_supported = si_vid_is_format_supported;
+	} else {
+		sscreen->b.get_video_param = si_get_video_param_no_decode;
+		sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
+	}
+
+	si_init_renderer_string(sscreen);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c b/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c
new file mode 100644
index 000000000..f178d0445
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+
+#include "util/os_time.h"
+
+/* Flush the GFX IB if the memory or command-space limits would be exceeded. */
+void si_need_gfx_cs_space(struct si_context *ctx)
+{
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+	/* There is no need to flush the DMA IB here, because
+	 * r600_need_dma_space always flushes the GFX IB if there is
+	 * a conflict, which means any unflushed DMA commands automatically
+	 * precede the GFX IB (= they had no dependency on the GFX IB when
+	 * they were submitted).
+	 */
+
+	/* There are two memory usage counters in the winsys for all buffers
+	 * that have been added (cs_add_buffer) and two counters in the pipe
+	 * driver for those that haven't been added yet.
+	 */
+	if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
+						   ctx->vram, ctx->gtt))) {
+		ctx->gtt = 0;
+		ctx->vram = 0;
+		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+		return;
+	}
+	ctx->gtt = 0;
+	ctx->vram = 0;
+
+	/* If the IB is sufficiently large, don't count the space needed
+	 * and just flush if there is not enough space left.
+	 *
+	 * Also reserve space for stopping queries at the end of IB, because
+	 * the number of active queries is mostly unlimited.
+	 */
+	unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
+	if (!ctx->ws->cs_check_space(cs, need_dwords))
+		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+}
+
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
+		     struct pipe_fence_handle **fence)
+{
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
+	struct radeon_winsys *ws = ctx->ws;
+	unsigned wait_flags = 0;
+	/* A flush requested while another one is in progress is ignored. */
+	if (ctx->gfx_flush_in_progress)
+		return;
+
+	if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
+		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			      SI_CONTEXT_CS_PARTIAL_FLUSH |
+			      SI_CONTEXT_INV_GLOBAL_L2;
+	} else if (ctx->chip_class == SI) {
+		/* The kernel flushes L2 before shaders are finished. */
+		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			      SI_CONTEXT_CS_PARTIAL_FLUSH;
+	} else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			      SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
+
+	/* Drop this flush if it's a no-op. */
+	if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
+	    (!wait_flags || !ctx->gfx_last_ib_is_busy))
+		return;
+
+	if (si_check_device_reset(ctx))
+		return;
+
+	if (ctx->screen->debug_flags & DBG(CHECK_VM))
+		flags &= ~PIPE_FLUSH_ASYNC;
+
+	/* If the state tracker is flushing the GFX IB, si_flush_from_st is
+	 * responsible for flushing the DMA IB and merging the fences from both.
+	 * This code is only needed when the driver flushes the GFX IB
+	 * internally, and it never asks for a fence handle.
+	 */
+	if (radeon_emitted(ctx->dma_cs, 0)) {
+		assert(fence == NULL); /* internal flushes only */
+		si_flush_dma_cs(ctx, flags, NULL);
+	}
+
+	ctx->gfx_flush_in_progress = true;
+
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		si_suspend_queries(ctx);
+
+	ctx->streamout.suspended = false;
+	if (ctx->streamout.begin_emitted) {
+		si_emit_streamout_end(ctx);
+		ctx->streamout.suspended = true;
+	}
+
+	/* Make sure CP DMA is idle at the end of IBs after L2 prefetches
+	 * because the kernel doesn't wait for it. */
+	if (ctx->chip_class >= CIK)
+		si_cp_dma_wait_for_idle(ctx);
+
+	/* Wait for draw calls to finish if needed. */
+	if (wait_flags) {
+		ctx->flags |= wait_flags;
+		si_emit_cache_flush(ctx);
+	}
+	ctx->gfx_last_ib_is_busy = wait_flags == 0;
+
+	if (ctx->current_saved_cs) {
+		si_trace_emit(ctx);
+
+		/* Save the IB for debug contexts. */
+		si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
+		ctx->current_saved_cs->flushed = true;
+		ctx->current_saved_cs->time_flush = os_time_get_nano();
+
+		si_log_hw_flush(ctx);
+	}
+
+	/* Flush the CS. */
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
+	if (fence)
+		ws->fence_reference(fence, ctx->last_gfx_fence);
+
+	ctx->num_gfx_cs_flushes++;
+
+	/* Check VM faults if needed. */
+	if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
+		/* Use conservative timeout 800ms, after which we won't wait any
+		 * longer and assume the GPU is hung.
+		 */
+		ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000);
+		/* NOTE(review): no NULL check on current_saved_cs here, unlike above/below - confirm. */
+		si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
+	}
+
+	if (ctx->current_saved_cs)
+		si_saved_cs_reference(&ctx->current_saved_cs, NULL);
+
+	si_begin_new_gfx_cs(ctx);
+	ctx->gfx_flush_in_progress = false;
+}
+
+static void si_begin_gfx_cs_debug(struct si_context *ctx)
+{
+	static const uint32_t zeros[1];
+	assert(!ctx->current_saved_cs);
+	/* Start a new saved-CS record; if an allocation fails it is left NULL. */
+	ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
+	if (!ctx->current_saved_cs)
+		return;
+
+	pipe_reference_init(&ctx->current_saved_cs->reference, 1);
+
+	ctx->current_saved_cs->trace_buf = r600_resource(
+		pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
+	if (!ctx->current_saved_cs->trace_buf) {
+		free(ctx->current_saved_cs);
+		ctx->current_saved_cs = NULL;
+		return;
+	}
+	/* Clear the first dword of the trace buffer and restart trace IDs at 0. */
+	pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b,
+				    0, sizeof(zeros), zeros);
+	ctx->current_saved_cs->trace_id = 0;
+
+	si_trace_emit(ctx);
+	/* Add the trace buffer to the GFX CS buffer list (read/write usage). */
+	radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
+			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+}
+
+void si_begin_new_gfx_cs(struct si_context *ctx)
+{
+	if (ctx->is_debug)
+		si_begin_gfx_cs_debug(ctx);
+
+	/* Always invalidate caches at the beginning of IBs, because external
+	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+	 * buffers.
+	 *
+	 * Note that the cache flush done by the kernel at the end of GFX IBs
+	 * isn't useful here, because that flush can finish after the following
+	 * IB starts drawing.
+	 *
+	 * TODO: Do we also need to invalidate CB & DB caches?
+	 */
+	ctx->flags |= SI_CONTEXT_INV_ICACHE |
+		      SI_CONTEXT_INV_SMEM_L1 |
+		      SI_CONTEXT_INV_VMEM_L1 |
+		      SI_CONTEXT_INV_GLOBAL_L2 |
+		      SI_CONTEXT_START_PIPELINE_STATS;
+
+	/* Mark all valid groups as dirty so they get re-emitted on the
+	 * next draw command.
+	 */
+	si_pm4_reset_emitted(ctx);
+
+	/* The CS initialization should be emitted before everything else. */
+	si_pm4_emit(ctx, ctx->init_config);
+	if (ctx->init_config_gs_rings)
+		si_pm4_emit(ctx, ctx->init_config_gs_rings);
+
+	if (ctx->queued.named.ls)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+	if (ctx->queued.named.hs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+	if (ctx->queued.named.es)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+	if (ctx->queued.named.gs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+	if (ctx->queued.named.vs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+	if (ctx->queued.named.ps)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+	if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+
+	/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
+	bool has_clear_state = ctx->screen->has_clear_state;
+	if (has_clear_state) {
+		ctx->framebuffer.dirty_cbufs =
+			 u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
+		/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
+		ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
+	} else {
+		ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
+		ctx->framebuffer.dirty_zsbuf = true;
+	}
+	/* This should always be marked as dirty to set the framebuffer scissor
+	 * at least. */
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
+
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
+	/* CLEAR_STATE sets zeros. */
+	if (!has_clear_state || ctx->clip_state.any_nonzeros)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
+	ctx->sample_locs_num_samples = 0;
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
+	/* CLEAR_STATE sets 0xffff. */
+	if (!has_clear_state || ctx->sample_mask != 0xffff)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
+	/* CLEAR_STATE sets zeros. */
+	if (!has_clear_state || ctx->blend_color.any_nonzeros)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
+	if (ctx->chip_class >= GFX9)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
+	/* CLEAR_STATE disables all window rectangles. */
+	if (!has_clear_state || ctx->num_window_rectangles > 0)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
+	si_all_descriptors_begin_new_cs(ctx);
+	si_all_resident_buffers_begin_new_cs(ctx);
+
+	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+	ctx->viewports.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+	ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
+	if (ctx->scratch_buffer) {
+		si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
+	}
+
+	if (ctx->streamout.suspended) {
+		ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
+		si_streamout_buffers_dirty(ctx);
+	}
+
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		si_resume_queries(ctx);
+
+	assert(!ctx->gfx_cs->prev_dw);
+	ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+
+	/* Invalidate various draw states so that they are emitted before
+	 * the first draw call. */
+	si_invalidate_draw_sh_constants(ctx);
+	ctx->last_index_size = -1;
+	ctx->last_primitive_restart_en = -1;
+	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
+	ctx->last_prim = -1;
+	ctx->last_multi_vgt_param = -1;
+	ctx->last_rast_prim = -1;
+	ctx->last_sc_line_stipple = ~0;
+	ctx->last_vs_state = ~0;
+	ctx->last_ls = NULL;
+	ctx->last_tcs = NULL;
+	ctx->last_tes_sh_base = -1;
+	ctx->last_num_tcs_input_cp = -1;
+	ctx->last_ls_hs_config = -1; /* impossible value */
+
+	ctx->cs_shader_state.initialized = false;
+
+	if (has_clear_state) {
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+		ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL]	= 0x00001000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL]	= 0x00090000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE]	= 0xffff;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE]    = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL]  = 0x00000002;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK]  = 0xffffffff;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL]  = 0x0000001e; /* From VI */
+
+		/* Set all saved registers state to saved. */
+		ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
+	} else {
+		/* Set all saved registers state to unknown. */
+		ctx->tracked_regs.reg_saved = 0;
+	}
+
+	/* 0xffffffff is an impossible value for register SPI_PS_INPUT_CNTL_n */
+	memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c b/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c
new file mode 100644
index 000000000..8c457b30e
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* The GPU load is measured as follows.
+ *
+ * There is a thread which samples the GRBM_STATUS register at a certain
+ * frequency and the "busy" or "idle" counter is incremented based on
+ * whether the GUI_ACTIVE bit is set or not.
+ *
+ * Then, the user can sample the counters twice and calculate the average
+ * GPU load between the two samples.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "radeonsi/si_query.h"
+#include "util/os_time.h"
+
+/* For good accuracy at 1000 fps or lower. This will be inaccurate for higher
+ * fps (there are too few samples per frame). */
+#define SAMPLES_PER_SEC 10000
+
+#define GRBM_STATUS		0x8010
+#define TA_BUSY(x)		(((x) >> 14) & 0x1)
+#define GDS_BUSY(x)		(((x) >> 15) & 0x1)
+#define VGT_BUSY(x)		(((x) >> 17) & 0x1)
+#define IA_BUSY(x)		(((x) >> 19) & 0x1)
+#define SX_BUSY(x)		(((x) >> 20) & 0x1)
+#define WD_BUSY(x)		(((x) >> 21) & 0x1)
+#define SPI_BUSY(x)		(((x) >> 22) & 0x1)
+#define BCI_BUSY(x)		(((x) >> 23) & 0x1)
+#define SC_BUSY(x)		(((x) >> 24) & 0x1)
+#define PA_BUSY(x)		(((x) >> 25) & 0x1)
+#define DB_BUSY(x)		(((x) >> 26) & 0x1)
+#define CP_BUSY(x)		(((x) >> 29) & 0x1)
+#define CB_BUSY(x)		(((x) >> 30) & 0x1)
+#define GUI_ACTIVE(x)		(((x) >> 31) & 0x1)
+
+#define SRBM_STATUS2		0x0e4c
+#define SDMA_BUSY(x)		(((x) >> 5) & 0x1)
+
+#define CP_STAT                 0x8680
+#define PFP_BUSY(x)		(((x) >> 15) & 0x1)
+#define MEQ_BUSY(x)		(((x) >> 16) & 0x1)
+#define ME_BUSY(x)		(((x) >> 17) & 0x1)
+#define SURFACE_SYNC_BUSY(x)	(((x) >> 21) & 0x1)
+#define DMA_BUSY(x)		(((x) >> 22) & 0x1)
+#define SCRATCH_RAM_BUSY(x)	(((x) >> 24) & 0x1)
+
+#define IDENTITY(x) x
+/* Bump named.<field>.busy if mask(value) is nonzero, else .idle; needs `value` and `counters` in scope. */
+#define UPDATE_COUNTER(field, mask)					\
+	do {								\
+		if (mask(value))					\
+			p_atomic_inc(&counters->named.field.busy);	\
+		else							\
+			p_atomic_inc(&counters->named.field.idle);	\
+	} while (0)
+
+static void si_update_mmio_counters(struct si_screen *sscreen,
+				    union si_mmio_counters *counters)
+{
+	uint32_t value = 0;
+	bool gui_busy, sdma_busy = false;
+	/* UPDATE_COUNTER() tests `value` and bumps busy or idle in `counters`. */
+	/* GRBM_STATUS */
+	sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
+
+	UPDATE_COUNTER(ta, TA_BUSY);
+	UPDATE_COUNTER(gds, GDS_BUSY);
+	UPDATE_COUNTER(vgt, VGT_BUSY);
+	UPDATE_COUNTER(ia, IA_BUSY);
+	UPDATE_COUNTER(sx, SX_BUSY);
+	UPDATE_COUNTER(wd, WD_BUSY);
+	UPDATE_COUNTER(spi, SPI_BUSY);
+	UPDATE_COUNTER(bci, BCI_BUSY);
+	UPDATE_COUNTER(sc, SC_BUSY);
+	UPDATE_COUNTER(pa, PA_BUSY);
+	UPDATE_COUNTER(db, DB_BUSY);
+	UPDATE_COUNTER(cp, CP_BUSY);
+	UPDATE_COUNTER(cb, CB_BUSY);
+	UPDATE_COUNTER(gui, GUI_ACTIVE);
+	gui_busy = GUI_ACTIVE(value);
+
+	if (sscreen->info.chip_class == CIK || sscreen->info.chip_class == VI) {
+		/* SRBM_STATUS2 */
+		sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
+
+		UPDATE_COUNTER(sdma, SDMA_BUSY);
+		sdma_busy = SDMA_BUSY(value);
+	}
+
+	if (sscreen->info.chip_class >= VI) {
+		/* CP_STAT */
+		sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
+
+		UPDATE_COUNTER(pfp, PFP_BUSY);
+		UPDATE_COUNTER(meq, MEQ_BUSY);
+		UPDATE_COUNTER(me, ME_BUSY);
+		UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
+		UPDATE_COUNTER(cp_dma, DMA_BUSY);
+		UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
+	}
+	/* "gpu" counts as busy when either GUI_ACTIVE or SDMA_BUSY was set. */
+	value = gui_busy || sdma_busy;
+	UPDATE_COUNTER(gpu, IDENTITY);
+}
+
+#undef UPDATE_COUNTER
+
+static int
+si_gpu_load_thread(void *param)
+{
+	struct si_screen *sscreen = (struct si_screen*)param;
+	const int period_us = 1000000 / SAMPLES_PER_SEC;
+	int sleep_us = period_us;
+	int64_t cur_time, last_time = os_time_get();
+	/* Keep sampling the MMIO counters until gpu_load_stop_thread is set. */
+	while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
+		if (sleep_us)
+			os_time_sleep(sleep_us);
+
+		/* Make sure we sleep the ideal amount of time to match
+		 * the expected frequency. */
+		cur_time = os_time_get();
+
+		if (os_time_timeout(last_time, last_time + period_us,
+				    cur_time))
+			sleep_us = MAX2(sleep_us - 1, 1);
+		else
+			sleep_us += 1;
+
+		/*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
+		last_time = cur_time;
+
+		/* Update the counters. */
+		si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
+	}
+	p_atomic_dec(&sscreen->gpu_load_stop_thread); /* lower the stop flag again */
+	return 0;
+}
+
+void si_gpu_load_kill_thread(struct si_screen *sscreen)
+{
+	if (!sscreen->gpu_load_thread)
+		return;
+	/* Raise the stop flag, then wait for the sampling thread to exit. */
+	p_atomic_inc(&sscreen->gpu_load_stop_thread);
+	thrd_join(sscreen->gpu_load_thread, NULL);
+	sscreen->gpu_load_thread = 0;
+}
+
+static uint64_t si_read_mmio_counter(struct si_screen *sscreen,
+				     unsigned busy_index)
+{
+	/* Start the thread if needed. */
+	if (!sscreen->gpu_load_thread) {
+		mtx_lock(&sscreen->gpu_load_mutex);
+		/* Check again inside the mutex. */
+		if (!sscreen->gpu_load_thread)
+			sscreen->gpu_load_thread =
+				u_thread_create(si_gpu_load_thread, sscreen);
+		mtx_unlock(&sscreen->gpu_load_mutex);
+	}
+	/* The idle counter is the array element right after the busy one. */
+	unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
+	unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
+
+	return busy | ((uint64_t)idle << 32); /* busy in bits 0-31, idle in bits 32-63 */
+}
+
+static unsigned si_end_mmio_counter(struct si_screen *sscreen,
+				    uint64_t begin, unsigned busy_index)
+{
+	uint64_t end = si_read_mmio_counter(sscreen, busy_index);
+	unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
+	unsigned idle = (end >> 32) - (begin >> 32);
+
+	/* Calculate the % of time the busy counter was being incremented.
+	 * The math is done in 64 bits so that busy*100 and busy+idle can't
+	 * wrap around for long measurement intervals.
+	 *
+	 * If no counters were incremented (the load is queried faster than
+	 * the counters are updated), return the current counter status.
+	 */
+	if (idle || busy) {
+		return (uint64_t)busy * 100 / ((uint64_t)busy + idle);
+	} else {
+		union si_mmio_counters counters;
+		/* Take one sample into a zeroed scratch copy and report 0 or 100. */
+		memset(&counters, 0, sizeof(counters));
+		si_update_mmio_counters(sscreen, &counters);
+		return counters.array[busy_index] ? 100 : 0;
+	}
+}
+
+#define BUSY_INDEX(rscreen, field) (&rscreen->mmio_counters.named.field.busy - \
+				    rscreen->mmio_counters.array)
+/* Map an SI_QUERY_GPU_* type to the index of its busy counter in mmio_counters.array. */
+static unsigned busy_index_from_type(struct si_screen *sscreen,
+				     unsigned type)
+{
+	switch (type) {
+	case SI_QUERY_GPU_LOAD:
+		return BUSY_INDEX(sscreen, gpu);
+	case SI_QUERY_GPU_SHADERS_BUSY:
+		return BUSY_INDEX(sscreen, spi);
+	case SI_QUERY_GPU_TA_BUSY:
+		return BUSY_INDEX(sscreen, ta);
+	case SI_QUERY_GPU_GDS_BUSY:
+		return BUSY_INDEX(sscreen, gds);
+	case SI_QUERY_GPU_VGT_BUSY:
+		return BUSY_INDEX(sscreen, vgt);
+	case SI_QUERY_GPU_IA_BUSY:
+		return BUSY_INDEX(sscreen, ia);
+	case SI_QUERY_GPU_SX_BUSY:
+		return BUSY_INDEX(sscreen, sx);
+	case SI_QUERY_GPU_WD_BUSY:
+		return BUSY_INDEX(sscreen, wd);
+	case SI_QUERY_GPU_BCI_BUSY:
+		return BUSY_INDEX(sscreen, bci);
+	case SI_QUERY_GPU_SC_BUSY:
+		return BUSY_INDEX(sscreen, sc);
+	case SI_QUERY_GPU_PA_BUSY:
+		return BUSY_INDEX(sscreen, pa);
+	case SI_QUERY_GPU_DB_BUSY:
+		return BUSY_INDEX(sscreen, db);
+	case SI_QUERY_GPU_CP_BUSY:
+		return BUSY_INDEX(sscreen, cp);
+	case SI_QUERY_GPU_CB_BUSY:
+		return BUSY_INDEX(sscreen, cb);
+	case SI_QUERY_GPU_SDMA_BUSY:
+		return BUSY_INDEX(sscreen, sdma);
+	case SI_QUERY_GPU_PFP_BUSY:
+		return BUSY_INDEX(sscreen, pfp);
+	case SI_QUERY_GPU_MEQ_BUSY:
+		return BUSY_INDEX(sscreen, meq);
+	case SI_QUERY_GPU_ME_BUSY:
+		return BUSY_INDEX(sscreen, me);
+	case SI_QUERY_GPU_SURF_SYNC_BUSY:
+		return BUSY_INDEX(sscreen, surf_sync);
+	case SI_QUERY_GPU_CP_DMA_BUSY:
+		return BUSY_INDEX(sscreen, cp_dma);
+	case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+		return BUSY_INDEX(sscreen, scratch_ram);
+	default:
+		unreachable("invalid query type");
+	}
+}
+
+uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
+{
+	unsigned busy_index = busy_index_from_type(sscreen, type);
+	return si_read_mmio_counter(sscreen, busy_index); /* packed busy/idle snapshot */
+}
+
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
+			uint64_t begin)
+{
+	unsigned busy_index = busy_index_from_type(sscreen, type);
+	return si_end_mmio_counter(sscreen, begin, busy_index); /* busy percentage since `begin` */
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_query.c b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
new file mode 100644
index 000000000..7a2c7afdb
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
@@ -0,0 +1,1894 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/os_time.h"
+#include "util/u_suballoc.h"
+#include "amd/common/sid.h"
+
+#define SI_MAX_STREAMS 4
+
+struct si_hw_query_params { /* HW query layout parameters; the code using them is outside this chunk */
+	unsigned start_offset; /* presumably offset of the begin sample - TODO confirm */
+	unsigned end_offset; /* presumably offset of the end sample - TODO confirm */
+	unsigned fence_offset; /* presumably offset of the fence dword - TODO confirm */
+	unsigned pair_stride; /* presumably distance between begin/end pairs - TODO confirm */
+	unsigned pair_count; /* presumably number of begin/end pairs - TODO confirm */
+};
+
+/* Queries without buffer handling or suspend/resume. */
+struct si_query_sw {
+	struct si_query b; /* base object; si_query pointers are cast to si_query_sw, so keep it first */
+
+	uint64_t begin_result; /* value sampled by si_query_sw_begin() */
+	uint64_t end_result; /* value sampled by si_query_sw_end() */
+
+	uint64_t begin_time; /* begin-side divisor sample: CPU time, or the gfx IB count for GFX_BO_LIST_SIZE */
+	uint64_t end_time; /* end-side divisor sample, taken the same way in si_query_sw_end() */
+
+	/* Fence for GPU_FINISHED; filled in by the deferred flush in si_query_sw_end(). */
+	struct pipe_fence_handle *fence;
+};
+
+/* Destroy a software query: drop its fence reference, then free the object. */
+static void si_query_sw_destroy(struct si_screen *sscreen, struct si_query *rquery)
+{
+	struct si_query_sw *swq = (struct si_query_sw *)rquery;
+
+	sscreen->b.fence_reference(&sscreen->b, &swq->fence, NULL);
+	FREE(swq);
+}
+
+static enum radeon_value_id winsys_id_from_type(unsigned type)
+{
+	switch (type) { /* one-to-one map from SI_QUERY_* to the winsys value id, grouped by topic */
+	case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
+	case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
+	case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
+	case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
+	case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+	case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
+	case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
+	case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
+	case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
+	case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
+	case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
+	case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
+	case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
+	case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
+	case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
+	case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER;
+	case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
+	case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
+	case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
+	case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+	default: unreachable("query type does not correspond to winsys id");
+	}
+}
+
+/* Flush the DMA CS, block until its fence (if any) signals, then return os_time_get_nano(). */
+static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
+{
+	struct pipe_fence_handle *dma_fence = NULL;
+
+	si_flush_dma_cs(sctx, 0, &dma_fence);
+	if (dma_fence != NULL) {
+		sctx->ws->fence_wait(sctx->ws, dma_fence, PIPE_TIMEOUT_INFINITE);
+		sctx->ws->fence_reference(&dma_fence, NULL);
+	}
+	return os_time_get_nano();
+}
+
+/* Record the "begin" sample of a software query; always returns true. */
+static bool si_query_sw_begin(struct si_context *sctx,
+			      struct si_query *rquery)
+{
+	struct si_query_sw *query = (struct si_query_sw *)rquery;
+	enum radeon_value_id ws_id;
+
+	switch(query->b.type) {
+	case PIPE_QUERY_TIMESTAMP_DISJOINT:
+	case PIPE_QUERY_GPU_FINISHED:
+		break; /* nothing to sample at begin */
+	case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+		query->begin_result = si_finish_dma_get_cpu_time(sctx);
+		break;
+	case SI_QUERY_DRAW_CALLS:
+		query->begin_result = sctx->num_draw_calls;
+		break;
+	case SI_QUERY_DECOMPRESS_CALLS:
+		query->begin_result = sctx->num_decompress_calls;
+		break;
+	case SI_QUERY_MRT_DRAW_CALLS:
+		query->begin_result = sctx->num_mrt_draw_calls;
+		break;
+	case SI_QUERY_PRIM_RESTART_CALLS:
+		query->begin_result = sctx->num_prim_restart_calls;
+		break;
+	case SI_QUERY_SPILL_DRAW_CALLS:
+		query->begin_result = sctx->num_spill_draw_calls;
+		break;
+	case SI_QUERY_COMPUTE_CALLS:
+		query->begin_result = sctx->num_compute_calls;
+		break;
+	case SI_QUERY_SPILL_COMPUTE_CALLS:
+		query->begin_result = sctx->num_spill_compute_calls;
+		break;
+	case SI_QUERY_DMA_CALLS:
+		query->begin_result = sctx->num_dma_calls;
+		break;
+	case SI_QUERY_CP_DMA_CALLS:
+		query->begin_result = sctx->num_cp_dma_calls;
+		break;
+	case SI_QUERY_NUM_VS_FLUSHES:
+		query->begin_result = sctx->num_vs_flushes;
+		break;
+	case SI_QUERY_NUM_PS_FLUSHES:
+		query->begin_result = sctx->num_ps_flushes;
+		break;
+	case SI_QUERY_NUM_CS_FLUSHES:
+		query->begin_result = sctx->num_cs_flushes;
+		break;
+	case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+		query->begin_result = sctx->num_cb_cache_flushes;
+		break;
+	case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+		query->begin_result = sctx->num_db_cache_flushes;
+		break;
+	case SI_QUERY_NUM_L2_INVALIDATES:
+		query->begin_result = sctx->num_L2_invalidates;
+		break;
+	case SI_QUERY_NUM_L2_WRITEBACKS:
+		query->begin_result = sctx->num_L2_writebacks;
+		break;
+	case SI_QUERY_NUM_RESIDENT_HANDLES:
+		query->begin_result = sctx->num_resident_handles;
+		break;
+	case SI_QUERY_TC_OFFLOADED_SLOTS:
+		query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+		break;
+	case SI_QUERY_TC_DIRECT_SLOTS:
+		query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+		break;
+	case SI_QUERY_TC_NUM_SYNCS:
+		query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
+		break;
+	case SI_QUERY_REQUESTED_VRAM:
+	case SI_QUERY_REQUESTED_GTT:
+	case SI_QUERY_MAPPED_VRAM:
+	case SI_QUERY_MAPPED_GTT:
+	case SI_QUERY_VRAM_USAGE:
+	case SI_QUERY_VRAM_VIS_USAGE:
+	case SI_QUERY_GTT_USAGE:
+	case SI_QUERY_GPU_TEMPERATURE:
+	case SI_QUERY_CURRENT_GPU_SCLK:
+	case SI_QUERY_CURRENT_GPU_MCLK:
+	case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+	case SI_QUERY_NUM_MAPPED_BUFFERS:
+		query->begin_result = 0; /* the reported value is then just the end sample */
+		break;
+	case SI_QUERY_BUFFER_WAIT_TIME:
+	case SI_QUERY_GFX_IB_SIZE:
+	case SI_QUERY_NUM_GFX_IBS:
+	case SI_QUERY_NUM_SDMA_IBS:
+	case SI_QUERY_NUM_BYTES_MOVED:
+	case SI_QUERY_NUM_EVICTIONS:
+	case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
+		/* Uses the function-scope ws_id; a second declaration here would shadow it. */
+		ws_id = winsys_id_from_type(query->b.type);
+		query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+		break;
+	case SI_QUERY_GFX_BO_LIST_SIZE:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+		query->begin_time = sctx->ws->query_value(sctx->ws,
+							  RADEON_NUM_GFX_IBS);
+		break;
+	case SI_QUERY_CS_THREAD_BUSY:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+		query->begin_time = os_time_get_nano();
+		break;
+	case SI_QUERY_GALLIUM_THREAD_BUSY:
+		query->begin_result =
+			sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+		query->begin_time = os_time_get_nano();
+		break;
+	case SI_QUERY_GPU_LOAD:
+	case SI_QUERY_GPU_SHADERS_BUSY:
+	case SI_QUERY_GPU_TA_BUSY:
+	case SI_QUERY_GPU_GDS_BUSY:
+	case SI_QUERY_GPU_VGT_BUSY:
+	case SI_QUERY_GPU_IA_BUSY:
+	case SI_QUERY_GPU_SX_BUSY:
+	case SI_QUERY_GPU_WD_BUSY:
+	case SI_QUERY_GPU_BCI_BUSY:
+	case SI_QUERY_GPU_SC_BUSY:
+	case SI_QUERY_GPU_PA_BUSY:
+	case SI_QUERY_GPU_DB_BUSY:
+	case SI_QUERY_GPU_CP_BUSY:
+	case SI_QUERY_GPU_CB_BUSY:
+	case SI_QUERY_GPU_SDMA_BUSY:
+	case SI_QUERY_GPU_PFP_BUSY:
+	case SI_QUERY_GPU_MEQ_BUSY:
+	case SI_QUERY_GPU_ME_BUSY:
+	case SI_QUERY_GPU_SURF_SYNC_BUSY:
+	case SI_QUERY_GPU_CP_DMA_BUSY:
+	case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+		query->begin_result = si_begin_counter(sctx->screen,
+							 query->b.type);
+		break;
+	case SI_QUERY_NUM_COMPILATIONS:
+		query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
+		break;
+	case SI_QUERY_NUM_SHADERS_CREATED:
+		query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
+		break;
+	case SI_QUERY_NUM_SHADER_CACHE_HITS:
+		query->begin_result =
+			p_atomic_read(&sctx->screen->num_shader_cache_hits);
+		break;
+	case SI_QUERY_GPIN_ASIC_ID:
+	case SI_QUERY_GPIN_NUM_SIMD:
+	case SI_QUERY_GPIN_NUM_RB:
+	case SI_QUERY_GPIN_NUM_SPI:
+	case SI_QUERY_GPIN_NUM_SE:
+		break; /* answered directly in si_query_sw_get_result() */
+	default:
+		unreachable("si_query_sw_begin: bad query type");
+	}
+
+	return true;
+}
+
+/* Record the "end" sample of a software query; always returns true. */
+static bool si_query_sw_end(struct si_context *sctx,
+			    struct si_query *rquery)
+{
+	struct si_query_sw *query = (struct si_query_sw *)rquery;
+	enum radeon_value_id ws_id;
+
+	switch(query->b.type) {
+	case PIPE_QUERY_TIMESTAMP_DISJOINT:
+		break;
+	case PIPE_QUERY_GPU_FINISHED:
+		/* The deferred flush stores the fence that get_result later waits on. */
+		sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
+		break;
+	case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+		query->end_result = si_finish_dma_get_cpu_time(sctx);
+		break;
+	case SI_QUERY_DRAW_CALLS:
+		query->end_result = sctx->num_draw_calls;
+		break;
+	case SI_QUERY_DECOMPRESS_CALLS:
+		query->end_result = sctx->num_decompress_calls;
+		break;
+	case SI_QUERY_MRT_DRAW_CALLS:
+		query->end_result = sctx->num_mrt_draw_calls;
+		break;
+	case SI_QUERY_PRIM_RESTART_CALLS:
+		query->end_result = sctx->num_prim_restart_calls;
+		break;
+	case SI_QUERY_SPILL_DRAW_CALLS:
+		query->end_result = sctx->num_spill_draw_calls;
+		break;
+	case SI_QUERY_COMPUTE_CALLS:
+		query->end_result = sctx->num_compute_calls;
+		break;
+	case SI_QUERY_SPILL_COMPUTE_CALLS:
+		query->end_result = sctx->num_spill_compute_calls;
+		break;
+	case SI_QUERY_DMA_CALLS:
+		query->end_result = sctx->num_dma_calls;
+		break;
+	case SI_QUERY_CP_DMA_CALLS:
+		query->end_result = sctx->num_cp_dma_calls;
+		break;
+	case SI_QUERY_NUM_VS_FLUSHES:
+		query->end_result = sctx->num_vs_flushes;
+		break;
+	case SI_QUERY_NUM_PS_FLUSHES:
+		query->end_result = sctx->num_ps_flushes;
+		break;
+	case SI_QUERY_NUM_CS_FLUSHES:
+		query->end_result = sctx->num_cs_flushes;
+		break;
+	case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+		query->end_result = sctx->num_cb_cache_flushes;
+		break;
+	case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+		query->end_result = sctx->num_db_cache_flushes;
+		break;
+	case SI_QUERY_NUM_L2_INVALIDATES:
+		query->end_result = sctx->num_L2_invalidates;
+		break;
+	case SI_QUERY_NUM_L2_WRITEBACKS:
+		query->end_result = sctx->num_L2_writebacks;
+		break;
+	case SI_QUERY_NUM_RESIDENT_HANDLES:
+		query->end_result = sctx->num_resident_handles;
+		break;
+	case SI_QUERY_TC_OFFLOADED_SLOTS:
+		query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+		break;
+	case SI_QUERY_TC_DIRECT_SLOTS:
+		query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+		break;
+	case SI_QUERY_TC_NUM_SYNCS:
+		query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
+		break;
+	case SI_QUERY_REQUESTED_VRAM:
+	case SI_QUERY_REQUESTED_GTT:
+	case SI_QUERY_MAPPED_VRAM:
+	case SI_QUERY_MAPPED_GTT:
+	case SI_QUERY_VRAM_USAGE:
+	case SI_QUERY_VRAM_VIS_USAGE:
+	case SI_QUERY_GTT_USAGE:
+	case SI_QUERY_GPU_TEMPERATURE:
+	case SI_QUERY_CURRENT_GPU_SCLK:
+	case SI_QUERY_CURRENT_GPU_MCLK:
+	case SI_QUERY_BUFFER_WAIT_TIME:
+	case SI_QUERY_GFX_IB_SIZE:
+	case SI_QUERY_NUM_MAPPED_BUFFERS:
+	case SI_QUERY_NUM_GFX_IBS:
+	case SI_QUERY_NUM_SDMA_IBS:
+	case SI_QUERY_NUM_BYTES_MOVED:
+	case SI_QUERY_NUM_EVICTIONS:
+	case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
+		/* Uses the function-scope ws_id; a second declaration here would shadow it. */
+		ws_id = winsys_id_from_type(query->b.type);
+		query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+		break;
+	case SI_QUERY_GFX_BO_LIST_SIZE:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+		query->end_time = sctx->ws->query_value(sctx->ws,
+							RADEON_NUM_GFX_IBS);
+		break;
+	case SI_QUERY_CS_THREAD_BUSY:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+		query->end_time = os_time_get_nano();
+		break;
+	case SI_QUERY_GALLIUM_THREAD_BUSY:
+		query->end_result =
+			sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+		query->end_time = os_time_get_nano();
+		break;
+	case SI_QUERY_GPU_LOAD:
+	case SI_QUERY_GPU_SHADERS_BUSY:
+	case SI_QUERY_GPU_TA_BUSY:
+	case SI_QUERY_GPU_GDS_BUSY:
+	case SI_QUERY_GPU_VGT_BUSY:
+	case SI_QUERY_GPU_IA_BUSY:
+	case SI_QUERY_GPU_SX_BUSY:
+	case SI_QUERY_GPU_WD_BUSY:
+	case SI_QUERY_GPU_BCI_BUSY:
+	case SI_QUERY_GPU_SC_BUSY:
+	case SI_QUERY_GPU_PA_BUSY:
+	case SI_QUERY_GPU_DB_BUSY:
+	case SI_QUERY_GPU_CP_BUSY:
+	case SI_QUERY_GPU_CB_BUSY:
+	case SI_QUERY_GPU_SDMA_BUSY:
+	case SI_QUERY_GPU_PFP_BUSY:
+	case SI_QUERY_GPU_MEQ_BUSY:
+	case SI_QUERY_GPU_ME_BUSY:
+	case SI_QUERY_GPU_SURF_SYNC_BUSY:
+	case SI_QUERY_GPU_CP_DMA_BUSY:
+	case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+		query->end_result = si_end_counter(sctx->screen,
+						     query->b.type,
+						     query->begin_result);
+		query->begin_result = 0; /* si_end_counter() already consumed the begin sample */
+		break;
+	case SI_QUERY_NUM_COMPILATIONS:
+		query->end_result = p_atomic_read(&sctx->screen->num_compilations);
+		break;
+	case SI_QUERY_NUM_SHADERS_CREATED:
+		query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
+		break;
+	case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+		query->end_result = sctx->last_tex_ps_draw_ratio;
+		break;
+	case SI_QUERY_NUM_SHADER_CACHE_HITS:
+		query->end_result =
+			p_atomic_read(&sctx->screen->num_shader_cache_hits);
+		break;
+	case SI_QUERY_GPIN_ASIC_ID:
+	case SI_QUERY_GPIN_NUM_SIMD:
+	case SI_QUERY_GPIN_NUM_RB:
+	case SI_QUERY_GPIN_NUM_SPI:
+	case SI_QUERY_GPIN_NUM_SE:
+		break; /* answered directly in si_query_sw_get_result() */
+	default:
+		unreachable("si_query_sw_end: bad query type");
+	}
+
+	return true;
+}
+
+/* Turn the begin/end samples of a software query into its reported value. */
+static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *rquery,
+				   bool wait, union pipe_query_result *result)
+{
+	struct si_query_sw *query = (struct si_query_sw *)rquery;
+
+	switch (query->b.type) {
+	case PIPE_QUERY_TIMESTAMP_DISJOINT:
+		/* Convert from cycles per millisecond to cycles per second (Hz). */
+		result->timestamp_disjoint.frequency =
+			(uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
+		result->timestamp_disjoint.disjoint = false;
+		return true;
+	case PIPE_QUERY_GPU_FINISHED: {
+		struct pipe_screen *screen = sctx->b.screen;
+		struct pipe_context *ctx = rquery->b.flushed ? NULL : &sctx->b;
+
+		result->b = screen->fence_finish(screen, ctx, query->fence,
+						 wait ? PIPE_TIMEOUT_INFINITE : 0);
+		return result->b; /* the only path that can return false */
+	}
+	case SI_QUERY_GFX_BO_LIST_SIZE:
+	case SI_QUERY_CS_THREAD_BUSY:
+	case SI_QUERY_GALLIUM_THREAD_BUSY: {
+		uint64_t num = query->end_result - query->begin_result;
+		uint64_t den = query->end_time - query->begin_time;
+
+		/* The two *_THREAD_BUSY types report a percentage. */
+		if (query->b.type != SI_QUERY_GFX_BO_LIST_SIZE)
+			num *= 100;
+		/* den == 0 (no IB submitted / no time elapsed) must not be divided by; report 0. */
+		result->u64 = den ? num / den : 0;
+		return true;
+	}
+	case SI_QUERY_GPIN_ASIC_ID:
+		result->u32 = 0;
+		return true;
+	case SI_QUERY_GPIN_NUM_SIMD:
+		result->u32 = sctx->screen->info.num_good_compute_units;
+		return true;
+	case SI_QUERY_GPIN_NUM_RB:
+		result->u32 = sctx->screen->info.num_render_backends;
+		return true;
+	case SI_QUERY_GPIN_NUM_SPI:
+		result->u32 = 1; /* all supported chips have one SPI per SE */
+		return true;
+	case SI_QUERY_GPIN_NUM_SE:
+		result->u32 = sctx->screen->info.max_se;
+		return true;
+	}
+
+	result->u64 = query->end_result - query->begin_result; /* default: plain difference */
+	switch (query->b.type) {
+	case SI_QUERY_BUFFER_WAIT_TIME:
+	case SI_QUERY_GPU_TEMPERATURE:
+		result->u64 /= 1000; /* scale down by 1000 (wait time comes from RADEON_BUFFER_WAIT_TIME_NS) */
+		break;
+	case SI_QUERY_CURRENT_GPU_SCLK:
+	case SI_QUERY_CURRENT_GPU_MCLK:
+		result->u64 *= 1000000; /* presumably MHz -> Hz - TODO confirm winsys units */
+		break;
+	}
+	return true;
+}
+
+
+static struct si_query_ops sw_query_ops = { /* ops table installed on every query made by si_query_sw_create() */
+	.destroy = si_query_sw_destroy,
+	.begin = si_query_sw_begin,
+	.end = si_query_sw_end,
+	.get_result = si_query_sw_get_result,
+	.get_result_resource = NULL /* software queries provide no get_result_resource hook */
+};
+
+/* Allocate a software query of the given type with CALLOC_STRUCT; NULL on failure. */
+static struct pipe_query *si_query_sw_create(unsigned query_type)
+{
+	struct si_query_sw *swq = CALLOC_STRUCT(si_query_sw);
+
+	if (swq == NULL)
+		return NULL;
+
+	swq->b.ops = &sw_query_ops;
+	swq->b.type = query_type;
+
+	return (struct pipe_query *)swq;
+}
+
+/* Destroy a hardware query: release every result buffer it owns, then free it. */
+void si_query_hw_destroy(struct si_screen *sscreen, struct si_query *rquery)
+{
+	struct si_query_hw *hwq = (struct si_query_hw *)rquery;
+	struct si_query_buffer *node, *older;
+
+	/* Walk the chain of older buffer nodes hanging off the embedded head. */
+	for (node = hwq->buffer.previous; node != NULL; node = older) {
+		older = node->previous;
+		r600_resource_reference(&node->buf, NULL);
+		FREE(node);
+	}
+
+	/* The head node is embedded in the query; only its resource is dropped. */
+	r600_resource_reference(&hwq->buffer.buf, NULL);
+	r600_resource_reference(&hwq->workaround_buf, NULL);
+	FREE(rquery);
+}
+
+/*
+ * Create and initialise one result buffer for "query". Returns NULL when
+ * either the allocation or the query's prepare_buffer hook fails.
+ */
+static struct r600_resource *si_new_query_buffer(struct si_screen *sscreen,
+						 struct si_query_hw *query)
+{
+	struct r600_resource *qbuf;
+	unsigned size = MAX2(query->result_size, sscreen->info.min_alloc_size);
+
+	/* The GPU writes results and the CPU usually reads them back
+	 * afterwards, so a staging buffer is a reasonable fit.
+	 */
+	qbuf = r600_resource(pipe_buffer_create(&sscreen->b, 0,
+						PIPE_USAGE_STAGING, size));
+	if (qbuf == NULL)
+		return NULL;
+
+	if (query->ops->prepare_buffer(sscreen, query, qbuf))
+		return qbuf;
+	r600_resource_reference(&qbuf, NULL);
+	return NULL;
+}
+
+/* Default prepare_buffer hook: zero the buffer and, for occlusion queries, set the
+ * top bit of the begin/end samples of disabled render backends in every result slot.
+ * Returns false if the buffer cannot be mapped. */
+static bool si_query_hw_prepare_buffer(struct si_screen *sscreen, struct si_query_hw *query,
+				       struct r600_resource *buffer)
+{
+	/* Callers ensure that the buffer is currently unused by the GPU. */
+	uint32_t *results = sscreen->ws->buffer_map(buffer->buf, NULL,
+						   PIPE_TRANSFER_WRITE |
+						   PIPE_TRANSFER_UNSYNCHRONIZED);
+	if (!results)
+		return false;
+
+	memset(results, 0, buffer->b.b.width0);
+
+	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+		unsigned max_rbs = sscreen->info.num_render_backends;
+		unsigned enabled_rb_mask = sscreen->info.enabled_rb_mask;
+		unsigned num_results = buffer->b.b.width0 / query->result_size;
+
+		for (unsigned j = 0; j < num_results; j++) {
+			for (unsigned i = 0; i < max_rbs; i++) {
+				if (!(enabled_rb_mask & (1u << i))) {
+					results[(i * 4)+1] = 0x80000000;
+					results[(i * 4)+3] = 0x80000000;
+				}
+			}
+			/* A slot spans result_size bytes: the RB samples plus fence and padding. */
+			results += query->result_size / 4;
+		}
+	}
+
+	return true;
+}
+
+static void si_query_hw_get_result_resource(struct si_context *sctx,
+					    struct si_query *rquery,
+					    bool wait,
+					    enum pipe_query_value_type result_type,
+					    int index,
+					    struct pipe_resource *resource,
+					    unsigned offset); /* forward declaration so the table below can reference it */
+
+static struct si_query_ops query_hw_ops = { /* si_query ops installed by si_query_hw_create() */
+	.destroy = si_query_hw_destroy,
+	.begin = si_query_hw_begin,
+	.end = si_query_hw_end,
+	.get_result = si_query_hw_get_result,
+	.get_result_resource = si_query_hw_get_result_resource,
+};
+
+static void si_query_hw_do_emit_start(struct si_context *sctx,
+				      struct si_query_hw *query,
+				      struct r600_resource *buffer,
+				      uint64_t va);
+static void si_query_hw_do_emit_stop(struct si_context *sctx,
+				     struct si_query_hw *query,
+				     struct r600_resource *buffer,
+				     uint64_t va);
+static void si_query_hw_add_result(struct si_screen *sscreen,
+				   struct si_query_hw *, void *buffer,
+				   union pipe_query_result *result);
+static void si_query_hw_clear_result(struct si_query_hw *,
+				     union pipe_query_result *); /* forward declarations for the table below */
+
+static struct si_query_hw_ops query_hw_default_hw_ops = { /* default hooks assigned to query->ops in si_query_hw_create() */
+	.prepare_buffer = si_query_hw_prepare_buffer,
+	.emit_start = si_query_hw_do_emit_start,
+	.emit_stop = si_query_hw_do_emit_stop,
+	.clear_result = si_query_hw_clear_result,
+	.add_result = si_query_hw_add_result,
+};
+
+/* Allocate the first result buffer of a hardware query.
+ * Returns false (leaving query->buffer.buf NULL) if that fails. */
+bool si_query_hw_init(struct si_screen *sscreen, struct si_query_hw *query)
+{
+	struct r600_resource *first = si_new_query_buffer(sscreen, query);
+
+	query->buffer.buf = first;
+	return first != NULL;
+}
+
+/* Create a hardware query of "query_type"; "index" is the stream for per-stream
+ * streamout/primitive queries. Returns NULL on allocation failure or bad type.
+ */
+static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
+					     unsigned query_type, unsigned index)
+{
+	struct si_query_hw *hwq = CALLOC_STRUCT(si_query_hw);
+
+	if (hwq == NULL)
+		return NULL;
+
+	hwq->b.type = query_type;
+	hwq->b.ops = &query_hw_ops;
+	hwq->ops = &query_hw_default_hw_ops;
+
+	switch (query_type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+		/* 16 bytes per render backend, plus 16 for the fence + alignment. */
+		hwq->result_size = 16 * sscreen->info.num_render_backends + 16;
+		hwq->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+		break;
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		/* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+		hwq->result_size = 64;
+		hwq->num_cs_dw_end = 0;
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		hwq->result_size = 24;
+		hwq->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+		break;
+	case PIPE_QUERY_TIMESTAMP:
+		hwq->result_size = 16;
+		hwq->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+		hwq->flags = SI_QUERY_HW_FLAG_NO_START;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+		hwq->result_size = 32;
+		hwq->num_cs_dw_end = 6;
+		hwq->stream = index;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+		hwq->result_size = 32 * SI_MAX_STREAMS;
+		hwq->num_cs_dw_end = 6 * SI_MAX_STREAMS;
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		/* 11 values on GCN, plus 8 bytes for the fence + alignment. */
+		hwq->result_size = 11 * 16 + 8;
+		hwq->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+		break;
+	default:
+		assert(0);
+		FREE(hwq);
+		return NULL;
+	}
+
+	if (si_query_hw_init(sscreen, hwq))
+		return (struct pipe_query *)hwq;
+	FREE(hwq);
+	return NULL;
+}
+
+/* Apply "diff" to the active occlusion query counts; notify the context if an enable flips. */
+static void si_update_occlusion_query_state(struct si_context *sctx,
+					    unsigned type, int diff)
+{
+	bool was_enabled, was_perfect, now_enabled, now_perfect;
+
+	if (type != PIPE_QUERY_OCCLUSION_COUNTER &&
+	    type != PIPE_QUERY_OCCLUSION_PREDICATE &&
+	    type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+		return;
+
+	was_enabled = sctx->num_occlusion_queries != 0;
+	was_perfect = sctx->num_perfect_occlusion_queries != 0;
+	sctx->num_occlusion_queries += diff;
+	assert(sctx->num_occlusion_queries >= 0);
+
+	/* Conservative predicates are left out of the "perfect" count. */
+	if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+		sctx->num_perfect_occlusion_queries += diff;
+		assert(sctx->num_perfect_occlusion_queries >= 0);
+	}
+	now_enabled = sctx->num_occlusion_queries != 0;
+	now_perfect = sctx->num_perfect_occlusion_queries != 0;
+
+	if (now_enabled != was_enabled || now_perfect != was_perfect)
+		si_set_occlusion_query_state(sctx, was_perfect);
+}
+
+static unsigned event_type_for_stream(unsigned stream)
+{
+	if (stream == 1)
+		return V_028A90_SAMPLE_STREAMOUTSTATS1;
+	if (stream == 2)
+		return V_028A90_SAMPLE_STREAMOUTSTATS2;
+	if (stream == 3)
+		return V_028A90_SAMPLE_STREAMOUTSTATS3;
+	return V_028A90_SAMPLE_STREAMOUTSTATS; /* stream 0 and any out-of-range index */
+}
+
+/* Emit an EVENT_WRITE packet that samples the streamout stats of "stream" to "va". */
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
+{
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+	radeon_emit(cs, EVENT_INDEX(3) | EVENT_TYPE(event_type_for_stream(stream)));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+}
+
+/* Default emit_start hook: emit the packets that write a query's begin sample at "va". */
+static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
+				      struct r600_resource *buffer, uint64_t va)
+{
+	struct radeon_cmdbuf *gfx = sctx->gfx_cs;
+
+	if (query->b.type == SI_QUERY_TIME_ELAPSED_SDMA) {
+		si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+		return;
+	}
+	switch (query->b.type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+		radeon_emit(gfx, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(gfx, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, va >> 32);
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		emit_sample_streamout(gfx, va, query->stream);
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		for (unsigned s = 0; s < SI_MAX_STREAMS; ++s)
+			emit_sample_streamout(gfx, va + 32 * s, s);
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+				  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+				  EOP_DATA_SEL_TIMESTAMP, NULL, va,
+				  0, query->b.type);
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		radeon_emit(gfx, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(gfx, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, va >> 32);
+		break;
+	default:
+		assert(0);
+	}
+	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+				  RADEON_PRIO_QUERY);
+}
+
+/* Begin a HW query sample: make room for one more result and emit the begin packets. */
+static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
+{
+	if (!query->buffer.buf)
+		return; // previous buffer allocation failure
+
+	si_update_occlusion_query_state(sctx, query->b.type, 1);
+	si_update_prims_generated_query_state(sctx, query->b.type, 1);
+	if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+		si_need_gfx_cs_space(sctx);
+	/* Get a new query buffer if needed. */
+	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
+		struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+		if (!qbuf) {
+			/* OOM: drop the full buffer so emit_stop cannot write past its end. */
+			r600_resource_reference(&query->buffer.buf, NULL);
+			query->buffer.results_end = 0;
+			return;
+		}
+		*qbuf = query->buffer;
+		query->buffer.results_end = 0;
+		query->buffer.previous = qbuf;
+		query->buffer.buf = si_new_query_buffer(sctx->screen, query);
+		if (!query->buffer.buf)
+			return;
+	}
+
+	/* emit begin query */
+	uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+	query->ops->emit_start(sctx, query, query->buffer.buf, va);
+	sctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
+}
+
+/* Default emit_stop hook: emit the packets that write a query's end sample (the slot
+ * starts at "va") and, for the types that have one, the fence dword after it. */
+static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
+				     struct r600_resource *buffer, uint64_t va)
+{
+	struct radeon_cmdbuf *gfx = sctx->gfx_cs;
+	uint64_t fence_va = 0;
+
+	if (query->b.type == SI_QUERY_TIME_ELAPSED_SDMA) {
+		si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+		return;
+	}
+
+	switch (query->b.type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+		va += 8;
+		radeon_emit(gfx, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(gfx, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, va >> 32);
+
+		fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		va += 16;
+		emit_sample_streamout(gfx, va, query->stream);
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		va += 16;
+		for (unsigned s = 0; s < SI_MAX_STREAMS; ++s)
+			emit_sample_streamout(gfx, va + 32 * s, s);
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		va += 8;
+		/* fall through */
+	case PIPE_QUERY_TIMESTAMP:
+		si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS,
+				  0, EOP_DST_SEL_MEM,
+				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+				  EOP_DATA_SEL_TIMESTAMP, NULL, va,
+				  0, query->b.type);
+		fence_va = va + 8;
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS: {
+		unsigned sample_size = (query->result_size - 8) / 2;
+
+		va += sample_size;
+		radeon_emit(gfx, PKT3(PKT3_EVENT_WRITE, 2, 0));
+		radeon_emit(gfx, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, va >> 32);
+
+		fence_va = va + sample_size;
+		break;
+	}
+	default:
+		assert(0);
+	}
+	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+				  RADEON_PRIO_QUERY);
+
+	if (fence_va == 0)
+		return;
+	si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+			  EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+			  EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va,
+			  0x80000000, query->b.type);
+}
+
+/* End a hardware query sample: emit the end packets, advance the write offset, and
+ * apply the -1 counterparts of the state updates made in si_query_hw_emit_start(). */
+static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
+{
+	uint64_t end_va;
+
+	if (!query->buffer.buf)
+		return; // previous buffer allocation failure
+
+	/* The queries which need begin already called this in begin_query. */
+	if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+		si_need_gfx_cs_space(sctx);
+
+	/* emit end query */
+	end_va = query->buffer.buf->gpu_address + query->buffer.results_end;
+	query->ops->emit_stop(sctx, query, query->buffer.buf, end_va);
+
+	query->buffer.results_end += query->result_size;
+
+	if ((query->flags & SI_QUERY_HW_FLAG_NO_START) == 0)
+		sctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
+
+	si_update_occlusion_query_state(sctx, query->b.type, -1);
+	si_update_prims_generated_query_state(sctx, query->b.type, -1);
+}
+
+/* Emit one SET_PREDICATION packet for the predicate data at "va" inside "buf". */
+static void emit_set_predicate(struct si_context *ctx, struct r600_resource *buf,
+			       uint64_t va, uint32_t op)
+{
+	struct radeon_cmdbuf *gfx = ctx->gfx_cs;
+
+	if (ctx->chip_class < GFX9) {
+		radeon_emit(gfx, PKT3(PKT3_SET_PREDICATION, 1, 0));
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, op | ((va >> 32) & 0xFF));
+	} else {
+		radeon_emit(gfx, PKT3(PKT3_SET_PREDICATION, 2, 0));
+		radeon_emit(gfx, op);
+		radeon_emit(gfx, va);
+		radeon_emit(gfx, va >> 32);
+	}
+	radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ,
+				  RADEON_PRIO_QUERY);
+}
+
+static void si_emit_query_predication(struct si_context *ctx)
+{ /* Emit SET_PREDICATION packets for the query stored in ctx->render_cond, if any. */
+	struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
+	struct si_query_buffer *qbuf;
+	uint32_t op;
+	bool flag_wait, invert;
+
+	if (!query)
+		return; /* no render condition is set */
+
+	invert = ctx->render_cond_invert;
+	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+	if (query->workaround_buf) {
+		op = PRED_OP(PREDICATION_OP_BOOL64);
+	} else {
+		switch (query->b.type) {
+		case PIPE_QUERY_OCCLUSION_COUNTER:
+		case PIPE_QUERY_OCCLUSION_PREDICATE:
+		case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+			op = PRED_OP(PREDICATION_OP_ZPASS);
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+			op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+			invert = !invert; /* overflow has the opposite sense: "visible" pairs with "no overflow" below */
+			break;
+		default:
+			assert(0);
+			return;
+		}
+	}
+
+	/* if true then invert, see GL_ARB_conditional_render_inverted */
+	if (invert)
+		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+	else
+		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+	/* Use the value written by compute shader as a workaround. Note that
+	 * the wait flag does not apply in this predication mode.
+	 *
+	 * The shader outputs the result value to L2. Workarounds only affect VI
+	 * and later, where the CP reads data from L2, so we don't need an
+	 * additional flush.
+	 */
+	if (query->workaround_buf) {
+		uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+		emit_set_predicate(ctx, query->workaround_buf, va, op);
+		return; /* a single packet covers the whole query in this mode */
+	}
+
+	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+	/* emit predicate packets for all data blocks */
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { /* current buffer first, then older ones */
+		unsigned results_base = 0;
+		uint64_t va_base = qbuf->buf->gpu_address;
+
+		while (results_base < qbuf->results_end) { /* one iteration per result slot written so far */
+			uint64_t va = va_base + results_base;
+
+			if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+				for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+					emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+					/* set CONTINUE bit for all packets except the first */
+					op |= PREDICATION_CONTINUE;
+				}
+			} else {
+				emit_set_predicate(ctx, qbuf->buf, va, op);
+				op |= PREDICATION_CONTINUE; /* every packet after the first carries CONTINUE */
+			}
+
+			results_base += query->result_size;
+		}
+	}
+}
+
+/*
+ * pipe_context::create_query hook.
+ *
+ * TIMESTAMP_DISJOINT, GPU_FINISHED and all driver-specific query types
+ * except SI_QUERY_TIME_ELAPSED_SDMA are created through
+ * si_query_sw_create(); every other type goes to si_query_hw_create().
+ */
+static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+{
+	bool driver_sw_type = query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
+			      query_type != SI_QUERY_TIME_ELAPSED_SDMA;
+	bool use_sw = query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
+		      query_type == PIPE_QUERY_GPU_FINISHED ||
+		      driver_sw_type;
+
+	if (!use_sw) {
+		return si_query_hw_create((struct si_screen *)ctx->screen,
+					  query_type, index);
+	}
+
+	return si_query_sw_create(query_type);
+}
+
+/* pipe_context::destroy_query hook: forwards to the query's own destroy op. */
+static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+	struct si_query *squery = (struct si_query *)query;
+	struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
+
+	squery->ops->destroy(sscreen, squery);
+}
+
+/*
+ * pipe_context::begin_query hook: forwards to the query's own begin op and
+ * returns its result.
+ */
+static boolean si_begin_query(struct pipe_context *ctx,
+                                struct pipe_query *query)
+{
+	struct si_query *squery = (struct si_query *)query;
+
+	return squery->ops->begin((struct si_context *)ctx, squery);
+}
+
+/*
+ * Throw away all results accumulated by a hardware query and get its head
+ * buffer ready for new results.
+ *
+ * Every buffer on the "previous" chain is unreferenced and freed. The head
+ * buffer is reused (after prepare_buffer) when it can be accessed without
+ * stalling; otherwise it is replaced by a fresh one from
+ * si_new_query_buffer().
+ *
+ * On return query->buffer.buf may be NULL: prepare_buffer can fail, and
+ * si_new_query_buffer() presumably can too (not visible here). Callers
+ * such as si_query_hw_begin() check for that.
+ */
+void si_query_hw_reset_buffers(struct si_context *sctx,
+			       struct si_query_hw *query)
+{
+	struct si_query_buffer *prev = query->buffer.previous;
+
+	/* Discard the old query buffers. */
+	while (prev) {
+		struct si_query_buffer *qbuf = prev;
+		prev = prev->previous;
+		r600_resource_reference(&qbuf->buf, NULL);
+		FREE(qbuf);
+	}
+
+	query->buffer.results_end = 0;
+	query->buffer.previous = NULL;
+
+	/* Obtain a new buffer if there is none (an earlier failure left the
+	 * head buffer NULL) or if the current one can't be mapped without a
+	 * stall. The NULL test must come first: the other two conditions
+	 * dereference the buffer.
+	 */
+	if (!query->buffer.buf ||
+	    si_rings_is_buffer_referenced(sctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
+	    !sctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
+		r600_resource_reference(&query->buffer.buf, NULL);
+		query->buffer.buf = si_new_query_buffer(sctx->screen, query);
+	} else {
+		/* Reuse in place; drop the buffer if it cannot be prepared. */
+		if (!query->ops->prepare_buffer(sctx->screen, query, query->buffer.buf))
+			r600_resource_reference(&query->buffer.buf, NULL);
+	}
+}
+
+/*
+ * Begin a hardware query.
+ *
+ * Queries flagged SI_QUERY_HW_FLAG_NO_START cannot be begun; that is
+ * asserted and reported as failure. Unless the query is flagged
+ * SI_QUERY_HW_FLAG_BEGIN_RESUMES, earlier results are discarded first. Any
+ * workaround buffer is released, the start is emitted, and the query is
+ * appended to the context's active list.
+ *
+ * Returns false when the query has no result buffer after the start was
+ * emitted, true otherwise.
+ */
+bool si_query_hw_begin(struct si_context *sctx,
+		       struct si_query *rquery)
+{
+	struct si_query_hw *hwq = (struct si_query_hw *)rquery;
+
+	if (hwq->flags & SI_QUERY_HW_FLAG_NO_START) {
+		assert(0);
+		return false;
+	}
+
+	if ((hwq->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES) == 0)
+		si_query_hw_reset_buffers(sctx, hwq);
+
+	r600_resource_reference(&hwq->workaround_buf, NULL);
+
+	si_query_hw_emit_start(sctx, hwq);
+
+	if (hwq->buffer.buf) {
+		LIST_ADDTAIL(&hwq->list, &sctx->active_queries);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * pipe_context::end_query hook: forwards to the query's own end op and
+ * returns its result.
+ */
+static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+	struct si_query *squery = (struct si_query *)query;
+
+	return squery->ops->end((struct si_context *)ctx, squery);
+}
+
+/*
+ * End a hardware query.
+ *
+ * Queries flagged SI_QUERY_HW_FLAG_NO_START have no begin, so their old
+ * results are discarded here before the stop is emitted. All other queries
+ * are taken off the active list after the stop.
+ *
+ * Returns whether the query still has a result buffer.
+ */
+bool si_query_hw_end(struct si_context *sctx,
+		     struct si_query *rquery)
+{
+	struct si_query_hw *hwq = (struct si_query_hw *)rquery;
+
+	if ((hwq->flags & SI_QUERY_HW_FLAG_NO_START) != 0)
+		si_query_hw_reset_buffers(sctx, hwq);
+
+	si_query_hw_emit_stop(sctx, hwq);
+
+	if ((hwq->flags & SI_QUERY_HW_FLAG_NO_START) == 0)
+		LIST_DELINIT(&hwq->list);
+
+	return hwq->buffer.buf != NULL;
+}
+
+/*
+ * Fill "params" with the byte offsets that describe one result slot of
+ * "rquery": the offsets of the begin value, the end value and the fence
+ * dword, plus the stride and count of begin/end pairs for query types that
+ * store several pairs per slot (defaults: stride 0, count 1).
+ *
+ * "index" selects the sub-result for SO_STATISTICS and PIPELINE_STATISTICS
+ * queries and is ignored by the other types. Query types not listed in the
+ * switch are treated as unreachable.
+ */
+static void si_get_hw_query_params(struct si_context *sctx,
+				   struct si_query_hw *rquery, int index,
+				   struct si_hw_query_params *params)
+{
+	unsigned max_rbs = sctx->screen->info.num_render_backends;
+
+	params->pair_stride = 0;
+	params->pair_count = 1;
+
+	switch (rquery->b.type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+		/* One 16-byte begin/end pair per render backend, fence after them. */
+		params->start_offset = 0;
+		params->end_offset = 8;
+		params->fence_offset = max_rbs * 16;
+		params->pair_stride = 16;
+		params->pair_count = max_rbs;
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		params->start_offset = 0;
+		params->end_offset = 8;
+		params->fence_offset = 16;
+		break;
+	case PIPE_QUERY_TIMESTAMP:
+		/* A single value: begin and end refer to the same location. */
+		params->start_offset = 0;
+		params->end_offset = 0;
+		params->fence_offset = 8;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		params->start_offset = 8;
+		params->end_offset = 24;
+		params->fence_offset = params->end_offset + 4;
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		params->start_offset = 0;
+		params->end_offset = 16;
+		params->fence_offset = params->end_offset + 4;
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		/* index 0 yields offsets 8/24, index 1 yields 0/16.
+		 * NOTE(review): index is not range-checked here; presumably
+		 * callers only pass 0 or 1 - confirm. */
+		params->start_offset = 8 - index * 8;
+		params->end_offset = 24 - index * 8;
+		params->fence_offset = params->end_offset + 4;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		params->pair_count = SI_MAX_STREAMS;
+		params->pair_stride = 32;
+		/* fallthrough: the remaining fields match the single-stream case */
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		params->start_offset = 0;
+		params->end_offset = 16;
+
+		/* We can re-use the high dword of the last 64-bit value as a
+		 * fence: it is initialized as 0, and the high bit is set by
+		 * the write of the streamout stats event.
+		 */
+		params->fence_offset = rquery->result_size - 4;
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+	{
+		/* Byte offset of each statistic within the begin block, looked
+		 * up by index; the end block starts 88 bytes later and the
+		 * fence follows both blocks.
+		 * NOTE(review): index is used unchecked against the 11-entry
+		 * table; presumably callers pass 0..10 - confirm. */
+		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
+		params->start_offset = offsets[index];
+		params->end_offset = 88 + offsets[index];
+		params->fence_offset = 2 * 88;
+		break;
+	}
+	default:
+		unreachable("si_get_hw_query_params unsupported");
+	}
+}
+
+/*
+ * Compute "end - start" for a pair of 64-bit values stored in a mapped
+ * query buffer.
+ *
+ * map:             base of the result data, accessed as 32-bit dwords
+ * start_index:     dword index of the low half of the start value
+ * end_index:       dword index of the low half of the end value
+ * test_status_bit: when true, the difference is only returned if bit 63 is
+ *                  set in both values; otherwise 0 is returned
+ *
+ * The full 64-bit difference is returned. Callers accumulate it into
+ * 64-bit result fields, so returning a 32-bit value would silently drop
+ * the upper half of large counts and long elapsed times.
+ */
+static uint64_t si_query_read_result(void *map, unsigned start_index, unsigned end_index,
+				     bool test_status_bit)
+{
+	uint32_t *current_result = (uint32_t*)map;
+	uint64_t start, end;
+
+	/* Each value is stored as two consecutive dwords, low half first. */
+	start = (uint64_t)current_result[start_index] |
+		(uint64_t)current_result[start_index+1] << 32;
+	end = (uint64_t)current_result[end_index] |
+	      (uint64_t)current_result[end_index+1] << 32;
+
+	if (!test_status_bit ||
+	    ((start & 0x8000000000000000ULL) && (end & 0x8000000000000000ULL))) {
+		return end - start;
+	}
+	return 0;
+}
+
+/*
+ * Fold one result slot of a hardware query into "result".
+ *
+ * sscreen: screen; supplies the number of render backends
+ * query:   the query; its type selects how the slot is decoded
+ * buffer:  CPU pointer to the start of the slot
+ * result:  accumulator; counters are added, predicates are OR-ed, and a
+ *          timestamp overwrites the previous value
+ *
+ * Byte offsets into the slot are computed on a char pointer, because
+ * arithmetic on void pointers is a compiler extension rather than
+ * standard C.
+ */
+static void si_query_hw_add_result(struct si_screen *sscreen,
+				     struct si_query_hw *query,
+				     void *buffer,
+				     union pipe_query_result *result)
+{
+	unsigned max_rbs = sscreen->info.num_render_backends;
+	char *slot = (char *)buffer;
+
+	switch (query->b.type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER: {
+		/* Sum the per-render-backend pairs, 16 bytes apart. */
+		for (unsigned i = 0; i < max_rbs; ++i) {
+			unsigned results_base = i * 16;
+			result->u64 +=
+				si_query_read_result(slot + results_base, 0, 2, true);
+		}
+		break;
+	}
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+		/* True as soon as any render backend reports a non-zero count. */
+		for (unsigned i = 0; i < max_rbs; ++i) {
+			unsigned results_base = i * 16;
+			result->b = result->b ||
+				si_query_read_result(slot + results_base, 0, 2, true) != 0;
+		}
+		break;
+	}
+	case PIPE_QUERY_TIME_ELAPSED:
+		result->u64 += si_query_read_result(buffer, 0, 2, false);
+		break;
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		result->u64 += si_query_read_result(buffer, 0, 32/4, false);
+		break;
+	case PIPE_QUERY_TIMESTAMP:
+		result->u64 = *(uint64_t*)buffer;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		/* SAMPLE_STREAMOUTSTATS stores this structure:
+		 * {
+		 *    u64 NumPrimitivesWritten;
+		 *    u64 PrimitiveStorageNeeded;
+		 * }
+		 * We only need NumPrimitivesWritten here. */
+		result->u64 += si_query_read_result(buffer, 2, 6, true);
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		/* Here we read PrimitiveStorageNeeded. */
+		result->u64 += si_query_read_result(buffer, 0, 4, true);
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		result->so_statistics.num_primitives_written +=
+			si_query_read_result(buffer, 2, 6, true);
+		result->so_statistics.primitives_storage_needed +=
+			si_query_read_result(buffer, 0, 4, true);
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		/* Overflow: the two streamout counters advanced by different amounts. */
+		result->b = result->b ||
+			si_query_read_result(buffer, 2, 6, true) !=
+			si_query_read_result(buffer, 0, 4, true);
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		/* Same test for every stream; per-stream data is 32 bytes apart. */
+		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+			result->b = result->b ||
+				si_query_read_result(slot, 2, 6, true) !=
+				si_query_read_result(slot, 0, 4, true);
+			slot += 32;
+		}
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		/* Each counter's end value sits 22 dwords after its begin value. */
+		result->pipeline_statistics.ps_invocations +=
+			si_query_read_result(buffer, 0, 22, false);
+		result->pipeline_statistics.c_primitives +=
+			si_query_read_result(buffer, 2, 24, false);
+		result->pipeline_statistics.c_invocations +=
+			si_query_read_result(buffer, 4, 26, false);
+		result->pipeline_statistics.vs_invocations +=
+			si_query_read_result(buffer, 6, 28, false);
+		result->pipeline_statistics.gs_invocations +=
+			si_query_read_result(buffer, 8, 30, false);
+		result->pipeline_statistics.gs_primitives +=
+			si_query_read_result(buffer, 10, 32, false);
+		result->pipeline_statistics.ia_primitives +=
+			si_query_read_result(buffer, 12, 34, false);
+		result->pipeline_statistics.ia_vertices +=
+			si_query_read_result(buffer, 14, 36, false);
+		result->pipeline_statistics.hs_invocations +=
+			si_query_read_result(buffer, 16, 38, false);
+		result->pipeline_statistics.ds_invocations +=
+			si_query_read_result(buffer, 18, 40, false);
+		result->pipeline_statistics.cs_invocations +=
+			si_query_read_result(buffer, 20, 42, false);
+#if 0 /* for testing */
+		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
+		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
+		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
+		       result->pipeline_statistics.ia_vertices,
+		       result->pipeline_statistics.ia_primitives,
+		       result->pipeline_statistics.vs_invocations,
+		       result->pipeline_statistics.hs_invocations,
+		       result->pipeline_statistics.ds_invocations,
+		       result->pipeline_statistics.gs_invocations,
+		       result->pipeline_statistics.gs_primitives,
+		       result->pipeline_statistics.c_invocations,
+		       result->pipeline_statistics.c_primitives,
+		       result->pipeline_statistics.ps_invocations,
+		       result->pipeline_statistics.cs_invocations);
+#endif
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/*
+ * pipe_context::get_query_result hook: forwards to the query's own
+ * get_result op and returns its result.
+ */
+static boolean si_get_query_result(struct pipe_context *ctx,
+				   struct pipe_query *query, boolean wait,
+				   union pipe_query_result *result)
+{
+	struct si_query *squery = (struct si_query *)query;
+
+	return squery->ops->get_result((struct si_context *)ctx, squery,
+				       wait, result);
+}
+
+/*
+ * pipe_context::get_query_result_resource hook: forwards all arguments to
+ * the query's own get_result_resource op.
+ */
+static void si_get_query_result_resource(struct pipe_context *ctx,
+					 struct pipe_query *query,
+					 boolean wait,
+					 enum pipe_query_value_type result_type,
+					 int index,
+					 struct pipe_resource *resource,
+					 unsigned offset)
+{
+	struct si_query *squery = (struct si_query *)query;
+
+	squery->ops->get_result_resource((struct si_context *)ctx, squery,
+					 wait, result_type, index,
+					 resource, offset);
+}
+
+/* Reset "result" to its cleared state for this query's type. */
+static void si_query_hw_clear_result(struct si_query_hw *query,
+				       union pipe_query_result *result)
+{
+	unsigned query_type = query->b.type;
+
+	util_query_clear_result(result, query_type);
+}
+
+/*
+ * Read back the result of a hardware query on the CPU.
+ *
+ * "result" is cleared first; then every buffer in the query's chain is
+ * mapped for reading and each result slot in it is folded in through the
+ * query's add_result op. Elapsed-time and timestamp results are finally
+ * rescaled using the screen's clock_crystal_freq.
+ *
+ * wait: when false, buffers are mapped with PIPE_TRANSFER_DONTBLOCK
+ *
+ * Returns false as soon as a buffer cannot be mapped ("result" may then
+ * hold a partial value), true otherwise.
+ */
+bool si_query_hw_get_result(struct si_context *sctx,
+			    struct si_query *rquery,
+			    bool wait, union pipe_query_result *result)
+{
+	struct si_screen *sscreen = sctx->screen;
+	struct si_query_hw *query = (struct si_query_hw *)rquery;
+	struct si_query_buffer *qbuf;
+
+	query->ops->clear_result(query, result);
+
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		unsigned results_base = 0;
+		/* char pointer: slot offsets are in bytes, and arithmetic on
+		 * void pointers is not standard C. */
+		char *map;
+
+		if (rquery->b.flushed)
+			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+		if (!map)
+			return false;
+
+		while (results_base != qbuf->results_end) {
+			query->ops->add_result(sscreen, query, map + results_base,
+					       result);
+			results_base += query->result_size;
+		}
+	}
+
+	/* Convert the time to expected units.
+	 * NOTE(review): the 64-bit product can wrap for very large tick
+	 * counts; left unchanged so results stay consistent with any other
+	 * code using the same formula - confirm before changing. */
+	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
+	    rquery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
+	    rquery->type == PIPE_QUERY_TIMESTAMP) {
+		result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
+	}
+	return true;
+}
+
+/*
+ * Re-bind the compute state saved in "st": the compute shader, constant
+ * buffer slot 0 and the first three shader buffers. The references held by
+ * "st" on the saved buffers are dropped after each re-bind.
+ */
+static void si_restore_qbo_state(struct si_context *sctx,
+				 struct si_qbo_state *st)
+{
+	struct pipe_context *pipe = &sctx->b;
+	unsigned slot;
+
+	pipe->bind_compute_state(pipe, st->saved_compute);
+
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+	pipe_resource_reference(&st->saved_const0.buffer, NULL);
+
+	pipe->set_shader_buffers(pipe, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+	for (slot = 0; slot < 3; ++slot) {
+		pipe_resource_reference(&st->saved_ssbo[slot].buffer, NULL);
+	}
+}
+
+/*
+ * Write the result of a hardware query into a buffer resource on the GPU,
+ * using the context's query-result compute shader (created on first use).
+ *
+ * The shader is launched once per buffer in the query's chain, newest
+ * first. When the chain has more than one buffer, a 16-byte zeroed
+ * temporary is bound to carry the running value between launches; the
+ * launch for the last buffer of the chain writes to "resource" at "offset".
+ *
+ * wait:        when true, the CP waits for the fence of the newest result
+ *              before the first launch
+ * result_type: selects the format bits passed in consts.config
+ * index:       sub-result selector; a negative value sets config bit 4
+ *              (presumably "write availability instead of the value" -
+ *              confirm against the shader)
+ *
+ * The function returns silently if the shader or the temporary buffer
+ * cannot be created. The compute state it replaces is saved up front and
+ * restored at the end.
+ */
+static void si_query_hw_get_result_resource(struct si_context *sctx,
+                                              struct si_query *rquery,
+                                              bool wait,
+                                              enum pipe_query_value_type result_type,
+                                              int index,
+                                              struct pipe_resource *resource,
+                                              unsigned offset)
+{
+	struct si_query_hw *query = (struct si_query_hw *)rquery;
+	struct si_query_buffer *qbuf;
+	struct si_query_buffer *qbuf_prev;
+	struct pipe_resource *tmp_buffer = NULL;
+	unsigned tmp_buffer_offset = 0;
+	struct si_qbo_state saved_state = {};
+	struct pipe_grid_info grid = {};
+	struct pipe_constant_buffer constant_buffer = {};
+	struct pipe_shader_buffer ssbo[3];
+	struct si_hw_query_params params;
+	/* Passed to the shader as constant buffer 0; presumably the layout
+	 * must match what the shader reads - confirm against the shader. */
+	struct {
+		uint32_t end_offset;
+		uint32_t result_stride;
+		uint32_t result_count;
+		uint32_t config;
+		uint32_t fence_offset;
+		uint32_t pair_stride;
+		uint32_t pair_count;
+	} consts;
+
+	if (!sctx->query_result_shader) {
+		sctx->query_result_shader = si_create_query_result_cs(sctx);
+		if (!sctx->query_result_shader)
+			return;
+	}
+
+	/* A temporary is only needed when several buffers must be chained. */
+	if (query->buffer.previous) {
+		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+				     &tmp_buffer_offset, &tmp_buffer);
+		if (!tmp_buffer)
+			return;
+	}
+
+	si_save_qbo_state(sctx, &saved_state);
+
+	/* Offsets handed to the shader are relative to start_offset, because
+	 * ssbo[0] is bound starting at start_offset below. */
+	si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
+	consts.end_offset = params.end_offset - params.start_offset;
+	consts.fence_offset = params.fence_offset - params.start_offset;
+	consts.result_stride = query->result_size;
+	consts.pair_stride = params.pair_stride;
+	consts.pair_count = params.pair_count;
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	/* Slots 1 and 2 both start out as the temporary; slot 2 is switched
+	 * to the destination resource for the last buffer of the chain. */
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
+
+	/* A single invocation per launch. */
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	/* Build the config bit mask: bit 4 for a negative index, bit 8 for
+	 * predicate query types (plus 256 for streamout overflow), bit 32 for
+	 * time queries, and 64 / 128 for the 64-bit / signed 32-bit result
+	 * types. The shader-side meaning of each bit is not visible here. */
+	consts.config = 0;
+	if (index < 0)
+		consts.config |= 4;
+	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+		consts.config |= 8;
+	else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+		 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+		consts.config |= 8 | 256;
+	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
+		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
+		consts.config |= 32;
+
+	switch (result_type) {
+	case PIPE_QUERY_TYPE_U64:
+	case PIPE_QUERY_TYPE_I64:
+		consts.config |= 64;
+		break;
+	case PIPE_QUERY_TYPE_I32:
+		consts.config |= 128;
+		break;
+	case PIPE_QUERY_TYPE_U32:
+		break;
+	}
+
+	sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+			qbuf_prev = qbuf->previous;
+			consts.result_count = qbuf->results_end / query->result_size;
+			/* Bits 1 and 2 are per-launch: bit 1 on every launch
+			 * after the first, bit 2 while more buffers follow. */
+			consts.config &= ~3;
+			if (qbuf != &query->buffer)
+				consts.config |= 1;
+			if (qbuf->previous)
+				consts.config |= 2;
+		} else {
+			/* Only read the last timestamp. */
+			qbuf_prev = NULL;
+			consts.result_count = 0;
+			consts.config |= 16;
+			params.start_offset += qbuf->results_end - query->result_size;
+		}
+
+		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = params.start_offset;
+		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+		/* Last buffer of the chain: direct the output to the caller's resource. */
+		if (!qbuf->previous) {
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+
+			r600_resource(resource)->TC_L2_dirty = true;
+		}
+
+		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
+
+		if (wait && qbuf == &query->buffer) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP.
+			 */
+			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
+			va += params.fence_offset;
+
+			si_cp_wait_mem(sctx, va, 0x80000000, 0x80000000, 0);
+		}
+
+		sctx->b.launch_grid(&sctx->b, &grid);
+		sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
+
+	si_restore_qbo_state(sctx, &saved_state);
+	pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+/*
+ * pipe_context::render_condition hook.
+ *
+ * Records "query", "condition" (stored as render_cond_invert) and "mode"
+ * in the context and marks the render-condition atom dirty when a query is
+ * bound. A NULL query clears the render condition.
+ *
+ * For the firmware versions and query types matched below, the predicate
+ * is first resolved into a small workaround buffer by the query-result
+ * compute path, and predication later reads that buffer instead of the raw
+ * query data. If the workaround buffer cannot be allocated, the workaround
+ * is skipped and regular predication is used.
+ */
+static void si_render_condition(struct pipe_context *ctx,
+				struct pipe_query *query,
+				boolean condition,
+				enum pipe_render_cond_flag mode)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_query_hw *rquery = (struct si_query_hw *)query;
+	struct si_atom *atom = &sctx->atoms.s.render_cond;
+
+	if (query) {
+		bool needs_workaround = false;
+
+		/* There was a firmware regression in VI which causes successive
+		 * SET_PREDICATION packets to give the wrong answer for
+		 * non-inverted stream overflow predication.
+		 */
+		if (((sctx->chip_class == VI && sctx->screen->info.pfp_fw_feature < 49) ||
+		     (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
+		    !condition &&
+		    (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
+		     (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
+		      (rquery->buffer.previous ||
+		       rquery->buffer.results_end > rquery->result_size)))) {
+			needs_workaround = true;
+		}
+
+		if (needs_workaround && !rquery->workaround_buf) {
+			/* Predication is forced off while the workaround value
+			 * is computed, then restored. */
+			bool old_force_off = sctx->render_cond_force_off;
+			sctx->render_cond_force_off = true;
+
+			u_suballocator_alloc(
+				sctx->allocator_zeroed_memory, 8, 8,
+				&rquery->workaround_offset,
+				(struct pipe_resource **)&rquery->workaround_buf);
+
+			/* The allocation can fail and leave workaround_buf
+			 * NULL; it must not be used as the destination then.
+			 */
+			if (rquery->workaround_buf) {
+				/* Reset to NULL to avoid a redundant SET_PREDICATION
+				 * from launching the compute grid.
+				 */
+				sctx->render_cond = NULL;
+
+				ctx->get_query_result_resource(
+					ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
+					&rquery->workaround_buf->b.b, rquery->workaround_offset);
+
+				/* Setting this in the render cond atom is too late,
+				 * so set it here. */
+				sctx->flags |= sctx->screen->barrier_flags.L2_to_cp |
+					       SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+			}
+
+			sctx->render_cond_force_off = old_force_off;
+		}
+	}
+
+	sctx->render_cond = query;
+	sctx->render_cond_invert = condition;
+	sctx->render_cond_mode = mode;
+
+	si_set_atom_dirty(sctx, atom, query != NULL);
+}
+
+/*
+ * Emit a stop for every query on the context's active list. The queries
+ * stay on the list. Afterwards num_cs_dw_queries_suspend is expected to be
+ * zero.
+ */
+void si_suspend_queries(struct si_context *sctx)
+{
+	struct si_query_hw *active;
+
+	LIST_FOR_EACH_ENTRY(active, &sctx->active_queries, list)
+		si_query_hw_emit_stop(sctx, active);
+
+	assert(sctx->num_cs_dw_queries_suspend == 0);
+}
+
+/*
+ * Emit a start for every query on the context's active list.
+ * num_cs_dw_queries_suspend is expected to be zero on entry.
+ */
+void si_resume_queries(struct si_context *sctx)
+{
+	struct si_query_hw *active;
+
+	assert(sctx->num_cs_dw_queries_suspend == 0);
+
+	/* Check CS space here. Resuming must not be interrupted by flushes. */
+	si_need_gfx_cs_space(sctx);
+
+	LIST_FOR_EACH_ENTRY(active, &sctx->active_queries, list)
+		si_query_hw_emit_start(sctx, active);
+}
+
+/*
+ * Helper macros for the driver query table below. Each expands to one
+ * pipe_driver_query_info initializer; the SI_QUERY_, PIPE_DRIVER_QUERY_TYPE_
+ * and PIPE_DRIVER_QUERY_RESULT_TYPE_ prefixes are pasted onto the short
+ * names given by the table entries. All three macros are undefined again
+ * right after the table.
+ */
+#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
+	{ \
+		.name = name_, \
+		.query_type = SI_QUERY_##query_type_, \
+		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
+		.group_id = group_id_ \
+	}
+
+/* Entry that belongs to no group (group_id is all ones). */
+#define X(name_, query_type_, type_, result_type_) \
+	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+
+/* Entry that belongs to the software query group SI_QUERY_GROUP_<group_>. */
+#define XG(group_, name_, query_type_, type_, result_type_) \
+	XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
+
+/*
+ * Table of driver-specific queries reported by si_get_driver_query_info().
+ *
+ * The order matters: si_get_num_queries() hides a fixed number of entries
+ * counted from the end of this table, so the trailing GPU-*-busy entries
+ * must stay last and stay in this order.
+ */
+static struct pipe_driver_query_info si_driver_query_list[] = {
+	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
+	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
+	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
+	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
+	X("decompress-calls",		DECOMPRESS_CALLS,	UINT64, AVERAGE),
+	X("MRT-draw-calls",		MRT_DRAW_CALLS,		UINT64, AVERAGE),
+	X("prim-restart-calls",		PRIM_RESTART_CALLS,	UINT64, AVERAGE),
+	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
+	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
+	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
+	X("dma-calls",			DMA_CALLS,		UINT64, AVERAGE),
+	X("cp-dma-calls",		CP_DMA_CALLS,		UINT64, AVERAGE),
+	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
+	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
+	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
+	X("num-CB-cache-flushes",	NUM_CB_CACHE_FLUSHES,	UINT64, AVERAGE),
+	X("num-DB-cache-flushes",	NUM_DB_CACHE_FLUSHES,	UINT64, AVERAGE),
+	X("num-L2-invalidates",		NUM_L2_INVALIDATES,	UINT64, AVERAGE),
+	X("num-L2-writebacks",		NUM_L2_WRITEBACKS,	UINT64, AVERAGE),
+	X("num-resident-handles",	NUM_RESIDENT_HANDLES,	UINT64, AVERAGE),
+	X("tc-offloaded-slots",		TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
+	X("tc-direct-slots",		TC_DIRECT_SLOTS,	UINT64, AVERAGE),
+	X("tc-num-syncs",		TC_NUM_SYNCS,		UINT64, AVERAGE),
+	X("CS-thread-busy",		CS_THREAD_BUSY,		UINT64, AVERAGE),
+	X("gallium-thread-busy",	GALLIUM_THREAD_BUSY,	UINT64, AVERAGE),
+	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
+	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
+	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
+	X("mapped-GTT",			MAPPED_GTT,		BYTES, AVERAGE),
+	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
+	X("num-mapped-buffers",		NUM_MAPPED_BUFFERS,	UINT64, AVERAGE),
+	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
+	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
+	X("GFX-BO-list-size",		GFX_BO_LIST_SIZE,	UINT64, AVERAGE),
+	X("GFX-IB-size",		GFX_IB_SIZE,		UINT64, AVERAGE),
+	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
+	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
+	X("VRAM-CPU-page-faults",	NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
+	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
+	X("VRAM-vis-usage",		VRAM_VIS_USAGE,		BYTES, AVERAGE),
+	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
+	X("back-buffer-ps-draw-ratio",	BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
+
+	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+	 * which use it as a fallback path to detect the GPU type.
+	 *
+	 * Note: The names of these queries are significant for GPUPerfStudio
+	 * (and possibly their order as well). */
+	XG(GPIN, "GPIN_000",		GPIN_ASIC_ID,		UINT, AVERAGE),
+	XG(GPIN, "GPIN_001",		GPIN_NUM_SIMD,		UINT, AVERAGE),
+	XG(GPIN, "GPIN_002",		GPIN_NUM_RB,		UINT, AVERAGE),
+	XG(GPIN, "GPIN_003",		GPIN_NUM_SPI,		UINT, AVERAGE),
+	XG(GPIN, "GPIN_004",		GPIN_NUM_SE,		UINT, AVERAGE),
+
+	X("temperature",		GPU_TEMPERATURE,	UINT64, AVERAGE),
+	X("shader-clock",		CURRENT_GPU_SCLK,	HZ, AVERAGE),
+	X("memory-clock",		CURRENT_GPU_MCLK,	HZ, AVERAGE),
+
+	/* The following queries must be at the end of the list because their
+	 * availability is adjusted dynamically based on the DRM version. */
+	X("GPU-load",			GPU_LOAD,		UINT64, AVERAGE),
+	X("GPU-shaders-busy",		GPU_SHADERS_BUSY,	UINT64, AVERAGE),
+	X("GPU-ta-busy",		GPU_TA_BUSY,		UINT64, AVERAGE),
+	X("GPU-gds-busy",		GPU_GDS_BUSY,		UINT64, AVERAGE),
+	X("GPU-vgt-busy",		GPU_VGT_BUSY,		UINT64, AVERAGE),
+	X("GPU-ia-busy",		GPU_IA_BUSY,		UINT64, AVERAGE),
+	X("GPU-sx-busy",		GPU_SX_BUSY,		UINT64, AVERAGE),
+	X("GPU-wd-busy",		GPU_WD_BUSY,		UINT64, AVERAGE),
+	X("GPU-bci-busy",		GPU_BCI_BUSY,		UINT64, AVERAGE),
+	X("GPU-sc-busy",		GPU_SC_BUSY,		UINT64, AVERAGE),
+	X("GPU-pa-busy",		GPU_PA_BUSY,		UINT64, AVERAGE),
+	X("GPU-db-busy",		GPU_DB_BUSY,		UINT64, AVERAGE),
+	X("GPU-cp-busy",		GPU_CP_BUSY,		UINT64, AVERAGE),
+	X("GPU-cb-busy",		GPU_CB_BUSY,		UINT64, AVERAGE),
+
+	/* SRBM_STATUS2 */
+	X("GPU-sdma-busy",		GPU_SDMA_BUSY,		UINT64, AVERAGE),
+
+	/* CP_STAT */
+	X("GPU-pfp-busy",		GPU_PFP_BUSY,		UINT64, AVERAGE),
+	X("GPU-meq-busy",		GPU_MEQ_BUSY,		UINT64, AVERAGE),
+	X("GPU-me-busy",		GPU_ME_BUSY,		UINT64, AVERAGE),
+	X("GPU-surf-sync-busy",		GPU_SURF_SYNC_BUSY,	UINT64, AVERAGE),
+	X("GPU-cp-dma-busy",		GPU_CP_DMA_BUSY,	UINT64, AVERAGE),
+	X("GPU-scratch-ram-busy",	GPU_SCRATCH_RAM_BUSY,	UINT64, AVERAGE),
+};
+
+#undef X
+#undef XG
+#undef XFULL
+
+/*
+ * Return how many leading entries of si_driver_query_list are exposed on
+ * this screen. The remaining entries, counted from the end of the table,
+ * are hidden depending on the kernel driver and the chip class.
+ */
+static unsigned si_get_num_queries(struct si_screen *sscreen)
+{
+	const unsigned total = ARRAY_SIZE(si_driver_query_list);
+	unsigned hidden;
+
+	if (sscreen->info.drm_major == 3) {
+		/* amdgpu */
+		hidden = sscreen->info.chip_class >= VI ? 0 : 7;
+	} else if (sscreen->info.has_read_registers_query) {
+		/* radeon */
+		hidden = sscreen->info.chip_class == CIK ? 6 : 7;
+	} else {
+		hidden = 21;
+	}
+
+	return total - hidden;
+}
+
+/*
+ * pipe_screen::get_driver_query_info hook.
+ *
+ * With a NULL "info", returns the total number of driver queries: the
+ * exposed table entries plus the performance counters. Otherwise fills
+ * "info" for query "index". Indices past the exposed table entries are
+ * forwarded to si_get_perfcounter_info(); table entries return 1.
+ */
+static int si_get_driver_query_info(struct pipe_screen *screen,
+				    unsigned index,
+				    struct pipe_driver_query_info *info)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	const unsigned table_count = si_get_num_queries(sscreen);
+
+	if (!info)
+		return table_count + si_get_perfcounter_info(sscreen, 0, NULL);
+
+	if (index >= table_count)
+		return si_get_perfcounter_info(sscreen, index - table_count, info);
+
+	*info = si_driver_query_list[index];
+
+	/* Fill in the maximum value for queries with a known upper bound. */
+	switch (info->query_type) {
+	case SI_QUERY_VRAM_VIS_USAGE:
+		info->max_value.u64 = sscreen->info.vram_vis_size;
+		break;
+	case SI_QUERY_GPU_TEMPERATURE:
+		info->max_value.u64 = 125;
+		break;
+	case SI_QUERY_REQUESTED_GTT:
+	case SI_QUERY_GTT_USAGE:
+	case SI_QUERY_MAPPED_GTT:
+		info->max_value.u64 = sscreen->info.gart_size;
+		break;
+	case SI_QUERY_REQUESTED_VRAM:
+	case SI_QUERY_VRAM_USAGE:
+	case SI_QUERY_MAPPED_VRAM:
+		info->max_value.u64 = sscreen->info.vram_size;
+		break;
+	}
+
+	/* Grouped entries are numbered after the performance counter groups. */
+	if (sscreen->perfcounters && info->group_id != ~(unsigned)0)
+		info->group_id += sscreen->perfcounters->num_groups;
+
+	return 1;
+}
+
+/*
+ * pipe_screen::get_driver_query_group_info hook.
+ *
+ * With a NULL "info", returns the number of groups: the performance
+ * counter groups followed by the software query groups. Otherwise
+ * describes group "index" and returns 1, or returns 0 for an index past
+ * the last group. Performance counter groups are handled by
+ * si_get_perfcounter_group_info().
+ *
+ * Note: GPUPerfStudio hardcodes the order of the hardware performance
+ * counter groups, so be careful when changing this and related functions.
+ */
+static int si_get_driver_query_group_info(struct pipe_screen *screen,
+					  unsigned index,
+					  struct pipe_driver_query_group_info *info)
+{
+	struct si_screen *sscreen = (struct si_screen *)screen;
+	unsigned pc_group_count =
+		sscreen->perfcounters ? sscreen->perfcounters->num_groups : 0;
+
+	if (!info)
+		return pc_group_count + SI_NUM_SW_QUERY_GROUPS;
+
+	if (index < pc_group_count)
+		return si_get_perfcounter_group_info(sscreen, index, info);
+
+	if (index - pc_group_count >= SI_NUM_SW_QUERY_GROUPS)
+		return 0;
+
+	info->name = "GPIN";
+	info->max_active_queries = 5;
+	info->num_queries = 5;
+	return 1;
+}
+
+/*
+ * Install the query entry points on the context, hook up the emit function
+ * of the render-condition atom and initialize the active-query list.
+ * render_condition is only installed when the screen reports at least one
+ * render backend.
+ */
+void si_init_query_functions(struct si_context *sctx)
+{
+	struct si_screen *sscreen = (struct si_screen*)sctx->b.screen;
+
+	LIST_INITHEAD(&sctx->active_queries);
+
+	sctx->b.create_query = si_create_query;
+	sctx->b.create_batch_query = si_create_batch_query;
+	sctx->b.destroy_query = si_destroy_query;
+	sctx->b.begin_query = si_begin_query;
+	sctx->b.end_query = si_end_query;
+	sctx->b.get_query_result = si_get_query_result;
+	sctx->b.get_query_result_resource = si_get_query_result_resource;
+
+	sctx->atoms.s.render_cond.emit = si_emit_query_predication;
+
+	if (sscreen->info.num_render_backends > 0)
+		sctx->b.render_condition = si_render_condition;
+}
+
+/* Install the driver-query enumeration entry points on the screen. */
+void si_init_screen_query_functions(struct si_screen *sscreen)
+{
+	sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
+	sscreen->b.get_driver_query_info = si_get_driver_query_info;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_query.h b/lib/mesa/src/gallium/drivers/radeonsi/si_query.h
new file mode 100644
index 000000000..cf2eccd86
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_query.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SI_QUERY_H
+#define SI_QUERY_H
+
+#include "util/u_threaded_context.h"
+
+struct pipe_context;
+struct pipe_query;
+struct pipe_resource;
+
+struct si_screen;
+struct si_context;
+struct si_query;
+struct si_query_hw;
+struct r600_resource;
+
+enum {
+	SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+	SI_QUERY_DECOMPRESS_CALLS,
+	SI_QUERY_MRT_DRAW_CALLS,
+	SI_QUERY_PRIM_RESTART_CALLS,
+	SI_QUERY_SPILL_DRAW_CALLS,
+	SI_QUERY_COMPUTE_CALLS,
+	SI_QUERY_SPILL_COMPUTE_CALLS,
+	SI_QUERY_DMA_CALLS,
+	SI_QUERY_CP_DMA_CALLS,
+	SI_QUERY_NUM_VS_FLUSHES,
+	SI_QUERY_NUM_PS_FLUSHES,
+	SI_QUERY_NUM_CS_FLUSHES,
+	SI_QUERY_NUM_CB_CACHE_FLUSHES,
+	SI_QUERY_NUM_DB_CACHE_FLUSHES,
+	SI_QUERY_NUM_L2_INVALIDATES,
+	SI_QUERY_NUM_L2_WRITEBACKS,
+	SI_QUERY_NUM_RESIDENT_HANDLES,
+	SI_QUERY_TC_OFFLOADED_SLOTS,
+	SI_QUERY_TC_DIRECT_SLOTS,
+	SI_QUERY_TC_NUM_SYNCS,
+	SI_QUERY_CS_THREAD_BUSY,
+	SI_QUERY_GALLIUM_THREAD_BUSY,
+	SI_QUERY_REQUESTED_VRAM,
+	SI_QUERY_REQUESTED_GTT,
+	SI_QUERY_MAPPED_VRAM,
+	SI_QUERY_MAPPED_GTT,
+	SI_QUERY_BUFFER_WAIT_TIME,
+	SI_QUERY_NUM_MAPPED_BUFFERS,
+	SI_QUERY_NUM_GFX_IBS,
+	SI_QUERY_NUM_SDMA_IBS,
+	SI_QUERY_GFX_BO_LIST_SIZE,
+	SI_QUERY_GFX_IB_SIZE,
+	SI_QUERY_NUM_BYTES_MOVED,
+	SI_QUERY_NUM_EVICTIONS,
+	SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
+	SI_QUERY_VRAM_USAGE,
+	SI_QUERY_VRAM_VIS_USAGE,
+	SI_QUERY_GTT_USAGE,
+	SI_QUERY_GPU_TEMPERATURE,
+	SI_QUERY_CURRENT_GPU_SCLK,
+	SI_QUERY_CURRENT_GPU_MCLK,
+	SI_QUERY_GPU_LOAD,
+	SI_QUERY_GPU_SHADERS_BUSY,
+	SI_QUERY_GPU_TA_BUSY,
+	SI_QUERY_GPU_GDS_BUSY,
+	SI_QUERY_GPU_VGT_BUSY,
+	SI_QUERY_GPU_IA_BUSY,
+	SI_QUERY_GPU_SX_BUSY,
+	SI_QUERY_GPU_WD_BUSY,
+	SI_QUERY_GPU_BCI_BUSY,
+	SI_QUERY_GPU_SC_BUSY,
+	SI_QUERY_GPU_PA_BUSY,
+	SI_QUERY_GPU_DB_BUSY,
+	SI_QUERY_GPU_CP_BUSY,
+	SI_QUERY_GPU_CB_BUSY,
+	SI_QUERY_GPU_SDMA_BUSY,
+	SI_QUERY_GPU_PFP_BUSY,
+	SI_QUERY_GPU_MEQ_BUSY,
+	SI_QUERY_GPU_ME_BUSY,
+	SI_QUERY_GPU_SURF_SYNC_BUSY,
+	SI_QUERY_GPU_CP_DMA_BUSY,
+	SI_QUERY_GPU_SCRATCH_RAM_BUSY,
+	SI_QUERY_NUM_COMPILATIONS,
+	SI_QUERY_NUM_SHADERS_CREATED,
+	SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+	SI_QUERY_NUM_SHADER_CACHE_HITS,
+	SI_QUERY_GPIN_ASIC_ID,
+	SI_QUERY_GPIN_NUM_SIMD,
+	SI_QUERY_GPIN_NUM_RB,
+	SI_QUERY_GPIN_NUM_SPI,
+	SI_QUERY_GPIN_NUM_SE,
+	SI_QUERY_TIME_ELAPSED_SDMA,
+	SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
+
+	SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
+};
+
+enum {
+	SI_QUERY_GROUP_GPIN = 0,
+	SI_NUM_SW_QUERY_GROUPS
+};
+
+struct si_query_ops {
+	void (*destroy)(struct si_screen *, struct si_query *);
+	bool (*begin)(struct si_context *, struct si_query *);
+	bool (*end)(struct si_context *, struct si_query *);
+	bool (*get_result)(struct si_context *,
+			   struct si_query *, bool wait,
+			   union pipe_query_result *result);
+	void (*get_result_resource)(struct si_context *,
+				    struct si_query *, bool wait,
+				    enum pipe_query_value_type result_type,
+				    int index,
+				    struct pipe_resource *resource,
+				    unsigned offset);
+};
+
+struct si_query {
+	struct threaded_query b;
+	struct si_query_ops *ops;
+
+	/* The type of query */
+	unsigned type;
+};
+
+enum {
+	SI_QUERY_HW_FLAG_NO_START = (1 << 0),
+	/* gap */
+	/* whether begin_query doesn't clear the result */
+	SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
+};
+
+struct si_query_hw_ops {
+	bool (*prepare_buffer)(struct si_screen *,
+			       struct si_query_hw *,
+			       struct r600_resource *);
+	void (*emit_start)(struct si_context *,
+			   struct si_query_hw *,
+			   struct r600_resource *buffer, uint64_t va);
+	void (*emit_stop)(struct si_context *,
+			  struct si_query_hw *,
+			  struct r600_resource *buffer, uint64_t va);
+	void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
+	void (*add_result)(struct si_screen *screen,
+			   struct si_query_hw *, void *buffer,
+			   union pipe_query_result *result);
+};
+
+struct si_query_buffer {
+	/* The buffer where query results are stored. */
+	struct r600_resource		*buf;
+	/* Offset of the next free result after current query data */
+	unsigned			results_end;
+	/* If a query buffer is full, a new buffer is created and the old one
+	 * is put in here. When we calculate the result, we sum up the samples
+	 * from all buffers. */
+	struct si_query_buffer	*previous;
+};
+
+struct si_query_hw {
+	struct si_query b;
+	struct si_query_hw_ops *ops;
+	unsigned flags;
+
+	/* The query buffer and how many results are in it. */
+	struct si_query_buffer buffer;
+	/* Size of the result in memory for both begin_query and end_query;
+	 * this can be one or two numbers, or it could even be the size of a structure. */
+	unsigned result_size;
+	/* The number of dwords for end_query. */
+	unsigned num_cs_dw_end;
+	/* Linked list of queries */
+	struct list_head list;
+	/* For transform feedback: which stream the query is for */
+	unsigned stream;
+
+	/* Workaround via compute shader */
+	struct r600_resource *workaround_buf;
+	unsigned workaround_offset;
+};
+
+bool si_query_hw_init(struct si_screen *sscreen,
+		      struct si_query_hw *query);
+void si_query_hw_destroy(struct si_screen *sscreen,
+			 struct si_query *rquery);
+bool si_query_hw_begin(struct si_context *sctx,
+		       struct si_query *rquery);
+bool si_query_hw_end(struct si_context *sctx,
+		     struct si_query *rquery);
+bool si_query_hw_get_result(struct si_context *sctx,
+			    struct si_query *rquery,
+			    bool wait,
+			    union pipe_query_result *result);
+
+/* Performance counters */
+enum {
+	/* This block is part of the shader engine */
+	SI_PC_BLOCK_SE = (1 << 0),
+
+	/* Expose per-instance groups instead of summing all instances (within
+	 * an SE). */
+	SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+
+	/* Expose per-SE groups instead of summing instances across SEs. */
+	SI_PC_BLOCK_SE_GROUPS = (1 << 2),
+
+	/* Shader block */
+	SI_PC_BLOCK_SHADER = (1 << 3),
+
+	/* Non-shader block with perfcounters windowed by shaders. */
+	SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+};
+
+/* Describes a hardware block with performance counters. Multiple instances of
+ * each block, possibly per-SE, may exist on the chip. Depending on the block
+ * and on the user's configuration, we either
+ *  (a) expose every instance as a performance counter group,
+ *  (b) expose a single performance counter group that reports the sum over all
+ *      instances, or
+ *  (c) expose one performance counter group per instance, but summed over all
+ *      shader engines.
+ */
+struct si_perfcounter_block {
+	const char *basename;
+	unsigned flags;
+	unsigned num_counters;
+	unsigned num_selectors;
+	unsigned num_instances;
+
+	unsigned num_groups;
+	char *group_names;
+	unsigned group_name_stride;
+
+	char *selector_names;
+	unsigned selector_name_stride;
+
+	void *data;
+};
+
+struct si_perfcounters {
+	unsigned num_groups;
+	unsigned num_blocks;
+	struct si_perfcounter_block *blocks;
+
+	unsigned num_stop_cs_dwords;
+	unsigned num_instance_cs_dwords;
+
+	unsigned num_shader_types;
+	const char * const *shader_type_suffixes;
+	const unsigned *shader_type_bits;
+
+	void (*emit_instance)(struct si_context *,
+			      int se, int instance);
+	void (*emit_shaders)(struct si_context *, unsigned shaders);
+	void (*emit_select)(struct si_context *,
+			    struct si_perfcounter_block *,
+			    unsigned count, unsigned *selectors);
+	void (*emit_start)(struct si_context *,
+			  struct r600_resource *buffer, uint64_t va);
+	void (*emit_stop)(struct si_context *,
+			  struct r600_resource *buffer, uint64_t va);
+	void (*emit_read)(struct si_context *,
+			  struct si_perfcounter_block *,
+			  unsigned count, unsigned *selectors,
+			  struct r600_resource *buffer, uint64_t va);
+
+	void (*cleanup)(struct si_screen *);
+
+	bool separate_se;
+	bool separate_instance;
+};
+
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
+					 unsigned num_queries,
+					 unsigned *query_types);
+
+int si_get_perfcounter_info(struct si_screen *,
+			    unsigned index,
+			    struct pipe_driver_query_info *info);
+int si_get_perfcounter_group_info(struct si_screen *,
+				  unsigned index,
+				  struct pipe_driver_query_group_info *info);
+
+bool si_perfcounters_init(struct si_perfcounters *, unsigned num_blocks);
+void si_perfcounters_add_block(struct si_screen *,
+			       struct si_perfcounters *,
+			       const char *name, unsigned flags,
+			       unsigned counters, unsigned selectors,
+			       unsigned instances, void *data);
+void si_perfcounters_do_destroy(struct si_perfcounters *);
+void si_query_hw_reset_buffers(struct si_context *sctx,
+			       struct si_query_hw *query);
+
+struct si_qbo_state {
+	void *saved_compute;
+	struct pipe_constant_buffer saved_const0;
+	struct pipe_shader_buffer saved_ssbo[3];
+};
+
+#endif /* SI_QUERY_H */
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
index 7a8822738..cd38145da 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -21,8 +22,8 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader.h"
 #include "si_shader_internal.h"
+#include "si_pipe.h"
 
 #include "ac_nir_to_llvm.h"
 
@@ -31,11 +32,23 @@
 #include "compiler/nir/nir.h"
 #include "compiler/nir_types.h"
 
+static nir_variable* tex_get_texture_var(nir_tex_instr *instr)
+{
+	for (unsigned i = 0; i < instr->num_srcs; i++) {
+		switch (instr->src[i].src_type) {
+		case nir_tex_src_texture_deref:
+			return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src));
+		default:
+			break;
+		}
+	}
+
+	return NULL;
+}
 
-static int
-type_size(const struct glsl_type *type)
+static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr)
 {
-   return glsl_count_attribute_slots(type, false);
+	return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
 }
 
 static void scan_instruction(struct tgsi_shader_info *info,
@@ -58,6 +71,15 @@ static void scan_instruction(struct tgsi_shader_info *info,
 		}
 	} else if (instr->type == nir_instr_type_tex) {
 		nir_tex_instr *tex = nir_instr_as_tex(instr);
+		nir_variable *texture = tex_get_texture_var(tex);
+
+		if (!texture) {
+			info->samplers_declared |=
+				u_bit_consecutive(tex->sampler_index, 1);
+		} else {
+			if (texture->data.bindless)
+				info->uses_bindless_samplers = true;
+		}
 
 		switch (tex->op) {
 		case nir_texop_tex:
@@ -78,6 +100,30 @@ static void scan_instruction(struct tgsi_shader_info *info,
 		case nir_intrinsic_load_instance_id:
 			info->uses_instanceid = 1;
 			break;
+		case nir_intrinsic_load_invocation_id:
+			info->uses_invocationid = true;
+			break;
+		case nir_intrinsic_load_num_work_groups:
+			info->uses_grid_size = true;
+			break;
+		case nir_intrinsic_load_local_group_size:
+			/* The block size is translated to IMM with a fixed block size. */
+			if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+				info->uses_block_size = true;
+			break;
+		case nir_intrinsic_load_local_invocation_id:
+		case nir_intrinsic_load_work_group_id: {
+			unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
+			while (mask) {
+				unsigned i = u_bit_scan(&mask);
+
+				if (intr->intrinsic == nir_intrinsic_load_work_group_id)
+					info->uses_block_id[i] = true;
+				else
+					info->uses_thread_id[i] = true;
+			}
+			break;
+		}
 		case nir_intrinsic_load_vertex_id:
 			info->uses_vertexid = 1;
 			break;
@@ -90,15 +136,37 @@ static void scan_instruction(struct tgsi_shader_info *info,
 		case nir_intrinsic_load_primitive_id:
 			info->uses_primid = 1;
 			break;
-		case nir_intrinsic_image_store:
-		case nir_intrinsic_image_atomic_add:
-		case nir_intrinsic_image_atomic_min:
-		case nir_intrinsic_image_atomic_max:
-		case nir_intrinsic_image_atomic_and:
-		case nir_intrinsic_image_atomic_or:
-		case nir_intrinsic_image_atomic_xor:
-		case nir_intrinsic_image_atomic_exchange:
-		case nir_intrinsic_image_atomic_comp_swap:
+		case nir_intrinsic_load_sample_mask_in:
+			info->reads_samplemask = true;
+			break;
+		case nir_intrinsic_load_tess_level_inner:
+		case nir_intrinsic_load_tess_level_outer:
+			info->reads_tess_factors = true;
+			break;
+		case nir_intrinsic_image_deref_load:
+		case nir_intrinsic_image_deref_size:
+		case nir_intrinsic_image_deref_samples: {
+			nir_variable *var = intrinsic_get_var(intr);
+			if (var->data.bindless)
+				info->uses_bindless_images = true;
+
+			break;
+		}
+		case nir_intrinsic_image_deref_store:
+		case nir_intrinsic_image_deref_atomic_add:
+		case nir_intrinsic_image_deref_atomic_min:
+		case nir_intrinsic_image_deref_atomic_max:
+		case nir_intrinsic_image_deref_atomic_and:
+		case nir_intrinsic_image_deref_atomic_or:
+		case nir_intrinsic_image_deref_atomic_xor:
+		case nir_intrinsic_image_deref_atomic_exchange:
+		case nir_intrinsic_image_deref_atomic_comp_swap: {
+			nir_variable *var = intrinsic_get_var(intr);
+			if (var->data.bindless)
+				info->uses_bindless_images = true;
+
+			/* fall-through */
+		}
 		case nir_intrinsic_store_ssbo:
 		case nir_intrinsic_ssbo_atomic_add:
 		case nir_intrinsic_ssbo_atomic_imin:
@@ -112,42 +180,206 @@ static void scan_instruction(struct tgsi_shader_info *info,
 		case nir_intrinsic_ssbo_atomic_comp_swap:
 			info->writes_memory = true;
 			break;
+		case nir_intrinsic_load_deref: {
+			nir_variable *var = intrinsic_get_var(intr);
+			nir_variable_mode mode = var->data.mode;
+			enum glsl_base_type base_type =
+				glsl_get_base_type(glsl_without_array(var->type));
+
+			if (mode == nir_var_shader_in) {
+				switch (var->data.interpolation) {
+				case INTERP_MODE_NONE:
+					if (glsl_base_type_is_integer(base_type))
+						break;
+
+					/* fall-through */
+				case INTERP_MODE_SMOOTH:
+					if (var->data.sample)
+						info->uses_persp_sample = true;
+					else if (var->data.centroid)
+						info->uses_persp_centroid = true;
+					else
+						info->uses_persp_center = true;
+					break;
+
+				case INTERP_MODE_NOPERSPECTIVE:
+					if (var->data.sample)
+						info->uses_linear_sample = true;
+					else if (var->data.centroid)
+						info->uses_linear_centroid = true;
+					else
+						info->uses_linear_center = true;
+					break;
+				}
+			}
+			break;
+		}
+		case nir_intrinsic_interp_deref_at_centroid:
+		case nir_intrinsic_interp_deref_at_sample:
+		case nir_intrinsic_interp_deref_at_offset: {
+			enum glsl_interp_mode interp = intrinsic_get_var(intr)->data.interpolation;
+			switch (interp) {
+			case INTERP_MODE_SMOOTH:
+			case INTERP_MODE_NONE:
+				if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
+					info->uses_persp_opcode_interp_centroid = true;
+				else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
+					info->uses_persp_opcode_interp_sample = true;
+				else
+					info->uses_persp_opcode_interp_offset = true;
+				break;
+			case INTERP_MODE_NOPERSPECTIVE:
+				if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
+					info->uses_linear_opcode_interp_centroid = true;
+				else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
+					info->uses_linear_opcode_interp_sample = true;
+				else
+					info->uses_linear_opcode_interp_offset = true;
+				break;
+			case INTERP_MODE_FLAT:
+				break;
+			default:
+				unreachable("Unsupported interpoation type");
+			}
+			break;
+		}
 		default:
 			break;
 		}
 	}
 }
 
+void si_nir_scan_tess_ctrl(const struct nir_shader *nir,
+			   const struct tgsi_shader_info *info,
+			   struct tgsi_tessctrl_info *out)
+{
+	memset(out, 0, sizeof(*out));
+
+	if (nir->info.stage != MESA_SHADER_TESS_CTRL)
+		return;
+
+	/* Initial value = true. Here the pass will accumulate results from
+	 * multiple segments surrounded by barriers. If tess factors aren't
+	 * written at all, it's a shader bug and we don't care if this will be
+	 * true.
+	 */
+	out->tessfactors_are_def_in_all_invocs = true;
+
+	/* TODO: Implement scanning of tess factors, see tgsi backend. */
+}
+
 void si_nir_scan_shader(const struct nir_shader *nir,
 			struct tgsi_shader_info *info)
 {
 	nir_function *func;
 	unsigned i;
 
-	assert(nir->info.stage == MESA_SHADER_VERTEX ||
-	       nir->info.stage == MESA_SHADER_FRAGMENT);
-
 	info->processor = pipe_shader_type_from_mesa(nir->info.stage);
 	info->num_tokens = 2; /* indicate that the shader is non-empty */
 	info->num_instructions = 2;
 
-	info->num_inputs = nir->num_inputs;
-	info->num_outputs = nir->num_outputs;
+	info->properties[TGSI_PROPERTY_NEXT_SHADER] =
+		pipe_shader_type_from_mesa(nir->info.next_stage);
+
+	if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+		info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] =
+			nir->info.tess.tcs_vertices_out;
+	}
+
+	if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+		if (nir->info.tess.primitive_mode == GL_ISOLINES)
+			info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
+		else
+			info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
+
+		STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
+		STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 ==
+			      PIPE_TESS_SPACING_FRACTIONAL_ODD);
+		STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 ==
+			      PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+
+		info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
+		info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
+		info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
+	}
+
+	if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+		info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
+		info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
+		info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
+		info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
+	}
+
+	if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+		info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
+			nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
+		info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
+
+		if (nir->info.fs.pixel_center_integer) {
+			info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
+				TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+		}
+
+		if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
+			switch (nir->info.fs.depth_layout) {
+			case FRAG_DEPTH_LAYOUT_ANY:
+				info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
+				break;
+			case FRAG_DEPTH_LAYOUT_GREATER:
+				info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
+				break;
+			case FRAG_DEPTH_LAYOUT_LESS:
+				info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
+				break;
+			case FRAG_DEPTH_LAYOUT_UNCHANGED:
+				info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
+				break;
+			default:
+				unreachable("Unknow depth layout");
+			}
+		}
+	}
+
+	if (nir->info.stage == MESA_SHADER_COMPUTE) {
+		info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
+		info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
+		info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
+	}
 
 	i = 0;
+	uint64_t processed_inputs = 0;
+	unsigned num_inputs = 0;
 	nir_foreach_variable(variable, &nir->inputs) {
 		unsigned semantic_name, semantic_index;
-		unsigned attrib_count = glsl_count_attribute_slots(variable->type,
+
+		const struct glsl_type *type = variable->type;
+		if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+			assert(glsl_type_is_array(type));
+			type = glsl_get_array_element(type);
+		}
+
+		unsigned attrib_count = glsl_count_attribute_slots(type,
 								   nir->info.stage == MESA_SHADER_VERTEX);
 
-		assert(attrib_count == 1 && "not implemented");
+		i = variable->data.driver_location;
 
 		/* Vertex shader inputs don't have semantics. The state
 		 * tracker has already mapped them to attributes via
 		 * variable->data.driver_location.
 		 */
-		if (nir->info.stage == MESA_SHADER_VERTEX)
+		if (nir->info.stage == MESA_SHADER_VERTEX) {
+			/* TODO: gather the actual input usage and remove this. */
+			info->input_usage_mask[i] = TGSI_WRITEMASK_XYZW;
+
+			if (glsl_type_is_dual_slot(variable->type)) {
+				num_inputs += 2;
+
+				/* TODO: gather the actual input usage and remove this. */
+				info->input_usage_mask[i+1] = TGSI_WRITEMASK_XYZW;
+			} else
+				num_inputs++;
 			continue;
+		}
 
 		/* Fragment shader position is a system value. */
 		if (nir->info.stage == MESA_SHADER_FRAGMENT &&
@@ -155,150 +387,345 @@ void si_nir_scan_shader(const struct nir_shader *nir,
 			if (variable->data.pixel_center_integer)
 				info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
 					TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+
+			num_inputs++;
 			continue;
 		}
 
-		tgsi_get_gl_varying_semantic(variable->data.location, true,
-					     &semantic_name, &semantic_index);
+		for (unsigned j = 0; j < attrib_count; j++, i++) {
 
-		info->input_semantic_name[i] = semantic_name;
-		info->input_semantic_index[i] = semantic_index;
+			if (processed_inputs & ((uint64_t)1 << i))
+				continue;
 
-		if (variable->data.sample)
-			info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
-		else if (variable->data.centroid)
-			info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
-		else
-			info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
-
-		enum glsl_base_type base_type =
-			glsl_get_base_type(glsl_without_array(variable->type));
+			processed_inputs |= ((uint64_t)1 << i);
+			num_inputs++;
 
-		switch (variable->data.interpolation) {
-		case INTERP_MODE_NONE:
-			if (glsl_base_type_is_integer(base_type)) {
-				info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
-				break;
-			}
+			tgsi_get_gl_varying_semantic(variable->data.location + j, true,
+						     &semantic_name, &semantic_index);
 
-			if (semantic_name == TGSI_SEMANTIC_COLOR) {
-				info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
-				goto persp_locations;
-			}
-			/* fall-through */
-		case INTERP_MODE_SMOOTH:
-			assert(!glsl_base_type_is_integer(base_type));
+			info->input_semantic_name[i] = semantic_name;
+			info->input_semantic_index[i] = semantic_index;
 
-			info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+			if (semantic_name == TGSI_SEMANTIC_PRIMID)
+				info->uses_primid = true;
 
-		persp_locations:
 			if (variable->data.sample)
-				info->uses_persp_sample = true;
+				info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
 			else if (variable->data.centroid)
-				info->uses_persp_centroid = true;
+				info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
 			else
-				info->uses_persp_center = true;
-			break;
+				info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
 
-		case INTERP_MODE_NOPERSPECTIVE:
-			assert(!glsl_base_type_is_integer(base_type));
+			enum glsl_base_type base_type =
+				glsl_get_base_type(glsl_without_array(variable->type));
 
-			info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+			switch (variable->data.interpolation) {
+			case INTERP_MODE_NONE:
+				if (glsl_base_type_is_integer(base_type)) {
+					info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+					break;
+				}
 
-			if (variable->data.sample)
-				info->uses_linear_sample = true;
-			else if (variable->data.centroid)
-				info->uses_linear_centroid = true;
-			else
-				info->uses_linear_center = true;
-			break;
+				if (semantic_name == TGSI_SEMANTIC_COLOR) {
+					info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
+					break;
+				}
+				/* fall-through */
 
-		case INTERP_MODE_FLAT:
-			info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
-			break;
-		}
+			case INTERP_MODE_SMOOTH:
+				assert(!glsl_base_type_is_integer(base_type));
+
+				info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+				break;
+
+			case INTERP_MODE_NOPERSPECTIVE:
+				assert(!glsl_base_type_is_integer(base_type));
 
-		/* TODO make this more precise */
-		if (variable->data.location == VARYING_SLOT_COL0)
-			info->colors_read |= 0x0f;
-		else if (variable->data.location == VARYING_SLOT_COL1)
-			info->colors_read |= 0xf0;
+				info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+				break;
 
-		i++;
+			case INTERP_MODE_FLAT:
+				info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+				break;
+			}
+
+			/* TODO make this more precise */
+			if (variable->data.location == VARYING_SLOT_COL0)
+				info->colors_read |= 0x0f;
+			else if (variable->data.location == VARYING_SLOT_COL1)
+				info->colors_read |= 0xf0;
+		}
 	}
 
+	info->num_inputs = num_inputs;
+
+
 	i = 0;
+	uint64_t processed_outputs = 0;
+	unsigned num_outputs = 0;
 	nir_foreach_variable(variable, &nir->outputs) {
 		unsigned semantic_name, semantic_index;
 
-		if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-			tgsi_get_gl_frag_result_semantic(variable->data.location,
-				&semantic_name, &semantic_index);
-		} else {
-			tgsi_get_gl_varying_semantic(variable->data.location, true,
-						     &semantic_name, &semantic_index);
+		i = variable->data.driver_location;
+
+		const struct glsl_type *type = variable->type;
+		if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+			assert(glsl_type_is_array(type));
+			type = glsl_get_array_element(type);
 		}
 
-		info->output_semantic_name[i] = semantic_name;
-		info->output_semantic_index[i] = semantic_index;
-		info->output_usagemask[i] = TGSI_WRITEMASK_XYZW;
+		unsigned attrib_count = glsl_count_attribute_slots(type, false);
+		for (unsigned k = 0; k < attrib_count; k++, i++) {
 
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_PRIMID:
-			info->writes_primid = true;
-			break;
-		case TGSI_SEMANTIC_VIEWPORT_INDEX:
-			info->writes_viewport_index = true;
-			break;
-		case TGSI_SEMANTIC_LAYER:
-			info->writes_layer = true;
-			break;
-		case TGSI_SEMANTIC_PSIZE:
-			info->writes_psize = true;
-			break;
-		case TGSI_SEMANTIC_CLIPVERTEX:
-			info->writes_clipvertex = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-			info->colors_written |= 1 << semantic_index;
-			break;
-		case TGSI_SEMANTIC_STENCIL:
-			info->writes_stencil = true;
-			break;
-		case TGSI_SEMANTIC_SAMPLEMASK:
-			info->writes_samplemask = true;
-			break;
-		case TGSI_SEMANTIC_EDGEFLAG:
-			info->writes_edgeflag = true;
-			break;
-		case TGSI_SEMANTIC_POSITION:
-			if (info->processor == PIPE_SHADER_FRAGMENT)
-				info->writes_z = true;
-			else
-				info->writes_position = true;
-			break;
+			if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+				tgsi_get_gl_frag_result_semantic(variable->data.location + k,
+					&semantic_name, &semantic_index);
+
+				/* Adjust for dual source blending */
+				if (variable->data.index > 0) {
+					semantic_index++;
+				}
+			} else {
+				tgsi_get_gl_varying_semantic(variable->data.location + k, true,
+							     &semantic_name, &semantic_index);
+			}
+
+			unsigned num_components = 4;
+			unsigned vector_elements = glsl_get_vector_elements(glsl_without_array(variable->type));
+			if (vector_elements)
+				num_components = vector_elements;
+
+			unsigned component = variable->data.location_frac;
+			if (glsl_type_is_64bit(glsl_without_array(variable->type))) {
+				if (glsl_type_is_dual_slot(glsl_without_array(variable->type)) && k % 2) {
+					num_components = (num_components * 2) - 4;
+					component = 0;
+				} else {
+					num_components = MIN2(num_components * 2, 4);
+				}
+			}
+
+			ubyte usagemask = 0;
+			for (unsigned j = component; j < num_components + component; j++) {
+				switch (j) {
+				case 0:
+					usagemask |= TGSI_WRITEMASK_X;
+					break;
+				case 1:
+					usagemask |= TGSI_WRITEMASK_Y;
+					break;
+				case 2:
+					usagemask |= TGSI_WRITEMASK_Z;
+					break;
+				case 3:
+					usagemask |= TGSI_WRITEMASK_W;
+					break;
+				default:
+					unreachable("error calculating component index");
+				}
+			}
+
+			unsigned gs_out_streams;
+			if (variable->data.stream & (1u << 31)) {
+				gs_out_streams = variable->data.stream & ~(1u << 31);
+			} else {
+				assert(variable->data.stream < 4);
+				gs_out_streams = 0;
+				for (unsigned j = 0; j < num_components; ++j)
+					gs_out_streams |= variable->data.stream << (2 * (component + j));
+			}
+
+			unsigned streamx = gs_out_streams & 3;
+			unsigned streamy = (gs_out_streams >> 2) & 3;
+			unsigned streamz = (gs_out_streams >> 4) & 3;
+			unsigned streamw = (gs_out_streams >> 6) & 3;
+
+			if (usagemask & TGSI_WRITEMASK_X) {
+				info->output_usagemask[i] |= TGSI_WRITEMASK_X;
+				info->output_streams[i] |= streamx;
+				info->num_stream_output_components[streamx]++;
+			}
+			if (usagemask & TGSI_WRITEMASK_Y) {
+				info->output_usagemask[i] |= TGSI_WRITEMASK_Y;
+				info->output_streams[i] |= streamy << 2;
+				info->num_stream_output_components[streamy]++;
+			}
+			if (usagemask & TGSI_WRITEMASK_Z) {
+				info->output_usagemask[i] |= TGSI_WRITEMASK_Z;
+				info->output_streams[i] |= streamz << 4;
+				info->num_stream_output_components[streamz]++;
+			}
+			if (usagemask & TGSI_WRITEMASK_W) {
+				info->output_usagemask[i] |= TGSI_WRITEMASK_W;
+				info->output_streams[i] |= streamw << 6;
+				info->num_stream_output_components[streamw]++;
+			}
+
+			/* make sure we only count this location once against
+			 * the num_outputs counter.
+			 */
+			if (processed_outputs & ((uint64_t)1 << i))
+				continue;
+
+			processed_outputs |= ((uint64_t)1 << i);
+			num_outputs++;
+
+			info->output_semantic_name[i] = semantic_name;
+			info->output_semantic_index[i] = semantic_index;
+
+			switch (semantic_name) {
+			case TGSI_SEMANTIC_PRIMID:
+				info->writes_primid = true;
+				break;
+			case TGSI_SEMANTIC_VIEWPORT_INDEX:
+				info->writes_viewport_index = true;
+				break;
+			case TGSI_SEMANTIC_LAYER:
+				info->writes_layer = true;
+				break;
+			case TGSI_SEMANTIC_PSIZE:
+				info->writes_psize = true;
+				break;
+			case TGSI_SEMANTIC_CLIPVERTEX:
+				info->writes_clipvertex = true;
+				break;
+			case TGSI_SEMANTIC_COLOR:
+				info->colors_written |= 1 << semantic_index;
+				break;
+			case TGSI_SEMANTIC_STENCIL:
+				info->writes_stencil = true;
+				break;
+			case TGSI_SEMANTIC_SAMPLEMASK:
+				info->writes_samplemask = true;
+				break;
+			case TGSI_SEMANTIC_EDGEFLAG:
+				info->writes_edgeflag = true;
+				break;
+			case TGSI_SEMANTIC_POSITION:
+				if (info->processor == PIPE_SHADER_FRAGMENT)
+					info->writes_z = true;
+				else
+					info->writes_position = true;
+				break;
+			}
+
+			if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+				switch (semantic_name) {
+				case TGSI_SEMANTIC_PATCH:
+					info->reads_perpatch_outputs = true;
+				break;
+				case TGSI_SEMANTIC_TESSINNER:
+				case TGSI_SEMANTIC_TESSOUTER:
+					info->reads_tessfactor_outputs = true;
+				break;
+				default:
+					info->reads_pervertex_outputs = true;
+				}
+			}
 		}
 
-		i++;
+		unsigned loc = variable->data.location;
+		if (nir->info.stage == MESA_SHADER_FRAGMENT &&
+		    loc == FRAG_RESULT_COLOR &&
+		    nir->info.outputs_written & (1ull << loc)) {
+			assert(attrib_count == 1);
+			info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
+		}
 	}
 
+	info->num_outputs = num_outputs;
+
+	struct set *ubo_set = _mesa_set_create(NULL, _mesa_hash_pointer,
+					       _mesa_key_pointer_equal);
+
+	/* Initialise const_file_max[0] */
+	info->const_file_max[0] = -1;
+
+	unsigned ubo_idx = 1;
 	nir_foreach_variable(variable, &nir->uniforms) {
 		const struct glsl_type *type = variable->type;
 		enum glsl_base_type base_type =
 			glsl_get_base_type(glsl_without_array(type));
 		unsigned aoa_size = MAX2(1, glsl_get_aoa_size(type));
 
+		/* Gather buffers declared bitmasks. Note: radeonsi doesn't
+		 * really use the mask (other than ubo_idx == 1 for regular
+		 * uniforms); it's only used for getting the buffer count,
+		 * so we don't need to worry about the ordering.
+		 */
+		if (variable->interface_type != NULL) {
+			if (variable->data.mode == nir_var_uniform) {
+
+				unsigned block_count;
+				if (base_type != GLSL_TYPE_INTERFACE) {
+					struct set_entry *entry =
+						_mesa_set_search(ubo_set, variable->interface_type);
+
+					/* Check if we have already processed
+					 * a member from this ubo.
+					 */
+					if (entry)
+						continue;
+
+					block_count = 1;
+				} else {
+					block_count = aoa_size;
+				}
+
+				info->const_buffers_declared |= u_bit_consecutive(ubo_idx, block_count);
+				ubo_idx += block_count;
+
+				_mesa_set_add(ubo_set, variable->interface_type);
+			}
+
+			if (variable->data.mode == nir_var_shader_storage) {
+				/* TODO: make this more accurate */
+				info->shader_buffers_declared =
+					u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
+			}
+
+			continue;
+		}
+
 		/* We rely on the fact that nir_lower_samplers_as_deref has
 		 * eliminated struct dereferences.
 		 */
-		if (base_type == GLSL_TYPE_SAMPLER)
-			info->samplers_declared |=
-				u_bit_consecutive(variable->data.binding, aoa_size);
-		else if (base_type == GLSL_TYPE_IMAGE)
-			info->images_declared |=
-				u_bit_consecutive(variable->data.binding, aoa_size);
+		if (base_type == GLSL_TYPE_SAMPLER) {
+			if (variable->data.bindless) {
+				info->const_buffers_declared |= 1;
+				info->const_file_max[0] +=
+					glsl_count_attribute_slots(type, false);
+			} else {
+				info->samplers_declared |=
+					u_bit_consecutive(variable->data.binding, aoa_size);
+			}
+		} else if (base_type == GLSL_TYPE_IMAGE) {
+			if (variable->data.bindless) {
+				info->const_buffers_declared |= 1;
+				info->const_file_max[0] +=
+					glsl_count_attribute_slots(type, false);
+			} else {
+				info->images_declared |=
+					u_bit_consecutive(variable->data.binding, aoa_size);
+			}
+		} else if (base_type != GLSL_TYPE_ATOMIC_UINT) {
+			if (strncmp(variable->name, "state.", 6) == 0 ||
+			    strncmp(variable->name, "gl_", 3) == 0) {
+				/* FIXME: figure out why piglit tests with builtin
+				 * uniforms are failing without this.
+				 */
+				info->const_buffers_declared =
+					u_bit_consecutive(0, SI_NUM_CONST_BUFFERS);
+			} else {
+				info->const_buffers_declared |= 1;
+				info->const_file_max[0] +=
+					glsl_count_attribute_slots(type, false);
+			}
+		}
 	}
 
+	_mesa_set_destroy(ubo_set, NULL);
+
 	info->num_written_clipdistance = nir->info.clip_distance_array_size;
 	info->num_written_culldistance = nir->info.cull_distance_array_size;
 	info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
@@ -307,10 +734,6 @@ void si_nir_scan_shader(const struct nir_shader *nir,
 	if (info->processor == PIPE_SHADER_FRAGMENT)
 		info->uses_kill = nir->info.fs.uses_discard;
 
-	/* TODO make this more accurate */
-	info->const_buffers_declared = u_bit_consecutive(0, SI_NUM_CONST_BUFFERS);
-	info->shader_buffers_declared = u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
-
 	func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
 	nir_foreach_block(block, func->impl) {
 		nir_foreach_instr(instr, block)
@@ -350,10 +773,6 @@ si_lower_nir(struct si_shader_selector* sel)
 	 * - ensure constant offsets for texture instructions are folded
 	 *   and copy-propagated
 	 */
-	NIR_PASS_V(sel->nir, nir_lower_io, nir_var_uniform, type_size,
-		   (nir_lower_io_options)0);
-	NIR_PASS_V(sel->nir, nir_lower_uniforms_to_ubo);
-
 	NIR_PASS_V(sel->nir, nir_lower_returns);
 	NIR_PASS_V(sel->nir, nir_lower_vars_to_ssa);
 	NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar);
@@ -364,6 +783,20 @@ si_lower_nir(struct si_shader_selector* sel)
 	};
 	NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options);
 
+	const nir_lower_subgroups_options subgroups_options = {
+		.subgroup_size = 64,
+		.ballot_bit_size = 64,
+		.lower_to_scalar = true,
+		.lower_subgroup_masks = true,
+		.lower_vote_trivial = false,
+		.lower_vote_eq_to_ballot = true,
+	};
+	NIR_PASS_V(sel->nir, nir_lower_subgroups, &subgroups_options);
+
+	ac_lower_indirect_derefs(sel->nir, sel->screen->info.chip_class);
+
+	NIR_PASS_V(sel->nir, nir_lower_load_const_to_scalar);
+
 	bool progress;
 	do {
 		progress = false;
@@ -395,21 +828,19 @@ si_lower_nir(struct si_shader_selector* sel)
 }
 
 static void declare_nir_input_vs(struct si_shader_context *ctx,
-				 struct nir_variable *variable, unsigned rel,
+				 struct nir_variable *variable,
+				 unsigned input_index,
 				 LLVMValueRef out[4])
 {
-	si_llvm_load_input_vs(ctx, variable->data.driver_location / 4 + rel, out);
+	si_llvm_load_input_vs(ctx, input_index, out);
 }
 
 static void declare_nir_input_fs(struct si_shader_context *ctx,
-				 struct nir_variable *variable, unsigned rel,
-				 unsigned *fs_attr_idx,
+				 struct nir_variable *variable,
+				 unsigned input_index,
 				 LLVMValueRef out[4])
 {
-	unsigned slot = variable->data.location + rel;
-
-	assert(variable->data.location >= VARYING_SLOT_VAR0 || rel == 0);
-
+	unsigned slot = variable->data.location;
 	if (slot == VARYING_SLOT_POS) {
 		out[0] = LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT);
 		out[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT);
@@ -419,8 +850,43 @@ static void declare_nir_input_fs(struct si_shader_context *ctx,
 		return;
 	}
 
-	si_llvm_load_input_fs(ctx, *fs_attr_idx, out);
-	(*fs_attr_idx)++;
+	si_llvm_load_input_fs(ctx, input_index, out);
+}
+
+LLVMValueRef
+si_nir_lookup_interp_param(struct ac_shader_abi *abi,
+			   enum glsl_interp_mode interp, unsigned location)
+{
+	/* Returns NULL for flat shading, an unknown location or mode. */
+	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+	bool linear;
+	int idx;
+
+	switch (interp) {
+	case INTERP_MODE_FLAT:
+		return NULL;
+	case INTERP_MODE_SMOOTH:
+	case INTERP_MODE_NONE:
+		linear = false;
+		break;
+	case INTERP_MODE_NOPERSPECTIVE:
+		linear = true;
+		break;
+	default:
+		assert(!"Unhandled interpolation mode.");
+		return NULL;
+	}
+
+	/* Map the location to the matching main-function parameter. */
+	if (location == INTERP_CENTER)
+		idx = linear ? SI_PARAM_LINEAR_CENTER : SI_PARAM_PERSP_CENTER;
+	else if (location == INTERP_CENTROID)
+		idx = linear ? SI_PARAM_LINEAR_CENTROID : SI_PARAM_PERSP_CENTROID;
+	else if (location == INTERP_SAMPLE)
+		idx = linear ? SI_PARAM_LINEAR_SAMPLE : SI_PARAM_PERSP_SAMPLE;
+	else
+		return NULL;
+	return LLVMGetParam(ctx->main_fn, idx);
+}
 
 static LLVMValueRef
@@ -428,69 +894,128 @@ si_nir_load_sampler_desc(struct ac_shader_abi *abi,
 		         unsigned descriptor_set, unsigned base_index,
 		         unsigned constant_index, LLVMValueRef dynamic_index,
 		         enum ac_descriptor_type desc_type, bool image,
-			 bool write)
+			 bool write, bool bindless)
 {
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 	LLVMBuilderRef builder = ctx->ac.builder;
-	LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
-	LLVMValueRef index = dynamic_index;
+	unsigned const_index = base_index + constant_index;
+	bool dcc_off = write;
+
+	/* TODO: images_store and images_atomic are not set */
+	if (!dynamic_index && image &&
+	    (info->images_store | info->images_atomic) & (1 << const_index))
+		dcc_off = true;
 
 	assert(!descriptor_set);
+	assert(!image || desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER);
 
-	if (!index)
-		index = ctx->ac.i32_0;
+	if (bindless) {
+		LLVMValueRef list =
+			LLVMGetParam(ctx->main_fn, ctx->param_bindless_samplers_and_images);
 
-	index = LLVMBuildAdd(builder, index,
-			     LLVMConstInt(ctx->ac.i32, base_index + constant_index, false),
-			     "");
+		/* dynamic_index is the bindless handle */
+		if (image) {
+			return si_load_image_desc(ctx, list, dynamic_index, desc_type,
+						  dcc_off, true);
+		}
 
-	if (image) {
-		assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER);
-		assert(base_index + constant_index < ctx->num_images);
+		/* Since bindless handle arithmetic can contain an unsigned integer
+		 * wraparound and si_load_sampler_desc assumes there isn't any,
+		 * use GEP without "inbounds" (inside ac_build_pointer_add)
+		 * to prevent incorrect code generation and hangs.
+		 */
+		dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
+					     LLVMConstInt(ctx->i32, 2, 0), "");
+		list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
+		return si_load_sampler_desc(ctx, list, ctx->i32_0, desc_type);
+	}
+
+	unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
+	assert(const_index < num_slots);
 
-		if (dynamic_index)
-			index = si_llvm_bound_index(ctx, index, ctx->num_images);
+	LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
+	LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
+
+	if (dynamic_index) {
+		index = LLVMBuildAdd(builder, index, dynamic_index, "");
+
+		/* From the GL_ARB_shader_image_load_store extension spec:
+		 *
+		 *    If a shader performs an image load, store, or atomic
+		 *    operation using an image variable declared as an array,
+		 *    and if the index used to select an individual element is
+		 *    negative or greater than or equal to the size of the
+		 *    array, the results of the operation are undefined but may
+		 *    not lead to termination.
+		 */
+		index = si_llvm_bound_index(ctx, index, num_slots);
+	}
 
-		index = LLVMBuildSub(ctx->gallivm.builder,
+	if (image) {
+		index = LLVMBuildSub(ctx->ac.builder,
 				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 				     index, "");
-
-		/* TODO: be smarter about when we use dcc_off */
-		return si_load_image_desc(ctx, list, index, desc_type, write);
+		return si_load_image_desc(ctx, list, index, desc_type, dcc_off, false);
 	}
 
-	assert(base_index + constant_index < ctx->num_samplers);
-
-	if (dynamic_index)
-		index = si_llvm_bound_index(ctx, index, ctx->num_samplers);
-
-	index = LLVMBuildAdd(ctx->gallivm.builder, index,
+	index = LLVMBuildAdd(ctx->ac.builder, index,
 			     LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
-
 	return si_load_sampler_desc(ctx, list, index, desc_type);
 }
 
+/* Store the four channels of "data" as i32 in ctx->inputs[input_idx..+3]. */
+static void bitcast_inputs(struct si_shader_context *ctx,
+			   LLVMValueRef data[4], unsigned input_idx)
+{
+	LLVMValueRef *dst = &ctx->inputs[input_idx];
+
+	for (unsigned c = 0; c < 4; c++)
+		dst[c] = LLVMBuildBitCast(ctx->ac.builder, data[c], ctx->ac.i32, "");
+}
+
 bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 {
 	struct tgsi_shader_info *info = &ctx->shader->selector->info;
 
-	unsigned fs_attr_idx = 0;
-	nir_foreach_variable(variable, &nir->inputs) {
-		unsigned attrib_count = glsl_count_attribute_slots(variable->type,
-								   nir->info.stage == MESA_SHADER_VERTEX);
-		unsigned input_idx = variable->data.driver_location;
+	if (nir->info.stage == MESA_SHADER_VERTEX ||
+	    nir->info.stage == MESA_SHADER_FRAGMENT) {
+		uint64_t processed_inputs = 0;
+		nir_foreach_variable(variable, &nir->inputs) {
+			unsigned attrib_count = glsl_count_attribute_slots(variable->type,
+									   nir->info.stage == MESA_SHADER_VERTEX);
+			unsigned input_idx = variable->data.driver_location;
 
-		for (unsigned i = 0; i < attrib_count; ++i) {
 			LLVMValueRef data[4];
-
-			if (nir->info.stage == MESA_SHADER_VERTEX)
-				declare_nir_input_vs(ctx, variable, i, data);
-			else if (nir->info.stage == MESA_SHADER_FRAGMENT)
-				declare_nir_input_fs(ctx, variable, i, &fs_attr_idx, data);
-
-			for (unsigned chan = 0; chan < 4; chan++) {
-				ctx->inputs[input_idx + chan] =
-					LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, "");
+			unsigned loc = variable->data.location;
+
+			if (loc >= VARYING_SLOT_VAR0 && nir->info.stage == MESA_SHADER_FRAGMENT)
+				ctx->abi.fs_input_attr_indices[loc - VARYING_SLOT_VAR0] = input_idx / 4;
+
+			for (unsigned i = 0; i < attrib_count; i++) {
+				/* Packed components share the same location so skip
+				 * them if we have already processed the location.
+				 */
+				if (processed_inputs & ((uint64_t)1 << (loc + i))) {
+					input_idx += 4;
+					continue;
+				}
+
+				if (nir->info.stage == MESA_SHADER_VERTEX) {
+					declare_nir_input_vs(ctx, variable, input_idx / 4, data);
+					bitcast_inputs(ctx, data, input_idx);
+					if (glsl_type_is_dual_slot(variable->type)) {
+						input_idx += 4;
+						declare_nir_input_vs(ctx, variable, input_idx / 4, data);
+						bitcast_inputs(ctx, data, input_idx);
+					}
+				} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+					declare_nir_input_fs(ctx, variable, input_idx / 4, data);
+					bitcast_inputs(ctx, data, input_idx);
+				}
+
+				processed_inputs |= ((uint64_t)1 << (loc + i));
+				input_idx += 4;
 			}
 		}
 	}
@@ -502,7 +1027,11 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 	ctx->num_samplers = util_last_bit(info->samplers_declared);
 	ctx->num_images = util_last_bit(info->images_declared);
 
-	ac_nir_translate(&ctx->ac, &ctx->abi, nir, NULL);
+	if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
+		assert(nir->info.stage == MESA_SHADER_COMPUTE);
+		si_declare_compute_memory(ctx);
+	}
+	ac_nir_translate(&ctx->ac, &ctx->abi, nir);
 
 	return true;
 }
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
new file mode 100644
index 000000000..da55c81dd
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "tgsi/tgsi_text.h"
+#include "tgsi/tgsi_ureg.h"
+
+void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
+			unsigned num_layers)
+{
+	const bool layered = num_layers > 1;
+	unsigned sgpr_layout;
+	void **cached;
+
+	switch (type) {
+	case UTIL_BLITTER_ATTRIB_NONE:
+		cached = layered ? &sctx->vs_blit_pos_layered :
+				   &sctx->vs_blit_pos;
+		sgpr_layout = SI_VS_BLIT_SGPRS_POS;
+		break;
+	case UTIL_BLITTER_ATTRIB_COLOR:
+		cached = layered ? &sctx->vs_blit_color_layered :
+				   &sctx->vs_blit_color;
+		sgpr_layout = SI_VS_BLIT_SGPRS_POS_COLOR;
+		break;
+	case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+	case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+		assert(num_layers == 1);
+		cached = &sctx->vs_blit_texcoord;
+		sgpr_layout = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
+		break;
+	default:
+		assert(0);
+		return NULL;
+	}
+	/* Reuse the shader if this variant has been built before. */
+	if (*cached)
+		return *cached;
+
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
+	if (!ureg)
+		return NULL;
+
+	/* Tell the shader to load VS inputs from SGPRs: */
+	ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS, sgpr_layout);
+	ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
+
+	/* This is just a pass-through shader with 1-3 MOV instructions. */
+	struct ureg_dst pos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
+	ureg_MOV(ureg, pos, ureg_DECL_vs_input(ureg, 0));
+
+	if (type != UTIL_BLITTER_ATTRIB_NONE) {
+		struct ureg_dst attr = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0);
+		ureg_MOV(ureg, attr, ureg_DECL_vs_input(ureg, 1));
+	}
+
+	if (layered) {
+		/* The layer output receives the instance ID. */
+		struct ureg_src inst =
+			ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
+		struct ureg_dst layer =
+			ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+		ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
+			 ureg_scalar(inst, TGSI_SWIZZLE_X));
+	}
+	ureg_END(ureg);
+
+	*cached = ureg_create_shader_and_destroy(ureg, &sctx->b);
+	return *cached;
+}
+
+/**
+ * Create the TCS used when TCS is NULL in the VS->TCS->TES chain.
+ * VS outputs go to TES directly in that case, so the shader only
+ * copies the DEFAULT_TESSOUTER_SI and DEFAULT_TESSINNER_SI system
+ * values into the TESSOUTER and TESSINNER outputs.
+ */
+void *si_create_fixed_func_tcs(struct si_context *sctx)
+{
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
+	if (!ureg)
+		return NULL;
+
+	/* Declare both system values first, then both outputs. */
+	struct ureg_src default_outer =
+		ureg_DECL_system_value(ureg, TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI, 0);
+	struct ureg_src default_inner =
+		ureg_DECL_system_value(ureg, TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, 0);
+	struct ureg_dst out_outer =
+		ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+	struct ureg_dst out_inner =
+		ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+
+	ureg_MOV(ureg, out_outer, default_outer);
+	ureg_MOV(ureg, out_inner, default_inner);
+	ureg_END(ureg);
+
+	return ureg_create_shader_and_destroy(ureg, &sctx->b);
+}
+
+/* Create a compute shader implementing clear_buffer or copy_buffer.
+ * num_dwords_per_thread must be a non-zero power of two. Returns whatever
+ * ctx->create_compute_state returns, or NULL if an allocation fails first.
+ */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy)
+{
+	assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+	unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+	if (dst_stream_cache_policy)
+		store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+	/* Don't cache loads, because there is no reuse. */
+	unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+	unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+	for (unsigned i = 0; i < num_mem_ops; i++) {
+		if (i*4 < num_dwords_per_thread)
+			inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
+	}
+
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+	struct ureg_src value;
+	if (!is_copy) {
+		ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]);
+		value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0);
+	}
+
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+	struct ureg_src srcbuf;
+	struct ureg_src *values = NULL;
+	if (is_copy) {
+		srcbuf = ureg_DECL_buffer(ureg, 1, false);
+		values = malloc(num_mem_ops * sizeof(struct ureg_src));
+		if (!values) {
+			/* Out of memory: don't leak the ureg program. */
+			ureg_destroy(ureg);
+			return NULL;
+		}
+	}
+
+	/* If there are multiple stores, the first store writes into 0+tid,
+	 * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+	 */
+	ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+	/* Convert from a "store size unit" into bytes. */
+	ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
+		  ureg_imm1u(ureg, 4 * inst_dwords[0]));
+	ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+	/* Distance between a load and a store for latency hiding. */
+	unsigned load_store_distance = is_copy ? 8 : 0;
+	for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+		int d = i - load_store_distance;
+		if (is_copy && i < num_mem_ops) {
+			if (i) {
+				ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+			}
+			values[i] = ureg_src(ureg_DECL_temporary(ureg));
+			struct ureg_dst dst =
+				ureg_writemask(ureg_dst(values[i]),
+					       u_bit_consecutive(0, inst_dwords[i]));
+			struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+			ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
+					 load_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+
+		if (d >= 0) {
+			if (d) {
+				ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+			}
+			struct ureg_dst dst =
+				ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+			struct ureg_src srcs[] =
+				{ureg_src(store_addr), is_copy ? values[d] : value};
+			ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
+					 store_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+	}
+	ureg_END(ureg);
+
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+	void *cs = ctx->create_compute_state(ctx, &state);
+	/* We own these tokens; assumes create_compute_state copied them. */
+	ureg_free_tokens(state.prog);
+	ureg_destroy(ureg);
+	free(values);
+	return cs;
+}
+
+/* Create the compute shader that is used to collect the results.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * CONST
+ *  0.x = end_offset
+ *  0.y = result_stride
+ *  0.z = result_count
+ *  0.w = bit field:
+ *          1: read previously accumulated values
+ *          2: write accumulated values for chaining
+ *          4: write result available
+ *          8: convert result to boolean (0/1)
+ *         16: only read one dword and use that as result
+ *         32: apply timestamp conversion
+ *         64: store full 64 bits result
+ *        128: store signed 32 bits result
+ *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
+ *  1.x = fence_offset
+ *  1.y = pair_stride
+ *  1.z = pair_count
+ *
+ * BUFFER[0] = query result buffer
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ */
+void *si_create_query_result_cs(struct si_context *sctx)
+{
+	/* TEMP[0].xy = accumulated result so far
+	 * TEMP[0].z = result not available
+	 *
+	 * TEMP[1].x = current result index
+	 * TEMP[1].y = current pair index
+	 */
+	static const char text_tmpl[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL BUFFER[0]\n"
+		"DCL BUFFER[1]\n"
+		"DCL BUFFER[2]\n"
+		"DCL CONST[0][0..1]\n"
+		"DCL TEMP[0..5]\n"
+		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
+		"IMM[1] UINT32 {1, 2, 4, 8}\n"
+		"IMM[2] UINT32 {16, 32, 64, 128}\n"
+		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+		"IMM[4] UINT32 {256, 0, 0, 0}\n"
+
+		"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
+		"UIF TEMP[5]\n"
+			/* Check result availability. */
+			"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
+			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
+			"MOV TEMP[1], TEMP[0].zzzz\n"
+			"NOT TEMP[0].z, TEMP[0].zzzz\n"
+
+			/* Load result if available. */
+			"UIF TEMP[1]\n"
+				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+			"ENDIF\n"
+		"ELSE\n"
+			/* Load previously accumulated result if requested. */
+			"MOV TEMP[0], IMM[0].xxxx\n"
+			"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
+			"UIF TEMP[4]\n"
+				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
+			"ENDIF\n"
+
+			"MOV TEMP[1].x, IMM[0].xxxx\n"
+			"BGNLOOP\n"
+				/* Break if accumulated result so far is not available. */
+				"UIF TEMP[0].zzzz\n"
+					"BRK\n"
+				"ENDIF\n"
+
+				/* Break if result_index >= result_count. */
+				"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
+				"UIF TEMP[5]\n"
+					"BRK\n"
+				"ENDIF\n"
+
+				/* Load fence and check result availability */
+				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
+				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
+				"NOT TEMP[0].z, TEMP[0].zzzz\n"
+				"UIF TEMP[0].zzzz\n"
+					"BRK\n"
+				"ENDIF\n"
+
+				"MOV TEMP[1].y, IMM[0].xxxx\n"
+				"BGNLOOP\n"
+					/* Load start and end. */
+					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
+					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
+					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+					"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
+					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+					"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
+
+					"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
+					"UIF TEMP[5].zzzz\n"
+						/* Load second start/end half-pair and
+						 * take the difference
+						 */
+						"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
+						"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+						"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+						"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+						"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
+					"ENDIF\n"
+
+					"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
+
+					/* Increment pair index */
+					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
+					"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
+					"UIF TEMP[5]\n"
+						"BRK\n"
+					"ENDIF\n"
+				"ENDLOOP\n"
+
+				/* Increment result index */
+				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
+			"ENDLOOP\n"
+		"ENDIF\n"
+
+		"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
+		"UIF TEMP[4]\n"
+			/* Store accumulated data for chaining. */
+			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+		"ELSE\n"
+			"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
+			"UIF TEMP[4]\n"
+				/* Store result availability. */
+				"NOT TEMP[0].z, TEMP[0]\n"
+				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
+				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+
+				"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+				"UIF TEMP[4]\n"
+					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+				"ENDIF\n"
+			"ELSE\n"
+				/* Store result if it is available. */
+				"NOT TEMP[4], TEMP[0].zzzz\n"
+				"UIF TEMP[4]\n"
+					/* Apply timestamp conversion */
+					"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
+					"UIF TEMP[4]\n"
+						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
+						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
+					"ENDIF\n"
+
+					/* Convert to boolean */
+					"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
+					"UIF TEMP[4]\n"
+						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
+						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
+						"MOV TEMP[0].y, IMM[0].xxxx\n"
+					"ENDIF\n"
+
+					"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+					"UIF TEMP[4]\n"
+						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+					"ELSE\n"
+						/* Clamping */
+						"UIF TEMP[0].yyyy\n"
+							"MOV TEMP[0].x, IMM[0].wwww\n"
+						"ENDIF\n"
+
+						"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
+						"UIF TEMP[4]\n"
+							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
+						"ENDIF\n"
+
+						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+					"ENDIF\n"
+				"ENDIF\n"
+			"ENDIF\n"
+		"ENDIF\n"
+
+		"END\n";
+	/* 32 spare bytes leave room for the expanded %u frequency. */
+	char text[sizeof(text_tmpl) + 32];
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {};
+
+	/* Hard code the frequency into the shader so that the backend can
+	 * use the full range of optimizations for divide-by-constant.
+	 */
+	snprintf(text, sizeof(text), text_tmpl,
+		 sctx->screen->info.clock_crystal_freq);
+	/* A translation failure asserts in debug builds and returns NULL. */
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+	/* Pass the stack-allocated tokens to create_compute_state. */
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return sctx->b.create_compute_state(&sctx->b, &state);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
index 8d98d6d0d..3516e5612 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -23,10 +24,8 @@
 
 /* This file handles register programming of primitive binning. */
 
-#include "si_pipe.h"
-#include "sid.h"
+#include "si_build_pm4.h"
 #include "gfx9d.h"
-#include "radeon/r600_cs.h"
 
 struct uvec2 {
 	unsigned x, y;
@@ -38,7 +37,7 @@ struct si_bin_size_map {
 	unsigned bin_size_y;
 };
 
-typedef struct si_bin_size_map si_bin_size_subtable[3][9];
+typedef struct si_bin_size_map si_bin_size_subtable[3][10];
 
 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
 static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
@@ -46,16 +45,16 @@ static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
 				     unsigned sum)
 {
 	unsigned log_num_rb_per_se =
-		util_logbase2_ceil(sscreen->b.info.num_render_backends /
-				   sscreen->b.info.max_se);
-	unsigned log_num_se = util_logbase2_ceil(sscreen->b.info.max_se);
+		util_logbase2_ceil(sscreen->info.num_render_backends /
+				   sscreen->info.max_se);
+	unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
 	unsigned i;
 
 	/* Get the chip-specific subtable. */
 	const struct si_bin_size_map *subtable =
 		&table[log_num_rb_per_se][log_num_se][0];
 
-	for (i = 0; subtable[i].start != UINT_MAX; i++) {
+	for (i = 0; subtable[i].bin_size_x != 0; i++) {
 		if (sum >= subtable[i].start && sum < subtable[i + 1].start)
 			break;
 	}
@@ -67,7 +66,7 @@ static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
 static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 					  unsigned cb_target_enabled_4bit)
 {
-	unsigned nr_samples = sctx->framebuffer.nr_samples;
+	unsigned num_fragments = sctx->framebuffer.nr_color_samples;
 	unsigned sum = 0;
 
 	/* Compute the sum of all Bpp. */
@@ -75,15 +74,15 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 		if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
 			continue;
 
-		struct r600_texture *rtex =
-			(struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-		sum += rtex->surface.bpe;
+		struct si_texture *tex =
+			(struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+		sum += tex->surface.bpe;
 	}
 
 	/* Multiply the sum by some function of the number of samples. */
-	if (nr_samples >= 2) {
-		if (sctx->ps_iter_samples >= 2)
-			sum *= nr_samples;
+	if (num_fragments >= 2) {
+		if (si_get_ps_iter_samples(sctx) >= 2)
+			sum *= num_fragments;
 		else
 			sum *= 2;
 	}
@@ -98,7 +97,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        2,   32,  128 },
 				{        3,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -107,7 +105,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        3,   32,  128 },
 				{        5,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
@@ -115,7 +112,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        3,   64,  128 },
 				{        5,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -125,9 +121,8 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        0,  128,  128 },
 				{        2,   64,  128 },
 				{        3,   32,  128 },
-				{        5,   16,  128 },
+				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -136,7 +131,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        5,   32,  128 },
 				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
@@ -146,7 +140,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        5,   64,  128 },
 				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -158,8 +151,7 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        3,   64,  128 },
 				{        5,   32,  128 },
 				{        9,   16,  128 },
-				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
+				{       17,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -170,18 +162,16 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 				{        9,   32,  128 },
 				{       17,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        3,  128,  256 },
-				{        5,  128,  128 },
-				{        9,   64,  128 },
-				{       17,   16,  128 },
+				{        2,  128,  512 },
+				{        3,   64,  512 },
+				{        5,   32,  512 },
+				{        9,   32,  256 },
+				{       17,   32,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 	};
@@ -200,86 +190,80 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 		return size;
 	}
 
-	struct r600_texture *rtex =
-		(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+	struct si_texture *tex =
+		(struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
 	unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
-	unsigned stencil_coeff = rtex->surface.has_stencil &&
+	unsigned stencil_coeff = tex->surface.has_stencil &&
 				 dsa->stencil_enabled ? 1 : 0;
 	unsigned sum = 4 * (depth_coeff + stencil_coeff) *
-		       sctx->framebuffer.nr_samples;
+		       tex->buffer.b.b.nr_samples;
 
 	static const si_bin_size_subtable table[] = {
 		{
 			// One RB / SE
 			{
 				// One shader engine
-				{        0,  128,  256 },
-				{        2,  128,  128 },
+				{        0,   64,  512 },
+				{        2,   64,  256 },
 				{        4,   64,  128 },
 				{        7,   32,  128 },
 				{       13,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Two shader engines
-				{        0,  256,  256 },
-				{        2,  128,  256 },
-				{        4,  128,  128 },
+				{        0,  128,  512 },
+				{        2,   64,  512 },
+				{        4,   64,  256 },
 				{        7,   64,  128 },
 				{       13,   32,  128 },
 				{       25,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
 			// Two RB / SE
 			{
 				// One shader engine
-				{        0,  256,  256 },
-				{        2,  128,  256 },
-				{        4,  128,  128 },
+				{        0,  128,  512 },
+				{        2,   64,  512 },
+				{        4,   64,  256 },
 				{        7,   64,  128 },
 				{       13,   32,  128 },
 				{       25,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Two shader engines
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   32,  128 },
 				{       49,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  512,  512 },
 				{        2,  256,  512 },
-				{        4,  256,  256 },
-				{        7,  128,  256 },
-				{       13,  128,  128 },
+				{        4,  128,  512 },
+				{        7,   64,  512 },
+				{       13,   64,  256 },
 				{       25,   64,  128 },
 				{       49,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -287,36 +271,36 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 			{
 				// One shader engine
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   32,  128 },
 				{       49,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 			{
 				// Two shader engines
 				{        0,  512,  512 },
 				{        2,  256,  512 },
-				{        4,  256,  256 },
-				{        7,  128,  256 },
-				{       13,  128,  128 },
+				{        4,  128,  512 },
+				{        7,   64,  512 },
+				{       13,   64,  256 },
 				{       25,   64,  128 },
 				{       49,   32,  128 },
 				{       97,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  512,  512 },
 				{        4,  256,  512 },
-				{        7,  256,  256 },
-				{       13,  128,  256 },
-				{       25,  128,  128 },
-				{       49,   64,  128 },
+				{        7,  128,  512 },
+				{       13,   64,  512 },
+				{       25,   32,  512 },
+				{       49,   32,  256 },
 				{       97,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 		},
 	};
@@ -326,25 +310,30 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 
 static void si_emit_dpbb_disable(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-
-	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-			       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-			       S_028C44_DISABLE_START_OF_PRIM(1));
-	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-			       S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+	radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+		SI_TRACKED_PA_SC_BINNER_CNTL_0,
+		S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+		S_028C44_DISABLE_START_OF_PRIM(1));
+	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+				   SI_TRACKED_DB_DFSM_CONTROL,
+				   S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
+				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
-void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
+void si_emit_dpbb_state(struct si_context *sctx)
 {
 	struct si_screen *sscreen = sctx->screen;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
 	unsigned db_shader_control = sctx->ps_db_shader_control;
 
-	assert(sctx->b.chip_class >= GFX9);
+	assert(sctx->chip_class >= GFX9);
 
-	if (!sscreen->dpbb_allowed || !blend || !dsa) {
+	if (!sscreen->dpbb_allowed || !blend || !dsa || sctx->dpbb_force_off) {
 		si_emit_dpbb_disable(sctx);
 		return;
 	}
@@ -354,18 +343,14 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
 			   G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
 			   blend->alpha_to_coverage;
 
-	/* This is ported from Vulkan, but it doesn't make much sense to me.
-	 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
-	 */
-	bool ps_can_reject_z_trivially =
+	bool db_can_reject_z_trivially =
 		!G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
-		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
+		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
+		G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
 
-	/* Disable binning if PS can kill trivially with DB writes.
-	 * Ported from Vulkan. (heuristic?)
-	 */
+	/* Disable DPBB when it's believed to be inefficient. */
 	if (ps_can_kill &&
-	    ps_can_reject_z_trivially &&
+	    db_can_reject_z_trivially &&
 	    sctx->framebuffer.state.zsbuf &&
 	    dsa->db_can_write) {
 		si_emit_dpbb_disable(sctx);
@@ -394,8 +379,13 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
 	/* Enable DFSM if it's preferred. */
 	unsigned punchout_mode = V_028060_FORCE_OFF;
 	bool disable_start_of_prim = true;
+	bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 &&
+				sctx->framebuffer.state.zsbuf &&
+				sctx->framebuffer.nr_samples !=
+				MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
 
 	if (sscreen->dfsm_allowed &&
+	    !zs_eqaa_dfsm_bug &&
 	    cb_target_enabled_4bit &&
 	    !G_02880C_KILL_ENABLE(db_shader_control) &&
 	    /* These two also imply that DFSM is disabled when PS writes to memory. */
@@ -412,9 +402,12 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
 	unsigned persistent_states_per_bin; /* allowed range: [0, 31] */
 	unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
 
-	switch (sctx->b.family) {
+	switch (sctx->family) {
 	case CHIP_VEGA10:
+	case CHIP_VEGA12:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
+	case CHIP_RAVEN2:
 		/* Tuned for Raven. Vega might need different values. */
 		context_states_per_bin = 5;
 		persistent_states_per_bin = 31;
@@ -431,18 +424,24 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
 	if (bin_size.y >= 32)
 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;
 
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-			       S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
-			       S_028C44_BIN_SIZE_X(bin_size.x == 16) |
-			       S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
-			       S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
-			       S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
-			       S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
-			       S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
-			       S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
-			       S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
-			       S_028C44_OPTIMAL_BIN_SELECTION(1));
-	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-			       S_028060_PUNCHOUT_MODE(punchout_mode));
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+	radeon_opt_set_context_reg(
+		sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+		SI_TRACKED_PA_SC_BINNER_CNTL_0,
+		S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
+		S_028C44_BIN_SIZE_X(bin_size.x == 16) |
+		S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
+		S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+		S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
+		S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
+		S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
+		S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
+		S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
+		S_028C44_OPTIMAL_BIN_SELECTION(1));
+	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+				   SI_TRACKED_DB_DFSM_CONTROL,
+				   S_028060_PUNCHOUT_MODE(punchout_mode) |
+				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
index 133f1e4aa..e6d97fe67 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -19,173 +20,178 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * Authors: Marek Olšák <maraeo@gmail.com>
- *
  */
 
-#include "si_pipe.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
 
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
-	(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |		   \
-	(((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) |	   \
-	(((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) |	   \
+	((((unsigned)(s0x) & 0xf) << 0)  | (((unsigned)(s0y) & 0xf) << 4)  | \
+	 (((unsigned)(s1x) & 0xf) << 8)  | (((unsigned)(s1y) & 0xf) << 12) | \
+	 (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
 	 (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
 
-/* 2xMSAA
- * There are two locations (4, 4), (-4, -4). */
-static const uint32_t sample_locs_2x[4] = {
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-};
-/* 4xMSAA
- * There are 4 locations: (-2, -6), (6, -2), (-6, 2), (2, 6). */
-static const uint32_t sample_locs_4x[4] = {
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-};
+/* For obtaining location coordinates from registers */
+#define SEXT4(x)		((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0)))
+#define GET_SFIELD(reg, index)	SEXT4(((reg) >> ((index) * 4)) & 0xf)
+#define GET_SX(reg, index)	GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
+#define GET_SY(reg, index)	GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
 
-/* Cayman 8xMSAA */
+/* The following sample ordering is required by EQAA.
+ *
+ * Sample 0 is approx. in the top-left quadrant.
+ * Sample 1 is approx. in the bottom-right quadrant.
+ *
+ * Sample 2 is approx. in the bottom-left quadrant.
+ * Sample 3 is approx. in the top-right quadrant.
+ * (sample I={2,3} adds more detail to the vicinity of sample I-2)
+ *
+ * Sample 4 is approx. in the same quadrant as sample 0. (top-left)
+ * Sample 5 is approx. in the same quadrant as sample 1. (bottom-right)
+ * Sample 6 is approx. in the same quadrant as sample 2. (bottom-left)
+ * Sample 7 is approx. in the same quadrant as sample 3. (top-right)
+ * (sample I={4,5,6,7} adds more detail to the vicinity of sample I-4)
+ *
+ * The next 8 samples add more detail to the vicinity of the previous samples.
+ * (sample I (I >= 8) adds more detail to the vicinity of sample I-8)
+ *
+ * The ordering is specified such that:
+ *   If we take the first 2 samples, we should get good 2x MSAA.
+ *   If we add 2 more samples, we should get good 4x MSAA with the same sample locations.
+ *   If we add 4 more samples, we should get good 8x MSAA with the same sample locations.
+ *   If we add 8 more samples, we should get perfect 16x MSAA with the same sample locations.
+ *
+ * The ordering also allows finding samples in the same vicinity.
+ *
+ * Group N of 2 samples in the same vicinity in 16x MSAA: {N,N+8}
+ * Group N of 2 samples in the same vicinity in 8x MSAA: {N,N+4}
+ * Group N of 2 samples in the same vicinity in 4x MSAA: {N,N+2}
+ *
+ * Groups of 4 samples in the same vicinity in 16x MSAA:
+ *   Top left:     {0,4,8,12}
+ *   Bottom right: {1,5,9,13}
+ *   Bottom left:  {2,6,10,14}
+ *   Top right:    {3,7,11,15}
+ *
+ * Groups of 4 samples in the same vicinity in 8x MSAA:
+ *   Left half:  {0,2,4,6}
+ *   Right half: {1,3,5,7}
+ *
+ * Groups of 8 samples in the same vicinity in 16x MSAA:
+ *   Left half:  {0,2,4,6,8,10,12,14}
+ *   Right half: {1,3,5,7,9,11,13,15}
+ */
+
+/* 1x MSAA */
+static const uint32_t sample_locs_1x =
+	FILL_SREG( 0, 0,   0, 0,   0, 0,   0, 0); /* S1, S2, S3 fields are not used by 1x */
+static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
+
+/* 2x MSAA (the positions are sorted for EQAA) */
+static const uint32_t sample_locs_2x =
+	FILL_SREG(-4,-4,   4, 4,   0, 0,   0, 0); /* S2 & S3 fields are not used by 2x MSAA */
+static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
+
+/* 4x MSAA (the positions are sorted for EQAA) */
+static const uint32_t sample_locs_4x =
+	FILL_SREG(-2,-6,   2, 6,   -6, 2,  6,-2);
+static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
+
+/* 8x MSAA (the positions are sorted for EQAA) */
 static const uint32_t sample_locs_8x[] = {
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
+	FILL_SREG(-3,-5,   5, 1,  -1, 3,   7,-7),
+	FILL_SREG(-7,-1,   3, 7,  -5, 5,   1,-3),
+	/* The following are unused by hardware, but we emit them to IBs
+	 * instead of multiple SET_CONTEXT_REG packets. */
+	0,
+	0,
 };
-/* Cayman 16xMSAA */
+static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
+
+/* 16x MSAA (the positions are sorted for EQAA) */
 static const uint32_t sample_locs_16x[] = {
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
+	FILL_SREG(-5,-2,   5, 3,  -2, 6,   3,-5),
+	FILL_SREG(-4,-6,   1, 1,  -6, 4,   7,-4),
+	FILL_SREG(-1,-3,   6, 7,  -3, 2,   0,-7),
+	FILL_SREG(-7,-8,   2, 5,  -8, 0,   4,-1),
 };
+static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
 
 static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
 				   unsigned sample_index, float *out_value)
 {
-	int offset, index;
-	struct {
-		int idx:4;
-	} val;
+	const uint32_t *sample_locs;
 
 	switch (sample_count) {
 	case 1:
 	default:
-		out_value[0] = out_value[1] = 0.5;
+		sample_locs = &sample_locs_1x;
 		break;
 	case 2:
-		offset = 4 * (sample_index * 2);
-		val.idx = (sample_locs_2x[0] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_2x[0] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = &sample_locs_2x;
 		break;
 	case 4:
-		offset = 4 * (sample_index * 2);
-		val.idx = (sample_locs_4x[0] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_4x[0] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = &sample_locs_4x;
 		break;
 	case 8:
-		offset = 4 * (sample_index % 4 * 2);
-		index = (sample_index / 4) * 4;
-		val.idx = (sample_locs_8x[index] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_8x[index] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = sample_locs_8x;
 		break;
 	case 16:
-		offset = 4 * (sample_index % 4 * 2);
-		index = (sample_index / 4) * 4;
-		val.idx = (sample_locs_16x[index] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_16x[index] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = sample_locs_16x;
 		break;
 	}
+
+	out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
+	out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
+}
+
+static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs,
+				      uint64_t centroid_priority,
+				      uint32_t sample_locs)
+{
+	radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+	radeon_emit(cs, centroid_priority);
+	radeon_emit(cs, centroid_priority >> 32);
+	radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+}
+
+static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs,
+				       uint64_t centroid_priority,
+				       const uint32_t *sample_locs,
+				       unsigned num_samples)
+{
+	radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+	radeon_emit(cs, centroid_priority);
+	radeon_emit(cs, centroid_priority >> 32);
+	radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
+				   num_samples == 8 ? 14 : 16);
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
 }
 
-void si_emit_sample_locations(struct radeon_winsys_cs *cs, int nr_samples)
+void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
 {
 	switch (nr_samples) {
 	default:
 	case 1:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0);
+		si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
 		break;
 	case 2:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x[0]);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x[1]);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x[2]);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x[3]);
+		si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
 		break;
 	case 4:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x[0]);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x[1]);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x[2]);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x[3]);
+		si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
 		break;
 	case 8:
-		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
-		radeon_emit(cs, sample_locs_8x[0]);
-		radeon_emit(cs, sample_locs_8x[4]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[1]);
-		radeon_emit(cs, sample_locs_8x[5]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[2]);
-		radeon_emit(cs, sample_locs_8x[6]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[3]);
-		radeon_emit(cs, sample_locs_8x[7]);
+		si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
 		break;
 	case 16:
-		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16);
-		radeon_emit(cs, sample_locs_16x[0]);
-		radeon_emit(cs, sample_locs_16x[4]);
-		radeon_emit(cs, sample_locs_16x[8]);
-		radeon_emit(cs, sample_locs_16x[12]);
-		radeon_emit(cs, sample_locs_16x[1]);
-		radeon_emit(cs, sample_locs_16x[5]);
-		radeon_emit(cs, sample_locs_16x[9]);
-		radeon_emit(cs, sample_locs_16x[13]);
-		radeon_emit(cs, sample_locs_16x[2]);
-		radeon_emit(cs, sample_locs_16x[6]);
-		radeon_emit(cs, sample_locs_16x[10]);
-		radeon_emit(cs, sample_locs_16x[14]);
-		radeon_emit(cs, sample_locs_16x[3]);
-		radeon_emit(cs, sample_locs_16x[7]);
-		radeon_emit(cs, sample_locs_16x[11]);
-		radeon_emit(cs, sample_locs_16x[15]);
+		si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
 		break;
 	}
 }
@@ -194,16 +200,16 @@ void si_init_msaa_functions(struct si_context *sctx)
 {
 	int i;
 
-	sctx->b.b.get_sample_position = si_get_sample_position;
+	sctx->b.get_sample_position = si_get_sample_position;
 
-	si_get_sample_position(&sctx->b.b, 1, 0, sctx->sample_locations_1x[0]);
+	si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
 
 	for (i = 0; i < 2; i++)
-		si_get_sample_position(&sctx->b.b, 2, i, sctx->sample_locations_2x[i]);
+		si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
 	for (i = 0; i < 4; i++)
-		si_get_sample_position(&sctx->b.b, 4, i, sctx->sample_locations_4x[i]);
+		si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
 	for (i = 0; i < 8; i++)
-		si_get_sample_position(&sctx->b.b, 8, i, sctx->sample_locations_8x[i]);
+		si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
 	for (i = 0; i < 16; i++)
-		si_get_sample_position(&sctx->b.b, 16, i, sctx->sample_locations_16x[i]);
+		si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
 }
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
index 9971bc815..fd7e843bc 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -19,17 +20,12 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * Authors: Marek Olšák <maraeo@gmail.com>
- *
  */
 
-#include "si_pipe.h"
-#include "si_state.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
 
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 
 static void si_set_streamout_enable(struct si_context *sctx, bool enable);
 
@@ -47,14 +43,14 @@ si_create_so_target(struct pipe_context *ctx,
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_streamout_target *t;
-	struct r600_resource *rbuffer = (struct r600_resource*)buffer;
+	struct r600_resource *rbuffer = r600_resource(buffer);
 
 	t = CALLOC_STRUCT(si_streamout_target);
 	if (!t) {
 		return NULL;
 	}
 
-	u_suballocator_alloc(sctx->b.allocator_zeroed_memory, 4, 4,
+	u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
 			     &t->buf_filled_size_offset,
 			     (struct pipe_resource**)&t->buf_filled_size);
 	if (!t->buf_filled_size) {
@@ -87,7 +83,7 @@ void si_streamout_buffers_dirty(struct si_context *sctx)
 	if (!sctx->streamout.enabled_mask)
 		return;
 
-	si_mark_atom_dirty(sctx, &sctx->streamout.begin_atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
 	si_set_streamout_enable(sctx, true);
 }
 
@@ -120,14 +116,14 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		/* Invalidate the scalar cache in case a streamout buffer is
 		 * going to be used as a constant buffer.
 		 *
-		 * Invalidate TC L1, because streamout bypasses it (done by
-		 * setting GLC=1 in the store instruction), but it can contain
-		 * outdated data of streamout buffers.
+		 * Invalidate vL1, because streamout bypasses it (done by
+		 * setting GLC=1 in the store instruction), but vL1 in other
+		 * CUs can contain outdated data of streamout buffers.
 		 *
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
 		 */
-		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
 				 SI_CONTEXT_INV_VMEM_L1 |
 				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
@@ -136,7 +132,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 	 * start writing to the targets.
 	 */
 	if (num_targets)
-		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* Streamout buffers must be bound in 2 places:
@@ -155,7 +151,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		if (!targets[i])
 			continue;
 
-		r600_context_add_resource_size(ctx, targets[i]->buffer);
+		si_context_add_resource_size(sctx, targets[i]->buffer);
 		enabled_mask |= 1 << i;
 
 		if (offsets[i] == ((unsigned)-1))
@@ -173,7 +169,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 	if (num_targets) {
 		si_streamout_buffers_dirty(sctx);
 	} else {
-		si_set_atom_dirty(sctx, &sctx->streamout.begin_atom, false);
+		si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
 		si_set_streamout_enable(sctx, false);
 	}
 
@@ -204,8 +200,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			/* Set the resource. */
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						buffer);
-			radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
-							    (struct r600_resource*)buffer,
+			radeon_add_to_gfx_buffer_list_check_mem(sctx,
+							    r600_resource(buffer),
 							    buffers->shader_usage,
 							    RADEON_PRIO_SHADER_RW_BUFFER,
 							    true);
@@ -234,11 +230,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 
 static void si_flush_vgt_streamout(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned reg_strmout_cntl;
 
 	/* The register is at different places on different ASICs. */
-	if (sctx->b.chip_class >= CIK) {
+	if (sctx->chip_class >= CIK) {
 		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
 		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
 	} else {
@@ -258,10 +254,9 @@ static void si_flush_vgt_streamout(struct si_context *sctx)
 	radeon_emit(cs, 4); /* poll interval */
 }
 
-static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
+static void si_emit_streamout_begin(struct si_context *sctx)
 {
-	struct si_context *sctx = (struct si_context*)rctx;
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_streamout_target **t = sctx->streamout.targets;
 	uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
 	unsigned i;
@@ -295,7 +290,7 @@ static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r60
 			radeon_emit(cs, va); /* src address lo */
 			radeon_emit(cs, va >> 32); /* src address hi */
 
-			radeon_add_to_buffer_list(&sctx->b,  &sctx->b.gfx,
+			radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 						  t[i]->buf_filled_size,
 						  RADEON_USAGE_READ,
 						  RADEON_PRIO_SO_FILLED_SIZE);
@@ -316,7 +311,7 @@ static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r60
 
 void si_emit_streamout_end(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_streamout_target **t = sctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
@@ -337,7 +332,7 @@ void si_emit_streamout_end(struct si_context *sctx)
 		radeon_emit(cs, 0); /* unused */
 		radeon_emit(cs, 0); /* unused */
 
-		radeon_add_to_buffer_list(&sctx->b,  &sctx->b.gfx,
+		radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
 					  t[i]->buf_filled_size,
 					  RADEON_USAGE_WRITE,
 					  RADEON_PRIO_SO_FILLED_SIZE);
@@ -352,7 +347,6 @@ void si_emit_streamout_end(struct si_context *sctx)
 	}
 
 	sctx->streamout.begin_emitted = false;
-	sctx->b.flags |= R600_CONTEXT_STREAMOUT_FLUSH;
 }
 
 /* STREAMOUT CONFIG DERIVED STATE
@@ -362,19 +356,16 @@ void si_emit_streamout_end(struct si_context *sctx)
  * are no buffers bound.
  */
 
-static void si_emit_streamout_enable(struct r600_common_context *rctx,
-				     struct r600_atom *atom)
+static void si_emit_streamout_enable(struct si_context *sctx)
 {
-	struct si_context *sctx = (struct si_context*)rctx;
-
-	radeon_set_context_reg_seq(sctx->b.gfx.cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
-	radeon_emit(sctx->b.gfx.cs,
+	radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
+	radeon_emit(sctx->gfx_cs,
 		    S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
 		    S_028B94_RAST_STREAM(0) |
 		    S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
 		    S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
 		    S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
-	radeon_emit(sctx->b.gfx.cs,
+	radeon_emit(sctx->gfx_cs,
 		    sctx->streamout.hw_enabled_mask &
 		    sctx->streamout.enabled_stream_buffers_mask);
 }
@@ -393,7 +384,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 
 	if ((old_strmout_en != si_get_strmout_en(sctx)) ||
             (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
-		si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 }
 
 void si_update_prims_generated_query_state(struct si_context *sctx,
@@ -409,15 +400,15 @@ void si_update_prims_generated_query_state(struct si_context *sctx,
 			sctx->streamout.num_prims_gen_queries != 0;
 
 		if (old_strmout_en != si_get_strmout_en(sctx))
-			si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 	}
 }
 
 void si_init_streamout_functions(struct si_context *sctx)
 {
-	sctx->b.b.create_stream_output_target = si_create_so_target;
-	sctx->b.b.stream_output_target_destroy = si_so_target_destroy;
-	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
-	sctx->streamout.begin_atom.emit = si_emit_streamout_begin;
-	sctx->streamout.enable_atom.emit = si_emit_streamout_enable;
+	sctx->b.create_stream_output_target = si_create_so_target;
+	sctx->b.stream_output_target_destroy = si_so_target_destroy;
+	sctx->b.set_stream_output_targets = si_set_streamout_targets;
+	sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+	sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
 }
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
index f41655847..76c56447e 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2012 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -21,9 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_pipe.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
 #include "util/u_viewport.h"
 #include "tgsi/tgsi_scan.h"
 
@@ -45,7 +44,7 @@ static void si_set_scissor_states(struct pipe_context *pctx,
 		return;
 
 	ctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
 /* Since the guard band disables clipping, we have to clip per-pixel
@@ -108,10 +107,11 @@ static void si_scissor_make_union(struct si_signed_scissor *out,
 	out->miny = MIN2(out->miny, in->miny);
 	out->maxx = MAX2(out->maxx, in->maxx);
 	out->maxy = MAX2(out->maxy, in->maxy);
+	out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
 }
 
 static void si_emit_one_scissor(struct si_context *ctx,
-				struct radeon_winsys_cs *cs,
+				struct radeon_cmdbuf *cs,
 				struct si_signed_scissor *vp_scissor,
 				struct pipe_scissor_state *scissor)
 {
@@ -127,6 +127,18 @@ static void si_emit_one_scissor(struct si_context *ctx,
 	if (scissor)
 		si_clip_scissor(&final, scissor);
 
+	/* Workaround for a hw bug on SI that occurs when PA_SU_HARDWARE_-
+	 * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+	 */
+	if (ctx->chip_class == SI && (final.maxx == 0 || final.maxy == 0)) {
+		radeon_emit(cs, S_028250_TL_X(1) |
+				S_028250_TL_Y(1) |
+				S_028250_WINDOW_OFFSET_DISABLE(1));
+		radeon_emit(cs, S_028254_BR_X(1) |
+				S_028254_BR_Y(1));
+		return;
+	}
+
 	radeon_emit(cs, S_028250_TL_X(final.minx) |
 			S_028250_TL_Y(final.miny) |
 			S_028250_WINDOW_OFFSET_DISABLE(1));
@@ -134,27 +146,67 @@ static void si_emit_one_scissor(struct si_context *ctx,
 			S_028254_BR_Y(final.maxy));
 }
 
-/* the range is [-MAX, MAX] */
-#define GET_MAX_VIEWPORT_RANGE(rctx) (32768)
-
-static void si_emit_guardband(struct si_context *ctx,
-			      struct si_signed_scissor *vp_as_scissor)
+static void si_emit_guardband(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
+	struct si_signed_scissor vp_as_scissor;
 	struct pipe_viewport_state vp;
 	float left, top, right, bottom, max_range, guardband_x, guardband_y;
 	float discard_x, discard_y;
 
+	if (ctx->vs_writes_viewport_index) {
+		/* Shaders can draw to any viewport. Make a union of all
+		 * viewports. */
+		vp_as_scissor = ctx->viewports.as_scissor[0];
+		for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
+			si_scissor_make_union(&vp_as_scissor,
+					      &ctx->viewports.as_scissor[i]);
+		}
+	} else {
+		vp_as_scissor = ctx->viewports.as_scissor[0];
+	}
+
+	/* Blits don't set the viewport state. The vertex shader determines
+	 * the viewport size by scaling the coordinates, so we don't know
+	 * how large the viewport is. Assume the worst case.
+	 */
+	if (ctx->vs_disables_clipping_viewport)
+		vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+
+	/* Determine the optimal hardware screen offset to center the viewport
+	 * within the viewport range in order to maximize the guardband size.
+	 */
+	int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
+	int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
+
+	const unsigned hw_screen_offset_max = 8176;
+	/* SI-CI need to align the offset to an ubertile consisting of all SEs. */
+	const unsigned hw_screen_offset_alignment =
+		ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+
+	hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, hw_screen_offset_max);
+	hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, hw_screen_offset_max);
+
+	/* Align the screen offset by dropping the low bits. */
+	hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
+	hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
+
+	/* Apply the offset to center the viewport and maximize the guardband. */
+	vp_as_scissor.minx -= hw_screen_offset_x;
+	vp_as_scissor.maxx -= hw_screen_offset_x;
+	vp_as_scissor.miny -= hw_screen_offset_y;
+	vp_as_scissor.maxy -= hw_screen_offset_y;
+
 	/* Reconstruct the viewport transformation from the scissor. */
-	vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
-	vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0;
-	vp.scale[0] = vp_as_scissor->maxx - vp.translate[0];
-	vp.scale[1] = vp_as_scissor->maxy - vp.translate[1];
+	vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
+	vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
+	vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
+	vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
 
 	/* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
-	if (vp_as_scissor->minx == vp_as_scissor->maxx)
+	if (vp_as_scissor.minx == vp_as_scissor.maxx)
 		vp.scale[0] = 0.5;
-	if (vp_as_scissor->miny == vp_as_scissor->maxy)
+	if (vp_as_scissor.miny == vp_as_scissor.maxy)
 		vp.scale[1] = 0.5;
 
 	/* Find the biggest guard band that is inside the supported viewport
@@ -164,9 +216,11 @@ static void si_emit_guardband(struct si_context *ctx,
 	 * This is done by applying the inverse viewport transformation
 	 * on the viewport limits to get those limits in clip space.
 	 *
-	 * Use a limit one pixel smaller to allow for some precision error.
+	 * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
 	 */
-	max_range = GET_MAX_VIEWPORT_RANGE(ctx) - 1;
+	static const unsigned max_viewport_size[] = {65535, 16383, 4095}; /* read-only table indexed by quant_mode; presumably ordered 16.8, 14.10, 12.12 - confirm against the SI_QUANT_MODE_* enum */
+	assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); /* guard the table lookup below */
+	max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; /* integer halving before the conversion to float */
 	left   = (-max_range - vp.translate[0]) / vp.scale[0];
 	right  = ( max_range - vp.translate[0]) / vp.scale[0];
 	top    = (-max_range - vp.translate[1]) / vp.scale[1];
@@ -180,11 +234,9 @@ static void si_emit_guardband(struct si_context *ctx,
 	discard_x = 1.0;
 	discard_y = 1.0;
 
-	if (unlikely(ctx->current_rast_prim < PIPE_PRIM_TRIANGLES) &&
-	    ctx->queued.named.rasterizer) {
+	if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
 		/* When rendering wide points or lines, we need to be more
 		 * conservative about when to discard them entirely. */
-		const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
 		float pixels;
 
 		if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
@@ -202,27 +254,34 @@ static void si_emit_guardband(struct si_context *ctx,
 		discard_y = MIN2(discard_y, guardband_y);
 	}
 
-	/* If any of the GB registers is updated, all of them must be updated. */
-	radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
-
-	radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
-	radeon_emit(cs, fui(discard_y));   /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
-	radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
-	radeon_emit(cs, fui(discard_x));   /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+	/* If any of the GB registers is updated, all of them must be updated.
+	 * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+	 * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+	 */
+	unsigned initial_cdw = ctx->gfx_cs->current.cdw;
+	radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+				    SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
+				    fui(guardband_y), fui(discard_y),
+				    fui(guardband_x), fui(discard_x));
+	radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+				   SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+				   S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
+				   S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
+	radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
+				   SI_TRACKED_PA_SU_VTX_CNTL,
+				   S_028BE4_PIX_CENTER(rs->half_pixel_center) |
+				   S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
+						       vp_as_scissor.quant_mode));
+	if (initial_cdw != ctx->gfx_cs->current.cdw)
+		ctx->context_roll_counter++;
 }
 
-static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom)
+static void si_emit_scissors(struct si_context *ctx)
 {
-	struct si_context *ctx = (struct si_context *)rctx;
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_scissor_state *states = ctx->scissors.states;
 	unsigned mask = ctx->scissors.dirty_mask;
-	bool scissor_enabled = false;
-	struct si_signed_scissor max_vp_scissor;
-	int i;
-
-	if (ctx->queued.named.rasterizer)
-		scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
+	bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
 
 	/* The simple case: Only 1 viewport is active. */
 	if (!ctx->vs_writes_viewport_index) {
@@ -233,17 +292,10 @@ static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom
 
 		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
 		si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
-		si_emit_guardband(ctx, vp);
 		ctx->scissors.dirty_mask &= ~1; /* clear one bit */
 		return;
 	}
 
-	/* Shaders can draw to any viewport. Make a union of all viewports. */
-	max_vp_scissor = ctx->viewports.as_scissor[0];
-	for (i = 1; i < SI_MAX_VIEWPORTS; i++)
-		si_scissor_make_union(&max_vp_scissor,
-				      &ctx->viewports.as_scissor[i]);
-
 	while (mask) {
 		int start, count, i;
 
@@ -256,7 +308,6 @@ static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom
 					    scissor_enabled ? &states[i] : NULL);
 		}
 	}
-	si_emit_guardband(ctx, &max_vp_scissor);
 	ctx->scissors.dirty_mask = 0;
 }
 
@@ -271,24 +322,48 @@ static void si_set_viewport_states(struct pipe_context *pctx,
 
 	for (i = 0; i < num_viewports; i++) {
 		unsigned index = start_slot + i;
+		struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
 
 		ctx->viewports.states[index] = state[i];
-		si_get_scissor_from_viewport(ctx, &state[i],
-					     &ctx->viewports.as_scissor[index]);
+
+		si_get_scissor_from_viewport(ctx, &state[i], scissor);
+
+		unsigned w = scissor->maxx - scissor->minx;
+		unsigned h = scissor->maxy - scissor->miny;
+		unsigned max_extent = MAX2(w, h);
+
+		/* Determine the best quantization mode (subpixel precision),
+		 * but also leave enough space for the guardband.
+		 *
+		 * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
+		 * and Raven1. What we do depends on the chip:
+		 * - Vega10: Never use primitive binning.
+		 * - Raven1: Always use QUANT_MODE == 16_8.
+		 */
+		if (ctx->family == CHIP_RAVEN)
+			max_extent = 16384; /* Use QUANT_MODE == 16_8. */
+
+		if (max_extent <= 1024) /* 4K scanline area for guardband */
+			scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
+		else if (max_extent <= 4096) /* 16K scanline area for guardband */
+			scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
+		else /* 64K scanline area for guardband */
+			scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
 	}
 
 	mask = ((1 << num_viewports) - 1) << start_slot;
 	ctx->viewports.dirty_mask |= mask;
 	ctx->viewports.depth_range_dirty_mask |= mask;
 	ctx->scissors.dirty_mask |= mask;
-	si_mark_atom_dirty(ctx, &ctx->viewports.atom);
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
 static void si_emit_one_viewport(struct si_context *ctx,
 				 struct pipe_viewport_state *state)
 {
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	radeon_emit(cs, fui(state->scale[0]));
 	radeon_emit(cs, fui(state->translate[0]));
@@ -300,7 +375,7 @@ static void si_emit_one_viewport(struct si_context *ctx,
 
 static void si_emit_viewports(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_viewport_state *states = ctx->viewports.states;
 	unsigned mask = ctx->viewports.dirty_mask;
 
@@ -342,16 +417,13 @@ si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
 
 static void si_emit_depth_ranges(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_viewport_state *states = ctx->viewports.states;
 	unsigned mask = ctx->viewports.depth_range_dirty_mask;
-	bool clip_halfz = false;
+	bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
 	bool window_space = ctx->vs_disables_clipping_viewport;
 	float zmin, zmax;
 
-	if (ctx->queued.named.rasterizer)
-		clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
-
 	/* The simple case: Only 1 viewport is active. */
 	if (!ctx->vs_writes_viewport_index) {
 		if (!(mask & 1))
@@ -384,10 +456,8 @@ static void si_emit_depth_ranges(struct si_context *ctx)
 	ctx->viewports.depth_range_dirty_mask = 0;
 }
 
-static void si_emit_viewport_states(struct r600_common_context *rctx,
-				    struct r600_atom *atom)
+static void si_emit_viewport_states(struct si_context *ctx)
 {
-	struct si_context *ctx = (struct si_context *)rctx;
 	si_emit_viewports(ctx);
 	si_emit_depth_ranges(ctx);
 }
@@ -418,28 +488,118 @@ void si_update_vs_viewport_state(struct si_context *ctx)
 		ctx->vs_disables_clipping_viewport = vs_window_space;
 		ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
 		ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(ctx, &ctx->scissors.atom);
-		si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 	}
 
 	/* Viewport index handling. */
+	if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
+		return;
+
+	/* This changes how the guardband is computed. */
 	ctx->vs_writes_viewport_index = info->writes_viewport_index;
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+
 	if (!ctx->vs_writes_viewport_index)
 		return;
 
 	if (ctx->scissors.dirty_mask)
-	    si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	    si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 
 	if (ctx->viewports.dirty_mask ||
 	    ctx->viewports.depth_range_dirty_mask)
-	    si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+	    si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+}
+
+static void si_emit_window_rectangles(struct si_context *sctx)
+{ /* Emit PA_SC_CLIPRECT_RULE and the coordinates of the active window rectangles. */
+	/* There are four clipping rectangles. Their corner coordinates are inclusive.
+	 * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+	 * on whether the pixel is inside cliprects 0-3, respectively. For example,
+	 * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+	 * the number 3 (binary 0011).
+	 *
+	 * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
+	 */
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+	static const unsigned outside[4] = { /* exclusive-mode rule, indexed by num_rectangles - 1 */
+		/* outside rectangle 0 */
+		V_02820C_OUT |
+		V_02820C_IN_1 |
+		V_02820C_IN_2 |
+		V_02820C_IN_21 |
+		V_02820C_IN_3 |
+		V_02820C_IN_31 |
+		V_02820C_IN_32 |
+		V_02820C_IN_321,
+		/* outside rectangles 0, 1 */
+		V_02820C_OUT |
+		V_02820C_IN_2 |
+		V_02820C_IN_3 |
+		V_02820C_IN_32,
+		/* outside rectangles 0, 1, 2 */
+		V_02820C_OUT |
+		V_02820C_IN_3,
+		/* outside rectangles 0, 1, 2, 3 */
+		V_02820C_OUT,
+	};
+	const unsigned disabled = 0xffff; /* all inside and outside cases */
+	unsigned num_rectangles = sctx->num_window_rectangles;
+	struct pipe_scissor_state *rects = sctx->window_rectangles;
+	unsigned rule;
+
+	assert(num_rectangles <= 4); /* outside[] only has entries for 1..4 rectangles */
+
+	if (num_rectangles == 0)
+		rule = disabled;
+	else if (sctx->window_rectangles_include)
+		rule = ~outside[num_rectangles - 1]; /* inclusive mode: bitwise complement of the exclusive rule */
+	else
+		rule = outside[num_rectangles - 1];
+
+	radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE,
+				   SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
+	if (num_rectangles == 0)
+		return; /* no rectangle coordinates to program */
+
+	radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL,
+				   num_rectangles * 2); /* one TL and one BR dword per rectangle */
+	for (unsigned i = 0; i < num_rectangles; i++) {
+		radeon_emit(cs, S_028210_TL_X(rects[i].minx) |
+				S_028210_TL_Y(rects[i].miny));
+		radeon_emit(cs, S_028214_BR_X(rects[i].maxx) |
+				S_028214_BR_Y(rects[i].maxy));
+	}
+}
+
+static void si_set_window_rectangles(struct pipe_context *ctx,
+				     boolean include,
+				     unsigned num_rectangles,
+				     const struct pipe_scissor_state *rects)
+{ /* Record the window rectangle state in the context and mark its atom dirty. */
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	sctx->num_window_rectangles = num_rectangles; /* NOTE(review): not range-checked here; si_emit_window_rectangles asserts <= 4 */
+	sctx->window_rectangles_include = include;
+	if (num_rectangles) { /* rects is not read when there are no rectangles */
+		memcpy(sctx->window_rectangles, rects,
+		       sizeof(*rects) * num_rectangles);
+	}
+
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
+}
 
 void si_init_viewport_functions(struct si_context *ctx)
 {
-	ctx->scissors.atom.emit = si_emit_scissors;
-	ctx->viewports.atom.emit = si_emit_viewport_states;
+	ctx->atoms.s.guardband.emit = si_emit_guardband; /* state-atom emit callbacks */
+	ctx->atoms.s.scissors.emit = si_emit_scissors;
+	ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+	ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+
+	ctx->b.set_scissor_states = si_set_scissor_states; /* pipe_context entry points */
+	ctx->b.set_viewport_states = si_set_viewport_states;
+	ctx->b.set_window_rectangles = si_set_window_rectangles;
 
-	ctx->b.b.set_scissor_states = si_set_scissor_states;
-	ctx->b.b.set_viewport_states = si_set_viewport_states;
+	for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) /* same bound as the other as_scissor loops in this file */
+		ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
 }
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c
new file mode 100644
index 000000000..90a2032cd
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file implements randomized SDMA texture blit tests. */
+
+#include "si_pipe.h"
+#include "util/u_surface.h"
+#include "util/rand_xor.h"
+
+static uint64_t seed_xorshift128plus[2];
+
+#define RAND_NUM_SIZE 8
+
+/* The GPU blits are emulated on the CPU using these CPU textures. */
+
+struct cpu_texture {
+	uint8_t *ptr; /* pixel storage, malloc'ed by alloc_cpu_texture */
+	uint64_t size; /* total allocation in bytes: layer_stride * array_size */
+	uint64_t layer_stride; /* bytes per array layer: stride * height0 */
+	unsigned stride; /* bytes per row: width0 * bpp passed through align(..., RAND_NUM_SIZE) */
+};
+
+static void alloc_cpu_texture(struct cpu_texture *tex,
+			      struct pipe_resource *templ, int bpp)
+{ /* Compute the layout of the CPU copy of "templ" and malloc its storage. */
+	tex->stride = align(templ->width0 * bpp, RAND_NUM_SIZE); /* row pitch in bytes */
+	tex->layer_stride = (uint64_t)tex->stride * templ->height0; /* widened to 64 bits before multiplying */
+	tex->size = tex->layer_stride * templ->array_size;
+	tex->ptr = malloc(tex->size); /* contents are left uninitialized here */
+	assert(tex->ptr); /* allocation failure is only caught by this assert */
+}
+
+static void set_random_pixels(struct pipe_context *ctx,
+			      struct pipe_resource *tex,
+			      struct cpu_texture *cpu)
+{ /* Fill the mapped texture "tex" and its CPU copy "cpu" with the same random data. */
+	struct pipe_transfer *t;
+	uint8_t *map;
+	int x,y,z;
+
+	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
+				   0, 0, 0, tex->width0, tex->height0,
+				   tex->array_size, &t); /* box covers width0 x height0 x array_size */
+	assert(map);
+
+	for (z = 0; z < tex->array_size; z++) {
+		for (y = 0; y < tex->height0; y++) {
+			uint64_t *ptr = (uint64_t*)
+				(map + t->layer_stride*z + t->stride*y); /* row start in the mapping */
+			uint64_t *ptr_cpu = (uint64_t*)
+				(cpu->ptr + cpu->layer_stride*z + cpu->stride*y); /* row start in the CPU copy */
+			unsigned size = cpu->stride / RAND_NUM_SIZE; /* 64-bit words per CPU row */
+
+			assert(t->stride % RAND_NUM_SIZE == 0); /* rows are written in whole 8-byte words */
+			assert(cpu->stride % RAND_NUM_SIZE == 0);
+
+			for (x = 0; x < size; x++) {
+				*ptr++ = *ptr_cpu++ =
+					rand_xorshift128plus(seed_xorshift128plus); /* same value goes to both copies */
+			}
+		}
+	}
+
+	pipe_transfer_unmap(ctx, t);
+}
+
+static bool compare_textures(struct pipe_context *ctx,
+			     struct pipe_resource *tex,
+			     struct cpu_texture *cpu, int bpp)
+{ /* Return true when every row of the mapped texture matches the CPU copy. */
+	struct pipe_transfer *t;
+	uint8_t *map;
+	int y,z;
+	bool pass = true;
+
+	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ,
+				   0, 0, 0, tex->width0, tex->height0,
+				   tex->array_size, &t); /* box covers width0 x height0 x array_size */
+	assert(map);
+
+	for (z = 0; z < tex->array_size; z++) {
+		for (y = 0; y < tex->height0; y++) {
+			uint8_t *ptr = map + t->layer_stride*z + t->stride*y;
+			uint8_t *cpu_ptr = cpu->ptr +
+					   cpu->layer_stride*z + cpu->stride*y;
+
+			if (memcmp(ptr, cpu_ptr, tex->width0 * bpp)) { /* only width0 * bpp bytes per row are compared */
+				pass = false;
+				goto done; /* stop at the first mismatching row */
+			}
+		}
+	}
+done:
+	pipe_transfer_unmap(ctx, t); /* unmapped on both the pass and the fail path */
+	return pass;
+}
+
+static enum pipe_format get_format_from_bpp(int bpp)
+{ /* Pick a UINT pipe format for a pixel size of 1, 2, 4, 8 or 16. */
+	switch (bpp) {
+	case 1:
+		return PIPE_FORMAT_R8_UINT;
+	case 2:
+		return PIPE_FORMAT_R16_UINT;
+	case 4:
+		return PIPE_FORMAT_R32_UINT;
+	case 8:
+		return PIPE_FORMAT_R32G32_UINT;
+	case 16:
+		return PIPE_FORMAT_R32G32B32A32_UINT;
+	default:
+		assert(0); /* any other size is unexpected */
+		return PIPE_FORMAT_NONE; /* returned when the assert does not abort */
+	}
+}
+
+static const char *array_mode_to_string(struct si_screen *sscreen,
+					struct radeon_surf *surf)
+{ /* Return a string literal naming the surface's swizzle (GFX9+) or legacy array mode. */
+	if (sscreen->info.chip_class >= GFX9) {
+		switch (surf->u.gfx9.surf.swizzle_mode) {
+		case 0:
+			return "  LINEAR";
+		case 21:
+			return " 4KB_S_X";
+		case 22:
+			return " 4KB_D_X";
+		case 25:
+			return "64KB_S_X";
+		case 26:
+			return "64KB_D_X";
+		default:
+			printf("Unhandled swizzle mode = %u\n",
+			       surf->u.gfx9.surf.swizzle_mode); /* reported, not asserted, unlike the legacy branch */
+			return " UNKNOWN";
+		}
+	} else {
+		switch (surf->u.legacy.level[0].mode) { /* mode of level 0 only */
+		case RADEON_SURF_MODE_LINEAR_ALIGNED:
+			return "LINEAR_ALIGNED";
+		case RADEON_SURF_MODE_1D:
+			return "1D_TILED_THIN1";
+		case RADEON_SURF_MODE_2D:
+			return "2D_TILED_THIN1";
+		default:
+			assert(0);
+			return "       UNKNOWN";
+		}
+	}
+}
+
+static unsigned generate_max_tex_side(unsigned max_tex_side)
+{ /* Randomly pick max_tex_side, 128 or 2048 as the upper bound for a texture side. */
+	switch (rand() % 4) {
+	case 0:
+		/* Try to hit large sizes in 1/4 of the cases. */
+		return max_tex_side;
+	case 1:
+		/* Try to hit 1D tiling in 1/4 of the cases. */
+		return 128;
+	default:
+		/* Try to hit common sizes in 2/4 of the cases. */
+		return 2048;
+	}
+}
+
+void si_test_dma(struct si_screen *sscreen)
+{ /* Run randomized dma_copy tests, checking each result against a CPU-side copy; ends with exit(0). */
+	struct pipe_screen *screen = &sscreen->b;
+	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+	struct si_context *sctx = (struct si_context*)ctx;
+	uint64_t max_alloc_size;
+	unsigned i, iterations, num_partial_copies, max_levels, max_tex_side;
+	unsigned num_pass = 0, num_fail = 0;
+
+	max_levels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+	max_tex_side = 1 << (max_levels - 1); /* side length derived from the level count */
+
+	/* Max 128 MB allowed for both textures. */
+	max_alloc_size = 128 * 1024 * 1024;
+
+	/* the seed for random test parameters */
+	srand(0x9b47d95b);
+	/* the seed for random pixel data */
+	s_rand_xorshift128plus(seed_xorshift128plus, false);
+
+	iterations = 1000000000; /* just kill it when you are bored */
+	num_partial_copies = 30;
+
+	/* These parameters are randomly generated per test:
+	 * - whether to do one whole-surface copy or N partial copies per test
+	 * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
+	 * - which texture dimensions to use
+	 * - whether to use VRAM (all tiling modes) and GTT (staging, linear
+	 *   only) allocations
+	 * - random initial pixels in src
+	 * - generate random subrectangle copies for partial blits
+	 */
+	for (i = 0; i < iterations; i++) {
+		struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
+		struct si_texture *sdst;
+		struct si_texture *ssrc;
+		struct cpu_texture src_cpu, dst_cpu;
+		unsigned bpp, max_width, max_height, max_depth, j, num;
+		unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen;
+		unsigned max_tex_layers;
+		bool pass;
+		bool do_partial_copies = rand() & 1;
+
+		/* generate a random test case */
+		tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
+		tsrc.depth0 = tdst.depth0 = 1;
+
+		bpp = 1 << (rand() % 5); /* one of 1, 2, 4, 8, 16 */
+		tsrc.format = tdst.format = get_format_from_bpp(bpp);
+
+		max_tex_side_gen = generate_max_tex_side(max_tex_side);
+		max_tex_layers = rand() % 4 ? 1 : 5; /* up to 5 layers in 1/4 of the cases */
+
+		tsrc.width0 = (rand() % max_tex_side_gen) + 1;
+		tsrc.height0 = (rand() % max_tex_side_gen) + 1;
+		tsrc.array_size = (rand() % max_tex_layers) + 1;
+
+		/* Have a 1/4 chance of getting power-of-two dimensions. */
+		if (rand() % 4 == 0) {
+			tsrc.width0 = util_next_power_of_two(tsrc.width0);
+			tsrc.height0 = util_next_power_of_two(tsrc.height0);
+		}
+
+		if (!do_partial_copies) {
+			/* whole-surface copies only, same dimensions */
+			tdst = tsrc;
+		} else {
+			max_tex_side_gen = generate_max_tex_side(max_tex_side);
+			max_tex_layers = rand() % 4 ? 1 : 5;
+
+			/* many partial copies, dimensions can be different */
+			tdst.width0 = (rand() % max_tex_side_gen) + 1;
+			tdst.height0 = (rand() % max_tex_side_gen) + 1;
+			tdst.array_size = (rand() % max_tex_layers) + 1;
+
+			/* Have a 1/4 chance of getting power-of-two dimensions. */
+			if (rand() % 4 == 0) {
+				tdst.width0 = util_next_power_of_two(tdst.width0);
+				tdst.height0 = util_next_power_of_two(tdst.height0);
+			}
+		}
+
+		/* check texture sizes */
+		if ((uint64_t)tsrc.width0 * tsrc.height0 * tsrc.array_size * bpp +
+		    (uint64_t)tdst.width0 * tdst.height0 * tdst.array_size * bpp >
+		    max_alloc_size) {
+			/* too large, try again */
+			i--; /* cancels the loop increment so the retry keeps the same test number */
+			continue;
+		}
+
+		/* VRAM + the tiling mode depends on dimensions (3/4 of cases),
+		 * or GTT + linear only (1/4 of cases)
+		 */
+		tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+		tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+
+		/* Allocate textures (both the GPU and CPU copies).
+		 * The CPU will emulate what the GPU should be doing.
+		 */
+		src = screen->resource_create(screen, &tsrc);
+		dst = screen->resource_create(screen, &tdst);
+		assert(src);
+		assert(dst);
+		sdst = (struct si_texture*)dst;
+		ssrc = (struct si_texture*)src;
+		alloc_cpu_texture(&src_cpu, &tsrc, bpp);
+		alloc_cpu_texture(&dst_cpu, &tdst, bpp);
+
+		printf("%4u: dst = (%5u x %5u x %u, %s), "
+		       " src = (%5u x %5u x %u, %s), bpp = %2u, ",
+		       i, tdst.width0, tdst.height0, tdst.array_size,
+		       array_mode_to_string(sscreen, &sdst->surface),
+		       tsrc.width0, tsrc.height0, tsrc.array_size,
+		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
+		fflush(stdout); /* show the test parameters before the copies run */
+
+		/* set src pixels */
+		set_random_pixels(ctx, src, &src_cpu);
+
+		/* clear dst pixels */
+		uint32_t zero = 0;
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
+		                SI_COHERENCY_SHADER);
+		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* keep the CPU copy in step with the cleared dst */
+
+		/* preparation */
+		max_width = MIN2(tsrc.width0, tdst.width0);
+		max_height = MIN2(tsrc.height0, tdst.height0);
+		max_depth = MIN2(tsrc.array_size, tdst.array_size);
+
+		num = do_partial_copies ? num_partial_copies : 1;
+		for (j = 0; j < num; j++) {
+			int width, height, depth;
+			int srcx, srcy, srcz, dstx, dsty, dstz;
+			struct pipe_box box;
+			unsigned old_num_draw_calls = sctx->num_draw_calls; /* snapshots used to see which counter moved */
+			unsigned old_num_dma_calls = sctx->num_dma_calls;
+
+			if (!do_partial_copies) {
+				/* copy whole src to dst */
+				width = max_width;
+				height = max_height;
+				depth = max_depth;
+
+				srcx = srcy = srcz = dstx = dsty = dstz = 0;
+			} else {
+				/* random sub-rectangle copies from src to dst */
+				depth = (rand() % max_depth) + 1;
+				srcz = rand() % (tsrc.array_size - depth + 1);
+				dstz = rand() % (tdst.array_size - depth + 1);
+
+				/* special code path to hit the tiled partial copies */
+				if (!ssrc->surface.is_linear &&
+				    !sdst->surface.is_linear &&
+				    rand() & 1) {
+					if (max_width < 8 || max_height < 8)
+						continue; /* skip this copy; max_width / 8 or max_height / 8 would be 0 below */
+					width = ((rand() % (max_width / 8)) + 1) * 8;
+					height = ((rand() % (max_height / 8)) + 1) * 8;
+
+					srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; /* % binds tighter than &: result rounded down to a multiple of 8 */
+					srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
+
+					dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
+					dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
+				} else {
+					/* just make sure that it doesn't divide by zero */
+					assert(max_width > 0 && max_height > 0);
+
+					width = (rand() % max_width) + 1;
+					height = (rand() % max_height) + 1;
+
+					srcx = rand() % (tsrc.width0 - width + 1);
+					srcy = rand() % (tsrc.height0 - height + 1);
+
+					dstx = rand() % (tdst.width0 - width + 1);
+					dsty = rand() % (tdst.height0 - height + 1);
+				}
+
+				/* special code path to hit out-of-bounds reads in L2T */
+				if (ssrc->surface.is_linear &&
+				    !sdst->surface.is_linear &&
+				    rand() % 4 == 0) {
+					srcx = 0;
+					srcy = 0;
+					srcz = 0;
+				}
+			}
+
+			/* GPU copy */
+			u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
+			sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
+
+			/* See which engine was used. */
+			gfx_blits += sctx->num_draw_calls > old_num_draw_calls; /* adds 0 or 1 */
+			dma_blits += sctx->num_dma_calls > old_num_dma_calls;
+
+			/* CPU copy */
+			util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride,
+				      dst_cpu.layer_stride,
+				      dstx, dsty, dstz, width, height, depth,
+				      src_cpu.ptr, src_cpu.stride,
+				      src_cpu.layer_stride,
+				      srcx, srcy, srcz);
+		}
+
+		pass = compare_textures(ctx, dst, &dst_cpu, bpp);
+		if (pass)
+			num_pass++;
+		else
+			num_fail++;
+
+		printf("BLITs: GFX = %2u, DMA = %2u, %s [%u/%u]\n",
+		       gfx_blits, dma_blits, pass ? "pass" : "fail",
+		       num_pass, num_pass+num_fail);
+
+		/* cleanup */
+		pipe_resource_reference(&src, NULL);
+		pipe_resource_reference(&dst, NULL);
+		free(src_cpu.ptr);
+		free(dst_cpu.ptr);
+	}
+
+	ctx->destroy(ctx);
+	exit(0); /* terminates the process; this function does not return */
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c
new file mode 100644
index 000000000..6c04720e9
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file benchmarks buffer clears and copies done via CP DMA, SDMA and compute shaders. */
+
+#include "si_pipe.h"
+#include "si_query.h"
+
+#define MIN_SIZE	512
+#define MAX_SIZE	(128 * 1024 * 1024)
+#define SIZE_SHIFT	1
+#define NUM_RUNS	128
+
+static double get_MBps_rate(unsigned num_bytes, unsigned ns)
+{
+	return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+}
+
+void si_test_dma_perf(struct si_screen *sscreen)
+{
+	struct pipe_screen *screen = &sscreen->b;
+	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+	struct si_context *sctx = (struct si_context*)ctx;
+	const uint32_t clear_value = 0x12345678;
+	static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+	static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+
+#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
+#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+	static const char *method_str[] = {
+		"CP MC   ",
+		"CP L2   ",
+		"CP L2   ",
+		"SDMA    ",
+	};
+	static const char *placement_str[] = {
+		/* Clear */
+		"fill->VRAM",
+		"fill->GTT ",
+		/* Copy */
+		"VRAM->VRAM",
+		"VRAM->GTT ",
+		"GTT ->VRAM",
+	};
+
+	printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+	printf("Heap       ,Method  ,L2p,Wa,");
+	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+		if (size >= 1024)
+			printf("%6uKB,", size / 1024);
+		else
+			printf(" %6uB,", size);
+	}
+	printf("\n");
+
+	/* results[log2(size)][placement][method] */
+	struct si_result {
+		bool is_valid;
+		bool is_cp;
+		bool is_sdma;
+		bool is_cs;
+		unsigned cache_policy;
+		unsigned dwords_per_thread;
+		unsigned waves_per_sh;
+		unsigned score;
+		unsigned index; /* index in results[x][y][index] */
+	} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+	/* Run benchmarks. */
+	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+		bool is_copy = placement >= 2;
+
+		printf("-----------,--------,---,--,");
+		for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+			printf("--------,");
+		printf("\n");
+
+		for (unsigned method = 0; method < NUM_METHODS; method++) {
+			bool test_cp = method <= 2;
+			bool test_sdma = method == 3;
+			bool test_cs = method >= 4;
+			unsigned cs_method = method - 4;
+			STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
+			unsigned cs_waves_per_sh =
+				test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
+			cs_method %= 2*NUM_SHADERS;
+			unsigned cache_policy = test_cp ? method % 3 :
+						test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
+			unsigned cs_dwords_per_thread =
+				test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+			if (sctx->chip_class == SI) {
+				/* SI doesn't support CP DMA operations through L2. */
+				if (test_cp && cache_policy != L2_BYPASS)
+					continue;
+				/* WAVES_PER_SH is in multiples of 16 on SI. */
+				if (test_cs && cs_waves_per_sh % 16 != 0)
+					continue;
+			}
+
+			printf("%s ,", placement_str[placement]);
+			if (test_cs) {
+				printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+				       cache_policy == L2_LRU ? "LRU" :
+				       cache_policy == L2_STREAM ? "Str" : "");
+			} else {
+				printf("%s,%3s,", method_str[method],
+				       method == L2_LRU ? "LRU" :
+				       method == L2_STREAM ? "Str" : "");
+			}
+			if (test_cs && cs_waves_per_sh)
+				printf("%2u,", cs_waves_per_sh);
+			else
+				printf("  ,");
+
+			double score = 0;
+			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+				/* Don't test bigger sizes if it's too slow. Print 0. */
+				if (size >= 512*1024 &&
+				    score < 400 * (size / (4*1024*1024))) {
+					printf("%7.0f ,", 0.0);
+					continue;
+				}
+
+				enum pipe_resource_usage dst_usage, src_usage;
+				struct pipe_resource *dst, *src;
+				struct pipe_query *q[NUM_RUNS];
+				unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+				if (test_sdma) {
+					if (sctx->chip_class == SI)
+						query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+					else
+						query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+				}
+
+				if (placement == 0 || placement == 2 || placement == 4)
+					dst_usage = PIPE_USAGE_DEFAULT;
+				else
+					dst_usage = PIPE_USAGE_STREAM;
+
+				if (placement == 2 || placement == 3)
+					src_usage = PIPE_USAGE_DEFAULT;
+				else
+					src_usage = PIPE_USAGE_STREAM;
+
+				dst = pipe_buffer_create(screen, 0, dst_usage, size);
+				src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
+
+				/* Run tests. */
+				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+					q[iter] = ctx->create_query(ctx, query_type, 0);
+					ctx->begin_query(ctx, q[iter]);
+
+					if (test_cp) {
+						/* CP DMA */
+						if (is_copy) {
+							si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
+									      SI_COHERENCY_NONE, cache_policy);
+						} else {
+							si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
+									       SI_COHERENCY_NONE, cache_policy);
+						}
+					} else if (test_sdma) {
+						/* SDMA */
+						if (is_copy) {
+							struct pipe_box box;
+							u_box_1d(0, size, &box);
+							sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
+						} else {
+							si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+						}
+					} else {
+						/* Compute */
+						/* The memory accesses are coalesced, meaning that the 1st instruction writes
+						 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+						 * writes the 2nd contiguous block of data, etc.
+						 */
+						unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+						unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+						unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+						unsigned num_dwords = size / 4;
+						unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+						void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+											cache_policy == L2_STREAM, is_copy);
+
+						struct pipe_grid_info info = {};
+						info.block[0] = MIN2(64, num_instructions);
+						info.block[1] = 1;
+						info.block[2] = 1;
+						info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+						info.grid[1] = 1;
+						info.grid[2] = 1;
+
+						struct pipe_shader_buffer sb[2] = {};
+						sb[0].buffer = dst;
+						sb[0].buffer_size = size;
+
+						if (is_copy) {
+							sb[1].buffer = src;
+							sb[1].buffer_size = size;
+						} else {
+							for (unsigned i = 0; i < 4; i++)
+								sctx->cs_user_data[i] = clear_value;
+						}
+
+						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
+							       SI_CONTEXT_INV_SMEM_L1;
+
+						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
+						ctx->bind_compute_state(ctx, cs);
+						sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+						ctx->launch_grid(ctx, &info);
+
+						ctx->bind_compute_state(ctx, NULL);
+						ctx->delete_compute_state(ctx, cs);
+						sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+					}
+
+					/* Flush L2, so that we don't just test L2 cache performance. */
+					if (!test_sdma) {
+						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+						si_emit_cache_flush(sctx);
+					}
+
+					ctx->end_query(ctx, q[iter]);
+					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+				}
+				pipe_resource_reference(&dst, NULL);
+				pipe_resource_reference(&src, NULL);
+
+				/* Get results. */
+				uint64_t min = ~0ull, max = 0, total = 0;
+
+				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+					union pipe_query_result result;
+
+					ctx->get_query_result(ctx, q[iter], true, &result);
+					ctx->destroy_query(ctx, q[iter]);
+
+					min = MIN2(min, result.u64);
+					max = MAX2(max, result.u64);
+					total += result.u64;
+				}
+
+				score = get_MBps_rate(size, total / (double)NUM_RUNS);
+				printf("%7.0f ,", score);
+				fflush(stdout);
+
+				struct si_result *r = &results[util_logbase2(size)][placement][method];
+				r->is_valid = true;
+				r->is_cp = test_cp;
+				r->is_sdma = test_sdma;
+				r->is_cs = test_cs;
+				r->cache_policy = cache_policy;
+				r->dwords_per_thread = cs_dwords_per_thread;
+				r->waves_per_sh = cs_waves_per_sh;
+				r->score = score;
+				r->index = method;
+			}
+			puts("");
+		}
+	}
+
+	puts("");
+	puts("static struct si_method");
+	printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
+	       sctx->screen->info.name);
+	puts("{");
+	puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+	/* Analyze results and find the best methods. */
+	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+		if (placement == 0)
+			puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+		else if (placement == 1)
+			puts("   } else { /* GTT */");
+		else if (placement == 2) {
+			puts("}");
+			puts("");
+			puts("static struct si_method");
+			printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+			       sctx->screen->info.name);
+			printf("                     uint64_t size64, bool async, bool cached)\n");
+			puts("{");
+			puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+			puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+		} else if (placement == 3)
+			puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+		else
+			puts("   } else { /* GTT -> VRAM */");
+
+		for (unsigned mode = 0; mode < 3; mode++) {
+			bool async = mode == 0;
+			bool cached = mode == 1;
+
+			if (async)
+				puts("      if (async) { /* SDMA or async compute */");
+			else if (cached)
+				puts("      if (cached) { /* gfx ring */");
+			else
+				puts("      } else { /* gfx ring - uncached */");
+
+			/* The list of best chosen methods. */
+			struct si_result *methods[32];
+			unsigned method_max_size[32];
+			unsigned num_methods = 0;
+
+			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+				/* Find the best method. */
+				struct si_result *best = NULL;
+
+				for (unsigned i = 0; i < NUM_METHODS; i++) {
+					struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+					if (!r->is_valid)
+						continue;
+
+					/* Ban CP DMA clears via MC on <= VI. They are super slow
+					 * on GTT, which we can get due to BO evictions.
+					 */
+					if (sctx->chip_class <= VI && placement == 1 &&
+					    r->is_cp && r->cache_policy == L2_BYPASS)
+						continue;
+
+					if (async) {
+						/* The following constraints for compute IBs try to limit
+						 * resource usage so as not to decrease the performance
+						 * of gfx IBs too much.
+						 */
+
+						/* Don't use CP DMA on asynchronous rings, because
+						 * the engine is shared with gfx IBs.
+						 */
+						if (r->is_cp)
+							continue;
+
+						/* Don't use L2 caching on asynchronous rings to minimize
+						 * L2 usage.
+						 */
+						if (r->cache_policy == L2_LRU)
+							continue;
+
+						/* Asynchronous compute recommends waves_per_sh != 0
+						 * to limit CU usage. */
+						if (r->is_cs && r->waves_per_sh == 0)
+							continue;
+					} else {
+						/* SDMA is always asynchronous */
+						if (r->is_sdma)
+							continue;
+
+						if (cached && r->cache_policy == L2_BYPASS)
+							continue;
+						if (!cached && r->cache_policy == L2_LRU)
+							continue;
+					}
+
+					if (!best) {
+						best = r;
+						continue;
+					}
+
+					/* Assume some measurement error. Earlier methods occupy fewer
+					 * resources, so the next method is always more greedy, and we
+					 * don't want to select it due to a measurement error.
+					 */
+					double min_improvement = 1.03;
+
+					if (best->score * min_improvement < r->score)
+						best = r;
+				}
+
+				if (num_methods > 0) {
+					unsigned prev_index = num_methods - 1;
+					struct si_result *prev = methods[prev_index];
+					struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
+
+					/* If the best one is also the best for the previous size,
+					 * just bump the size for the previous one.
+					 *
+					 * If there is no best, it means all methods were too slow
+					 * for this size and were not tested. Use the best one for
+					 * the previous size.
+					 */
+					if (!best ||
+					    /* If it's the same method as for the previous size: */
+					    (prev->is_cp == best->is_cp &&
+					     prev->is_sdma == best->is_sdma &&
+					     prev->is_cs == best->is_cs &&
+					     prev->cache_policy == best->cache_policy &&
+					     prev->dwords_per_thread == best->dwords_per_thread &&
+					     prev->waves_per_sh == best->waves_per_sh) ||
+					    /* If the method for the previous size is also the best
+					     * for this size: */
+					    (prev_this_size->is_valid &&
+					     prev_this_size->score * 1.03 > best->score)) {
+						method_max_size[prev_index] = size;
+						continue;
+					}
+				}
+
+				/* Add it to the list. */
+				assert(num_methods < ARRAY_SIZE(methods));
+				methods[num_methods] = best;
+				method_max_size[num_methods] = size;
+				num_methods++;
+			}
+
+			for (unsigned i = 0; i < num_methods; i++) {
+				struct si_result *best = methods[i];
+				unsigned size = method_max_size[i];
+
+				/* The size threshold is between the current benchmarked
+				 * size and the next benchmarked size. */
+				if (i < num_methods - 1)
+					printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+				else if (i > 0)
+					printf("         else                   ");
+				else
+					printf("         ");
+				printf("return ");
+
+				assert(best);
+				if (best->is_cp) {
+					printf("CP_DMA(%s);\n",
+					       best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+					       best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
+				}
+				if (best->is_sdma)
+					printf("SDMA;\n");
+				if (best->is_cs) {
+					printf("COMPUTE(%s, %u, %u);\n",
+					       best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
+					       best->dwords_per_thread,
+					       best->waves_per_sh);
+				}
+			}
+		}
+		puts("      }");
+	}
+	puts("   }");
+	puts("}");
+
+	ctx->destroy(ctx);
+	exit(0);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c b/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c
new file mode 100644
index 000000000..2fb79253a
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c
@@ -0,0 +1,2424 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "radeonsi/si_query.h"
+#include "util/u_format.h"
+#include "util/u_log.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "util/u_resource.h"
+#include "util/u_surface.h"
+#include "util/u_transfer.h"
+#include "util/os_time.h"
+#include <errno.h>
+#include <inttypes.h>
+#include "state_tracker/drm_driver.h"
+#include "amd/common/sid.h"
+
+static enum radeon_surf_mode
+si_choose_tiling(struct si_screen *sscreen,
+		 const struct pipe_resource *templ, bool tc_compatible_htile);
+
+
+bool si_prepare_for_dma_blit(struct si_context *sctx,
+			     struct si_texture *dst,
+			     unsigned dst_level, unsigned dstx,
+			     unsigned dsty, unsigned dstz,
+			     struct si_texture *src,
+			     unsigned src_level,
+			     const struct pipe_box *src_box)
+{
+	if (!sctx->dma_cs)
+		return false;
+
+	if (dst->surface.bpe != src->surface.bpe)
+		return false;
+
+	/* MSAA: Blits don't exist in the real world. */
+	if (src->buffer.b.b.nr_samples > 1 ||
+	    dst->buffer.b.b.nr_samples > 1)
+		return false;
+
+	/* Depth-stencil surfaces:
+	 *   When dst is linear, the DB->CB copy preserves HTILE.
+	 *   When dst is tiled, the 3D path must be used to update HTILE.
+	 */
+	if (src->is_depth || dst->is_depth)
+		return false;
+
+	/* DCC as:
+	 *   src: Use the 3D path. DCC decompression is expensive.
+	 *   dst: Use the 3D path to compress the pixels with DCC.
+	 */
+	if (vi_dcc_enabled(src, src_level) ||
+	    vi_dcc_enabled(dst, dst_level))
+		return false;
+
+	/* CMASK as:
+	 *   src: Both texture and SDMA paths need decompression. Use SDMA.
+	 *   dst: If overwriting the whole texture, discard CMASK and use
+	 *        SDMA. Otherwise, use the 3D path.
+	 */
+	if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
+		/* The CMASK clear is only enabled for the first level. */
+		assert(dst_level == 0);
+		if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level,
+						      dstx, dsty, dstz, src_box->width,
+						      src_box->height, src_box->depth))
+			return false;
+
+		si_texture_discard_cmask(sctx->screen, dst);
+	}
+
+	/* All requirements are met. Prepare textures for SDMA. */
+	if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
+		sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
+
+	assert(!(src->dirty_level_mask & (1 << src_level)));
+	assert(!(dst->dirty_level_mask & (1 << dst_level)));
+
+	return true;
+}
+
+/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
+static void si_copy_region_with_blit(struct pipe_context *pipe,
+				     struct pipe_resource *dst,
+				     unsigned dst_level,
+				     unsigned dstx, unsigned dsty, unsigned dstz,
+				     struct pipe_resource *src,
+				     unsigned src_level,
+				     const struct pipe_box *src_box)
+{
+	struct pipe_blit_info blit;
+
+	memset(&blit, 0, sizeof(blit));
+	blit.src.resource = src;
+	blit.src.format = src->format;
+	blit.src.level = src_level;
+	blit.src.box = *src_box;
+	blit.dst.resource = dst;
+	blit.dst.format = dst->format;
+	blit.dst.level = dst_level;
+	blit.dst.box.x = dstx;
+	blit.dst.box.y = dsty;
+	blit.dst.box.z = dstz;
+	blit.dst.box.width = src_box->width;
+	blit.dst.box.height = src_box->height;
+	blit.dst.box.depth = src_box->depth;
+	blit.mask = util_format_get_mask(src->format) &
+		    util_format_get_mask(dst->format);
+	blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+	if (blit.mask) {
+		pipe->blit(pipe, &blit);
+	}
+}
+
+/* Copy from a full GPU texture to a transfer's staging one. */
+static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
+	struct pipe_resource *dst = &stransfer->staging->b.b;
+	struct pipe_resource *src = transfer->resource;
+
+	if (src->nr_samples > 1) {
+		si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
+					   src, transfer->level, &transfer->box);
+		return;
+	}
+
+	sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
+		       &transfer->box);
+}
+
+/* Copy from a transfer's staging texture to a full GPU one. */
+static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
+	struct pipe_resource *dst = transfer->resource;
+	struct pipe_resource *src = &stransfer->staging->b.b;
+	struct pipe_box sbox;
+
+	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
+
+	if (dst->nr_samples > 1) {
+		si_copy_region_with_blit(ctx, dst, transfer->level,
+					   transfer->box.x, transfer->box.y, transfer->box.z,
+					   src, 0, &sbox);
+		return;
+	}
+
+	sctx->dma_copy(ctx, dst, transfer->level,
+		       transfer->box.x, transfer->box.y, transfer->box.z,
+		       src, 0, &sbox);
+}
+
+static unsigned si_texture_get_offset(struct si_screen *sscreen,
+				      struct si_texture *tex, unsigned level,
+				      const struct pipe_box *box,
+				      unsigned *stride,
+				      unsigned *layer_stride)
+{
+	if (sscreen->info.chip_class >= GFX9) {
+		*stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+		*layer_stride = tex->surface.u.gfx9.surf_slice_size;
+
+		if (!box)
+			return 0;
+
+		/* Each texture is an array of slices. Each slice is an array
+		 * of mipmap levels. */
+		return box->z * tex->surface.u.gfx9.surf_slice_size +
+		       tex->surface.u.gfx9.offset[level] +
+		       (box->y / tex->surface.blk_h *
+			tex->surface.u.gfx9.surf_pitch +
+			box->x / tex->surface.blk_w) * tex->surface.bpe;
+	} else {
+		*stride = tex->surface.u.legacy.level[level].nblk_x *
+			  tex->surface.bpe;
+		assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
+		*layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
+
+		if (!box)
+			return tex->surface.u.legacy.level[level].offset;
+
+		/* Each texture is an array of mipmap levels. Each level is
+		 * an array of slices. */
+		return tex->surface.u.legacy.level[level].offset +
+		       box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
+		       (box->y / tex->surface.blk_h *
+		        tex->surface.u.legacy.level[level].nblk_x +
+		        box->x / tex->surface.blk_w) * tex->surface.bpe;
+	}
+}
+
+static int si_init_surface(struct si_screen *sscreen,
+			   struct radeon_surf *surface,
+			   const struct pipe_resource *ptex,
+			   enum radeon_surf_mode array_mode,
+			   unsigned pitch_in_bytes_override,
+			   unsigned offset,
+			   bool is_imported,
+			   bool is_scanout,
+			   bool is_flushed_depth,
+			   bool tc_compatible_htile)
+{
+	const struct util_format_description *desc =
+		util_format_description(ptex->format);
+	bool is_depth, is_stencil;
+	int r;
+	unsigned i, bpe, flags = 0;
+
+	is_depth = util_format_has_depth(desc);
+	is_stencil = util_format_has_stencil(desc);
+
+	if (!is_flushed_depth &&
+	    ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+		bpe = 4; /* stencil is allocated separately */
+	} else {
+		bpe = util_format_get_blocksize(ptex->format);
+		assert(util_is_power_of_two_or_zero(bpe));
+	}
+
+	if (!is_flushed_depth && is_depth) {
+		flags |= RADEON_SURF_ZBUFFER;
+
+		if (tc_compatible_htile &&
+		    (sscreen->info.chip_class >= GFX9 ||
+		     array_mode == RADEON_SURF_MODE_2D)) {
+			/* TC-compatible HTILE only supports Z32_FLOAT.
+			 * GFX9 also supports Z16_UNORM.
+			 * On VI, promote Z16 to Z32. DB->CB copies will convert
+			 * the format for transfers.
+			 */
+			if (sscreen->info.chip_class == VI)
+				bpe = 4;
+
+			flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+		}
+
+		if (is_stencil)
+			flags |= RADEON_SURF_SBUFFER;
+	}
+
+	if (sscreen->info.chip_class >= VI &&
+	    (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC ||
+	     ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
+	     (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+	if (sscreen->info.family == CHIP_STONEY &&
+	    bpe == 16 && ptex->nr_samples >= 2)
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* VI: DCC clear for 4x and 8x MSAA array textures unimplemented. */
+	if (sscreen->info.chip_class == VI &&
+	    ptex->nr_storage_samples >= 4 &&
+	    ptex->array_size > 1)
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
+	if (sscreen->info.chip_class >= GFX9 &&
+	    ptex->nr_storage_samples >= 4)
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) {
+		/* This should catch bugs in gallium users setting incorrect flags. */
+		assert(ptex->nr_samples <= 1 &&
+		       ptex->array_size == 1 &&
+		       ptex->depth0 == 1 &&
+		       ptex->last_level == 0 &&
+		       !(flags & RADEON_SURF_Z_OR_SBUFFER));
+
+		flags |= RADEON_SURF_SCANOUT;
+	}
+
+	if (ptex->bind & PIPE_BIND_SHARED)
+		flags |= RADEON_SURF_SHAREABLE;
+	if (is_imported)
+		flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
+	if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_TILING))
+		flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
+
+	r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe,
+				      array_mode, surface);
+	if (r) {
+		return r;
+	}
+
+	unsigned pitch = pitch_in_bytes_override / bpe;
+
+	if (sscreen->info.chip_class >= GFX9) {
+		if (pitch) {
+			surface->u.gfx9.surf_pitch = pitch;
+			surface->u.gfx9.surf_slice_size =
+				(uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
+		}
+		surface->u.gfx9.surf_offset = offset;
+	} else {
+		if (pitch) {
+			surface->u.legacy.level[0].nblk_x = pitch;
+			surface->u.legacy.level[0].slice_size_dw =
+				((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
+		}
+		if (offset) {
+			for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
+				surface->u.legacy.level[i].offset += offset;
+		}
+	}
+	return 0;
+}
+
+static void si_texture_init_metadata(struct si_screen *sscreen,
+				     struct si_texture *tex,
+				     struct radeon_bo_metadata *metadata)
+{
+	struct radeon_surf *surface = &tex->surface;
+
+	memset(metadata, 0, sizeof(*metadata));
+
+	if (sscreen->info.chip_class >= GFX9) {
+		metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+	} else {
+		metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
+					   RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+		metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
+					   RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+		metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config;
+		metadata->u.legacy.bankw = surface->u.legacy.bankw;
+		metadata->u.legacy.bankh = surface->u.legacy.bankh;
+		metadata->u.legacy.tile_split = surface->u.legacy.tile_split;
+		metadata->u.legacy.mtilea = surface->u.legacy.mtilea;
+		metadata->u.legacy.num_banks = surface->u.legacy.num_banks;
+		metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+		metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+	}
+}
+
+static void si_surface_import_metadata(struct si_screen *sscreen,
+				       struct radeon_surf *surf,
+				       struct radeon_bo_metadata *metadata,
+				       enum radeon_surf_mode *array_mode,
+				       bool *is_scanout)
+{
+	if (sscreen->info.chip_class >= GFX9) {
+		if (metadata->u.gfx9.swizzle_mode > 0)
+			*array_mode = RADEON_SURF_MODE_2D;
+		else
+			*array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+		*is_scanout = metadata->u.gfx9.swizzle_mode == 0 ||
+			      metadata->u.gfx9.swizzle_mode % 4 == 2;
+
+		surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
+	} else {
+		surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
+		surf->u.legacy.bankw = metadata->u.legacy.bankw;
+		surf->u.legacy.bankh = metadata->u.legacy.bankh;
+		surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
+		surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
+		surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
+
+		if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
+			*array_mode = RADEON_SURF_MODE_2D;
+		else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
+			*array_mode = RADEON_SURF_MODE_1D;
+		else
+			*array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+		*is_scanout = metadata->u.legacy.scanout;
+	}
+}
+
+void si_eliminate_fast_color_clear(struct si_context *sctx,
+				   struct si_texture *tex)
+{
+	struct si_screen *sscreen = sctx->screen;
+	struct pipe_context *ctx = &sctx->b;
+
+	if (ctx == sscreen->aux_context)
+		mtx_lock(&sscreen->aux_context_lock);
+
+	unsigned n = sctx->num_decompress_calls;
+	ctx->flush_resource(ctx, &tex->buffer.b.b);
+
+	/* Flush only if any fast clear elimination took place. */
+	if (n != sctx->num_decompress_calls)
+		ctx->flush(ctx, NULL, 0);
+
+	if (ctx == sscreen->aux_context)
+		mtx_unlock(&sscreen->aux_context_lock);
+}
+
+void si_texture_discard_cmask(struct si_screen *sscreen,
+			      struct si_texture *tex)
+{
+	if (!tex->cmask_buffer)
+		return;
+
+	assert(tex->buffer.b.b.nr_samples <= 1);
+
+	/* Disable CMASK. */
+	tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
+	tex->dirty_level_mask = 0;
+
+	tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
+
+	if (tex->cmask_buffer != &tex->buffer)
+	    r600_resource_reference(&tex->cmask_buffer, NULL);
+
+	tex->cmask_buffer = NULL;
+
+	/* Notify all contexts about the change. */
+	p_atomic_inc(&sscreen->dirty_tex_counter);
+	p_atomic_inc(&sscreen->compressed_colortex_counter);
+}
+
+static bool si_can_disable_dcc(struct si_texture *tex)
+{
+	/* We can't disable DCC if it can be written by another process. */
+	return tex->dcc_offset &&
+	       (!tex->buffer.b.is_shared ||
+		!(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
+}
+
+static bool si_texture_discard_dcc(struct si_screen *sscreen,
+				   struct si_texture *tex)
+{
+	if (!si_can_disable_dcc(tex))
+		return false;
+
+	assert(tex->dcc_separate_buffer == NULL);
+
+	/* Disable DCC. */
+	tex->dcc_offset = 0;
+
+	/* Notify all contexts about the change. */
+	p_atomic_inc(&sscreen->dirty_tex_counter);
+	return true;
+}
+
+/**
+ * Disable DCC for the texture. (first decompress, then discard metadata).
+ *
+ * There is an unresolved multi-context synchronization issue between
+ * screen::aux_context and the current context. If applications do this with
+ * multiple contexts, it's already undefined behavior for them and we don't
+ * have to worry about that. The scenario is:
+ *
+ * If context 1 disables DCC and context 2 has queued commands that write
+ * to the texture via CB with DCC enabled, and the order of operations is
+ * as follows:
+ *   context 2 queues draw calls rendering to the texture, but doesn't flush
+ *   context 1 disables DCC and flushes
+ *   context 1 & 2 reset descriptors and FB state
+ *   context 2 flushes (new compressed tiles written by the draw calls)
+ *   context 1 & 2 read garbage, because DCC is disabled, yet there are
+ *   compressed tiled
+ *
+ * \param sctx  the current context if you have one, or rscreen->aux_context
+ *              if you don't.
+ */
+bool si_texture_disable_dcc(struct si_context *sctx,
+			    struct si_texture *tex)
+{
+	struct si_screen *sscreen = sctx->screen;
+
+	if (!si_can_disable_dcc(tex))
+		return false;
+
+	if (&sctx->b == sscreen->aux_context)
+		mtx_lock(&sscreen->aux_context_lock);
+
+	/* Decompress DCC. */
+	si_decompress_dcc(sctx, tex);
+	sctx->b.flush(&sctx->b, NULL, 0);
+
+	if (&sctx->b == sscreen->aux_context)
+		mtx_unlock(&sscreen->aux_context_lock);
+
+	return si_texture_discard_dcc(sscreen, tex);
+}
+
+/**
+ * Replace the storage of \p tex with that of a freshly created texture that
+ * has \p new_bind_flag added to its bind flags. The si_texture object itself
+ * is kept; only its fields are overwritten from the new texture.
+ *
+ * The function returns silently without changing anything when the texture
+ * is shared, when PIPE_BIND_LINEAR is requested but the texture is already
+ * linear or si_choose_tiling doesn't select linear for it, or when the new
+ * texture can't be created.
+ *
+ * \param sctx                context used for copying the pixels
+ * \param tex                 texture whose storage is replaced
+ * \param new_bind_flag       bind flag(s) OR'ed into the resource template
+ * \param invalidate_storage  if true, the old contents are not copied
+ */
+static void si_reallocate_texture_inplace(struct si_context *sctx,
+					  struct si_texture *tex,
+					  unsigned new_bind_flag,
+					  bool invalidate_storage)
+{
+	struct pipe_screen *screen = sctx->b.screen;
+	struct si_texture *new_tex;
+	struct pipe_resource templ = tex->buffer.b.b;
+	unsigned i;
+
+	templ.bind |= new_bind_flag;
+
+	if (tex->buffer.b.is_shared)
+		return;
+
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		if (tex->surface.is_linear)
+			return;
+
+		/* This fails with MSAA, depth, and compressed textures. */
+		if (si_choose_tiling(sctx->screen, &templ, false) !=
+		    RADEON_SURF_MODE_LINEAR_ALIGNED)
+			return;
+	}
+
+	new_tex = (struct si_texture*)screen->resource_create(screen, &templ);
+	if (!new_tex)
+		return;
+
+	/* Copy the pixels to the new texture. */
+	if (!invalidate_storage) {
+		for (i = 0; i <= templ.last_level; i++) {
+			struct pipe_box box;
+
+			u_box_3d(0, 0, 0,
+				 u_minify(templ.width0, i), u_minify(templ.height0, i),
+				 util_num_layers(&templ, i), &box);
+
+			sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0,
+				       &tex->buffer.b.b, i, &box);
+		}
+	}
+
+	/* When switching to linear, drop CMASK and DCC of the old storage
+	 * before the fields are overwritten below.
+	 */
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		si_texture_discard_cmask(sctx->screen, tex);
+		si_texture_discard_dcc(sctx->screen, tex);
+	}
+
+	/* Replace the structure fields of tex. */
+	tex->buffer.b.b.bind = templ.bind;
+	pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
+	tex->buffer.gpu_address = new_tex->buffer.gpu_address;
+	tex->buffer.vram_usage = new_tex->buffer.vram_usage;
+	tex->buffer.gart_usage = new_tex->buffer.gart_usage;
+	tex->buffer.bo_size = new_tex->buffer.bo_size;
+	tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
+	tex->buffer.domains = new_tex->buffer.domains;
+	tex->buffer.flags = new_tex->buffer.flags;
+
+	tex->surface = new_tex->surface;
+	tex->size = new_tex->size;
+	si_texture_reference(&tex->flushed_depth_texture,
+			     new_tex->flushed_depth_texture);
+
+	tex->fmask_offset = new_tex->fmask_offset;
+	tex->cmask_offset = new_tex->cmask_offset;
+	tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
+
+	/* cmask_buffer may point at the texture's own buffer, in which case
+	 * it is a plain pointer and is not unreferenced.
+	 */
+	if (tex->cmask_buffer == &tex->buffer)
+		tex->cmask_buffer = NULL;
+	else
+		r600_resource_reference(&tex->cmask_buffer, NULL);
+
+	/* Mirror the new texture: self-pointer stays a self-pointer, a
+	 * separate CMASK buffer gets an additional reference.
+	 */
+	if (new_tex->cmask_buffer == &new_tex->buffer)
+		tex->cmask_buffer = &tex->buffer;
+	else
+		r600_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
+
+	tex->dcc_offset = new_tex->dcc_offset;
+	tex->cb_color_info = new_tex->cb_color_info;
+	memcpy(tex->color_clear_value, new_tex->color_clear_value,
+	       sizeof(tex->color_clear_value));
+	tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+
+	tex->htile_offset = new_tex->htile_offset;
+	tex->depth_clear_value = new_tex->depth_clear_value;
+	tex->dirty_level_mask = new_tex->dirty_level_mask;
+	tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
+	tex->db_render_format = new_tex->db_render_format;
+	tex->stencil_clear_value = new_tex->stencil_clear_value;
+	tex->tc_compatible_htile = new_tex->tc_compatible_htile;
+	tex->depth_cleared = new_tex->depth_cleared;
+	tex->stencil_cleared = new_tex->stencil_cleared;
+	tex->upgraded_depth = new_tex->upgraded_depth;
+	tex->db_compatible = new_tex->db_compatible;
+	tex->can_sample_z = new_tex->can_sample_z;
+	tex->can_sample_s = new_tex->can_sample_s;
+
+	tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
+	tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+	r600_resource_reference(&tex->dcc_separate_buffer,
+				new_tex->dcc_separate_buffer);
+	r600_resource_reference(&tex->last_dcc_separate_buffer,
+				new_tex->last_dcc_separate_buffer);
+
+	/* A linear texture is expected to carry no metadata at all. */
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		assert(!tex->htile_offset);
+		assert(!tex->cmask_buffer);
+		assert(!tex->surface.fmask_size);
+		assert(!tex->dcc_offset);
+		assert(!tex->is_depth);
+	}
+
+	/* Drop the local reference to the temporary texture. */
+	si_texture_reference(&new_tex, NULL);
+
+	p_atomic_inc(&sctx->screen->dirty_tex_counter);
+}
+
+/* Build metadata dword 1: ATI_VENDOR_ID shifted up by 16 bits, OR'ed with
+ * the PCI ID of the device.
+ */
+static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen)
+{
+	const uint32_t pci_id = sscreen->info.pci_id;
+
+	return pci_id | (ATI_VENDOR_ID << 16);
+}
+
+/**
+ * Fill the opaque part of the BO metadata (md->metadata and
+ * md->size_metadata) with an image descriptor for the texture.
+ *
+ * Does nothing when sscreen->info.has_bo_metadata is not set.
+ *
+ * \param sscreen  screen providing chip information
+ * \param tex      texture being described; must have neither a separate
+ *                 DCC buffer nor FMASK (asserted)
+ * \param md       metadata structure to fill
+ */
+static void si_query_opaque_metadata(struct si_screen *sscreen,
+				     struct si_texture *tex,
+			             struct radeon_bo_metadata *md)
+{
+	struct pipe_resource *res = &tex->buffer.b.b;
+	/* Identity swizzle for the exported descriptor. */
+	static const unsigned char swizzle[] = {
+		PIPE_SWIZZLE_X,
+		PIPE_SWIZZLE_Y,
+		PIPE_SWIZZLE_Z,
+		PIPE_SWIZZLE_W
+	};
+	uint32_t desc[8], i;
+	bool is_array = util_texture_is_array(res->target);
+
+	if (!sscreen->info.has_bo_metadata)
+		return;
+
+	assert(tex->dcc_separate_buffer == NULL);
+	assert(tex->surface.fmask_size == 0);
+
+	/* Metadata image format version 1:
+	 * [0] = 1 (metadata format identifier)
+	 * [1] = (VENDOR_ID << 16) | PCI_ID
+	 * [2:9] = image descriptor for the whole resource
+	 *         [2] is always 0, because the base address is cleared
+	 *         [9] is the DCC offset bits [39:8] from the beginning of
+	 *             the buffer
+	 * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
+	 */
+
+	md->metadata[0] = 1; /* metadata image format version 1 */
+
+	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
+	md->metadata[1] = si_get_bo_metadata_word1(sscreen);
+
+	/* Describe the whole resource: all levels and, for array targets,
+	 * all layers.
+	 */
+	si_make_texture_descriptor(sscreen, tex, true,
+				   res->target, res->format,
+				   swizzle, 0, res->last_level, 0,
+				   is_array ? res->array_size - 1 : 0,
+				   res->width0, res->height0, res->depth0,
+				   desc, NULL);
+
+	si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0],
+				       0, 0, tex->surface.blk_w, false, desc);
+
+	/* Clear the base address and set the relative DCC offset. */
+	desc[0] = 0;
+	desc[1] &= C_008F14_BASE_ADDRESS_HI;
+	desc[7] = tex->dcc_offset >> 8;
+
+	/* Dwords [2:9] contain the image descriptor. */
+	memcpy(&md->metadata[2], desc, sizeof(desc));
+	md->size_metadata = 10 * 4;
+
+	/* Dwords [10:..] contain the mipmap level offsets. */
+	if (sscreen->info.chip_class <= VI) {
+		for (i = 0; i <= res->last_level; i++)
+			md->metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8;
+
+		md->size_metadata += (1 + res->last_level) * 4;
+	}
+}
+
+/**
+ * Take the DCC state of an imported texture from the opaque BO metadata.
+ *
+ * On chips older than VI nothing is changed. Otherwise tex->dcc_offset is
+ * set from the descriptor stored in the metadata when that descriptor has
+ * compression enabled, and cleared to 0 when it doesn't.
+ */
+static void si_apply_opaque_metadata(struct si_screen *sscreen,
+				     struct si_texture *tex,
+			             struct radeon_bo_metadata *md)
+{
+	const uint32_t *image_desc = &md->metadata[2];
+	bool has_dcc;
+
+	if (sscreen->info.chip_class < VI)
+		return;
+
+	/* DCC is described only if the metadata holds at least
+	 * 2 (header) + 8 (descriptor) dwords, has a non-zero format
+	 * identifier, matches this device, and the descriptor has
+	 * compression enabled.
+	 */
+	has_dcc = md->size_metadata >= 10 * 4 &&
+		  md->metadata[0] != 0 &&
+		  md->metadata[1] == si_get_bo_metadata_word1(sscreen) &&
+		  G_008F28_COMPRESSION_EN(image_desc[6]);
+
+	/* The DCC fields are always set by texture_from_handle, so they must
+	 * be cleared here when the metadata doesn't enable DCC.
+	 */
+	tex->dcc_offset = has_dcc ? (uint64_t)image_desc[7] << 8 : 0;
+}
+
+/**
+ * Export a texture or a buffer as a winsys handle.
+ *
+ * Before the handle is created the resource is made exportable:
+ * suballocated or non-shareable storage is replaced by a new allocation,
+ * DCC is disabled if the client asks for shader write access, fast clears
+ * are eliminated unless the client flushes explicitly, and the BO metadata
+ * is updated. The resource is then marked as shared and its external usage
+ * flags are recorded.
+ *
+ * \param screen    screen the resource belongs to
+ * \param ctx       context to use, or NULL to use the screen's aux context
+ * \param resource  texture or buffer to export
+ * \param whandle   handle description; passed on to buffer_get_handle
+ * \param usage     PIPE_HANDLE_USAGE_* flags requested by the client
+ * \return false for MSAA and depth textures and when a replacement buffer
+ *         can't be allocated; otherwise the result of buffer_get_handle
+ */
+static boolean si_texture_get_handle(struct pipe_screen* screen,
+				     struct pipe_context *ctx,
+				     struct pipe_resource *resource,
+				     struct winsys_handle *whandle,
+				     unsigned usage)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct si_context *sctx;
+	struct r600_resource *res = r600_resource(resource);
+	struct si_texture *tex = (struct si_texture*)resource;
+	struct radeon_bo_metadata metadata;
+	bool update_metadata = false;
+	unsigned stride, offset, slice_size;
+	/* Set when queued work still has to be submitted before returning. */
+	bool flush = false;
+
+	ctx = threaded_context_unwrap_sync(ctx);
+	sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context);
+
+	if (resource->target != PIPE_BUFFER) {
+		/* This is not supported now, but it might be required for OpenCL
+		 * interop in the future.
+		 */
+		if (resource->nr_samples > 1 || tex->is_depth)
+			return false;
+
+		/* Move a suballocated texture into a non-suballocated allocation. */
+		if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+		    tex->surface.tile_swizzle ||
+		    (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+		     sscreen->info.has_local_buffers &&
+		     whandle->type != WINSYS_HANDLE_TYPE_KMS)) {
+			assert(!res->b.is_shared);
+			si_reallocate_texture_inplace(sctx, tex,
+							PIPE_BIND_SHARED, false);
+			flush = true;
+			assert(res->b.b.bind & PIPE_BIND_SHARED);
+			assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+			assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
+			assert(tex->surface.tile_swizzle == 0);
+		}
+
+		/* Since shader image stores don't support DCC on VI,
+		 * disable it for external clients that want write
+		 * access.
+		 */
+		if (usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->dcc_offset) {
+			if (si_texture_disable_dcc(sctx, tex)) {
+				update_metadata = true;
+				/* si_texture_disable_dcc flushes the context */
+				flush = false;
+			}
+		}
+
+		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+		    (tex->cmask_buffer || tex->dcc_offset)) {
+			/* Eliminate fast clear (both CMASK and DCC) */
+			si_eliminate_fast_color_clear(sctx, tex);
+			/* eliminate_fast_color_clear flushes the context */
+			flush = false;
+
+			/* Disable CMASK if flush_resource isn't going
+			 * to be called.
+			 */
+			if (tex->cmask_buffer)
+				si_texture_discard_cmask(sscreen, tex);
+		}
+
+		/* Set metadata. */
+		if (!res->b.is_shared || update_metadata) {
+			si_texture_init_metadata(sscreen, tex, &metadata);
+			si_query_opaque_metadata(sscreen, tex, &metadata);
+
+			sscreen->ws->buffer_set_metadata(res->buf, &metadata);
+		}
+
+		/* Layout of the first level, handed to buffer_get_handle. */
+		if (sscreen->info.chip_class >= GFX9) {
+			offset = tex->surface.u.gfx9.surf_offset;
+			stride = tex->surface.u.gfx9.surf_pitch *
+				 tex->surface.bpe;
+			slice_size = tex->surface.u.gfx9.surf_slice_size;
+		} else {
+			offset = tex->surface.u.legacy.level[0].offset;
+			stride = tex->surface.u.legacy.level[0].nblk_x *
+				 tex->surface.bpe;
+			slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+		}
+	} else {
+		/* Buffer exports are for the OpenCL interop. */
+		/* Move a suballocated buffer into a non-suballocated allocation. */
+		if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+		    /* A DMABUF export always fails if the BO is local. */
+		    (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+		     sscreen->info.has_local_buffers)) {
+			assert(!res->b.is_shared);
+
+			/* Allocate a new buffer with PIPE_BIND_SHARED. */
+			struct pipe_resource templ = res->b.b;
+			templ.bind |= PIPE_BIND_SHARED;
+
+			struct pipe_resource *newb =
+				screen->resource_create(screen, &templ);
+			if (!newb)
+				return false;
+
+			/* Copy the old buffer contents to the new one. */
+			struct pipe_box box;
+			u_box_1d(0, newb->width0, &box);
+			sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0,
+						     &res->b.b, 0, &box);
+			flush = true;
+			/* Move the new buffer storage to the old pipe_resource. */
+			si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
+			pipe_resource_reference(&newb, NULL);
+
+			assert(res->b.b.bind & PIPE_BIND_SHARED);
+			assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+		}
+
+		/* Buffers */
+		offset = 0;
+		stride = 0;
+		slice_size = 0;
+	}
+
+	/* Submit the copy queued above unless a later step already flushed. */
+	if (flush)
+		sctx->b.flush(&sctx->b, NULL, 0);
+
+	if (res->b.is_shared) {
+		/* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+		 * doesn't set it.
+		 */
+		res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+			res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+	} else {
+		/* First export: remember the usage exactly as requested. */
+		res->b.is_shared = true;
+		res->external_usage = usage;
+	}
+
+	return sscreen->ws->buffer_get_handle(res->buf, stride, offset,
+					      slice_size, whandle);
+}
+
+/* Drop every reference a texture holds, then free the texture object. */
+static void si_texture_destroy(struct pipe_screen *screen,
+			       struct pipe_resource *ptex)
+{
+	struct si_texture *tex = (struct si_texture*)ptex;
+
+	si_texture_reference(&tex->flushed_depth_texture, NULL);
+
+	/* cmask_buffer may point at the texture's own buffer; only a
+	 * different buffer is unreferenced here.
+	 */
+	if (&tex->buffer != tex->cmask_buffer)
+		r600_resource_reference(&tex->cmask_buffer, NULL);
+
+	pb_reference(&tex->buffer.buf, NULL);
+	r600_resource_reference(&tex->dcc_separate_buffer, NULL);
+	r600_resource_reference(&tex->last_dcc_separate_buffer, NULL);
+	FREE(tex);
+}
+
+/* Forward declaration of the texture vtable; si_texture_create_object
+ * stores its address in every new texture.
+ * NOTE(review): the initialized definition is expected further down in the
+ * file — not visible from here.
+ */
+static const struct u_resource_vtbl si_texture_vtbl;
+
+/**
+ * Compute the HTILE size and alignment of a texture on chips up to VI.
+ *
+ * The results are stored in tex->surface.htile_size and
+ * tex->surface.htile_alignment. htile_size is left at 0 when the first
+ * level is 1D tiled and the hardware info says that isn't supported, or
+ * when the pipe count is not one of 1, 2, 4, 8, 16.
+ */
+static void si_texture_get_htile_size(struct si_screen *sscreen,
+				      struct si_texture *tex)
+{
+	unsigned cl_width, cl_height;
+	unsigned num_pipes = sscreen->info.num_tile_pipes;
+
+	assert(sscreen->info.chip_class <= VI);
+
+	tex->surface.htile_size = 0;
+
+	if (!sscreen->info.htile_cmask_support_1d_tiling &&
+	    tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D)
+		return;
+
+	/* Overalign HTILE on P2 configs to work around GPU hangs in
+	 * piglit/depthstencil-render-miplevels 585.
+	 *
+	 * This has been confirmed to help Kabini & Stoney, where the hangs
+	 * are always reproducible. I think I have seen the test hang
+	 * on Carrizo too, though it was very rare there.
+	 */
+	if (num_pipes < 4 && sscreen->info.chip_class >= CIK)
+		num_pipes = 4;
+
+	/* Cache-line dimensions as a function of the pipe count. */
+	switch (num_pipes) {
+	case 1:  cl_width = 32;  cl_height = 16; break;
+	case 2:  cl_width = 32;  cl_height = 32; break;
+	case 4:  cl_width = 64;  cl_height = 32; break;
+	case 8:  cl_width = 64;  cl_height = 64; break;
+	case 16: cl_width = 128; cl_height = 64; break;
+	default:
+		assert(0);
+		return;
+	}
+
+	const unsigned padded_w =
+		align(tex->surface.u.legacy.level[0].nblk_x, cl_width * 8);
+	const unsigned padded_h =
+		align(tex->surface.u.legacy.level[0].nblk_y, cl_height * 8);
+
+	/* One 4-byte element per 8x8 block. */
+	const unsigned elems_per_slice = (padded_w * padded_h) / (8 * 8);
+	const unsigned bytes_per_slice = elems_per_slice * 4;
+
+	const unsigned interleave = sscreen->info.pipe_interleave_bytes;
+	const unsigned base_align = num_pipes * interleave;
+
+	tex->surface.htile_alignment = base_align;
+	tex->surface.htile_size =
+		util_num_layers(&tex->buffer.b.b, 0) *
+		align(bytes_per_slice, base_align);
+}
+
+/**
+ * Reserve space for HTILE behind the data already accounted for in
+ * tex->size.
+ *
+ * On chips up to VI without TC-compatible HTILE the HTILE size is computed
+ * here first. If the resulting htile_size is 0, nothing is reserved.
+ * Otherwise tex->htile_offset is set to the aligned end of the texture and
+ * tex->size grows by the HTILE size.
+ */
+static void si_texture_allocate_htile(struct si_screen *sscreen,
+				      struct si_texture *tex)
+{
+	if (sscreen->info.chip_class <= VI && !tex->tc_compatible_htile)
+		si_texture_get_htile_size(sscreen, tex);
+
+	if (!tex->surface.htile_size)
+		return;
+
+	/* tex->size is treated as a 64-bit quantity everywhere else in this
+	 * file (the FMASK, CMASK and DCC offsets are computed with align64),
+	 * so align it with align64 here too rather than the 32-bit align(),
+	 * which would truncate sizes above 4 GiB.
+	 */
+	tex->htile_offset = align64(tex->size, tex->surface.htile_alignment);
+	tex->size = tex->htile_offset + tex->surface.htile_size;
+}
+
+/**
+ * Write a description of a texture's layout to a log context.
+ *
+ * The common parameters are printed first. On GFX9 and newer the GFX9
+ * surface fields follow and the function returns; on older chips the legacy
+ * layout is printed, including per-level (and per-stencil-level) entries.
+ *
+ * \param sscreen  screen; only the chip class is read
+ * \param tex      texture to describe
+ * \param log      log context receiving the text
+ */
+void si_print_texture_info(struct si_screen *sscreen,
+			   struct si_texture *tex, struct u_log_context *log)
+{
+	int i;
+
+	/* Common parameters. */
+	u_log_printf(log, "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
+		"blk_h=%u, array_size=%u, last_level=%u, "
+		"bpe=%u, nsamples=%u, flags=0x%x, %s\n",
+		tex->buffer.b.b.width0, tex->buffer.b.b.height0,
+		tex->buffer.b.b.depth0, tex->surface.blk_w,
+		tex->surface.blk_h,
+		tex->buffer.b.b.array_size, tex->buffer.b.b.last_level,
+		tex->surface.bpe, tex->buffer.b.b.nr_samples,
+		tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
+
+	/* GFX9+ layout; each metadata section is printed only if present. */
+	if (sscreen->info.chip_class >= GFX9) {
+		u_log_printf(log, "  Surf: size=%"PRIu64", slice_size=%"PRIu64", "
+			"alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
+			tex->surface.surf_size,
+			tex->surface.u.gfx9.surf_slice_size,
+			tex->surface.surf_alignment,
+			tex->surface.u.gfx9.surf.swizzle_mode,
+			tex->surface.u.gfx9.surf.epitch,
+			tex->surface.u.gfx9.surf_pitch);
+
+		if (tex->surface.fmask_size) {
+			u_log_printf(log, "  FMASK: offset=%"PRIu64", size=%"PRIu64", "
+				"alignment=%u, swmode=%u, epitch=%u\n",
+				tex->fmask_offset,
+				tex->surface.fmask_size,
+				tex->surface.fmask_alignment,
+				tex->surface.u.gfx9.fmask.swizzle_mode,
+				tex->surface.u.gfx9.fmask.epitch);
+		}
+
+		if (tex->cmask_buffer) {
+			u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, "
+				"alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
+				tex->cmask_offset,
+				tex->surface.cmask_size,
+				tex->surface.cmask_alignment,
+				tex->surface.u.gfx9.cmask.rb_aligned,
+				tex->surface.u.gfx9.cmask.pipe_aligned);
+		}
+
+		if (tex->htile_offset) {
+			u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, alignment=%u, "
+				"rb_aligned=%u, pipe_aligned=%u\n",
+				tex->htile_offset,
+				tex->surface.htile_size,
+				tex->surface.htile_alignment,
+				tex->surface.u.gfx9.htile.rb_aligned,
+				tex->surface.u.gfx9.htile.pipe_aligned);
+		}
+
+		if (tex->dcc_offset) {
+			u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, "
+				"alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
+				tex->dcc_offset, tex->surface.dcc_size,
+				tex->surface.dcc_alignment,
+				tex->surface.u.gfx9.dcc_pitch_max,
+				tex->surface.num_dcc_levels);
+		}
+
+		if (tex->surface.u.gfx9.stencil_offset) {
+			u_log_printf(log, "  Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
+				tex->surface.u.gfx9.stencil_offset,
+				tex->surface.u.gfx9.stencil.swizzle_mode,
+				tex->surface.u.gfx9.stencil.epitch);
+		}
+		return;
+	}
+
+	/* Legacy (pre-GFX9) layout from here on. */
+	u_log_printf(log, "  Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
+		"bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
+		tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
+		tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea,
+		tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config,
+		(tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+
+	if (tex->surface.fmask_size)
+		u_log_printf(log, "  FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
+			"bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
+			tex->fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
+			tex->surface.u.legacy.fmask.pitch_in_pixels,
+			tex->surface.u.legacy.fmask.bankh,
+			tex->surface.u.legacy.fmask.slice_tile_max,
+			tex->surface.u.legacy.fmask.tiling_index);
+
+	if (tex->cmask_buffer)
+		u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, alignment=%u, "
+			"slice_tile_max=%u\n",
+			tex->cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
+			tex->surface.u.legacy.cmask_slice_tile_max);
+
+	if (tex->htile_offset)
+		u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, "
+			"alignment=%u, TC_compatible = %u\n",
+			tex->htile_offset, tex->surface.htile_size,
+			tex->surface.htile_alignment,
+			tex->tc_compatible_htile);
+
+	if (tex->dcc_offset) {
+		u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, alignment=%u\n",
+			tex->dcc_offset, tex->surface.dcc_size,
+			tex->surface.dcc_alignment);
+		/* A level is reported as enabled if it is below num_dcc_levels. */
+		for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+			u_log_printf(log, "  DCCLevel[%i]: enabled=%u, offset=%u, "
+				"fast_clear_size=%u\n",
+				i, i < tex->surface.num_dcc_levels,
+				tex->surface.u.legacy.level[i].dcc_offset,
+				tex->surface.u.legacy.level[i].dcc_fast_clear_size);
+	}
+
+	/* slice_size_dw is stored in dwords; it is printed in bytes. */
+	for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+		u_log_printf(log, "  Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
+			"npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+			"mode=%u, tiling_index = %u\n",
+			i, tex->surface.u.legacy.level[i].offset,
+			(uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
+			u_minify(tex->buffer.b.b.width0, i),
+			u_minify(tex->buffer.b.b.height0, i),
+			u_minify(tex->buffer.b.b.depth0, i),
+			tex->surface.u.legacy.level[i].nblk_x,
+			tex->surface.u.legacy.level[i].nblk_y,
+			tex->surface.u.legacy.level[i].mode,
+			tex->surface.u.legacy.tiling_index[i]);
+
+	if (tex->surface.has_stencil) {
+		u_log_printf(log, "  StencilLayout: tilesplit=%u\n",
+			tex->surface.u.legacy.stencil_tile_split);
+		for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
+			u_log_printf(log, "  StencilLevel[%i]: offset=%"PRIu64", "
+				"slice_size=%"PRIu64", npix_x=%u, "
+				"npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+				"mode=%u, tiling_index = %u\n",
+				i, tex->surface.u.legacy.stencil_level[i].offset,
+				(uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
+				u_minify(tex->buffer.b.b.width0, i),
+				u_minify(tex->buffer.b.b.height0, i),
+				u_minify(tex->buffer.b.b.depth0, i),
+				tex->surface.u.legacy.stencil_level[i].nblk_x,
+				tex->surface.u.legacy.stencil_level[i].nblk_y,
+				tex->surface.u.legacy.stencil_level[i].mode,
+				tex->surface.u.legacy.stencil_tiling_index[i]);
+		}
+	}
+}
+
+/**
+ * Common processing for si_texture_create and si_texture_from_handle.
+ *
+ * Allocates the si_texture, lays out the metadata (HTILE for depth; FMASK,
+ * CMASK and DCC for color) behind the surface, creates or adopts the
+ * backing buffer and clears the metadata to its initial state.
+ *
+ * \param screen   screen the texture belongs to
+ * \param base     template copied into the new resource
+ * \param buf      backing buffer of an imported texture, or NULL to allocate
+ *                 new storage
+ * \param surface  surface layout; copied into the texture
+ * \return the new texture, or NULL if an allocation fails or an MSAA color
+ *         texture ends up with a zero FMASK or CMASK size
+ */
+static struct si_texture *
+si_texture_create_object(struct pipe_screen *screen,
+			 const struct pipe_resource *base,
+			 struct pb_buffer *buf,
+			 struct radeon_surf *surface)
+{
+	struct si_texture *tex;
+	struct r600_resource *resource;
+	struct si_screen *sscreen = (struct si_screen*)screen;
+
+	tex = CALLOC_STRUCT(si_texture);
+	if (!tex)
+		return NULL;
+
+	/* Start from the template, then reset the per-object fields. */
+	resource = &tex->buffer;
+	resource->b.b = *base;
+	resource->b.b.next = NULL;
+	resource->b.vtbl = &si_texture_vtbl;
+	pipe_reference_init(&resource->b.b.reference, 1);
+	resource->b.b.screen = screen;
+
+	/* don't include stencil-only formats which we don't support for rendering */
+	tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
+
+	tex->surface = *surface;
+	tex->size = tex->surface.surf_size;
+
+	tex->tc_compatible_htile = tex->surface.htile_size != 0 &&
+				   (tex->surface.flags &
+				    RADEON_SURF_TC_COMPATIBLE_HTILE);
+
+	/* TC-compatible HTILE:
+	 * - VI only supports Z32_FLOAT.
+	 * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
+	if (tex->tc_compatible_htile) {
+		if (sscreen->info.chip_class >= GFX9 &&
+		    base->format == PIPE_FORMAT_Z16_UNORM)
+			tex->db_render_format = base->format;
+		else {
+			tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+			tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+					       base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+		}
+	} else {
+		tex->db_render_format = base->format;
+	}
+
+	/* Applies to GCN. */
+	tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
+
+	/* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
+	 * between frames, so the only thing that can enable separate DCC
+	 * with DRI2 is multiple slow clears within a frame.
+	 */
+	tex->ps_draw_ratio = 0;
+
+	if (tex->is_depth) {
+		if (sscreen->info.chip_class >= GFX9) {
+			tex->can_sample_z = true;
+			tex->can_sample_s = true;
+		} else {
+			tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
+			tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
+		}
+
+		/* Transfer and flushed-depth textures are not DB compatible
+		 * and get no HTILE.
+		 */
+		if (!(base->flags & (SI_RESOURCE_FLAG_TRANSFER |
+				     SI_RESOURCE_FLAG_FLUSHED_DEPTH))) {
+			tex->db_compatible = true;
+
+			if (!(sscreen->debug_flags & DBG(NO_HYPERZ)))
+				si_texture_allocate_htile(sscreen, tex);
+		}
+	} else {
+		/* FMASK and CMASK are only set up for MSAA textures that
+		 * are not imported.
+		 */
+		if (base->nr_samples > 1 &&
+		    !buf &&
+		    !(sscreen->debug_flags & DBG(NO_FMASK))) {
+			/* Allocate FMASK. */
+			tex->fmask_offset = align64(tex->size,
+						     tex->surface.fmask_alignment);
+			tex->size = tex->fmask_offset + tex->surface.fmask_size;
+
+			/* Allocate CMASK. */
+			tex->cmask_offset = align64(tex->size, tex->surface.cmask_alignment);
+			tex->size = tex->cmask_offset + tex->surface.cmask_size;
+			tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+			tex->cmask_buffer = &tex->buffer;
+
+			if (!tex->surface.fmask_size || !tex->surface.cmask_size) {
+				FREE(tex);
+				return NULL;
+			}
+		}
+
+		/* Shared textures must always set up DCC here.
+		 * If it's not present, it will be disabled by
+		 * apply_opaque_metadata later.
+		 */
+		if (tex->surface.dcc_size &&
+		    (buf || !(sscreen->debug_flags & DBG(NO_DCC))) &&
+		    !(tex->surface.flags & RADEON_SURF_SCANOUT)) {
+			/* Reserve space for the DCC buffer. */
+			tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment);
+			tex->size = tex->dcc_offset + tex->surface.dcc_size;
+		}
+	}
+
+	/* Now create the backing buffer. */
+	if (!buf) {
+		si_init_resource_fields(sscreen, resource, tex->size,
+					  tex->surface.surf_alignment);
+
+		if (!si_alloc_resource(sscreen, resource)) {
+			FREE(tex);
+			return NULL;
+		}
+	} else {
+		/* Imported texture: adopt the given buffer and derive the
+		 * resource fields from it.
+		 */
+		resource->buf = buf;
+		resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
+		resource->bo_size = buf->size;
+		resource->bo_alignment = buf->alignment;
+		resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
+		if (resource->domains & RADEON_DOMAIN_VRAM)
+			resource->vram_usage = buf->size;
+		else if (resource->domains & RADEON_DOMAIN_GTT)
+			resource->gart_usage = buf->size;
+	}
+
+	if (tex->cmask_buffer) {
+		/* Initialize the cmask to 0xCC (= compressed state). */
+		si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b,
+					 tex->cmask_offset, tex->surface.cmask_size,
+					 0xCCCCCCCC);
+	}
+	if (tex->htile_offset) {
+		uint32_t clear_value = 0;
+
+		/* GFX9 and TC-compatible HTILE use a non-zero initial value. */
+		if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
+			clear_value = 0x0000030F;
+
+		si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+					 tex->htile_offset,
+					 tex->surface.htile_size,
+					 clear_value);
+	}
+
+	/* Initialize DCC only if the texture is not being imported. */
+	if (!buf && tex->dcc_offset) {
+		si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+					 tex->dcc_offset,
+					 tex->surface.dcc_size,
+					 0xFFFFFFFF);
+	}
+
+	/* Initialize the CMASK base register value. */
+	tex->cmask_base_address_reg =
+		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;
+
+	/* Debug output: virtual address range of the texture. */
+	if (sscreen->debug_flags & DBG(VM)) {
+		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
+			tex->buffer.gpu_address,
+			tex->buffer.gpu_address + tex->buffer.buf->size,
+			base->width0, base->height0, util_num_layers(base, 0), base->last_level+1,
+			base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
+	}
+
+	/* Debug output: full layout dump to stdout. */
+	if (sscreen->debug_flags & DBG(TEX)) {
+		puts("Texture:");
+		struct u_log_context log;
+		u_log_context_init(&log);
+		si_print_texture_info(sscreen, tex, &log);
+		u_log_new_page_print(&log, stdout);
+		fflush(stdout);
+		u_log_context_destroy(&log);
+	}
+
+	return tex;
+}
+
+/**
+ * Pick the tiling mode for a texture described by \p templ.
+ *
+ * \param sscreen              screen; chip class and debug flags are read
+ * \param templ                resource template
+ * \param tc_compatible_htile  whether TC-compatible HTILE is wanted
+ * \return RADEON_SURF_MODE_LINEAR_ALIGNED, RADEON_SURF_MODE_1D or
+ *         RADEON_SURF_MODE_2D
+ */
+static enum radeon_surf_mode
+si_choose_tiling(struct si_screen *sscreen,
+		 const struct pipe_resource *templ, bool tc_compatible_htile)
+{
+	const struct util_format_description *fmt_desc =
+		util_format_description(templ->format);
+	const bool must_tile = templ->flags & SI_RESOURCE_FLAG_FORCE_TILING;
+	const bool is_zs = util_format_is_depth_or_stencil(templ->format) &&
+			   !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
+
+	/* MSAA resources must be 2D tiled. */
+	if (templ->nr_samples > 1)
+		return RADEON_SURF_MODE_2D;
+
+	/* Transfer resources should be linear. */
+	if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
+		return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+	/* Avoid Z/S decompress blits by forcing TC-compatible HTILE on VI,
+	 * which requires 2D tiling.
+	 */
+	if (tc_compatible_htile && sscreen->info.chip_class == VI)
+		return RADEON_SURF_MODE_2D;
+
+	/* Handle common candidates for the linear mode.
+	 * Compressed textures and DB surfaces must always be tiled.
+	 */
+	if (!(must_tile || is_zs ||
+	      util_format_is_compressed(templ->format))) {
+		/* Textures with a very small height are recommended to be
+		 * linear; of the 2D ones only very thin and long textures
+		 * should benefit from linear_aligned.
+		 */
+		const bool is_1d = templ->target == PIPE_TEXTURE_1D ||
+				   templ->target == PIPE_TEXTURE_1D_ARRAY;
+		const bool thin_and_long = templ->width0 > 8 &&
+					   templ->height0 <= 2;
+		/* Textures likely to be mapped often. */
+		const bool mapped_often = templ->usage == PIPE_USAGE_STAGING ||
+					  templ->usage == PIPE_USAGE_STREAM;
+
+		/* Besides the cases above: tiling disabled for debugging,
+		 * 422 (SUBSAMPLED) formats (tiling doesn't work with them),
+		 * cursors (linear on SI; XXX double-check, maybe also use
+		 * RADEON_SURF_SCANOUT) and explicit linear binding.
+		 */
+		if ((sscreen->debug_flags & DBG(NO_TILING)) ||
+		    fmt_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED ||
+		    (templ->bind & (PIPE_BIND_CURSOR | PIPE_BIND_LINEAR)) ||
+		    is_1d || thin_and_long || mapped_often)
+			return RADEON_SURF_MODE_LINEAR_ALIGNED;
+	}
+
+	/* Make small textures 1D tiled. */
+	if ((sscreen->debug_flags & DBG(NO_2D_TILING)) ||
+	    templ->width0 <= 16 || templ->height0 <= 16)
+		return RADEON_SURF_MODE_1D;
+
+	/* The allocator will switch to 1D if needed. */
+	return RADEON_SURF_MODE_2D;
+}
+
+/**
+ * Create a texture with newly allocated storage.
+ *
+ * Note that for MSAA templates the sample counts in \p templ itself may be
+ * overwritten with the screen's forced EQAA values before the surface is
+ * set up.
+ *
+ * \param screen  screen to create the texture on
+ * \param templ   resource template
+ * \return the new resource, or NULL if the surface can't be initialized or
+ *         the texture object can't be created
+ */
+struct pipe_resource *si_texture_create(struct pipe_screen *screen,
+					const struct pipe_resource *templ)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	bool is_zs = util_format_is_depth_or_stencil(templ->format);
+
+	if (templ->nr_samples >= 2) {
+		/* This is hackish (overwriting the const pipe_resource template),
+		 * but should be harmless and state trackers can also see
+		 * the overridden number of samples in the created pipe_resource.
+		 */
+		if (is_zs && sscreen->eqaa_force_z_samples) {
+			((struct pipe_resource*)templ)->nr_samples =
+			((struct pipe_resource*)templ)->nr_storage_samples =
+				sscreen->eqaa_force_z_samples;
+		} else if (!is_zs && sscreen->eqaa_force_color_samples) {
+			((struct pipe_resource*)templ)->nr_samples =
+				sscreen->eqaa_force_coverage_samples;
+			((struct pipe_resource*)templ)->nr_storage_samples =
+				sscreen->eqaa_force_color_samples;
+		}
+	}
+
+	struct radeon_surf surface = {0};
+	bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+	/* Evaluated after the sample-count override above. */
+	bool tc_compatible_htile =
+		sscreen->info.chip_class >= VI &&
+		/* There are issues with TC-compatible HTILE on Tonga (and
+		 * Iceland is the same design), and documented bug workarounds
+		 * don't help. For example, this fails:
+		 *   piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
+		 */
+		sscreen->info.family != CHIP_TONGA &&
+		sscreen->info.family != CHIP_ICELAND &&
+		(templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+		!(sscreen->debug_flags & DBG(NO_HYPERZ)) &&
+		!is_flushed_depth &&
+		templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+		is_zs;
+	int r;
+
+	r = si_init_surface(sscreen, &surface, templ,
+			    si_choose_tiling(sscreen, templ, tc_compatible_htile),
+			    0, 0, false, false, is_flushed_depth,
+			    tc_compatible_htile);
+	if (r) {
+		return NULL;
+	}
+
+	/* NULL buffer: si_texture_create_object allocates the storage. */
+	return (struct pipe_resource *)
+	       si_texture_create_object(screen, templ, NULL, &surface);
+}
+
+/**
+ * Wrap an existing winsys buffer in a texture.
+ *
+ * For dedicated allocations the tiling mode and scanout flag are imported
+ * from the BO metadata; otherwise the texture is assumed to be linear. The
+ * resulting texture is marked as shared and \p usage is recorded as its
+ * external usage.
+ *
+ * \param sscreen    screen to create the texture on
+ * \param templ      resource template
+ * \param buf        buffer providing the storage
+ * \param stride     stride passed on to si_init_surface
+ * \param offset     offset passed on to si_init_surface
+ * \param usage      external usage flags stored in the texture
+ * \param dedicated  whether the buffer carries metadata for this image
+ * \return the new resource, or NULL if the surface can't be initialized or
+ *         the texture object can't be created
+ */
+static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen,
+							   const struct pipe_resource *templ,
+							   struct pb_buffer *buf,
+							   unsigned stride,
+							   unsigned offset,
+							   unsigned usage,
+							   bool dedicated)
+{
+	enum radeon_surf_mode array_mode;
+	/* "{0}" rather than the GNU "{}" extension, matching
+	 * si_texture_create.
+	 */
+	struct radeon_surf surface = {0};
+	struct radeon_bo_metadata metadata = {0};
+	struct si_texture *tex;
+	bool is_scanout;
+	int r;
+
+	if (dedicated) {
+		sscreen->ws->buffer_get_metadata(buf, &metadata);
+		si_surface_import_metadata(sscreen, &surface, &metadata,
+					   &array_mode, &is_scanout);
+	} else {
+		/**
+		 * The bo metadata is unset for un-dedicated images. So we fall
+		 * back to linear. See answer to question 5 of the
+		 * VK_KHX_external_memory spec for some details.
+		 *
+		 * It is possible that this case isn't going to work if the
+		 * surface pitch isn't correctly aligned by default.
+		 *
+		 * In order to support it correctly we require multi-image
+		 * metadata to be synchronized between radv and radeonsi. The
+		 * semantics of associating multiple image metadata to a memory
+		 * object on the vulkan export side are not concretely defined
+		 * either.
+		 *
+		 * All the use cases we are aware of at the moment for memory
+		 * objects use dedicated allocations. So lets keep the initial
+		 * implementation simple.
+		 *
+		 * A possible alternative is to attempt to reconstruct the
+		 * tiling information when the TexParameter TEXTURE_TILING_EXT
+		 * is set.
+		 */
+		array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+		is_scanout = false;
+	}
+
+	r = si_init_surface(sscreen, &surface, templ,
+			    array_mode, stride, offset, true, is_scanout,
+			    false, false);
+	if (r)
+		return NULL;
+
+	tex = si_texture_create_object(&sscreen->b, templ, buf, &surface);
+	if (!tex)
+		return NULL;
+
+	tex->buffer.b.is_shared = true;
+	tex->buffer.external_usage = usage;
+
+	/* For non-dedicated buffers the metadata is still all zeros here. */
+	si_apply_opaque_metadata(sscreen, tex, &metadata);
+
+	assert(tex->surface.tile_swizzle == 0);
+	return &tex->buffer.b.b;
+}
+
+/**
+ * pipe_screen::resource_from_handle implementation for textures.
+ *
+ * Opens the winsys buffer behind \p whandle and wraps it as a dedicated
+ * texture. Returns NULL for unsupported templates or if the buffer cannot
+ * be opened.
+ */
+static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
+						    const struct pipe_resource *templ,
+						    struct winsys_handle *whandle,
+						    unsigned usage)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	unsigned stride = 0, offset = 0;
+	struct pb_buffer *buf = NULL;
+	bool is_2d_like = templ->target == PIPE_TEXTURE_2D ||
+			  templ->target == PIPE_TEXTURE_RECT;
+
+	/* Only non-mipmapped 2D (or RECT) textures with depth 1 are accepted. */
+	if (!is_2d_like || templ->depth0 != 1 || templ->last_level != 0) {
+		return NULL;
+	}
+
+	buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, &stride, &offset);
+	if (buf == NULL) {
+		return NULL;
+	}
+
+	return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
+					     offset, usage, true);
+}
+
+/**
+ * Create the texture that receives a decompressed ("flushed") copy of a
+ * depth/stencil texture.
+ *
+ * When \p staging is NULL the result is cached in the source texture's
+ * flushed_depth_texture field, and nothing is done if one already exists.
+ * When \p staging is non-NULL a new staging texture is always created and
+ * stored through it.
+ *
+ * Returns true on success (or if the cached texture already exists) and
+ * false if the resource could not be created.
+ */
+bool si_init_flushed_depth_texture(struct pipe_context *ctx,
+				   struct pipe_resource *texture,
+				   struct si_texture **staging)
+{
+	struct si_texture *tex = (struct si_texture*)texture;
+	enum pipe_format pipe_format = texture->format;
+	struct si_texture **dst = staging;
+	struct pipe_resource resource;
+
+	if (!staging) {
+		dst = &tex->flushed_depth_texture;
+
+		/* The cached copy already exists, nothing to do. */
+		if (*dst)
+			return true;
+
+		if (!tex->can_sample_z && tex->can_sample_s) {
+			if (pipe_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+				/* Skip the S plane to save memory. */
+				pipe_format = PIPE_FORMAT_Z32_FLOAT;
+			} else if (pipe_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+				   pipe_format == PIPE_FORMAT_S8_UINT_Z24_UNORM) {
+				/* Skip copying stencil during the flush to
+				 * save memory bandwidth.
+				 *
+				 * This can cost bandwidth if an application
+				 * textures from both Z and S at the same time
+				 * (a flushed Z24S8 texture would be stored
+				 * compactly), but that is expected to be rare.
+				 */
+				pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+			}
+		} else if (!tex->can_sample_s && tex->can_sample_z) {
+			assert(util_format_has_stencil(util_format_description(pipe_format)));
+
+			/* DB->CB copies to an 8bpp surface don't work. */
+			pipe_format = PIPE_FORMAT_X24S8_UINT;
+		}
+	}
+
+	/* Build a template mirroring the source texture. */
+	memset(&resource, 0, sizeof(resource));
+	resource.target = texture->target;
+	resource.format = pipe_format;
+	resource.width0 = texture->width0;
+	resource.height0 = texture->height0;
+	resource.depth0 = texture->depth0;
+	resource.array_size = texture->array_size;
+	resource.last_level = texture->last_level;
+	resource.nr_samples = texture->nr_samples;
+	resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
+	resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+
+	if (staging) {
+		resource.usage = PIPE_USAGE_STAGING;
+		resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
+	} else {
+		resource.usage = PIPE_USAGE_DEFAULT;
+	}
+
+	*dst = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+	if (*dst != NULL)
+		return true;
+
+	PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
+	return false;
+}
+
+/**
+ * Fill in a pipe_resource template sized like \p box, meant to hold a
+ * subregion of mipmap level \p level of the texture \p orig.
+ *
+ * The template is a 2D texture, or a 2D array with box->depth layers when
+ * the box is deeper than one slice and the level has more than one layer.
+ */
+static void si_init_temp_resource_from_box(struct pipe_resource *res,
+					   struct pipe_resource *orig,
+					   const struct pipe_box *box,
+					   unsigned level, unsigned flags)
+{
+	bool layered;
+
+	memset(res, 0, sizeof(*res));
+
+	res->format = orig->format;
+	res->width0 = box->width;
+	res->height0 = box->height;
+	res->depth0 = 1;
+	res->flags = flags;
+
+	if (flags & SI_RESOURCE_FLAG_TRANSFER)
+		res->usage = PIPE_USAGE_STAGING;
+	else
+		res->usage = PIPE_USAGE_DEFAULT;
+
+	/* A 3D box needs the matching texture target and layer count. */
+	layered = box->depth > 1 && util_max_layer(orig, level) > 0;
+	res->target = layered ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D;
+	res->array_size = layered ? box->depth : 1;
+}
+
+/**
+ * Tell whether the storage of \p tex may be thrown away for this transfer:
+ * the texture must not be shared, the transfer must not read, the texture
+ * must have a single mip level, and \p box must cover all of level 0.
+ */
+static bool si_can_invalidate_texture(struct si_screen *sscreen,
+				      struct si_texture *tex,
+				      unsigned transfer_usage,
+				      const struct pipe_box *box)
+{
+	if (tex->buffer.b.is_shared)
+		return false;
+
+	if (transfer_usage & PIPE_TRANSFER_READ)
+		return false;
+
+	if (tex->buffer.b.b.last_level != 0)
+		return false;
+
+	return util_texrange_covers_whole_level(&tex->buffer.b.b, 0,
+						box->x, box->y, box->z,
+						box->width, box->height,
+						box->depth);
+}
+
+/**
+ * Give a linear, non-depth texture a fresh backing allocation while keeping
+ * the same pipe_resource. The transfer-map path uses this when the current
+ * buffer is busy and its contents may be discarded.
+ *
+ * The size of the texture is added to the context's count of bytes allocated
+ * for texture transfers.
+ */
+static void si_texture_invalidate_storage(struct si_context *sctx,
+					  struct si_texture *tex)
+{
+	struct si_screen *sscreen = sctx->screen;
+
+	/* There is no point in discarding depth and tiled buffers. */
+	assert(!tex->is_depth);
+	assert(tex->surface.is_linear);
+
+	/* Reallocate the buffer in the same pipe_resource.
+	 * NOTE(review): any failure indication from si_alloc_resource is not
+	 * checked here - confirm that is intended.
+	 */
+	si_alloc_resource(sscreen, &tex->buffer);
+
+	/* Initialize the CMASK base address (needed even without CMASK). */
+	tex->cmask_base_address_reg =
+		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;
+
+	/* Bump the screen-wide dirty texture counter, since the GPU address
+	 * used above comes from the new allocation.
+	 */
+	p_atomic_inc(&sscreen->dirty_tex_counter);
+
+	sctx->num_alloc_tex_transfer_bytes += tex->size;
+}
+
+/**
+ * pipe_context::transfer_map implementation for textures.
+ *
+ * Depending on the texture, the mapping is served in one of three ways:
+ *  - depth textures always go through a flushed-depth staging texture
+ *    (MSAA depth is additionally resolved through a temporary texture when
+ *    the transfer reads);
+ *  - tiled textures, reads from VRAM / GTT WC, and writes to a busy buffer
+ *    that cannot be invalidated use a temporary staging texture;
+ *  - everything else maps the texture's own buffer directly.
+ *
+ * \param ctx        context performing the map
+ * \param texture    texture to map (must not itself be a transfer texture)
+ * \param level      mipmap level to map
+ * \param usage      PIPE_TRANSFER_* flags
+ * \param box        region to map; all dimensions must be non-zero
+ * \param ptransfer  receives the transfer object on success
+ *
+ * \return CPU pointer to the start of the mapped region, or NULL on failure
+ *         (in which case *ptransfer is left untouched).
+ */
+static void *si_texture_transfer_map(struct pipe_context *ctx,
+				     struct pipe_resource *texture,
+				     unsigned level,
+				     unsigned usage,
+				     const struct pipe_box *box,
+				     struct pipe_transfer **ptransfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct si_texture *tex = (struct si_texture*)texture;
+	struct si_transfer *trans;
+	struct r600_resource *buf;
+	unsigned offset = 0;
+	char *map;
+	bool use_staging_texture = false;
+
+	assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
+	assert(box->width && box->height && box->depth);
+
+	/* Depth textures use staging unconditionally. */
+	if (!tex->is_depth) {
+		/* Degrade the tile mode if we get too many transfers on APUs.
+		 * On dGPUs, the staging texture is always faster.
+		 * Only count uploads that are at least 4x4 pixels large.
+		 */
+		if (!sctx->screen->info.has_dedicated_vram &&
+		    level == 0 &&
+		    box->width >= 4 && box->height >= 4 &&
+		    p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
+			bool can_invalidate =
+				si_can_invalidate_texture(sctx->screen, tex,
+							    usage, box);
+
+			si_reallocate_texture_inplace(sctx, tex,
+							PIPE_BIND_LINEAR,
+							can_invalidate);
+		}
+
+		/* Tiled textures need to be converted into a linear texture for CPU
+		 * access. The staging texture is always linear and is placed in GART.
+		 *
+		 * Reading from VRAM or GTT WC is slow, always use the staging
+		 * texture in this case.
+		 *
+		 * Use the staging texture for uploads if the underlying BO
+		 * is busy.
+		 */
+		if (!tex->surface.is_linear)
+			use_staging_texture = true;
+		else if (usage & PIPE_TRANSFER_READ)
+			use_staging_texture =
+				tex->buffer.domains & RADEON_DOMAIN_VRAM ||
+				tex->buffer.flags & RADEON_FLAG_GTT_WC;
+		/* Write & linear only: */
+		else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf,
+						       RADEON_USAGE_READWRITE) ||
+			 !sctx->ws->buffer_wait(tex->buffer.buf, 0,
+						RADEON_USAGE_READWRITE)) {
+			/* It's busy. */
+			if (si_can_invalidate_texture(sctx->screen, tex,
+							usage, box))
+				si_texture_invalidate_storage(sctx, tex);
+			else
+				use_staging_texture = true;
+		}
+	}
+
+	trans = CALLOC_STRUCT(si_transfer);
+	if (!trans)
+		return NULL;
+	pipe_resource_reference(&trans->b.b.resource, texture);
+	trans->b.b.level = level;
+	trans->b.b.usage = usage;
+	trans->b.b.box = *box;
+
+	if (tex->is_depth) {
+		struct si_texture *staging_depth;
+
+		if (tex->buffer.b.b.nr_samples > 1) {
+			/* MSAA depth buffers need to be converted to single sample buffers.
+			 *
+			 * Mapping MSAA depth buffers can occur if ReadPixels is called
+			 * with a multisample GLX visual.
+			 *
+			 * First downsample the depth buffer to a temporary texture,
+			 * then decompress the temporary one to staging.
+			 *
+			 * Only the region being mapped is transferred.
+			 */
+			struct pipe_resource resource;
+
+			si_init_temp_resource_from_box(&resource, texture, box, level, 0);
+
+			if (!si_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
+				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+				goto fail_trans;
+			}
+
+			/* Hand ownership of the staging texture to the transfer
+			 * right away, so that every later failure path releases
+			 * it through fail_trans instead of leaking it.
+			 */
+			trans->staging = &staging_depth->buffer;
+
+			if (usage & PIPE_TRANSFER_READ) {
+				struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);
+				if (!temp) {
+					PRINT_ERR("failed to create a temporary depth texture\n");
+					goto fail_trans;
+				}
+
+				si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
+				si_blit_decompress_depth(ctx, (struct si_texture*)temp, staging_depth,
+							 0, 0, 0, box->depth, 0, 0);
+				pipe_resource_reference(&temp, NULL);
+			}
+
+			/* Just get the strides. */
+			si_texture_get_offset(sctx->screen, staging_depth, level, NULL,
+						&trans->b.b.stride,
+						&trans->b.b.layer_stride);
+		} else {
+			/* XXX: only readback the rectangle which is being mapped? */
+			/* XXX: when discard is true, no need to read back from depth texture */
+			if (!si_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
+				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+				goto fail_trans;
+			}
+
+			/* Owned by the transfer from here on (see above). */
+			trans->staging = &staging_depth->buffer;
+
+			si_blit_decompress_depth(ctx, tex, staging_depth,
+						 level, level,
+						 box->z, box->z + box->depth - 1,
+						 0, 0);
+
+			offset = si_texture_get_offset(sctx->screen, staging_depth,
+							 level, box,
+							 &trans->b.b.stride,
+							 &trans->b.b.layer_stride);
+		}
+
+		buf = trans->staging;
+	} else if (use_staging_texture) {
+		struct pipe_resource resource;
+		struct si_texture *staging;
+
+		si_init_temp_resource_from_box(&resource, texture, box, level,
+						 SI_RESOURCE_FLAG_TRANSFER);
+		resource.usage = (usage & PIPE_TRANSFER_READ) ?
+			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
+
+		/* Create the temporary texture. */
+		staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
+		if (!staging) {
+			PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+			goto fail_trans;
+		}
+		trans->staging = &staging->buffer;
+
+		/* Just get the strides. */
+		si_texture_get_offset(sctx->screen, staging, 0, NULL,
+					&trans->b.b.stride,
+					&trans->b.b.layer_stride);
+
+		/* Reads need the data copied in first; a write-only mapping of
+		 * the fresh staging texture is flagged as unsynchronized.
+		 */
+		if (usage & PIPE_TRANSFER_READ)
+			si_copy_to_staging_texture(ctx, trans);
+		else
+			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+		buf = trans->staging;
+	} else {
+		/* the resource is mapped directly */
+		offset = si_texture_get_offset(sctx->screen, tex, level, box,
+						 &trans->b.b.stride,
+						 &trans->b.b.layer_stride);
+		buf = &tex->buffer;
+	}
+
+	if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
+		goto fail_trans;
+
+	*ptransfer = &trans->b.b;
+	return map + offset;
+
+fail_trans:
+	/* Drops the staging texture (if any) and the reference on the
+	 * mapped texture, then frees the transfer itself.
+	 */
+	r600_resource_reference(&trans->staging, NULL);
+	pipe_resource_reference(&trans->b.b.resource, NULL);
+	FREE(trans);
+	return NULL;
+}
+
+/**
+ * pipe_context::transfer_unmap implementation for textures.
+ *
+ * Writes staged data back to the texture if the transfer was a write,
+ * releases the staging buffer, optionally flushes the gfx IB when too much
+ * transfer memory has been allocated, and finally destroys the transfer.
+ */
+static void si_texture_transfer_unmap(struct pipe_context *ctx,
+				      struct pipe_transfer* transfer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct si_transfer *stransfer = (struct si_transfer*)transfer;
+	struct pipe_resource *texture = transfer->resource;
+	struct si_texture *tex = (struct si_texture*)texture;
+
+	if (stransfer->staging) {
+		if (transfer->usage & PIPE_TRANSFER_WRITE) {
+			bool single_sample_depth =
+				tex->is_depth && tex->buffer.b.b.nr_samples <= 1;
+
+			if (single_sample_depth) {
+				ctx->resource_copy_region(ctx, texture, transfer->level,
+							  transfer->box.x, transfer->box.y, transfer->box.z,
+							  &stransfer->staging->b.b, transfer->level,
+							  &transfer->box);
+			} else {
+				si_copy_from_staging_texture(ctx, stransfer);
+			}
+		}
+
+		/* Account for the staging allocation before dropping it. */
+		sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
+		r600_resource_reference(&stransfer->staging, NULL);
+	}
+
+	/* Heuristic for {upload, draw, upload, draw, ..}:
+	 *
+	 * Once more than a quarter of the GART size worth of texture storage
+	 * has been allocated for transfers, flush the gfx IB.
+	 *
+	 * This keeps IBs from referencing too much memory and pressuring the
+	 * kernel memory manager, and lets temporary and invalidated buffers
+	 * go idle quickly so they can be freed or reused. Actual memory usage
+	 * is slightly higher than counted here because of the winsys buffer
+	 * cache.
+	 *
+	 * The result is that the kernel memory manager is never a bottleneck.
+	 */
+	if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
+		si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+		sctx->num_alloc_tex_transfer_bytes = 0;
+	}
+
+	pipe_resource_reference(&transfer->resource, NULL);
+	FREE(transfer);
+}
+
+/* Resource function table for textures. The entries are positional, so their
+ * order must match the member order of struct u_resource_vtbl.
+ */
+static const struct u_resource_vtbl si_texture_vtbl =
+{
+	NULL,				/* get_handle */
+	si_texture_destroy,		/* resource_destroy */
+	si_texture_transfer_map,	/* transfer_map */
+	u_default_transfer_flush_region, /* transfer_flush_region */
+	si_texture_transfer_unmap,	/* transfer_unmap */
+};
+
+/* Tell whether a surface with DCC enabled may be reinterpreted from one
+ * format to the other.
+ */
+bool vi_dcc_formats_compatible(enum pipe_format format1,
+			       enum pipe_format format2)
+{
+	const struct util_format_description *a, *b;
+	bool a_is_float, b_is_float;
+
+	/* Identical formats are trivially compatible. */
+	if (format1 == format2) {
+		return true;
+	}
+
+	/* Compare again once both formats have been simplified. */
+	format1 = si_simplify_cb_format(format1);
+	format2 = si_simplify_cb_format(format2);
+	if (format1 == format2) {
+		return true;
+	}
+
+	a = util_format_description(format1);
+	b = util_format_description(format2);
+
+	/* Only plain layouts can be reinterpreted. */
+	if (a->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+	    b->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+		return false;
+	}
+
+	/* Float and non-float are totally incompatible. */
+	a_is_float = a->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+	b_is_float = b->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+	if (a_is_float != b_is_float) {
+		return false;
+	}
+
+	/* Channel sizes must match across DCC formats.
+	 * Looking at the first 2 channels should be enough.
+	 */
+	if (a->channel[0].size != b->channel[0].size) {
+		return false;
+	}
+	if (a->nr_channels >= 2 &&
+	    a->channel[1].size != b->channel[1].size) {
+		return false;
+	}
+
+	/* The remaining checks only matter if the driver ever uses the DCC
+	 * clear code with the value of 1.
+	 */
+
+	/* This constraint can be ignored if the clear values are all 1 or
+	 * all 0. */
+	if (vi_alpha_is_on_msb(format1) != vi_alpha_is_on_msb(format2)) {
+		return false;
+	}
+
+	/* Channel types must match if the clear value of 1 is used.
+	 * The type categories are only float, signed, unsigned.
+	 * NORM and INT are always compatible.
+	 */
+	if (a->channel[0].type != b->channel[0].type) {
+		return false;
+	}
+	if (a->nr_channels >= 2 &&
+	    a->channel[1].type != b->channel[1].type) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Return true if DCC is enabled for the given level of the texture and the
+ * view format cannot be used with the texture's own format under DCC.
+ */
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
+				     unsigned level,
+				     enum pipe_format view_format)
+{
+	struct si_texture *stex = (struct si_texture *)tex;
+
+	/* Without DCC there is nothing to be incompatible with. */
+	if (!vi_dcc_enabled(stex, level)) {
+		return false;
+	}
+
+	return !vi_dcc_formats_compatible(tex->format, view_format);
+}
+
+/* Turn DCC off for the texture if the view format is DCC-incompatible,
+ * decompressing DCC when disabling it is not possible.
+ *
+ * Kept separate from vi_dcc_formats_compatible, because that function should
+ * be called only when DCC is enabled. */
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
+					   struct pipe_resource *tex,
+					   unsigned level,
+					   enum pipe_format view_format)
+{
+	struct si_texture *stex = (struct si_texture *)tex;
+
+	if (!vi_dcc_formats_are_incompatible(tex, level, view_format)) {
+		return;
+	}
+
+	if (si_texture_disable_dcc(sctx, stex)) {
+		return;
+	}
+
+	si_decompress_dcc(sctx, stex);
+}
+
+/**
+ * Allocate a surface for \p texture with explicitly supplied dimensions.
+ *
+ * \p width and \p height become the surface size; \p width0 and \p height0
+ * are stored separately in the si_surface. The surface holds a reference on
+ * the texture. Returns NULL if the allocation fails.
+ */
+struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
+					      struct pipe_resource *texture,
+					      const struct pipe_surface *templ,
+					      unsigned width0, unsigned height0,
+					      unsigned width, unsigned height)
+{
+	struct si_surface *surf = CALLOC_STRUCT(si_surface);
+
+	if (surf == NULL) {
+		return NULL;
+	}
+
+	assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
+	assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
+
+	pipe_reference_init(&surf->base.reference, 1);
+	pipe_resource_reference(&surf->base.texture, texture);
+
+	surf->base.context = pipe;
+	surf->base.format = templ->format;
+	surf->base.u = templ->u;
+	surf->base.width = width;
+	surf->base.height = height;
+	surf->width0 = width0;
+	surf->height0 = height0;
+
+	/* Buffers never have DCC, so only textures are checked. */
+	surf->dcc_incompatible =
+		texture->target != PIPE_BUFFER &&
+		vi_dcc_formats_are_incompatible(texture, templ->u.tex.level,
+						templ->format);
+
+	return &surf->base;
+}
+
+/**
+ * pipe_context::create_surface implementation.
+ *
+ * Derives the surface dimensions from the selected mip level and, when the
+ * view format uses a different block size than the texture format, rescales
+ * them in units of blocks before creating the surface.
+ */
+static struct pipe_surface *si_create_surface(struct pipe_context *pipe,
+					      struct pipe_resource *tex,
+					      const struct pipe_surface *templ)
+{
+	const struct util_format_description *tex_desc, *templ_desc;
+	unsigned level = templ->u.tex.level;
+	unsigned width0 = tex->width0;
+	unsigned height0 = tex->height0;
+	unsigned width = u_minify(tex->width0, level);
+	unsigned height = u_minify(tex->height0, level);
+
+	/* Buffers and same-format views need no size adjustment. */
+	if (tex->target == PIPE_BUFFER || templ->format == tex->format) {
+		return si_create_surface_custom(pipe, tex, templ,
+						  width0, height0,
+						  width, height);
+	}
+
+	tex_desc = util_format_description(tex->format);
+	templ_desc = util_format_description(templ->format);
+
+	assert(tex_desc->block.bits == templ_desc->block.bits);
+
+	/* The surface size is adjusted if and only if the block width or
+	 * height differs between the two formats. */
+	if (tex_desc->block.width != templ_desc->block.width ||
+	    tex_desc->block.height != templ_desc->block.height) {
+		unsigned blocks_x = util_format_get_nblocksx(tex->format, width);
+		unsigned blocks_y = util_format_get_nblocksy(tex->format, height);
+
+		width = blocks_x * templ_desc->block.width;
+		height = blocks_y * templ_desc->block.height;
+
+		width0 = util_format_get_nblocksx(tex->format, width0);
+		height0 = util_format_get_nblocksy(tex->format, height0);
+	}
+
+	return si_create_surface_custom(pipe, tex, templ,
+					  width0, height0,
+					  width, height);
+}
+
+/* pipe_context::surface_destroy implementation: drop the surface's reference
+ * on its texture and free the surface itself.
+ */
+static void si_surface_destroy(struct pipe_context *pipe,
+			       struct pipe_surface *surface)
+{
+	pipe_resource_reference(&surface->texture, NULL);
+	FREE(surface);
+}
+
+/**
+ * Translate the channel swizzle of a pipe format into a V_028C70_SWAP_*
+ * value.
+ *
+ * \p do_endian_swap selects the alternative swap for the few layouts where
+ * the two differ. Returns ~0U when the format is not plain (other than
+ * R11G11B10_FLOAT) or its swizzle matches no known layout.
+ */
+unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap)
+{
+	const struct util_format_description *desc = util_format_description(format);
+
+#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
+
+	/* R11G11B10_FLOAT isn't plain, but is handled anyway. */
+	if (format == PIPE_FORMAT_R11G11B10_FLOAT)
+		return V_028C70_SWAP_STD;
+
+	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+		return ~0U;
+
+	switch (desc->nr_channels) {
+	case 1:
+		if (HAS_SWIZZLE(0,X))
+			return V_028C70_SWAP_STD; /* X___ */
+		if (HAS_SWIZZLE(3,X))
+			return V_028C70_SWAP_ALT_REV; /* ___X */
+		break;
+
+	case 2:
+		/* XY__ */
+		if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) ||
+		    (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) ||
+		    (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y)))
+			return V_028C70_SWAP_STD;
+
+		/* YX__ */
+		if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) ||
+		    (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) ||
+		    (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X)))
+			return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
+
+		if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y))
+			return V_028C70_SWAP_ALT; /* X__Y */
+		if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X))
+			return V_028C70_SWAP_ALT_REV; /* Y__X */
+		break;
+
+	case 3:
+		if (HAS_SWIZZLE(0,X))
+			return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
+		if (HAS_SWIZZLE(0,Z))
+			return V_028C70_SWAP_STD_REV; /* ZYX */
+		break;
+
+	case 4:
+		/* Only the middle channels are inspected, because the 1st
+		 * and 4th channel can be NONE. */
+		if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z))
+			return V_028C70_SWAP_STD; /* XYZW */
+		if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y))
+			return V_028C70_SWAP_STD_REV; /* WZYX */
+		if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X))
+			return V_028C70_SWAP_ALT; /* ZYXW */
+		if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) {
+			/* YZWX */
+			if (desc->is_array)
+				return V_028C70_SWAP_ALT_REV;
+			return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
+		}
+		break;
+	}
+
+	/* No known layout matched. */
+	return ~0U;
+}
+
+/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
+
+/* Release everything held by one DCC statistics slot of the context: stop
+ * the query if it is active, destroy all queued queries and drop the
+ * reference on the slot's texture.
+ */
+static void vi_dcc_clean_up_context_slot(struct si_context *sctx,
+					 int slot)
+{
+	int q;
+
+	if (sctx->dcc_stats[slot].query_active) {
+		vi_separate_dcc_stop_query(sctx,
+					   sctx->dcc_stats[slot].tex);
+	}
+
+	for (q = 0; q < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); q++) {
+		if (!sctx->dcc_stats[slot].ps_stats[q])
+			continue;
+
+		sctx->b.destroy_query(&sctx->b,
+				      sctx->dcc_stats[slot].ps_stats[q]);
+		sctx->dcc_stats[slot].ps_stats[q] = NULL;
+	}
+
+	si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
+}
+
+/**
+ * Look up (or assign) the per-context slot holding the DCC statistics
+ * queries of a texture, and refresh that slot's last-use timestamp.
+ *
+ * If the texture has no slot yet, the first free slot is used; if none is
+ * free, the slot with the oldest timestamp is cleaned up and reused.
+ */
+static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx,
+					       struct si_texture *tex)
+{
+	int i, empty_slot = -1;
+
+	/* Drop zombie textures, i.e. those kept alive by this array only. */
+	for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+		if (sctx->dcc_stats[i].tex &&
+		    sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) {
+			vi_dcc_clean_up_context_slot(sctx, i);
+		}
+	}
+
+	/* Search for the texture, remembering the first free slot seen. */
+	for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+		if (sctx->dcc_stats[i].tex == tex) {
+			sctx->dcc_stats[i].last_use_timestamp = os_time_get();
+			return i;
+		}
+
+		if (empty_slot == -1 && !sctx->dcc_stats[i].tex) {
+			empty_slot = i;
+		}
+	}
+
+	/* Not found and no room left: evict the least recently used slot. */
+	if (empty_slot == -1) {
+		int victim = 0;
+
+		for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+			if (sctx->dcc_stats[i].last_use_timestamp <
+			    sctx->dcc_stats[victim].last_use_timestamp) {
+				victim = i;
+			}
+		}
+
+		vi_dcc_clean_up_context_slot(sctx, victim);
+		empty_slot = victim;
+	}
+
+	/* Install the texture in the chosen slot. */
+	si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
+	sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+	return empty_slot;
+}
+
+/* Create a pipeline-statistics query flagged with
+ * SI_QUERY_HW_FLAG_BEGIN_RESUMES.
+ *
+ * NOTE(review): the result of create_query is used without a NULL check -
+ * confirm whether allocation failure needs handling here.
+ */
+static struct pipe_query *
+vi_create_resuming_pipestats_query(struct si_context *sctx)
+{
+	struct pipe_query *pq =
+		sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+	struct si_query_hw *hw_query = (struct si_query_hw*)pq;
+
+	hw_query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
+	return pq;
+}
+
+/**
+ * Called when binding a color buffer.
+ *
+ * Begins (or resumes) the DCC statistics query of the texture, creating the
+ * query on first use. The query must not already be active.
+ */
+void vi_separate_dcc_start_query(struct si_context *sctx,
+				 struct si_texture *tex)
+{
+	unsigned slot = vi_get_context_dcc_stats_index(sctx, tex);
+
+	assert(!sctx->dcc_stats[slot].query_active);
+
+	/* Create the head query lazily. */
+	if (sctx->dcc_stats[slot].ps_stats[0] == NULL) {
+		sctx->dcc_stats[slot].ps_stats[0] =
+			vi_create_resuming_pipestats_query(sctx);
+	}
+
+	/* begin or resume the query */
+	sctx->b.begin_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[0]);
+	sctx->dcc_stats[slot].query_active = true;
+}
+
+/**
+ * Called when unbinding a color buffer.
+ *
+ * Pauses (or ends) the DCC statistics query of the texture. The query must
+ * exist and be active.
+ */
+void vi_separate_dcc_stop_query(struct si_context *sctx,
+				struct si_texture *tex)
+{
+	unsigned slot = vi_get_context_dcc_stats_index(sctx, tex);
+
+	assert(sctx->dcc_stats[slot].query_active);
+	assert(sctx->dcc_stats[slot].ps_stats[0]);
+
+	/* pause or end the query */
+	sctx->b.end_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[0]);
+	sctx->dcc_stats[slot].query_active = false;
+}
+
+/* Decide from the gathered statistics whether separate DCC is worthwhile. */
+static bool vi_should_enable_separate_dcc(struct si_texture *tex)
+{
+	/* Minimum number of fullscreen draws per frame (slow clears are
+	 * counted as well) required to enable DCC. */
+	enum { min_fullscreen_draws = 5 };
+
+	return tex->num_slow_clears + tex->ps_draw_ratio >= min_fullscreen_draws;
+}
+
+/* Called by fast clear.
+ *
+ * Starts DCC statistics gathering for an eligible texture and, once the
+ * statistics say it is worthwhile, attaches a separate DCC buffer to it.
+ */
+void vi_separate_dcc_try_enable(struct si_context *sctx,
+				struct si_texture *tex)
+{
+	/* This is meant for shared displayable back buffers, although it is
+	 * not strictly limited to them.
+	 */
+	if (!tex->buffer.b.is_shared ||
+	    !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+		return;
+
+	/* Only single-level 2D textures that have a DCC size qualify. */
+	if (tex->buffer.b.b.target != PIPE_TEXTURE_2D ||
+	    tex->buffer.b.b.last_level > 0 ||
+	    !tex->surface.dcc_size)
+		return;
+
+	/* Respect the debug switches. */
+	if (sctx->screen->debug_flags & DBG(NO_DCC) ||
+	    sctx->screen->debug_flags & DBG(NO_DCC_FB))
+		return;
+
+	assert(sctx->chip_class >= VI);
+
+	/* A non-zero DCC offset means DCC is already enabled. */
+	if (tex->dcc_offset)
+		return;
+
+	/* Turn on the DCC stat gathering the first time through. */
+	if (!tex->dcc_gather_statistics) {
+		tex->dcc_gather_statistics = true;
+		vi_separate_dcc_start_query(sctx, tex);
+	}
+
+	/* stats show that DCC decompression is too expensive */
+	if (!vi_should_enable_separate_dcc(tex))
+		return;
+
+	assert(tex->surface.num_dcc_levels);
+	assert(!tex->dcc_separate_buffer);
+
+	si_texture_discard_cmask(sctx->screen, tex);
+
+	/* Obtain a DCC buffer: allocate one unless a previously used one
+	 * is still around. */
+	if (!tex->last_dcc_separate_buffer) {
+		tex->dcc_separate_buffer =
+			si_aligned_buffer_create(sctx->b.screen,
+						   SI_RESOURCE_FLAG_UNMAPPABLE,
+						   PIPE_USAGE_DEFAULT,
+						   tex->surface.dcc_size,
+						   tex->surface.dcc_alignment);
+		if (!tex->dcc_separate_buffer)
+			return;
+	} else {
+		assert(tex->dcc_gather_statistics);
+		assert(!tex->dcc_separate_buffer);
+		tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+		tex->last_dcc_separate_buffer = NULL;
+	}
+
+	/* dcc_offset is the absolute GPUVM address. */
+	tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+	/* Nothing needs to be flagged here, because the caller (fast clear)
+	 * flags framebuffer state.
+	 */
+}
+
+/**
+ * Called by pipe_context::flush_resource, the place where DCC decompression
+ * takes place.
+ *
+ * Reads the oldest queued pipeline-statistics query (ps_stats[2]) of the
+ * texture, recomputes the texture's ps_draw_ratio from it, resets the slow
+ * clear counter, rotates the three queued queries by one position, and
+ * detaches the separate DCC buffer if the new statistics no longer justify
+ * it. The detached buffer is kept in last_dcc_separate_buffer.
+ */
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+					     struct si_texture *tex)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct pipe_query *tmp;
+	unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+	bool query_active = sctx->dcc_stats[i].query_active;
+	bool disable = false;
+
+	if (sctx->dcc_stats[i].ps_stats[2]) {
+		union pipe_query_result result;
+
+		/* Read the results.
+		 * NOTE(review): the return value of get_query_result is not
+		 * checked, so "result" is assumed to be filled in - confirm.
+		 */
+		ctx->get_query_result(ctx, sctx->dcc_stats[i].ps_stats[2],
+				      true, &result);
+		si_query_hw_reset_buffers(sctx,
+					  (struct si_query_hw*)
+					  sctx->dcc_stats[i].ps_stats[2]);
+
+		/* Compute the approximate number of fullscreen draws. */
+		tex->ps_draw_ratio =
+			result.pipeline_statistics.ps_invocations /
+			(tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
+		sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
+
+		/* Only a texture that currently has a separate DCC buffer
+		 * can have it disabled.
+		 */
+		disable = tex->dcc_separate_buffer &&
+			  !vi_should_enable_separate_dcc(tex);
+	}
+
+	tex->num_slow_clears = 0;
+
+	/* stop the statistics query for ps_stats[0] */
+	if (query_active)
+		vi_separate_dcc_stop_query(sctx, tex);
+
+	/* Move the queries in the queue by one. */
+	tmp = sctx->dcc_stats[i].ps_stats[2];
+	sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
+	sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
+	sctx->dcc_stats[i].ps_stats[0] = tmp;
+
+	/* create and start a new query as ps_stats[0] */
+	if (query_active)
+		vi_separate_dcc_start_query(sctx, tex);
+
+	if (disable) {
+		assert(!tex->last_dcc_separate_buffer);
+		tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+		tex->dcc_separate_buffer = NULL;
+		tex->dcc_offset = 0;
+		/* no need to flag anything since this is called after
+		 * decompression that re-sets framebuffer state
+		 */
+	}
+}
+
+/**
+ * pipe_screen::memobj_create_from_handle implementation.
+ *
+ * Opens the winsys buffer behind \p whandle and wraps it in a memory object
+ * that owns the buffer reference and remembers the stride reported by the
+ * winsys.
+ *
+ * \param screen     screen the object is created for
+ * \param whandle    handle of the buffer to import
+ * \param dedicated  whether the allocation is dedicated to a single image
+ *
+ * \return the new memory object, or NULL if allocation or the import fails.
+ */
+static struct pipe_memory_object *
+si_memobj_from_handle(struct pipe_screen *screen,
+		      struct winsys_handle *whandle,
+		      bool dedicated)
+{
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
+	struct pb_buffer *buf = NULL;
+	/* Zero-initialized like in si_texture_from_handle; the offset reported
+	 * by the winsys is not stored (si_texture_from_memobj takes its own). */
+	uint32_t stride = 0, offset = 0;
+
+	if (!memobj)
+		return NULL;
+
+	buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
+					      &stride, &offset);
+	if (!buf) {
+		/* Allocated with CALLOC_STRUCT, so release with the matching
+		 * FREE macro rather than raw free(). */
+		FREE(memobj);
+		return NULL;
+	}
+
+	memobj->b.dedicated = dedicated;
+	memobj->buf = buf;
+	memobj->stride = stride;
+
+	return (struct pipe_memory_object *)memobj;
+}
+
+/**
+ * pipe_screen::memobj_destroy implementation: drop the memory object's
+ * buffer reference and free the object.
+ */
+static void
+si_memobj_destroy(struct pipe_screen *screen,
+		  struct pipe_memory_object *_memobj)
+{
+	struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+
+	pb_reference(&memobj->buf, NULL);
+	/* The object is allocated with CALLOC_STRUCT in si_memobj_from_handle,
+	 * so it is released with the matching FREE macro. */
+	FREE(memobj);
+}
+
+/**
+ * pipe_screen::resource_from_memobj implementation.
+ *
+ * Creates a texture on top of the memory object's buffer at \p offset. On
+ * success an additional reference is taken on the buffer, so that the memory
+ * object keeps its own.
+ */
+static struct pipe_resource *
+si_texture_from_memobj(struct pipe_screen *screen,
+		       const struct pipe_resource *templ,
+		       struct pipe_memory_object *_memobj,
+		       uint64_t offset)
+{
+	struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+	struct si_screen *sscreen = (struct si_screen*)screen;
+	struct pb_buffer *extra_ref = NULL;
+	struct pipe_resource *tex;
+
+	tex = si_texture_from_winsys_buffer(sscreen, templ, memobj->buf,
+					    memobj->stride, offset,
+					    PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE |
+					    PIPE_HANDLE_USAGE_SHADER_WRITE,
+					    memobj->b.dedicated);
+
+	/* si_texture_from_winsys_buffer does not bump the refcount of
+	 * memobj->buf, so that is done here once the texture exists.
+	 */
+	if (tex != NULL) {
+		pb_reference(&extra_ref, memobj->buf);
+	}
+
+	return tex;
+}
+
+/**
+ * pipe_screen::check_resource_capability implementation: tell whether an
+ * existing resource can be used with the given bind flags.
+ */
+static bool si_check_resource_capability(struct pipe_screen *screen,
+					 struct pipe_resource *resource,
+					 unsigned bind)
+{
+	struct si_texture *tex = (struct si_texture*)resource;
+
+	/* For buffers, the linear flag is the only one supported. */
+	if (resource->target == PIPE_BUFFER) {
+		return (bind & ~PIPE_BIND_LINEAR) == 0;
+	}
+
+	/* A linear request needs a linear surface. */
+	if ((bind & PIPE_BIND_LINEAR) && !tex->surface.is_linear) {
+		return false;
+	}
+
+	/* A scanout request needs a displayable surface. */
+	if ((bind & PIPE_BIND_SCANOUT) && !tex->surface.is_displayable) {
+		return false;
+	}
+
+	/* TODO: PIPE_BIND_CURSOR - do we care? */
+	return true;
+}
+
+/* Install the texture-related entry points into the pipe_screen. */
+void si_init_screen_texture_functions(struct si_screen *sscreen)
+{
+	/* Import/export of resources through winsys handles. */
+	sscreen->b.resource_get_handle = si_texture_get_handle;
+	sscreen->b.resource_from_handle = si_texture_from_handle;
+
+	/* Memory objects. */
+	sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
+	sscreen->b.memobj_destroy = si_memobj_destroy;
+	sscreen->b.resource_from_memobj = si_texture_from_memobj;
+
+	/* Capability query. */
+	sscreen->b.check_resource_capability = si_check_resource_capability;
+}
+
+/* Install the surface creation and destruction entry points into the
+ * pipe_context.
+ */
+void si_init_context_texture_functions(struct si_context *sctx)
+{
+	sctx->b.create_surface = si_create_surface;
+	sctx->b.surface_destroy = si_surface_destroy;
+}
author	Jonathan Gray <jsg@cvs.openbsd.org>	2019-01-29 11:08:07 +0000
committer	Jonathan Gray <jsg@cvs.openbsd.org>	2019-01-29 11:08:07 +0000
commit	6b139c2063623e9310025247cd966490b9aa57ea (patch)
tree	375acfd898ca3d721250aa17291bbb90a8d7250a /lib/mesa/src/gallium/drivers/radeonsi
parent	cce99579dcfb1d54c54cff65573be3430e77f2c5 (diff)