author     Jonathan Gray <jsg@cvs.openbsd.org>    2019-01-29 11:08:07 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2019-01-29 11:08:07 +0000
commit     6b139c2063623e9310025247cd966490b9aa57ea (patch)
tree       375acfd898ca3d721250aa17291bbb90a8d7250a /lib/mesa/src/gallium/drivers/radeonsi
parent     cce99579dcfb1d54c54cff65573be3430e77f2c5 (diff)
Import Mesa 18.3.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeonsi')
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h        1
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/meson.build             120
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c             761
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h          229
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_clear.c              758
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c       285
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c             235
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h              8
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_fence.c              656
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_get.c               1004
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c             394
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c           281
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_query.c             1894
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_query.h              320
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c         891
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c     441
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c      209
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c         264
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c     75
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c     296
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c           415
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c      475
-rw-r--r--  lib/mesa/src/gallium/drivers/radeonsi/si_texture.c           2424
23 files changed, 11910 insertions, 526 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
index 7f57b4ea8..8c5078c13 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
@@ -3,6 +3,7 @@ DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+ DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
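
Note on the hunk above: DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false") only declares the new driconf option; the driver reads it at screen-creation time through the usual xmlconfig query helpers. A minimal, hedged sketch of that consumption, assuming the standard driQueryOptionb() helper; the DBG(ZERO_VRAM) flag name is illustrative, not code from this import:

/* Hedged sketch: turn the boolean driconf option declared above into a
 * driver debug flag at screen creation.  The option string matches the
 * declaration; the flag name is assumed for illustration. */
#include "util/xmlconfig.h"
#include "radeonsi/si_pipe.h"

static void si_apply_zero_vram_option(struct si_screen *sscreen,
                                      const struct driOptionCache *options)
{
	if (driQueryOptionb(options, "radeonsi_zero_all_vram_allocs"))
		sscreen->debug_flags |= DBG(ZERO_VRAM); /* assumed flag name */
}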
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/meson.build b/lib/mesa/src/gallium/drivers/radeonsi/meson.build
new file mode 100644
index 000000000..ac8ed949e
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/meson.build
@@ -0,0 +1,120 @@
+# Copyright © 2017 Dylan Baker
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+files_libradeonsi = files(
+ 'cik_sdma.c',
+ 'driinfo_radeonsi.h',
+ 'si_blit.c',
+ 'si_buffer.c',
+ 'si_build_pm4.h',
+ 'si_clear.c',
+ 'si_compute.c',
+ 'si_compute.h',
+ 'si_compute_blit.c',
+ 'si_cp_dma.c',
+ 'si_debug.c',
+ 'si_descriptors.c',
+ 'si_dma.c',
+ 'si_dma_cs.c',
+ 'si_fence.c',
+ 'si_get.c',
+ 'si_gfx_cs.c',
+ 'si_gpu_load.c',
+ 'si_perfcounter.c',
+ 'si_pipe.c',
+ 'si_pipe.h',
+ 'si_pm4.c',
+ 'si_pm4.h',
+ 'si_public.h',
+ 'si_query.c',
+ 'si_query.h',
+ 'si_shader.c',
+ 'si_shader.h',
+ 'si_shader_internal.h',
+ 'si_shader_nir.c',
+ 'si_shader_tgsi_alu.c',
+ 'si_shader_tgsi_mem.c',
+ 'si_shader_tgsi_setup.c',
+ 'si_shaderlib_tgsi.c',
+ 'si_state.c',
+ 'si_state.h',
+ 'si_state_binning.c',
+ 'si_state_draw.c',
+ 'si_state_msaa.c',
+ 'si_state_shaders.c',
+ 'si_state_streamout.c',
+ 'si_state_viewport.c',
+ 'si_test_dma.c',
+ 'si_test_dma_perf.c',
+ 'si_texture.c',
+ 'si_uvd.c',
+ '../radeon/r600_perfcounter.c',
+ '../radeon/radeon_uvd.c',
+ '../radeon/radeon_uvd.h',
+ '../radeon/radeon_vcn_enc_1_2.c',
+ '../radeon/radeon_vcn_enc.c',
+ '../radeon/radeon_vcn_enc.h',
+ '../radeon/radeon_vcn_dec_jpeg.c',
+ '../radeon/radeon_vcn_dec.c',
+ '../radeon/radeon_vcn_dec.h',
+ '../radeon/radeon_uvd_enc_1_1.c',
+ '../radeon/radeon_uvd_enc.c',
+ '../radeon/radeon_uvd_enc.h',
+ '../radeon/radeon_vce_40_2_2.c',
+ '../radeon/radeon_vce_50.c',
+ '../radeon/radeon_vce_52.c',
+ '../radeon/radeon_vce.c',
+ '../radeon/radeon_vce.h',
+ '../radeon/radeon_video.c',
+ '../radeon/radeon_video.h',
+ '../radeon/radeon_winsys.h',
+)
+
+si_driinfo_h = custom_target(
+ 'si_driinfo.h',
+ input : files(
+ '../../../util/merge_driinfo.py',
+ '../../auxiliary/pipe-loader/driinfo_gallium.h', 'driinfo_radeonsi.h'
+ ),
+ output : 'si_driinfo.h',
+ command : [prog_python, '@INPUT@'],
+ capture : true,
+)
+
+libradeonsi = static_library(
+ 'radeonsi',
+ [files_libradeonsi, si_driinfo_h, sid_tables_h],
+ include_directories : [
+ inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common,
+ inc_gallium_drivers,
+ ],
+ c_args : ['-Wstrict-overflow=0', c_vis_args],
+ cpp_args : [cpp_vis_args],
+ dependencies : [dep_llvm, dep_clock, dep_libdrm_radeon, idep_nir_headers],
+)
+
+driver_radeonsi = declare_dependency(
+ compile_args : '-DGALLIUM_RADEONSI',
+ sources : si_driinfo_h,
+ link_with : [
+ libradeonsi, libradeonwinsys, libamdgpuwinsys, libamd_common,
+ ],
+ dependencies : idep_nir,
+)
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c b/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c
new file mode 100644
index 000000000..c7260e06c
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_buffer.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_transfer.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+bool si_rings_is_buffer_referenced(struct si_context *sctx,
+ struct pb_buffer *buf,
+ enum radeon_bo_usage usage)
+{
+ if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) {
+ return true;
+ }
+ if (radeon_emitted(sctx->dma_cs, 0) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->dma_cs, buf, usage)) {
+ return true;
+ }
+ return false;
+}
+
+void *si_buffer_map_sync_with_rings(struct si_context *sctx,
+ struct r600_resource *resource,
+ unsigned usage)
+{
+ enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
+ bool busy = false;
+
+ assert(!(resource->flags & RADEON_FLAG_SPARSE));
+
+ if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+ return sctx->ws->buffer_map(resource->buf, NULL, usage);
+ }
+
+ if (!(usage & PIPE_TRANSFER_WRITE)) {
+ /* have to wait for the last write */
+ rusage = RADEON_USAGE_WRITE;
+ }
+
+ if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
+ resource->buf, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ return NULL;
+ } else {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ busy = true;
+ }
+ }
+ if (radeon_emitted(sctx->dma_cs, 0) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
+ resource->buf, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ return NULL;
+ } else {
+ si_flush_dma_cs(sctx, 0, NULL);
+ busy = true;
+ }
+ }
+
+ if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ return NULL;
+ } else {
+ /* We will be waiting for the GPU. Wait for any offloaded
+ * CS flush to complete to avoid busy-waiting in the winsys. */
+ sctx->ws->cs_sync_flush(sctx->gfx_cs);
+ if (sctx->dma_cs)
+ sctx->ws->cs_sync_flush(sctx->dma_cs);
+ }
+ }
+
+ /* Setting the CS to NULL will prevent doing checks we have done already. */
+ return sctx->ws->buffer_map(resource->buf, NULL, usage);
+}
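
As a usage illustration of the function above (not part of the import): a hedged sketch of a non-blocking read mapping that falls back to a blocking one, relying on the PIPE_TRANSFER_DONTBLOCK behaviour described in the code.

/* Illustrative only: try to map for reading without stalling; with
 * PIPE_TRANSFER_DONTBLOCK the helper above flushes asynchronously and
 * returns NULL instead of waiting, so retry without the flag to block. */
static void *map_for_read_sketch(struct si_context *sctx,
                                 struct r600_resource *buf)
{
	void *ptr = si_buffer_map_sync_with_rings(sctx, buf,
						  PIPE_TRANSFER_READ |
						  PIPE_TRANSFER_DONTBLOCK);
	if (!ptr) {
		/* The GPU still references the buffer; accept the wait. */
		ptr = si_buffer_map_sync_with_rings(sctx, buf,
						    PIPE_TRANSFER_READ);
	}
	return ptr;
}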
+
+void si_init_resource_fields(struct si_screen *sscreen,
+ struct r600_resource *res,
+ uint64_t size, unsigned alignment)
+{
+ struct si_texture *tex = (struct si_texture*)res;
+
+ res->bo_size = size;
+ res->bo_alignment = alignment;
+ res->flags = 0;
+ res->texture_handle_allocated = false;
+ res->image_handle_allocated = false;
+
+ switch (res->b.b.usage) {
+ case PIPE_USAGE_STREAM:
+ res->flags = RADEON_FLAG_GTT_WC;
+ /* fall through */
+ case PIPE_USAGE_STAGING:
+ /* Transfers are likely to occur more often with these
+ * resources. */
+ res->domains = RADEON_DOMAIN_GTT;
+ break;
+ case PIPE_USAGE_DYNAMIC:
+ /* Older kernels didn't always flush the HDP cache before
+ * CS execution
+ */
+ if (!sscreen->info.kernel_flushes_hdp_before_ib) {
+ res->domains = RADEON_DOMAIN_GTT;
+ res->flags |= RADEON_FLAG_GTT_WC;
+ break;
+ }
+ /* fall through */
+ case PIPE_USAGE_DEFAULT:
+ case PIPE_USAGE_IMMUTABLE:
+ default:
+ /* Not listing GTT here improves performance in some
+ * apps. */
+ res->domains = RADEON_DOMAIN_VRAM;
+ res->flags |= RADEON_FLAG_GTT_WC;
+ break;
+ }
+
+ if (res->b.b.target == PIPE_BUFFER &&
+ res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
+ /* Use GTT for all persistent mappings with older
+ * kernels, because they didn't always flush the HDP
+ * cache before CS execution.
+ *
+ * Write-combined CPU mappings are fine, the kernel
+ * ensures all CPU writes finish before the GPU
+ * executes a command stream.
+ *
+ * radeon doesn't have good BO move throttling, so put all
+ * persistent buffers into GTT to prevent VRAM CPU page faults.
+ */
+ if (!sscreen->info.kernel_flushes_hdp_before_ib ||
+ sscreen->info.drm_major == 2)
+ res->domains = RADEON_DOMAIN_GTT;
+ }
+
+ /* Tiled textures are unmappable. Always put them in VRAM. */
+ if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
+ res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
+ res->domains = RADEON_DOMAIN_VRAM;
+ res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
+ RADEON_FLAG_GTT_WC;
+ }
+
+ /* Displayable and shareable surfaces are not suballocated. */
+ if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
+ res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
+ else
+ res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
+ if (sscreen->debug_flags & DBG(NO_WC))
+ res->flags &= ~RADEON_FLAG_GTT_WC;
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
+ res->flags |= RADEON_FLAG_READ_ONLY;
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
+ res->flags |= RADEON_FLAG_32BIT;
+
+ /* Set expected VRAM and GART usage for the buffer. */
+ res->vram_usage = 0;
+ res->gart_usage = 0;
+ res->max_forced_staging_uploads = 0;
+ res->b.max_forced_staging_uploads = 0;
+
+ if (res->domains & RADEON_DOMAIN_VRAM) {
+ res->vram_usage = size;
+
+ res->max_forced_staging_uploads =
+ res->b.max_forced_staging_uploads =
+ sscreen->info.has_dedicated_vram &&
+ size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
+ } else if (res->domains & RADEON_DOMAIN_GTT) {
+ res->gart_usage = size;
+ }
+}
+
+bool si_alloc_resource(struct si_screen *sscreen,
+ struct r600_resource *res)
+{
+ struct pb_buffer *old_buf, *new_buf;
+
+ /* Allocate a new resource. */
+ new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size,
+ res->bo_alignment,
+ res->domains, res->flags);
+ if (!new_buf) {
+ return false;
+ }
+
+ /* Replace the pointer such that if res->buf wasn't NULL, it won't be
+ * NULL. This should prevent crashes with multiple contexts using
+ * the same buffer where one of the contexts invalidates it while
+ * the others are using it. */
+ old_buf = res->buf;
+ res->buf = new_buf; /* should be atomic */
+ res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf);
+
+ if (res->flags & RADEON_FLAG_32BIT) {
+ uint64_t start = res->gpu_address;
+ uint64_t last = start + res->bo_size - 1;
+ (void)start;
+ (void)last;
+
+ assert((start >> 32) == sscreen->info.address32_hi);
+ assert((last >> 32) == sscreen->info.address32_hi);
+ }
+
+ pb_reference(&old_buf, NULL);
+
+ util_range_set_empty(&res->valid_buffer_range);
+ res->TC_L2_dirty = false;
+
+ /* Print debug information. */
+ if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) {
+ fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
+ res->gpu_address, res->gpu_address + res->buf->size,
+ res->buf->size);
+ }
+ return true;
+}
+
+static void si_buffer_destroy(struct pipe_screen *screen,
+ struct pipe_resource *buf)
+{
+ struct r600_resource *rbuffer = r600_resource(buf);
+
+ threaded_resource_deinit(buf);
+ util_range_destroy(&rbuffer->valid_buffer_range);
+ pb_reference(&rbuffer->buf, NULL);
+ FREE(rbuffer);
+}
+
+/* Reallocate the buffer and update all resource bindings where the buffer is
+ * bound.
+ *
+ * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
+ * idle by discarding its contents.
+ */
+static bool
+si_invalidate_buffer(struct si_context *sctx,
+ struct r600_resource *rbuffer)
+{
+ /* Shared buffers can't be reallocated. */
+ if (rbuffer->b.is_shared)
+ return false;
+
+ /* Sparse buffers can't be reallocated. */
+ if (rbuffer->flags & RADEON_FLAG_SPARSE)
+ return false;
+
+ /* In AMD_pinned_memory, the user pointer association only gets
+ * broken when the buffer is explicitly re-allocated.
+ */
+ if (rbuffer->b.is_user_ptr)
+ return false;
+
+ /* Check if mapping this buffer would cause waiting for the GPU. */
+ if (si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+ uint64_t old_va = rbuffer->gpu_address;
+
+ /* Reallocate the buffer in the same pipe_resource. */
+ si_alloc_resource(sctx->screen, rbuffer);
+ si_rebind_buffer(sctx, &rbuffer->b.b, old_va);
+ } else {
+ util_range_set_empty(&rbuffer->valid_buffer_range);
+ }
+
+ return true;
+}
+
+/* Replace the storage of dst with src. */
+void si_replace_buffer_storage(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ struct pipe_resource *src)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct r600_resource *rdst = r600_resource(dst);
+ struct r600_resource *rsrc = r600_resource(src);
+ uint64_t old_gpu_address = rdst->gpu_address;
+
+ pb_reference(&rdst->buf, rsrc->buf);
+ rdst->gpu_address = rsrc->gpu_address;
+ rdst->b.b.bind = rsrc->b.b.bind;
+ rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
+ rdst->max_forced_staging_uploads = rsrc->max_forced_staging_uploads;
+ rdst->flags = rsrc->flags;
+
+ assert(rdst->vram_usage == rsrc->vram_usage);
+ assert(rdst->gart_usage == rsrc->gart_usage);
+ assert(rdst->bo_size == rsrc->bo_size);
+ assert(rdst->bo_alignment == rsrc->bo_alignment);
+ assert(rdst->domains == rsrc->domains);
+
+ si_rebind_buffer(sctx, dst, old_gpu_address);
+}
+
+static void si_invalidate_resource(struct pipe_context *ctx,
+ struct pipe_resource *resource)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct r600_resource *rbuffer = r600_resource(resource);
+
+ /* We currently only do anything here for buffers */
+ if (resource->target == PIPE_BUFFER)
+ (void)si_invalidate_buffer(sctx, rbuffer);
+}
+
+static void *si_buffer_get_transfer(struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer,
+ void *data, struct r600_resource *staging,
+ unsigned offset)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_transfer *transfer;
+
+ if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+ transfer = slab_alloc(&sctx->pool_transfers_unsync);
+ else
+ transfer = slab_alloc(&sctx->pool_transfers);
+
+ transfer->b.b.resource = NULL;
+ pipe_resource_reference(&transfer->b.b.resource, resource);
+ transfer->b.b.level = 0;
+ transfer->b.b.usage = usage;
+ transfer->b.b.box = *box;
+ transfer->b.b.stride = 0;
+ transfer->b.b.layer_stride = 0;
+ transfer->b.staging = NULL;
+ transfer->offset = offset;
+ transfer->staging = staging;
+ *ptransfer = &transfer->b.b;
+ return data;
+}
+
+static void *si_buffer_transfer_map(struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct r600_resource *rbuffer = r600_resource(resource);
+ uint8_t *data;
+
+ assert(box->x + box->width <= resource->width0);
+
+ /* From GL_AMD_pinned_memory issues:
+ *
+ * 4) Is glMapBuffer on a shared buffer guaranteed to return the
+ * same system address which was specified at creation time?
+ *
+ * RESOLVED: NO. The GL implementation might return a different
+ * virtual mapping of that memory, although the same physical
+ * page will be used.
+ *
+ * So don't ever use staging buffers.
+ */
+ if (rbuffer->b.is_user_ptr)
+ usage |= PIPE_TRANSFER_PERSISTENT;
+
+ /* See if the buffer range being mapped has never been initialized,
+ * in which case it can be mapped unsynchronized. */
+ if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+ TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
+ usage & PIPE_TRANSFER_WRITE &&
+ !rbuffer->b.is_shared &&
+ !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ }
+
+ /* If discarding the entire range, discard the whole resource instead. */
+ if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+ box->x == 0 && box->width == resource->width0) {
+ usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+ }
+
+ /* If a buffer in VRAM is too large and the range is discarded, don't
+ * map it directly. This makes sure that the buffer stays in VRAM.
+ */
+ bool force_discard_range = false;
+ if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+ PIPE_TRANSFER_DISCARD_RANGE) &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ /* Try not to decrement the counter if it's not positive. Still racy,
+ * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+ rbuffer->max_forced_staging_uploads > 0 &&
+ p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
+ usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ force_discard_range = true;
+ }
+
+ if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
+ !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+ TC_TRANSFER_MAP_NO_INVALIDATE))) {
+ assert(usage & PIPE_TRANSFER_WRITE);
+
+ if (si_invalidate_buffer(sctx, rbuffer)) {
+ /* At this point, the buffer is always idle. */
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ } else {
+ /* Fall back to a temporary buffer. */
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ }
+ }
+
+ if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+ ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_PERSISTENT))) ||
+ (rbuffer->flags & RADEON_FLAG_SPARSE))) {
+ assert(usage & PIPE_TRANSFER_WRITE);
+
+ /* Check if mapping this buffer would cause waiting for the GPU.
+ */
+ if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+ force_discard_range ||
+ si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Do a wait-free write-only transfer using a temporary buffer. */
+ unsigned offset;
+ struct r600_resource *staging = NULL;
+
+ u_upload_alloc(ctx->stream_uploader, 0,
+ box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
+ sctx->screen->info.tcc_cache_line_size,
+ &offset, (struct pipe_resource**)&staging,
+ (void**)&data);
+
+ if (staging) {
+ data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+ return si_buffer_get_transfer(ctx, resource, usage, box,
+ ptransfer, data, staging, offset);
+ } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
+ }
+ } else {
+ /* At this point, the buffer is always idle (we checked it above). */
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ }
+ }
+ /* Use a staging buffer in cached GTT for reads. */
+ else if (((usage & PIPE_TRANSFER_READ) &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ (rbuffer->domains & RADEON_DOMAIN_VRAM ||
+ rbuffer->flags & RADEON_FLAG_GTT_WC)) ||
+ (rbuffer->flags & RADEON_FLAG_SPARSE)) {
+ struct r600_resource *staging;
+
+ assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
+ staging = r600_resource(pipe_buffer_create(
+ ctx->screen, 0, PIPE_USAGE_STAGING,
+ box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
+ if (staging) {
+ /* Copy the VRAM buffer to the staging buffer. */
+ sctx->dma_copy(ctx, &staging->b.b, 0,
+ box->x % SI_MAP_BUFFER_ALIGNMENT,
+ 0, 0, resource, 0, box);
+
+ data = si_buffer_map_sync_with_rings(sctx, staging,
+ usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!data) {
+ r600_resource_reference(&staging, NULL);
+ return NULL;
+ }
+ data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+
+ return si_buffer_get_transfer(ctx, resource, usage, box,
+ ptransfer, data, staging, 0);
+ } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
+ }
+ }
+
+ data = si_buffer_map_sync_with_rings(sctx, rbuffer, usage);
+ if (!data) {
+ return NULL;
+ }
+ data += box->x;
+
+ return si_buffer_get_transfer(ctx, resource, usage, box,
+ ptransfer, data, NULL, 0);
+}
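
For orientation only, a condensed restatement of the strategy order chosen by si_buffer_transfer_map() above (simplified; sparse buffers and the threaded-context flags are omitted):

/* Reading aid, not driver code: the map path above effectively picks one of
 * these strategies, roughly in this order. */
enum si_buffer_map_strategy_sketch {
	MAP_UNSYNCHRONIZED,   /* range never written, or whole-buffer invalidation succeeded */
	MAP_STAGING_UPLOAD,   /* DISCARD_RANGE while the GPU still uses the buffer */
	MAP_STAGING_READBACK, /* reads from VRAM / GTT-WC go through a cached GTT copy */
	MAP_DIRECT            /* otherwise map directly, waiting on the rings as needed */
};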
+
+static void si_buffer_do_flush_region(struct pipe_context *ctx,
+ struct pipe_transfer *transfer,
+ const struct pipe_box *box)
+{
+ struct si_transfer *stransfer = (struct si_transfer*)transfer;
+ struct r600_resource *rbuffer = r600_resource(transfer->resource);
+
+ if (stransfer->staging) {
+ /* Copy the staging buffer into the original one. */
+ si_copy_buffer((struct si_context*)ctx, transfer->resource,
+ &stransfer->staging->b.b, box->x,
+ stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
+ box->width);
+ }
+
+ util_range_add(&rbuffer->valid_buffer_range, box->x,
+ box->x + box->width);
+}
+
+static void si_buffer_flush_region(struct pipe_context *ctx,
+ struct pipe_transfer *transfer,
+ const struct pipe_box *rel_box)
+{
+ unsigned required_usage = PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_FLUSH_EXPLICIT;
+
+ if ((transfer->usage & required_usage) == required_usage) {
+ struct pipe_box box;
+
+ u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
+ si_buffer_do_flush_region(ctx, transfer, &box);
+ }
+}
+
+static void si_buffer_transfer_unmap(struct pipe_context *ctx,
+ struct pipe_transfer *transfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_transfer *stransfer = (struct si_transfer*)transfer;
+
+ if (transfer->usage & PIPE_TRANSFER_WRITE &&
+ !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+ si_buffer_do_flush_region(ctx, transfer, &transfer->box);
+
+ r600_resource_reference(&stransfer->staging, NULL);
+ assert(stransfer->b.staging == NULL); /* for threaded context only */
+ pipe_resource_reference(&transfer->resource, NULL);
+
+ /* Don't use pool_transfers_unsync. We are always in the driver
+ * thread. */
+ slab_free(&sctx->pool_transfers, transfer);
+}
+
+static void si_buffer_subdata(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned usage, unsigned offset,
+ unsigned size, const void *data)
+{
+ struct pipe_transfer *transfer = NULL;
+ struct pipe_box box;
+ uint8_t *map = NULL;
+
+ u_box_1d(offset, size, &box);
+ map = si_buffer_transfer_map(ctx, buffer, 0,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_DISCARD_RANGE |
+ usage,
+ &box, &transfer);
+ if (!map)
+ return;
+
+ memcpy(map, data, size);
+ si_buffer_transfer_unmap(ctx, transfer);
+}
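
si_buffer_subdata() above is the driver side of pipe_context::buffer_subdata; a small caller-side sketch for context (the context and constant buffer are assumed to already exist, and are not part of this import):

/* Illustrative only: push a small block of data through the
 * buffer_subdata hook implemented above. */
static void upload_consts_sketch(struct pipe_context *ctx,
                                 struct pipe_resource *const_buf)
{
	static const float consts[4] = { 0.0f, 0.5f, 1.0f, 2.0f };

	ctx->buffer_subdata(ctx, const_buf,
			    PIPE_TRANSFER_WRITE, /* usage */
			    0,                   /* offset */
			    sizeof(consts), consts);
}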
+
+static const struct u_resource_vtbl si_buffer_vtbl =
+{
+ NULL, /* get_handle */
+ si_buffer_destroy, /* resource_destroy */
+ si_buffer_transfer_map, /* transfer_map */
+ si_buffer_flush_region, /* transfer_flush_region */
+ si_buffer_transfer_unmap, /* transfer_unmap */
+};
+
+static struct r600_resource *
+si_alloc_buffer_struct(struct pipe_screen *screen,
+ const struct pipe_resource *templ)
+{
+ struct r600_resource *rbuffer;
+
+ rbuffer = MALLOC_STRUCT(r600_resource);
+
+ rbuffer->b.b = *templ;
+ rbuffer->b.b.next = NULL;
+ pipe_reference_init(&rbuffer->b.b.reference, 1);
+ rbuffer->b.b.screen = screen;
+
+ rbuffer->b.vtbl = &si_buffer_vtbl;
+ threaded_resource_init(&rbuffer->b.b);
+
+ rbuffer->buf = NULL;
+ rbuffer->bind_history = 0;
+ rbuffer->TC_L2_dirty = false;
+ util_range_init(&rbuffer->valid_buffer_range);
+ return rbuffer;
+}
+
+static struct pipe_resource *si_buffer_create(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ unsigned alignment)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+
+ if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ rbuffer->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
+
+ si_init_resource_fields(sscreen, rbuffer, templ->width0, alignment);
+
+ if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ rbuffer->flags |= RADEON_FLAG_SPARSE;
+
+ if (!si_alloc_resource(sscreen, rbuffer)) {
+ FREE(rbuffer);
+ return NULL;
+ }
+ return &rbuffer->b.b;
+}
+
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned flags, unsigned usage,
+ unsigned size, unsigned alignment)
+{
+ struct pipe_resource buffer;
+
+ memset(&buffer, 0, sizeof buffer);
+ buffer.target = PIPE_BUFFER;
+ buffer.format = PIPE_FORMAT_R8_UNORM;
+ buffer.bind = 0;
+ buffer.usage = usage;
+ buffer.flags = flags;
+ buffer.width0 = size;
+ buffer.height0 = 1;
+ buffer.depth0 = 1;
+ buffer.array_size = 1;
+ return si_buffer_create(screen, &buffer, alignment);
+}
+
+struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned flags, unsigned usage,
+ unsigned size, unsigned alignment)
+{
+ return r600_resource(pipe_aligned_buffer_create(screen, flags, usage,
+ size, alignment));
+}
+
+static struct pipe_resource *
+si_buffer_from_user_memory(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ void *user_memory)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+
+ rbuffer->domains = RADEON_DOMAIN_GTT;
+ rbuffer->flags = 0;
+ rbuffer->b.is_user_ptr = true;
+ util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0);
+ util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0);
+
+ /* Convert a user pointer to a buffer. */
+ rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
+ if (!rbuffer->buf) {
+ FREE(rbuffer);
+ return NULL;
+ }
+
+ rbuffer->gpu_address = ws->buffer_get_virtual_address(rbuffer->buf);
+ rbuffer->vram_usage = 0;
+ rbuffer->gart_usage = templ->width0;
+
+ return &rbuffer->b.b;
+}
+
+static struct pipe_resource *si_resource_create(struct pipe_screen *screen,
+ const struct pipe_resource *templ)
+{
+ if (templ->target == PIPE_BUFFER) {
+ return si_buffer_create(screen, templ, 256);
+ } else {
+ return si_texture_create(screen, templ);
+ }
+}
+
+static bool si_resource_commit(struct pipe_context *pctx,
+ struct pipe_resource *resource,
+ unsigned level, struct pipe_box *box,
+ bool commit)
+{
+ struct si_context *ctx = (struct si_context *)pctx;
+ struct r600_resource *res = r600_resource(resource);
+
+ /*
+ * Since buffer commitment changes cannot be pipelined, we need to
+ * (a) flush any pending commands that refer to the buffer we're about
+ * to change, and
+ * (b) wait for threaded submit to finish, including those that were
+ * triggered by some other, earlier operation.
+ */
+ if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
+ res->buf, RADEON_USAGE_READWRITE)) {
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ }
+ if (radeon_emitted(ctx->dma_cs, 0) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
+ res->buf, RADEON_USAGE_READWRITE)) {
+ si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ }
+
+ ctx->ws->cs_sync_flush(ctx->dma_cs);
+ ctx->ws->cs_sync_flush(ctx->gfx_cs);
+
+ assert(resource->target == PIPE_BUFFER);
+
+ return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+}
+
+void si_init_screen_buffer_functions(struct si_screen *sscreen)
+{
+ sscreen->b.resource_create = si_resource_create;
+ sscreen->b.resource_destroy = u_resource_destroy_vtbl;
+ sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
+}
+
+void si_init_buffer_functions(struct si_context *sctx)
+{
+ sctx->b.invalidate_resource = si_invalidate_resource;
+ sctx->b.transfer_map = u_transfer_map_vtbl;
+ sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
+ sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
+ sctx->b.texture_subdata = u_default_texture_subdata;
+ sctx->b.buffer_subdata = si_buffer_subdata;
+ sctx->b.resource_commit = si_resource_commit;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h b/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h
new file mode 100644
index 000000000..796adda09
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * This file contains helpers for writing commands to commands streams.
+ */
+
+#ifndef SI_BUILD_PM4_H
+#define SI_BUILD_PM4_H
+
+#include "si_pipe.h"
+#include "sid.h"
+
+static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{
+ assert(reg < SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
+ radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
+}
+
+static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{
+ radeon_set_config_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
+}
+
+static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{
+ assert(reg >= SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
+ radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
+}
+
+static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{
+ radeon_set_context_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
+}
+
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
+ unsigned reg, unsigned idx,
+ unsigned value)
+{
+ assert(reg >= SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 3 <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
+ radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
+ radeon_emit(cs, value);
+}
+
+static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{
+ assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
+ radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
+}
+
+static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{
+ radeon_set_sh_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
+}
+
+static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
+{
+ assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
+ radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
+}
+
+static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
+{
+ radeon_set_uconfig_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
+}
+
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
+ unsigned reg, unsigned idx,
+ unsigned value)
+{
+ assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+ assert(cs->current.cdw + 3 <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0));
+ radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
+ radeon_emit(cs, value);
+}
+
+/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
+static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
+ enum si_tracked_reg reg, unsigned value)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+ sctx->tracked_regs.reg_value[reg] != value) {
+ radeon_set_context_reg(cs, offset, value);
+
+ sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+ sctx->tracked_regs.reg_value[reg] = value;
+ }
+}
+
+/**
+ * Set 2 consecutive registers if any register value is different.
+ * @param offset starting register offset
+ * @param value1 is written to first register
+ * @param value2 is written to second register
+ */
+static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg+1] != value2) {
+ radeon_set_context_reg_seq(cs, offset, 2);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg+1] = value2;
+ sctx->tracked_regs.reg_saved |= 0x3ull << reg;
+ }
+}
+
+/**
+ * Set 3 consecutive registers if any register value is different.
+ */
+static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2, unsigned value3)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg+1] != value2 ||
+ sctx->tracked_regs.reg_value[reg+2] != value3) {
+ radeon_set_context_reg_seq(cs, offset, 3);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+ radeon_emit(cs, value3);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg+1] = value2;
+ sctx->tracked_regs.reg_value[reg+2] = value3;
+ sctx->tracked_regs.reg_saved |= 0x7ull << reg;
+ }
+}
+
+/**
+ * Set 4 consecutive registers if any register value is different.
+ */
+static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2, unsigned value3,
+ unsigned value4)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg+1] != value2 ||
+ sctx->tracked_regs.reg_value[reg+2] != value3 ||
+ sctx->tracked_regs.reg_value[reg+3] != value4) {
+ radeon_set_context_reg_seq(cs, offset, 4);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+ radeon_emit(cs, value3);
+ radeon_emit(cs, value4);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg+1] = value2;
+ sctx->tracked_regs.reg_value[reg+2] = value3;
+ sctx->tracked_regs.reg_value[reg+3] = value4;
+ sctx->tracked_regs.reg_saved |= 0xfull << reg;
+ }
+}
+
+/**
+ * Set consecutive registers if any register value is different.
+ */
+static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
+ unsigned *value, unsigned *saved_val,
+ unsigned num)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ int i, j;
+
+ for (i = 0; i < num; i++) {
+ if (saved_val[i] != value[i]) {
+ radeon_set_context_reg_seq(cs, offset, num);
+ for (j = 0; j < num; j++)
+ radeon_emit(cs, value[j]);
+
+ memcpy(saved_val, value, sizeof(uint32_t) * num);
+ break;
+ }
+ }
+}
+
+#endif
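
A usage illustration of the helpers above (not part of the import): emitting two consecutive context registers through the change-tracking variant. EXAMPLE_REG_OFFSET and SI_TRACKED_EXAMPLE_REG are placeholder names, not real sid.h definitions.

/* Hedged sketch; the register offset and tracked-register index are assumed. */
static inline void emit_example_state(struct si_context *sctx,
                                      unsigned value1, unsigned value2)
{
	/* Skips the SET_CONTEXT_REG packet entirely when sctx->tracked_regs
	 * already holds these two values; otherwise it emits the same thing
	 * as radeon_set_context_reg_seq(cs, EXAMPLE_REG_OFFSET, 2) followed
	 * by two radeon_emit() calls. */
	radeon_opt_set_context_reg2(sctx, EXAMPLE_REG_OFFSET,
				    SI_TRACKED_EXAMPLE_REG, value1, value2);
}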
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c b/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c
new file mode 100644
index 000000000..8aa3355af
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_clear.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+
+#include "util/u_format.h"
+#include "util/u_pack_color.h"
+#include "util/u_surface.h"
+
+enum {
+ SI_CLEAR = SI_SAVE_FRAGMENT_STATE,
+ SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
+};
+
+static void si_alloc_separate_cmask(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ if (tex->cmask_buffer || !tex->surface.cmask_size)
+ return;
+
+ tex->cmask_buffer =
+ si_aligned_buffer_create(&sscreen->b,
+ SI_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
+ tex->surface.cmask_size,
+ tex->surface.cmask_alignment);
+ if (tex->cmask_buffer == NULL)
+ return;
+
+ tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
+ tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+
+ p_atomic_inc(&sscreen->compressed_colortex_counter);
+}
+
+static bool si_set_clear_color(struct si_texture *tex,
+ enum pipe_format surface_format,
+ const union pipe_color_union *color)
+{
+ union util_color uc;
+
+ memset(&uc, 0, sizeof(uc));
+
+ if (tex->surface.bpe == 16) {
+ /* DCC fast clear only:
+ * CLEAR_WORD0 = R = G = B
+ * CLEAR_WORD1 = A
+ */
+ assert(color->ui[0] == color->ui[1] &&
+ color->ui[0] == color->ui[2]);
+ uc.ui[0] = color->ui[0];
+ uc.ui[1] = color->ui[3];
+ } else if (util_format_is_pure_uint(surface_format)) {
+ util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1);
+ } else if (util_format_is_pure_sint(surface_format)) {
+ util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1);
+ } else {
+ util_pack_color(color->f, surface_format, &uc);
+ }
+
+ if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
+ return false;
+
+ memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+ return true;
+}
+
+/** Linearize and convert luminance/intensity to red. */
+enum pipe_format si_simplify_cb_format(enum pipe_format format)
+{
+ format = util_format_linear(format);
+ format = util_format_luminance_to_red(format);
+ return util_format_intensity_to_red(format);
+}
+
+bool vi_alpha_is_on_msb(enum pipe_format format)
+{
+ format = si_simplify_cb_format(format);
+
+ /* Formats with 3 channels can't have alpha. */
+ if (util_format_description(format)->nr_channels == 3)
+ return true; /* same as xxxA; is any value OK here? */
+
+ return si_translate_colorswap(format, false) <= 1;
+}
+
+static bool vi_get_fast_clear_parameters(enum pipe_format base_format,
+ enum pipe_format surface_format,
+ const union pipe_color_union *color,
+ uint32_t* clear_value,
+ bool *eliminate_needed)
+{
+ /* If we want to clear without needing a fast clear eliminate step, we
+ * can set color and alpha independently to 0 or 1 (or 0/max for integer
+ * formats).
+ */
+ bool values[4] = {}; /* whether to clear to 0 or 1 */
+ bool color_value = false; /* clear color to 0 or 1 */
+ bool alpha_value = false; /* clear alpha to 0 or 1 */
+ int alpha_channel; /* index of the alpha component */
+ bool has_color = false;
+ bool has_alpha = false;
+
+ const struct util_format_description *desc =
+ util_format_description(si_simplify_cb_format(surface_format));
+
+ /* 128-bit fast clear with different R,G,B values is unsupported. */
+ if (desc->block.bits == 128 &&
+ (color->ui[0] != color->ui[1] ||
+ color->ui[0] != color->ui[2]))
+ return false;
+
+ *eliminate_needed = true;
+ *clear_value = 0x20202020U; /* use CB clear color registers */
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+
+ bool base_alpha_is_on_msb = vi_alpha_is_on_msb(base_format);
+ bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(surface_format);
+
+ /* Formats with 3 channels can't have alpha. */
+ if (desc->nr_channels == 3)
+ alpha_channel = -1;
+ else if (surf_alpha_is_on_msb)
+ alpha_channel = desc->nr_channels - 1;
+ else
+ alpha_channel = 0;
+
+ for (int i = 0; i < 4; ++i) {
+ if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
+ continue;
+
+ if (desc->channel[i].pure_integer &&
+ desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+ values[i] = color->i[i] != 0;
+ if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ } else if (desc->channel[i].pure_integer &&
+ desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+ values[i] = color->ui[i] != 0U;
+ if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ } else {
+ values[i] = color->f[i] != 0.0F;
+ if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ }
+
+ if (desc->swizzle[i] == alpha_channel) {
+ alpha_value = values[i];
+ has_alpha = true;
+ } else {
+ color_value = values[i];
+ has_color = true;
+ }
+ }
+
+ /* If alpha isn't present, make it the same as color, and vice versa. */
+ if (!has_alpha)
+ alpha_value = color_value;
+ else if (!has_color)
+ color_value = alpha_value;
+
+ if (color_value != alpha_value &&
+ base_alpha_is_on_msb != surf_alpha_is_on_msb)
+ return true; /* require ELIMINATE_FAST_CLEAR */
+
+ /* Check if all color values are equal if they are present. */
+ for (int i = 0; i < 4; ++i) {
+ if (desc->swizzle[i] <= PIPE_SWIZZLE_W &&
+ desc->swizzle[i] != alpha_channel &&
+ values[i] != color_value)
+ return true; /* require ELIMINATE_FAST_CLEAR */
+ }
+
+ /* This doesn't need ELIMINATE_FAST_CLEAR.
+ * CB uses both the DCC clear codes and the CB clear color registers,
+ * so they must match.
+ */
+ *eliminate_needed = false;
+
+ if (color_value)
+ *clear_value |= 0x80808080U;
+ if (alpha_value)
+ *clear_value |= 0x40404040U;
+ return true;
+}
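
A hedged usage sketch of vi_get_fast_clear_parameters() above (illustrative format and color, not code from this import): query the DCC clear word for an opaque-black clear and check whether a fast-clear-eliminate pass is still needed.

/* Illustrative only: with all components at exactly 0 or 1, the logic above
 * can skip the eliminate pass and encodes the choice in the 0x80808080
 * (color) and 0x40404040 (alpha) bits of the clear word. */
static bool black_clear_is_eliminate_free(uint32_t *clear_value)
{
	union pipe_color_union color;
	bool eliminate_needed;

	color.f[0] = color.f[1] = color.f[2] = 0.0f; /* RGB = 0 */
	color.f[3] = 1.0f;                           /* A   = 1 */

	if (!vi_get_fast_clear_parameters(PIPE_FORMAT_R8G8B8A8_UNORM,
					  PIPE_FORMAT_R8G8B8A8_UNORM,
					  &color, clear_value,
					  &eliminate_needed))
		return false;

	return !eliminate_needed;
}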
+
+void vi_dcc_clear_level(struct si_context *sctx,
+ struct si_texture *tex,
+ unsigned level, unsigned clear_value)
+{
+ struct pipe_resource *dcc_buffer;
+ uint64_t dcc_offset, clear_size;
+
+ assert(vi_dcc_enabled(tex, level));
+
+ if (tex->dcc_separate_buffer) {
+ dcc_buffer = &tex->dcc_separate_buffer->b.b;
+ dcc_offset = 0;
+ } else {
+ dcc_buffer = &tex->buffer.b.b;
+ dcc_offset = tex->dcc_offset;
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ /* Mipmap level clears aren't implemented. */
+ assert(tex->buffer.b.b.last_level == 0);
+ /* 4x and 8x MSAA needs a sophisticated compute shader for
+ * the clear. See AMDVLK. */
+ assert(tex->buffer.b.b.nr_storage_samples <= 2);
+ clear_size = tex->surface.dcc_size;
+ } else {
+ unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
+
+ /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
+ assert(tex->surface.u.legacy.level[level].dcc_fast_clear_size);
+ /* Layered 4x and 8x MSAA DCC fast clears need to clear
+ * dcc_fast_clear_size bytes for each layer. A compute shader
+ * would be more efficient than separate per-layer clear operations.
+ */
+ assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
+
+ dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
+ clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
+ num_layers;
+ }
+
+ si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
+ &clear_value, 4, SI_COHERENCY_CB_META);
+}
+
+/* Set the same micro tile mode as the destination of the last MSAA resolve.
+ * This allows hitting the MSAA resolve fast path, which requires that both
+ * src and dst micro tile modes match.
+ */
+static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ if (tex->buffer.b.is_shared ||
+ tex->buffer.b.b.nr_samples <= 1 ||
+ tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
+ return;
+
+ assert(sscreen->info.chip_class >= GFX9 ||
+ tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+ assert(tex->buffer.b.b.last_level == 0);
+
+ if (sscreen->info.chip_class >= GFX9) {
+ /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
+ assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
+
+ /* If you do swizzle_mode % 4, you'll get:
+ * 0 = Depth
+ * 1 = Standard,
+ * 2 = Displayable
+ * 3 = Rotated
+ *
+ * Depth-sample order isn't allowed:
+ */
+ assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
+
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
+ break;
+ case RADEON_MICRO_MODE_ROTATED:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+ break;
+ default: /* depth */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else if (sscreen->info.chip_class >= CIK) {
+ /* These magic numbers were copied from addrlib. It doesn't use
+ * any definitions for them either. They are all 2D_TILED_THIN1
+ * modes with different bpp and micro tile mode.
+ */
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ tex->surface.u.legacy.tiling_index[0] = 10;
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ tex->surface.u.legacy.tiling_index[0] = 14;
+ break;
+ case RADEON_MICRO_MODE_ROTATED:
+ tex->surface.u.legacy.tiling_index[0] = 28;
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else { /* SI */
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ switch (tex->surface.bpe) {
+ case 1:
+ tex->surface.u.legacy.tiling_index[0] = 10;
+ break;
+ case 2:
+ tex->surface.u.legacy.tiling_index[0] = 11;
+ break;
+ default: /* 4, 8 */
+ tex->surface.u.legacy.tiling_index[0] = 12;
+ break;
+ }
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ switch (tex->surface.bpe) {
+ case 1:
+ tex->surface.u.legacy.tiling_index[0] = 14;
+ break;
+ case 2:
+ tex->surface.u.legacy.tiling_index[0] = 15;
+ break;
+ case 4:
+ tex->surface.u.legacy.tiling_index[0] = 16;
+ break;
+ default: /* 8, 16 */
+ tex->surface.u.legacy.tiling_index[0] = 17;
+ break;
+ }
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ }
+
+ tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
+
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+}
+
+static void si_do_fast_color_clear(struct si_context *sctx,
+ unsigned *buffers,
+ const union pipe_color_union *color)
+{
+ struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+ int i;
+
+ /* This function is broken in BE, so just disable this path for now */
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ return;
+#endif
+
+ if (sctx->render_cond)
+ return;
+
+ for (i = 0; i < fb->nr_cbufs; i++) {
+ struct si_texture *tex;
+ unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
+
+ if (!fb->cbufs[i])
+ continue;
+
+ /* if this colorbuffer is not being cleared */
+ if (!(*buffers & clear_bit))
+ continue;
+
+ unsigned level = fb->cbufs[i]->u.tex.level;
+ if (level > 0)
+ continue;
+
+ tex = (struct si_texture *)fb->cbufs[i]->texture;
+
+ /* TODO: GFX9: Implement DCC fast clear for level 0 of
+ * mipmapped textures. Mipmapped DCC has to clear a rectangular
+ * area of DCC for level 0 (because the whole miptree is
+ * organized in a 2D plane).
+ */
+ if (sctx->chip_class >= GFX9 &&
+ tex->buffer.b.b.last_level > 0)
+ continue;
+
+ /* the clear is allowed if all layers are bound */
+ if (fb->cbufs[i]->u.tex.first_layer != 0 ||
+ fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
+ continue;
+ }
+
+ /* only supported on tiled surfaces */
+ if (tex->surface.is_linear) {
+ continue;
+ }
+
+ /* shared textures can't use fast clear without an explicit flush,
+ * because there is no way to communicate the clear color among
+ * all clients
+ */
+ if (tex->buffer.b.is_shared &&
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ continue;
+
+ if (sctx->chip_class <= VI &&
+ tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+ !sctx->screen->info.htile_cmask_support_1d_tiling)
+ continue;
+
+ /* Use a slow clear for small surfaces where the cost of
+ * the eliminate pass can be higher than the benefit of fast
+ * clear. The closed driver does this, but the numbers may differ.
+ *
+ * This helps on both dGPUs and APUs, even small APUs like Mullins.
+ */
+ bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
+ tex->buffer.b.b.width0 *
+ tex->buffer.b.b.height0 <= 512 * 512;
+ bool eliminate_needed = false;
+ bool fmask_decompress_needed = false;
+
+ /* Fast clear is the most appropriate place to enable DCC for
+ * displayable surfaces.
+ */
+ if (sctx->family == CHIP_STONEY && !too_small) {
+ vi_separate_dcc_try_enable(sctx, tex);
+
+ /* RB+ isn't supported with a CMASK clear only on Stoney,
+ * so all clears are considered to be hypothetically slow
+ * clears, which is weighed when determining whether to
+ * enable separate DCC.
+ */
+ if (tex->dcc_gather_statistics) /* only for Stoney */
+ tex->num_slow_clears++;
+ }
+
+ /* Try to clear DCC first, otherwise try CMASK. */
+ if (vi_dcc_enabled(tex, 0)) {
+ uint32_t reset_value;
+
+ if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
+ continue;
+
+ /* This can happen with mipmapping or MSAA. */
+ if (sctx->chip_class == VI &&
+ !tex->surface.u.legacy.level[level].dcc_fast_clear_size)
+ continue;
+
+ if (!vi_get_fast_clear_parameters(tex->buffer.b.b.format,
+ fb->cbufs[i]->format,
+ color, &reset_value,
+ &eliminate_needed))
+ continue;
+
+ if (eliminate_needed && too_small)
+ continue;
+
+ /* DCC fast clear with MSAA should clear CMASK to 0xC. */
+ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
+ /* TODO: This doesn't work with MSAA. */
+ if (eliminate_needed)
+ continue;
+
+ uint32_t clear_value = 0xCCCCCCCC;
+ si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
+ tex->cmask_offset, tex->surface.cmask_size,
+ &clear_value, 4, SI_COHERENCY_CB_META);
+ fmask_decompress_needed = true;
+ }
+
+ vi_dcc_clear_level(sctx, tex, 0, reset_value);
+ tex->separate_dcc_dirty = true;
+ } else {
+ if (too_small)
+ continue;
+
+ /* 128-bit formats are unsupported */
+ if (tex->surface.bpe > 8) {
+ continue;
+ }
+
+ /* RB+ doesn't work with CMASK fast clear on Stoney. */
+ if (sctx->family == CHIP_STONEY)
+ continue;
+
+ /* ensure CMASK is enabled */
+ si_alloc_separate_cmask(sctx->screen, tex);
+ if (!tex->cmask_buffer)
+ continue;
+
+ /* Do the fast clear. */
+ uint32_t clear_value = 0;
+ si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
+ tex->cmask_offset, tex->surface.cmask_size,
+ &clear_value, 4, SI_COHERENCY_CB_META);
+ eliminate_needed = true;
+ }
+
+ if ((eliminate_needed || fmask_decompress_needed) &&
+ !(tex->dirty_level_mask & (1 << level))) {
+ tex->dirty_level_mask |= 1 << level;
+ p_atomic_inc(&sctx->screen->compressed_colortex_counter);
+ }
+
+ /* We can change the micro tile mode before a full clear. */
+ si_set_optimal_micro_tile_mode(sctx->screen, tex);
+
+ *buffers &= ~clear_bit;
+
+ if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
+ sctx->framebuffer.dirty_cbufs |= 1 << i;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ }
+}
+
+static void si_clear(struct pipe_context *ctx, unsigned buffers,
+ const union pipe_color_union *color,
+ double depth, unsigned stencil)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+ struct pipe_surface *zsbuf = fb->zsbuf;
+ struct si_texture *zstex =
+ zsbuf ? (struct si_texture*)zsbuf->texture : NULL;
+
+ if (buffers & PIPE_CLEAR_COLOR) {
+ si_do_fast_color_clear(sctx, &buffers, color);
+ if (!buffers)
+ return; /* all buffers have been fast cleared */
+
+ /* These buffers cannot use fast clear, make sure to disable expansion. */
+ for (unsigned i = 0; i < fb->nr_cbufs; i++) {
+ struct si_texture *tex;
+
+ /* If not clearing this buffer, skip. */
+ if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
+ continue;
+
+ tex = (struct si_texture *)fb->cbufs[i]->texture;
+ if (tex->surface.fmask_size == 0)
+ tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
+ }
+ }
+
+ if (zstex &&
+ si_htile_enabled(zstex, zsbuf->u.tex.level) &&
+ zsbuf->u.tex.first_layer == 0 &&
+ zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
+ /* TC-compatible HTILE only supports depth clears to 0 or 1. */
+ if (buffers & PIPE_CLEAR_DEPTH &&
+ (!zstex->tc_compatible_htile ||
+ depth == 0 || depth == 1)) {
+ /* Need to disable EXPCLEAR temporarily if clearing
+ * to a new value. */
+ if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
+ sctx->db_depth_disable_expclear = true;
+ }
+
+ if (zstex->depth_clear_value != (float)depth) {
+ /* Update DB_DEPTH_CLEAR. */
+ zstex->depth_clear_value = depth;
+ sctx->framebuffer.dirty_zsbuf = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ sctx->db_depth_clear = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ /* TC-compatible HTILE only supports stencil clears to 0. */
+ if (buffers & PIPE_CLEAR_STENCIL &&
+ (!zstex->tc_compatible_htile || stencil == 0)) {
+ stencil &= 0xff;
+
+ /* Need to disable EXPCLEAR temporarily if clearing
+ * to a new value. */
+ if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
+ sctx->db_stencil_disable_expclear = true;
+ }
+
+ if (zstex->stencil_clear_value != (uint8_t)stencil) {
+ /* Update DB_STENCIL_CLEAR. */
+ zstex->stencil_clear_value = stencil;
+ sctx->framebuffer.dirty_zsbuf = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ sctx->db_stencil_clear = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ /* TODO: Find out what's wrong here. Fast depth clear leads to
+ * corruption in ARK: Survival Evolved, but that may just be
+ * a coincidence and the root cause is elsewhere.
+ *
+	 * Surprisingly, the corruption can be fixed by putting the DB
+	 * flush either before or after the depth clear.
+ *
+ * https://bugs.freedesktop.org/show_bug.cgi?id=102955 (apitrace)
+ *
+ * This hack decreases back-to-back ClearDepth performance.
+ */
+ if ((sctx->db_depth_clear || sctx->db_stencil_clear) &&
+ sctx->screen->clear_db_cache_before_clear)
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+ }
+
+ si_blitter_begin(sctx, SI_CLEAR);
+ util_blitter_clear(sctx->blitter, fb->width, fb->height,
+ util_framebuffer_get_num_layers(fb),
+ buffers, color, depth, stencil);
+ si_blitter_end(sctx);
+
+ if (sctx->db_depth_clear) {
+ sctx->db_depth_clear = false;
+ sctx->db_depth_disable_expclear = false;
+ zstex->depth_cleared = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ if (sctx->db_stencil_clear) {
+ sctx->db_stencil_clear = false;
+ sctx->db_stencil_disable_expclear = false;
+ zstex->stencil_cleared = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+}
+
+static void si_clear_render_target(struct pipe_context *ctx,
+ struct pipe_surface *dst,
+ const union pipe_color_union *color,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ si_blitter_begin(sctx, SI_CLEAR_SURFACE |
+ (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_clear_render_target(sctx->blitter, dst, color,
+ dstx, dsty, width, height);
+ si_blitter_end(sctx);
+}
+
+static void si_clear_depth_stencil(struct pipe_context *ctx,
+ struct pipe_surface *dst,
+ unsigned clear_flags,
+ double depth,
+ unsigned stencil,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ si_blitter_begin(sctx, SI_CLEAR_SURFACE |
+ (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil,
+ dstx, dsty, width, height);
+ si_blitter_end(sctx);
+}
+
+static void si_clear_texture(struct pipe_context *pipe,
+ struct pipe_resource *tex,
+ unsigned level,
+ const struct pipe_box *box,
+ const void *data)
+{
+ struct pipe_screen *screen = pipe->screen;
+ struct si_texture *stex = (struct si_texture*)tex;
+ struct pipe_surface tmpl = {{0}};
+ struct pipe_surface *sf;
+ const struct util_format_description *desc =
+ util_format_description(tex->format);
+
+ tmpl.format = tex->format;
+ tmpl.u.tex.first_layer = box->z;
+ tmpl.u.tex.last_layer = box->z + box->depth - 1;
+ tmpl.u.tex.level = level;
+ sf = pipe->create_surface(pipe, tex, &tmpl);
+ if (!sf)
+ return;
+
+ if (stex->is_depth) {
+ unsigned clear;
+ float depth;
+ uint8_t stencil = 0;
+
+ /* Depth is always present. */
+ clear = PIPE_CLEAR_DEPTH;
+ desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+ if (stex->surface.has_stencil) {
+ clear |= PIPE_CLEAR_STENCIL;
+ desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+ }
+
+ si_clear_depth_stencil(pipe, sf, clear, depth, stencil,
+ box->x, box->y,
+ box->width, box->height, false);
+ } else {
+ union pipe_color_union color;
+
+ /* pipe_color_union requires the full vec4 representation. */
+ if (util_format_is_pure_uint(tex->format))
+ desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1);
+ else if (util_format_is_pure_sint(tex->format))
+ desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1);
+ else
+ desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
+
+ if (screen->is_format_supported(screen, tex->format,
+ tex->target, 0, 0,
+ PIPE_BIND_RENDER_TARGET)) {
+ si_clear_render_target(pipe, sf, &color,
+ box->x, box->y,
+ box->width, box->height, false);
+ } else {
+ /* Software fallback - just for R9G9B9E5_FLOAT */
+ util_clear_render_target(pipe, sf, &color,
+ box->x, box->y,
+ box->width, box->height);
+ }
+ }
+ pipe_surface_reference(&sf, NULL);
+}
+
+void si_init_clear_functions(struct si_context *sctx)
+{
+ sctx->b.clear = si_clear;
+ sctx->b.clear_render_target = si_clear_render_target;
+ sctx->b.clear_depth_stencil = si_clear_depth_stencil;
+ sctx->b.clear_texture = si_clear_texture;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c
new file mode 100644
index 000000000..20e4f591f
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+
+/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
+ * and L2_STREAM for src.
+ */
+static enum si_cache_policy get_cache_policy(struct si_context *sctx,
+ enum si_coherency coher,
+ uint64_t size)
+{
+ if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+ coher == SI_COHERENCY_CP)) ||
+ (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+ return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+
+ return L2_BYPASS;
+}
+
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+ enum si_cache_policy cache_policy)
+{
+ switch (coher) {
+ default:
+ case SI_COHERENCY_NONE:
+ case SI_COHERENCY_CP:
+ return 0;
+ case SI_COHERENCY_SHADER:
+ return SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+ case SI_COHERENCY_CB_META:
+ return SI_CONTEXT_FLUSH_AND_INV_CB;
+ }
+}
+
+static void si_compute_do_clear_or_copy(struct si_context *sctx,
+ struct pipe_resource *dst,
+ unsigned dst_offset,
+ struct pipe_resource *src,
+ unsigned src_offset,
+ unsigned size,
+ const uint32_t *clear_value,
+ unsigned clear_value_size,
+ enum si_coherency coher)
+{
+ struct pipe_context *ctx = &sctx->b;
+
+ assert(src_offset % 4 == 0);
+ assert(dst_offset % 4 == 0);
+ assert(size % 4 == 0);
+
+ assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+ assert(!src || src_offset + size <= src->width0);
+
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+ si_emit_cache_flush(sctx);
+
+ /* Save states. */
+ void *saved_cs = sctx->cs_shader_state.program;
+ struct pipe_shader_buffer saved_sb[2] = {};
+ si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+ /* The memory accesses are coalesced, meaning that the 1st instruction writes
+ * the 1st contiguous block of data for the whole wave, the 2nd instruction
+ * writes the 2nd contiguous block of data, etc.
+ */
+ unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
+ SI_COMPUTE_CLEAR_DW_PER_THREAD;
+ unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+ unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+ unsigned dwords_per_wave = dwords_per_thread * 64;
+
+ unsigned num_dwords = size / 4;
+ unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = MIN2(64, num_instructions);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_offset = dst_offset;
+ sb[0].buffer_size = size;
+
+ if (src) {
+ sb[1].buffer = src;
+ sb[1].buffer_offset = src_offset;
+ sb[1].buffer_size = size;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+ ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+ } else {
+ assert(clear_value_size >= 4 &&
+ clear_value_size <= 16 &&
+ util_is_power_of_two_or_zero(clear_value_size));
+
+ for (unsigned i = 0; i < 4; i++)
+ sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+ ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+ }
+
+ ctx->launch_grid(ctx, &info);
+
+ enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+ (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+
+ if (cache_policy != L2_BYPASS)
+ r600_resource(dst)->TC_L2_dirty = true;
+
+ /* Restore states. */
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+}
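
For reference, the dispatch sizing in si_compute_do_clear_or_copy() is plain integer arithmetic. The standalone sketch below (not part of the radeonsi sources) assumes 4 dwords per thread; the real values come from SI_COMPUTE_CLEAR_DW_PER_THREAD / SI_COMPUTE_COPY_DW_PER_THREAD in si_pipe.h:

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned size = 1u << 20;         /* clear 1 MiB */
	unsigned dwords_per_thread = 4;   /* assumed; see si_pipe.h */
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	unsigned block_x = MIN2(64, num_instructions);               /* threads per group */
	unsigned grid_x = DIV_ROUND_UP(num_dwords, dwords_per_wave);  /* groups */

	printf("block=%u grid=%u -> %u dwords covered\n",
	       block_x, grid_x, grid_x * dwords_per_wave);
	return 0;
}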
+
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, uint32_t *clear_value,
+ uint32_t clear_value_size, enum si_coherency coher)
+{
+ if (!size)
+ return;
+
+ unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+ assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+ assert(offset % clear_alignment == 0);
+ assert(size % clear_alignment == 0);
+ assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+ /* Reduce a large clear value size if possible. */
+ if (clear_value_size > 4) {
+ bool clear_dword_duplicated = true;
+
+ /* See if we can lower large fills to dword fills. */
+ for (unsigned i = 1; i < clear_value_size / 4; i++) {
+ if (clear_value[0] != clear_value[i]) {
+ clear_dword_duplicated = false;
+ break;
+ }
+ }
+ if (clear_dword_duplicated)
+ clear_value_size = 4;
+ }
+
+ /* Expand a small clear value size. */
+ uint32_t tmp_clear_value;
+ if (clear_value_size <= 2) {
+ if (clear_value_size == 1) {
+ tmp_clear_value = *(uint8_t*)clear_value;
+ tmp_clear_value |= (tmp_clear_value << 8) |
+ (tmp_clear_value << 16) |
+ (tmp_clear_value << 24);
+ } else {
+ tmp_clear_value = *(uint16_t*)clear_value;
+ tmp_clear_value |= tmp_clear_value << 16;
+ }
+ clear_value = &tmp_clear_value;
+ clear_value_size = 4;
+ }
+
+ /* Use transform feedback for 12-byte clears. */
+ /* TODO: Use compute. */
+ if (clear_value_size == 12) {
+ union pipe_color_union streamout_clear_value;
+
+ memcpy(&streamout_clear_value, clear_value, clear_value_size);
+ si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
+ util_blitter_clear_buffer(sctx->blitter, dst, offset,
+ size, clear_value_size / 4,
+ &streamout_clear_value);
+ si_blitter_end(sctx);
+ return;
+ }
+
+ uint64_t aligned_size = size & ~3ull;
+ if (aligned_size >= 4) {
+ /* Before GFX9, CP DMA was very slow when clearing GTT, so never
+ * use CP DMA clears on those chips, because we can't be certain
+ * about buffer placements.
+ */
+ if (clear_value_size > 4 ||
+ (clear_value_size == 4 &&
+ offset % 4 == 0 &&
+ (size > 32*1024 || sctx->chip_class <= VI))) {
+ si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
+ aligned_size, clear_value,
+ clear_value_size, coher);
+ } else {
+ assert(clear_value_size == 4);
+ si_cp_dma_clear_buffer(sctx, dst, offset,
+ aligned_size, *clear_value, coher,
+ get_cache_policy(sctx, coher, size));
+ }
+
+ offset += aligned_size;
+ size -= aligned_size;
+ }
+
+ /* Handle non-dword alignment. */
+ if (size) {
+ assert(dst);
+ assert(dst->target == PIPE_BUFFER);
+ assert(size < 4);
+
+ pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+ }
+}
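
The clear-value normalization at the top of si_clear_buffer() (collapsing duplicated dwords, widening 1- and 2-byte values) can be exercised on its own. A small standalone sketch, not part of the diff:

#include <stdint.h>
#include <stdio.h>

/* Widen a 1- or 2-byte clear value to a full dword, the same way
 * si_clear_buffer() does before picking a fill path. */
static uint32_t widen_clear_value(const void *value, unsigned size)
{
	if (size == 1) {
		uint32_t v = *(const uint8_t *)value;
		return v | (v << 8) | (v << 16) | (v << 24);
	}
	if (size == 2) {
		uint32_t v = *(const uint16_t *)value;
		return v | (v << 16);
	}
	return *(const uint32_t *)value;	/* already a dword */
}

int main(void)
{
	uint8_t byte = 0xab;
	uint16_t half = 0x1234;
	uint32_t word = 0xdeadbeef;
	printf("0x%08x 0x%08x 0x%08x\n",
	       (unsigned)widen_clear_value(&byte, 1),
	       (unsigned)widen_clear_value(&half, 2),
	       (unsigned)widen_clear_value(&word, 4));
	return 0;
}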
+
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned offset, unsigned size,
+ const void *clear_value,
+ int clear_value_size)
+{
+ enum si_coherency coher;
+
+ if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
+ coher = SI_COHERENCY_CP;
+ else
+ coher = SI_COHERENCY_SHADER;
+
+ si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
+ clear_value_size, coher);
+}
+
+void si_copy_buffer(struct si_context *sctx,
+ struct pipe_resource *dst, struct pipe_resource *src,
+ uint64_t dst_offset, uint64_t src_offset, unsigned size)
+{
+ if (!size)
+ return;
+
+ enum si_coherency coher = SI_COHERENCY_SHADER;
+ enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+ /* Only use compute for VRAM copies on dGPUs. */
+ if (sctx->screen->info.has_dedicated_vram &&
+ r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+ r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+ size > 32 * 1024 &&
+ dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
+ si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
+ size, NULL, 0, coher);
+ } else {
+ si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+ 0, coher, cache_policy);
+ }
+}
+
+void si_init_compute_blit_functions(struct si_context *sctx)
+{
+ sctx->b.clear_buffer = si_pipe_clear_buffer;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c b/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c
new file mode 100644
index 000000000..ffa2f5ae6
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+
+static void si_dma_emit_wait_idle(struct si_context *sctx)
+{
+ struct radeon_cmdbuf *cs = sctx->dma_cs;
+
+ /* NOP waits for idle. */
+ if (sctx->chip_class >= CIK)
+ radeon_emit(cs, 0x00000000); /* NOP */
+ else
+ radeon_emit(cs, 0xf0000000); /* NOP */
+}
+
+void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
+ uint64_t offset)
+{
+ struct radeon_cmdbuf *cs = sctx->dma_cs;
+ uint64_t va = dst->gpu_address + offset;
+
+ if (sctx->chip_class == SI) {
+ unreachable("SI DMA doesn't support the timestamp packet.");
+ return;
+ }
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(&dst->valid_buffer_range, offset, offset + 8);
+
+ assert(va % 8 == 0);
+
+ si_need_dma_space(sctx, 4, dst, NULL);
+ si_dma_emit_wait_idle(sctx);
+
+ radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
+ SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
+ 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+}
+
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, unsigned clear_value)
+{
+ struct radeon_cmdbuf *cs = sctx->dma_cs;
+ unsigned i, ncopy, csize;
+ struct r600_resource *rdst = r600_resource(dst);
+
+ assert(offset % 4 == 0);
+ assert(size);
+ assert(size % 4 == 0);
+
+ if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+ sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
+ return;
+ }
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(&rdst->valid_buffer_range, offset, offset + size);
+
+ offset += rdst->gpu_address;
+
+ if (sctx->chip_class == SI) {
+ /* the same maximum size as for copying */
+ ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+ si_need_dma_space(sctx, ncopy * 4, rdst, NULL);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+ radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
+ csize / 4));
+ radeon_emit(cs, offset);
+ radeon_emit(cs, clear_value);
+ radeon_emit(cs, (offset >> 32) << 16);
+ offset += csize;
+ size -= csize;
+ }
+ return;
+ }
+
+ /* The following code is for CI, VI, Vega/Raven, etc. */
+ /* the same maximum size as for copying */
+ ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+ si_need_dma_space(sctx, ncopy * 5, rdst, NULL);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+ radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
+ 0x8000 /* dword copy */));
+ radeon_emit(cs, offset);
+ radeon_emit(cs, offset >> 32);
+ radeon_emit(cs, clear_value);
+ radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
+ offset += csize;
+ size -= csize;
+ }
+}
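
Both SDMA paths above split a large fill into a fixed number of packets, each covering at most one hardware-limited chunk. A standalone sketch of that chunking; the per-packet limit used here is an assumption, the real limits are SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE and CIK_SDMA_COPY_MAX_SIZE:

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long long offset = 0, size = 10ull << 20;	/* 10 MiB fill */
	unsigned max_chunk = 2u << 20;				/* assumed per-packet limit */
	unsigned ncopy = DIV_ROUND_UP(size, max_chunk);

	for (unsigned i = 0; i < ncopy; i++) {
		unsigned csize = MIN2(size, max_chunk);
		printf("packet %u: offset=%llu size=%u\n", i, offset, csize);
		offset += csize;
		size -= csize;
	}
	return 0;
}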
+
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
+ struct r600_resource *dst, struct r600_resource *src)
+{
+ uint64_t vram = ctx->dma_cs->used_vram;
+ uint64_t gtt = ctx->dma_cs->used_gart;
+
+ if (dst) {
+ vram += dst->vram_usage;
+ gtt += dst->gart_usage;
+ }
+ if (src) {
+ vram += src->vram_usage;
+ gtt += src->gart_usage;
+ }
+
+ /* Flush the GFX IB if DMA depends on it. */
+ if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+ ((dst &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
+ RADEON_USAGE_READWRITE)) ||
+ (src &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
+ RADEON_USAGE_WRITE))))
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+ /* Flush if there's not enough space, or if the memory usage per IB
+ * is too large.
+ *
+ * IBs using too little memory are limited by the IB submission overhead.
+ * IBs using too much memory are limited by the kernel/TTM overhead.
+ * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+ *
+ * This heuristic makes sure that DMA requests are executed
+ * very soon after the call is made and lowers memory usage.
+ * It improves texture upload performance by keeping the DMA
+ * engine busy while uploads are being submitted.
+ */
+ num_dw++; /* for emit_wait_idle below */
+ if (!ctx->ws->cs_check_space(ctx->dma_cs, num_dw) ||
+ ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+ !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+ si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
+ }
+
+ /* Wait for idle if either buffer has been used in the IB before to
+ * prevent read-after-write hazards.
+ */
+ if ((dst &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
+ RADEON_USAGE_READWRITE)) ||
+ (src &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
+ RADEON_USAGE_WRITE)))
+ si_dma_emit_wait_idle(ctx);
+
+ if (dst) {
+ radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
+ RADEON_USAGE_WRITE, 0);
+ }
+ if (src) {
+ radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
+ RADEON_USAGE_READ, 0);
+ }
+
+	/* This function is called before all DMA calls, so increment this. */
+ ctx->num_dma_calls++;
+}
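
The heuristic described in the comment above ("flush if there's not enough space, or if the memory usage per IB is too large") reduces to a simple predicate. A toy standalone version; the 64 MiB per-IB budget matches the code, the other numbers are invented for the example:

#include <stdbool.h>
#include <stdio.h>

/* Toy version of the DMA IB flush decision: flush when the command
 * stream would run out of space or the IB references too much memory. */
static bool should_flush(unsigned cdw, unsigned max_dw, unsigned num_dw,
			 unsigned long long ib_mem_bytes)
{
	return cdw + num_dw > max_dw ||
	       ib_mem_bytes > 64ull * 1024 * 1024;
}

int main(void)
{
	printf("%d\n", should_flush(1000, 1024, 40, 8ull << 20));    /* 1: no space left */
	printf("%d\n", should_flush(100, 16384, 40, 128ull << 20));  /* 1: over the budget */
	printf("%d\n", should_flush(100, 16384, 40, 8ull << 20));    /* 0: keep appending */
	return 0;
}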
+
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
+ struct pipe_fence_handle **fence)
+{
+ struct radeon_cmdbuf *cs = ctx->dma_cs;
+ struct radeon_saved_cs saved;
+ bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
+
+ if (!radeon_emitted(cs, 0)) {
+ if (fence)
+ ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+ return;
+ }
+
+ if (check_vm)
+ si_save_cs(ctx->ws, cs, &saved, true);
+
+ ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
+ if (fence)
+ ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+
+ if (check_vm) {
+		/* Use a conservative timeout of 800 ms, after which we stop
+		 * waiting and assume the GPU is hung.
+		 */
+ ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);
+
+ si_check_vm_faults(ctx, &saved, RING_DMA);
+ si_clear_saved_cs(&saved);
+ }
+}
+
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, unsigned value)
+{
+ struct si_context *ctx = (struct si_context*)sscreen->aux_context;
+
+ mtx_lock(&sscreen->aux_context_lock);
+ si_sdma_clear_buffer(ctx, dst, offset, size, value);
+ sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+ mtx_unlock(&sscreen->aux_context_lock);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h b/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
index 532151125..38f5c3dc7 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_driinfo.h
@@ -16,9 +16,11 @@ DRI_CONF_BEGIN
DRI_CONF_MESA_NO_ERROR("false")
DRI_CONF_DISABLE_EXT_BUFFER_AGE("false")
DRI_CONF_DISABLE_OML_SYNC_CONTROL("false")
+ DRI_CONF_DISABLE_SGI_VIDEO_SYNC("false")
DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+ DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
DRI_CONF_SECTION_END
DRI_CONF_SECTION_QUALITY
DRI_CONF_PP_CELSHADE(0)
@@ -32,18 +34,22 @@ DRI_CONF_BEGIN
DRI_CONF_FORCE_GLSL_EXTENSIONS_WARN("false")
DRI_CONF_DISABLE_GLSL_LINE_CONTINUATIONS("false")
DRI_CONF_DISABLE_BLEND_FUNC_EXTENDED("false")
- DRI_CONF_DISABLE_SHADER_BIT_ENCODING("false")
DRI_CONF_FORCE_GLSL_VERSION(0)
DRI_CONF_ALLOW_GLSL_EXTENSION_DIRECTIVE_MIDSHADER("false")
+ DRI_CONF_ALLOW_GLSL_BUILTIN_CONST_EXPRESSION("false")
+ DRI_CONF_ALLOW_GLSL_RELAXED_ES("false")
DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION("false")
DRI_CONF_ALLOW_GLSL_CROSS_STAGE_INTERPOLATION_MISMATCH("false")
DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION("false")
DRI_CONF_FORCE_GLSL_ABS_SQRT("false")
DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD("false")
+ DRI_CONF_ALLOW_GLSL_LAYOUT_QUALIFIER_ON_FUNCTION_PARAMETERS("false")
+ DRI_CONF_FORCE_COMPAT_PROFILE("false")
DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false")
DRI_CONF_SECTION_END
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false")
DRI_CONF_GLSL_ZERO_INIT("false")
+ DRI_CONF_ALLOW_RGB10_CONFIGS("true")
DRI_CONF_SECTION_END
DRI_CONF_END
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c b/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c
new file mode 100644
index 000000000..3f22ee31a
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_fence.c
@@ -0,0 +1,656 @@
+/*
+ * Copyright 2013-2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <libsync.h>
+
+#include "util/os_time.h"
+#include "util/u_memory.h"
+#include "util/u_queue.h"
+#include "util/u_upload_mgr.h"
+
+#include "si_build_pm4.h"
+
+struct si_fine_fence {
+ struct r600_resource *buf;
+ unsigned offset;
+};
+
+struct si_multi_fence {
+ struct pipe_reference reference;
+ struct pipe_fence_handle *gfx;
+ struct pipe_fence_handle *sdma;
+ struct tc_unflushed_batch_token *tc_token;
+ struct util_queue_fence ready;
+
+ /* If the context wasn't flushed at fence creation, this is non-NULL. */
+ struct {
+ struct si_context *ctx;
+ unsigned ib_index;
+ } gfx_unflushed;
+
+ struct si_fine_fence fine;
+};
+
+/**
+ * Write an EOP event.
+ *
+ * \param event EVENT_TYPE_*
+ * \param event_flags Optional cache flush flags (TC)
+ * \param dst_sel MEM or TC_L2
+ * \param int_sel NONE or SEND_DATA_AFTER_WR_CONFIRM
+ * \param data_sel DISCARD, VALUE_32BIT, TIMESTAMP, or GDS
+ * \param buf Buffer
+ * \param va GPU address
+ * \param new_fence	Fence value to write for this event.
+ * \param query_type	Query type that triggered the event; used to decide
+ *			whether the GFX9 EOP bug workaround is needed.
+ */
+void si_cp_release_mem(struct si_context *ctx,
+ unsigned event, unsigned event_flags,
+ unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+ struct r600_resource *buf, uint64_t va,
+ uint32_t new_fence, unsigned query_type)
+{
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ unsigned op = EVENT_TYPE(event) |
+ EVENT_INDEX(event == V_028A90_CS_DONE ||
+ event == V_028A90_PS_DONE ? 6 : 5) |
+ event_flags;
+ unsigned sel = EOP_DST_SEL(dst_sel) |
+ EOP_INT_SEL(int_sel) |
+ EOP_DATA_SEL(data_sel);
+
+ if (ctx->chip_class >= GFX9) {
+ /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+ * counters) must immediately precede every timestamp event to
+ * prevent a GPU hang on GFX9.
+ *
+ * Occlusion queries don't need to do it here, because they
+ * always do ZPASS_DONE before the timestamp.
+ */
+ if (ctx->chip_class == GFX9 &&
+ query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
+ query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
+ query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ struct r600_resource *scratch = ctx->eop_bug_scratch;
+
+ assert(16 * ctx->screen->info.num_render_backends <=
+ scratch->b.b.width0);
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, scratch->gpu_address);
+ radeon_emit(cs, scratch->gpu_address >> 32);
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
+ RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ }
+
+ radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, sel);
+ radeon_emit(cs, va); /* address lo */
+ radeon_emit(cs, va >> 32); /* address hi */
+ radeon_emit(cs, new_fence); /* immediate data lo */
+ radeon_emit(cs, 0); /* immediate data hi */
+ radeon_emit(cs, 0); /* unused */
+ } else {
+ if (ctx->chip_class == CIK ||
+ ctx->chip_class == VI) {
+ struct r600_resource *scratch = ctx->eop_bug_scratch;
+ uint64_t va = scratch->gpu_address;
+
+ /* Two EOP events are required to make all engines go idle
+ * (and optional cache flushes executed) before the timestamp
+ * is written.
+ */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+ radeon_emit(cs, 0); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
+ RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ }
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+ radeon_emit(cs, new_fence); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+ }
+
+ if (buf) {
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+ }
+}
+
+unsigned si_cp_write_fence_dwords(struct si_screen *screen)
+{
+ unsigned dwords = 6;
+
+ if (screen->info.chip_class == CIK ||
+ screen->info.chip_class == VI)
+ dwords *= 2;
+
+ return dwords;
+}
+
+void si_cp_wait_mem(struct si_context *ctx,
+ uint64_t va, uint32_t ref, uint32_t mask, unsigned flags)
+{
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1) | flags);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit(cs, ref); /* reference value */
+ radeon_emit(cs, mask); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
+}
+
+static void si_add_fence_dependency(struct si_context *sctx,
+ struct pipe_fence_handle *fence)
+{
+ struct radeon_winsys *ws = sctx->ws;
+
+ if (sctx->dma_cs)
+ ws->cs_add_fence_dependency(sctx->dma_cs, fence);
+ ws->cs_add_fence_dependency(sctx->gfx_cs, fence);
+}
+
+static void si_add_syncobj_signal(struct si_context *sctx,
+ struct pipe_fence_handle *fence)
+{
+ sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
+}
+
+static void si_fence_reference(struct pipe_screen *screen,
+ struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
+{
+ struct radeon_winsys *ws = ((struct si_screen*)screen)->ws;
+ struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
+ struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
+
+ if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+ ws->fence_reference(&(*rdst)->gfx, NULL);
+ ws->fence_reference(&(*rdst)->sdma, NULL);
+ tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
+ r600_resource_reference(&(*rdst)->fine.buf, NULL);
+ FREE(*rdst);
+ }
+ *rdst = rsrc;
+}
+
+static struct si_multi_fence *si_create_multi_fence()
+{
+ struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+ if (!fence)
+ return NULL;
+
+ pipe_reference_init(&fence->reference, 1);
+ util_queue_fence_init(&fence->ready);
+
+ return fence;
+}
+
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+ struct tc_unflushed_batch_token *tc_token)
+{
+ struct si_multi_fence *fence = si_create_multi_fence();
+ if (!fence)
+ return NULL;
+
+ util_queue_fence_reset(&fence->ready);
+ tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
+
+ return (struct pipe_fence_handle *)fence;
+}
+
+static bool si_fine_fence_signaled(struct radeon_winsys *rws,
+ const struct si_fine_fence *fine)
+{
+ char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!map)
+ return false;
+
+ uint32_t *fence = (uint32_t*)(map + fine->offset);
+ return *fence != 0;
+}
+
+static void si_fine_fence_set(struct si_context *ctx,
+ struct si_fine_fence *fine,
+ unsigned flags)
+{
+ uint32_t *fence_ptr;
+
+ assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
+
+ /* Use uncached system memory for the fence. */
+ u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4,
+ &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
+ if (!fine->buf)
+ return;
+
+ *fence_ptr = 0;
+
+ uint64_t fence_va = fine->buf->gpu_address + fine->offset;
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
+ RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+ radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+ S_370_WR_CONFIRM(1) |
+ S_370_ENGINE_SEL(V_370_PFP));
+ radeon_emit(cs, fence_va);
+ radeon_emit(cs, fence_va >> 32);
+ radeon_emit(cs, 0x80000000);
+ } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
+ si_cp_release_mem(ctx,
+ V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_VALUE_32BIT,
+ NULL, fence_va, 0x80000000,
+ PIPE_QUERY_GPU_FINISHED);
+ } else {
+ assert(false);
+ }
+}
+
+static boolean si_fence_finish(struct pipe_screen *screen,
+ struct pipe_context *ctx,
+ struct pipe_fence_handle *fence,
+ uint64_t timeout)
+{
+ struct radeon_winsys *rws = ((struct si_screen*)screen)->ws;
+ struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+ struct si_context *sctx;
+ int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+ ctx = threaded_context_unwrap_sync(ctx);
+ sctx = (struct si_context*)(ctx ? ctx : NULL);
+
+ if (!util_queue_fence_is_signalled(&rfence->ready)) {
+ if (rfence->tc_token) {
+ /* Ensure that si_flush_from_st will be called for
+ * this fence, but only if we're in the API thread
+ * where the context is current.
+ *
+ * Note that the batch containing the flush may already
+ * be in flight in the driver thread, so the fence
+ * may not be ready yet when this call returns.
+ */
+ threaded_context_flush(ctx, rfence->tc_token,
+ timeout == 0);
+ }
+
+ if (!timeout)
+ return false;
+
+ if (timeout == PIPE_TIMEOUT_INFINITE) {
+ util_queue_fence_wait(&rfence->ready);
+ } else {
+ if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout))
+ return false;
+ }
+
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (rfence->sdma) {
+ if (!rws->fence_wait(rws, rfence->sdma, timeout))
+ return false;
+
+ /* Recompute the timeout after waiting. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (!rfence->gfx)
+ return true;
+
+ if (rfence->fine.buf &&
+ si_fine_fence_signaled(rws, &rfence->fine)) {
+ rws->fence_reference(&rfence->gfx, NULL);
+ r600_resource_reference(&rfence->fine.buf, NULL);
+ return true;
+ }
+
+ /* Flush the gfx IB if it hasn't been flushed yet. */
+ if (sctx && rfence->gfx_unflushed.ctx == sctx &&
+ rfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
+ /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
+ * spec says:
+ *
+ * "If the sync object being blocked upon will not be
+ * signaled in finite time (for example, by an associated
+ * fence command issued previously, but not yet flushed to
+ * the graphics pipeline), then ClientWaitSync may hang
+ * forever. To help prevent this behavior, if
+ * ClientWaitSync is called and all of the following are
+ * true:
+ *
+ * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
+ * * sync is unsignaled when ClientWaitSync is called,
+ * * and the calls to ClientWaitSync and FenceSync were
+ * issued from the same context,
+ *
+ * then the GL will behave as if the equivalent of Flush
+ * were inserted immediately after the creation of sync."
+ *
+ * This means we need to flush for such fences even when we're
+ * not going to wait.
+ */
+ si_flush_gfx_cs(sctx,
+ (timeout ? 0 : PIPE_FLUSH_ASYNC) |
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+ NULL);
+ rfence->gfx_unflushed.ctx = NULL;
+
+ if (!timeout)
+ return false;
+
+ /* Recompute the timeout after all that. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (rws->fence_wait(rws, rfence->gfx, timeout))
+ return true;
+
+ /* Re-check in case the GPU is slow or hangs, but the commands before
+ * the fine-grained fence have completed. */
+ if (rfence->fine.buf &&
+ si_fine_fence_signaled(rws, &rfence->fine))
+ return true;
+
+ return false;
+}
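
si_fence_finish() converts the caller's relative timeout into an absolute deadline once and then recomputes the remaining budget after every blocking step (threaded-context flush, SDMA wait, GFX flush). A standalone sketch of that pattern, with a plain integer standing in for os_time_get_nano():

#include <stdint.h>
#include <stdio.h>

/* Remaining time (ns) until an absolute deadline, clamped to 0. */
static uint64_t remaining_timeout(int64_t abs_deadline, int64_t now_ns)
{
	return abs_deadline > now_ns ? (uint64_t)(abs_deadline - now_ns) : 0;
}

int main(void)
{
	int64_t now = 1000000000;		/* pretend "current time" in ns */
	int64_t deadline = now + 5000000;	/* 5 ms budget */

	/* After a 3 ms step, 2 ms remain; after 6 ms, the budget is gone. */
	printf("%llu\n", (unsigned long long)remaining_timeout(deadline, now + 3000000));
	printf("%llu\n", (unsigned long long)remaining_timeout(deadline, now + 6000000));
	return 0;
}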
+
+static void si_create_fence_fd(struct pipe_context *ctx,
+ struct pipe_fence_handle **pfence, int fd,
+ enum pipe_fd_type type)
+{
+ struct si_screen *sscreen = (struct si_screen*)ctx->screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct si_multi_fence *rfence;
+
+ *pfence = NULL;
+
+ rfence = si_create_multi_fence();
+ if (!rfence)
+ return;
+
+ switch (type) {
+ case PIPE_FD_TYPE_NATIVE_SYNC:
+ if (!sscreen->info.has_fence_to_handle)
+ goto finish;
+
+ rfence->gfx = ws->fence_import_sync_file(ws, fd);
+ break;
+
+ case PIPE_FD_TYPE_SYNCOBJ:
+ if (!sscreen->info.has_syncobj)
+ goto finish;
+
+ rfence->gfx = ws->fence_import_syncobj(ws, fd);
+ break;
+
+ default:
+ unreachable("bad fence fd type when importing");
+ }
+
+finish:
+ if (!rfence->gfx) {
+ FREE(rfence);
+ return;
+ }
+
+ *pfence = (struct pipe_fence_handle*)rfence;
+}
+
+static int si_fence_get_fd(struct pipe_screen *screen,
+ struct pipe_fence_handle *fence)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+ int gfx_fd = -1, sdma_fd = -1;
+
+ if (!sscreen->info.has_fence_to_handle)
+ return -1;
+
+ util_queue_fence_wait(&rfence->ready);
+
+ /* Deferred fences aren't supported. */
+ assert(!rfence->gfx_unflushed.ctx);
+ if (rfence->gfx_unflushed.ctx)
+ return -1;
+
+ if (rfence->sdma) {
+ sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
+ if (sdma_fd == -1)
+ return -1;
+ }
+ if (rfence->gfx) {
+ gfx_fd = ws->fence_export_sync_file(ws, rfence->gfx);
+ if (gfx_fd == -1) {
+ if (sdma_fd != -1)
+ close(sdma_fd);
+ return -1;
+ }
+ }
+
+ /* If we don't have FDs at this point, it means we don't have fences
+ * either. */
+ if (sdma_fd == -1 && gfx_fd == -1)
+ return ws->export_signalled_sync_file(ws);
+ if (sdma_fd == -1)
+ return gfx_fd;
+ if (gfx_fd == -1)
+ return sdma_fd;
+
+ /* Get a fence that will be a combination of both fences. */
+ sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
+ close(sdma_fd);
+ return gfx_fd;
+}
+
+static void si_flush_from_st(struct pipe_context *ctx,
+ struct pipe_fence_handle **fence,
+ unsigned flags)
+{
+ struct pipe_screen *screen = ctx->screen;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct radeon_winsys *ws = sctx->ws;
+ struct pipe_fence_handle *gfx_fence = NULL;
+ struct pipe_fence_handle *sdma_fence = NULL;
+ bool deferred_fence = false;
+ struct si_fine_fence fine = {};
+ unsigned rflags = PIPE_FLUSH_ASYNC;
+
+ if (flags & PIPE_FLUSH_END_OF_FRAME)
+ rflags |= PIPE_FLUSH_END_OF_FRAME;
+
+ if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
+ assert(flags & PIPE_FLUSH_DEFERRED);
+ assert(fence);
+
+ si_fine_fence_set(sctx, &fine, flags);
+ }
+
+ /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
+ if (sctx->dma_cs)
+ si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
+
+ if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
+ if (fence)
+ ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
+ if (!(flags & PIPE_FLUSH_DEFERRED))
+ ws->cs_sync_flush(sctx->gfx_cs);
+ } else {
+ /* Instead of flushing, create a deferred fence. Constraints:
+ * - The state tracker must allow a deferred flush.
+ * - The state tracker must request a fence.
+ * - fence_get_fd is not allowed.
+ * Thread safety in fence_finish must be ensured by the state tracker.
+ */
+ if (flags & PIPE_FLUSH_DEFERRED &&
+ !(flags & PIPE_FLUSH_FENCE_FD) &&
+ fence) {
+ gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
+ deferred_fence = true;
+ } else {
+ si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
+ }
+ }
+
+ /* Both engines can signal out of order, so we need to keep both fences. */
+ if (fence) {
+ struct si_multi_fence *multi_fence;
+
+ if (flags & TC_FLUSH_ASYNC) {
+ multi_fence = (struct si_multi_fence *)*fence;
+ assert(multi_fence);
+ } else {
+ multi_fence = si_create_multi_fence();
+ if (!multi_fence) {
+ ws->fence_reference(&sdma_fence, NULL);
+ ws->fence_reference(&gfx_fence, NULL);
+ goto finish;
+ }
+
+ screen->fence_reference(screen, fence, NULL);
+ *fence = (struct pipe_fence_handle*)multi_fence;
+ }
+
+ /* If both fences are NULL, fence_finish will always return true. */
+ multi_fence->gfx = gfx_fence;
+ multi_fence->sdma = sdma_fence;
+
+ if (deferred_fence) {
+ multi_fence->gfx_unflushed.ctx = sctx;
+ multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+ }
+
+ multi_fence->fine = fine;
+ fine.buf = NULL;
+
+ if (flags & TC_FLUSH_ASYNC) {
+ util_queue_fence_signal(&multi_fence->ready);
+ tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+ }
+ }
+ assert(!fine.buf);
+finish:
+ if (!(flags & PIPE_FLUSH_DEFERRED)) {
+ if (sctx->dma_cs)
+ ws->cs_sync_flush(sctx->dma_cs);
+ ws->cs_sync_flush(sctx->gfx_cs);
+ }
+}
+
+static void si_fence_server_signal(struct pipe_context *ctx,
+ struct pipe_fence_handle *fence)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+
+ /* We should have at least one syncobj to signal */
+ assert(rfence->sdma || rfence->gfx);
+
+ if (rfence->sdma)
+ si_add_syncobj_signal(sctx, rfence->sdma);
+ if (rfence->gfx)
+ si_add_syncobj_signal(sctx, rfence->gfx);
+
+ /**
+ * The spec does not require a flush here. We insert a flush
+ * because syncobj based signals are not directly placed into
+ * the command stream. Instead the signal happens when the
+ * submission associated with the syncobj finishes execution.
+ *
+ * Therefore, we must make sure that we flush the pipe to avoid
+ * new work being emitted and getting executed before the signal
+ * operation.
+ */
+ si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+}
+
+static void si_fence_server_sync(struct pipe_context *ctx,
+ struct pipe_fence_handle *fence)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+
+ util_queue_fence_wait(&rfence->ready);
+
+ /* Unflushed fences from the same context are no-ops. */
+ if (rfence->gfx_unflushed.ctx &&
+ rfence->gfx_unflushed.ctx == sctx)
+ return;
+
+	/* Unflushed commands will not start execution before this fence
+	 * dependency is signalled.
+	 *
+	 * Therefore we must flush before inserting the dependency.
+	 */
+ si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+
+ if (rfence->sdma)
+ si_add_fence_dependency(sctx, rfence->sdma);
+ if (rfence->gfx)
+ si_add_fence_dependency(sctx, rfence->gfx);
+}
+
+void si_init_fence_functions(struct si_context *ctx)
+{
+ ctx->b.flush = si_flush_from_st;
+ ctx->b.create_fence_fd = si_create_fence_fd;
+ ctx->b.fence_server_sync = si_fence_server_sync;
+ ctx->b.fence_server_signal = si_fence_server_signal;
+}
+
+void si_init_screen_fence_functions(struct si_screen *screen)
+{
+ screen->b.fence_finish = si_fence_finish;
+ screen->b.fence_reference = si_fence_reference;
+ screen->b.fence_get_fd = si_fence_get_fd;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_get.c b/lib/mesa/src/gallium/drivers/radeonsi/si_get.c
new file mode 100644
index 000000000..91f38329d
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_get.c
@@ -0,0 +1,1004 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "radeon/radeon_video.h"
+#include "radeon/radeon_vce.h"
+#include "radeon/radeon_uvd_enc.h"
+#include "ac_llvm_util.h"
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
+#include "util/u_screen.h"
+#include "util/u_video.h"
+#include "compiler/nir/nir.h"
+
+#include <sys/utsname.h>
+
+static const char *si_get_vendor(struct pipe_screen *pscreen)
+{
+ /* Don't change this. Games such as Alien Isolation are broken if this
+ * returns "Advanced Micro Devices, Inc."
+ */
+ return "X.Org";
+}
+
+static const char *si_get_device_vendor(struct pipe_screen *pscreen)
+{
+ return "AMD";
+}
+
+static const char *si_get_marketing_name(struct radeon_winsys *ws)
+{
+ if (!ws->get_chip_name)
+ return NULL;
+ return ws->get_chip_name(ws);
+}
+
+static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+ switch (param) {
+ /* Supported features (boolean caps). */
+ case PIPE_CAP_ACCELERATED:
+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ case PIPE_CAP_POINT_SPRITE:
+ case PIPE_CAP_OCCLUSION_QUERY:
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+ case PIPE_CAP_TEXTURE_SWIZZLE:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
+ case PIPE_CAP_SHADER_STENCIL_EXPORT:
+ case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+ case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+ case PIPE_CAP_SM3:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP:
+ case PIPE_CAP_PRIMITIVE_RESTART:
+ case PIPE_CAP_CONDITIONAL_RENDER:
+ case PIPE_CAP_TEXTURE_BARRIER:
+ case PIPE_CAP_INDEP_BLEND_ENABLE:
+ case PIPE_CAP_INDEP_BLEND_FUNC:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+ case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+ case PIPE_CAP_START_INSTANCE:
+ case PIPE_CAP_NPOT_TEXTURES:
+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+ case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
+ case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+ case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+ case PIPE_CAP_TGSI_INSTANCEID:
+ case PIPE_CAP_COMPUTE:
+ case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+ case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+ case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+ case PIPE_CAP_CUBE_MAP_ARRAY:
+ case PIPE_CAP_SAMPLE_SHADING:
+ case PIPE_CAP_DRAW_INDIRECT:
+ case PIPE_CAP_CLIP_HALFZ:
+ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+ case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+ case PIPE_CAP_TGSI_TEXCOORD:
+ case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ case PIPE_CAP_SHAREABLE_SHADERS:
+ case PIPE_CAP_DEPTH_BOUNDS_TEST:
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ case PIPE_CAP_TEXTURE_QUERY_LOD:
+ case PIPE_CAP_TEXTURE_GATHER_SM5:
+ case PIPE_CAP_TGSI_TXQS:
+ case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+ case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+ case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+ case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+ case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+ case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
+ case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_CULL_DISTANCE:
+ case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+ case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+ case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+ case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+ case PIPE_CAP_DOUBLES:
+ case PIPE_CAP_TGSI_TEX_TXF_LZ:
+ case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+ case PIPE_CAP_BINDLESS_TEXTURE:
+ case PIPE_CAP_QUERY_TIMESTAMP:
+ case PIPE_CAP_QUERY_TIME_ELAPSED:
+ case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
+ case PIPE_CAP_QUERY_SO_OVERFLOW:
+ case PIPE_CAP_MEMOBJ:
+ case PIPE_CAP_LOAD_CONSTBUF:
+ case PIPE_CAP_INT64:
+ case PIPE_CAP_INT64_DIVMOD:
+ case PIPE_CAP_TGSI_CLOCK:
+ case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+ case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+ case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
+ case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+ case PIPE_CAP_TGSI_BALLOT:
+ case PIPE_CAP_TGSI_VOTE:
+ case PIPE_CAP_TGSI_FS_FBFETCH:
+ return 1;
+
+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
+
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+ return sscreen->info.has_gpu_reset_status_query ||
+ sscreen->info.has_gpu_reset_counter_query;
+
+ case PIPE_CAP_TEXTURE_MULTISAMPLE:
+ return sscreen->info.has_2d_tiling;
+
+ case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+ return SI_MAP_BUFFER_ALIGNMENT;
+
+ case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+ case PIPE_CAP_MAX_VERTEX_STREAMS:
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_MAX_WINDOW_RECTANGLES:
+ return 4;
+
+ case PIPE_CAP_GLSL_FEATURE_LEVEL:
+ case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+ if (sscreen->info.has_indirect_compute_dispatch)
+ return 450;
+ return 420;
+
+ case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+ /* Optimal number for good TexSubImage performance on Polaris10. */
+ return 64 * 1024 * 1024;
+
+ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+ case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+ return MIN2(sscreen->info.max_alloc_size, INT_MAX);
+
+ case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+ case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+ case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+ return !sscreen->info.has_unaligned_shader_loads;
+
+ case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+ return sscreen->info.has_sparse_vm_mappings ?
+ RADEON_SPARSE_PAGE_SIZE : 0;
+
+ case PIPE_CAP_PACKED_UNIFORMS:
+ if (sscreen->debug_flags & DBG(NIR))
+ return 1;
+ return 0;
+
+ /* Unsupported features. */
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+ case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+ case PIPE_CAP_USER_VERTEX_BUFFERS:
+ case PIPE_CAP_FAKE_SW_MSAA:
+ case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
+ case PIPE_CAP_VERTEXID_NOBASE:
+ case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
+ case PIPE_CAP_TGSI_MUL_ZERO_WINS:
+ case PIPE_CAP_UMA:
+ case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+ case PIPE_CAP_POST_DEPTH_COVERAGE:
+ case PIPE_CAP_TILE_RASTER_ORDER:
+ case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+ case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+ case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+ case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+ case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+ case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+ case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+ case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+ case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
+ return 0;
+
+ case PIPE_CAP_FENCE_SIGNAL:
+ return sscreen->info.has_syncobj;
+
+ case PIPE_CAP_CONSTBUF0_FLAGS:
+ return SI_RESOURCE_FLAG_32BIT;
+
+ case PIPE_CAP_NATIVE_FENCE_FD:
+ return sscreen->info.has_fence_to_handle;
+
+ case PIPE_CAP_DRAW_PARAMETERS:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+ return sscreen->has_draw_indirect_multi;
+
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ return 30;
+
+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+ return sscreen->info.chip_class <= VI ?
+ PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
+
+ /* Stream output. */
+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+ return 32*4;
+
+ /* Geometry shader output. */
+ case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+ return 1024;
+ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+ return 4095;
+ case PIPE_CAP_MAX_GS_INVOCATIONS:
+ /* The closed driver exposes 127, but 125 is the greatest
+ * number that works. */
+ return 125;
+
+ case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+ return 2048;
+
+ /* Texturing. */
+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ return 15; /* 16384 */
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ /* textures support 8192, but layered rendering supports 2048 */
+ return 12;
+ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+ /* textures support 8192, but layered rendering supports 2048 */
+ return 2048;
+
+ /* Viewports and render targets. */
+ case PIPE_CAP_MAX_VIEWPORTS:
+ return SI_MAX_VIEWPORTS;
+ case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
+ case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return 8;
+ case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+ return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
+
+ case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+ case PIPE_CAP_MIN_TEXEL_OFFSET:
+ return -32;
+
+ case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+ case PIPE_CAP_MAX_TEXEL_OFFSET:
+ return 31;
+
+ case PIPE_CAP_ENDIANNESS:
+ return PIPE_ENDIAN_LITTLE;
+
+ case PIPE_CAP_VENDOR_ID:
+ return ATI_VENDOR_ID;
+ case PIPE_CAP_DEVICE_ID:
+ return sscreen->info.pci_id;
+ case PIPE_CAP_VIDEO_MEMORY:
+ return sscreen->info.vram_size >> 20;
+ case PIPE_CAP_PCI_GROUP:
+ return sscreen->info.pci_domain;
+ case PIPE_CAP_PCI_BUS:
+ return sscreen->info.pci_bus;
+ case PIPE_CAP_PCI_DEVICE:
+ return sscreen->info.pci_dev;
+ case PIPE_CAP_PCI_FUNCTION:
+ return sscreen->info.pci_func;
+
+ default:
+ return u_pipe_screen_get_param_defaults(pscreen, param);
+ }
+}
+
+static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param)
+{
+ switch (param) {
+ case PIPE_CAPF_MAX_LINE_WIDTH:
+ case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+ /* This depends on the quant mode, though the precise interactions
+ * are unknown. */
+ return 2048;
+ case PIPE_CAPF_MAX_POINT_WIDTH:
+ case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+ return SI_MAX_POINT_SIZE;
+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+ return 16.0f;
+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+ return 16.0f;
+ case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+ return 0.0f;
+ }
+ return 0.0f;
+}
+
+static int si_get_shader_param(struct pipe_screen* pscreen,
+ enum pipe_shader_type shader,
+ enum pipe_shader_cap param)
+{
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+ switch(shader)
+ {
+ case PIPE_SHADER_FRAGMENT:
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_GEOMETRY:
+ case PIPE_SHADER_TESS_CTRL:
+ case PIPE_SHADER_TESS_EVAL:
+ break;
+ case PIPE_SHADER_COMPUTE:
+ switch (param) {
+ case PIPE_SHADER_CAP_SUPPORTED_IRS: {
+ int ir = 1 << PIPE_SHADER_IR_NATIVE;
+
+ if (sscreen->info.has_indirect_compute_dispatch)
+ ir |= 1 << PIPE_SHADER_IR_TGSI;
+
+ return ir;
+ }
+
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
+ uint64_t max_const_buffer_size;
+ pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_TGSI,
+ PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+ &max_const_buffer_size);
+ return MIN2(max_const_buffer_size, INT_MAX);
+ }
+ default:
+ /* If compute shaders don't require a special value
+ * for this cap, we can return the same value we
+ * do for other shader types. */
+ break;
+ }
+ break;
+ default:
+ return 0;
+ }
+
+ switch (param) {
+ /* Shader limits. */
+ case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+ case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+ return 16384;
+ case PIPE_SHADER_CAP_MAX_INPUTS:
+ return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
+ case PIPE_SHADER_CAP_MAX_OUTPUTS:
+ return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
+ case PIPE_SHADER_CAP_MAX_TEMPS:
+ return 256; /* Max native temporaries. */
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+ return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+ return SI_NUM_CONST_BUFFERS;
+ case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+ case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+ return SI_NUM_SAMPLERS;
+ case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+ return SI_NUM_SHADER_BUFFERS;
+ case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ return SI_NUM_IMAGES;
+ case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+ if (sscreen->debug_flags & DBG(NIR))
+ return 0;
+ return 32;
+ case PIPE_SHADER_CAP_PREFERRED_IR:
+ if (sscreen->debug_flags & DBG(NIR))
+ return PIPE_SHADER_IR_NIR;
+ return PIPE_SHADER_IR_TGSI;
+ case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+ return 4;
+
+ /* Supported boolean features. */
+ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+ case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+ case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+ case PIPE_SHADER_CAP_INTEGERS:
+ case PIPE_SHADER_CAP_INT64_ATOMICS:
+ case PIPE_SHADER_CAP_FP16:
+ case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+ case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+ return 1;
+
+ case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+ /* TODO: Indirect indexing of GS inputs is unimplemented. */
+ if (shader == PIPE_SHADER_GEOMETRY)
+ return 0;
+
+ if (shader == PIPE_SHADER_VERTEX &&
+ !sscreen->llvm_has_working_vgpr_indexing)
+ return 0;
+
+ /* Doing indirect indexing on GFX9 with LLVM 6.0 hangs.
+ * This means we don't support INTERP instructions with
+ * indirect indexing on inputs.
+ */
+ if (shader == PIPE_SHADER_FRAGMENT &&
+ !sscreen->llvm_has_working_vgpr_indexing &&
+ HAVE_LLVM < 0x0700)
+ return 0;
+
+ /* TCS and TES load inputs directly from LDS or offchip
+ * memory, so indirect indexing is always supported.
+ * PS has to support indirect indexing, because we can't
+ * lower that to TEMPs for INTERP instructions.
+ */
+ return 1;
+
+ case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+ return sscreen->llvm_has_working_vgpr_indexing ||
+ /* TCS stores outputs directly to memory. */
+ shader == PIPE_SHADER_TESS_CTRL;
+
+ /* Unsupported boolean features. */
+ case PIPE_SHADER_CAP_SUBROUTINES:
+ case PIPE_SHADER_CAP_SUPPORTED_IRS:
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+ return 0;
+ case PIPE_SHADER_CAP_SCALAR_ISA:
+ return 1;
+ }
+ return 0;
+}
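
A minimal caller sketch of the pipe_screen cap interface implemented above; the function name is hypothetical and the includes assume a Gallium build environment.

#include <stdio.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

static void example_query_fs_caps(struct pipe_screen *screen)
{
	int max_inputs = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
	                                          PIPE_SHADER_CAP_MAX_INPUTS);
	int preferred_ir = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
	                                            PIPE_SHADER_CAP_PREFERRED_IR);

	/* For radeonsi this reports 32 inputs and TGSI (or NIR when the "nir"
	 * debug flag is set), per si_get_shader_param() above. */
	printf("FS: max inputs %d, preferred IR %d\n", max_inputs, preferred_ir);
}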
+
+static const struct nir_shader_compiler_options nir_options = {
+ .lower_scmp = true,
+ .lower_flrp32 = true,
+ .lower_flrp64 = true,
+ .lower_fpow = true,
+ .lower_fsat = true,
+ .lower_fdiv = true,
+ .lower_sub = true,
+ .lower_ffma = true,
+ .lower_pack_snorm_2x16 = true,
+ .lower_pack_snorm_4x8 = true,
+ .lower_pack_unorm_2x16 = true,
+ .lower_pack_unorm_4x8 = true,
+ .lower_unpack_snorm_2x16 = true,
+ .lower_unpack_snorm_4x8 = true,
+ .lower_unpack_unorm_2x16 = true,
+ .lower_unpack_unorm_4x8 = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .max_unroll_iterations = 32,
+ .native_integers = true,
+};
+
+static const void *
+si_get_compiler_options(struct pipe_screen *screen,
+ enum pipe_shader_ir ir,
+ enum pipe_shader_type shader)
+{
+ assert(ir == PIPE_SHADER_IR_NIR);
+ return &nir_options;
+}
+
+static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
+{
+ ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
+}
+
+static void si_get_device_uuid(struct pipe_screen *pscreen, char *uuid)
+{
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+ ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE);
+}
+
+static const char* si_get_name(struct pipe_screen *pscreen)
+{
+ struct si_screen *sscreen = (struct si_screen*)pscreen;
+
+ return sscreen->renderer_string;
+}
+
+static int si_get_video_param_no_decode(struct pipe_screen *screen,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint,
+ enum pipe_video_cap param)
+{
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ return vl_profile_supported(screen, profile, entrypoint);
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ return vl_video_buffer_max_size(screen);
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ return PIPE_FORMAT_NV12;
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_MAX_LEVEL:
+ return vl_level_supported(screen, profile);
+ default:
+ return 0;
+ }
+}
+
+static int si_get_video_param(struct pipe_screen *screen,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint,
+ enum pipe_video_cap param)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ enum pipe_video_format codec = u_reduce_video_profile(profile);
+
+ if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ return (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+ (si_vce_is_fw_version_supported(sscreen) ||
+ sscreen->info.family == CHIP_RAVEN ||
+ sscreen->info.family == CHIP_RAVEN2)) ||
+ (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
+ (sscreen->info.family == CHIP_RAVEN ||
+ sscreen->info.family == CHIP_RAVEN2 ||
+ si_radeon_uvd_enc_supported(sscreen)));
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ return PIPE_FORMAT_NV12;
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_STACKED_FRAMES:
+ return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
+ default:
+ return 0;
+ }
+ }
+
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ switch (codec) {
+ case PIPE_VIDEO_FORMAT_MPEG12:
+ return profile != PIPE_VIDEO_PROFILE_MPEG1;
+ case PIPE_VIDEO_FORMAT_MPEG4:
+ return 1;
+ case PIPE_VIDEO_FORMAT_MPEG4_AVC:
+ if ((sscreen->info.family == CHIP_POLARIS10 ||
+ sscreen->info.family == CHIP_POLARIS11) &&
+ sscreen->info.uvd_fw_version < UVD_FW_1_66_16) {
+ RVID_ERR("POLARIS10/11 firmware version needs to be updated.\n");
+ return false;
+ }
+ return true;
+ case PIPE_VIDEO_FORMAT_VC1:
+ return true;
+ case PIPE_VIDEO_FORMAT_HEVC:
+ /* Carrizo only supports HEVC Main */
+ if (sscreen->info.family >= CHIP_STONEY)
+ return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
+ profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
+ else if (sscreen->info.family >= CHIP_CARRIZO)
+ return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
+ return false;
+ case PIPE_VIDEO_FORMAT_JPEG:
+ if (sscreen->info.family == CHIP_RAVEN ||
+ sscreen->info.family == CHIP_RAVEN2)
+ return true;
+ if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
+ return false;
+ if (!(sscreen->info.drm_major == 3 && sscreen->info.drm_minor >= 19)) {
+ RVID_ERR("No MJPEG support for the kernel version\n");
+ return false;
+ }
+ return true;
+ case PIPE_VIDEO_FORMAT_VP9:
+ if (sscreen->info.family < CHIP_RAVEN)
+ return false;
+ return true;
+ default:
+ return false;
+ }
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 ||
+ profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+ return PIPE_FORMAT_P016;
+ else
+ return PIPE_FORMAT_NV12;
+
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: {
+ enum pipe_video_format format = u_reduce_video_profile(profile);
+
+ if (format == PIPE_VIDEO_FORMAT_HEVC)
+ return false; //The firmware doesn't support interlaced HEVC.
+ else if (format == PIPE_VIDEO_FORMAT_JPEG)
+ return false;
+ else if (format == PIPE_VIDEO_FORMAT_VP9)
+ return false;
+ return true;
+ }
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_MAX_LEVEL:
+ switch (profile) {
+ case PIPE_VIDEO_PROFILE_MPEG1:
+ return 0;
+ case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+ case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+ return 3;
+ case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+ return 3;
+ case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+ return 5;
+ case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+ return 1;
+ case PIPE_VIDEO_PROFILE_VC1_MAIN:
+ return 2;
+ case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+ return 4;
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+ return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
+ case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+ case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+ return 186;
+ default:
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+static boolean si_vid_is_format_supported(struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint)
+{
+ /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ return (format == PIPE_FORMAT_NV12) ||
+ (format == PIPE_FORMAT_P016);
+
+ /* we can only handle this one with UVD */
+ if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
+ return format == PIPE_FORMAT_NV12;
+
+ return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
+}
+
+static unsigned get_max_threads_per_block(struct si_screen *screen,
+ enum pipe_shader_ir ir_type)
+{
+ if (ir_type == PIPE_SHADER_IR_NATIVE)
+ return 256;
+
+ /* Only 16 waves per thread-group on gfx9. */
+ if (screen->info.chip_class >= GFX9)
+ return 1024;
+
+ /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
+ * round number.
+ */
+ return 2048;
+}
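
For reference, the arithmetic behind the limits above, assuming the 64-lane GCN wavefront size (an assumption, not spelled out in this file):

/*   GFX9:      16 waves/group * 64 lanes = 1024 threads
 *   pre-GFX9:  40 waves/group * 64 lanes = 2560 threads, exposed as the
 *              round number 2048
 *   NATIVE IR: fixed at 256 threads per block
 */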
+
+static int si_get_compute_param(struct pipe_screen *screen,
+ enum pipe_shader_ir ir_type,
+ enum pipe_compute_cap param,
+ void *ret)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ // TODO: select these params by ASIC
+ switch (param) {
+ case PIPE_COMPUTE_CAP_IR_TARGET: {
+ const char *gpu, *triple;
+
+ triple = "amdgcn-mesa-mesa3d";
+ gpu = ac_get_llvm_processor_name(sscreen->info.family);
+ if (ret) {
+ sprintf(ret, "%s-%s", gpu, triple);
+ }
+ /* +2 for the dash and the terminating NUL byte */
+ return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
+ }
+ case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+ if (ret) {
+ uint64_t *grid_dimension = ret;
+ grid_dimension[0] = 3;
+ }
+ return 1 * sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+ if (ret) {
+ uint64_t *grid_size = ret;
+ grid_size[0] = 65535;
+ grid_size[1] = 65535;
+ grid_size[2] = 65535;
+ }
+ return 3 * sizeof(uint64_t) ;
+
+ case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+ if (ret) {
+ uint64_t *block_size = ret;
+ unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+ block_size[0] = threads_per_block;
+ block_size[1] = threads_per_block;
+ block_size[2] = threads_per_block;
+ }
+ return 3 * sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+ if (ret) {
+ uint64_t *max_threads_per_block = ret;
+ *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+ }
+ return sizeof(uint64_t);
+ case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+ if (ret) {
+ uint32_t *address_bits = ret;
+ address_bits[0] = 64;
+ }
+ return 1 * sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+ if (ret) {
+ uint64_t *max_global_size = ret;
+ uint64_t max_mem_alloc_size;
+
+ si_get_compute_param(screen, ir_type,
+ PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+ &max_mem_alloc_size);
+
+ /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
+ * 1/4 of the MAX_GLOBAL_SIZE. Since the
+ * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
+ * make sure we never report more than
+ * 4 * MAX_MEM_ALLOC_SIZE.
+ */
+ *max_global_size = MIN2(4 * max_mem_alloc_size,
+ MAX2(sscreen->info.gart_size,
+ sscreen->info.vram_size));
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+ if (ret) {
+ uint64_t *max_local_size = ret;
+ /* Value reported by the closed source driver. */
+ *max_local_size = 32768;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+ if (ret) {
+ uint64_t *max_input_size = ret;
+ /* Value reported by the closed source driver. */
+ *max_input_size = 1024;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+ if (ret) {
+ uint64_t *max_mem_alloc_size = ret;
+
+ *max_mem_alloc_size = sscreen->info.max_alloc_size;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+ if (ret) {
+ uint32_t *max_clock_frequency = ret;
+ *max_clock_frequency = sscreen->info.max_shader_clock;
+ }
+ return sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+ if (ret) {
+ uint32_t *max_compute_units = ret;
+ *max_compute_units = sscreen->info.num_good_compute_units;
+ }
+ return sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+ if (ret) {
+ uint32_t *images_supported = ret;
+ *images_supported = 0;
+ }
+ return sizeof(uint32_t);
+ case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+ break; /* unused */
+ case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+ if (ret) {
+ uint32_t *subgroup_size = ret;
+ *subgroup_size = 64;
+ }
+ return sizeof(uint32_t);
+ case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+ if (ret) {
+ uint64_t *max_variable_threads_per_block = ret;
+ if (ir_type == PIPE_SHADER_IR_NATIVE)
+ *max_variable_threads_per_block = 0;
+ else
+ *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+ }
+ return sizeof(uint64_t);
+ }
+
+ fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+ return 0;
+}
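
The function above follows the Gallium convention of returning the result size in bytes and writing the value only when ret is non-NULL. A minimal caller sketch, with a hypothetical function name:

#include <assert.h>
#include <stdint.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

static uint64_t example_query_max_mem_alloc(struct pipe_screen *screen)
{
	uint64_t value = 0;
	int size = screen->get_compute_param(screen, PIPE_SHADER_IR_NATIVE,
	                                     PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
	                                     NULL);

	/* The first call with ret == NULL only reports the size of the result. */
	assert(size == sizeof(uint64_t));
	screen->get_compute_param(screen, PIPE_SHADER_IR_NATIVE,
	                          PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &value);
	return value;
}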
+
+static uint64_t si_get_timestamp(struct pipe_screen *screen)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+
+ return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
+ sscreen->info.clock_crystal_freq;
+}
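
A unit check on the conversion above, assuming clock_crystal_freq is expressed in kHz (consistent with the *1000 factor used for PIPE_QUERY_TIMESTAMP_DISJOINT in si_query.c below); the numbers are only an example:

/*   ns = ticks * 1000000 / freq_kHz
 *   e.g. freq = 25000 kHz, ticks = 50000:
 *        50000 * 1000000 / 25000 = 2000000 ns = 2 ms
 */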
+
+static void si_query_memory_info(struct pipe_screen *screen,
+ struct pipe_memory_info *info)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ unsigned vram_usage, gtt_usage;
+
+ info->total_device_memory = sscreen->info.vram_size / 1024;
+ info->total_staging_memory = sscreen->info.gart_size / 1024;
+
+ /* The real TTM memory usage is somewhat random, because:
+ *
+ * 1) TTM delays freeing memory, because it can only free it after
+ * fences expire.
+ *
+ * 2) The memory usage can be really low if big VRAM evictions are
+ * taking place, but the real usage is well above the size of VRAM.
+ *
+ * Instead, return statistics of this process.
+ */
+ vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
+ gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
+
+ info->avail_device_memory =
+ vram_usage <= info->total_device_memory ?
+ info->total_device_memory - vram_usage : 0;
+ info->avail_staging_memory =
+ gtt_usage <= info->total_staging_memory ?
+ info->total_staging_memory - gtt_usage : 0;
+
+ info->device_memory_evicted =
+ ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+
+ if (sscreen->info.drm_major == 3 && sscreen->info.drm_minor >= 4)
+ info->nr_device_memory_evictions =
+ ws->query_value(ws, RADEON_NUM_EVICTIONS);
+ else
+ /* Just return the number of evicted 64KB pages. */
+ info->nr_device_memory_evictions = info->device_memory_evicted / 64;
+}
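
A minimal caller sketch of the entry point above; all pipe_memory_info fields are in kilobytes, and the function name is hypothetical.

#include <stdio.h>
#include "pipe/p_screen.h"
#include "pipe/p_state.h"	/* struct pipe_memory_info */

static void example_print_memory_info(struct pipe_screen *screen)
{
	struct pipe_memory_info info;

	screen->query_memory_info(screen, &info);
	printf("VRAM: %u MB total, %u MB available\n",
	       info.total_device_memory / 1024,
	       info.avail_device_memory / 1024);
}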
+
+static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+ struct si_screen *sscreen = (struct si_screen*)pscreen;
+
+ return sscreen->disk_shader_cache;
+}
+
+static void si_init_renderer_string(struct si_screen *sscreen)
+{
+ struct radeon_winsys *ws = sscreen->ws;
+ char first_name[256], second_name[32] = {}, kernel_version[128] = {};
+ struct utsname uname_data;
+
+ const char *marketing_name = si_get_marketing_name(ws);
+
+ if (marketing_name) {
+ snprintf(first_name, sizeof(first_name), "%s", marketing_name);
+ snprintf(second_name, sizeof(second_name), "%s, ",
+ sscreen->info.name);
+ } else {
+ snprintf(first_name, sizeof(first_name), "AMD %s",
+ sscreen->info.name);
+ }
+
+ if (uname(&uname_data) == 0)
+ snprintf(kernel_version, sizeof(kernel_version),
+ ", %s", uname_data.release);
+
+ snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
+ "%s (%sDRM %i.%i.%i%s, LLVM %i.%i.%i)",
+ first_name, second_name, sscreen->info.drm_major,
+ sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
+ kernel_version,
+ (HAVE_LLVM >> 8) & 0xff,
+ HAVE_LLVM & 0xff,
+ MESA_LLVM_VERSION_PATCH);
+}
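
Illustrative results of the format built above (the device, kernel, and LLVM versions here are hypothetical):

/*   with a marketing name:
 *     "Radeon RX 580 Series (POLARIS10, DRM 3.27.0, 4.19.0, LLVM 7.0.0)"
 *   without one:
 *     "AMD POLARIS10 (DRM 3.27.0, 4.19.0, LLVM 7.0.0)"
 */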
+
+void si_init_screen_get_functions(struct si_screen *sscreen)
+{
+ sscreen->b.get_name = si_get_name;
+ sscreen->b.get_vendor = si_get_vendor;
+ sscreen->b.get_device_vendor = si_get_device_vendor;
+ sscreen->b.get_param = si_get_param;
+ sscreen->b.get_paramf = si_get_paramf;
+ sscreen->b.get_compute_param = si_get_compute_param;
+ sscreen->b.get_timestamp = si_get_timestamp;
+ sscreen->b.get_shader_param = si_get_shader_param;
+ sscreen->b.get_compiler_options = si_get_compiler_options;
+ sscreen->b.get_device_uuid = si_get_device_uuid;
+ sscreen->b.get_driver_uuid = si_get_driver_uuid;
+ sscreen->b.query_memory_info = si_query_memory_info;
+ sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
+
+ if (sscreen->info.has_hw_decode) {
+ sscreen->b.get_video_param = si_get_video_param;
+ sscreen->b.is_video_format_supported = si_vid_is_format_supported;
+ } else {
+ sscreen->b.get_video_param = si_get_video_param_no_decode;
+ sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
+ }
+
+ si_init_renderer_string(sscreen);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c b/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c
new file mode 100644
index 000000000..f178d0445
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+
+#include "util/os_time.h"
+
+/* Flush the GFX CS if the memory usage or command-stream space limits would be exceeded. */
+void si_need_gfx_cs_space(struct si_context *ctx)
+{
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+ /* There is no need to flush the DMA IB here, because
+ * r600_need_dma_space always flushes the GFX IB if there is
+ * a conflict, which means any unflushed DMA commands automatically
+ * precede the GFX IB (= they had no dependency on the GFX IB when
+ * they were submitted).
+ */
+
+ /* There are two memory usage counters in the winsys for all buffers
+ * that have been added (cs_add_buffer) and two counters in the pipe
+ * driver for those that haven't been added yet.
+ */
+ if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
+ ctx->vram, ctx->gtt))) {
+ ctx->gtt = 0;
+ ctx->vram = 0;
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ return;
+ }
+ ctx->gtt = 0;
+ ctx->vram = 0;
+
+ /* If the IB is sufficiently large, don't count the space needed
+ * and just flush if there is not enough space left.
+ *
+ * Also reserve space for stopping queries at the end of IB, because
+ * the number of active queries is mostly unlimited.
+ */
+ unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
+ if (!ctx->ws->cs_check_space(cs, need_dwords))
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+}
+
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
+ struct pipe_fence_handle **fence)
+{
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ struct radeon_winsys *ws = ctx->ws;
+ unsigned wait_flags = 0;
+
+ if (ctx->gfx_flush_in_progress)
+ return;
+
+ if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH |
+ SI_CONTEXT_INV_GLOBAL_L2;
+ } else if (ctx->chip_class == SI) {
+ /* The kernel flushes L2 before shaders are finished. */
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* Drop this flush if it's a no-op. */
+ if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
+ (!wait_flags || !ctx->gfx_last_ib_is_busy))
+ return;
+
+ if (si_check_device_reset(ctx))
+ return;
+
+ if (ctx->screen->debug_flags & DBG(CHECK_VM))
+ flags &= ~PIPE_FLUSH_ASYNC;
+
+ /* If the state tracker is flushing the GFX IB, si_flush_from_st is
+ * responsible for flushing the DMA IB and merging the fences from both.
+ * This code is only needed when the driver flushes the GFX IB
+ * internally, and it never asks for a fence handle.
+ */
+ if (radeon_emitted(ctx->dma_cs, 0)) {
+ assert(fence == NULL); /* internal flushes only */
+ si_flush_dma_cs(ctx, flags, NULL);
+ }
+
+ ctx->gfx_flush_in_progress = true;
+
+ if (!LIST_IS_EMPTY(&ctx->active_queries))
+ si_suspend_queries(ctx);
+
+ ctx->streamout.suspended = false;
+ if (ctx->streamout.begin_emitted) {
+ si_emit_streamout_end(ctx);
+ ctx->streamout.suspended = true;
+ }
+
+ /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
+ * because the kernel doesn't wait for it. */
+ if (ctx->chip_class >= CIK)
+ si_cp_dma_wait_for_idle(ctx);
+
+ /* Wait for draw calls to finish if needed. */
+ if (wait_flags) {
+ ctx->flags |= wait_flags;
+ si_emit_cache_flush(ctx);
+ }
+ ctx->gfx_last_ib_is_busy = wait_flags == 0;
+
+ if (ctx->current_saved_cs) {
+ si_trace_emit(ctx);
+
+ /* Save the IB for debug contexts. */
+ si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
+ ctx->current_saved_cs->flushed = true;
+ ctx->current_saved_cs->time_flush = os_time_get_nano();
+
+ si_log_hw_flush(ctx);
+ }
+
+ /* Flush the CS. */
+ ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
+ if (fence)
+ ws->fence_reference(fence, ctx->last_gfx_fence);
+
+ ctx->num_gfx_cs_flushes++;
+
+ /* Check VM faults if needed. */
+ if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
+ /* Use conservative timeout 800ms, after which we won't wait any
+ * longer and assume the GPU is hung.
+ */
+ ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000);
+
+ si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
+ }
+
+ if (ctx->current_saved_cs)
+ si_saved_cs_reference(&ctx->current_saved_cs, NULL);
+
+ si_begin_new_gfx_cs(ctx);
+ ctx->gfx_flush_in_progress = false;
+}
+
+static void si_begin_gfx_cs_debug(struct si_context *ctx)
+{
+ static const uint32_t zeros[1];
+ assert(!ctx->current_saved_cs);
+
+ ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
+ if (!ctx->current_saved_cs)
+ return;
+
+ pipe_reference_init(&ctx->current_saved_cs->reference, 1);
+
+ ctx->current_saved_cs->trace_buf = r600_resource(
+ pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
+ if (!ctx->current_saved_cs->trace_buf) {
+ free(ctx->current_saved_cs);
+ ctx->current_saved_cs = NULL;
+ return;
+ }
+
+ pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b,
+ 0, sizeof(zeros), zeros);
+ ctx->current_saved_cs->trace_id = 0;
+
+ si_trace_emit(ctx);
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
+ RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+}
+
+void si_begin_new_gfx_cs(struct si_context *ctx)
+{
+ if (ctx->is_debug)
+ si_begin_gfx_cs_debug(ctx);
+
+ /* Always invalidate caches at the beginning of IBs, because external
+ * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+ * buffers.
+ *
+ * Note that the cache flush done by the kernel at the end of GFX IBs
+ * isn't useful here, because that flush can finish after the following
+ * IB starts drawing.
+ *
+ * TODO: Do we also need to invalidate CB & DB caches?
+ */
+ ctx->flags |= SI_CONTEXT_INV_ICACHE |
+ SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
+ SI_CONTEXT_START_PIPELINE_STATS;
+
+ /* Set all valid groups as dirty so they get re-emitted on the
+ * next draw command.
+ */
+ si_pm4_reset_emitted(ctx);
+
+ /* The CS initialization should be emitted before everything else. */
+ si_pm4_emit(ctx, ctx->init_config);
+ if (ctx->init_config_gs_rings)
+ si_pm4_emit(ctx, ctx->init_config_gs_rings);
+
+ if (ctx->queued.named.ls)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+ if (ctx->queued.named.hs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+ if (ctx->queued.named.es)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+ if (ctx->queued.named.gs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+ if (ctx->queued.named.vs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+ if (ctx->queued.named.ps)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+ if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+
+ /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
+ bool has_clear_state = ctx->screen->has_clear_state;
+ if (has_clear_state) {
+ ctx->framebuffer.dirty_cbufs =
+ u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
+ /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
+ ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
+ } else {
+ ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
+ ctx->framebuffer.dirty_zsbuf = true;
+ }
+ /* This should always be marked as dirty to set the framebuffer scissor
+ * at least. */
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
+ /* CLEAR_STATE sets zeros. */
+ if (!has_clear_state || ctx->clip_state.any_nonzeros)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
+ ctx->sample_locs_num_samples = 0;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
+ /* CLEAR_STATE sets 0xffff. */
+ if (!has_clear_state || ctx->sample_mask != 0xffff)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
+ /* CLEAR_STATE sets zeros. */
+ if (!has_clear_state || ctx->blend_color.any_nonzeros)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
+ if (ctx->chip_class >= GFX9)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
+ /* CLEAR_STATE disables all window rectangles. */
+ if (!has_clear_state || ctx->num_window_rectangles > 0)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
+ si_all_descriptors_begin_new_cs(ctx);
+ si_all_resident_buffers_begin_new_cs(ctx);
+
+ ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ ctx->viewports.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
+ if (ctx->scratch_buffer) {
+ si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
+ }
+
+ if (ctx->streamout.suspended) {
+ ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
+ si_streamout_buffers_dirty(ctx);
+ }
+
+ if (!LIST_IS_EMPTY(&ctx->active_queries))
+ si_resume_queries(ctx);
+
+ assert(!ctx->gfx_cs->prev_dw);
+ ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+
+ /* Invalidate various draw states so that they are emitted before
+ * the first draw call. */
+ si_invalidate_draw_sh_constants(ctx);
+ ctx->last_index_size = -1;
+ ctx->last_primitive_restart_en = -1;
+ ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
+ ctx->last_prim = -1;
+ ctx->last_multi_vgt_param = -1;
+ ctx->last_rast_prim = -1;
+ ctx->last_sc_line_stipple = ~0;
+ ctx->last_vs_state = ~0;
+ ctx->last_ls = NULL;
+ ctx->last_tcs = NULL;
+ ctx->last_tes_sh_base = -1;
+ ctx->last_num_tcs_input_cp = -1;
+ ctx->last_ls_hs_config = -1; /* impossible value */
+
+ ctx->cs_shader_state.initialized = false;
+
+ if (has_clear_state) {
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From VI */
+
+ /* Set all saved registers state to saved. */
+ ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
+ } else {
+ /* Set all saved registers state to unknown. */
+ ctx->tracked_regs.reg_saved = 0;
+ }
+
+ /* 0xffffffff is an impossible value for the SPI_PS_INPUT_CNTL_n registers. */
+ memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c b/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c
new file mode 100644
index 000000000..8c457b30e
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_gpu_load.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* The GPU load is measured as follows.
+ *
+ * There is a thread which samples the GRBM_STATUS register at a certain
+ * frequency and the "busy" or "idle" counter is incremented based on
+ * whether the GUI_ACTIVE bit is set or not.
+ *
+ * Then, the user can sample the counters twice and calculate the average
+ * GPU load between the two samples.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "radeonsi/si_query.h"
+#include "util/os_time.h"
+
+/* For good accuracy at 1000 fps or lower. This will be inaccurate for higher
+ * fps (there are too few samples per frame). */
+#define SAMPLES_PER_SEC 10000
+
+#define GRBM_STATUS 0x8010
+#define TA_BUSY(x) (((x) >> 14) & 0x1)
+#define GDS_BUSY(x) (((x) >> 15) & 0x1)
+#define VGT_BUSY(x) (((x) >> 17) & 0x1)
+#define IA_BUSY(x) (((x) >> 19) & 0x1)
+#define SX_BUSY(x) (((x) >> 20) & 0x1)
+#define WD_BUSY(x) (((x) >> 21) & 0x1)
+#define SPI_BUSY(x) (((x) >> 22) & 0x1)
+#define BCI_BUSY(x) (((x) >> 23) & 0x1)
+#define SC_BUSY(x) (((x) >> 24) & 0x1)
+#define PA_BUSY(x) (((x) >> 25) & 0x1)
+#define DB_BUSY(x) (((x) >> 26) & 0x1)
+#define CP_BUSY(x) (((x) >> 29) & 0x1)
+#define CB_BUSY(x) (((x) >> 30) & 0x1)
+#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)
+
+#define SRBM_STATUS2 0x0e4c
+#define SDMA_BUSY(x) (((x) >> 5) & 0x1)
+
+#define CP_STAT 0x8680
+#define PFP_BUSY(x) (((x) >> 15) & 0x1)
+#define MEQ_BUSY(x) (((x) >> 16) & 0x1)
+#define ME_BUSY(x) (((x) >> 17) & 0x1)
+#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
+#define DMA_BUSY(x) (((x) >> 22) & 0x1)
+#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1)
+
+#define IDENTITY(x) x
+
+#define UPDATE_COUNTER(field, mask) \
+ do { \
+ if (mask(value)) \
+ p_atomic_inc(&counters->named.field.busy); \
+ else \
+ p_atomic_inc(&counters->named.field.idle); \
+ } while (0)
+
+static void si_update_mmio_counters(struct si_screen *sscreen,
+ union si_mmio_counters *counters)
+{
+ uint32_t value = 0;
+ bool gui_busy, sdma_busy = false;
+
+ /* GRBM_STATUS */
+ sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
+
+ UPDATE_COUNTER(ta, TA_BUSY);
+ UPDATE_COUNTER(gds, GDS_BUSY);
+ UPDATE_COUNTER(vgt, VGT_BUSY);
+ UPDATE_COUNTER(ia, IA_BUSY);
+ UPDATE_COUNTER(sx, SX_BUSY);
+ UPDATE_COUNTER(wd, WD_BUSY);
+ UPDATE_COUNTER(spi, SPI_BUSY);
+ UPDATE_COUNTER(bci, BCI_BUSY);
+ UPDATE_COUNTER(sc, SC_BUSY);
+ UPDATE_COUNTER(pa, PA_BUSY);
+ UPDATE_COUNTER(db, DB_BUSY);
+ UPDATE_COUNTER(cp, CP_BUSY);
+ UPDATE_COUNTER(cb, CB_BUSY);
+ UPDATE_COUNTER(gui, GUI_ACTIVE);
+ gui_busy = GUI_ACTIVE(value);
+
+ if (sscreen->info.chip_class == CIK || sscreen->info.chip_class == VI) {
+ /* SRBM_STATUS2 */
+ sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
+
+ UPDATE_COUNTER(sdma, SDMA_BUSY);
+ sdma_busy = SDMA_BUSY(value);
+ }
+
+ if (sscreen->info.chip_class >= VI) {
+ /* CP_STAT */
+ sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
+
+ UPDATE_COUNTER(pfp, PFP_BUSY);
+ UPDATE_COUNTER(meq, MEQ_BUSY);
+ UPDATE_COUNTER(me, ME_BUSY);
+ UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
+ UPDATE_COUNTER(cp_dma, DMA_BUSY);
+ UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
+ }
+
+ value = gui_busy || sdma_busy;
+ UPDATE_COUNTER(gpu, IDENTITY);
+}
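
A minimal sketch of decoding a single raw GRBM_STATUS sample with the bit-test macros above; the value passed in is hypothetical.

#include <stdint.h>
#include <stdio.h>

static void example_decode_grbm_status(uint32_t value)
{
	printf("GUI %u, CP %u, DB %u, CB %u\n",
	       GUI_ACTIVE(value), CP_BUSY(value), DB_BUSY(value), CB_BUSY(value));
}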
+
+#undef UPDATE_COUNTER
+
+static int
+si_gpu_load_thread(void *param)
+{
+ struct si_screen *sscreen = (struct si_screen*)param;
+ const int period_us = 1000000 / SAMPLES_PER_SEC;
+ int sleep_us = period_us;
+ int64_t cur_time, last_time = os_time_get();
+
+ while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
+ if (sleep_us)
+ os_time_sleep(sleep_us);
+
+ /* Make sure we sleep the ideal amount of time to match
+ * the expected frequency. */
+ cur_time = os_time_get();
+
+ if (os_time_timeout(last_time, last_time + period_us,
+ cur_time))
+ sleep_us = MAX2(sleep_us - 1, 1);
+ else
+ sleep_us += 1;
+
+ /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
+ last_time = cur_time;
+
+ /* Update the counters. */
+ si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
+ }
+ p_atomic_dec(&sscreen->gpu_load_stop_thread);
+ return 0;
+}
+
+void si_gpu_load_kill_thread(struct si_screen *sscreen)
+{
+ if (!sscreen->gpu_load_thread)
+ return;
+
+ p_atomic_inc(&sscreen->gpu_load_stop_thread);
+ thrd_join(sscreen->gpu_load_thread, NULL);
+ sscreen->gpu_load_thread = 0;
+}
+
+static uint64_t si_read_mmio_counter(struct si_screen *sscreen,
+ unsigned busy_index)
+{
+ /* Start the thread if needed. */
+ if (!sscreen->gpu_load_thread) {
+ mtx_lock(&sscreen->gpu_load_mutex);
+ /* Check again inside the mutex. */
+ if (!sscreen->gpu_load_thread)
+ sscreen->gpu_load_thread =
+ u_thread_create(si_gpu_load_thread, sscreen);
+ mtx_unlock(&sscreen->gpu_load_mutex);
+ }
+
+ unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
+ unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
+
+ return busy | ((uint64_t)idle << 32);
+}
+
+static unsigned si_end_mmio_counter(struct si_screen *sscreen,
+ uint64_t begin, unsigned busy_index)
+{
+ uint64_t end = si_read_mmio_counter(sscreen, busy_index);
+ unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
+ unsigned idle = (end >> 32) - (begin >> 32);
+
+ /* Calculate the % of time the busy counter was being incremented.
+ *
+ * If no counters were incremented, return the current counter status.
+ * It's for the case when the load is queried faster than
+ * the counters are updated.
+ */
+ if (idle || busy) {
+ return busy*100 / (busy + idle);
+ } else {
+ union si_mmio_counters counters;
+
+ memset(&counters, 0, sizeof(counters));
+ si_update_mmio_counters(sscreen, &counters);
+ return counters.array[busy_index] ? 100 : 0;
+ }
+}
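
A worked example of the percentage computed above:

/*   begin: busy = 10, idle = 40      end: busy = 70, idle = 130
 *   deltas: busy = 60, idle = 90  ->  load = 60 * 100 / (60 + 90) = 40 %
 */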
+
+#define BUSY_INDEX(rscreen, field) (&rscreen->mmio_counters.named.field.busy - \
+ rscreen->mmio_counters.array)
+
+static unsigned busy_index_from_type(struct si_screen *sscreen,
+ unsigned type)
+{
+ switch (type) {
+ case SI_QUERY_GPU_LOAD:
+ return BUSY_INDEX(sscreen, gpu);
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ return BUSY_INDEX(sscreen, spi);
+ case SI_QUERY_GPU_TA_BUSY:
+ return BUSY_INDEX(sscreen, ta);
+ case SI_QUERY_GPU_GDS_BUSY:
+ return BUSY_INDEX(sscreen, gds);
+ case SI_QUERY_GPU_VGT_BUSY:
+ return BUSY_INDEX(sscreen, vgt);
+ case SI_QUERY_GPU_IA_BUSY:
+ return BUSY_INDEX(sscreen, ia);
+ case SI_QUERY_GPU_SX_BUSY:
+ return BUSY_INDEX(sscreen, sx);
+ case SI_QUERY_GPU_WD_BUSY:
+ return BUSY_INDEX(sscreen, wd);
+ case SI_QUERY_GPU_BCI_BUSY:
+ return BUSY_INDEX(sscreen, bci);
+ case SI_QUERY_GPU_SC_BUSY:
+ return BUSY_INDEX(sscreen, sc);
+ case SI_QUERY_GPU_PA_BUSY:
+ return BUSY_INDEX(sscreen, pa);
+ case SI_QUERY_GPU_DB_BUSY:
+ return BUSY_INDEX(sscreen, db);
+ case SI_QUERY_GPU_CP_BUSY:
+ return BUSY_INDEX(sscreen, cp);
+ case SI_QUERY_GPU_CB_BUSY:
+ return BUSY_INDEX(sscreen, cb);
+ case SI_QUERY_GPU_SDMA_BUSY:
+ return BUSY_INDEX(sscreen, sdma);
+ case SI_QUERY_GPU_PFP_BUSY:
+ return BUSY_INDEX(sscreen, pfp);
+ case SI_QUERY_GPU_MEQ_BUSY:
+ return BUSY_INDEX(sscreen, meq);
+ case SI_QUERY_GPU_ME_BUSY:
+ return BUSY_INDEX(sscreen, me);
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ return BUSY_INDEX(sscreen, surf_sync);
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ return BUSY_INDEX(sscreen, cp_dma);
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ return BUSY_INDEX(sscreen, scratch_ram);
+ default:
+ unreachable("invalid query type");
+ }
+}
+
+uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
+{
+ unsigned busy_index = busy_index_from_type(sscreen, type);
+ return si_read_mmio_counter(sscreen, busy_index);
+}
+
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
+ uint64_t begin)
+{
+ unsigned busy_index = busy_index_from_type(sscreen, type);
+ return si_end_mmio_counter(sscreen, begin, busy_index);
+}
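
A minimal usage sketch of the begin/end pair, mirroring how the SI_QUERY_GPU_* software queries in si_query.c below use these helpers; the function name is hypothetical.

#include "radeonsi/si_pipe.h"
#include "radeonsi/si_query.h"
#include "util/os_time.h"

static unsigned example_sample_gpu_load(struct si_screen *sscreen, int64_t window_us)
{
	uint64_t begin = si_begin_counter(sscreen, SI_QUERY_GPU_LOAD);

	os_time_sleep(window_us);	/* the measurement window */
	return si_end_counter(sscreen, SI_QUERY_GPU_LOAD, begin);	/* 0..100 % */
}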
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_query.c b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
new file mode 100644
index 000000000..7a2c7afdb
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
@@ -0,0 +1,1894 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/os_time.h"
+#include "util/u_suballoc.h"
+#include "amd/common/sid.h"
+
+#define SI_MAX_STREAMS 4
+
+struct si_hw_query_params {
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned fence_offset;
+ unsigned pair_stride;
+ unsigned pair_count;
+};
+
+/* Queries without buffer handling or suspend/resume. */
+struct si_query_sw {
+ struct si_query b;
+
+ uint64_t begin_result;
+ uint64_t end_result;
+
+ uint64_t begin_time;
+ uint64_t end_time;
+
+ /* Fence for GPU_FINISHED. */
+ struct pipe_fence_handle *fence;
+};
+
+static void si_query_sw_destroy(struct si_screen *sscreen,
+ struct si_query *rquery)
+{
+ struct si_query_sw *query = (struct si_query_sw *)rquery;
+
+ sscreen->b.fence_reference(&sscreen->b, &query->fence, NULL);
+ FREE(query);
+}
+
+static enum radeon_value_id winsys_id_from_type(unsigned type)
+{
+ switch (type) {
+ case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
+ case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
+ case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
+ case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
+ case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
+ case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
+ case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
+ case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
+ case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
+ case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER;
+ case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
+ case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
+ case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+ case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
+ case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
+ case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
+ case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
+ case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+ case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
+ default: unreachable("query type does not correspond to winsys id");
+ }
+}
+
+static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
+{
+ struct pipe_fence_handle *fence = NULL;
+
+ si_flush_dma_cs(sctx, 0, &fence);
+ if (fence) {
+ sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
+ sctx->ws->fence_reference(&fence, NULL);
+ }
+
+ return os_time_get_nano();
+}
+
+static bool si_query_sw_begin(struct si_context *sctx,
+ struct si_query *rquery)
+{
+ struct si_query_sw *query = (struct si_query_sw *)rquery;
+ enum radeon_value_id ws_id;
+
+ switch(query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ case PIPE_QUERY_GPU_FINISHED:
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+ query->begin_result = si_finish_dma_get_cpu_time(sctx);
+ break;
+ case SI_QUERY_DRAW_CALLS:
+ query->begin_result = sctx->num_draw_calls;
+ break;
+ case SI_QUERY_DECOMPRESS_CALLS:
+ query->begin_result = sctx->num_decompress_calls;
+ break;
+ case SI_QUERY_MRT_DRAW_CALLS:
+ query->begin_result = sctx->num_mrt_draw_calls;
+ break;
+ case SI_QUERY_PRIM_RESTART_CALLS:
+ query->begin_result = sctx->num_prim_restart_calls;
+ break;
+ case SI_QUERY_SPILL_DRAW_CALLS:
+ query->begin_result = sctx->num_spill_draw_calls;
+ break;
+ case SI_QUERY_COMPUTE_CALLS:
+ query->begin_result = sctx->num_compute_calls;
+ break;
+ case SI_QUERY_SPILL_COMPUTE_CALLS:
+ query->begin_result = sctx->num_spill_compute_calls;
+ break;
+ case SI_QUERY_DMA_CALLS:
+ query->begin_result = sctx->num_dma_calls;
+ break;
+ case SI_QUERY_CP_DMA_CALLS:
+ query->begin_result = sctx->num_cp_dma_calls;
+ break;
+ case SI_QUERY_NUM_VS_FLUSHES:
+ query->begin_result = sctx->num_vs_flushes;
+ break;
+ case SI_QUERY_NUM_PS_FLUSHES:
+ query->begin_result = sctx->num_ps_flushes;
+ break;
+ case SI_QUERY_NUM_CS_FLUSHES:
+ query->begin_result = sctx->num_cs_flushes;
+ break;
+ case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->begin_result = sctx->num_cb_cache_flushes;
+ break;
+ case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->begin_result = sctx->num_db_cache_flushes;
+ break;
+ case SI_QUERY_NUM_L2_INVALIDATES:
+ query->begin_result = sctx->num_L2_invalidates;
+ break;
+ case SI_QUERY_NUM_L2_WRITEBACKS:
+ query->begin_result = sctx->num_L2_writebacks;
+ break;
+ case SI_QUERY_NUM_RESIDENT_HANDLES:
+ query->begin_result = sctx->num_resident_handles;
+ break;
+ case SI_QUERY_TC_OFFLOADED_SLOTS:
+ query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+ break;
+ case SI_QUERY_TC_DIRECT_SLOTS:
+ query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+ break;
+ case SI_QUERY_TC_NUM_SYNCS:
+ query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
+ break;
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_MAPPED_VRAM:
+ case SI_QUERY_MAPPED_GTT:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_VRAM_VIS_USAGE:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_GPU_TEMPERATURE:
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ case SI_QUERY_NUM_MAPPED_BUFFERS:
+ query->begin_result = 0;
+ break;
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GFX_IB_SIZE:
+ case SI_QUERY_NUM_GFX_IBS:
+ case SI_QUERY_NUM_SDMA_IBS:
+ case SI_QUERY_NUM_BYTES_MOVED:
+ case SI_QUERY_NUM_EVICTIONS:
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ break;
+ }
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->begin_time = sctx->ws->query_value(sctx->ws,
+ RADEON_NUM_GFX_IBS);
+ break;
+ case SI_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->begin_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
+ query->begin_result =
+ sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+ query->begin_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GPU_LOAD:
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ case SI_QUERY_GPU_TA_BUSY:
+ case SI_QUERY_GPU_GDS_BUSY:
+ case SI_QUERY_GPU_VGT_BUSY:
+ case SI_QUERY_GPU_IA_BUSY:
+ case SI_QUERY_GPU_SX_BUSY:
+ case SI_QUERY_GPU_WD_BUSY:
+ case SI_QUERY_GPU_BCI_BUSY:
+ case SI_QUERY_GPU_SC_BUSY:
+ case SI_QUERY_GPU_PA_BUSY:
+ case SI_QUERY_GPU_DB_BUSY:
+ case SI_QUERY_GPU_CP_BUSY:
+ case SI_QUERY_GPU_CB_BUSY:
+ case SI_QUERY_GPU_SDMA_BUSY:
+ case SI_QUERY_GPU_PFP_BUSY:
+ case SI_QUERY_GPU_MEQ_BUSY:
+ case SI_QUERY_GPU_ME_BUSY:
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ query->begin_result = si_begin_counter(sctx->screen,
+ query->b.type);
+ break;
+ case SI_QUERY_NUM_COMPILATIONS:
+ query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
+ break;
+ case SI_QUERY_NUM_SHADERS_CREATED:
+ query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
+ break;
+ case SI_QUERY_NUM_SHADER_CACHE_HITS:
+ query->begin_result =
+ p_atomic_read(&sctx->screen->num_shader_cache_hits);
+ break;
+ case SI_QUERY_GPIN_ASIC_ID:
+ case SI_QUERY_GPIN_NUM_SIMD:
+ case SI_QUERY_GPIN_NUM_RB:
+ case SI_QUERY_GPIN_NUM_SPI:
+ case SI_QUERY_GPIN_NUM_SE:
+ break;
+ default:
+ unreachable("si_query_sw_begin: bad query type");
+ }
+
+ return true;
+}
+
+static bool si_query_sw_end(struct si_context *sctx,
+ struct si_query *rquery)
+{
+ struct si_query_sw *query = (struct si_query_sw *)rquery;
+ enum radeon_value_id ws_id;
+
+ switch(query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ break;
+ case PIPE_QUERY_GPU_FINISHED:
+ sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+ query->end_result = si_finish_dma_get_cpu_time(sctx);
+ break;
+ case SI_QUERY_DRAW_CALLS:
+ query->end_result = sctx->num_draw_calls;
+ break;
+ case SI_QUERY_DECOMPRESS_CALLS:
+ query->end_result = sctx->num_decompress_calls;
+ break;
+ case SI_QUERY_MRT_DRAW_CALLS:
+ query->end_result = sctx->num_mrt_draw_calls;
+ break;
+ case SI_QUERY_PRIM_RESTART_CALLS:
+ query->end_result = sctx->num_prim_restart_calls;
+ break;
+ case SI_QUERY_SPILL_DRAW_CALLS:
+ query->end_result = sctx->num_spill_draw_calls;
+ break;
+ case SI_QUERY_COMPUTE_CALLS:
+ query->end_result = sctx->num_compute_calls;
+ break;
+ case SI_QUERY_SPILL_COMPUTE_CALLS:
+ query->end_result = sctx->num_spill_compute_calls;
+ break;
+ case SI_QUERY_DMA_CALLS:
+ query->end_result = sctx->num_dma_calls;
+ break;
+ case SI_QUERY_CP_DMA_CALLS:
+ query->end_result = sctx->num_cp_dma_calls;
+ break;
+ case SI_QUERY_NUM_VS_FLUSHES:
+ query->end_result = sctx->num_vs_flushes;
+ break;
+ case SI_QUERY_NUM_PS_FLUSHES:
+ query->end_result = sctx->num_ps_flushes;
+ break;
+ case SI_QUERY_NUM_CS_FLUSHES:
+ query->end_result = sctx->num_cs_flushes;
+ break;
+ case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->end_result = sctx->num_cb_cache_flushes;
+ break;
+ case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->end_result = sctx->num_db_cache_flushes;
+ break;
+ case SI_QUERY_NUM_L2_INVALIDATES:
+ query->end_result = sctx->num_L2_invalidates;
+ break;
+ case SI_QUERY_NUM_L2_WRITEBACKS:
+ query->end_result = sctx->num_L2_writebacks;
+ break;
+ case SI_QUERY_NUM_RESIDENT_HANDLES:
+ query->end_result = sctx->num_resident_handles;
+ break;
+ case SI_QUERY_TC_OFFLOADED_SLOTS:
+ query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+ break;
+ case SI_QUERY_TC_DIRECT_SLOTS:
+ query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+ break;
+ case SI_QUERY_TC_NUM_SYNCS:
+ query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
+ break;
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_MAPPED_VRAM:
+ case SI_QUERY_MAPPED_GTT:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_VRAM_VIS_USAGE:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_GPU_TEMPERATURE:
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GFX_IB_SIZE:
+ case SI_QUERY_NUM_MAPPED_BUFFERS:
+ case SI_QUERY_NUM_GFX_IBS:
+ case SI_QUERY_NUM_SDMA_IBS:
+ case SI_QUERY_NUM_BYTES_MOVED:
+ case SI_QUERY_NUM_EVICTIONS:
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ break;
+ }
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->end_time = sctx->ws->query_value(sctx->ws,
+ RADEON_NUM_GFX_IBS);
+ break;
+ case SI_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->end_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
+ query->end_result =
+ sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+ query->end_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GPU_LOAD:
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ case SI_QUERY_GPU_TA_BUSY:
+ case SI_QUERY_GPU_GDS_BUSY:
+ case SI_QUERY_GPU_VGT_BUSY:
+ case SI_QUERY_GPU_IA_BUSY:
+ case SI_QUERY_GPU_SX_BUSY:
+ case SI_QUERY_GPU_WD_BUSY:
+ case SI_QUERY_GPU_BCI_BUSY:
+ case SI_QUERY_GPU_SC_BUSY:
+ case SI_QUERY_GPU_PA_BUSY:
+ case SI_QUERY_GPU_DB_BUSY:
+ case SI_QUERY_GPU_CP_BUSY:
+ case SI_QUERY_GPU_CB_BUSY:
+ case SI_QUERY_GPU_SDMA_BUSY:
+ case SI_QUERY_GPU_PFP_BUSY:
+ case SI_QUERY_GPU_MEQ_BUSY:
+ case SI_QUERY_GPU_ME_BUSY:
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ query->end_result = si_end_counter(sctx->screen,
+ query->b.type,
+ query->begin_result);
+ query->begin_result = 0;
+ break;
+ case SI_QUERY_NUM_COMPILATIONS:
+ query->end_result = p_atomic_read(&sctx->screen->num_compilations);
+ break;
+ case SI_QUERY_NUM_SHADERS_CREATED:
+ query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
+ break;
+ case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ query->end_result = sctx->last_tex_ps_draw_ratio;
+ break;
+ case SI_QUERY_NUM_SHADER_CACHE_HITS:
+ query->end_result =
+ p_atomic_read(&sctx->screen->num_shader_cache_hits);
+ break;
+ case SI_QUERY_GPIN_ASIC_ID:
+ case SI_QUERY_GPIN_NUM_SIMD:
+ case SI_QUERY_GPIN_NUM_RB:
+ case SI_QUERY_GPIN_NUM_SPI:
+ case SI_QUERY_GPIN_NUM_SE:
+ break;
+ default:
+ unreachable("si_query_sw_end: bad query type");
+ }
+
+ return true;
+}
+
+static bool si_query_sw_get_result(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ union pipe_query_result *result)
+{
+ struct si_query_sw *query = (struct si_query_sw *)rquery;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* Convert from cycles per millisecond to cycles per second (Hz). */
+ result->timestamp_disjoint.frequency =
+ (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
+ result->timestamp_disjoint.disjoint = false;
+ return true;
+ case PIPE_QUERY_GPU_FINISHED: {
+ struct pipe_screen *screen = sctx->b.screen;
+ struct pipe_context *ctx = rquery->b.flushed ? NULL : &sctx->b;
+
+ result->b = screen->fence_finish(screen, ctx, query->fence,
+ wait ? PIPE_TIMEOUT_INFINITE : 0);
+ return result->b;
+ }
+
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ result->u64 = (query->end_result - query->begin_result) /
+ (query->end_time - query->begin_time);
+ return true;
+ case SI_QUERY_CS_THREAD_BUSY:
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
+ result->u64 = (query->end_result - query->begin_result) * 100 /
+ (query->end_time - query->begin_time);
+ return true;
+ case SI_QUERY_GPIN_ASIC_ID:
+ result->u32 = 0;
+ return true;
+ case SI_QUERY_GPIN_NUM_SIMD:
+ result->u32 = sctx->screen->info.num_good_compute_units;
+ return true;
+ case SI_QUERY_GPIN_NUM_RB:
+ result->u32 = sctx->screen->info.num_render_backends;
+ return true;
+ case SI_QUERY_GPIN_NUM_SPI:
+ result->u32 = 1; /* all supported chips have one SPI per SE */
+ return true;
+ case SI_QUERY_GPIN_NUM_SE:
+ result->u32 = sctx->screen->info.max_se;
+ return true;
+ }
+
+ result->u64 = query->end_result - query->begin_result;
+
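+	/* Convert to the units advertised in si_driver_query_list:
+	 * buffer-wait-time ns -> us, temperature millidegrees -> degrees,
+	 * clock queries MHz -> Hz. */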
+ switch (query->b.type) {
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GPU_TEMPERATURE:
+ result->u64 /= 1000;
+ break;
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ result->u64 *= 1000000;
+ break;
+ }
+
+ return true;
+}
+
+
+static struct si_query_ops sw_query_ops = {
+ .destroy = si_query_sw_destroy,
+ .begin = si_query_sw_begin,
+ .end = si_query_sw_end,
+ .get_result = si_query_sw_get_result,
+ .get_result_resource = NULL
+};
+
+static struct pipe_query *si_query_sw_create(unsigned query_type)
+{
+ struct si_query_sw *query;
+
+ query = CALLOC_STRUCT(si_query_sw);
+ if (!query)
+ return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &sw_query_ops;
+
+ return (struct pipe_query *)query;
+}
+
+void si_query_hw_destroy(struct si_screen *sscreen,
+ struct si_query *rquery)
+{
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+ struct si_query_buffer *prev = query->buffer.previous;
+
+ /* Release all query buffers. */
+ while (prev) {
+ struct si_query_buffer *qbuf = prev;
+ prev = prev->previous;
+ r600_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+
+ r600_resource_reference(&query->buffer.buf, NULL);
+ r600_resource_reference(&query->workaround_buf, NULL);
+ FREE(rquery);
+}
+
+static struct r600_resource *si_new_query_buffer(struct si_screen *sscreen,
+ struct si_query_hw *query)
+{
+ unsigned buf_size = MAX2(query->result_size,
+ sscreen->info.min_alloc_size);
+
+	/* Queries are normally read by the CPU after
+	 * being written by the GPU, hence staging is probably a good
+	 * usage pattern.
+	 */
+ struct r600_resource *buf = r600_resource(
+ pipe_buffer_create(&sscreen->b, 0,
+ PIPE_USAGE_STAGING, buf_size));
+ if (!buf)
+ return NULL;
+
+ if (!query->ops->prepare_buffer(sscreen, query, buf)) {
+ r600_resource_reference(&buf, NULL);
+ return NULL;
+ }
+
+ return buf;
+}
+
+static bool si_query_hw_prepare_buffer(struct si_screen *sscreen,
+ struct si_query_hw *query,
+ struct r600_resource *buffer)
+{
+ /* Callers ensure that the buffer is currently unused by the GPU. */
+ uint32_t *results = sscreen->ws->buffer_map(buffer->buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!results)
+ return false;
+
+ memset(results, 0, buffer->b.b.width0);
+
+ if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ unsigned max_rbs = sscreen->info.num_render_backends;
+ unsigned enabled_rb_mask = sscreen->info.enabled_rb_mask;
+ unsigned num_results;
+ unsigned i, j;
+
+ /* Set top bits for unused backends. */
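+		/* Bit 63 of each 64-bit value is the "result written" flag
+		 * checked by si_query_read_result; presetting it lets disabled
+		 * RBs pass the check with a zero sample count. */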
+ num_results = buffer->b.b.width0 / query->result_size;
+ for (j = 0; j < num_results; j++) {
+ for (i = 0; i < max_rbs; i++) {
+ if (!(enabled_rb_mask & (1<<i))) {
+ results[(i * 4)+1] = 0x80000000;
+ results[(i * 4)+3] = 0x80000000;
+ }
+ }
+ results += 4 * max_rbs;
+ }
+ }
+
+ return true;
+}
+
+static void si_query_hw_get_result_resource(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
+
+static struct si_query_ops query_hw_ops = {
+ .destroy = si_query_hw_destroy,
+ .begin = si_query_hw_begin,
+ .end = si_query_hw_end,
+ .get_result = si_query_hw_get_result,
+ .get_result_resource = si_query_hw_get_result_resource,
+};
+
+static void si_query_hw_do_emit_start(struct si_context *sctx,
+ struct si_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void si_query_hw_do_emit_stop(struct si_context *sctx,
+ struct si_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void si_query_hw_add_result(struct si_screen *sscreen,
+ struct si_query_hw *, void *buffer,
+ union pipe_query_result *result);
+static void si_query_hw_clear_result(struct si_query_hw *,
+ union pipe_query_result *);
+
+static struct si_query_hw_ops query_hw_default_hw_ops = {
+ .prepare_buffer = si_query_hw_prepare_buffer,
+ .emit_start = si_query_hw_do_emit_start,
+ .emit_stop = si_query_hw_do_emit_stop,
+ .clear_result = si_query_hw_clear_result,
+ .add_result = si_query_hw_add_result,
+};
+
+bool si_query_hw_init(struct si_screen *sscreen,
+ struct si_query_hw *query)
+{
+ query->buffer.buf = si_new_query_buffer(sscreen, query);
+ if (!query->buffer.buf)
+ return false;
+
+ return true;
+}
+
+static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
+ unsigned query_type,
+ unsigned index)
+{
+ struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
+ if (!query)
+ return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &query_hw_ops;
+ query->ops = &query_hw_default_hw_ops;
+
+ switch (query_type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
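+		/* 16 bytes per RB: a begin/end pair of 64-bit ZPASS sample counts. */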
+ query->result_size = 16 * sscreen->info.num_render_backends;
+ query->result_size += 16; /* for the fence + alignment */
+ query->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+ query->result_size = 64;
+ query->num_cs_dw_end = 0;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ query->result_size = 24;
+ query->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ query->result_size = 16;
+ query->num_cs_dw_end = 8 + si_cp_write_fence_dwords(sscreen);
+ query->flags = SI_QUERY_HW_FLAG_NO_START;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32;
+ query->num_cs_dw_end = 6;
+ query->stream = index;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32 * SI_MAX_STREAMS;
+ query->num_cs_dw_end = 6 * SI_MAX_STREAMS;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ /* 11 values on GCN. */
+ query->result_size = 11 * 16;
+ query->result_size += 8; /* for the fence + alignment */
+ query->num_cs_dw_end = 6 + si_cp_write_fence_dwords(sscreen);
+ break;
+ default:
+ assert(0);
+ FREE(query);
+ return NULL;
+ }
+
+ if (!si_query_hw_init(sscreen, query)) {
+ FREE(query);
+ return NULL;
+ }
+
+ return (struct pipe_query *)query;
+}
+
+static void si_update_occlusion_query_state(struct si_context *sctx,
+ unsigned type, int diff)
+{
+ if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ bool old_enable = sctx->num_occlusion_queries != 0;
+ bool old_perfect_enable =
+ sctx->num_perfect_occlusion_queries != 0;
+ bool enable, perfect_enable;
+
+ sctx->num_occlusion_queries += diff;
+ assert(sctx->num_occlusion_queries >= 0);
+
+ if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ sctx->num_perfect_occlusion_queries += diff;
+ assert(sctx->num_perfect_occlusion_queries >= 0);
+ }
+
+ enable = sctx->num_occlusion_queries != 0;
+ perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+
+ if (enable != old_enable || perfect_enable != old_perfect_enable) {
+ si_set_occlusion_query_state(sctx, old_perfect_enable);
+ }
+ }
+}
+
+static unsigned event_type_for_stream(unsigned stream)
+{
+ switch (stream) {
+ default:
+ case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
+ case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
+ case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
+ case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
+ }
+}
+
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
+ unsigned stream)
+{
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+}
+
+static void si_query_hw_do_emit_start(struct si_context *sctx,
+ struct si_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ switch (query->b.type) {
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+ return;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_TIMESTAMP, NULL, va,
+ 0, query->b.type);
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ break;
+ default:
+ assert(0);
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+}
+
+static void si_query_hw_emit_start(struct si_context *sctx,
+ struct si_query_hw *query)
+{
+ uint64_t va;
+
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure
+
+ si_update_occlusion_query_state(sctx, query->b.type, 1);
+ si_update_prims_generated_query_state(sctx, query->b.type, 1);
+
+ if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+ si_need_gfx_cs_space(sctx);
+
+ /* Get a new query buffer if needed. */
+ if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
+ struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+ *qbuf = query->buffer;
+ query->buffer.results_end = 0;
+ query->buffer.previous = qbuf;
+ query->buffer.buf = si_new_query_buffer(sctx->screen, query);
+ if (!query->buffer.buf)
+ return;
+ }
+
+ /* emit begin query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
+
+ query->ops->emit_start(sctx, query, query->buffer.buf, va);
+
+ sctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
+}
+
+static void si_query_hw_do_emit_stop(struct si_context *sctx,
+ struct si_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ uint64_t fence_va = 0;
+
+ switch (query->b.type) {
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+ return;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ va += 8;
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+
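+		/* The 32-bit fence is written right after the last per-RB pair. */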
+ fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ va += 16;
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ va += 16;
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ va += 8;
+ /* fall through */
+ case PIPE_QUERY_TIMESTAMP:
+ si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS,
+ 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_TIMESTAMP, NULL, va,
+ 0, query->b.type);
+ fence_va = va + 8;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
+ unsigned sample_size = (query->result_size - 8) / 2;
+
+ va += sample_size;
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+
+ fence_va = va + sample_size;
+ break;
+ }
+ default:
+ assert(0);
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+
+ if (fence_va) {
+ si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_VALUE_32BIT,
+ query->buffer.buf, fence_va, 0x80000000,
+ query->b.type);
+ }
+}
+
+static void si_query_hw_emit_stop(struct si_context *sctx,
+ struct si_query_hw *query)
+{
+ uint64_t va;
+
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure
+
+	/* Queries that have a begin already reserved CS space in begin_query;
+	 * only NO_START queries (e.g. timestamps) reserve it here. */
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+ si_need_gfx_cs_space(sctx);
+
+ /* emit end query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
+
+ query->ops->emit_stop(sctx, query, query->buffer.buf, va);
+
+ query->buffer.results_end += query->result_size;
+
+ if (!(query->flags & SI_QUERY_HW_FLAG_NO_START))
+ sctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
+
+ si_update_occlusion_query_state(sctx, query->b.type, -1);
+ si_update_prims_generated_query_state(sctx, query->b.type, -1);
+}
+
+static void emit_set_predicate(struct si_context *ctx,
+ struct r600_resource *buf, uint64_t va,
+ uint32_t op)
+{
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
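+	/* GFX9 added a separate dword for the high address bits of the
+	 * predication source; older chips pack them into the operation dword. */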
+ if (ctx->chip_class >= GFX9) {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, op | ((va >> 32) & 0xFF));
+ }
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ,
+ RADEON_PRIO_QUERY);
+}
+
+static void si_emit_query_predication(struct si_context *ctx)
+{
+ struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
+ struct si_query_buffer *qbuf;
+ uint32_t op;
+ bool flag_wait, invert;
+
+ if (!query)
+ return;
+
+ invert = ctx->render_cond_invert;
+ flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+ ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+ if (query->workaround_buf) {
+ op = PRED_OP(PREDICATION_OP_BOOL64);
+ } else {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ op = PRED_OP(PREDICATION_OP_ZPASS);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+ invert = !invert;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+ }
+
+ /* if true then invert, see GL_ARB_conditional_render_inverted */
+ if (invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+ else
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+	/* Use the value written by the compute shader as a workaround. Note that
+ * the wait flag does not apply in this predication mode.
+ *
+ * The shader outputs the result value to L2. Workarounds only affect VI
+ * and later, where the CP reads data from L2, so we don't need an
+ * additional flush.
+ */
+ if (query->workaround_buf) {
+ uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+ emit_set_predicate(ctx, query->workaround_buf, va, op);
+ return;
+ }
+
+ op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+ /* emit predicate packets for all data blocks */
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ uint64_t va_base = qbuf->buf->gpu_address;
+
+ while (results_base < qbuf->results_end) {
+ uint64_t va = va_base + results_base;
+
+ if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
+ }
+ } else {
+ emit_set_predicate(ctx, qbuf->buf, va, op);
+ op |= PREDICATION_CONTINUE;
+ }
+
+ results_base += query->result_size;
+ }
+ }
+}
+
+static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+{
+ struct si_screen *sscreen =
+ (struct si_screen *)ctx->screen;
+
+ if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
+ query_type == PIPE_QUERY_GPU_FINISHED ||
+ (query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
+ query_type != SI_QUERY_TIME_ELAPSED_SDMA))
+ return si_query_sw_create(query_type);
+
+ return si_query_hw_create(sscreen, query_type, index);
+}
+
+static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *rquery = (struct si_query *)query;
+
+ rquery->ops->destroy(sctx->screen, rquery);
+}
+
+static boolean si_begin_query(struct pipe_context *ctx,
+ struct pipe_query *query)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *rquery = (struct si_query *)query;
+
+ return rquery->ops->begin(sctx, rquery);
+}
+
+void si_query_hw_reset_buffers(struct si_context *sctx,
+ struct si_query_hw *query)
+{
+ struct si_query_buffer *prev = query->buffer.previous;
+
+ /* Discard the old query buffers. */
+ while (prev) {
+ struct si_query_buffer *qbuf = prev;
+ prev = prev->previous;
+ r600_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+
+ query->buffer.results_end = 0;
+ query->buffer.previous = NULL;
+
+ /* Obtain a new buffer if the current one can't be mapped without a stall. */
+ if (si_rings_is_buffer_referenced(sctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ r600_resource_reference(&query->buffer.buf, NULL);
+ query->buffer.buf = si_new_query_buffer(sctx->screen, query);
+ } else {
+ if (!query->ops->prepare_buffer(sctx->screen, query, query->buffer.buf))
+ r600_resource_reference(&query->buffer.buf, NULL);
+ }
+}
+
+bool si_query_hw_begin(struct si_context *sctx,
+ struct si_query *rquery)
+{
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
+ assert(0);
+ return false;
+ }
+
+ if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
+ si_query_hw_reset_buffers(sctx, query);
+
+ r600_resource_reference(&query->workaround_buf, NULL);
+
+ si_query_hw_emit_start(sctx, query);
+ if (!query->buffer.buf)
+ return false;
+
+ LIST_ADDTAIL(&query->list, &sctx->active_queries);
+ return true;
+}
+
+static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *rquery = (struct si_query *)query;
+
+ return rquery->ops->end(sctx, rquery);
+}
+
+bool si_query_hw_end(struct si_context *sctx,
+ struct si_query *rquery)
+{
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+ si_query_hw_reset_buffers(sctx, query);
+
+ si_query_hw_emit_stop(sctx, query);
+
+ if (!(query->flags & SI_QUERY_HW_FLAG_NO_START))
+ LIST_DELINIT(&query->list);
+
+ if (!query->buffer.buf)
+ return false;
+
+ return true;
+}
+
+static void si_get_hw_query_params(struct si_context *sctx,
+ struct si_query_hw *rquery, int index,
+ struct si_hw_query_params *params)
+{
+ unsigned max_rbs = sctx->screen->info.num_render_backends;
+
+ params->pair_stride = 0;
+ params->pair_count = 1;
+
+ switch (rquery->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = max_rbs * 16;
+ params->pair_stride = 16;
+ params->pair_count = max_rbs;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = 16;
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ params->start_offset = 0;
+ params->end_offset = 0;
+ params->fence_offset = 8;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ params->start_offset = 8;
+ params->end_offset = 24;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ params->start_offset = 0;
+ params->end_offset = 16;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ params->start_offset = 8 - index * 8;
+ params->end_offset = 24 - index * 8;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ params->pair_count = SI_MAX_STREAMS;
+ params->pair_stride = 32;
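+		/* fall through */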
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ params->start_offset = 0;
+ params->end_offset = 16;
+
+ /* We can re-use the high dword of the last 64-bit value as a
+ * fence: it is initialized as 0, and the high bit is set by
+ * the write of the streamout stats event.
+ */
+ params->fence_offset = rquery->result_size - 4;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ {
+ static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
+ params->start_offset = offsets[index];
+ params->end_offset = 88 + offsets[index];
+ params->fence_offset = 2 * 88;
+ break;
+ }
+ default:
+ unreachable("si_get_hw_query_params unsupported");
+ }
+}
+
+static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
+ bool test_status_bit)
+{
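+	/* Results are stored as begin/end pairs of 64-bit values. When
+	 * test_status_bit is set, bit 63 of both values must have been
+	 * written by the GPU for the sample to be counted. */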
+ uint32_t *current_result = (uint32_t*)map;
+ uint64_t start, end;
+
+ start = (uint64_t)current_result[start_index] |
+ (uint64_t)current_result[start_index+1] << 32;
+ end = (uint64_t)current_result[end_index] |
+ (uint64_t)current_result[end_index+1] << 32;
+
+ if (!test_status_bit ||
+ ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
+ return end - start;
+ }
+ return 0;
+}
+
+static void si_query_hw_add_result(struct si_screen *sscreen,
+ struct si_query_hw *query,
+ void *buffer,
+ union pipe_query_result *result)
+{
+ unsigned max_rbs = sscreen->info.num_render_backends;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER: {
+ for (unsigned i = 0; i < max_rbs; ++i) {
+ unsigned results_base = i * 16;
+ result->u64 +=
+ si_query_read_result(buffer + results_base, 0, 2, true);
+ }
+ break;
+ }
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+ for (unsigned i = 0; i < max_rbs; ++i) {
+ unsigned results_base = i * 16;
+ result->b = result->b ||
+ si_query_read_result(buffer + results_base, 0, 2, true) != 0;
+ }
+ break;
+ }
+ case PIPE_QUERY_TIME_ELAPSED:
+ result->u64 += si_query_read_result(buffer, 0, 2, false);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ result->u64 += si_query_read_result(buffer, 0, 32/4, false);
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ result->u64 = *(uint64_t*)buffer;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ /* SAMPLE_STREAMOUTSTATS stores this structure:
+ * {
+ * u64 NumPrimitivesWritten;
+ * u64 PrimitiveStorageNeeded;
+ * }
+ * We only need NumPrimitivesWritten here. */
+ result->u64 += si_query_read_result(buffer, 2, 6, true);
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ /* Here we read PrimitiveStorageNeeded. */
+ result->u64 += si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ result->so_statistics.num_primitives_written +=
+ si_query_read_result(buffer, 2, 6, true);
+ result->so_statistics.primitives_storage_needed +=
+ si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result->b = result->b ||
+ si_query_read_result(buffer, 2, 6, true) !=
+ si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ result->b = result->b ||
+ si_query_read_result(buffer, 2, 6, true) !=
+ si_query_read_result(buffer, 0, 4, true);
+ buffer = (char *)buffer + 32;
+ }
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
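+		/* Begin and end samples are 88 bytes (22 dwords) apart,
+		 * see si_get_hw_query_params. */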
+ result->pipeline_statistics.ps_invocations +=
+ si_query_read_result(buffer, 0, 22, false);
+ result->pipeline_statistics.c_primitives +=
+ si_query_read_result(buffer, 2, 24, false);
+ result->pipeline_statistics.c_invocations +=
+ si_query_read_result(buffer, 4, 26, false);
+ result->pipeline_statistics.vs_invocations +=
+ si_query_read_result(buffer, 6, 28, false);
+ result->pipeline_statistics.gs_invocations +=
+ si_query_read_result(buffer, 8, 30, false);
+ result->pipeline_statistics.gs_primitives +=
+ si_query_read_result(buffer, 10, 32, false);
+ result->pipeline_statistics.ia_primitives +=
+ si_query_read_result(buffer, 12, 34, false);
+ result->pipeline_statistics.ia_vertices +=
+ si_query_read_result(buffer, 14, 36, false);
+ result->pipeline_statistics.hs_invocations +=
+ si_query_read_result(buffer, 16, 38, false);
+ result->pipeline_statistics.ds_invocations +=
+ si_query_read_result(buffer, 18, 40, false);
+ result->pipeline_statistics.cs_invocations +=
+ si_query_read_result(buffer, 20, 42, false);
+#if 0 /* for testing */
+ printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
+ "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
+ "Clipper prims=%llu, PS=%llu, CS=%llu\n",
+ result->pipeline_statistics.ia_vertices,
+ result->pipeline_statistics.ia_primitives,
+ result->pipeline_statistics.vs_invocations,
+ result->pipeline_statistics.hs_invocations,
+ result->pipeline_statistics.ds_invocations,
+ result->pipeline_statistics.gs_invocations,
+ result->pipeline_statistics.gs_primitives,
+ result->pipeline_statistics.c_invocations,
+ result->pipeline_statistics.c_primitives,
+ result->pipeline_statistics.ps_invocations,
+ result->pipeline_statistics.cs_invocations);
+#endif
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static boolean si_get_query_result(struct pipe_context *ctx,
+ struct pipe_query *query, boolean wait,
+ union pipe_query_result *result)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *rquery = (struct si_query *)query;
+
+ return rquery->ops->get_result(sctx, rquery, wait, result);
+}
+
+static void si_get_query_result_resource(struct pipe_context *ctx,
+ struct pipe_query *query,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *rquery = (struct si_query *)query;
+
+ rquery->ops->get_result_resource(sctx, rquery, wait, result_type, index,
+ resource, offset);
+}
+
+static void si_query_hw_clear_result(struct si_query_hw *query,
+ union pipe_query_result *result)
+{
+ util_query_clear_result(result, query->b.type);
+}
+
+bool si_query_hw_get_result(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait, union pipe_query_result *result)
+{
+ struct si_screen *sscreen = sctx->screen;
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+ struct si_query_buffer *qbuf;
+
+ query->ops->clear_result(query, result);
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned usage = PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ unsigned results_base = 0;
+ void *map;
+
+ if (rquery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
+ while (results_base != qbuf->results_end) {
+ query->ops->add_result(sscreen, query, map + results_base,
+ result);
+ results_base += query->result_size;
+ }
+ }
+
+ /* Convert the time to expected units. */
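+	/* (clock ticks -> nanoseconds; clock_crystal_freq is in kHz) */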
+ if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
+ rquery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
+ rquery->type == PIPE_QUERY_TIMESTAMP) {
+ result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
+ }
+ return true;
+}
+
+static void si_restore_qbo_state(struct si_context *sctx,
+ struct si_qbo_state *st)
+{
+ sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+ pipe_resource_reference(&st->saved_const0.buffer, NULL);
+
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+ for (unsigned i = 0; i < 3; ++i)
+ pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
+}
+
+static void si_query_hw_get_result_resource(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct si_query_hw *query = (struct si_query_hw *)rquery;
+ struct si_query_buffer *qbuf;
+ struct si_query_buffer *qbuf_prev;
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+ struct si_qbo_state saved_state = {};
+ struct pipe_grid_info grid = {};
+ struct pipe_constant_buffer constant_buffer = {};
+ struct pipe_shader_buffer ssbo[3];
+ struct si_hw_query_params params;
+ struct {
+ uint32_t end_offset;
+ uint32_t result_stride;
+ uint32_t result_count;
+ uint32_t config;
+ uint32_t fence_offset;
+ uint32_t pair_stride;
+ uint32_t pair_count;
+ } consts;
+
+ if (!sctx->query_result_shader) {
+ sctx->query_result_shader = si_create_query_result_cs(sctx);
+ if (!sctx->query_result_shader)
+ return;
+ }
+
+ if (query->buffer.previous) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+ &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
+
+ si_save_qbo_state(sctx, &saved_state);
+
+ si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
+ consts.end_offset = params.end_offset - params.start_offset;
+ consts.fence_offset = params.fence_offset - params.start_offset;
+ consts.result_stride = query->result_size;
+ consts.pair_stride = params.pair_stride;
+ consts.pair_count = params.pair_count;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
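+	/* The config bits select how the result shader created by
+	 * si_create_query_result_cs accumulates and converts the samples. */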
+ consts.config = 0;
+ if (index < 0)
+ consts.config |= 4;
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+ consts.config |= 8;
+ else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ consts.config |= 8 | 256;
+ else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
+ query->b.type == PIPE_QUERY_TIME_ELAPSED)
+ consts.config |= 32;
+
+ switch (result_type) {
+ case PIPE_QUERY_TYPE_U64:
+ case PIPE_QUERY_TYPE_I64:
+ consts.config |= 64;
+ break;
+ case PIPE_QUERY_TYPE_I32:
+ consts.config |= 128;
+ break;
+ case PIPE_QUERY_TYPE_U32:
+ break;
+ }
+
+ sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+ if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+ qbuf_prev = qbuf->previous;
+ consts.result_count = qbuf->results_end / query->result_size;
+ consts.config &= ~3;
+ if (qbuf != &query->buffer)
+ consts.config |= 1;
+ if (qbuf->previous)
+ consts.config |= 2;
+ } else {
+ /* Only read the last timestamp. */
+ qbuf_prev = NULL;
+ consts.result_count = 0;
+ consts.config |= 16;
+ params.start_offset += qbuf->results_end - query->result_size;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = params.start_offset;
+ ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+ if (!qbuf->previous) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+
+ r600_resource(resource)->TC_L2_dirty = true;
+ }
+
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
+
+ if (wait && qbuf == &query->buffer) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
+ va += params.fence_offset;
+
+ si_cp_wait_mem(sctx, va, 0x80000000, 0x80000000, 0);
+ }
+
+ sctx->b.launch_grid(&sctx->b, &grid);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ si_restore_qbo_state(sctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+static void si_render_condition(struct pipe_context *ctx,
+ struct pipe_query *query,
+ boolean condition,
+ enum pipe_render_cond_flag mode)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query_hw *rquery = (struct si_query_hw *)query;
+ struct si_atom *atom = &sctx->atoms.s.render_cond;
+
+ if (query) {
+ bool needs_workaround = false;
+
+ /* There was a firmware regression in VI which causes successive
+ * SET_PREDICATION packets to give the wrong answer for
+ * non-inverted stream overflow predication.
+ */
+ if (((sctx->chip_class == VI && sctx->screen->info.pfp_fw_feature < 49) ||
+ (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
+ !condition &&
+ (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
+ (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
+ (rquery->buffer.previous ||
+ rquery->buffer.results_end > rquery->result_size)))) {
+ needs_workaround = true;
+ }
+
+ if (needs_workaround && !rquery->workaround_buf) {
+ bool old_force_off = sctx->render_cond_force_off;
+ sctx->render_cond_force_off = true;
+
+ u_suballocator_alloc(
+ sctx->allocator_zeroed_memory, 8, 8,
+ &rquery->workaround_offset,
+ (struct pipe_resource **)&rquery->workaround_buf);
+
+			/* Reset to NULL so that launching the compute grid
+			 * doesn't emit a redundant SET_PREDICATION packet.
+			 */
+ sctx->render_cond = NULL;
+
+ ctx->get_query_result_resource(
+ ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
+ &rquery->workaround_buf->b.b, rquery->workaround_offset);
+
+			/* Setting this in the render cond atom is too late,
+			 * so set it here. */
+ sctx->flags |= sctx->screen->barrier_flags.L2_to_cp |
+ SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+
+ sctx->render_cond_force_off = old_force_off;
+ }
+ }
+
+ sctx->render_cond = query;
+ sctx->render_cond_invert = condition;
+ sctx->render_cond_mode = mode;
+
+ si_set_atom_dirty(sctx, atom, query != NULL);
+}
+
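+/* Temporarily stop all active queries (e.g. around a gfx IB flush);
+ * si_resume_queries restarts them. */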
+void si_suspend_queries(struct si_context *sctx)
+{
+ struct si_query_hw *query;
+
+ LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, list) {
+ si_query_hw_emit_stop(sctx, query);
+ }
+ assert(sctx->num_cs_dw_queries_suspend == 0);
+}
+
+void si_resume_queries(struct si_context *sctx)
+{
+ struct si_query_hw *query;
+
+ assert(sctx->num_cs_dw_queries_suspend == 0);
+
+ /* Check CS space here. Resuming must not be interrupted by flushes. */
+ si_need_gfx_cs_space(sctx);
+
+ LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, list) {
+ si_query_hw_emit_start(sctx, query);
+ }
+}
+
+#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
+ { \
+ .name = name_, \
+ .query_type = SI_QUERY_##query_type_, \
+ .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+ .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
+ .group_id = group_id_ \
+ }
+
+#define X(name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+
+#define XG(group_, name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
+
+static struct pipe_driver_query_info si_driver_query_list[] = {
+ X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
+ X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+ X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
+ X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+ X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
+ X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
+ X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
+ X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
+ X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
+ X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
+ X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
+ X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
+ X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
+ X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
+ X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
+ X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
+ X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
+ X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
+ X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
+ X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
+ X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
+ X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
+ X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
+ X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
+ X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
+ X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
+ X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
+ X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
+ X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
+ X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
+ X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
+ X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
+ X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
+ X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
+ X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
+ X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
+ X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+ X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
+ X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+ X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
+
+ /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+	 * which use them as a fallback path to detect the GPU type.
+ *
+ * Note: The names of these queries are significant for GPUPerfStudio
+ * (and possibly their order as well). */
+ XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
+ XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
+ XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
+ XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
+ XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
+
+ X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
+ X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
+ X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+
+ /* The following queries must be at the end of the list because their
+ * availability is adjusted dynamically based on the DRM version. */
+ X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+ X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
+ X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
+ X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
+ X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
+ X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
+ X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
+ X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
+ X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
+ X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
+ X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
+ X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
+ X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
+
+ /* SRBM_STATUS2 */
+ X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
+
+ /* CP_STAT */
+ X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
+ X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
+ X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
+ X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
+ X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
+};
+
+#undef X
+#undef XG
+#undef XFULL
+
+static unsigned si_get_num_queries(struct si_screen *sscreen)
+{
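+	/* The trailing GPU-load / GRBM / SRBM / CP_STAT entries in
+	 * si_driver_query_list need register reads that not every
+	 * kernel/GPU combination supports, so trim them here. */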
+ /* amdgpu */
+ if (sscreen->info.drm_major == 3) {
+ if (sscreen->info.chip_class >= VI)
+ return ARRAY_SIZE(si_driver_query_list);
+ else
+ return ARRAY_SIZE(si_driver_query_list) - 7;
+ }
+
+ /* radeon */
+ if (sscreen->info.has_read_registers_query) {
+ if (sscreen->info.chip_class == CIK)
+ return ARRAY_SIZE(si_driver_query_list) - 6;
+ else
+ return ARRAY_SIZE(si_driver_query_list) - 7;
+ }
+
+ return ARRAY_SIZE(si_driver_query_list) - 21;
+}
+
+static int si_get_driver_query_info(struct pipe_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ unsigned num_queries = si_get_num_queries(sscreen);
+
+ if (!info) {
+ unsigned num_perfcounters =
+ si_get_perfcounter_info(sscreen, 0, NULL);
+
+ return num_queries + num_perfcounters;
+ }
+
+ if (index >= num_queries)
+ return si_get_perfcounter_info(sscreen, index - num_queries, info);
+
+ *info = si_driver_query_list[index];
+
+ switch (info->query_type) {
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_MAPPED_VRAM:
+ info->max_value.u64 = sscreen->info.vram_size;
+ break;
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_MAPPED_GTT:
+ info->max_value.u64 = sscreen->info.gart_size;
+ break;
+ case SI_QUERY_GPU_TEMPERATURE:
+ info->max_value.u64 = 125;
+ break;
+ case SI_QUERY_VRAM_VIS_USAGE:
+ info->max_value.u64 = sscreen->info.vram_vis_size;
+ break;
+ }
+
+ if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
+ info->group_id += sscreen->perfcounters->num_groups;
+
+ return 1;
+}
+
+/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
+ * performance counter groups, so be careful when changing this and related
+ * functions.
+ */
+static int si_get_driver_query_group_info(struct pipe_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ unsigned num_pc_groups = 0;
+
+ if (sscreen->perfcounters)
+ num_pc_groups = sscreen->perfcounters->num_groups;
+
+ if (!info)
+ return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
+
+ if (index < num_pc_groups)
+ return si_get_perfcounter_group_info(sscreen, index, info);
+
+ index -= num_pc_groups;
+ if (index >= SI_NUM_SW_QUERY_GROUPS)
+ return 0;
+
+ info->name = "GPIN";
+ info->max_active_queries = 5;
+ info->num_queries = 5;
+ return 1;
+}
+
+void si_init_query_functions(struct si_context *sctx)
+{
+ sctx->b.create_query = si_create_query;
+ sctx->b.create_batch_query = si_create_batch_query;
+ sctx->b.destroy_query = si_destroy_query;
+ sctx->b.begin_query = si_begin_query;
+ sctx->b.end_query = si_end_query;
+ sctx->b.get_query_result = si_get_query_result;
+ sctx->b.get_query_result_resource = si_get_query_result_resource;
+ sctx->atoms.s.render_cond.emit = si_emit_query_predication;
+
+ if (((struct si_screen*)sctx->b.screen)->info.num_render_backends > 0)
+ sctx->b.render_condition = si_render_condition;
+
+ LIST_INITHEAD(&sctx->active_queries);
+}
+
+void si_init_screen_query_functions(struct si_screen *sscreen)
+{
+ sscreen->b.get_driver_query_info = si_get_driver_query_info;
+ sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_query.h b/lib/mesa/src/gallium/drivers/radeonsi/si_query.h
new file mode 100644
index 000000000..cf2eccd86
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_query.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SI_QUERY_H
+#define SI_QUERY_H
+
+#include "util/u_threaded_context.h"
+
+struct pipe_context;
+struct pipe_query;
+struct pipe_resource;
+
+struct si_screen;
+struct si_context;
+struct si_query;
+struct si_query_hw;
+struct r600_resource;
+
+enum {
+ SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+ SI_QUERY_DECOMPRESS_CALLS,
+ SI_QUERY_MRT_DRAW_CALLS,
+ SI_QUERY_PRIM_RESTART_CALLS,
+ SI_QUERY_SPILL_DRAW_CALLS,
+ SI_QUERY_COMPUTE_CALLS,
+ SI_QUERY_SPILL_COMPUTE_CALLS,
+ SI_QUERY_DMA_CALLS,
+ SI_QUERY_CP_DMA_CALLS,
+ SI_QUERY_NUM_VS_FLUSHES,
+ SI_QUERY_NUM_PS_FLUSHES,
+ SI_QUERY_NUM_CS_FLUSHES,
+ SI_QUERY_NUM_CB_CACHE_FLUSHES,
+ SI_QUERY_NUM_DB_CACHE_FLUSHES,
+ SI_QUERY_NUM_L2_INVALIDATES,
+ SI_QUERY_NUM_L2_WRITEBACKS,
+ SI_QUERY_NUM_RESIDENT_HANDLES,
+ SI_QUERY_TC_OFFLOADED_SLOTS,
+ SI_QUERY_TC_DIRECT_SLOTS,
+ SI_QUERY_TC_NUM_SYNCS,
+ SI_QUERY_CS_THREAD_BUSY,
+ SI_QUERY_GALLIUM_THREAD_BUSY,
+ SI_QUERY_REQUESTED_VRAM,
+ SI_QUERY_REQUESTED_GTT,
+ SI_QUERY_MAPPED_VRAM,
+ SI_QUERY_MAPPED_GTT,
+ SI_QUERY_BUFFER_WAIT_TIME,
+ SI_QUERY_NUM_MAPPED_BUFFERS,
+ SI_QUERY_NUM_GFX_IBS,
+ SI_QUERY_NUM_SDMA_IBS,
+ SI_QUERY_GFX_BO_LIST_SIZE,
+ SI_QUERY_GFX_IB_SIZE,
+ SI_QUERY_NUM_BYTES_MOVED,
+ SI_QUERY_NUM_EVICTIONS,
+ SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
+ SI_QUERY_VRAM_USAGE,
+ SI_QUERY_VRAM_VIS_USAGE,
+ SI_QUERY_GTT_USAGE,
+ SI_QUERY_GPU_TEMPERATURE,
+ SI_QUERY_CURRENT_GPU_SCLK,
+ SI_QUERY_CURRENT_GPU_MCLK,
+ SI_QUERY_GPU_LOAD,
+ SI_QUERY_GPU_SHADERS_BUSY,
+ SI_QUERY_GPU_TA_BUSY,
+ SI_QUERY_GPU_GDS_BUSY,
+ SI_QUERY_GPU_VGT_BUSY,
+ SI_QUERY_GPU_IA_BUSY,
+ SI_QUERY_GPU_SX_BUSY,
+ SI_QUERY_GPU_WD_BUSY,
+ SI_QUERY_GPU_BCI_BUSY,
+ SI_QUERY_GPU_SC_BUSY,
+ SI_QUERY_GPU_PA_BUSY,
+ SI_QUERY_GPU_DB_BUSY,
+ SI_QUERY_GPU_CP_BUSY,
+ SI_QUERY_GPU_CB_BUSY,
+ SI_QUERY_GPU_SDMA_BUSY,
+ SI_QUERY_GPU_PFP_BUSY,
+ SI_QUERY_GPU_MEQ_BUSY,
+ SI_QUERY_GPU_ME_BUSY,
+ SI_QUERY_GPU_SURF_SYNC_BUSY,
+ SI_QUERY_GPU_CP_DMA_BUSY,
+ SI_QUERY_GPU_SCRATCH_RAM_BUSY,
+ SI_QUERY_NUM_COMPILATIONS,
+ SI_QUERY_NUM_SHADERS_CREATED,
+ SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+ SI_QUERY_NUM_SHADER_CACHE_HITS,
+ SI_QUERY_GPIN_ASIC_ID,
+ SI_QUERY_GPIN_NUM_SIMD,
+ SI_QUERY_GPIN_NUM_RB,
+ SI_QUERY_GPIN_NUM_SPI,
+ SI_QUERY_GPIN_NUM_SE,
+ SI_QUERY_TIME_ELAPSED_SDMA,
+ SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
+
+ SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
+};
+
+enum {
+ SI_QUERY_GROUP_GPIN = 0,
+ SI_NUM_SW_QUERY_GROUPS
+};
+
+struct si_query_ops {
+ void (*destroy)(struct si_screen *, struct si_query *);
+ bool (*begin)(struct si_context *, struct si_query *);
+ bool (*end)(struct si_context *, struct si_query *);
+ bool (*get_result)(struct si_context *,
+ struct si_query *, bool wait,
+ union pipe_query_result *result);
+ void (*get_result_resource)(struct si_context *,
+ struct si_query *, bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
+};
+
+struct si_query {
+ struct threaded_query b;
+ struct si_query_ops *ops;
+
+ /* The type of query */
+ unsigned type;
+};
+
+enum {
+ SI_QUERY_HW_FLAG_NO_START = (1 << 0),
+ /* gap */
+	/* begin_query doesn't clear the result; new samples keep
+	 * accumulating into the existing query buffers */
+ SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
+};
+
+struct si_query_hw_ops {
+ bool (*prepare_buffer)(struct si_screen *,
+ struct si_query_hw *,
+ struct r600_resource *);
+ void (*emit_start)(struct si_context *,
+ struct si_query_hw *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*emit_stop)(struct si_context *,
+ struct si_query_hw *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
+ void (*add_result)(struct si_screen *screen,
+ struct si_query_hw *, void *buffer,
+ union pipe_query_result *result);
+};
+
+struct si_query_buffer {
+ /* The buffer where query results are stored. */
+ struct r600_resource *buf;
+ /* Offset of the next free result after current query data */
+ unsigned results_end;
+ /* If a query buffer is full, a new buffer is created and the old one
+ * is put in here. When we calculate the result, we sum up the samples
+ * from all buffers. */
+ struct si_query_buffer *previous;
+};
+
+struct si_query_hw {
+ struct si_query b;
+ struct si_query_hw_ops *ops;
+ unsigned flags;
+
+ /* The query buffer and how many results are in it. */
+ struct si_query_buffer buffer;
+	/* Size of the result in memory for both begin_query and end_query;
+	 * this can be one or two numbers, or even the size of a structure. */
+ unsigned result_size;
+ /* The number of dwords for end_query. */
+ unsigned num_cs_dw_end;
+ /* Linked list of queries */
+ struct list_head list;
+ /* For transform feedback: which stream the query is for */
+ unsigned stream;
+
+ /* Workaround via compute shader */
+ struct r600_resource *workaround_buf;
+ unsigned workaround_offset;
+};
+
+bool si_query_hw_init(struct si_screen *sscreen,
+ struct si_query_hw *query);
+void si_query_hw_destroy(struct si_screen *sscreen,
+ struct si_query *rquery);
+bool si_query_hw_begin(struct si_context *sctx,
+ struct si_query *rquery);
+bool si_query_hw_end(struct si_context *sctx,
+ struct si_query *rquery);
+bool si_query_hw_get_result(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ union pipe_query_result *result);
+
+/* Performance counters */
+enum {
+ /* This block is part of the shader engine */
+ SI_PC_BLOCK_SE = (1 << 0),
+
+ /* Expose per-instance groups instead of summing all instances (within
+ * an SE). */
+ SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+
+ /* Expose per-SE groups instead of summing instances across SEs. */
+ SI_PC_BLOCK_SE_GROUPS = (1 << 2),
+
+ /* Shader block */
+ SI_PC_BLOCK_SHADER = (1 << 3),
+
+ /* Non-shader block with perfcounters windowed by shaders. */
+ SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+};
+
+/* Describes a hardware block with performance counters. Multiple instances of
+ * each block, possibly per-SE, may exist on the chip. Depending on the block
+ * and on the user's configuration, we either
+ * (a) expose every instance as a performance counter group,
+ * (b) expose a single performance counter group that reports the sum over all
+ * instances, or
+ * (c) expose one performance counter group per instance, but summed over all
+ * shader engines.
+ */
+struct si_perfcounter_block {
+ const char *basename;
+ unsigned flags;
+ unsigned num_counters;
+ unsigned num_selectors;
+ unsigned num_instances;
+
+ unsigned num_groups;
+ char *group_names;
+ unsigned group_name_stride;
+
+ char *selector_names;
+ unsigned selector_name_stride;
+
+ void *data;
+};
+
+struct si_perfcounters {
+ unsigned num_groups;
+ unsigned num_blocks;
+ struct si_perfcounter_block *blocks;
+
+ unsigned num_stop_cs_dwords;
+ unsigned num_instance_cs_dwords;
+
+ unsigned num_shader_types;
+ const char * const *shader_type_suffixes;
+ const unsigned *shader_type_bits;
+
+ void (*emit_instance)(struct si_context *,
+ int se, int instance);
+ void (*emit_shaders)(struct si_context *, unsigned shaders);
+ void (*emit_select)(struct si_context *,
+ struct si_perfcounter_block *,
+ unsigned count, unsigned *selectors);
+ void (*emit_start)(struct si_context *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*emit_stop)(struct si_context *,
+ struct r600_resource *buffer, uint64_t va);
+ void (*emit_read)(struct si_context *,
+ struct si_perfcounter_block *,
+ unsigned count, unsigned *selectors,
+ struct r600_resource *buffer, uint64_t va);
+
+ void (*cleanup)(struct si_screen *);
+
+ bool separate_se;
+ bool separate_instance;
+};
+
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
+ unsigned num_queries,
+ unsigned *query_types);
+
+int si_get_perfcounter_info(struct si_screen *,
+ unsigned index,
+ struct pipe_driver_query_info *info);
+int si_get_perfcounter_group_info(struct si_screen *,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+
+bool si_perfcounters_init(struct si_perfcounters *, unsigned num_blocks);
+void si_perfcounters_add_block(struct si_screen *,
+ struct si_perfcounters *,
+ const char *name, unsigned flags,
+ unsigned counters, unsigned selectors,
+ unsigned instances, void *data);
+void si_perfcounters_do_destroy(struct si_perfcounters *);
+void si_query_hw_reset_buffers(struct si_context *sctx,
+ struct si_query_hw *query);
+
+struct si_qbo_state {
+ void *saved_compute;
+ struct pipe_constant_buffer saved_const0;
+ struct pipe_shader_buffer saved_ssbo[3];
+};
+
+#endif /* SI_QUERY_H */
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
index 7a8822738..cd38145da 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -1,5 +1,6 @@
/*
* Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,8 +22,8 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader.h"
#include "si_shader_internal.h"
+#include "si_pipe.h"
#include "ac_nir_to_llvm.h"
@@ -31,11 +32,23 @@
#include "compiler/nir/nir.h"
#include "compiler/nir_types.h"
+static nir_variable* tex_get_texture_var(nir_tex_instr *instr)
+{
+ for (unsigned i = 0; i < instr->num_srcs; i++) {
+ switch (instr->src[i].src_type) {
+ case nir_tex_src_texture_deref:
+ return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src));
+ default:
+ break;
+ }
+ }
+
+ return NULL;
+}
-static int
-type_size(const struct glsl_type *type)
+static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr)
{
- return glsl_count_attribute_slots(type, false);
+ return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
}
static void scan_instruction(struct tgsi_shader_info *info,
@@ -58,6 +71,15 @@ static void scan_instruction(struct tgsi_shader_info *info,
}
} else if (instr->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(instr);
+ nir_variable *texture = tex_get_texture_var(tex);
+
+ if (!texture) {
+ info->samplers_declared |=
+ u_bit_consecutive(tex->sampler_index, 1);
+ } else {
+ if (texture->data.bindless)
+ info->uses_bindless_samplers = true;
+ }
switch (tex->op) {
case nir_texop_tex:
@@ -78,6 +100,30 @@ static void scan_instruction(struct tgsi_shader_info *info,
case nir_intrinsic_load_instance_id:
info->uses_instanceid = 1;
break;
+ case nir_intrinsic_load_invocation_id:
+ info->uses_invocationid = true;
+ break;
+ case nir_intrinsic_load_num_work_groups:
+ info->uses_grid_size = true;
+ break;
+ case nir_intrinsic_load_local_group_size:
+ /* The block size is translated to IMM with a fixed block size. */
+ if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+ info->uses_block_size = true;
+ break;
+ case nir_intrinsic_load_local_invocation_id:
+ case nir_intrinsic_load_work_group_id: {
+ unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+
+ if (intr->intrinsic == nir_intrinsic_load_work_group_id)
+ info->uses_block_id[i] = true;
+ else
+ info->uses_thread_id[i] = true;
+ }
+ break;
+ }
case nir_intrinsic_load_vertex_id:
info->uses_vertexid = 1;
break;
@@ -90,15 +136,37 @@ static void scan_instruction(struct tgsi_shader_info *info,
case nir_intrinsic_load_primitive_id:
info->uses_primid = 1;
break;
- case nir_intrinsic_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_min:
- case nir_intrinsic_image_atomic_max:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_load_sample_mask_in:
+ info->reads_samplemask = true;
+ break;
+ case nir_intrinsic_load_tess_level_inner:
+ case nir_intrinsic_load_tess_level_outer:
+ info->reads_tess_factors = true;
+ break;
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_samples: {
+ nir_variable *var = intrinsic_get_var(intr);
+ if (var->data.bindless)
+ info->uses_bindless_images = true;
+
+ break;
+ }
+ case nir_intrinsic_image_deref_store:
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap: {
+ nir_variable *var = intrinsic_get_var(intr);
+ if (var->data.bindless)
+ info->uses_bindless_images = true;
+
+ /* fall-through */
+ }
case nir_intrinsic_store_ssbo:
case nir_intrinsic_ssbo_atomic_add:
case nir_intrinsic_ssbo_atomic_imin:
@@ -112,42 +180,206 @@ static void scan_instruction(struct tgsi_shader_info *info,
case nir_intrinsic_ssbo_atomic_comp_swap:
info->writes_memory = true;
break;
+ case nir_intrinsic_load_deref: {
+ nir_variable *var = intrinsic_get_var(intr);
+ nir_variable_mode mode = var->data.mode;
+ enum glsl_base_type base_type =
+ glsl_get_base_type(glsl_without_array(var->type));
+
+ if (mode == nir_var_shader_in) {
+ switch (var->data.interpolation) {
+ case INTERP_MODE_NONE:
+ if (glsl_base_type_is_integer(base_type))
+ break;
+
+ /* fall-through */
+ case INTERP_MODE_SMOOTH:
+ if (var->data.sample)
+ info->uses_persp_sample = true;
+ else if (var->data.centroid)
+ info->uses_persp_centroid = true;
+ else
+ info->uses_persp_center = true;
+ break;
+
+ case INTERP_MODE_NOPERSPECTIVE:
+ if (var->data.sample)
+ info->uses_linear_sample = true;
+ else if (var->data.centroid)
+ info->uses_linear_centroid = true;
+ else
+ info->uses_linear_center = true;
+ break;
+ }
+ }
+ break;
+ }
+ case nir_intrinsic_interp_deref_at_centroid:
+ case nir_intrinsic_interp_deref_at_sample:
+ case nir_intrinsic_interp_deref_at_offset: {
+ enum glsl_interp_mode interp = intrinsic_get_var(intr)->data.interpolation;
+ switch (interp) {
+ case INTERP_MODE_SMOOTH:
+ case INTERP_MODE_NONE:
+ if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
+ info->uses_persp_opcode_interp_centroid = true;
+ else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
+ info->uses_persp_opcode_interp_sample = true;
+ else
+ info->uses_persp_opcode_interp_offset = true;
+ break;
+ case INTERP_MODE_NOPERSPECTIVE:
+ if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
+ info->uses_linear_opcode_interp_centroid = true;
+ else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
+ info->uses_linear_opcode_interp_sample = true;
+ else
+ info->uses_linear_opcode_interp_offset = true;
+ break;
+ case INTERP_MODE_FLAT:
+ break;
+ default:
+				unreachable("Unsupported interpolation type");
+ }
+ break;
+ }
default:
break;
}
}
}
+void si_nir_scan_tess_ctrl(const struct nir_shader *nir,
+ const struct tgsi_shader_info *info,
+ struct tgsi_tessctrl_info *out)
+{
+ memset(out, 0, sizeof(*out));
+
+ if (nir->info.stage != MESA_SHADER_TESS_CTRL)
+ return;
+
+ /* Initial value = true. Here the pass will accumulate results from
+ * multiple segments surrounded by barriers. If tess factors aren't
+ * written at all, it's a shader bug and we don't care if this will be
+ * true.
+ */
+ out->tessfactors_are_def_in_all_invocs = true;
+
+ /* TODO: Implement scanning of tess factors, see tgsi backend. */
+}
+
void si_nir_scan_shader(const struct nir_shader *nir,
struct tgsi_shader_info *info)
{
nir_function *func;
unsigned i;
- assert(nir->info.stage == MESA_SHADER_VERTEX ||
- nir->info.stage == MESA_SHADER_FRAGMENT);
-
info->processor = pipe_shader_type_from_mesa(nir->info.stage);
info->num_tokens = 2; /* indicate that the shader is non-empty */
info->num_instructions = 2;
- info->num_inputs = nir->num_inputs;
- info->num_outputs = nir->num_outputs;
+ info->properties[TGSI_PROPERTY_NEXT_SHADER] =
+ pipe_shader_type_from_mesa(nir->info.next_stage);
+
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+ info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] =
+ nir->info.tess.tcs_vertices_out;
+ }
+
+ if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+ if (nir->info.tess.primitive_mode == GL_ISOLINES)
+ info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
+ else
+ info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
+
+ STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
+ STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 ==
+ PIPE_TESS_SPACING_FRACTIONAL_ODD);
+ STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 ==
+ PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+
+ info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
+ info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
+ info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
+ }
+
+ if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+ info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
+ info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
+ info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
+ info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
+ }
+
+ if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
+ nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
+ info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
+
+ if (nir->info.fs.pixel_center_integer) {
+ info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
+ TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+ }
+
+ if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
+ switch (nir->info.fs.depth_layout) {
+ case FRAG_DEPTH_LAYOUT_ANY:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
+ break;
+ case FRAG_DEPTH_LAYOUT_GREATER:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
+ break;
+ case FRAG_DEPTH_LAYOUT_LESS:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
+ break;
+ case FRAG_DEPTH_LAYOUT_UNCHANGED:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
+ break;
+ default:
+				unreachable("Unknown depth layout");
+ }
+ }
+ }
+
+ if (nir->info.stage == MESA_SHADER_COMPUTE) {
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
+ }
i = 0;
+ uint64_t processed_inputs = 0;
+ unsigned num_inputs = 0;
nir_foreach_variable(variable, &nir->inputs) {
unsigned semantic_name, semantic_index;
- unsigned attrib_count = glsl_count_attribute_slots(variable->type,
+
+ const struct glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }
+
+ unsigned attrib_count = glsl_count_attribute_slots(type,
nir->info.stage == MESA_SHADER_VERTEX);
- assert(attrib_count == 1 && "not implemented");
+ i = variable->data.driver_location;
/* Vertex shader inputs don't have semantics. The state
* tracker has already mapped them to attributes via
* variable->data.driver_location.
*/
- if (nir->info.stage == MESA_SHADER_VERTEX)
+ if (nir->info.stage == MESA_SHADER_VERTEX) {
+			/* TODO: gather the actual input usage and remove this. */
+ info->input_usage_mask[i] = TGSI_WRITEMASK_XYZW;
+
+ if (glsl_type_is_dual_slot(variable->type)) {
+ num_inputs += 2;
+
+				/* TODO: gather the actual input usage and remove this. */
+ info->input_usage_mask[i+1] = TGSI_WRITEMASK_XYZW;
+ } else
+ num_inputs++;
continue;
+ }
/* Fragment shader position is a system value. */
if (nir->info.stage == MESA_SHADER_FRAGMENT &&
@@ -155,150 +387,345 @@ void si_nir_scan_shader(const struct nir_shader *nir,
if (variable->data.pixel_center_integer)
info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+
+ num_inputs++;
continue;
}
- tgsi_get_gl_varying_semantic(variable->data.location, true,
- &semantic_name, &semantic_index);
+ for (unsigned j = 0; j < attrib_count; j++, i++) {
- info->input_semantic_name[i] = semantic_name;
- info->input_semantic_index[i] = semantic_index;
+ if (processed_inputs & ((uint64_t)1 << i))
+ continue;
- if (variable->data.sample)
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
- else if (variable->data.centroid)
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
- else
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
-
- enum glsl_base_type base_type =
- glsl_get_base_type(glsl_without_array(variable->type));
+ processed_inputs |= ((uint64_t)1 << i);
+ num_inputs++;
- switch (variable->data.interpolation) {
- case INTERP_MODE_NONE:
- if (glsl_base_type_is_integer(base_type)) {
- info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
- break;
- }
+ tgsi_get_gl_varying_semantic(variable->data.location + j, true,
+ &semantic_name, &semantic_index);
- if (semantic_name == TGSI_SEMANTIC_COLOR) {
- info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
- goto persp_locations;
- }
- /* fall-through */
- case INTERP_MODE_SMOOTH:
- assert(!glsl_base_type_is_integer(base_type));
+ info->input_semantic_name[i] = semantic_name;
+ info->input_semantic_index[i] = semantic_index;
- info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+ if (semantic_name == TGSI_SEMANTIC_PRIMID)
+ info->uses_primid = true;
- persp_locations:
if (variable->data.sample)
- info->uses_persp_sample = true;
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
else if (variable->data.centroid)
- info->uses_persp_centroid = true;
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
else
- info->uses_persp_center = true;
- break;
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
- case INTERP_MODE_NOPERSPECTIVE:
- assert(!glsl_base_type_is_integer(base_type));
+ enum glsl_base_type base_type =
+ glsl_get_base_type(glsl_without_array(variable->type));
- info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+ switch (variable->data.interpolation) {
+ case INTERP_MODE_NONE:
+ if (glsl_base_type_is_integer(base_type)) {
+ info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+ break;
+ }
- if (variable->data.sample)
- info->uses_linear_sample = true;
- else if (variable->data.centroid)
- info->uses_linear_centroid = true;
- else
- info->uses_linear_center = true;
- break;
+ if (semantic_name == TGSI_SEMANTIC_COLOR) {
+ info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
+ break;
+ }
+ /* fall-through */
- case INTERP_MODE_FLAT:
- info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
- break;
- }
+ case INTERP_MODE_SMOOTH:
+ assert(!glsl_base_type_is_integer(base_type));
+
+ info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+ break;
+
+ case INTERP_MODE_NOPERSPECTIVE:
+ assert(!glsl_base_type_is_integer(base_type));
- /* TODO make this more precise */
- if (variable->data.location == VARYING_SLOT_COL0)
- info->colors_read |= 0x0f;
- else if (variable->data.location == VARYING_SLOT_COL1)
- info->colors_read |= 0xf0;
+ info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+ break;
- i++;
+ case INTERP_MODE_FLAT:
+ info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+ break;
+ }
+
+ /* TODO make this more precise */
+ if (variable->data.location == VARYING_SLOT_COL0)
+ info->colors_read |= 0x0f;
+ else if (variable->data.location == VARYING_SLOT_COL1)
+ info->colors_read |= 0xf0;
+ }
}
+ info->num_inputs = num_inputs;
+
+
i = 0;
+ uint64_t processed_outputs = 0;
+ unsigned num_outputs = 0;
nir_foreach_variable(variable, &nir->outputs) {
unsigned semantic_name, semantic_index;
- if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- tgsi_get_gl_frag_result_semantic(variable->data.location,
- &semantic_name, &semantic_index);
- } else {
- tgsi_get_gl_varying_semantic(variable->data.location, true,
- &semantic_name, &semantic_index);
+ i = variable->data.driver_location;
+
+ const struct glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
}
- info->output_semantic_name[i] = semantic_name;
- info->output_semantic_index[i] = semantic_index;
- info->output_usagemask[i] = TGSI_WRITEMASK_XYZW;
+ unsigned attrib_count = glsl_count_attribute_slots(type, false);
+ for (unsigned k = 0; k < attrib_count; k++, i++) {
- switch (semantic_name) {
- case TGSI_SEMANTIC_PRIMID:
- info->writes_primid = true;
- break;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- info->writes_viewport_index = true;
- break;
- case TGSI_SEMANTIC_LAYER:
- info->writes_layer = true;
- break;
- case TGSI_SEMANTIC_PSIZE:
- info->writes_psize = true;
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- info->writes_clipvertex = true;
- break;
- case TGSI_SEMANTIC_COLOR:
- info->colors_written |= 1 << semantic_index;
- break;
- case TGSI_SEMANTIC_STENCIL:
- info->writes_stencil = true;
- break;
- case TGSI_SEMANTIC_SAMPLEMASK:
- info->writes_samplemask = true;
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- info->writes_edgeflag = true;
- break;
- case TGSI_SEMANTIC_POSITION:
- if (info->processor == PIPE_SHADER_FRAGMENT)
- info->writes_z = true;
- else
- info->writes_position = true;
- break;
+ if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ tgsi_get_gl_frag_result_semantic(variable->data.location + k,
+ &semantic_name, &semantic_index);
+
+ /* Adjust for dual source blending */
+ if (variable->data.index > 0) {
+ semantic_index++;
+ }
+ } else {
+ tgsi_get_gl_varying_semantic(variable->data.location + k, true,
+ &semantic_name, &semantic_index);
+ }
+
+ unsigned num_components = 4;
+ unsigned vector_elements = glsl_get_vector_elements(glsl_without_array(variable->type));
+ if (vector_elements)
+ num_components = vector_elements;
+
+ unsigned component = variable->data.location_frac;
+ if (glsl_type_is_64bit(glsl_without_array(variable->type))) {
+ if (glsl_type_is_dual_slot(glsl_without_array(variable->type)) && k % 2) {
+ num_components = (num_components * 2) - 4;
+ component = 0;
+ } else {
+ num_components = MIN2(num_components * 2, 4);
+ }
+ }
+
+ ubyte usagemask = 0;
+ for (unsigned j = component; j < num_components + component; j++) {
+ switch (j) {
+ case 0:
+ usagemask |= TGSI_WRITEMASK_X;
+ break;
+ case 1:
+ usagemask |= TGSI_WRITEMASK_Y;
+ break;
+ case 2:
+ usagemask |= TGSI_WRITEMASK_Z;
+ break;
+ case 3:
+ usagemask |= TGSI_WRITEMASK_W;
+ break;
+ default:
+ unreachable("error calculating component index");
+ }
+ }
+
+ unsigned gs_out_streams;
+ if (variable->data.stream & (1u << 31)) {
+ gs_out_streams = variable->data.stream & ~(1u << 31);
+ } else {
+ assert(variable->data.stream < 4);
+ gs_out_streams = 0;
+ for (unsigned j = 0; j < num_components; ++j)
+ gs_out_streams |= variable->data.stream << (2 * (component + j));
+ }
+
+ unsigned streamx = gs_out_streams & 3;
+ unsigned streamy = (gs_out_streams >> 2) & 3;
+ unsigned streamz = (gs_out_streams >> 4) & 3;
+ unsigned streamw = (gs_out_streams >> 6) & 3;
+
+ if (usagemask & TGSI_WRITEMASK_X) {
+ info->output_usagemask[i] |= TGSI_WRITEMASK_X;
+ info->output_streams[i] |= streamx;
+ info->num_stream_output_components[streamx]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_Y) {
+ info->output_usagemask[i] |= TGSI_WRITEMASK_Y;
+ info->output_streams[i] |= streamy << 2;
+ info->num_stream_output_components[streamy]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_Z) {
+ info->output_usagemask[i] |= TGSI_WRITEMASK_Z;
+ info->output_streams[i] |= streamz << 4;
+ info->num_stream_output_components[streamz]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_W) {
+ info->output_usagemask[i] |= TGSI_WRITEMASK_W;
+ info->output_streams[i] |= streamw << 6;
+ info->num_stream_output_components[streamw]++;
+ }
+
+ /* make sure we only count this location once against
+ * the num_outputs counter.
+ */
+ if (processed_outputs & ((uint64_t)1 << i))
+ continue;
+
+ processed_outputs |= ((uint64_t)1 << i);
+ num_outputs++;
+
+ info->output_semantic_name[i] = semantic_name;
+ info->output_semantic_index[i] = semantic_index;
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_PRIMID:
+ info->writes_primid = true;
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ info->writes_viewport_index = true;
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ info->writes_layer = true;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ info->writes_psize = true;
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ info->writes_clipvertex = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ info->colors_written |= 1 << semantic_index;
+ break;
+ case TGSI_SEMANTIC_STENCIL:
+ info->writes_stencil = true;
+ break;
+ case TGSI_SEMANTIC_SAMPLEMASK:
+ info->writes_samplemask = true;
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ info->writes_edgeflag = true;
+ break;
+ case TGSI_SEMANTIC_POSITION:
+ if (info->processor == PIPE_SHADER_FRAGMENT)
+ info->writes_z = true;
+ else
+ info->writes_position = true;
+ break;
+ }
+
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_PATCH:
+ info->reads_perpatch_outputs = true;
+ break;
+ case TGSI_SEMANTIC_TESSINNER:
+ case TGSI_SEMANTIC_TESSOUTER:
+ info->reads_tessfactor_outputs = true;
+ break;
+ default:
+ info->reads_pervertex_outputs = true;
+ }
+ }
}
- i++;
+ unsigned loc = variable->data.location;
+ if (nir->info.stage == MESA_SHADER_FRAGMENT &&
+ loc == FRAG_RESULT_COLOR &&
+ nir->info.outputs_written & (1ull << loc)) {
+ assert(attrib_count == 1);
+ info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
+ }
}
+ info->num_outputs = num_outputs;
+
+ struct set *ubo_set = _mesa_set_create(NULL, _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+	/* Initialise const_file_max[0] */
+ info->const_file_max[0] = -1;
+
+ unsigned ubo_idx = 1;
nir_foreach_variable(variable, &nir->uniforms) {
const struct glsl_type *type = variable->type;
enum glsl_base_type base_type =
glsl_get_base_type(glsl_without_array(type));
unsigned aoa_size = MAX2(1, glsl_get_aoa_size(type));
+		/* Gather the declared-buffer bitmasks. Note: radeonsi doesn't
+		 * really use the mask (other than ubo_idx == 1 for regular
+		 * uniforms); it's really only used for getting the buffer count,
+		 * so we don't need to worry about the ordering.
+		 */
+ if (variable->interface_type != NULL) {
+ if (variable->data.mode == nir_var_uniform) {
+
+ unsigned block_count;
+ if (base_type != GLSL_TYPE_INTERFACE) {
+ struct set_entry *entry =
+ _mesa_set_search(ubo_set, variable->interface_type);
+
+ /* Check if we have already processed
+ * a member from this ubo.
+ */
+ if (entry)
+ continue;
+
+ block_count = 1;
+ } else {
+ block_count = aoa_size;
+ }
+
+ info->const_buffers_declared |= u_bit_consecutive(ubo_idx, block_count);
+ ubo_idx += block_count;
+
+ _mesa_set_add(ubo_set, variable->interface_type);
+ }
+
+ if (variable->data.mode == nir_var_shader_storage) {
+ /* TODO: make this more accurate */
+ info->shader_buffers_declared =
+ u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
+ }
+
+ continue;
+ }
+
/* We rely on the fact that nir_lower_samplers_as_deref has
* eliminated struct dereferences.
*/
- if (base_type == GLSL_TYPE_SAMPLER)
- info->samplers_declared |=
- u_bit_consecutive(variable->data.binding, aoa_size);
- else if (base_type == GLSL_TYPE_IMAGE)
- info->images_declared |=
- u_bit_consecutive(variable->data.binding, aoa_size);
+ if (base_type == GLSL_TYPE_SAMPLER) {
+ if (variable->data.bindless) {
+ info->const_buffers_declared |= 1;
+ info->const_file_max[0] +=
+ glsl_count_attribute_slots(type, false);
+ } else {
+ info->samplers_declared |=
+ u_bit_consecutive(variable->data.binding, aoa_size);
+ }
+ } else if (base_type == GLSL_TYPE_IMAGE) {
+ if (variable->data.bindless) {
+ info->const_buffers_declared |= 1;
+ info->const_file_max[0] +=
+ glsl_count_attribute_slots(type, false);
+ } else {
+ info->images_declared |=
+ u_bit_consecutive(variable->data.binding, aoa_size);
+ }
+ } else if (base_type != GLSL_TYPE_ATOMIC_UINT) {
+ if (strncmp(variable->name, "state.", 6) == 0 ||
+ strncmp(variable->name, "gl_", 3) == 0) {
+ /* FIXME: figure out why piglit tests with builtin
+ * uniforms are failing without this.
+ */
+ info->const_buffers_declared =
+ u_bit_consecutive(0, SI_NUM_CONST_BUFFERS);
+ } else {
+ info->const_buffers_declared |= 1;
+ info->const_file_max[0] +=
+ glsl_count_attribute_slots(type, false);
+ }
+ }
}
+ _mesa_set_destroy(ubo_set, NULL);
+
info->num_written_clipdistance = nir->info.clip_distance_array_size;
info->num_written_culldistance = nir->info.cull_distance_array_size;
info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
@@ -307,10 +734,6 @@ void si_nir_scan_shader(const struct nir_shader *nir,
if (info->processor == PIPE_SHADER_FRAGMENT)
info->uses_kill = nir->info.fs.uses_discard;
- /* TODO make this more accurate */
- info->const_buffers_declared = u_bit_consecutive(0, SI_NUM_CONST_BUFFERS);
- info->shader_buffers_declared = u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
-
func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
nir_foreach_block(block, func->impl) {
nir_foreach_instr(instr, block)
@@ -350,10 +773,6 @@ si_lower_nir(struct si_shader_selector* sel)
* - ensure constant offsets for texture instructions are folded
* and copy-propagated
*/
- NIR_PASS_V(sel->nir, nir_lower_io, nir_var_uniform, type_size,
- (nir_lower_io_options)0);
- NIR_PASS_V(sel->nir, nir_lower_uniforms_to_ubo);
-
NIR_PASS_V(sel->nir, nir_lower_returns);
NIR_PASS_V(sel->nir, nir_lower_vars_to_ssa);
NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar);
@@ -364,6 +783,20 @@ si_lower_nir(struct si_shader_selector* sel)
};
NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options);
+ const nir_lower_subgroups_options subgroups_options = {
+ .subgroup_size = 64,
+ .ballot_bit_size = 64,
+ .lower_to_scalar = true,
+ .lower_subgroup_masks = true,
+ .lower_vote_trivial = false,
+ .lower_vote_eq_to_ballot = true,
+ };
+ NIR_PASS_V(sel->nir, nir_lower_subgroups, &subgroups_options);
+
+ ac_lower_indirect_derefs(sel->nir, sel->screen->info.chip_class);
+
+ NIR_PASS_V(sel->nir, nir_lower_load_const_to_scalar);
+
bool progress;
do {
progress = false;
@@ -395,21 +828,19 @@ si_lower_nir(struct si_shader_selector* sel)
}
static void declare_nir_input_vs(struct si_shader_context *ctx,
- struct nir_variable *variable, unsigned rel,
+ struct nir_variable *variable,
+ unsigned input_index,
LLVMValueRef out[4])
{
- si_llvm_load_input_vs(ctx, variable->data.driver_location / 4 + rel, out);
+ si_llvm_load_input_vs(ctx, input_index, out);
}
static void declare_nir_input_fs(struct si_shader_context *ctx,
- struct nir_variable *variable, unsigned rel,
- unsigned *fs_attr_idx,
+ struct nir_variable *variable,
+ unsigned input_index,
LLVMValueRef out[4])
{
- unsigned slot = variable->data.location + rel;
-
- assert(variable->data.location >= VARYING_SLOT_VAR0 || rel == 0);
-
+ unsigned slot = variable->data.location;
if (slot == VARYING_SLOT_POS) {
out[0] = LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT);
out[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT);
@@ -419,8 +850,43 @@ static void declare_nir_input_fs(struct si_shader_context *ctx,
return;
}
- si_llvm_load_input_fs(ctx, *fs_attr_idx, out);
- (*fs_attr_idx)++;
+ si_llvm_load_input_fs(ctx, input_index, out);
+}
+
+LLVMValueRef
+si_nir_lookup_interp_param(struct ac_shader_abi *abi,
+ enum glsl_interp_mode interp, unsigned location)
+{
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ int interp_param_idx = -1;
+
+ switch (interp) {
+ case INTERP_MODE_FLAT:
+ return NULL;
+ case INTERP_MODE_SMOOTH:
+ case INTERP_MODE_NONE:
+ if (location == INTERP_CENTER)
+ interp_param_idx = SI_PARAM_PERSP_CENTER;
+ else if (location == INTERP_CENTROID)
+ interp_param_idx = SI_PARAM_PERSP_CENTROID;
+ else if (location == INTERP_SAMPLE)
+ interp_param_idx = SI_PARAM_PERSP_SAMPLE;
+ break;
+ case INTERP_MODE_NOPERSPECTIVE:
+ if (location == INTERP_CENTER)
+ interp_param_idx = SI_PARAM_LINEAR_CENTER;
+ else if (location == INTERP_CENTROID)
+ interp_param_idx = SI_PARAM_LINEAR_CENTROID;
+ else if (location == INTERP_SAMPLE)
+ interp_param_idx = SI_PARAM_LINEAR_SAMPLE;
+ break;
+ default:
+ assert(!"Unhandled interpolation mode.");
+ return NULL;
+ }
+
+ return interp_param_idx != -1 ?
+ LLVMGetParam(ctx->main_fn, interp_param_idx) : NULL;
}
static LLVMValueRef
@@ -428,69 +894,128 @@ si_nir_load_sampler_desc(struct ac_shader_abi *abi,
unsigned descriptor_set, unsigned base_index,
unsigned constant_index, LLVMValueRef dynamic_index,
enum ac_descriptor_type desc_type, bool image,
- bool write)
+ bool write, bool bindless)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ const struct tgsi_shader_info *info = &ctx->shader->selector->info;
LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
- LLVMValueRef index = dynamic_index;
+ unsigned const_index = base_index + constant_index;
+ bool dcc_off = write;
+
+ /* TODO: images_store and images_atomic are not set */
+ if (!dynamic_index && image &&
+ (info->images_store | info->images_atomic) & (1 << const_index))
+ dcc_off = true;
assert(!descriptor_set);
+ assert(!image || desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER);
- if (!index)
- index = ctx->ac.i32_0;
+ if (bindless) {
+ LLVMValueRef list =
+ LLVMGetParam(ctx->main_fn, ctx->param_bindless_samplers_and_images);
- index = LLVMBuildAdd(builder, index,
- LLVMConstInt(ctx->ac.i32, base_index + constant_index, false),
- "");
+ /* dynamic_index is the bindless handle */
+ if (image) {
+ return si_load_image_desc(ctx, list, dynamic_index, desc_type,
+ dcc_off, true);
+ }
- if (image) {
- assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER);
- assert(base_index + constant_index < ctx->num_images);
+ /* Since bindless handle arithmetic can contain an unsigned integer
+ * wraparound and si_load_sampler_desc assumes there isn't any,
+ * use GEP without "inbounds" (inside ac_build_pointer_add)
+ * to prevent incorrect code generation and hangs.
+ */
+ dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
+ LLVMConstInt(ctx->i32, 2, 0), "");
+ list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
+ return si_load_sampler_desc(ctx, list, ctx->i32_0, desc_type);
+ }
+
+ unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
+ assert(const_index < num_slots);
- if (dynamic_index)
- index = si_llvm_bound_index(ctx, index, ctx->num_images);
+ LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
+ LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
+
+ if (dynamic_index) {
+ index = LLVMBuildAdd(builder, index, dynamic_index, "");
+
+ /* From the GL_ARB_shader_image_load_store extension spec:
+ *
+ * If a shader performs an image load, store, or atomic
+ * operation using an image variable declared as an array,
+ * and if the index used to select an individual element is
+ * negative or greater than or equal to the size of the
+ * array, the results of the operation are undefined but may
+ * not lead to termination.
+ */
+ index = si_llvm_bound_index(ctx, index, num_slots);
+ }
- index = LLVMBuildSub(ctx->gallivm.builder,
+ if (image) {
+ index = LLVMBuildSub(ctx->ac.builder,
LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
index, "");
-
- /* TODO: be smarter about when we use dcc_off */
- return si_load_image_desc(ctx, list, index, desc_type, write);
+ return si_load_image_desc(ctx, list, index, desc_type, dcc_off, false);
}
- assert(base_index + constant_index < ctx->num_samplers);
-
- if (dynamic_index)
- index = si_llvm_bound_index(ctx, index, ctx->num_samplers);
-
- index = LLVMBuildAdd(ctx->gallivm.builder, index,
+ index = LLVMBuildAdd(ctx->ac.builder, index,
LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
-
return si_load_sampler_desc(ctx, list, index, desc_type);
}
+static void bitcast_inputs(struct si_shader_context *ctx,
+ LLVMValueRef data[4],
+ unsigned input_idx)
+{
+ for (unsigned chan = 0; chan < 4; chan++) {
+ ctx->inputs[input_idx + chan] =
+ LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, "");
+ }
+}
+
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
{
struct tgsi_shader_info *info = &ctx->shader->selector->info;
- unsigned fs_attr_idx = 0;
- nir_foreach_variable(variable, &nir->inputs) {
- unsigned attrib_count = glsl_count_attribute_slots(variable->type,
- nir->info.stage == MESA_SHADER_VERTEX);
- unsigned input_idx = variable->data.driver_location;
+ if (nir->info.stage == MESA_SHADER_VERTEX ||
+ nir->info.stage == MESA_SHADER_FRAGMENT) {
+ uint64_t processed_inputs = 0;
+ nir_foreach_variable(variable, &nir->inputs) {
+ unsigned attrib_count = glsl_count_attribute_slots(variable->type,
+ nir->info.stage == MESA_SHADER_VERTEX);
+ unsigned input_idx = variable->data.driver_location;
- for (unsigned i = 0; i < attrib_count; ++i) {
LLVMValueRef data[4];
-
- if (nir->info.stage == MESA_SHADER_VERTEX)
- declare_nir_input_vs(ctx, variable, i, data);
- else if (nir->info.stage == MESA_SHADER_FRAGMENT)
- declare_nir_input_fs(ctx, variable, i, &fs_attr_idx, data);
-
- for (unsigned chan = 0; chan < 4; chan++) {
- ctx->inputs[input_idx + chan] =
- LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, "");
+ unsigned loc = variable->data.location;
+
+ if (loc >= VARYING_SLOT_VAR0 && nir->info.stage == MESA_SHADER_FRAGMENT)
+ ctx->abi.fs_input_attr_indices[loc - VARYING_SLOT_VAR0] = input_idx / 4;
+
+ for (unsigned i = 0; i < attrib_count; i++) {
+ /* Packed components share the same location so skip
+ * them if we have already processed the location.
+ */
+ if (processed_inputs & ((uint64_t)1 << (loc + i))) {
+ input_idx += 4;
+ continue;
+ }
+
+ if (nir->info.stage == MESA_SHADER_VERTEX) {
+ declare_nir_input_vs(ctx, variable, input_idx / 4, data);
+ bitcast_inputs(ctx, data, input_idx);
+ if (glsl_type_is_dual_slot(variable->type)) {
+ input_idx += 4;
+ declare_nir_input_vs(ctx, variable, input_idx / 4, data);
+ bitcast_inputs(ctx, data, input_idx);
+ }
+ } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ declare_nir_input_fs(ctx, variable, input_idx / 4, data);
+ bitcast_inputs(ctx, data, input_idx);
+ }
+
+ processed_inputs |= ((uint64_t)1 << (loc + i));
+ input_idx += 4;
}
}
}
@@ -502,7 +1027,11 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
ctx->num_samplers = util_last_bit(info->samplers_declared);
ctx->num_images = util_last_bit(info->images_declared);
- ac_nir_translate(&ctx->ac, &ctx->abi, nir, NULL);
+ if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
+ assert(nir->info.stage == MESA_SHADER_COMPUTE);
+ si_declare_compute_memory(ctx);
+ }
+ ac_nir_translate(&ctx->ac, &ctx->abi, nir);
return true;
}
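
The input and output scans above share one bookkeeping trick: a uint64_t bitmask keyed on the driver_location slot, so components packed into the same location are only counted once toward num_inputs/num_outputs. A minimal standalone sketch of that pattern (the function name is illustrative, not from the driver):

#include <stdbool.h>
#include <stdint.h>

/* Returns true only the first time a slot is seen; this is what the
 * "processed_inputs & ((uint64_t)1 << i)" checks above do before
 * counting a slot.
 */
static bool mark_slot_once(uint64_t *processed, unsigned slot)
{
	uint64_t bit = (uint64_t)1 << slot;	/* slot < 64, like the TGSI I/O slots */

	if (*processed & bit)
		return false;		/* already counted, skip */

	*processed |= bit;
	return true;			/* first visit, count it */
}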
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
new file mode 100644
index 000000000..da55c81dd
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "tgsi/tgsi_text.h"
+#include "tgsi/tgsi_ureg.h"
+
+void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
+ unsigned num_layers)
+{
+ unsigned vs_blit_property;
+ void **vs;
+
+ switch (type) {
+ case UTIL_BLITTER_ATTRIB_NONE:
+ vs = num_layers > 1 ? &sctx->vs_blit_pos_layered :
+ &sctx->vs_blit_pos;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS;
+ break;
+ case UTIL_BLITTER_ATTRIB_COLOR:
+ vs = num_layers > 1 ? &sctx->vs_blit_color_layered :
+ &sctx->vs_blit_color;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
+ break;
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+ assert(num_layers == 1);
+ vs = &sctx->vs_blit_texcoord;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
+ break;
+ default:
+ assert(0);
+ return NULL;
+ }
+ if (*vs)
+ return *vs;
+
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
+ if (!ureg)
+ return NULL;
+
+ /* Tell the shader to load VS inputs from SGPRs: */
+ ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS, vs_blit_property);
+ ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
+
+ /* This is just a pass-through shader with 1-3 MOV instructions. */
+ ureg_MOV(ureg,
+ ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0),
+ ureg_DECL_vs_input(ureg, 0));
+
+ if (type != UTIL_BLITTER_ATTRIB_NONE) {
+ ureg_MOV(ureg,
+ ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0),
+ ureg_DECL_vs_input(ureg, 1));
+ }
+
+ if (num_layers > 1) {
+ struct ureg_src instance_id =
+ ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
+ struct ureg_dst layer =
+ ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+
+ ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
+ ureg_scalar(instance_id, TGSI_SWIZZLE_X));
+ }
+ ureg_END(ureg);
+
+ *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
+ return *vs;
+}
+
+/**
+ * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
+ * VS passes its outputs to TES directly, so the fixed-function shader only
+ * has to write TESSOUTER and TESSINNER.
+ */
+void *si_create_fixed_func_tcs(struct si_context *sctx)
+{
+ struct ureg_src outer, inner;
+ struct ureg_dst tessouter, tessinner;
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
+
+ if (!ureg)
+ return NULL;
+
+ outer = ureg_DECL_system_value(ureg,
+ TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI, 0);
+ inner = ureg_DECL_system_value(ureg,
+ TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, 0);
+
+ tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+ tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+
+ ureg_MOV(ureg, tessouter, outer);
+ ureg_MOV(ureg, tessinner, inner);
+ ureg_END(ureg);
+
+ return ureg_create_shader_and_destroy(ureg, &sctx->b);
+}
+
+/* Create a compute shader implementing clear_buffer or copy_buffer. */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+ unsigned num_dwords_per_thread,
+ bool dst_stream_cache_policy, bool is_copy)
+{
+ assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+ unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+ if (dst_stream_cache_policy)
+ store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+ /* Don't cache loads, because there is no reuse. */
+ unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+ unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+ unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+ for (unsigned i = 0; i < num_mem_ops; i++) {
+ if (i*4 < num_dwords_per_thread)
+ inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
+ }
+
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+ if (!ureg)
+ return NULL;
+
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+ struct ureg_src value;
+ if (!is_copy) {
+ ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]);
+ value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0);
+ }
+
+ struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+ struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+ struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+ struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+ struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+ struct ureg_src srcbuf;
+ struct ureg_src *values = NULL;
+
+ if (is_copy) {
+ srcbuf = ureg_DECL_buffer(ureg, 1, false);
+ values = malloc(num_mem_ops * sizeof(struct ureg_src));
+ }
+
+ /* If there are multiple stores, the first store writes into 0+tid,
+ * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+ */
+ ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+ /* Convert from a "store size unit" into bytes. */
+ ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
+ ureg_imm1u(ureg, 4 * inst_dwords[0]));
+ ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+ /* Distance between a load and a store for latency hiding. */
+ unsigned load_store_distance = is_copy ? 8 : 0;
+
+ for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+ int d = i - load_store_distance;
+
+ if (is_copy && i < num_mem_ops) {
+ if (i) {
+ ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+ ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+ }
+
+ values[i] = ureg_src(ureg_DECL_temporary(ureg));
+ struct ureg_dst dst =
+ ureg_writemask(ureg_dst(values[i]),
+ u_bit_consecutive(0, inst_dwords[i]));
+ struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+ ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
+ load_qualifier, TGSI_TEXTURE_BUFFER, 0);
+ }
+
+ if (d >= 0) {
+ if (d) {
+ ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+ ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+ }
+
+ struct ureg_dst dst =
+ ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+ struct ureg_src srcs[] =
+ {ureg_src(store_addr), is_copy ? values[d] : value};
+ ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
+ store_qualifier, TGSI_TEXTURE_BUFFER, 0);
+ }
+ }
+ ureg_END(ureg);
+
+ struct pipe_compute_state state = {};
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = ureg_get_tokens(ureg, NULL);
+
+ void *cs = ctx->create_compute_state(ctx, &state);
+ ureg_destroy(ureg);
+ free(values);
+ return cs;
+}
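
The address arithmetic in the shader built above is easier to follow with concrete numbers. The program below is only a worked example in host C (not driver code); it replays the UMAD/UMUL/UADD sequence for num_dwords_per_thread = 16, i.e. num_mem_ops = 4 and inst_dwords[i] = 4:

#include <stdio.h>

int main(void)
{
	const unsigned num_mem_ops = 4, inst_dwords = 4;	/* 16 dwords per thread */
	const unsigned blk = 1, tid = 5;			/* an example thread */

	/* UMAD: store_addr = blk * (64 * num_mem_ops) + tid, in "store size units" */
	unsigned addr = blk * 64 * num_mem_ops + tid;		/* = 261 */
	/* UMUL: convert to bytes */
	addr *= 4 * inst_dwords;				/* = 4176 */

	for (unsigned d = 0; d < num_mem_ops; d++) {
		if (d)
			addr += 4 * inst_dwords * 64;		/* UADD between stores */
		printf("store %u writes bytes %u..%u\n", d, addr, addr + 4 * inst_dwords - 1);
	}
	return 0;
}

Each of the 64 threads in a group thus writes 16 bytes per store, and consecutive stores of the same thread land 1 KiB apart, which is exactly the 4 * inst_dwords[d] * 64 increment in the shader.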
+
+/* Create the compute shader that is used to collect the results.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * CONST
+ * 0.x = end_offset
+ * 0.y = result_stride
+ * 0.z = result_count
+ * 0.w = bit field:
+ * 1: read previously accumulated values
+ * 2: write accumulated values for chaining
+ * 4: write result available
+ * 8: convert result to boolean (0/1)
+ * 16: only read one dword and use that as result
+ * 32: apply timestamp conversion
+ * 64: store full 64 bits result
+ * 128: store signed 32 bits result
+ * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
+ * 1.x = fence_offset
+ * 1.y = pair_stride
+ * 1.z = pair_count
+ *
+ * BUFFER[0] = query result buffer
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ */
+void *si_create_query_result_cs(struct si_context *sctx)
+{
+ /* TEMP[0].xy = accumulated result so far
+ * TEMP[0].z = result not available
+ *
+ * TEMP[1].x = current result index
+ * TEMP[1].y = current pair index
+ */
+ static const char text_tmpl[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0][0..1]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+ "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+ "IMM[4] UINT32 {256, 0, 0, 0}\n"
+
+ "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
+ "UIF TEMP[5]\n"
+ /* Check result availability. */
+ "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
+ "MOV TEMP[1], TEMP[0].zzzz\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+
+ /* Load result if available. */
+ "UIF TEMP[1]\n"
+ "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Load previously accumulated result if requested. */
+ "MOV TEMP[0], IMM[0].xxxx\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
+ "UIF TEMP[4]\n"
+ "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].x, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Break if accumulated result so far is not available. */
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Break if result_index >= result_count. */
+ "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Load fence and check result availability */
+ "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Load start and end. */
+ "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
+ "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+ "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+ "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
+
+ "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
+ "UIF TEMP[5].zzzz\n"
+ /* Load second start/end half-pair and
+ * take the difference
+ */
+ "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+ "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+ "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
+ "ENDIF\n"
+
+ "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
+
+ /* Increment pair index */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
+ "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "ENDLOOP\n"
+
+ /* Increment result index */
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
+ "UIF TEMP[4]\n"
+ /* Store accumulated data for chaining. */
+ "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Store result availability. */
+ "NOT TEMP[0].z, TEMP[0]\n"
+ "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Store result if it is available. */
+ "NOT TEMP[4], TEMP[0].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Apply timestamp conversion */
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
+ "UIF TEMP[4]\n"
+ "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
+ "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
+ "ENDIF\n"
+
+ /* Convert to boolean */
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
+ "UIF TEMP[4]\n"
+ "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
+ "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+ "ELSE\n"
+ /* Clamping */
+ "UIF TEMP[0].yyyy\n"
+ "MOV TEMP[0].x, IMM[0].wwww\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
+ "UIF TEMP[4]\n"
+ "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
+ "ENDIF\n"
+
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ char text[sizeof(text_tmpl) + 32];
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ /* Hard code the frequency into the shader so that the backend can
+ * use the full range of optimizations for divide-by-constant.
+ */
+ snprintf(text, sizeof(text), text_tmpl,
+ sctx->screen->info.clock_crystal_freq);
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return sctx->b.create_compute_state(&sctx->b, &state);
+}
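
For anyone wiring up a caller, the CONST[0][0].w bit field documented above can be mirrored as C flags. The enum below is hypothetical (the driver's own names for these bits are not part of this patch); only the bit values are taken from the comment:

enum query_result_cs_flags {
	QRCS_READ_PREVIOUS      = 1u << 0, /* read previously accumulated values */
	QRCS_WRITE_ACCUMULATED  = 1u << 1, /* write accumulated values for chaining */
	QRCS_WRITE_AVAILABILITY = 1u << 2, /* write result available */
	QRCS_RESULT_TO_BOOLEAN  = 1u << 3, /* convert result to boolean (0/1) */
	QRCS_READ_ONE_DWORD     = 1u << 4, /* only read one dword and use it as result */
	QRCS_TIMESTAMP_CONVERT  = 1u << 5, /* apply timestamp conversion */
	QRCS_RESULT_64BIT       = 1u << 6, /* store full 64 bits result */
	QRCS_RESULT_SIGNED32    = 1u << 7, /* store signed 32 bits result */
	QRCS_SO_OVERFLOW        = 1u << 8, /* difference of two successive half-pairs */
};

A chained, non-final accumulation pass would set QRCS_READ_PREVIOUS | QRCS_WRITE_ACCUMULATED, for example, while the final pass drops the chaining bit and picks one of the result-format bits instead.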
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
index 8d98d6d0d..3516e5612 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -1,5 +1,6 @@
/*
* Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,10 +24,8 @@
/* This file handles register programming of primitive binning. */
-#include "si_pipe.h"
-#include "sid.h"
+#include "si_build_pm4.h"
#include "gfx9d.h"
-#include "radeon/r600_cs.h"
struct uvec2 {
unsigned x, y;
@@ -38,7 +37,7 @@ struct si_bin_size_map {
unsigned bin_size_y;
};
-typedef struct si_bin_size_map si_bin_size_subtable[3][9];
+typedef struct si_bin_size_map si_bin_size_subtable[3][10];
/* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
@@ -46,16 +45,16 @@ static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
unsigned sum)
{
unsigned log_num_rb_per_se =
- util_logbase2_ceil(sscreen->b.info.num_render_backends /
- sscreen->b.info.max_se);
- unsigned log_num_se = util_logbase2_ceil(sscreen->b.info.max_se);
+ util_logbase2_ceil(sscreen->info.num_render_backends /
+ sscreen->info.max_se);
+ unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
unsigned i;
/* Get the chip-specific subtable. */
const struct si_bin_size_map *subtable =
&table[log_num_rb_per_se][log_num_se][0];
- for (i = 0; subtable[i].start != UINT_MAX; i++) {
+ for (i = 0; subtable[i].bin_size_x != 0; i++) {
if (sum >= subtable[i].start && sum < subtable[i + 1].start)
break;
}
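
To make the new lookup concrete: take the patched color subtable that appears in full a few hunks below, { 0, 128, 128 }, { 2, 64, 128 }, { 3, 32, 128 }, { 9, 16, 128 }, { 33, 0, 0 }. A single single-sampled RGBA8 colorbuffer gives sum = bpe = 4, which satisfies 3 <= 4 < 9, so the chosen bin size is 32x128; any sum >= 33 falls through to the { 33, 0, 0 } terminator and yields a 0x0 bin size, i.e. binning is effectively disabled for that load. The loop now stops at the first row whose bin_size_x is 0, which is why the UINT_MAX sentinel rows are removed throughout the tables below.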
@@ -67,7 +66,7 @@ static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
unsigned cb_target_enabled_4bit)
{
- unsigned nr_samples = sctx->framebuffer.nr_samples;
+ unsigned num_fragments = sctx->framebuffer.nr_color_samples;
unsigned sum = 0;
/* Compute the sum of all Bpp. */
@@ -75,15 +74,15 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
continue;
- struct r600_texture *rtex =
- (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
- sum += rtex->surface.bpe;
+ struct si_texture *tex =
+ (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+ sum += tex->surface.bpe;
}
/* Multiply the sum by some function of the number of samples. */
- if (nr_samples >= 2) {
- if (sctx->ps_iter_samples >= 2)
- sum *= nr_samples;
+ if (num_fragments >= 2) {
+ if (si_get_ps_iter_samples(sctx) >= 2)
+ sum *= num_fragments;
else
sum *= 2;
}
@@ -98,7 +97,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 2, 32, 128 },
{ 3, 16, 128 },
{ 17, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
/* Two shader engines */
@@ -107,7 +105,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 3, 32, 128 },
{ 5, 16, 128 },
{ 17, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
/* Four shader engines */
@@ -115,7 +112,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 3, 64, 128 },
{ 5, 16, 128 },
{ 17, 0, 0 },
- { UINT_MAX, 0, 0 },
},
},
{
@@ -125,9 +121,8 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 0, 128, 128 },
{ 2, 64, 128 },
{ 3, 32, 128 },
- { 5, 16, 128 },
+ { 9, 16, 128 },
{ 33, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
/* Two shader engines */
@@ -136,7 +131,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 5, 32, 128 },
{ 9, 16, 128 },
{ 33, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
/* Four shader engines */
@@ -146,7 +140,6 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 5, 64, 128 },
{ 9, 16, 128 },
{ 33, 0, 0 },
- { UINT_MAX, 0, 0 },
},
},
{
@@ -158,8 +151,7 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 3, 64, 128 },
{ 5, 32, 128 },
{ 9, 16, 128 },
- { 33, 0, 0 },
- { UINT_MAX, 0, 0 },
+ { 17, 0, 0 },
},
{
/* Two shader engines */
@@ -170,18 +162,16 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
{ 9, 32, 128 },
{ 17, 16, 128 },
{ 33, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
/* Four shader engines */
{ 0, 256, 512 },
- { 2, 256, 256 },
- { 3, 128, 256 },
- { 5, 128, 128 },
- { 9, 64, 128 },
- { 17, 16, 128 },
+ { 2, 128, 512 },
+ { 3, 64, 512 },
+ { 5, 32, 512 },
+ { 9, 32, 256 },
+ { 17, 32, 128 },
{ 33, 0, 0 },
- { UINT_MAX, 0, 0 },
},
},
};
@@ -200,86 +190,80 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
return size;
}
- struct r600_texture *rtex =
- (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+ struct si_texture *tex =
+ (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
- unsigned stencil_coeff = rtex->surface.has_stencil &&
+ unsigned stencil_coeff = tex->surface.has_stencil &&
dsa->stencil_enabled ? 1 : 0;
unsigned sum = 4 * (depth_coeff + stencil_coeff) *
- sctx->framebuffer.nr_samples;
+ tex->buffer.b.b.nr_samples;
static const si_bin_size_subtable table[] = {
{
// One RB / SE
{
// One shader engine
- { 0, 128, 256 },
- { 2, 128, 128 },
+ { 0, 64, 512 },
+ { 2, 64, 256 },
{ 4, 64, 128 },
{ 7, 32, 128 },
{ 13, 16, 128 },
{ 49, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
// Two shader engines
- { 0, 256, 256 },
- { 2, 128, 256 },
- { 4, 128, 128 },
+ { 0, 128, 512 },
+ { 2, 64, 512 },
+ { 4, 64, 256 },
{ 7, 64, 128 },
{ 13, 32, 128 },
{ 25, 16, 128 },
{ 49, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
// Four shader engines
{ 0, 256, 512 },
- { 2, 256, 256 },
- { 4, 128, 256 },
- { 7, 128, 128 },
+ { 2, 128, 512 },
+ { 4, 64, 512 },
+ { 7, 64, 256 },
{ 13, 64, 128 },
{ 25, 16, 128 },
{ 49, 0, 0 },
- { UINT_MAX, 0, 0 },
},
},
{
// Two RB / SE
{
// One shader engine
- { 0, 256, 256 },
- { 2, 128, 256 },
- { 4, 128, 128 },
+ { 0, 128, 512 },
+ { 2, 64, 512 },
+ { 4, 64, 256 },
{ 7, 64, 128 },
{ 13, 32, 128 },
{ 25, 16, 128 },
{ 97, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
// Two shader engines
{ 0, 256, 512 },
- { 2, 256, 256 },
- { 4, 128, 256 },
- { 7, 128, 128 },
+ { 2, 128, 512 },
+ { 4, 64, 512 },
+ { 7, 64, 256 },
{ 13, 64, 128 },
{ 25, 32, 128 },
{ 49, 16, 128 },
{ 97, 0, 0 },
- { UINT_MAX, 0, 0 },
},
{
// Four shader engines
{ 0, 512, 512 },
{ 2, 256, 512 },
- { 4, 256, 256 },
- { 7, 128, 256 },
- { 13, 128, 128 },
+ { 4, 128, 512 },
+ { 7, 64, 512 },
+ { 13, 64, 256 },
{ 25, 64, 128 },
{ 49, 16, 128 },
{ 97, 0, 0 },
- { UINT_MAX, 0, 0 },
},
},
{
@@ -287,36 +271,36 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
{
// One shader engine
{ 0, 256, 512 },
- { 2, 256, 256 },
- { 4, 128, 256 },
- { 7, 128, 128 },
+ { 2, 128, 512 },
+ { 4, 64, 512 },
+ { 7, 64, 256 },
{ 13, 64, 128 },
{ 25, 32, 128 },
{ 49, 16, 128 },
- { UINT_MAX, 0, 0 },
+ { 193, 0, 0 },
},
{
// Two shader engines
{ 0, 512, 512 },
{ 2, 256, 512 },
- { 4, 256, 256 },
- { 7, 128, 256 },
- { 13, 128, 128 },
+ { 4, 128, 512 },
+ { 7, 64, 512 },
+ { 13, 64, 256 },
{ 25, 64, 128 },
{ 49, 32, 128 },
{ 97, 16, 128 },
- { UINT_MAX, 0, 0 },
+ { 193, 0, 0 },
},
{
// Four shader engines
{ 0, 512, 512 },
{ 4, 256, 512 },
- { 7, 256, 256 },
- { 13, 128, 256 },
- { 25, 128, 128 },
- { 49, 64, 128 },
+ { 7, 128, 512 },
+ { 13, 64, 512 },
+ { 25, 32, 512 },
+ { 49, 32, 256 },
{ 97, 16, 128 },
- { UINT_MAX, 0, 0 },
+ { 193, 0, 0 },
},
},
};
@@ -326,25 +310,30 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
static void si_emit_dpbb_disable(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-
- radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
- S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
- S_028C44_DISABLE_START_OF_PRIM(1));
- radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
- S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+ SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+ S_028C44_DISABLE_START_OF_PRIM(1));
+ radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+ SI_TRACKED_DB_DFSM_CONTROL,
+ S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
+ S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll_counter++;
}
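For context, radeon_opt_set_context_reg() only writes the register when the value differs from the last one recorded for the SI_TRACKED_* slot, which is why comparing the command-stream write pointer (cdw) before and after the calls tells whether a context roll actually happened. A conceptual sketch of that shadowing, with the bookkeeping field names assumed rather than taken from the real header:

	/* Conceptual sketch only; the real helper lives in si_build_pm4.h and
	 * the tracked-register field names below are assumptions. */
	static inline void opt_set_context_reg_sketch(struct si_context *sctx,
						      unsigned reg_offset,
						      unsigned tracked_id,
						      uint32_t value)
	{
		if (!(sctx->tracked_regs.reg_saved & (1ull << tracked_id)) ||
		    sctx->tracked_regs.reg_value[tracked_id] != value) {
			radeon_set_context_reg(sctx->gfx_cs, reg_offset, value);
			sctx->tracked_regs.reg_value[tracked_id] = value;
			sctx->tracked_regs.reg_saved |= 1ull << tracked_id;
			/* Every emitted SET_CONTEXT_REG advances gfx_cs->current.cdw,
			 * which is what the cdw comparison above detects. */
		}
	}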
-void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
+void si_emit_dpbb_state(struct si_context *sctx)
{
struct si_screen *sscreen = sctx->screen;
struct si_state_blend *blend = sctx->queued.named.blend;
struct si_state_dsa *dsa = sctx->queued.named.dsa;
unsigned db_shader_control = sctx->ps_db_shader_control;
- assert(sctx->b.chip_class >= GFX9);
+ assert(sctx->chip_class >= GFX9);
- if (!sscreen->dpbb_allowed || !blend || !dsa) {
+ if (!sscreen->dpbb_allowed || !blend || !dsa || sctx->dpbb_force_off) {
si_emit_dpbb_disable(sctx);
return;
}
@@ -354,18 +343,14 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
blend->alpha_to_coverage;
- /* This is ported from Vulkan, but it doesn't make much sense to me.
- * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
- */
- bool ps_can_reject_z_trivially =
+ bool db_can_reject_z_trivially =
!G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
- G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
+ G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
+ G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
- /* Disable binning if PS can kill trivially with DB writes.
- * Ported from Vulkan. (heuristic?)
- */
+ /* Disable DPBB when it's believed to be inefficient. */
if (ps_can_kill &&
- ps_can_reject_z_trivially &&
+ db_can_reject_z_trivially &&
sctx->framebuffer.state.zsbuf &&
dsa->db_can_write) {
si_emit_dpbb_disable(sctx);
@@ -394,8 +379,13 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
/* Enable DFSM if it's preferred. */
unsigned punchout_mode = V_028060_FORCE_OFF;
bool disable_start_of_prim = true;
+ bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 &&
+ sctx->framebuffer.state.zsbuf &&
+ sctx->framebuffer.nr_samples !=
+ MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
if (sscreen->dfsm_allowed &&
+ !zs_eqaa_dfsm_bug &&
cb_target_enabled_4bit &&
!G_02880C_KILL_ENABLE(db_shader_control) &&
/* These two also imply that DFSM is disabled when PS writes to memory. */
@@ -412,9 +402,12 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
unsigned persistent_states_per_bin; /* allowed range: [0, 31] */
unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
- switch (sctx->b.family) {
+ switch (sctx->family) {
case CHIP_VEGA10:
+ case CHIP_VEGA12:
+ case CHIP_VEGA20:
case CHIP_RAVEN:
+ case CHIP_RAVEN2:
/* Tuned for Raven. Vega might need different values. */
context_states_per_bin = 5;
persistent_states_per_bin = 31;
@@ -431,18 +424,24 @@ void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
if (bin_size.y >= 32)
bin_size_extend.y = util_logbase2(bin_size.y) - 5;
- struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
- radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
- S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
- S_028C44_BIN_SIZE_X(bin_size.x == 16) |
- S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
- S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
- S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
- S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
- S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
- S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
- S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
- S_028C44_OPTIMAL_BIN_SELECTION(1));
- radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
- S_028060_PUNCHOUT_MODE(punchout_mode));
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ radeon_opt_set_context_reg(
+ sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+ SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
+ S_028C44_BIN_SIZE_X(bin_size.x == 16) |
+ S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
+ S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+ S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
+ S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
+ S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
+ S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
+ S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
+ S_028C44_OPTIMAL_BIN_SELECTION(1));
+ radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+ SI_TRACKED_DB_DFSM_CONTROL,
+ S_028060_PUNCHOUT_MODE(punchout_mode) |
+ S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll_counter++;
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
index 133f1e4aa..e6d97fe67 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_msaa.c
@@ -1,5 +1,6 @@
/*
* Copyright 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -19,173 +20,178 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * Authors: Marek Olšák <maraeo@gmail.com>
- *
*/
-#include "si_pipe.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
- (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \
- (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \
- (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
+ ((((unsigned)(s0x) & 0xf) << 0) | (((unsigned)(s0y) & 0xf) << 4) | \
+ (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \
+ (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
(((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
-/* 2xMSAA
- * There are two locations (4, 4), (-4, -4). */
-static const uint32_t sample_locs_2x[4] = {
- FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
- FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
- FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
- FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-};
-/* 4xMSAA
- * There are 4 locations: (-2, -6), (6, -2), (-6, 2), (2, 6). */
-static const uint32_t sample_locs_4x[4] = {
- FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
- FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
- FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
- FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-};
+/* For obtaining location coordinates from registers */
+#define SEXT4(x) ((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0)))
+#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf)
+#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
+#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
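FILL_SREG packs eight signed 4-bit coordinates (range -8..7, in 1/16-pixel units) into one dword, and SEXT4/GET_SX/GET_SY reverse that. A small round-trip example built only from the macros above, shown for illustration:

	#include <assert.h>
	#include <stdint.h>

	/* Pack sample 0 = (-4, -4) and sample 1 = (4, 4); S2/S3 are unused here. */
	static void sample_loc_roundtrip_example(void)
	{
		const uint32_t locs[1] = { FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0) };

		/* -4 is stored as the nibble 0xc; SEXT4() sign-extends it back. */
		assert(GET_SX(locs, 0) == -4 && GET_SY(locs, 0) == -4);
		assert(GET_SX(locs, 1) ==  4 && GET_SY(locs, 1) ==  4);
	}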
-/* Cayman 8xMSAA */
+/* The following sample ordering is required by EQAA.
+ *
+ * Sample 0 is approx. in the top-left quadrant.
+ * Sample 1 is approx. in the bottom-right quadrant.
+ *
+ * Sample 2 is approx. in the bottom-left quadrant.
+ * Sample 3 is approx. in the top-right quadrant.
+ * (sample I={2,3} adds more detail to the vicinity of sample I-2)
+ *
+ * Sample 4 is approx. in the same quadrant as sample 0. (top-left)
+ * Sample 5 is approx. in the same quadrant as sample 1. (bottom-right)
+ * Sample 6 is approx. in the same quadrant as sample 2. (bottom-left)
+ * Sample 7 is approx. in the same quadrant as sample 3. (top-right)
+ * (sample I={4,5,6,7} adds more detail to the vicinity of sample I-4)
+ *
+ * The next 8 samples add more detail to the vicinity of the previous samples.
+ * (sample I (I >= 8) adds more detail to the vicinity of sample I-8)
+ *
+ * The ordering is specified such that:
+ * If we take the first 2 samples, we should get good 2x MSAA.
+ * If we add 2 more samples, we should get good 4x MSAA with the same sample locations.
+ * If we add 4 more samples, we should get good 8x MSAA with the same sample locations.
+ * If we add 8 more samples, we should get perfect 16x MSAA with the same sample locations.
+ *
+ * The ordering also allows finding samples in the same vicinity.
+ *
+ * Group N of 2 samples in the same vicinity in 16x MSAA: {N,N+8}
+ * Group N of 2 samples in the same vicinity in 8x MSAA: {N,N+4}
+ * Group N of 2 samples in the same vicinity in 4x MSAA: {N,N+2}
+ *
+ * Groups of 4 samples in the same vicinity in 16x MSAA:
+ * Top left: {0,4,8,12}
+ * Bottom right: {1,5,9,13}
+ * Bottom left: {2,6,10,14}
+ * Top right: {3,7,11,15}
+ *
+ * Groups of 4 samples in the same vicinity in 8x MSAA:
+ * Left half: {0,2,4,6}
+ * Right half: {1,3,5,7}
+ *
+ * Groups of 8 samples in the same vicinity in 16x MSAA:
+ * Left half: {0,2,4,6,8,10,12,14}
+ * Right half: {1,3,5,7,9,11,13,15}
+ */
+
+/* 1x MSAA */
+static const uint32_t sample_locs_1x =
+ FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
+static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
+
+/* 2x MSAA (the positions are sorted for EQAA) */
+static const uint32_t sample_locs_2x =
+ FILL_SREG(-4,-4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
+static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
+
+/* 4x MSAA (the positions are sorted for EQAA) */
+static const uint32_t sample_locs_4x =
+ FILL_SREG(-2,-6, 2, 6, -6, 2, 6,-2);
+static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
+
+/* 8x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_8x[] = {
- FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5),
- FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5),
- FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5),
- FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5),
- FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
- FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
- FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
- FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
+ FILL_SREG(-3,-5, 5, 1, -1, 3, 7,-7),
+ FILL_SREG(-7,-1, 3, 7, -5, 5, 1,-3),
+ /* The following are unused by hardware, but we emit them to IBs
+ * instead of multiple SET_CONTEXT_REG packets. */
+ 0,
+ 0,
};
-/* Cayman 16xMSAA */
+static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
+
+/* 16x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_16x[] = {
- FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1),
- FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1),
- FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1),
- FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1),
- FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5),
- FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5),
- FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5),
- FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5),
- FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4),
- FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4),
- FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4),
- FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4),
- FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8),
- FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8),
- FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8),
- FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8),
+ FILL_SREG(-5,-2, 5, 3, -2, 6, 3,-5),
+ FILL_SREG(-4,-6, 1, 1, -6, 4, 7,-4),
+ FILL_SREG(-1,-3, 6, 7, -3, 2, 0,-7),
+ FILL_SREG(-7,-8, 2, 5, -8, 0, 4,-1),
};
+static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
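The centroid_priority_* constants appear to list sample indices by increasing distance from the pixel center, one 4-bit index per nibble starting at the least significant end; the hardware uses this ordering to pick the centroid sample for partially covered pixels. A small decode of the 8x value under that assumption, which is consistent with the distances of the 8x sample locations above:

	#include <stdint.h>
	#include <stdio.h>

	/* Assumed decoding: nibble i (from the LSB) of PA_SC_CENTROID_PRIORITY_*
	 * is the index of the i-th closest sample to the pixel center. */
	static void print_centroid_order_8x_example(void)
	{
		const uint64_t prio = 0x3546012735460127ull; /* centroid_priority_8x */

		for (unsigned i = 0; i < 8; i++)
			printf("rank %u -> sample %u\n", i,
			       (unsigned)((prio >> (4 * i)) & 0xf));
		/* Prints 7, 2, 1, 0, 6, 4, 5, 3.  The squared distances of
		 * sample_locs_8x are s2=s7=10, s1=26, s0=34, s4=s6=50, s5=58, s3=98,
		 * so the listed order is indeed closest-first. */
	}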
static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
unsigned sample_index, float *out_value)
{
- int offset, index;
- struct {
- int idx:4;
- } val;
+ const uint32_t *sample_locs;
switch (sample_count) {
case 1:
default:
- out_value[0] = out_value[1] = 0.5;
+ sample_locs = &sample_locs_1x;
break;
case 2:
- offset = 4 * (sample_index * 2);
- val.idx = (sample_locs_2x[0] >> offset) & 0xf;
- out_value[0] = (float)(val.idx + 8) / 16.0f;
- val.idx = (sample_locs_2x[0] >> (offset + 4)) & 0xf;
- out_value[1] = (float)(val.idx + 8) / 16.0f;
+ sample_locs = &sample_locs_2x;
break;
case 4:
- offset = 4 * (sample_index * 2);
- val.idx = (sample_locs_4x[0] >> offset) & 0xf;
- out_value[0] = (float)(val.idx + 8) / 16.0f;
- val.idx = (sample_locs_4x[0] >> (offset + 4)) & 0xf;
- out_value[1] = (float)(val.idx + 8) / 16.0f;
+ sample_locs = &sample_locs_4x;
break;
case 8:
- offset = 4 * (sample_index % 4 * 2);
- index = (sample_index / 4) * 4;
- val.idx = (sample_locs_8x[index] >> offset) & 0xf;
- out_value[0] = (float)(val.idx + 8) / 16.0f;
- val.idx = (sample_locs_8x[index] >> (offset + 4)) & 0xf;
- out_value[1] = (float)(val.idx + 8) / 16.0f;
+ sample_locs = sample_locs_8x;
break;
case 16:
- offset = 4 * (sample_index % 4 * 2);
- index = (sample_index / 4) * 4;
- val.idx = (sample_locs_16x[index] >> offset) & 0xf;
- out_value[0] = (float)(val.idx + 8) / 16.0f;
- val.idx = (sample_locs_16x[index] >> (offset + 4)) & 0xf;
- out_value[1] = (float)(val.idx + 8) / 16.0f;
+ sample_locs = sample_locs_16x;
break;
}
+
+ out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
+ out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
+}
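A quick usage sketch of the query above: for 4x MSAA, sample 0 is stored as (-2, -6) in sixteenths of a pixel, so the state tracker gets back (0.375, 0.125). The caller below is hypothetical and only illustrates the conversion:

	static void query_sample_position_example(struct pipe_context *ctx)
	{
		float pos[2];

		ctx->get_sample_position(ctx, 4, 0, pos);
		/* sample_locs_4x stores (-2, -6) for sample 0, hence:
		 * pos[0] == (-2 + 8) / 16.0f == 0.375f
		 * pos[1] == (-6 + 8) / 16.0f == 0.125f */
	}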
+
+static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs,
+ uint64_t centroid_priority,
+ uint32_t sample_locs)
+{
+ radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+ radeon_emit(cs, centroid_priority);
+ radeon_emit(cs, centroid_priority >> 32);
+ radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+}
+
+static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs,
+ uint64_t centroid_priority,
+ const uint32_t *sample_locs,
+ unsigned num_samples)
+{
+ radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+ radeon_emit(cs, centroid_priority);
+ radeon_emit(cs, centroid_priority >> 32);
+ radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
+ num_samples == 8 ? 14 : 16);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
}
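The four PA_SC_AA_SAMPLE_LOCS_PIXEL_* banks written above, one per pixel of the 2x2 quad, sit four dwords apart (0x28BF8, 0x28C08, 0x28C18, 0x28C28). That is why a single SET_CONTEXT_REG_SEQ of 16 dwords (14 for 8x, whose last bank only needs two dwords) covers all of them, and why sample_locs_8x carries two zero padding dwords. A quick compile-time check of that spacing, as a sketch:

	/* Illustrative check: consecutive sample-location banks are 0x10 bytes,
	 * i.e. four 32-bit registers, apart. */
	_Static_assert(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 -
		       R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 == 0x10,
		       "sample location banks are expected to be contiguous");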
-void si_emit_sample_locations(struct radeon_winsys_cs *cs, int nr_samples)
+void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
{
switch (nr_samples) {
default:
case 1:
- radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0);
- radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0);
- radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0);
- radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0);
+ si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
break;
case 2:
- radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x[0]);
- radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x[1]);
- radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x[2]);
- radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x[3]);
+ si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
break;
case 4:
- radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x[0]);
- radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x[1]);
- radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x[2]);
- radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x[3]);
+ si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
break;
case 8:
- radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
- radeon_emit(cs, sample_locs_8x[0]);
- radeon_emit(cs, sample_locs_8x[4]);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
- radeon_emit(cs, sample_locs_8x[1]);
- radeon_emit(cs, sample_locs_8x[5]);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
- radeon_emit(cs, sample_locs_8x[2]);
- radeon_emit(cs, sample_locs_8x[6]);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
- radeon_emit(cs, sample_locs_8x[3]);
- radeon_emit(cs, sample_locs_8x[7]);
+ si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
break;
case 16:
- radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16);
- radeon_emit(cs, sample_locs_16x[0]);
- radeon_emit(cs, sample_locs_16x[4]);
- radeon_emit(cs, sample_locs_16x[8]);
- radeon_emit(cs, sample_locs_16x[12]);
- radeon_emit(cs, sample_locs_16x[1]);
- radeon_emit(cs, sample_locs_16x[5]);
- radeon_emit(cs, sample_locs_16x[9]);
- radeon_emit(cs, sample_locs_16x[13]);
- radeon_emit(cs, sample_locs_16x[2]);
- radeon_emit(cs, sample_locs_16x[6]);
- radeon_emit(cs, sample_locs_16x[10]);
- radeon_emit(cs, sample_locs_16x[14]);
- radeon_emit(cs, sample_locs_16x[3]);
- radeon_emit(cs, sample_locs_16x[7]);
- radeon_emit(cs, sample_locs_16x[11]);
- radeon_emit(cs, sample_locs_16x[15]);
+ si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
break;
}
}
@@ -194,16 +200,16 @@ void si_init_msaa_functions(struct si_context *sctx)
{
int i;
- sctx->b.b.get_sample_position = si_get_sample_position;
+ sctx->b.get_sample_position = si_get_sample_position;
- si_get_sample_position(&sctx->b.b, 1, 0, sctx->sample_locations_1x[0]);
+ si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
for (i = 0; i < 2; i++)
- si_get_sample_position(&sctx->b.b, 2, i, sctx->sample_locations_2x[i]);
+ si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
for (i = 0; i < 4; i++)
- si_get_sample_position(&sctx->b.b, 4, i, sctx->sample_locations_4x[i]);
+ si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
for (i = 0; i < 8; i++)
- si_get_sample_position(&sctx->b.b, 8, i, sctx->sample_locations_8x[i]);
+ si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
for (i = 0; i < 16; i++)
- si_get_sample_position(&sctx->b.b, 16, i, sctx->sample_locations_16x[i]);
+ si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
index 9971bc815..fd7e843bc 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -1,5 +1,6 @@
/*
* Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -19,17 +20,12 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * Authors: Marek Olšák <maraeo@gmail.com>
- *
*/
-#include "si_pipe.h"
-#include "si_state.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
#include "util/u_memory.h"
+#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
@@ -47,14 +43,14 @@ si_create_so_target(struct pipe_context *ctx,
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_streamout_target *t;
- struct r600_resource *rbuffer = (struct r600_resource*)buffer;
+ struct r600_resource *rbuffer = r600_resource(buffer);
t = CALLOC_STRUCT(si_streamout_target);
if (!t) {
return NULL;
}
- u_suballocator_alloc(sctx->b.allocator_zeroed_memory, 4, 4,
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
&t->buf_filled_size_offset,
(struct pipe_resource**)&t->buf_filled_size);
if (!t->buf_filled_size) {
@@ -87,7 +83,7 @@ void si_streamout_buffers_dirty(struct si_context *sctx)
if (!sctx->streamout.enabled_mask)
return;
- si_mark_atom_dirty(sctx, &sctx->streamout.begin_atom);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
si_set_streamout_enable(sctx, true);
}
@@ -120,14 +116,14 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
/* Invalidate the scalar cache in case a streamout buffer is
* going to be used as a constant buffer.
*
- * Invalidate TC L1, because streamout bypasses it (done by
- * setting GLC=1 in the store instruction), but it can contain
- * outdated data of streamout buffers.
+ * Invalidate vL1, because streamout bypasses it (done by
+ * setting GLC=1 in the store instruction), but vL1 in other
+ * CUs can contain outdated data of streamout buffers.
*
* VS_PARTIAL_FLUSH is required if the buffers are going to be
* used as an input immediately.
*/
- sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+ sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
SI_CONTEXT_INV_VMEM_L1 |
SI_CONTEXT_VS_PARTIAL_FLUSH;
}
@@ -136,7 +132,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
* start writing to the targets.
*/
if (num_targets)
- sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH;
/* Streamout buffers must be bound in 2 places:
@@ -155,7 +151,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
if (!targets[i])
continue;
- r600_context_add_resource_size(ctx, targets[i]->buffer);
+ si_context_add_resource_size(sctx, targets[i]->buffer);
enabled_mask |= 1 << i;
if (offsets[i] == ((unsigned)-1))
@@ -173,7 +169,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
if (num_targets) {
si_streamout_buffers_dirty(sctx);
} else {
- si_set_atom_dirty(sctx, &sctx->streamout.begin_atom, false);
+ si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
si_set_streamout_enable(sctx, false);
}
@@ -204,8 +200,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
/* Set the resource. */
pipe_resource_reference(&buffers->buffers[bufidx],
buffer);
- radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
- (struct r600_resource*)buffer,
+ radeon_add_to_gfx_buffer_list_check_mem(sctx,
+ r600_resource(buffer),
buffers->shader_usage,
RADEON_PRIO_SHADER_RW_BUFFER,
true);
@@ -234,11 +230,11 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
static void si_flush_vgt_streamout(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
unsigned reg_strmout_cntl;
/* The register is at different places on different ASICs. */
- if (sctx->b.chip_class >= CIK) {
+ if (sctx->chip_class >= CIK) {
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
} else {
@@ -258,10 +254,9 @@ static void si_flush_vgt_streamout(struct si_context *sctx)
radeon_emit(cs, 4); /* poll interval */
}
-static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
+static void si_emit_streamout_begin(struct si_context *sctx)
{
- struct si_context *sctx = (struct si_context*)rctx;
- struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
struct si_streamout_target **t = sctx->streamout.targets;
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
unsigned i;
@@ -295,7 +290,7 @@ static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r60
radeon_emit(cs, va); /* src address lo */
radeon_emit(cs, va >> 32); /* src address hi */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
t[i]->buf_filled_size,
RADEON_USAGE_READ,
RADEON_PRIO_SO_FILLED_SIZE);
@@ -316,7 +311,7 @@ static void si_emit_streamout_begin(struct r600_common_context *rctx, struct r60
void si_emit_streamout_end(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
struct si_streamout_target **t = sctx->streamout.targets;
unsigned i;
uint64_t va;
@@ -337,7 +332,7 @@ void si_emit_streamout_end(struct si_context *sctx)
radeon_emit(cs, 0); /* unused */
radeon_emit(cs, 0); /* unused */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
t[i]->buf_filled_size,
RADEON_USAGE_WRITE,
RADEON_PRIO_SO_FILLED_SIZE);
@@ -352,7 +347,6 @@ void si_emit_streamout_end(struct si_context *sctx)
}
sctx->streamout.begin_emitted = false;
- sctx->b.flags |= R600_CONTEXT_STREAMOUT_FLUSH;
}
/* STREAMOUT CONFIG DERIVED STATE
@@ -362,19 +356,16 @@ void si_emit_streamout_end(struct si_context *sctx)
* are no buffers bound.
*/
-static void si_emit_streamout_enable(struct r600_common_context *rctx,
- struct r600_atom *atom)
+static void si_emit_streamout_enable(struct si_context *sctx)
{
- struct si_context *sctx = (struct si_context*)rctx;
-
- radeon_set_context_reg_seq(sctx->b.gfx.cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
- radeon_emit(sctx->b.gfx.cs,
+ radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
+ radeon_emit(sctx->gfx_cs,
S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
S_028B94_RAST_STREAM(0) |
S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
- radeon_emit(sctx->b.gfx.cs,
+ radeon_emit(sctx->gfx_cs,
sctx->streamout.hw_enabled_mask &
sctx->streamout.enabled_stream_buffers_mask);
}
@@ -393,7 +384,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable)
if ((old_strmout_en != si_get_strmout_en(sctx)) ||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
- si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
void si_update_prims_generated_query_state(struct si_context *sctx,
@@ -409,15 +400,15 @@ void si_update_prims_generated_query_state(struct si_context *sctx,
sctx->streamout.num_prims_gen_queries != 0;
if (old_strmout_en != si_get_strmout_en(sctx))
- si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
}
void si_init_streamout_functions(struct si_context *sctx)
{
- sctx->b.b.create_stream_output_target = si_create_so_target;
- sctx->b.b.stream_output_target_destroy = si_so_target_destroy;
- sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
- sctx->streamout.begin_atom.emit = si_emit_streamout_begin;
- sctx->streamout.enable_atom.emit = si_emit_streamout_enable;
+ sctx->b.create_stream_output_target = si_create_so_target;
+ sctx->b.stream_output_target_destroy = si_so_target_destroy;
+ sctx->b.set_stream_output_targets = si_set_streamout_targets;
+ sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+ sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
index f41655847..76c56447e 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -1,5 +1,6 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,9 +22,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_pipe.h"
-#include "sid.h"
-#include "radeon/r600_cs.h"
+#include "si_build_pm4.h"
#include "util/u_viewport.h"
#include "tgsi/tgsi_scan.h"
@@ -45,7 +44,7 @@ static void si_set_scissor_states(struct pipe_context *pctx,
return;
ctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
- si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
}
/* Since the guard band disables clipping, we have to clip per-pixel
@@ -108,10 +107,11 @@ static void si_scissor_make_union(struct si_signed_scissor *out,
out->miny = MIN2(out->miny, in->miny);
out->maxx = MAX2(out->maxx, in->maxx);
out->maxy = MAX2(out->maxy, in->maxy);
+ out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
}
static void si_emit_one_scissor(struct si_context *ctx,
- struct radeon_winsys_cs *cs,
+ struct radeon_cmdbuf *cs,
struct si_signed_scissor *vp_scissor,
struct pipe_scissor_state *scissor)
{
@@ -127,6 +127,18 @@ static void si_emit_one_scissor(struct si_context *ctx,
if (scissor)
si_clip_scissor(&final, scissor);
+ /* Workaround for a hw bug on SI that occurs when PA_SU_HARDWARE_-
+ * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+ */
+ if (ctx->chip_class == SI && (final.maxx == 0 || final.maxy == 0)) {
+ radeon_emit(cs, S_028250_TL_X(1) |
+ S_028250_TL_Y(1) |
+ S_028250_WINDOW_OFFSET_DISABLE(1));
+ radeon_emit(cs, S_028254_BR_X(1) |
+ S_028254_BR_Y(1));
+ return;
+ }
+
radeon_emit(cs, S_028250_TL_X(final.minx) |
S_028250_TL_Y(final.miny) |
S_028250_WINDOW_OFFSET_DISABLE(1));
@@ -134,27 +146,67 @@ static void si_emit_one_scissor(struct si_context *ctx,
S_028254_BR_Y(final.maxy));
}
-/* the range is [-MAX, MAX] */
-#define GET_MAX_VIEWPORT_RANGE(rctx) (32768)
-
-static void si_emit_guardband(struct si_context *ctx,
- struct si_signed_scissor *vp_as_scissor)
+static void si_emit_guardband(struct si_context *ctx)
{
- struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
+ struct si_signed_scissor vp_as_scissor;
struct pipe_viewport_state vp;
float left, top, right, bottom, max_range, guardband_x, guardband_y;
float discard_x, discard_y;
+ if (ctx->vs_writes_viewport_index) {
+ /* Shaders can draw to any viewport. Make a union of all
+ * viewports. */
+ vp_as_scissor = ctx->viewports.as_scissor[0];
+ for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
+ si_scissor_make_union(&vp_as_scissor,
+ &ctx->viewports.as_scissor[i]);
+ }
+ } else {
+ vp_as_scissor = ctx->viewports.as_scissor[0];
+ }
+
+ /* Blits don't set the viewport state. The vertex shader determines
+ * the viewport size by scaling the coordinates, so we don't know
+ * how large the viewport is. Assume the worst case.
+ */
+ if (ctx->vs_disables_clipping_viewport)
+ vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+
+ /* Determine the optimal hardware screen offset to center the viewport
+ * within the viewport range in order to maximize the guardband size.
+ */
+ int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
+ int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
+
+ const unsigned hw_screen_offset_max = 8176;
+ /* SI-CI need to align the offset to an ubertile consisting of all SEs. */
+ const unsigned hw_screen_offset_alignment =
+ ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+
+ hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, hw_screen_offset_max);
+ hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, hw_screen_offset_max);
+
+ /* Align the screen offset by dropping the low bits. */
+ hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
+ hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
+
+ /* Apply the offset to center the viewport and maximize the guardband. */
+ vp_as_scissor.minx -= hw_screen_offset_x;
+ vp_as_scissor.maxx -= hw_screen_offset_x;
+ vp_as_scissor.miny -= hw_screen_offset_y;
+ vp_as_scissor.maxy -= hw_screen_offset_y;
+
/* Reconstruct the viewport transformation from the scissor. */
- vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
- vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0;
- vp.scale[0] = vp_as_scissor->maxx - vp.translate[0];
- vp.scale[1] = vp_as_scissor->maxy - vp.translate[1];
+ vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
+ vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
+ vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
+ vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
/* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
- if (vp_as_scissor->minx == vp_as_scissor->maxx)
+ if (vp_as_scissor.minx == vp_as_scissor.maxx)
vp.scale[0] = 0.5;
- if (vp_as_scissor->miny == vp_as_scissor->maxy)
+ if (vp_as_scissor.miny == vp_as_scissor.maxy)
vp.scale[1] = 0.5;
/* Find the biggest guard band that is inside the supported viewport
@@ -164,9 +216,11 @@ static void si_emit_guardband(struct si_context *ctx,
* This is done by applying the inverse viewport transformation
* on the viewport limits to get those limits in clip space.
*
- * Use a limit one pixel smaller to allow for some precision error.
+ * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
*/
- max_range = GET_MAX_VIEWPORT_RANGE(ctx) - 1;
+ static unsigned max_viewport_size[] = {65535, 16383, 4095};
+ assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
+ max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
left = (-max_range - vp.translate[0]) / vp.scale[0];
right = ( max_range - vp.translate[0]) / vp.scale[0];
top = (-max_range - vp.translate[1]) / vp.scale[1];
@@ -180,11 +234,9 @@ static void si_emit_guardband(struct si_context *ctx,
discard_x = 1.0;
discard_y = 1.0;
- if (unlikely(ctx->current_rast_prim < PIPE_PRIM_TRIANGLES) &&
- ctx->queued.named.rasterizer) {
+ if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
/* When rendering wide points or lines, we need to be more
* conservative about when to discard them entirely. */
- const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
float pixels;
if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
@@ -202,27 +254,34 @@ static void si_emit_guardband(struct si_context *ctx,
discard_y = MIN2(discard_y, guardband_y);
}
- /* If any of the GB registers is updated, all of them must be updated. */
- radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
-
- radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
- radeon_emit(cs, fui(discard_y)); /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
- radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
- radeon_emit(cs, fui(discard_x)); /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+ /* If any of the GB registers is updated, all of them must be updated.
+ * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+ */
+ unsigned initial_cdw = ctx->gfx_cs->current.cdw;
+ radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+ SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
+ fui(guardband_y), fui(discard_y),
+ fui(guardband_x), fui(discard_x));
+ radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+ SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+ S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
+ S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
+ radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
+ SI_TRACKED_PA_SU_VTX_CNTL,
+ S_028BE4_PIX_CENTER(rs->half_pixel_center) |
+ S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
+ vp_as_scissor.quant_mode));
+ if (initial_cdw != ctx->gfx_cs->current.cdw)
+ ctx->context_roll_counter++;
}
-static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom)
+static void si_emit_scissors(struct si_context *ctx)
{
- struct si_context *ctx = (struct si_context *)rctx;
- struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct pipe_scissor_state *states = ctx->scissors.states;
unsigned mask = ctx->scissors.dirty_mask;
- bool scissor_enabled = false;
- struct si_signed_scissor max_vp_scissor;
- int i;
-
- if (ctx->queued.named.rasterizer)
- scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
+ bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
/* The simple case: Only 1 viewport is active. */
if (!ctx->vs_writes_viewport_index) {
@@ -233,17 +292,10 @@ static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom
radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
- si_emit_guardband(ctx, vp);
ctx->scissors.dirty_mask &= ~1; /* clear one bit */
return;
}
- /* Shaders can draw to any viewport. Make a union of all viewports. */
- max_vp_scissor = ctx->viewports.as_scissor[0];
- for (i = 1; i < SI_MAX_VIEWPORTS; i++)
- si_scissor_make_union(&max_vp_scissor,
- &ctx->viewports.as_scissor[i]);
-
while (mask) {
int start, count, i;
@@ -256,7 +308,6 @@ static void si_emit_scissors(struct r600_common_context *rctx, struct r600_atom
scissor_enabled ? &states[i] : NULL);
}
}
- si_emit_guardband(ctx, &max_vp_scissor);
ctx->scissors.dirty_mask = 0;
}
@@ -271,24 +322,48 @@ static void si_set_viewport_states(struct pipe_context *pctx,
for (i = 0; i < num_viewports; i++) {
unsigned index = start_slot + i;
+ struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
ctx->viewports.states[index] = state[i];
- si_get_scissor_from_viewport(ctx, &state[i],
- &ctx->viewports.as_scissor[index]);
+
+ si_get_scissor_from_viewport(ctx, &state[i], scissor);
+
+ unsigned w = scissor->maxx - scissor->minx;
+ unsigned h = scissor->maxy - scissor->miny;
+ unsigned max_extent = MAX2(w, h);
+
+ /* Determine the best quantization mode (subpixel precision),
+ * but also leave enough space for the guardband.
+ *
+ * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
+ * and Raven1. What we do depends on the chip:
+ * - Vega10: Never use primitive binning.
+ * - Raven1: Always use QUANT_MODE == 16_8.
+ */
+ if (ctx->family == CHIP_RAVEN)
+ max_extent = 16384; /* Use QUANT_MODE == 16_8. */
+
+ if (max_extent <= 1024) /* 4K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
+ else if (max_extent <= 4096) /* 16K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
+ else /* 64K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
}
mask = ((1 << num_viewports) - 1) << start_slot;
ctx->viewports.dirty_mask |= mask;
ctx->viewports.depth_range_dirty_mask |= mask;
ctx->scissors.dirty_mask |= mask;
- si_mark_atom_dirty(ctx, &ctx->viewports.atom);
- si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
}
static void si_emit_one_viewport(struct si_context *ctx,
struct pipe_viewport_state *state)
{
- struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
radeon_emit(cs, fui(state->scale[0]));
radeon_emit(cs, fui(state->translate[0]));
@@ -300,7 +375,7 @@ static void si_emit_one_viewport(struct si_context *ctx,
static void si_emit_viewports(struct si_context *ctx)
{
- struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct pipe_viewport_state *states = ctx->viewports.states;
unsigned mask = ctx->viewports.dirty_mask;
@@ -342,16 +417,13 @@ si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
static void si_emit_depth_ranges(struct si_context *ctx)
{
- struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct pipe_viewport_state *states = ctx->viewports.states;
unsigned mask = ctx->viewports.depth_range_dirty_mask;
- bool clip_halfz = false;
+ bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
bool window_space = ctx->vs_disables_clipping_viewport;
float zmin, zmax;
- if (ctx->queued.named.rasterizer)
- clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
-
/* The simple case: Only 1 viewport is active. */
if (!ctx->vs_writes_viewport_index) {
if (!(mask & 1))
@@ -384,10 +456,8 @@ static void si_emit_depth_ranges(struct si_context *ctx)
ctx->viewports.depth_range_dirty_mask = 0;
}
-static void si_emit_viewport_states(struct r600_common_context *rctx,
- struct r600_atom *atom)
+static void si_emit_viewport_states(struct si_context *ctx)
{
- struct si_context *ctx = (struct si_context *)rctx;
si_emit_viewports(ctx);
si_emit_depth_ranges(ctx);
}
@@ -418,28 +488,118 @@ void si_update_vs_viewport_state(struct si_context *ctx)
ctx->vs_disables_clipping_viewport = vs_window_space;
ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
- si_mark_atom_dirty(ctx, &ctx->scissors.atom);
- si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
}
/* Viewport index handling. */
+ if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
+ return;
+
+ /* This changes how the guardband is computed. */
ctx->vs_writes_viewport_index = info->writes_viewport_index;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+
if (!ctx->vs_writes_viewport_index)
return;
if (ctx->scissors.dirty_mask)
- si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
if (ctx->viewports.dirty_mask ||
ctx->viewports.depth_range_dirty_mask)
- si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+}
+
+static void si_emit_window_rectangles(struct si_context *sctx)
+{
+ /* There are four clipping rectangles. Their corner coordinates are inclusive.
+	 * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ *
+ * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
+ */
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ static const unsigned outside[4] = {
+ /* outside rectangle 0 */
+ V_02820C_OUT |
+ V_02820C_IN_1 |
+ V_02820C_IN_2 |
+ V_02820C_IN_21 |
+ V_02820C_IN_3 |
+ V_02820C_IN_31 |
+ V_02820C_IN_32 |
+ V_02820C_IN_321,
+ /* outside rectangles 0, 1 */
+ V_02820C_OUT |
+ V_02820C_IN_2 |
+ V_02820C_IN_3 |
+ V_02820C_IN_32,
+ /* outside rectangles 0, 1, 2 */
+ V_02820C_OUT |
+ V_02820C_IN_3,
+ /* outside rectangles 0, 1, 2, 3 */
+ V_02820C_OUT,
+ };
+ const unsigned disabled = 0xffff; /* all inside and outside cases */
+ unsigned num_rectangles = sctx->num_window_rectangles;
+ struct pipe_scissor_state *rects = sctx->window_rectangles;
+ unsigned rule;
+
+ assert(num_rectangles <= 4);
+
+ if (num_rectangles == 0)
+ rule = disabled;
+ else if (sctx->window_rectangles_include)
+ rule = ~outside[num_rectangles - 1];
+ else
+ rule = outside[num_rectangles - 1];
+
+ radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE,
+ SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
+ if (num_rectangles == 0)
+ return;
+
+ radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL,
+ num_rectangles * 2);
+ for (unsigned i = 0; i < num_rectangles; i++) {
+ radeon_emit(cs, S_028210_TL_X(rects[i].minx) |
+ S_028210_TL_Y(rects[i].miny));
+ radeon_emit(cs, S_028214_BR_X(rects[i].maxx) |
+ S_028214_BR_Y(rects[i].maxy));
+ }
+}
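A small sketch of the numbering rule described in the comment above, treating the rectangle corners as inclusive as stated there; the helper name is illustrative, not driver API:

	#include <stdbool.h>

	/* A pixel's "number" has bit i set iff it lies inside cliprect i; the
	 * hardware rasterizes it iff bit 'number' of CLIPRECT_RULE is set.
	 * "Include" mode above therefore uses rule = ~outside[n - 1]. */
	static bool pixel_passes_cliprects_example(unsigned x, unsigned y,
						   const struct pipe_scissor_state *rects,
						   unsigned num_rectangles,
						   unsigned rule)
	{
		unsigned number = 0;

		for (unsigned i = 0; i < num_rectangles; i++) {
			if (x >= rects[i].minx && x <= rects[i].maxx &&
			    y >= rects[i].miny && y <= rects[i].maxy)
				number |= 1u << i;
		}
		return (rule >> number) & 1;
	}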
+
+static void si_set_window_rectangles(struct pipe_context *ctx,
+ boolean include,
+ unsigned num_rectangles,
+ const struct pipe_scissor_state *rects)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ sctx->num_window_rectangles = num_rectangles;
+ sctx->window_rectangles_include = include;
+ if (num_rectangles) {
+ memcpy(sctx->window_rectangles, rects,
+ sizeof(*rects) * num_rectangles);
+ }
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
}
void si_init_viewport_functions(struct si_context *ctx)
{
- ctx->scissors.atom.emit = si_emit_scissors;
- ctx->viewports.atom.emit = si_emit_viewport_states;
+ ctx->atoms.s.guardband.emit = si_emit_guardband;
+ ctx->atoms.s.scissors.emit = si_emit_scissors;
+ ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+ ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+
+ ctx->b.set_scissor_states = si_set_scissor_states;
+ ctx->b.set_viewport_states = si_set_viewport_states;
+ ctx->b.set_window_rectangles = si_set_window_rectangles;
- ctx->b.b.set_scissor_states = si_set_scissor_states;
- ctx->b.b.set_viewport_states = si_set_viewport_states;
+ for (unsigned i = 0; i < 16; i++)
+ ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c
new file mode 100644
index 000000000..90a2032cd
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file implements randomized SDMA texture blit tests. */
+
+#include "si_pipe.h"
+#include "util/u_surface.h"
+#include "util/rand_xor.h"
+
+static uint64_t seed_xorshift128plus[2];
+
+#define RAND_NUM_SIZE 8
+
+/* The GPU blits are emulated on the CPU using these CPU textures. */
+
+struct cpu_texture {
+ uint8_t *ptr;
+ uint64_t size;
+ uint64_t layer_stride;
+ unsigned stride;
+};
+
+static void alloc_cpu_texture(struct cpu_texture *tex,
+ struct pipe_resource *templ, int bpp)
+{
+ tex->stride = align(templ->width0 * bpp, RAND_NUM_SIZE);
+ tex->layer_stride = (uint64_t)tex->stride * templ->height0;
+ tex->size = tex->layer_stride * templ->array_size;
+ tex->ptr = malloc(tex->size);
+ assert(tex->ptr);
+}
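The CPU copy uses a plain linear layout, so texel (x, y, z) of a bpp-byte format lives at a fixed offset from the base pointer. A small helper, shown only for illustration, that matches the addressing used by set_random_pixels() and compare_textures():

	#include <stdint.h>

	/* Illustrative: address of texel (x, y, z) in the linear CPU copy. */
	static inline uint8_t *cpu_texel_ptr_example(const struct cpu_texture *tex,
						     unsigned x, unsigned y, unsigned z,
						     unsigned bpp)
	{
		return tex->ptr + tex->layer_stride * z + tex->stride * y +
		       (uint64_t)x * bpp;
	}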
+
+static void set_random_pixels(struct pipe_context *ctx,
+ struct pipe_resource *tex,
+ struct cpu_texture *cpu)
+{
+ struct pipe_transfer *t;
+ uint8_t *map;
+ int x,y,z;
+
+ map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
+ 0, 0, 0, tex->width0, tex->height0,
+ tex->array_size, &t);
+ assert(map);
+
+ for (z = 0; z < tex->array_size; z++) {
+ for (y = 0; y < tex->height0; y++) {
+ uint64_t *ptr = (uint64_t*)
+ (map + t->layer_stride*z + t->stride*y);
+ uint64_t *ptr_cpu = (uint64_t*)
+ (cpu->ptr + cpu->layer_stride*z + cpu->stride*y);
+ unsigned size = cpu->stride / RAND_NUM_SIZE;
+
+ assert(t->stride % RAND_NUM_SIZE == 0);
+ assert(cpu->stride % RAND_NUM_SIZE == 0);
+
+ for (x = 0; x < size; x++) {
+ *ptr++ = *ptr_cpu++ =
+ rand_xorshift128plus(seed_xorshift128plus);
+ }
+ }
+ }
+
+ pipe_transfer_unmap(ctx, t);
+}
+
+static bool compare_textures(struct pipe_context *ctx,
+ struct pipe_resource *tex,
+ struct cpu_texture *cpu, int bpp)
+{
+ struct pipe_transfer *t;
+ uint8_t *map;
+ int y,z;
+ bool pass = true;
+
+ map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ,
+ 0, 0, 0, tex->width0, tex->height0,
+ tex->array_size, &t);
+ assert(map);
+
+ for (z = 0; z < tex->array_size; z++) {
+ for (y = 0; y < tex->height0; y++) {
+ uint8_t *ptr = map + t->layer_stride*z + t->stride*y;
+ uint8_t *cpu_ptr = cpu->ptr +
+ cpu->layer_stride*z + cpu->stride*y;
+
+ if (memcmp(ptr, cpu_ptr, tex->width0 * bpp)) {
+ pass = false;
+ goto done;
+ }
+ }
+ }
+done:
+ pipe_transfer_unmap(ctx, t);
+ return pass;
+}
+
+static enum pipe_format get_format_from_bpp(int bpp)
+{
+ switch (bpp) {
+ case 1:
+ return PIPE_FORMAT_R8_UINT;
+ case 2:
+ return PIPE_FORMAT_R16_UINT;
+ case 4:
+ return PIPE_FORMAT_R32_UINT;
+ case 8:
+ return PIPE_FORMAT_R32G32_UINT;
+ case 16:
+ return PIPE_FORMAT_R32G32B32A32_UINT;
+ default:
+ assert(0);
+ return PIPE_FORMAT_NONE;
+ }
+}
+
+static const char *array_mode_to_string(struct si_screen *sscreen,
+ struct radeon_surf *surf)
+{
+ if (sscreen->info.chip_class >= GFX9) {
+ switch (surf->u.gfx9.surf.swizzle_mode) {
+ case 0:
+ return " LINEAR";
+ case 21:
+ return " 4KB_S_X";
+ case 22:
+ return " 4KB_D_X";
+ case 25:
+ return "64KB_S_X";
+ case 26:
+ return "64KB_D_X";
+ default:
+ printf("Unhandled swizzle mode = %u\n",
+ surf->u.gfx9.surf.swizzle_mode);
+ return " UNKNOWN";
+ }
+ } else {
+ switch (surf->u.legacy.level[0].mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED:
+ return "LINEAR_ALIGNED";
+ case RADEON_SURF_MODE_1D:
+ return "1D_TILED_THIN1";
+ case RADEON_SURF_MODE_2D:
+ return "2D_TILED_THIN1";
+ default:
+ assert(0);
+ return " UNKNOWN";
+ }
+ }
+}
+
+static unsigned generate_max_tex_side(unsigned max_tex_side)
+{
+ switch (rand() % 4) {
+ case 0:
+ /* Try to hit large sizes in 1/4 of the cases. */
+ return max_tex_side;
+ case 1:
+ /* Try to hit 1D tiling in 1/4 of the cases. */
+ return 128;
+ default:
+ /* Try to hit common sizes in 2/4 of the cases. */
+ return 2048;
+ }
+}
+
+void si_test_dma(struct si_screen *sscreen)
+{
+ struct pipe_screen *screen = &sscreen->b;
+ struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+ struct si_context *sctx = (struct si_context*)ctx;
+ uint64_t max_alloc_size;
+ unsigned i, iterations, num_partial_copies, max_levels, max_tex_side;
+ unsigned num_pass = 0, num_fail = 0;
+
+ max_levels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+ max_tex_side = 1 << (max_levels - 1);
+
+ /* Max 128 MB allowed for both textures. */
+ max_alloc_size = 128 * 1024 * 1024;
+
+ /* the seed for random test parameters */
+ srand(0x9b47d95b);
+ /* the seed for random pixel data */
+ s_rand_xorshift128plus(seed_xorshift128plus, false);
+
+ iterations = 1000000000; /* just kill it when you are bored */
+ num_partial_copies = 30;
+
+ /* These parameters are randomly generated per test:
+ * - whether to do one whole-surface copy or N partial copies per test
+ * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
+ * - which texture dimensions to use
+ * - whether to use VRAM (all tiling modes) and GTT (staging, linear
+ * only) allocations
+ * - random initial pixels in src
+ * - generate random subrectangle copies for partial blits
+ */
+ for (i = 0; i < iterations; i++) {
+ struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
+ struct si_texture *sdst;
+ struct si_texture *ssrc;
+ struct cpu_texture src_cpu, dst_cpu;
+ unsigned bpp, max_width, max_height, max_depth, j, num;
+ unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen;
+ unsigned max_tex_layers;
+ bool pass;
+ bool do_partial_copies = rand() & 1;
+
+ /* generate a random test case */
+ tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
+ tsrc.depth0 = tdst.depth0 = 1;
+
+ bpp = 1 << (rand() % 5);
+ tsrc.format = tdst.format = get_format_from_bpp(bpp);
+
+ max_tex_side_gen = generate_max_tex_side(max_tex_side);
+ max_tex_layers = rand() % 4 ? 1 : 5;
+
+ tsrc.width0 = (rand() % max_tex_side_gen) + 1;
+ tsrc.height0 = (rand() % max_tex_side_gen) + 1;
+ tsrc.array_size = (rand() % max_tex_layers) + 1;
+
+ /* Have a 1/4 chance of getting power-of-two dimensions. */
+ if (rand() % 4 == 0) {
+ tsrc.width0 = util_next_power_of_two(tsrc.width0);
+ tsrc.height0 = util_next_power_of_two(tsrc.height0);
+ }
+
+ if (!do_partial_copies) {
+ /* whole-surface copies only, same dimensions */
+ tdst = tsrc;
+ } else {
+ max_tex_side_gen = generate_max_tex_side(max_tex_side);
+ max_tex_layers = rand() % 4 ? 1 : 5;
+
+ /* many partial copies, dimensions can be different */
+ tdst.width0 = (rand() % max_tex_side_gen) + 1;
+ tdst.height0 = (rand() % max_tex_side_gen) + 1;
+ tdst.array_size = (rand() % max_tex_layers) + 1;
+
+ /* Have a 1/4 chance of getting power-of-two dimensions. */
+ if (rand() % 4 == 0) {
+ tdst.width0 = util_next_power_of_two(tdst.width0);
+ tdst.height0 = util_next_power_of_two(tdst.height0);
+ }
+ }
+
+ /* check texture sizes */
+ if ((uint64_t)tsrc.width0 * tsrc.height0 * tsrc.array_size * bpp +
+ (uint64_t)tdst.width0 * tdst.height0 * tdst.array_size * bpp >
+ max_alloc_size) {
+ /* too large, try again */
+ i--;
+ continue;
+ }
+
+ /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
+ * or GTT + linear only (1/4 of cases)
+ */
+ tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+ tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+
+ /* Allocate textures (both the GPU and CPU copies).
+ * The CPU will emulate what the GPU should be doing.
+ */
+ src = screen->resource_create(screen, &tsrc);
+ dst = screen->resource_create(screen, &tdst);
+ assert(src);
+ assert(dst);
+ sdst = (struct si_texture*)dst;
+ ssrc = (struct si_texture*)src;
+ alloc_cpu_texture(&src_cpu, &tsrc, bpp);
+ alloc_cpu_texture(&dst_cpu, &tdst, bpp);
+
+ printf("%4u: dst = (%5u x %5u x %u, %s), "
+ " src = (%5u x %5u x %u, %s), bpp = %2u, ",
+ i, tdst.width0, tdst.height0, tdst.array_size,
+ array_mode_to_string(sscreen, &sdst->surface),
+ tsrc.width0, tsrc.height0, tsrc.array_size,
+ array_mode_to_string(sscreen, &ssrc->surface), bpp);
+ fflush(stdout);
+
+ /* set src pixels */
+ set_random_pixels(ctx, src, &src_cpu);
+
+ /* clear dst pixels */
+ uint32_t zero = 0;
+ si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
+ SI_COHERENCY_SHADER);
+ memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
+
+ /* preparation */
+ max_width = MIN2(tsrc.width0, tdst.width0);
+ max_height = MIN2(tsrc.height0, tdst.height0);
+ max_depth = MIN2(tsrc.array_size, tdst.array_size);
+
+ num = do_partial_copies ? num_partial_copies : 1;
+ for (j = 0; j < num; j++) {
+ int width, height, depth;
+ int srcx, srcy, srcz, dstx, dsty, dstz;
+ struct pipe_box box;
+ unsigned old_num_draw_calls = sctx->num_draw_calls;
+ unsigned old_num_dma_calls = sctx->num_dma_calls;
+
+ if (!do_partial_copies) {
+ /* copy whole src to dst */
+ width = max_width;
+ height = max_height;
+ depth = max_depth;
+
+ srcx = srcy = srcz = dstx = dsty = dstz = 0;
+ } else {
+ /* random sub-rectangle copies from src to dst */
+ depth = (rand() % max_depth) + 1;
+ srcz = rand() % (tsrc.array_size - depth + 1);
+ dstz = rand() % (tdst.array_size - depth + 1);
+
+ /* special code path to hit the tiled partial copies */
+ if (!ssrc->surface.is_linear &&
+ !sdst->surface.is_linear &&
+ rand() & 1) {
+ if (max_width < 8 || max_height < 8)
+ continue;
+ width = ((rand() % (max_width / 8)) + 1) * 8;
+ height = ((rand() % (max_height / 8)) + 1) * 8;
+
+ srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
+ srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
+
+ dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
+ dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
+ } else {
+ /* just make sure that it doesn't divide by zero */
+ assert(max_width > 0 && max_height > 0);
+
+ width = (rand() % max_width) + 1;
+ height = (rand() % max_height) + 1;
+
+ srcx = rand() % (tsrc.width0 - width + 1);
+ srcy = rand() % (tsrc.height0 - height + 1);
+
+ dstx = rand() % (tdst.width0 - width + 1);
+ dsty = rand() % (tdst.height0 - height + 1);
+ }
+
+ /* special code path to hit out-of-bounds reads in L2T */
+ if (ssrc->surface.is_linear &&
+ !sdst->surface.is_linear &&
+ rand() % 4 == 0) {
+ srcx = 0;
+ srcy = 0;
+ srcz = 0;
+ }
+ }
+
+ /* GPU copy */
+ u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
+ sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
+
+ /* See which engine was used. */
+ gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
+ dma_blits += sctx->num_dma_calls > old_num_dma_calls;
+
+ /* CPU copy */
+ util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride,
+ dst_cpu.layer_stride,
+ dstx, dsty, dstz, width, height, depth,
+ src_cpu.ptr, src_cpu.stride,
+ src_cpu.layer_stride,
+ srcx, srcy, srcz);
+ }
+
+ pass = compare_textures(ctx, dst, &dst_cpu, bpp);
+ if (pass)
+ num_pass++;
+ else
+ num_fail++;
+
+ printf("BLITs: GFX = %2u, DMA = %2u, %s [%u/%u]\n",
+ gfx_blits, dma_blits, pass ? "pass" : "fail",
+ num_pass, num_pass+num_fail);
+
+ /* cleanup */
+ pipe_resource_reference(&src, NULL);
+ pipe_resource_reference(&dst, NULL);
+ free(src_cpu.ptr);
+ free(dst_cpu.ptr);
+ }
+
+ ctx->destroy(ctx);
+ exit(0);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c
new file mode 100644
index 000000000..6c04720e9
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file implements performance tests for the buffer clear and copy
+ * paths: CP DMA, SDMA, and compute shaders. */
+
+#include "si_pipe.h"
+#include "si_query.h"
+
+#define MIN_SIZE 512
+#define MAX_SIZE (128 * 1024 * 1024)
+#define SIZE_SHIFT 1
+#define NUM_RUNS 128
+
+static double get_MBps_rate(unsigned num_bytes, unsigned ns)
+{
+ return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+}
+
+void si_test_dma_perf(struct si_screen *sscreen)
+{
+ struct pipe_screen *screen = &sscreen->b;
+ struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+ struct si_context *sctx = (struct si_context*)ctx;
+ const uint32_t clear_value = 0x12345678;
+ static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+ static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+
+#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
+#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
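+/* With the lists above, NUM_SHADERS = 7 and NUM_METHODS = 4 + 2*7*6 = 88:
+ * three CP DMA variants and SDMA, plus one compute variant for every
+ * combination of dwords per thread, cache policy and waves-per-SH limit. */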
+
+ static const char *method_str[] = {
+ "CP MC ",
+ "CP L2 ",
+ "CP L2 ",
+ "SDMA ",
+ };
+ static const char *placement_str[] = {
+ /* Clear */
+ "fill->VRAM",
+ "fill->GTT ",
+ /* Copy */
+ "VRAM->VRAM",
+ "VRAM->GTT ",
+ "GTT ->VRAM",
+ };
+
+ printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+ printf("Heap ,Method ,L2p,Wa,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ if (size >= 1024)
+ printf("%6uKB,", size / 1024);
+ else
+ printf(" %6uB,", size);
+ }
+ printf("\n");
+
+ /* results[log2(size)][placement][method][] */
+ struct si_result {
+ bool is_valid;
+ bool is_cp;
+ bool is_sdma;
+ bool is_cs;
+ unsigned cache_policy;
+ unsigned dwords_per_thread;
+ unsigned waves_per_sh;
+ unsigned score;
+ unsigned index; /* index in results[x][y][index] */
+ } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+ /* Run benchmarks. */
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ bool is_copy = placement >= 2;
+
+ printf("-----------,--------,---,--,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+ printf("--------,");
+ printf("\n");
+
+ for (unsigned method = 0; method < NUM_METHODS; method++) {
+ bool test_cp = method <= 2;
+ bool test_sdma = method == 3;
+ bool test_cs = method >= 4;
+ unsigned cs_method = method - 4;
+ STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
+ unsigned cs_waves_per_sh =
+ test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
+ cs_method %= 2*NUM_SHADERS;
+ unsigned cache_policy = test_cp ? method % 3 :
+ test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
+ unsigned cs_dwords_per_thread =
+ test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+ if (sctx->chip_class == SI) {
+ /* SI doesn't support CP DMA operations through L2. */
+ if (test_cp && cache_policy != L2_BYPASS)
+ continue;
+ /* WAVES_PER_SH is in multiples of 16 on SI. */
+ if (test_cs && cs_waves_per_sh % 16 != 0)
+ continue;
+ }
+
+ printf("%s ,", placement_str[placement]);
+ if (test_cs) {
+ printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+ cache_policy == L2_LRU ? "LRU" :
+ cache_policy == L2_STREAM ? "Str" : "");
+ } else {
+ printf("%s,%3s,", method_str[method],
+ method == L2_LRU ? "LRU" :
+ method == L2_STREAM ? "Str" : "");
+ }
+ if (test_cs && cs_waves_per_sh)
+ printf("%2u,", cs_waves_per_sh);
+ else
+ printf(" ,");
+
+ double score = 0;
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Don't test bigger sizes if it's too slow. Print 0. */
+ if (size >= 512*1024 &&
+ score < 400 * (size / (4*1024*1024))) {
+ printf("%7.0f ,", 0.0);
+ continue;
+ }
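+				/* For example, the 4 MiB size only runs if the last
+				 * measured rate was at least 400 MB/s, and 128 MiB
+				 * requires 400 * 32 = 12800 MB/s. */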
+
+ enum pipe_resource_usage dst_usage, src_usage;
+ struct pipe_resource *dst, *src;
+ struct pipe_query *q[NUM_RUNS];
+ unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+ if (test_sdma) {
+ if (sctx->chip_class == SI)
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+ else
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+ }
+
+ if (placement == 0 || placement == 2 || placement == 4)
+ dst_usage = PIPE_USAGE_DEFAULT;
+ else
+ dst_usage = PIPE_USAGE_STREAM;
+
+ if (placement == 2 || placement == 3)
+ src_usage = PIPE_USAGE_DEFAULT;
+ else
+ src_usage = PIPE_USAGE_STREAM;
+
+ dst = pipe_buffer_create(screen, 0, dst_usage, size);
+ src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
+
+ /* Run tests. */
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ q[iter] = ctx->create_query(ctx, query_type, 0);
+ ctx->begin_query(ctx, q[iter]);
+
+ if (test_cp) {
+ /* CP DMA */
+ if (is_copy) {
+ si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
+ SI_COHERENCY_NONE, cache_policy);
+ } else {
+ si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
+ SI_COHERENCY_NONE, cache_policy);
+ }
+ } else if (test_sdma) {
+ /* SDMA */
+ if (is_copy) {
+ struct pipe_box box;
+ u_box_1d(0, size, &box);
+ sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
+ } else {
+ si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+ }
+ } else {
+ /* Compute */
+ /* The memory accesses are coalesced, meaning that the 1st instruction writes
+ * the 1st contiguous block of data for the whole wave, the 2nd instruction
+ * writes the 2nd contiguous block of data, etc.
+ */
+ unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+ unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+ unsigned dwords_per_wave = cs_dwords_per_thread * 64;
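+						/* For example, cs_dwords_per_thread = 16 yields
+						 * instructions_per_thread = 4, dwords_per_instruction = 4
+						 * and dwords_per_wave = 16 * 64 = 1024. */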
+
+ unsigned num_dwords = size / 4;
+ unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+ void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+ cache_policy == L2_STREAM, is_copy);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = MIN2(64, num_instructions);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_size = size;
+
+ if (is_copy) {
+ sb[1].buffer = src;
+ sb[1].buffer_size = size;
+ } else {
+ for (unsigned i = 0; i < 4; i++)
+ sctx->cs_user_data[i] = clear_value;
+ }
+
+ sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_SMEM_L1;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
+ ctx->bind_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+ ctx->launch_grid(ctx, &info);
+
+ ctx->bind_compute_state(ctx, NULL);
+ ctx->delete_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* Flush L2, so that we don't just test L2 cache performance. */
+ if (!test_sdma) {
+ sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+ si_emit_cache_flush(sctx);
+ }
+
+ ctx->end_query(ctx, q[iter]);
+ ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+ }
+ pipe_resource_reference(&dst, NULL);
+ pipe_resource_reference(&src, NULL);
+
+ /* Get results. */
+ uint64_t min = ~0ull, max = 0, total = 0;
+
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ union pipe_query_result result;
+
+ ctx->get_query_result(ctx, q[iter], true, &result);
+ ctx->destroy_query(ctx, q[iter]);
+
+ min = MIN2(min, result.u64);
+ max = MAX2(max, result.u64);
+ total += result.u64;
+ }
+
+ score = get_MBps_rate(size, total / (double)NUM_RUNS);
+ printf("%7.0f ,", score);
+ fflush(stdout);
+
+ struct si_result *r = &results[util_logbase2(size)][placement][method];
+ r->is_valid = true;
+ r->is_cp = test_cp;
+ r->is_sdma = test_sdma;
+ r->is_cs = test_cs;
+ r->cache_policy = cache_policy;
+ r->dwords_per_thread = cs_dwords_per_thread;
+ r->waves_per_sh = cs_waves_per_sh;
+ r->score = score;
+ r->index = method;
+ }
+ puts("");
+ }
+ }
+
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
+ sctx->screen->info.name);
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+
+ /* Analyze results and find the best methods. */
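+	/* The loop below emits C source of this general shape (the thresholds
+	 * and methods here are placeholders; the real ones come from the
+	 * measurements above):
+	 *
+	 *   if (size <=    786432)   return CP_DMA(L2_STREAM);
+	 *   else                     return COMPUTE(L2_STREAM, 16, 8);
+	 */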
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ if (placement == 0)
+ puts(" if (dst == RADEON_DOMAIN_VRAM) {");
+ else if (placement == 1)
+ puts(" } else { /* GTT */");
+ else if (placement == 2) {
+ puts("}");
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+ sctx->screen->info.name);
+ printf(" uint64_t size64, bool async, bool cached)\n");
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+ puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+ } else if (placement == 3)
+ puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+ else
+ puts(" } else { /* GTT -> VRAM */");
+
+ for (unsigned mode = 0; mode < 3; mode++) {
+ bool async = mode == 0;
+ bool cached = mode == 1;
+
+ if (async)
+ puts(" if (async) { /* SDMA or async compute */");
+ else if (cached)
+ puts(" if (cached) { /* gfx ring */");
+ else
+ puts(" } else { /* gfx ring - uncached */");
+
+ /* The list of best chosen methods. */
+ struct si_result *methods[32];
+ unsigned method_max_size[32];
+ unsigned num_methods = 0;
+
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Find the best method. */
+ struct si_result *best = NULL;
+
+ for (unsigned i = 0; i < NUM_METHODS; i++) {
+ struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+ if (!r->is_valid)
+ continue;
+
+					/* Ban CP DMA clears via MC on <= VI. They are super slow
+					 * on GTT, which buffers can end up in after BO evictions.
+					 */
+ if (sctx->chip_class <= VI && placement == 1 &&
+ r->is_cp && r->cache_policy == L2_BYPASS)
+ continue;
+
+ if (async) {
+ /* The following constraints for compute IBs try to limit
+ * resource usage so as not to decrease the performance
+ * of gfx IBs too much.
+ */
+
+ /* Don't use CP DMA on asynchronous rings, because
+ * the engine is shared with gfx IBs.
+ */
+ if (r->is_cp)
+ continue;
+
+ /* Don't use L2 caching on asynchronous rings to minimize
+ * L2 usage.
+ */
+ if (r->cache_policy == L2_LRU)
+ continue;
+
+						/* Require waves_per_sh != 0 for asynchronous compute
+						 * to limit CU usage. */
+ if (r->is_cs && r->waves_per_sh == 0)
+ continue;
+ } else {
+ /* SDMA is always asynchronous */
+ if (r->is_sdma)
+ continue;
+
+ if (cached && r->cache_policy == L2_BYPASS)
+ continue;
+ if (!cached && r->cache_policy == L2_LRU)
+ continue;
+ }
+
+ if (!best) {
+ best = r;
+ continue;
+ }
+
+					/* Assume some measurement error. Earlier methods occupy fewer
+					 * resources, so each later method is greedier, and we don't
+					 * want to select it merely because of measurement noise.
+					 * With min_improvement = 1.03, a greedier method must be at
+					 * least 3% faster than the current best to replace it.
+					 */
+ double min_improvement = 1.03;
+
+ if (best->score * min_improvement < r->score)
+ best = r;
+ }
+
+ if (num_methods > 0) {
+ unsigned prev_index = num_methods - 1;
+ struct si_result *prev = methods[prev_index];
+ struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
+
+ /* If the best one is also the best for the previous size,
+ * just bump the size for the previous one.
+ *
+ * If there is no best, it means all methods were too slow
+ * for this size and were not tested. Use the best one for
+ * the previous size.
+ */
+ if (!best ||
+ /* If it's the same method as for the previous size: */
+ (prev->is_cp == best->is_cp &&
+ prev->is_sdma == best->is_sdma &&
+ prev->is_cs == best->is_cs &&
+ prev->cache_policy == best->cache_policy &&
+ prev->dwords_per_thread == best->dwords_per_thread &&
+ prev->waves_per_sh == best->waves_per_sh) ||
+ /* If the method for the previous size is also the best
+ * for this size: */
+ (prev_this_size->is_valid &&
+ prev_this_size->score * 1.03 > best->score)) {
+ method_max_size[prev_index] = size;
+ continue;
+ }
+ }
+
+ /* Add it to the list. */
+ assert(num_methods < ARRAY_SIZE(methods));
+ methods[num_methods] = best;
+ method_max_size[num_methods] = size;
+ num_methods++;
+ }
+
+ for (unsigned i = 0; i < num_methods; i++) {
+ struct si_result *best = methods[i];
+ unsigned size = method_max_size[i];
+
+ /* The size threshold is between the current benchmarked
+ * size and the next benchmarked size. */
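+				/* E.g. with SIZE_SHIFT = 1, a method that is best up to
+				 * the 1 MiB test gets a threshold of (1 MiB + 2 MiB) / 2. */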
+ if (i < num_methods - 1)
+ printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+ else if (i > 0)
+ printf(" else ");
+ else
+ printf(" ");
+ printf("return ");
+
+ assert(best);
+ if (best->is_cp) {
+ printf("CP_DMA(%s);\n",
+ best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+ best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
+ }
+ if (best->is_sdma)
+ printf("SDMA;\n");
+ if (best->is_cs) {
+ printf("COMPUTE(%s, %u, %u);\n",
+ best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
+ best->dwords_per_thread,
+ best->waves_per_sh);
+ }
+ }
+ }
+ puts(" }");
+ }
+ puts(" }");
+ puts("}");
+
+ ctx->destroy(ctx);
+ exit(0);
+}
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c b/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c
new file mode 100644
index 000000000..2fb79253a
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_texture.c
@@ -0,0 +1,2424 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "radeonsi/si_pipe.h"
+#include "radeonsi/si_query.h"
+#include "util/u_format.h"
+#include "util/u_log.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "util/u_resource.h"
+#include "util/u_surface.h"
+#include "util/u_transfer.h"
+#include "util/os_time.h"
+#include <errno.h>
+#include <inttypes.h>
+#include "state_tracker/drm_driver.h"
+#include "amd/common/sid.h"
+
+static enum radeon_surf_mode
+si_choose_tiling(struct si_screen *sscreen,
+ const struct pipe_resource *templ, bool tc_compatible_htile);
+
+
+bool si_prepare_for_dma_blit(struct si_context *sctx,
+ struct si_texture *dst,
+ unsigned dst_level, unsigned dstx,
+ unsigned dsty, unsigned dstz,
+ struct si_texture *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ if (!sctx->dma_cs)
+ return false;
+
+ if (dst->surface.bpe != src->surface.bpe)
+ return false;
+
+	/* MSAA: multisampled blits aren't handled by the SDMA path. */
+ if (src->buffer.b.b.nr_samples > 1 ||
+ dst->buffer.b.b.nr_samples > 1)
+ return false;
+
+ /* Depth-stencil surfaces:
+ * When dst is linear, the DB->CB copy preserves HTILE.
+ * When dst is tiled, the 3D path must be used to update HTILE.
+ */
+ if (src->is_depth || dst->is_depth)
+ return false;
+
+ /* DCC as:
+ * src: Use the 3D path. DCC decompression is expensive.
+ * dst: Use the 3D path to compress the pixels with DCC.
+ */
+ if (vi_dcc_enabled(src, src_level) ||
+ vi_dcc_enabled(dst, dst_level))
+ return false;
+
+ /* CMASK as:
+ * src: Both texture and SDMA paths need decompression. Use SDMA.
+ * dst: If overwriting the whole texture, discard CMASK and use
+ * SDMA. Otherwise, use the 3D path.
+ */
+ if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
+ /* The CMASK clear is only enabled for the first level. */
+ assert(dst_level == 0);
+ if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level,
+ dstx, dsty, dstz, src_box->width,
+ src_box->height, src_box->depth))
+ return false;
+
+ si_texture_discard_cmask(sctx->screen, dst);
+ }
+
+ /* All requirements are met. Prepare textures for SDMA. */
+ if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
+ sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
+
+ assert(!(src->dirty_level_mask & (1 << src_level)));
+ assert(!(dst->dirty_level_mask & (1 << dst_level)));
+
+ return true;
+}
+
+/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
+static void si_copy_region_with_blit(struct pipe_context *pipe,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct pipe_blit_info blit;
+
+ memset(&blit, 0, sizeof(blit));
+ blit.src.resource = src;
+ blit.src.format = src->format;
+ blit.src.level = src_level;
+ blit.src.box = *src_box;
+ blit.dst.resource = dst;
+ blit.dst.format = dst->format;
+ blit.dst.level = dst_level;
+ blit.dst.box.x = dstx;
+ blit.dst.box.y = dsty;
+ blit.dst.box.z = dstz;
+ blit.dst.box.width = src_box->width;
+ blit.dst.box.height = src_box->height;
+ blit.dst.box.depth = src_box->depth;
+ blit.mask = util_format_get_mask(src->format) &
+ util_format_get_mask(dst->format);
+ blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+ if (blit.mask) {
+ pipe->blit(pipe, &blit);
+ }
+}
+
+/* Copy from a full GPU texture to a transfer's staging one. */
+static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
+ struct pipe_resource *dst = &stransfer->staging->b.b;
+ struct pipe_resource *src = transfer->resource;
+
+ if (src->nr_samples > 1) {
+ si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
+ src, transfer->level, &transfer->box);
+ return;
+ }
+
+ sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
+ &transfer->box);
+}
+
+/* Copy from a transfer's staging texture to a full GPU one. */
+static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
+ struct pipe_resource *dst = transfer->resource;
+ struct pipe_resource *src = &stransfer->staging->b.b;
+ struct pipe_box sbox;
+
+ u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
+
+ if (dst->nr_samples > 1) {
+ si_copy_region_with_blit(ctx, dst, transfer->level,
+ transfer->box.x, transfer->box.y, transfer->box.z,
+ src, 0, &sbox);
+ return;
+ }
+
+ sctx->dma_copy(ctx, dst, transfer->level,
+ transfer->box.x, transfer->box.y, transfer->box.z,
+ src, 0, &sbox);
+}
+
+static unsigned si_texture_get_offset(struct si_screen *sscreen,
+ struct si_texture *tex, unsigned level,
+ const struct pipe_box *box,
+ unsigned *stride,
+ unsigned *layer_stride)
+{
+ if (sscreen->info.chip_class >= GFX9) {
+ *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+ *layer_stride = tex->surface.u.gfx9.surf_slice_size;
+
+ if (!box)
+ return 0;
+
+ /* Each texture is an array of slices. Each slice is an array
+ * of mipmap levels. */
+ return box->z * tex->surface.u.gfx9.surf_slice_size +
+ tex->surface.u.gfx9.offset[level] +
+ (box->y / tex->surface.blk_h *
+ tex->surface.u.gfx9.surf_pitch +
+ box->x / tex->surface.blk_w) * tex->surface.bpe;
+ } else {
+ *stride = tex->surface.u.legacy.level[level].nblk_x *
+ tex->surface.bpe;
+ assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
+ *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
+
+ if (!box)
+ return tex->surface.u.legacy.level[level].offset;
+
+ /* Each texture is an array of mipmap levels. Each level is
+ * an array of slices. */
+ return tex->surface.u.legacy.level[level].offset +
+ box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
+ (box->y / tex->surface.blk_h *
+ tex->surface.u.legacy.level[level].nblk_x +
+ box->x / tex->surface.blk_w) * tex->surface.bpe;
+ }
+}
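+
+/* A worked example of the legacy offset math above, with made-up values:
+ * for bpe = 4, blk_w = blk_h = 1, nblk_x = 256 and slice_size_dw = 65536,
+ * the box origin (x, y, z) = (8, 4, 2) starts at
+ * level[level].offset + 2 * 65536 * 4 + (4 * 256 + 8) * 4 bytes. */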
+
+static int si_init_surface(struct si_screen *sscreen,
+ struct radeon_surf *surface,
+ const struct pipe_resource *ptex,
+ enum radeon_surf_mode array_mode,
+ unsigned pitch_in_bytes_override,
+ unsigned offset,
+ bool is_imported,
+ bool is_scanout,
+ bool is_flushed_depth,
+ bool tc_compatible_htile)
+{
+ const struct util_format_description *desc =
+ util_format_description(ptex->format);
+ bool is_depth, is_stencil;
+ int r;
+ unsigned i, bpe, flags = 0;
+
+ is_depth = util_format_has_depth(desc);
+ is_stencil = util_format_has_stencil(desc);
+
+ if (!is_flushed_depth &&
+ ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+ bpe = 4; /* stencil is allocated separately */
+ } else {
+ bpe = util_format_get_blocksize(ptex->format);
+ assert(util_is_power_of_two_or_zero(bpe));
+ }
+
+ if (!is_flushed_depth && is_depth) {
+ flags |= RADEON_SURF_ZBUFFER;
+
+ if (tc_compatible_htile &&
+ (sscreen->info.chip_class >= GFX9 ||
+ array_mode == RADEON_SURF_MODE_2D)) {
+ /* TC-compatible HTILE only supports Z32_FLOAT.
+ * GFX9 also supports Z16_UNORM.
+ * On VI, promote Z16 to Z32. DB->CB copies will convert
+ * the format for transfers.
+ */
+ if (sscreen->info.chip_class == VI)
+ bpe = 4;
+
+ flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+ }
+
+ if (is_stencil)
+ flags |= RADEON_SURF_SBUFFER;
+ }
+
+ if (sscreen->info.chip_class >= VI &&
+ (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC ||
+ ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
+ (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+ if (sscreen->info.family == CHIP_STONEY &&
+ bpe == 16 && ptex->nr_samples >= 2)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* VI: DCC clear for 4x and 8x MSAA array textures unimplemented. */
+ if (sscreen->info.chip_class == VI &&
+ ptex->nr_storage_samples >= 4 &&
+ ptex->array_size > 1)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
+ if (sscreen->info.chip_class >= GFX9 &&
+ ptex->nr_storage_samples >= 4)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) {
+ /* This should catch bugs in gallium users setting incorrect flags. */
+ assert(ptex->nr_samples <= 1 &&
+ ptex->array_size == 1 &&
+ ptex->depth0 == 1 &&
+ ptex->last_level == 0 &&
+ !(flags & RADEON_SURF_Z_OR_SBUFFER));
+
+ flags |= RADEON_SURF_SCANOUT;
+ }
+
+ if (ptex->bind & PIPE_BIND_SHARED)
+ flags |= RADEON_SURF_SHAREABLE;
+ if (is_imported)
+ flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
+ if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_TILING))
+ flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
+
+ r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe,
+ array_mode, surface);
+ if (r) {
+ return r;
+ }
+
+ unsigned pitch = pitch_in_bytes_override / bpe;
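+	/* E.g. an 8192-byte pitch override with bpe = 4 gives a 2048-texel pitch. */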
+
+ if (sscreen->info.chip_class >= GFX9) {
+ if (pitch) {
+ surface->u.gfx9.surf_pitch = pitch;
+ surface->u.gfx9.surf_slice_size =
+ (uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
+ }
+ surface->u.gfx9.surf_offset = offset;
+ } else {
+ if (pitch) {
+ surface->u.legacy.level[0].nblk_x = pitch;
+ surface->u.legacy.level[0].slice_size_dw =
+ ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
+ }
+ if (offset) {
+ for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
+ surface->u.legacy.level[i].offset += offset;
+ }
+ }
+ return 0;
+}
+
+static void si_texture_init_metadata(struct si_screen *sscreen,
+ struct si_texture *tex,
+ struct radeon_bo_metadata *metadata)
+{
+ struct radeon_surf *surface = &tex->surface;
+
+ memset(metadata, 0, sizeof(*metadata));
+
+ if (sscreen->info.chip_class >= GFX9) {
+ metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+ } else {
+ metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config;
+ metadata->u.legacy.bankw = surface->u.legacy.bankw;
+ metadata->u.legacy.bankh = surface->u.legacy.bankh;
+ metadata->u.legacy.tile_split = surface->u.legacy.tile_split;
+ metadata->u.legacy.mtilea = surface->u.legacy.mtilea;
+ metadata->u.legacy.num_banks = surface->u.legacy.num_banks;
+ metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+ metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+ }
+}
+
+static void si_surface_import_metadata(struct si_screen *sscreen,
+ struct radeon_surf *surf,
+ struct radeon_bo_metadata *metadata,
+ enum radeon_surf_mode *array_mode,
+ bool *is_scanout)
+{
+ if (sscreen->info.chip_class >= GFX9) {
+ if (metadata->u.gfx9.swizzle_mode > 0)
+ *array_mode = RADEON_SURF_MODE_2D;
+ else
+ *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ *is_scanout = metadata->u.gfx9.swizzle_mode == 0 ||
+ metadata->u.gfx9.swizzle_mode % 4 == 2;
+
+ surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
+ } else {
+ surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
+ surf->u.legacy.bankw = metadata->u.legacy.bankw;
+ surf->u.legacy.bankh = metadata->u.legacy.bankh;
+ surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
+ surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
+ surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
+
+ if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
+ *array_mode = RADEON_SURF_MODE_2D;
+ else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
+ *array_mode = RADEON_SURF_MODE_1D;
+ else
+ *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ *is_scanout = metadata->u.legacy.scanout;
+ }
+}
+
+void si_eliminate_fast_color_clear(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ struct si_screen *sscreen = sctx->screen;
+ struct pipe_context *ctx = &sctx->b;
+
+ if (ctx == sscreen->aux_context)
+ mtx_lock(&sscreen->aux_context_lock);
+
+ unsigned n = sctx->num_decompress_calls;
+ ctx->flush_resource(ctx, &tex->buffer.b.b);
+
+ /* Flush only if any fast clear elimination took place. */
+ if (n != sctx->num_decompress_calls)
+ ctx->flush(ctx, NULL, 0);
+
+ if (ctx == sscreen->aux_context)
+ mtx_unlock(&sscreen->aux_context_lock);
+}
+
+void si_texture_discard_cmask(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ if (!tex->cmask_buffer)
+ return;
+
+ assert(tex->buffer.b.b.nr_samples <= 1);
+
+ /* Disable CMASK. */
+ tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
+ tex->dirty_level_mask = 0;
+
+ tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
+
+ if (tex->cmask_buffer != &tex->buffer)
+ r600_resource_reference(&tex->cmask_buffer, NULL);
+
+ tex->cmask_buffer = NULL;
+
+ /* Notify all contexts about the change. */
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+ p_atomic_inc(&sscreen->compressed_colortex_counter);
+}
+
+static bool si_can_disable_dcc(struct si_texture *tex)
+{
+ /* We can't disable DCC if it can be written by another process. */
+ return tex->dcc_offset &&
+ (!tex->buffer.b.is_shared ||
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
+}
+
+static bool si_texture_discard_dcc(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ if (!si_can_disable_dcc(tex))
+ return false;
+
+ assert(tex->dcc_separate_buffer == NULL);
+
+ /* Disable DCC. */
+ tex->dcc_offset = 0;
+
+ /* Notify all contexts about the change. */
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+ return true;
+}
+
+/**
+ * Disable DCC for the texture. (first decompress, then discard metadata).
+ *
+ * There is an unresolved multi-context synchronization issue between
+ * screen::aux_context and the current context. If applications do this with
+ * multiple contexts, it's already undefined behavior for them and we don't
+ * have to worry about that. The scenario is:
+ *
+ * If context 1 disables DCC and context 2 has queued commands that write
+ * to the texture via CB with DCC enabled, and the order of operations is
+ * as follows:
+ * context 2 queues draw calls rendering to the texture, but doesn't flush
+ * context 1 disables DCC and flushes
+ * context 1 & 2 reset descriptors and FB state
+ * context 2 flushes (new compressed tiles written by the draw calls)
+ * context 1 & 2 read garbage, because DCC is disabled, yet there are
+ *   compressed tiles
+ *
+ * \param sctx  the current context if you have one, or sscreen->aux_context
+ * if you don't.
+ */
+bool si_texture_disable_dcc(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ struct si_screen *sscreen = sctx->screen;
+
+ if (!si_can_disable_dcc(tex))
+ return false;
+
+ if (&sctx->b == sscreen->aux_context)
+ mtx_lock(&sscreen->aux_context_lock);
+
+ /* Decompress DCC. */
+ si_decompress_dcc(sctx, tex);
+ sctx->b.flush(&sctx->b, NULL, 0);
+
+ if (&sctx->b == sscreen->aux_context)
+ mtx_unlock(&sscreen->aux_context_lock);
+
+ return si_texture_discard_dcc(sscreen, tex);
+}
+
+static void si_reallocate_texture_inplace(struct si_context *sctx,
+ struct si_texture *tex,
+ unsigned new_bind_flag,
+ bool invalidate_storage)
+{
+ struct pipe_screen *screen = sctx->b.screen;
+ struct si_texture *new_tex;
+ struct pipe_resource templ = tex->buffer.b.b;
+ unsigned i;
+
+ templ.bind |= new_bind_flag;
+
+ if (tex->buffer.b.is_shared)
+ return;
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ if (tex->surface.is_linear)
+ return;
+
+ /* This fails with MSAA, depth, and compressed textures. */
+ if (si_choose_tiling(sctx->screen, &templ, false) !=
+ RADEON_SURF_MODE_LINEAR_ALIGNED)
+ return;
+ }
+
+ new_tex = (struct si_texture*)screen->resource_create(screen, &templ);
+ if (!new_tex)
+ return;
+
+ /* Copy the pixels to the new texture. */
+ if (!invalidate_storage) {
+ for (i = 0; i <= templ.last_level; i++) {
+ struct pipe_box box;
+
+ u_box_3d(0, 0, 0,
+ u_minify(templ.width0, i), u_minify(templ.height0, i),
+ util_num_layers(&templ, i), &box);
+
+ sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0,
+ &tex->buffer.b.b, i, &box);
+ }
+ }
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ si_texture_discard_cmask(sctx->screen, tex);
+ si_texture_discard_dcc(sctx->screen, tex);
+ }
+
+ /* Replace the structure fields of tex. */
+ tex->buffer.b.b.bind = templ.bind;
+ pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
+ tex->buffer.gpu_address = new_tex->buffer.gpu_address;
+ tex->buffer.vram_usage = new_tex->buffer.vram_usage;
+ tex->buffer.gart_usage = new_tex->buffer.gart_usage;
+ tex->buffer.bo_size = new_tex->buffer.bo_size;
+ tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
+ tex->buffer.domains = new_tex->buffer.domains;
+ tex->buffer.flags = new_tex->buffer.flags;
+
+ tex->surface = new_tex->surface;
+ tex->size = new_tex->size;
+ si_texture_reference(&tex->flushed_depth_texture,
+ new_tex->flushed_depth_texture);
+
+ tex->fmask_offset = new_tex->fmask_offset;
+ tex->cmask_offset = new_tex->cmask_offset;
+ tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
+
+ if (tex->cmask_buffer == &tex->buffer)
+ tex->cmask_buffer = NULL;
+ else
+ r600_resource_reference(&tex->cmask_buffer, NULL);
+
+ if (new_tex->cmask_buffer == &new_tex->buffer)
+ tex->cmask_buffer = &tex->buffer;
+ else
+ r600_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
+
+ tex->dcc_offset = new_tex->dcc_offset;
+ tex->cb_color_info = new_tex->cb_color_info;
+ memcpy(tex->color_clear_value, new_tex->color_clear_value,
+ sizeof(tex->color_clear_value));
+ tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+
+ tex->htile_offset = new_tex->htile_offset;
+ tex->depth_clear_value = new_tex->depth_clear_value;
+ tex->dirty_level_mask = new_tex->dirty_level_mask;
+ tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
+ tex->db_render_format = new_tex->db_render_format;
+ tex->stencil_clear_value = new_tex->stencil_clear_value;
+ tex->tc_compatible_htile = new_tex->tc_compatible_htile;
+ tex->depth_cleared = new_tex->depth_cleared;
+ tex->stencil_cleared = new_tex->stencil_cleared;
+ tex->upgraded_depth = new_tex->upgraded_depth;
+ tex->db_compatible = new_tex->db_compatible;
+ tex->can_sample_z = new_tex->can_sample_z;
+ tex->can_sample_s = new_tex->can_sample_s;
+
+ tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
+ tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+ r600_resource_reference(&tex->dcc_separate_buffer,
+ new_tex->dcc_separate_buffer);
+ r600_resource_reference(&tex->last_dcc_separate_buffer,
+ new_tex->last_dcc_separate_buffer);
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ assert(!tex->htile_offset);
+ assert(!tex->cmask_buffer);
+ assert(!tex->surface.fmask_size);
+ assert(!tex->dcc_offset);
+ assert(!tex->is_depth);
+ }
+
+ si_texture_reference(&new_tex, NULL);
+
+ p_atomic_inc(&sctx->screen->dirty_tex_counter);
+}
+
+static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen)
+{
+ return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id;
+}
+
+static void si_query_opaque_metadata(struct si_screen *sscreen,
+ struct si_texture *tex,
+ struct radeon_bo_metadata *md)
+{
+ struct pipe_resource *res = &tex->buffer.b.b;
+ static const unsigned char swizzle[] = {
+ PIPE_SWIZZLE_X,
+ PIPE_SWIZZLE_Y,
+ PIPE_SWIZZLE_Z,
+ PIPE_SWIZZLE_W
+ };
+ uint32_t desc[8], i;
+ bool is_array = util_texture_is_array(res->target);
+
+ if (!sscreen->info.has_bo_metadata)
+ return;
+
+ assert(tex->dcc_separate_buffer == NULL);
+ assert(tex->surface.fmask_size == 0);
+
+	/* Metadata image format version 1:
+ * [0] = 1 (metadata format identifier)
+ * [1] = (VENDOR_ID << 16) | PCI_ID
+ * [2:9] = image descriptor for the whole resource
+ * [2] is always 0, because the base address is cleared
+ * [9] is the DCC offset bits [39:8] from the beginning of
+ * the buffer
+ * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
+ */
+
+ md->metadata[0] = 1; /* metadata image format version 1 */
+
+ /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
+ md->metadata[1] = si_get_bo_metadata_word1(sscreen);
+
+ si_make_texture_descriptor(sscreen, tex, true,
+ res->target, res->format,
+ swizzle, 0, res->last_level, 0,
+ is_array ? res->array_size - 1 : 0,
+ res->width0, res->height0, res->depth0,
+ desc, NULL);
+
+ si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0],
+ 0, 0, tex->surface.blk_w, false, desc);
+
+ /* Clear the base address and set the relative DCC offset. */
+ desc[0] = 0;
+ desc[1] &= C_008F14_BASE_ADDRESS_HI;
+ desc[7] = tex->dcc_offset >> 8;
+
+ /* Dwords [2:9] contain the image descriptor. */
+ memcpy(&md->metadata[2], desc, sizeof(desc));
+ md->size_metadata = 10 * 4;
+
+ /* Dwords [10:..] contain the mipmap level offsets. */
+ if (sscreen->info.chip_class <= VI) {
+ for (i = 0; i <= res->last_level; i++)
+ md->metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8;
+
+ md->size_metadata += (1 + res->last_level) * 4;
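+		/* E.g. a texture with last_level = 2 ends up with
+		 * size_metadata = 10 * 4 + 3 * 4 = 52 bytes. */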
+ }
+}
+
+static void si_apply_opaque_metadata(struct si_screen *sscreen,
+ struct si_texture *tex,
+ struct radeon_bo_metadata *md)
+{
+ uint32_t *desc = &md->metadata[2];
+
+ if (sscreen->info.chip_class < VI)
+ return;
+
+ /* Return if DCC is enabled. The texture should be set up with it
+ * already.
+ */
+ if (md->size_metadata >= 10 * 4 && /* at least 2(header) + 8(desc) dwords */
+ md->metadata[0] != 0 &&
+ md->metadata[1] == si_get_bo_metadata_word1(sscreen) &&
+ G_008F28_COMPRESSION_EN(desc[6])) {
+ tex->dcc_offset = (uint64_t)desc[7] << 8;
+ return;
+ }
+
+ /* Disable DCC. These are always set by texture_from_handle and must
+ * be cleared here.
+ */
+ tex->dcc_offset = 0;
+}
+
+static boolean si_texture_get_handle(struct pipe_screen* screen,
+ struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ struct winsys_handle *whandle,
+ unsigned usage)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct si_context *sctx;
+ struct r600_resource *res = r600_resource(resource);
+ struct si_texture *tex = (struct si_texture*)resource;
+ struct radeon_bo_metadata metadata;
+ bool update_metadata = false;
+ unsigned stride, offset, slice_size;
+ bool flush = false;
+
+ ctx = threaded_context_unwrap_sync(ctx);
+ sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context);
+
+ if (resource->target != PIPE_BUFFER) {
+ /* This is not supported now, but it might be required for OpenCL
+ * interop in the future.
+ */
+ if (resource->nr_samples > 1 || tex->is_depth)
+ return false;
+
+ /* Move a suballocated texture into a non-suballocated allocation. */
+ if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+ tex->surface.tile_swizzle ||
+ (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+ sscreen->info.has_local_buffers &&
+ whandle->type != WINSYS_HANDLE_TYPE_KMS)) {
+ assert(!res->b.is_shared);
+ si_reallocate_texture_inplace(sctx, tex,
+ PIPE_BIND_SHARED, false);
+ flush = true;
+ assert(res->b.b.bind & PIPE_BIND_SHARED);
+ assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+ assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
+ assert(tex->surface.tile_swizzle == 0);
+ }
+
+ /* Since shader image stores don't support DCC on VI,
+ * disable it for external clients that want write
+ * access.
+ */
+ if (usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->dcc_offset) {
+ if (si_texture_disable_dcc(sctx, tex)) {
+ update_metadata = true;
+ /* si_texture_disable_dcc flushes the context */
+ flush = false;
+ }
+ }
+
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+ (tex->cmask_buffer || tex->dcc_offset)) {
+ /* Eliminate fast clear (both CMASK and DCC) */
+ si_eliminate_fast_color_clear(sctx, tex);
+ /* eliminate_fast_color_clear flushes the context */
+ flush = false;
+
+ /* Disable CMASK if flush_resource isn't going
+ * to be called.
+ */
+ if (tex->cmask_buffer)
+ si_texture_discard_cmask(sscreen, tex);
+ }
+
+ /* Set metadata. */
+ if (!res->b.is_shared || update_metadata) {
+ si_texture_init_metadata(sscreen, tex, &metadata);
+ si_query_opaque_metadata(sscreen, tex, &metadata);
+
+ sscreen->ws->buffer_set_metadata(res->buf, &metadata);
+ }
+
+ if (sscreen->info.chip_class >= GFX9) {
+ offset = tex->surface.u.gfx9.surf_offset;
+ stride = tex->surface.u.gfx9.surf_pitch *
+ tex->surface.bpe;
+ slice_size = tex->surface.u.gfx9.surf_slice_size;
+ } else {
+ offset = tex->surface.u.legacy.level[0].offset;
+ stride = tex->surface.u.legacy.level[0].nblk_x *
+ tex->surface.bpe;
+ slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+ }
+ } else {
+ /* Buffer exports are for the OpenCL interop. */
+ /* Move a suballocated buffer into a non-suballocated allocation. */
+ if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+ /* A DMABUF export always fails if the BO is local. */
+ (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+ sscreen->info.has_local_buffers)) {
+ assert(!res->b.is_shared);
+
+ /* Allocate a new buffer with PIPE_BIND_SHARED. */
+ struct pipe_resource templ = res->b.b;
+ templ.bind |= PIPE_BIND_SHARED;
+
+ struct pipe_resource *newb =
+ screen->resource_create(screen, &templ);
+ if (!newb)
+ return false;
+
+ /* Copy the old buffer contents to the new one. */
+ struct pipe_box box;
+ u_box_1d(0, newb->width0, &box);
+ sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0,
+ &res->b.b, 0, &box);
+ flush = true;
+ /* Move the new buffer storage to the old pipe_resource. */
+ si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
+ pipe_resource_reference(&newb, NULL);
+
+ assert(res->b.b.bind & PIPE_BIND_SHARED);
+ assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+ }
+
+ /* Buffers */
+ offset = 0;
+ stride = 0;
+ slice_size = 0;
+ }
+
+ if (flush)
+ sctx->b.flush(&sctx->b, NULL, 0);
+
+ if (res->b.is_shared) {
+ /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+ * doesn't set it.
+ */
+ res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ } else {
+ res->b.is_shared = true;
+ res->external_usage = usage;
+ }
+
+ return sscreen->ws->buffer_get_handle(res->buf, stride, offset,
+ slice_size, whandle);
+}
+
+static void si_texture_destroy(struct pipe_screen *screen,
+ struct pipe_resource *ptex)
+{
+ struct si_texture *tex = (struct si_texture*)ptex;
+ struct r600_resource *resource = &tex->buffer;
+
+ si_texture_reference(&tex->flushed_depth_texture, NULL);
+
+ if (tex->cmask_buffer != &tex->buffer) {
+ r600_resource_reference(&tex->cmask_buffer, NULL);
+ }
+ pb_reference(&resource->buf, NULL);
+ r600_resource_reference(&tex->dcc_separate_buffer, NULL);
+ r600_resource_reference(&tex->last_dcc_separate_buffer, NULL);
+ FREE(tex);
+}
+
+static const struct u_resource_vtbl si_texture_vtbl;
+
+static void si_texture_get_htile_size(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ unsigned cl_width, cl_height, width, height;
+ unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
+ unsigned num_pipes = sscreen->info.num_tile_pipes;
+
+ assert(sscreen->info.chip_class <= VI);
+
+ tex->surface.htile_size = 0;
+
+ if (tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+ !sscreen->info.htile_cmask_support_1d_tiling)
+ return;
+
+ /* Overalign HTILE on P2 configs to work around GPU hangs in
+ * piglit/depthstencil-render-miplevels 585.
+ *
+ * This has been confirmed to help Kabini & Stoney, where the hangs
+ * are always reproducible. I think I have seen the test hang
+ * on Carrizo too, though it was very rare there.
+ */
+ if (sscreen->info.chip_class >= CIK && num_pipes < 4)
+ num_pipes = 4;
+
+ switch (num_pipes) {
+ case 1:
+ cl_width = 32;
+ cl_height = 16;
+ break;
+ case 2:
+ cl_width = 32;
+ cl_height = 32;
+ break;
+ case 4:
+ cl_width = 64;
+ cl_height = 32;
+ break;
+ case 8:
+ cl_width = 64;
+ cl_height = 64;
+ break;
+ case 16:
+ cl_width = 128;
+ cl_height = 64;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ width = align(tex->surface.u.legacy.level[0].nblk_x, cl_width * 8);
+ height = align(tex->surface.u.legacy.level[0].nblk_y, cl_height * 8);
+
+ slice_elements = (width * height) / (8 * 8);
+ slice_bytes = slice_elements * 4;
+
+ pipe_interleave_bytes = sscreen->info.pipe_interleave_bytes;
+ base_align = num_pipes * pipe_interleave_bytes;
+
+ tex->surface.htile_alignment = base_align;
+ tex->surface.htile_size =
+ util_num_layers(&tex->buffer.b.b, 0) *
+ align(slice_bytes, base_align);
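+
+	/* Example with hypothetical dimensions: num_pipes = 4 gives cl_width = 64
+	 * and cl_height = 32, so a level-0 footprint of 1920x1080 blocks is
+	 * padded to 2048x1280, giving slice_elements = 40960 and
+	 * slice_bytes = 163840 per layer before base_align rounding. */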
+}
+
+static void si_texture_allocate_htile(struct si_screen *sscreen,
+ struct si_texture *tex)
+{
+ if (sscreen->info.chip_class <= VI && !tex->tc_compatible_htile)
+ si_texture_get_htile_size(sscreen, tex);
+
+ if (!tex->surface.htile_size)
+ return;
+
+ tex->htile_offset = align(tex->size, tex->surface.htile_alignment);
+ tex->size = tex->htile_offset + tex->surface.htile_size;
+}
+
+void si_print_texture_info(struct si_screen *sscreen,
+ struct si_texture *tex, struct u_log_context *log)
+{
+ int i;
+
+ /* Common parameters. */
+ u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
+ "blk_h=%u, array_size=%u, last_level=%u, "
+ "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
+ tex->buffer.b.b.width0, tex->buffer.b.b.height0,
+ tex->buffer.b.b.depth0, tex->surface.blk_w,
+ tex->surface.blk_h,
+ tex->buffer.b.b.array_size, tex->buffer.b.b.last_level,
+ tex->surface.bpe, tex->buffer.b.b.nr_samples,
+ tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
+
+ if (sscreen->info.chip_class >= GFX9) {
+ u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", "
+ "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
+ tex->surface.surf_size,
+ tex->surface.u.gfx9.surf_slice_size,
+ tex->surface.surf_alignment,
+ tex->surface.u.gfx9.surf.swizzle_mode,
+ tex->surface.u.gfx9.surf.epitch,
+ tex->surface.u.gfx9.surf_pitch);
+
+ if (tex->surface.fmask_size) {
+ u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", "
+ "alignment=%u, swmode=%u, epitch=%u\n",
+ tex->fmask_offset,
+ tex->surface.fmask_size,
+ tex->surface.fmask_alignment,
+ tex->surface.u.gfx9.fmask.swizzle_mode,
+ tex->surface.u.gfx9.fmask.epitch);
+ }
+
+ if (tex->cmask_buffer) {
+ u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, "
+ "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
+ tex->cmask_offset,
+ tex->surface.cmask_size,
+ tex->surface.cmask_alignment,
+ tex->surface.u.gfx9.cmask.rb_aligned,
+ tex->surface.u.gfx9.cmask.pipe_aligned);
+ }
+
+ if (tex->htile_offset) {
+ u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, "
+ "rb_aligned=%u, pipe_aligned=%u\n",
+ tex->htile_offset,
+ tex->surface.htile_size,
+ tex->surface.htile_alignment,
+ tex->surface.u.gfx9.htile.rb_aligned,
+ tex->surface.u.gfx9.htile.pipe_aligned);
+ }
+
+ if (tex->dcc_offset) {
+ u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, "
+ "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
+ tex->dcc_offset, tex->surface.dcc_size,
+ tex->surface.dcc_alignment,
+ tex->surface.u.gfx9.dcc_pitch_max,
+ tex->surface.num_dcc_levels);
+ }
+
+ if (tex->surface.u.gfx9.stencil_offset) {
+ u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
+ tex->surface.u.gfx9.stencil_offset,
+ tex->surface.u.gfx9.stencil.swizzle_mode,
+ tex->surface.u.gfx9.stencil.epitch);
+ }
+ return;
+ }
+
+ u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
+ "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
+ tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
+ tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea,
+ tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config,
+ (tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+
+ if (tex->surface.fmask_size)
+ u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
+ "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
+ tex->fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
+ tex->surface.u.legacy.fmask.pitch_in_pixels,
+ tex->surface.u.legacy.fmask.bankh,
+ tex->surface.u.legacy.fmask.slice_tile_max,
+ tex->surface.u.legacy.fmask.tiling_index);
+
+ if (tex->cmask_buffer)
+ u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, "
+ "slice_tile_max=%u\n",
+ tex->cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
+ tex->surface.u.legacy.cmask_slice_tile_max);
+
+ if (tex->htile_offset)
+ u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, "
+ "alignment=%u, TC_compatible = %u\n",
+ tex->htile_offset, tex->surface.htile_size,
+ tex->surface.htile_alignment,
+ tex->tc_compatible_htile);
+
+ if (tex->dcc_offset) {
+ u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n",
+ tex->dcc_offset, tex->surface.dcc_size,
+ tex->surface.dcc_alignment);
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+ u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, "
+ "fast_clear_size=%u\n",
+ i, i < tex->surface.num_dcc_levels,
+ tex->surface.u.legacy.level[i].dcc_offset,
+ tex->surface.u.legacy.level[i].dcc_fast_clear_size);
+ }
+
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+ u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
+ "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "mode=%u, tiling_index = %u\n",
+ i, tex->surface.u.legacy.level[i].offset,
+ (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
+ u_minify(tex->buffer.b.b.width0, i),
+ u_minify(tex->buffer.b.b.height0, i),
+ u_minify(tex->buffer.b.b.depth0, i),
+ tex->surface.u.legacy.level[i].nblk_x,
+ tex->surface.u.legacy.level[i].nblk_y,
+ tex->surface.u.legacy.level[i].mode,
+ tex->surface.u.legacy.tiling_index[i]);
+
+ if (tex->surface.has_stencil) {
+ u_log_printf(log, " StencilLayout: tilesplit=%u\n",
+ tex->surface.u.legacy.stencil_tile_split);
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
+ u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", "
+ "slice_size=%"PRIu64", npix_x=%u, "
+ "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "mode=%u, tiling_index = %u\n",
+ i, tex->surface.u.legacy.stencil_level[i].offset,
+ (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
+ u_minify(tex->buffer.b.b.width0, i),
+ u_minify(tex->buffer.b.b.height0, i),
+ u_minify(tex->buffer.b.b.depth0, i),
+ tex->surface.u.legacy.stencil_level[i].nblk_x,
+ tex->surface.u.legacy.stencil_level[i].nblk_y,
+ tex->surface.u.legacy.stencil_level[i].mode,
+ tex->surface.u.legacy.stencil_tiling_index[i]);
+ }
+ }
+}
+
+/* Common processing for si_texture_create and si_texture_from_handle */
+static struct si_texture *
+si_texture_create_object(struct pipe_screen *screen,
+ const struct pipe_resource *base,
+ struct pb_buffer *buf,
+ struct radeon_surf *surface)
+{
+ struct si_texture *tex;
+ struct r600_resource *resource;
+ struct si_screen *sscreen = (struct si_screen*)screen;
+
+ tex = CALLOC_STRUCT(si_texture);
+ if (!tex)
+ return NULL;
+
+ resource = &tex->buffer;
+ resource->b.b = *base;
+ resource->b.b.next = NULL;
+ resource->b.vtbl = &si_texture_vtbl;
+ pipe_reference_init(&resource->b.b.reference, 1);
+ resource->b.b.screen = screen;
+
+ /* don't include stencil-only formats which we don't support for rendering */
+ tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
+
+ tex->surface = *surface;
+ tex->size = tex->surface.surf_size;
+
+ tex->tc_compatible_htile = tex->surface.htile_size != 0 &&
+ (tex->surface.flags &
+ RADEON_SURF_TC_COMPATIBLE_HTILE);
+
+ /* TC-compatible HTILE:
+ * - VI only supports Z32_FLOAT.
+ * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
+ if (tex->tc_compatible_htile) {
+ if (sscreen->info.chip_class >= GFX9 &&
+ base->format == PIPE_FORMAT_Z16_UNORM)
+ tex->db_render_format = base->format;
+ else {
+ tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+ tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+ base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+ }
+ } else {
+ tex->db_render_format = base->format;
+ }
+
+ /* Applies to GCN. */
+ tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
+
+ /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
+ * between frames, so the only thing that can enable separate DCC
+ * with DRI2 is multiple slow clears within a frame.
+ */
+ tex->ps_draw_ratio = 0;
+
+ if (tex->is_depth) {
+ if (sscreen->info.chip_class >= GFX9) {
+ tex->can_sample_z = true;
+ tex->can_sample_s = true;
+ } else {
+ tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
+ tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
+ }
+
+ if (!(base->flags & (SI_RESOURCE_FLAG_TRANSFER |
+ SI_RESOURCE_FLAG_FLUSHED_DEPTH))) {
+ tex->db_compatible = true;
+
+ if (!(sscreen->debug_flags & DBG(NO_HYPERZ)))
+ si_texture_allocate_htile(sscreen, tex);
+ }
+ } else {
+ if (base->nr_samples > 1 &&
+ !buf &&
+ !(sscreen->debug_flags & DBG(NO_FMASK))) {
+ /* Allocate FMASK. */
+ tex->fmask_offset = align64(tex->size,
+ tex->surface.fmask_alignment);
+ tex->size = tex->fmask_offset + tex->surface.fmask_size;
+
+ /* Allocate CMASK. */
+ tex->cmask_offset = align64(tex->size, tex->surface.cmask_alignment);
+ tex->size = tex->cmask_offset + tex->surface.cmask_size;
+ tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+ tex->cmask_buffer = &tex->buffer;
+
+ if (!tex->surface.fmask_size || !tex->surface.cmask_size) {
+ FREE(tex);
+ return NULL;
+ }
+ }
+
+ /* Shared textures must always set up DCC here.
+ * If it's not present, it will be disabled by
+ * apply_opaque_metadata later.
+ */
+ if (tex->surface.dcc_size &&
+ (buf || !(sscreen->debug_flags & DBG(NO_DCC))) &&
+ !(tex->surface.flags & RADEON_SURF_SCANOUT)) {
+ /* Reserve space for the DCC buffer. */
+ tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment);
+ tex->size = tex->dcc_offset + tex->surface.dcc_size;
+ }
+ }
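+
+	/* At this point a color texture is sub-allocated as
+	 * [surface][FMASK][CMASK][DCC] within tex->size, each part aligned to
+	 * its own alignment; depth textures append HTILE instead. */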
+
+ /* Now create the backing buffer. */
+ if (!buf) {
+ si_init_resource_fields(sscreen, resource, tex->size,
+ tex->surface.surf_alignment);
+
+ if (!si_alloc_resource(sscreen, resource)) {
+ FREE(tex);
+ return NULL;
+ }
+ } else {
+ resource->buf = buf;
+ resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
+ resource->bo_size = buf->size;
+ resource->bo_alignment = buf->alignment;
+ resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
+ if (resource->domains & RADEON_DOMAIN_VRAM)
+ resource->vram_usage = buf->size;
+ else if (resource->domains & RADEON_DOMAIN_GTT)
+ resource->gart_usage = buf->size;
+ }
+
+ if (tex->cmask_buffer) {
+ /* Initialize the cmask to 0xCC (= compressed state). */
+ si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b,
+ tex->cmask_offset, tex->surface.cmask_size,
+ 0xCCCCCCCC);
+ }
+ if (tex->htile_offset) {
+ uint32_t clear_value = 0;
+
+ if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
+ clear_value = 0x0000030F;
+
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+ tex->htile_offset,
+ tex->surface.htile_size,
+ clear_value);
+ }
+
+ /* Initialize DCC only if the texture is not being imported. */
+ if (!buf && tex->dcc_offset) {
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+ tex->dcc_offset,
+ tex->surface.dcc_size,
+ 0xFFFFFFFF);
+ }
+
+ /* Initialize the CMASK base register value. */
+ tex->cmask_base_address_reg =
+ (tex->buffer.gpu_address + tex->cmask_offset) >> 8;
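+	/* The register holds a 256-byte-aligned address, hence the >> 8. */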
+
+ if (sscreen->debug_flags & DBG(VM)) {
+ fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
+ tex->buffer.gpu_address,
+ tex->buffer.gpu_address + tex->buffer.buf->size,
+ base->width0, base->height0, util_num_layers(base, 0), base->last_level+1,
+ base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
+ }
+
+ if (sscreen->debug_flags & DBG(TEX)) {
+ puts("Texture:");
+ struct u_log_context log;
+ u_log_context_init(&log);
+ si_print_texture_info(sscreen, tex, &log);
+ u_log_new_page_print(&log, stdout);
+ fflush(stdout);
+ u_log_context_destroy(&log);
+ }
+
+ return tex;
+}
+
+static enum radeon_surf_mode
+si_choose_tiling(struct si_screen *sscreen,
+ const struct pipe_resource *templ, bool tc_compatible_htile)
+{
+ const struct util_format_description *desc = util_format_description(templ->format);
+ bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_TILING;
+ bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
+ !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
+
+ /* MSAA resources must be 2D tiled. */
+ if (templ->nr_samples > 1)
+ return RADEON_SURF_MODE_2D;
+
+ /* Transfer resources should be linear. */
+ if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on VI,
+ * which requires 2D tiling.
+ */
+ if (sscreen->info.chip_class == VI && tc_compatible_htile)
+ return RADEON_SURF_MODE_2D;
+
+ /* Handle common candidates for the linear mode.
+ * Compressed textures and DB surfaces must always be tiled.
+ */
+ if (!force_tiling &&
+ !is_depth_stencil &&
+ !util_format_is_compressed(templ->format)) {
+ if (sscreen->debug_flags & DBG(NO_TILING))
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */
+ if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Cursors are linear on SI.
+ * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */
+ if (templ->bind & PIPE_BIND_CURSOR)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ if (templ->bind & PIPE_BIND_LINEAR)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Textures with a very small height are recommended to be linear. */
+ if (templ->target == PIPE_TEXTURE_1D ||
+ templ->target == PIPE_TEXTURE_1D_ARRAY ||
+ /* Only very thin and long 2D textures should benefit from
+ * linear_aligned. */
+ (templ->width0 > 8 && templ->height0 <= 2))
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Textures likely to be mapped often. */
+ if (templ->usage == PIPE_USAGE_STAGING ||
+ templ->usage == PIPE_USAGE_STREAM)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+ }
+
+ /* Make small textures 1D tiled. */
+ if (templ->width0 <= 16 || templ->height0 <= 16 ||
+ (sscreen->debug_flags & DBG(NO_2D_TILING)))
+ return RADEON_SURF_MODE_1D;
+
+ /* The allocator will switch to 1D if needed. */
+ return RADEON_SURF_MODE_2D;
+}
+
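+/* pipe_screen::resource_create path for textures: apply EQAA sample-count
+ * overrides, choose the tiling mode and TC-compatible HTILE, compute the
+ * surface layout, and allocate the texture object with a new backing buffer.
+ */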
+struct pipe_resource *si_texture_create(struct pipe_screen *screen,
+ const struct pipe_resource *templ)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ bool is_zs = util_format_is_depth_or_stencil(templ->format);
+
+ if (templ->nr_samples >= 2) {
+ /* This is hackish (overwriting the const pipe_resource template),
+		 * but should be harmless, and state trackers can also see
+		 * the overridden number of samples in the created pipe_resource.
+ */
+ if (is_zs && sscreen->eqaa_force_z_samples) {
+ ((struct pipe_resource*)templ)->nr_samples =
+ ((struct pipe_resource*)templ)->nr_storage_samples =
+ sscreen->eqaa_force_z_samples;
+ } else if (!is_zs && sscreen->eqaa_force_color_samples) {
+ ((struct pipe_resource*)templ)->nr_samples =
+ sscreen->eqaa_force_coverage_samples;
+ ((struct pipe_resource*)templ)->nr_storage_samples =
+ sscreen->eqaa_force_color_samples;
+ }
+ }
+
+ struct radeon_surf surface = {0};
+ bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+ bool tc_compatible_htile =
+ sscreen->info.chip_class >= VI &&
+ /* There are issues with TC-compatible HTILE on Tonga (and
+ * Iceland is the same design), and documented bug workarounds
+ * don't help. For example, this fails:
+ * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
+ */
+ sscreen->info.family != CHIP_TONGA &&
+ sscreen->info.family != CHIP_ICELAND &&
+ (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+ !(sscreen->debug_flags & DBG(NO_HYPERZ)) &&
+ !is_flushed_depth &&
+ templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+ is_zs;
+ int r;
+
+ r = si_init_surface(sscreen, &surface, templ,
+ si_choose_tiling(sscreen, templ, tc_compatible_htile),
+ 0, 0, false, false, is_flushed_depth,
+ tc_compatible_htile);
+ if (r) {
+ return NULL;
+ }
+
+ return (struct pipe_resource *)
+ si_texture_create_object(screen, templ, NULL, &surface);
+}
+
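+/* Wrap an existing winsys buffer (imported via a handle or a memory object)
+ * in a si_texture. Dedicated imports take their layout from the BO metadata;
+ * non-dedicated imports fall back to a linear layout.
+ */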
+static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen,
+ const struct pipe_resource *templ,
+ struct pb_buffer *buf,
+ unsigned stride,
+ unsigned offset,
+ unsigned usage,
+ bool dedicated)
+{
+ enum radeon_surf_mode array_mode;
+ struct radeon_surf surface = {};
+ struct radeon_bo_metadata metadata = {};
+ struct si_texture *tex;
+ bool is_scanout;
+ int r;
+
+ if (dedicated) {
+ sscreen->ws->buffer_get_metadata(buf, &metadata);
+ si_surface_import_metadata(sscreen, &surface, &metadata,
+ &array_mode, &is_scanout);
+ } else {
+ /**
+		 * The BO metadata is unset for non-dedicated images, so we
+		 * fall back to linear. See the answer to question 5 of the
+ * VK_KHX_external_memory spec for some details.
+ *
+ * It is possible that this case isn't going to work if the
+ * surface pitch isn't correctly aligned by default.
+ *
+ * In order to support it correctly we require multi-image
+		 * metadata to be synchronized between radv and radeonsi. The
+ * semantics of associating multiple image metadata to a memory
+ * object on the vulkan export side are not concretely defined
+ * either.
+ *
+ * All the use cases we are aware of at the moment for memory
+		 * objects use dedicated allocations. So let's keep the initial
+ * implementation simple.
+ *
+ * A possible alternative is to attempt to reconstruct the
+ * tiling information when the TexParameter TEXTURE_TILING_EXT
+ * is set.
+ */
+ array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+ is_scanout = false;
+ }
+
+ r = si_init_surface(sscreen, &surface, templ,
+ array_mode, stride, offset, true, is_scanout,
+ false, false);
+ if (r)
+ return NULL;
+
+ tex = si_texture_create_object(&sscreen->b, templ, buf, &surface);
+ if (!tex)
+ return NULL;
+
+ tex->buffer.b.is_shared = true;
+ tex->buffer.external_usage = usage;
+
+ si_apply_opaque_metadata(sscreen, tex, &metadata);
+
+ assert(tex->surface.tile_swizzle == 0);
+ return &tex->buffer.b.b;
+}
+
+static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ struct winsys_handle *whandle,
+ unsigned usage)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct pb_buffer *buf = NULL;
+ unsigned stride = 0, offset = 0;
+
+ /* Support only 2D textures without mipmaps */
+ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
+ templ->depth0 != 1 || templ->last_level != 0)
+ return NULL;
+
+ buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, &stride, &offset);
+ if (!buf)
+ return NULL;
+
+ return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
+ offset, usage, true);
+}
+
+bool si_init_flushed_depth_texture(struct pipe_context *ctx,
+ struct pipe_resource *texture,
+ struct si_texture **staging)
+{
+ struct si_texture *tex = (struct si_texture*)texture;
+ struct pipe_resource resource;
+ struct si_texture **flushed_depth_texture = staging ?
+ staging : &tex->flushed_depth_texture;
+ enum pipe_format pipe_format = texture->format;
+
+ if (!staging) {
+ if (tex->flushed_depth_texture)
+ return true; /* it's ready */
+
+ if (!tex->can_sample_z && tex->can_sample_s) {
+ switch (pipe_format) {
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ /* Save memory by not allocating the S plane. */
+ pipe_format = PIPE_FORMAT_Z32_FLOAT;
+ break;
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ /* Save memory bandwidth by not copying the
+ * stencil part during flush.
+ *
+ * This potentially increases memory bandwidth
+ * if an application uses both Z and S texturing
+ * simultaneously (a flushed Z24S8 texture
+ * would be stored compactly), but how often
+ * does that really happen?
+ */
+ pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+ break;
+ default:;
+ }
+ } else if (!tex->can_sample_s && tex->can_sample_z) {
+ assert(util_format_has_stencil(util_format_description(pipe_format)));
+
+ /* DB->CB copies to an 8bpp surface don't work. */
+ pipe_format = PIPE_FORMAT_X24S8_UINT;
+ }
+ }
+
+ memset(&resource, 0, sizeof(resource));
+ resource.target = texture->target;
+ resource.format = pipe_format;
+ resource.width0 = texture->width0;
+ resource.height0 = texture->height0;
+ resource.depth0 = texture->depth0;
+ resource.array_size = texture->array_size;
+ resource.last_level = texture->last_level;
+ resource.nr_samples = texture->nr_samples;
+ resource.usage = staging ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
+ resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
+ resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+
+ if (staging)
+ resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
+
+ *flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+ if (*flushed_depth_texture == NULL) {
+ PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
+ return false;
+ }
+ return true;
+}
+
+/**
+ * Initialize the pipe_resource descriptor to be of the same size as the box,
+ * which is supposed to hold a subregion of the texture "orig" at the given
+ * mipmap level.
+ */
+static void si_init_temp_resource_from_box(struct pipe_resource *res,
+ struct pipe_resource *orig,
+ const struct pipe_box *box,
+ unsigned level, unsigned flags)
+{
+ memset(res, 0, sizeof(*res));
+ res->format = orig->format;
+ res->width0 = box->width;
+ res->height0 = box->height;
+ res->depth0 = 1;
+ res->array_size = 1;
+ res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
+ res->flags = flags;
+
+ /* We must set the correct texture target and dimensions for a 3D box. */
+ if (box->depth > 1 && util_max_layer(orig, level) > 0) {
+ res->target = PIPE_TEXTURE_2D_ARRAY;
+ res->array_size = box->depth;
+ } else {
+ res->target = PIPE_TEXTURE_2D;
+ }
+}
+
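+/* Whether the texture storage may be discarded and reallocated: the texture
+ * must not be shared, the transfer must be write-only, and the box must cover
+ * the whole single-level texture.
+ */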
+static bool si_can_invalidate_texture(struct si_screen *sscreen,
+ struct si_texture *tex,
+ unsigned transfer_usage,
+ const struct pipe_box *box)
+{
+ return !tex->buffer.b.is_shared &&
+ !(transfer_usage & PIPE_TRANSFER_READ) &&
+ tex->buffer.b.b.last_level == 0 &&
+ util_texrange_covers_whole_level(&tex->buffer.b.b, 0,
+ box->x, box->y, box->z,
+ box->width, box->height,
+ box->depth);
+}
+
+static void si_texture_invalidate_storage(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ struct si_screen *sscreen = sctx->screen;
+
+ /* There is no point in discarding depth and tiled buffers. */
+ assert(!tex->is_depth);
+ assert(tex->surface.is_linear);
+
+ /* Reallocate the buffer in the same pipe_resource. */
+ si_alloc_resource(sscreen, &tex->buffer);
+
+ /* Initialize the CMASK base address (needed even without CMASK). */
+ tex->cmask_base_address_reg =
+ (tex->buffer.gpu_address + tex->cmask_offset) >> 8;
+
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+
+ sctx->num_alloc_tex_transfer_bytes += tex->size;
+}
+
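+/* pipe_context::transfer_map for textures: map the resource directly when it
+ * is linear and CPU access is cheap, otherwise go through a linear staging
+ * texture (always for depth, which must be decompressed first).
+ */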
+static void *si_texture_transfer_map(struct pipe_context *ctx,
+ struct pipe_resource *texture,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_texture *tex = (struct si_texture*)texture;
+ struct si_transfer *trans;
+ struct r600_resource *buf;
+ unsigned offset = 0;
+ char *map;
+ bool use_staging_texture = false;
+
+ assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
+ assert(box->width && box->height && box->depth);
+
+ /* Depth textures use staging unconditionally. */
+ if (!tex->is_depth) {
+ /* Degrade the tile mode if we get too many transfers on APUs.
+ * On dGPUs, the staging texture is always faster.
+ * Only count uploads that are at least 4x4 pixels large.
+ */
+ if (!sctx->screen->info.has_dedicated_vram &&
+ level == 0 &&
+ box->width >= 4 && box->height >= 4 &&
+ p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
+ bool can_invalidate =
+ si_can_invalidate_texture(sctx->screen, tex,
+ usage, box);
+
+ si_reallocate_texture_inplace(sctx, tex,
+ PIPE_BIND_LINEAR,
+ can_invalidate);
+ }
+
+ /* Tiled textures need to be converted into a linear texture for CPU
+ * access. The staging texture is always linear and is placed in GART.
+ *
+ * Reading from VRAM or GTT WC is slow, always use the staging
+ * texture in this case.
+ *
+ * Use the staging texture for uploads if the underlying BO
+ * is busy.
+ */
+ if (!tex->surface.is_linear)
+ use_staging_texture = true;
+ else if (usage & PIPE_TRANSFER_READ)
+ use_staging_texture =
+ tex->buffer.domains & RADEON_DOMAIN_VRAM ||
+ tex->buffer.flags & RADEON_FLAG_GTT_WC;
+ /* Write & linear only: */
+ else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf,
+ RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(tex->buffer.buf, 0,
+ RADEON_USAGE_READWRITE)) {
+ /* It's busy. */
+ if (si_can_invalidate_texture(sctx->screen, tex,
+ usage, box))
+ si_texture_invalidate_storage(sctx, tex);
+ else
+ use_staging_texture = true;
+ }
+ }
+
+ trans = CALLOC_STRUCT(si_transfer);
+ if (!trans)
+ return NULL;
+ pipe_resource_reference(&trans->b.b.resource, texture);
+ trans->b.b.level = level;
+ trans->b.b.usage = usage;
+ trans->b.b.box = *box;
+
+ if (tex->is_depth) {
+ struct si_texture *staging_depth;
+
+ if (tex->buffer.b.b.nr_samples > 1) {
+ /* MSAA depth buffers need to be converted to single sample buffers.
+ *
+ * Mapping MSAA depth buffers can occur if ReadPixels is called
+ * with a multisample GLX visual.
+ *
+ * First downsample the depth buffer to a temporary texture,
+ * then decompress the temporary one to staging.
+ *
+			 * Only the region being mapped is transferred.
+ */
+ struct pipe_resource resource;
+
+ si_init_temp_resource_from_box(&resource, texture, box, level, 0);
+
+ if (!si_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
+ PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+ goto fail_trans;
+ }
+
+ if (usage & PIPE_TRANSFER_READ) {
+ struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);
+ if (!temp) {
+ PRINT_ERR("failed to create a temporary depth texture\n");
+ goto fail_trans;
+ }
+
+ si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
+ si_blit_decompress_depth(ctx, (struct si_texture*)temp, staging_depth,
+ 0, 0, 0, box->depth, 0, 0);
+ pipe_resource_reference(&temp, NULL);
+ }
+
+ /* Just get the strides. */
+ si_texture_get_offset(sctx->screen, staging_depth, level, NULL,
+ &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+ } else {
+ /* XXX: only readback the rectangle which is being mapped? */
+ /* XXX: when discard is true, no need to read back from depth texture */
+ if (!si_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
+ PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+ goto fail_trans;
+ }
+
+ si_blit_decompress_depth(ctx, tex, staging_depth,
+ level, level,
+ box->z, box->z + box->depth - 1,
+ 0, 0);
+
+ offset = si_texture_get_offset(sctx->screen, staging_depth,
+ level, box,
+ &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+ }
+
+ trans->staging = &staging_depth->buffer;
+ buf = trans->staging;
+ } else if (use_staging_texture) {
+ struct pipe_resource resource;
+ struct si_texture *staging;
+
+ si_init_temp_resource_from_box(&resource, texture, box, level,
+ SI_RESOURCE_FLAG_TRANSFER);
+ resource.usage = (usage & PIPE_TRANSFER_READ) ?
+ PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
+
+ /* Create the temporary texture. */
+ staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
+ if (!staging) {
+ PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+ goto fail_trans;
+ }
+ trans->staging = &staging->buffer;
+
+ /* Just get the strides. */
+ si_texture_get_offset(sctx->screen, staging, 0, NULL,
+ &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+
+ if (usage & PIPE_TRANSFER_READ)
+ si_copy_to_staging_texture(ctx, trans);
+ else
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+ buf = trans->staging;
+ } else {
+ /* the resource is mapped directly */
+ offset = si_texture_get_offset(sctx->screen, tex, level, box,
+ &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+ buf = &tex->buffer;
+ }
+
+ if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
+ goto fail_trans;
+
+ *ptransfer = &trans->b.b;
+ return map + offset;
+
+fail_trans:
+ r600_resource_reference(&trans->staging, NULL);
+ pipe_resource_reference(&trans->b.b.resource, NULL);
+ FREE(trans);
+ return NULL;
+}
+
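+/* pipe_context::transfer_unmap for textures: copy any written staging data
+ * back to the real texture, release the staging buffer, and flush the gfx IB
+ * if too much transfer memory has accumulated.
+ */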
+static void si_texture_transfer_unmap(struct pipe_context *ctx,
+ struct pipe_transfer* transfer)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct si_transfer *stransfer = (struct si_transfer*)transfer;
+ struct pipe_resource *texture = transfer->resource;
+ struct si_texture *tex = (struct si_texture*)texture;
+
+ if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) {
+ if (tex->is_depth && tex->buffer.b.b.nr_samples <= 1) {
+ ctx->resource_copy_region(ctx, texture, transfer->level,
+ transfer->box.x, transfer->box.y, transfer->box.z,
+ &stransfer->staging->b.b, transfer->level,
+ &transfer->box);
+ } else {
+ si_copy_from_staging_texture(ctx, stransfer);
+ }
+ }
+
+ if (stransfer->staging) {
+ sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
+ r600_resource_reference(&stransfer->staging, NULL);
+ }
+
+ /* Heuristic for {upload, draw, upload, draw, ..}:
+ *
+ * Flush the gfx IB if we've allocated too much texture storage.
+ *
+ * The idea is that we don't want to build IBs that use too much
+	 * memory and put pressure on the kernel memory manager, and we also
+ * want to make temporary and invalidated buffers go idle ASAP to
+ * decrease the total memory usage or make them reusable. The memory
+ * usage will be slightly higher than given here because of the buffer
+ * cache in the winsys.
+ *
+ * The result is that the kernel memory manager is never a bottleneck.
+ */
+ if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ sctx->num_alloc_tex_transfer_bytes = 0;
+ }
+
+ pipe_resource_reference(&transfer->resource, NULL);
+ FREE(transfer);
+}
+
+static const struct u_resource_vtbl si_texture_vtbl =
+{
+ NULL, /* get_handle */
+ si_texture_destroy, /* resource_destroy */
+ si_texture_transfer_map, /* transfer_map */
+ u_default_transfer_flush_region, /* transfer_flush_region */
+ si_texture_transfer_unmap, /* transfer_unmap */
+};
+
+/* Return whether it's allowed to reinterpret one format as another with DCC
+ * enabled.
+ */
+bool vi_dcc_formats_compatible(enum pipe_format format1,
+ enum pipe_format format2)
+{
+ const struct util_format_description *desc1, *desc2;
+
+ /* No format change - exit early. */
+ if (format1 == format2)
+ return true;
+
+ format1 = si_simplify_cb_format(format1);
+ format2 = si_simplify_cb_format(format2);
+
+ /* Check again after format adjustments. */
+ if (format1 == format2)
+ return true;
+
+ desc1 = util_format_description(format1);
+ desc2 = util_format_description(format2);
+
+ if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+ desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return false;
+
+ /* Float and non-float are totally incompatible. */
+ if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
+ (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
+ return false;
+
+ /* Channel sizes must match across DCC formats.
+ * Comparing just the first 2 channels should be enough.
+ */
+ if (desc1->channel[0].size != desc2->channel[0].size ||
+ (desc1->nr_channels >= 2 &&
+ desc1->channel[1].size != desc2->channel[1].size))
+ return false;
+
+ /* Everything below is not needed if the driver never uses the DCC
+ * clear code with the value of 1.
+ */
+
+ /* If the clear values are all 1 or all 0, this constraint can be
+ * ignored. */
+ if (vi_alpha_is_on_msb(format1) != vi_alpha_is_on_msb(format2))
+ return false;
+
+ /* Channel types must match if the clear value of 1 is used.
+ * The type categories are only float, signed, unsigned.
+ * NORM and INT are always compatible.
+ */
+ if (desc1->channel[0].type != desc2->channel[0].type ||
+ (desc1->nr_channels >= 2 &&
+ desc1->channel[1].type != desc2->channel[1].type))
+ return false;
+
+ return true;
+}
+
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format)
+{
+ struct si_texture *stex = (struct si_texture *)tex;
+
+ return vi_dcc_enabled(stex, level) &&
+ !vi_dcc_formats_compatible(tex->format, view_format);
+}
+
+/* This can't be merged with the above function, because
+ * vi_dcc_formats_compatible should be called only when DCC is enabled. */
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
+ struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format)
+{
+ struct si_texture *stex = (struct si_texture *)tex;
+
+ if (vi_dcc_formats_are_incompatible(tex, level, view_format))
+ if (!si_texture_disable_dcc(sctx, stex))
+ si_decompress_dcc(sctx, stex);
+}
+
+struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
+ struct pipe_resource *texture,
+ const struct pipe_surface *templ,
+ unsigned width0, unsigned height0,
+ unsigned width, unsigned height)
+{
+ struct si_surface *surface = CALLOC_STRUCT(si_surface);
+
+ if (!surface)
+ return NULL;
+
+ assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
+ assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
+
+ pipe_reference_init(&surface->base.reference, 1);
+ pipe_resource_reference(&surface->base.texture, texture);
+ surface->base.context = pipe;
+ surface->base.format = templ->format;
+ surface->base.width = width;
+ surface->base.height = height;
+ surface->base.u = templ->u;
+
+ surface->width0 = width0;
+ surface->height0 = height0;
+
+ surface->dcc_incompatible =
+ texture->target != PIPE_BUFFER &&
+ vi_dcc_formats_are_incompatible(texture, templ->u.tex.level,
+ templ->format);
+ return &surface->base;
+}
+
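+/* pipe_context::create_surface: compute the mip-level dimensions and adjust
+ * them when the view format has a different block size than the texture.
+ */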
+static struct pipe_surface *si_create_surface(struct pipe_context *pipe,
+ struct pipe_resource *tex,
+ const struct pipe_surface *templ)
+{
+ unsigned level = templ->u.tex.level;
+ unsigned width = u_minify(tex->width0, level);
+ unsigned height = u_minify(tex->height0, level);
+ unsigned width0 = tex->width0;
+ unsigned height0 = tex->height0;
+
+ if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
+ const struct util_format_description *tex_desc
+ = util_format_description(tex->format);
+ const struct util_format_description *templ_desc
+ = util_format_description(templ->format);
+
+ assert(tex_desc->block.bits == templ_desc->block.bits);
+
+ /* Adjust size of surface if and only if the block width or
+ * height is changed. */
+ if (tex_desc->block.width != templ_desc->block.width ||
+ tex_desc->block.height != templ_desc->block.height) {
+ unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
+ unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
+
+ width = nblks_x * templ_desc->block.width;
+ height = nblks_y * templ_desc->block.height;
+
+ width0 = util_format_get_nblocksx(tex->format, width0);
+ height0 = util_format_get_nblocksy(tex->format, height0);
+ }
+ }
+
+ return si_create_surface_custom(pipe, tex, templ,
+ width0, height0,
+ width, height);
+}
+
+static void si_surface_destroy(struct pipe_context *pipe,
+ struct pipe_surface *surface)
+{
+ pipe_resource_reference(&surface->texture, NULL);
+ FREE(surface);
+}
+
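+/* Translate the channel ordering of a color format into the hardware
+ * COMP_SWAP value; returns ~0 when the format has no matching swap.
+ */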
+unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
+
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
+ return V_028C70_SWAP_STD;
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return ~0U;
+
+ switch (desc->nr_channels) {
+ case 1:
+ if (HAS_SWIZZLE(0,X))
+ return V_028C70_SWAP_STD; /* X___ */
+ else if (HAS_SWIZZLE(3,X))
+ return V_028C70_SWAP_ALT_REV; /* ___X */
+ break;
+ case 2:
+ if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) ||
+ (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) ||
+ (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y)))
+ return V_028C70_SWAP_STD; /* XY__ */
+ else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) ||
+ (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) ||
+ (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X)))
+ /* YX__ */
+ return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
+ else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y))
+ return V_028C70_SWAP_ALT; /* X__Y */
+ else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X))
+ return V_028C70_SWAP_ALT_REV; /* Y__X */
+ break;
+ case 3:
+ if (HAS_SWIZZLE(0,X))
+ return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
+ else if (HAS_SWIZZLE(0,Z))
+ return V_028C70_SWAP_STD_REV; /* ZYX */
+ break;
+ case 4:
+ /* check the middle channels, the 1st and 4th channel can be NONE */
+ if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) {
+ return V_028C70_SWAP_STD; /* XYZW */
+ } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) {
+ return V_028C70_SWAP_STD_REV; /* WZYX */
+ } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) {
+ return V_028C70_SWAP_ALT; /* ZYXW */
+ } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) {
+ /* YZWX */
+ if (desc->is_array)
+ return V_028C70_SWAP_ALT_REV;
+ else
+ return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
+ }
+ break;
+ }
+ return ~0U;
+}
+
+/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
+
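+/* Release everything held by a DCC statistics slot: stop an active query,
+ * destroy the stored pipeline-statistics queries, and drop the texture
+ * reference.
+ */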
+static void vi_dcc_clean_up_context_slot(struct si_context *sctx,
+ int slot)
+{
+ int i;
+
+ if (sctx->dcc_stats[slot].query_active)
+ vi_separate_dcc_stop_query(sctx,
+ sctx->dcc_stats[slot].tex);
+
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++)
+ if (sctx->dcc_stats[slot].ps_stats[i]) {
+ sctx->b.destroy_query(&sctx->b,
+ sctx->dcc_stats[slot].ps_stats[i]);
+ sctx->dcc_stats[slot].ps_stats[i] = NULL;
+ }
+
+ si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
+}
+
+/**
+ * Return the per-context slot where DCC statistics queries for the texture live.
+ */
+static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ int i, empty_slot = -1;
+
+ /* Remove zombie textures (textures kept alive by this array only). */
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+ if (sctx->dcc_stats[i].tex &&
+ sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
+ vi_dcc_clean_up_context_slot(sctx, i);
+
+ /* Find the texture. */
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+ /* Return if found. */
+ if (sctx->dcc_stats[i].tex == tex) {
+ sctx->dcc_stats[i].last_use_timestamp = os_time_get();
+ return i;
+ }
+
+ /* Record the first seen empty slot. */
+ if (empty_slot == -1 && !sctx->dcc_stats[i].tex)
+ empty_slot = i;
+ }
+
+ /* Not found. Remove the oldest member to make space in the array. */
+ if (empty_slot == -1) {
+ int oldest_slot = 0;
+
+ /* Find the oldest slot. */
+ for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+ if (sctx->dcc_stats[oldest_slot].last_use_timestamp >
+ sctx->dcc_stats[i].last_use_timestamp)
+ oldest_slot = i;
+
+ /* Clean up the oldest slot. */
+ vi_dcc_clean_up_context_slot(sctx, oldest_slot);
+ empty_slot = oldest_slot;
+ }
+
+ /* Add the texture to the new slot. */
+ si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
+ sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+ return empty_slot;
+}
+
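+/* Create a pipeline-statistics query whose begin resumes counting instead of
+ * clearing previous results, so it can be paused and resumed as color buffers
+ * are bound and unbound.
+ */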
+static struct pipe_query *
+vi_create_resuming_pipestats_query(struct si_context *sctx)
+{
+ struct si_query_hw *query = (struct si_query_hw*)
+ sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+
+ query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
+ return (struct pipe_query*)query;
+}
+
+/**
+ * Called when binding a color buffer.
+ */
+void vi_separate_dcc_start_query(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+
+ assert(!sctx->dcc_stats[i].query_active);
+
+ if (!sctx->dcc_stats[i].ps_stats[0])
+ sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx);
+
+ /* begin or resume the query */
+ sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+ sctx->dcc_stats[i].query_active = true;
+}
+
+/**
+ * Called when unbinding a color buffer.
+ */
+void vi_separate_dcc_stop_query(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+
+ assert(sctx->dcc_stats[i].query_active);
+ assert(sctx->dcc_stats[i].ps_stats[0]);
+
+ /* pause or end the query */
+ sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+ sctx->dcc_stats[i].query_active = false;
+}
+
+static bool vi_should_enable_separate_dcc(struct si_texture *tex)
+{
+ /* The minimum number of fullscreen draws per frame that is required
+ * to enable DCC. */
+ return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+}
+
+/* Called by fast clear. */
+void vi_separate_dcc_try_enable(struct si_context *sctx,
+ struct si_texture *tex)
+{
+ /* The intent is to use this with shared displayable back buffers,
+ * but it's not strictly limited only to them.
+ */
+ if (!tex->buffer.b.is_shared ||
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+ tex->buffer.b.b.target != PIPE_TEXTURE_2D ||
+ tex->buffer.b.b.last_level > 0 ||
+ !tex->surface.dcc_size ||
+ sctx->screen->debug_flags & DBG(NO_DCC) ||
+ sctx->screen->debug_flags & DBG(NO_DCC_FB))
+ return;
+
+ assert(sctx->chip_class >= VI);
+
+ if (tex->dcc_offset)
+ return; /* already enabled */
+
+ /* Enable the DCC stat gathering. */
+ if (!tex->dcc_gather_statistics) {
+ tex->dcc_gather_statistics = true;
+ vi_separate_dcc_start_query(sctx, tex);
+ }
+
+ if (!vi_should_enable_separate_dcc(tex))
+ return; /* stats show that DCC decompression is too expensive */
+
+ assert(tex->surface.num_dcc_levels);
+ assert(!tex->dcc_separate_buffer);
+
+ si_texture_discard_cmask(sctx->screen, tex);
+
+ /* Get a DCC buffer. */
+ if (tex->last_dcc_separate_buffer) {
+ assert(tex->dcc_gather_statistics);
+ assert(!tex->dcc_separate_buffer);
+ tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+ tex->last_dcc_separate_buffer = NULL;
+ } else {
+ tex->dcc_separate_buffer =
+ si_aligned_buffer_create(sctx->b.screen,
+ SI_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT,
+ tex->surface.dcc_size,
+ tex->surface.dcc_alignment);
+ if (!tex->dcc_separate_buffer)
+ return;
+ }
+
+ /* dcc_offset is the absolute GPUVM address. */
+ tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+ /* no need to flag anything since this is called by fast clear that
+ * flags framebuffer state
+ */
+}
+
+/**
+ * Called by pipe_context::flush_resource, the place where DCC decompression
+ * takes place.
+ */
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+ struct si_texture *tex)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ struct pipe_query *tmp;
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+ bool query_active = sctx->dcc_stats[i].query_active;
+ bool disable = false;
+
+ if (sctx->dcc_stats[i].ps_stats[2]) {
+ union pipe_query_result result;
+
+ /* Read the results. */
+ ctx->get_query_result(ctx, sctx->dcc_stats[i].ps_stats[2],
+ true, &result);
+ si_query_hw_reset_buffers(sctx,
+ (struct si_query_hw*)
+ sctx->dcc_stats[i].ps_stats[2]);
+
+ /* Compute the approximate number of fullscreen draws. */
+ tex->ps_draw_ratio =
+ result.pipeline_statistics.ps_invocations /
+ (tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
+ sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
+
+ disable = tex->dcc_separate_buffer &&
+ !vi_should_enable_separate_dcc(tex);
+ }
+
+ tex->num_slow_clears = 0;
+
+ /* stop the statistics query for ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_stop_query(sctx, tex);
+
+ /* Move the queries in the queue by one. */
+ tmp = sctx->dcc_stats[i].ps_stats[2];
+ sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
+ sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
+ sctx->dcc_stats[i].ps_stats[0] = tmp;
+
+ /* create and start a new query as ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_start_query(sctx, tex);
+
+ if (disable) {
+ assert(!tex->last_dcc_separate_buffer);
+ tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+ tex->dcc_separate_buffer = NULL;
+ tex->dcc_offset = 0;
+ /* no need to flag anything since this is called after
+ * decompression that re-sets framebuffer state
+ */
+ }
+}
+
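+/* pipe_screen::memobj_create_from_handle: import a winsys buffer and wrap it
+ * in a si_memory_object together with its stride.
+ */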
+static struct pipe_memory_object *
+si_memobj_from_handle(struct pipe_screen *screen,
+ struct winsys_handle *whandle,
+ bool dedicated)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
+ struct pb_buffer *buf = NULL;
+ uint32_t stride, offset;
+
+ if (!memobj)
+ return NULL;
+
+ buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
+ &stride, &offset);
+ if (!buf) {
+ free(memobj);
+ return NULL;
+ }
+
+ memobj->b.dedicated = dedicated;
+ memobj->buf = buf;
+ memobj->stride = stride;
+
+ return (struct pipe_memory_object *)memobj;
+}
+
+static void
+si_memobj_destroy(struct pipe_screen *screen,
+ struct pipe_memory_object *_memobj)
+{
+ struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+
+ pb_reference(&memobj->buf, NULL);
+ free(memobj);
+}
+
+static struct pipe_resource *
+si_texture_from_memobj(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ struct pipe_memory_object *_memobj,
+ uint64_t offset)
+{
+ struct si_screen *sscreen = (struct si_screen*)screen;
+ struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+ struct pipe_resource *tex =
+ si_texture_from_winsys_buffer(sscreen, templ, memobj->buf,
+ memobj->stride, offset,
+ PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE |
+ PIPE_HANDLE_USAGE_SHADER_WRITE,
+ memobj->b.dedicated);
+ if (!tex)
+ return NULL;
+
+ /* si_texture_from_winsys_buffer doesn't increment refcount of
+ * memobj->buf, so increment it here.
+ */
+ struct pb_buffer *buf = NULL;
+ pb_reference(&buf, memobj->buf);
+ return tex;
+}
+
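+/* pipe_screen::check_resource_capability: report whether the resource can be
+ * used with the given bind flags (e.g. PIPE_BIND_LINEAR, PIPE_BIND_SCANOUT).
+ */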
+static bool si_check_resource_capability(struct pipe_screen *screen,
+ struct pipe_resource *resource,
+ unsigned bind)
+{
+ struct si_texture *tex = (struct si_texture*)resource;
+
+ /* Buffers only support the linear flag. */
+ if (resource->target == PIPE_BUFFER)
+ return (bind & ~PIPE_BIND_LINEAR) == 0;
+
+ if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
+ return false;
+
+ if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
+ return false;
+
+ /* TODO: PIPE_BIND_CURSOR - do we care? */
+ return true;
+}
+
+void si_init_screen_texture_functions(struct si_screen *sscreen)
+{
+ sscreen->b.resource_from_handle = si_texture_from_handle;
+ sscreen->b.resource_get_handle = si_texture_get_handle;
+ sscreen->b.resource_from_memobj = si_texture_from_memobj;
+ sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
+ sscreen->b.memobj_destroy = si_memobj_destroy;
+ sscreen->b.check_resource_capability = si_check_resource_capability;
+}
+
+void si_init_context_texture_functions(struct si_context *sctx)
+{
+ sctx->b.create_surface = si_create_surface;
+ sctx->b.surface_destroy = si_surface_destroy;
+}