author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
commit    | 4c58069f5013f0a621503525f7d5193bfe9976b3 (patch)
tree      | bd8f8a08b889e9a8b99c9de01ae12459d527ea6d /lib/mesa/src/gallium/drivers/radeon
parent    | 5caa025e6b62d0456faad86c89f239a14d1eaadb (diff)
Import Mesa 17.1.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeon')
21 files changed, 1930 insertions, 1015 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am index a6fc145cb..2be6af4b1 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am @@ -13,19 +13,14 @@ noinst_LTLIBRARIES = libradeon.la libradeon_la_SOURCES = \ $(C_SOURCES) -if NEED_RADEON_LLVM +if HAVE_GALLIUM_LLVM AM_CFLAGS += \ - $(LLVM_CFLAGS) \ - $(LIBELF_CFLAGS) - -libradeon_la_SOURCES += \ - $(LLVM_C_FILES) + $(LLVM_CFLAGS) libradeon_la_LIBADD = \ $(CLOCK_LIB) \ - $(LLVM_LIBS) \ - $(LIBELF_LIBS) + $(LLVM_LIBS) libradeon_la_LDFLAGS = \ $(LLVM_LDFLAGS) diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources index 3e13dae3c..9dd4e1a88 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources @@ -22,7 +22,3 @@ C_SOURCES := \ radeon_video.c \ radeon_video.h \ radeon_winsys.h - -LLVM_C_FILES := \ - radeon_elf_util.c \ - radeon_elf_util.h diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c index bbab58946..b2289e26f 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c @@ -51,6 +51,8 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; bool busy = false; + assert(!(resource->flags & RADEON_FLAG_SPARSE)); + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { return ctx->ws->buffer_map(resource->buf, NULL, usage); } @@ -159,8 +161,8 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen, } /* Tiled textures are unmappable. Always put them in VRAM. */ - if (res->b.b.target != PIPE_BUFFER && - rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) { + if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) || + res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) { res->domains = RADEON_DOMAIN_VRAM; res->flags &= ~RADEON_FLAG_CPU_ACCESS; res->flags |= RADEON_FLAG_NO_CPU_ACCESS | @@ -170,8 +172,12 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen, /* If VRAM is just stolen system memory, allow both VRAM and * GTT, whichever has free space. If a buffer is evicted from * VRAM to GTT, it will stay there. + * + * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only + * placements even with a low amount of stolen VRAM. */ if (!rscreen->info.has_dedicated_vram && + (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) && res->domains == RADEON_DOMAIN_VRAM) res->domains = RADEON_DOMAIN_VRAM_GTT; @@ -245,6 +251,10 @@ r600_invalidate_buffer(struct r600_common_context *rctx, if (rbuffer->is_shared) return false; + /* Sparse buffers can't be reallocated. */ + if (rbuffer->flags & RADEON_FLAG_SPARSE) + return false; + /* In AMD_pinned_memory, the user pointer association only gets * broken when the buffer is explicitly re-allocated. 
*/ @@ -275,7 +285,6 @@ void r600_invalidate_resource(struct pipe_context *ctx, static void *r600_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource, - unsigned level, unsigned usage, const struct pipe_box *box, struct pipe_transfer **ptransfer, @@ -285,8 +294,9 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx, struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers); - transfer->transfer.resource = resource; - transfer->transfer.level = level; + transfer->transfer.resource = NULL; + pipe_resource_reference(&transfer->transfer.resource, resource); + transfer->transfer.level = 0; transfer->transfer.usage = usage; transfer->transfer.box = *box; transfer->transfer.stride = 0; @@ -317,11 +327,25 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, { struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; - struct r600_resource *rbuffer = r600_resource(resource); - uint8_t *data; + struct r600_resource *rbuffer = r600_resource(resource); + uint8_t *data; assert(box->x + box->width <= resource->width0); + /* From GL_AMD_pinned_memory issues: + * + * 4) Is glMapBuffer on a shared buffer guaranteed to return the + * same system address which was specified at creation time? + * + * RESOLVED: NO. The GL implementation might return a different + * virtual mapping of that memory, although the same physical + * page will be used. + * + * So don't ever use staging buffers. + */ + if (rscreen->ws->buffer_is_user_ptr(rbuffer->buf)) + usage |= PIPE_TRANSFER_PERSISTENT; + /* See if the buffer range being mapped has never been initialized, * in which case it can be mapped unsynchronized. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && @@ -351,26 +375,34 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_PERSISTENT)) && !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && - r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { + ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT)) && + r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) || + (rbuffer->flags & RADEON_FLAG_SPARSE))) { assert(usage & PIPE_TRANSFER_WRITE); - /* Check if mapping this buffer would cause waiting for the GPU. */ - if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || + /* Check if mapping this buffer would cause waiting for the GPU. + */ + if (rbuffer->flags & RADEON_FLAG_SPARSE || + r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. 
*/ unsigned offset; struct r600_resource *staging = NULL; - u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), - 256, &offset, (struct pipe_resource**)&staging, (void**)&data); + u_upload_alloc(ctx->stream_uploader, 0, + box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), + rctx->screen->info.tcc_cache_line_size, + &offset, (struct pipe_resource**)&staging, + (void**)&data); if (staging) { data += box->x % R600_MAP_BUFFER_ALIGNMENT; - return r600_buffer_get_transfer(ctx, resource, level, usage, box, + return r600_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, offset); + } else if (rbuffer->flags & RADEON_FLAG_SPARSE) { + return NULL; } } else { /* At this point, the buffer is always idle (we checked it above). */ @@ -378,11 +410,12 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } } /* Use a staging buffer in cached GTT for reads. */ - else if ((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_PERSISTENT) && - (rbuffer->domains & RADEON_DOMAIN_VRAM || - rbuffer->flags & RADEON_FLAG_GTT_WC) && - r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) { + else if (((usage & PIPE_TRANSFER_READ) && + !(usage & PIPE_TRANSFER_PERSISTENT) && + (rbuffer->domains & RADEON_DOMAIN_VRAM || + rbuffer->flags & RADEON_FLAG_GTT_WC) && + r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) || + (rbuffer->flags & RADEON_FLAG_SPARSE)) { struct r600_resource *staging; staging = (struct r600_resource*) pipe_buffer_create( @@ -402,8 +435,10 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } data += box->x % R600_MAP_BUFFER_ALIGNMENT; - return r600_buffer_get_transfer(ctx, resource, level, usage, box, + return r600_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, 0); + } else if (rbuffer->flags & RADEON_FLAG_SPARSE) { + return NULL; } } @@ -413,7 +448,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } data += box->x; - return r600_buffer_get_transfer(ctx, resource, level, usage, box, + return r600_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, NULL, 0); } @@ -469,6 +504,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *ctx, if (rtransfer->staging) r600_resource_reference(&rtransfer->staging, NULL); + pipe_resource_reference(&transfer->resource, NULL); slab_free(&rctx->pool_transfers, transfer); } @@ -535,6 +571,8 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, if (templ->bind & PIPE_BIND_SHARED) rbuffer->flags |= RADEON_FLAG_HANDLE; + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + rbuffer->flags |= RADEON_FLAG_SPARSE; if (!r600_alloc_resource(rscreen, rbuffer)) { FREE(rbuffer); @@ -544,7 +582,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, } struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen, - unsigned bind, + unsigned flags, unsigned usage, unsigned size, unsigned alignment) @@ -554,9 +592,9 @@ struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen, memset(&buffer, 0, sizeof buffer); buffer.target = PIPE_BUFFER; buffer.format = PIPE_FORMAT_R8_UNORM; - buffer.bind = bind; + buffer.bind = 0; buffer.usage = usage; - buffer.flags = 0; + buffer.flags = flags; buffer.width0 = size; buffer.height0 = 1; buffer.depth0 = 1; @@ -574,6 +612,7 @@ r600_buffer_from_user_memory(struct pipe_screen *screen, struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ); rbuffer->domains = RADEON_DOMAIN_GTT; + rbuffer->flags = 0; 
util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0); /* Convert a user pointer to a buffer. */ @@ -589,5 +628,8 @@ r600_buffer_from_user_memory(struct pipe_screen *screen, else rbuffer->gpu_address = 0; + rbuffer->vram_usage = 0; + rbuffer->gart_usage = templ->width0; + return &rbuffer->b.b; } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c b/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c index a653834b3..3b45545b7 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_gpu_load.c @@ -35,6 +35,7 @@ */ #include "r600_pipe_common.h" +#include "r600_query.h" #include "os/os_time.h" /* For good accuracy at 1000 fps or lower. This will be inaccurate for higher @@ -42,17 +43,97 @@ #define SAMPLES_PER_SEC 10000 #define GRBM_STATUS 0x8010 +#define TA_BUSY(x) (((x) >> 14) & 0x1) +#define GDS_BUSY(x) (((x) >> 15) & 0x1) +#define VGT_BUSY(x) (((x) >> 17) & 0x1) +#define IA_BUSY(x) (((x) >> 19) & 0x1) +#define SX_BUSY(x) (((x) >> 20) & 0x1) +#define WD_BUSY(x) (((x) >> 21) & 0x1) +#define SPI_BUSY(x) (((x) >> 22) & 0x1) +#define BCI_BUSY(x) (((x) >> 23) & 0x1) +#define SC_BUSY(x) (((x) >> 24) & 0x1) +#define PA_BUSY(x) (((x) >> 25) & 0x1) +#define DB_BUSY(x) (((x) >> 26) & 0x1) +#define CP_BUSY(x) (((x) >> 29) & 0x1) +#define CB_BUSY(x) (((x) >> 30) & 0x1) #define GUI_ACTIVE(x) (((x) >> 31) & 0x1) -static bool r600_is_gpu_busy(struct r600_common_screen *rscreen) +#define SRBM_STATUS2 0x0e4c +#define SDMA_BUSY(x) (((x) >> 5) & 0x1) + +#define CP_STAT 0x8680 +#define PFP_BUSY(x) (((x) >> 15) & 0x1) +#define MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define ME_BUSY(x) (((x) >> 17) & 0x1) +#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define DMA_BUSY(x) (((x) >> 22) & 0x1) +#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) +#define CE_BUSY(x) (((x) >> 26) & 0x1) + +#define IDENTITY(x) x + +#define UPDATE_COUNTER(field, mask) \ + do { \ + if (mask(value)) \ + p_atomic_inc(&counters->named.field.busy); \ + else \ + p_atomic_inc(&counters->named.field.idle); \ + } while (0) + +static void r600_update_mmio_counters(struct r600_common_screen *rscreen, + union r600_mmio_counters *counters) { uint32_t value = 0; + bool gui_busy, sdma_busy = false; + /* GRBM_STATUS */ rscreen->ws->read_registers(rscreen->ws, GRBM_STATUS, 1, &value); - return GUI_ACTIVE(value); + + UPDATE_COUNTER(ta, TA_BUSY); + UPDATE_COUNTER(gds, GDS_BUSY); + UPDATE_COUNTER(vgt, VGT_BUSY); + UPDATE_COUNTER(ia, IA_BUSY); + UPDATE_COUNTER(sx, SX_BUSY); + UPDATE_COUNTER(wd, WD_BUSY); + UPDATE_COUNTER(spi, SPI_BUSY); + UPDATE_COUNTER(bci, BCI_BUSY); + UPDATE_COUNTER(sc, SC_BUSY); + UPDATE_COUNTER(pa, PA_BUSY); + UPDATE_COUNTER(db, DB_BUSY); + UPDATE_COUNTER(cp, CP_BUSY); + UPDATE_COUNTER(cb, CB_BUSY); + UPDATE_COUNTER(gui, GUI_ACTIVE); + gui_busy = GUI_ACTIVE(value); + + if (rscreen->chip_class >= CIK) { + /* SRBM_STATUS2 */ + rscreen->ws->read_registers(rscreen->ws, SRBM_STATUS2, 1, &value); + + UPDATE_COUNTER(sdma, SDMA_BUSY); + sdma_busy = SDMA_BUSY(value); + } + + if (rscreen->chip_class >= VI) { + /* CP_STAT */ + rscreen->ws->read_registers(rscreen->ws, CP_STAT, 1, &value); + + UPDATE_COUNTER(pfp, PFP_BUSY); + UPDATE_COUNTER(meq, MEQ_BUSY); + UPDATE_COUNTER(me, ME_BUSY); + UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); + UPDATE_COUNTER(dma, DMA_BUSY); + UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); + UPDATE_COUNTER(ce, CE_BUSY); + } + + value = gui_busy || sdma_busy; + UPDATE_COUNTER(gpu, IDENTITY); } -static PIPE_THREAD_ROUTINE(r600_gpu_load_thread, param) 
+#undef UPDATE_COUNTER + +static int +r600_gpu_load_thread(void *param) { struct r600_common_screen *rscreen = (struct r600_common_screen*)param; const int period_us = 1000000 / SAMPLES_PER_SEC; @@ -77,10 +158,7 @@ static PIPE_THREAD_ROUTINE(r600_gpu_load_thread, param) last_time = cur_time; /* Update the counters. */ - if (r600_is_gpu_busy(rscreen)) - p_atomic_inc(&rscreen->gpu_load_counter_busy); - else - p_atomic_inc(&rscreen->gpu_load_counter_idle); + r600_update_mmio_counters(rscreen, &rscreen->mmio_counters); } p_atomic_dec(&rscreen->gpu_load_stop_thread); return 0; @@ -92,50 +170,118 @@ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen) return; p_atomic_inc(&rscreen->gpu_load_stop_thread); - pipe_thread_wait(rscreen->gpu_load_thread); + thrd_join(rscreen->gpu_load_thread, NULL); rscreen->gpu_load_thread = 0; } -static uint64_t r600_gpu_load_read_counter(struct r600_common_screen *rscreen) +static uint64_t r600_read_mmio_counter(struct r600_common_screen *rscreen, + unsigned busy_index) { /* Start the thread if needed. */ if (!rscreen->gpu_load_thread) { - pipe_mutex_lock(rscreen->gpu_load_mutex); + mtx_lock(&rscreen->gpu_load_mutex); /* Check again inside the mutex. */ if (!rscreen->gpu_load_thread) rscreen->gpu_load_thread = - pipe_thread_create(r600_gpu_load_thread, rscreen); - pipe_mutex_unlock(rscreen->gpu_load_mutex); + u_thread_create(r600_gpu_load_thread, rscreen); + mtx_unlock(&rscreen->gpu_load_mutex); } - /* The busy counter is in the lower 32 bits. - * The idle counter is in the upper 32 bits. */ - return p_atomic_read(&rscreen->gpu_load_counter_busy) | - ((uint64_t)p_atomic_read(&rscreen->gpu_load_counter_idle) << 32); -} + unsigned busy = p_atomic_read(&rscreen->mmio_counters.array[busy_index]); + unsigned idle = p_atomic_read(&rscreen->mmio_counters.array[busy_index + 1]); -/** - * Just return the counters. - */ -uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen) -{ - return r600_gpu_load_read_counter(rscreen); + return busy | ((uint64_t)idle << 32); } -unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin) +static unsigned r600_end_mmio_counter(struct r600_common_screen *rscreen, + uint64_t begin, unsigned busy_index) { - uint64_t end = r600_gpu_load_read_counter(rscreen); + uint64_t end = r600_read_mmio_counter(rscreen, busy_index); unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); unsigned idle = (end >> 32) - (begin >> 32); - /* Calculate the GPU load. + /* Calculate the % of time the busy counter was being incremented. * - * If no counters have been incremented, return the current load. + * If no counters were incremented, return the current counter status. * It's for the case when the load is queried faster than * the counters are updated. */ - if (idle || busy) + if (idle || busy) { return busy*100 / (busy + idle); - else - return r600_is_gpu_busy(rscreen) ? 100 : 0; + } else { + union r600_mmio_counters counters; + + memset(&counters, 0, sizeof(counters)); + r600_update_mmio_counters(rscreen, &counters); + return counters.array[busy_index] ? 
100 : 0; + } +} + +#define BUSY_INDEX(rscreen, field) (&rscreen->mmio_counters.named.field.busy - \ + rscreen->mmio_counters.array) + +static unsigned busy_index_from_type(struct r600_common_screen *rscreen, + unsigned type) +{ + switch (type) { + case R600_QUERY_GPU_LOAD: + return BUSY_INDEX(rscreen, gpu); + case R600_QUERY_GPU_SHADERS_BUSY: + return BUSY_INDEX(rscreen, spi); + case R600_QUERY_GPU_TA_BUSY: + return BUSY_INDEX(rscreen, ta); + case R600_QUERY_GPU_GDS_BUSY: + return BUSY_INDEX(rscreen, gds); + case R600_QUERY_GPU_VGT_BUSY: + return BUSY_INDEX(rscreen, vgt); + case R600_QUERY_GPU_IA_BUSY: + return BUSY_INDEX(rscreen, ia); + case R600_QUERY_GPU_SX_BUSY: + return BUSY_INDEX(rscreen, sx); + case R600_QUERY_GPU_WD_BUSY: + return BUSY_INDEX(rscreen, wd); + case R600_QUERY_GPU_BCI_BUSY: + return BUSY_INDEX(rscreen, bci); + case R600_QUERY_GPU_SC_BUSY: + return BUSY_INDEX(rscreen, sc); + case R600_QUERY_GPU_PA_BUSY: + return BUSY_INDEX(rscreen, pa); + case R600_QUERY_GPU_DB_BUSY: + return BUSY_INDEX(rscreen, db); + case R600_QUERY_GPU_CP_BUSY: + return BUSY_INDEX(rscreen, cp); + case R600_QUERY_GPU_CB_BUSY: + return BUSY_INDEX(rscreen, cb); + case R600_QUERY_GPU_SDMA_BUSY: + return BUSY_INDEX(rscreen, sdma); + case R600_QUERY_GPU_PFP_BUSY: + return BUSY_INDEX(rscreen, pfp); + case R600_QUERY_GPU_MEQ_BUSY: + return BUSY_INDEX(rscreen, meq); + case R600_QUERY_GPU_ME_BUSY: + return BUSY_INDEX(rscreen, me); + case R600_QUERY_GPU_SURF_SYNC_BUSY: + return BUSY_INDEX(rscreen, surf_sync); + case R600_QUERY_GPU_DMA_BUSY: + return BUSY_INDEX(rscreen, dma); + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + return BUSY_INDEX(rscreen, scratch_ram); + case R600_QUERY_GPU_CE_BUSY: + return BUSY_INDEX(rscreen, ce); + default: + unreachable("invalid query type"); + } +} + +uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type) +{ + unsigned busy_index = busy_index_from_type(rscreen, type); + return r600_read_mmio_counter(rscreen, busy_index); +} + +unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type, + uint64_t begin) +{ + unsigned busy_index = busy_index_from_type(rscreen, type); + return r600_end_mmio_counter(rscreen, begin, busy_index); } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c index 0c55fc2a2..48f609bcb 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c @@ -99,7 +99,7 @@ struct r600_query_pc { struct r600_pc_group *groups; }; -static void r600_pc_query_destroy(struct r600_common_context *ctx, +static void r600_pc_query_destroy(struct r600_common_screen *rscreen, struct r600_query *rquery) { struct r600_query_pc *query = (struct r600_query_pc *)rquery; @@ -112,10 +112,10 @@ static void r600_pc_query_destroy(struct r600_common_context *ctx, FREE(query->counters); - r600_query_hw_destroy(ctx, rquery); + r600_query_hw_destroy(rscreen, rquery); } -static bool r600_pc_query_prepare_buffer(struct r600_common_context *ctx, +static bool r600_pc_query_prepare_buffer(struct r600_common_screen *screen, struct r600_query_hw *hwquery, struct r600_resource *buffer) { @@ -196,7 +196,7 @@ static void r600_pc_query_clear_result(struct r600_query_hw *hwquery, memset(result, 0, sizeof(result->batch[0]) * query->num_counters); } -static void r600_pc_query_add_result(struct r600_common_context *ctx, +static void r600_pc_query_add_result(struct r600_common_screen *rscreen, struct r600_query_hw *hwquery, void *buffer, 
union pipe_query_result *result) @@ -301,8 +301,8 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, unsigned num_queries, unsigned *query_types) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_common_screen *screen = rctx->screen; + struct r600_common_screen *screen = + (struct r600_common_screen *)ctx->screen; struct r600_perfcounters *pc = screen->perfcounters; struct r600_perfcounter_block *block; struct r600_pc_group *group; @@ -365,7 +365,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, unsigned instances = 1; if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) - instances = rctx->screen->info.max_se; + instances = screen->info.max_se; if (group->instance < 0) instances *= block->num_instances; @@ -417,13 +417,13 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, counter->qwords *= block->num_instances; } - if (!r600_query_hw_init(rctx, &query->b)) + if (!r600_query_hw_init(screen, &query->b)) goto error; return (struct pipe_query *)query; error: - r600_pc_query_destroy(rctx, &query->b.b); + r600_pc_query_destroy(screen, &query->b.b); return NULL; } @@ -545,7 +545,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen, info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index; info->max_value.u64 = 0; info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; - info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; info->group_id = base_gid + sub / block->num_selectors; info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups) diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c index f62bbf2e0..2019ecdd5 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c @@ -43,6 +43,14 @@ #define HAVE_LLVM 0 #endif +#if HAVE_LLVM +#include <llvm-c/TargetMachine.h> +#endif + +#ifndef MESA_LLVM_VERSION_PATCH +#define MESA_LLVM_VERSION_PATCH 0 +#endif + struct r600_multi_fence { struct pipe_reference reference; struct pipe_fence_handle *gfx; @@ -58,12 +66,12 @@ struct r600_multi_fence { /* * shader binary helpers. */ -void radeon_shader_binary_init(struct radeon_shader_binary *b) +void radeon_shader_binary_init(struct ac_shader_binary *b) { memset(b, 0, sizeof(*b)); } -void radeon_shader_binary_clean(struct radeon_shader_binary *b) +void radeon_shader_binary_clean(struct ac_shader_binary *b) { if (!b) return; @@ -80,35 +88,63 @@ void radeon_shader_binary_clean(struct radeon_shader_binary *b) * pipe_context */ -void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf, - uint64_t va, uint32_t old_value, uint32_t new_value) +/** + * Write an EOP event. + * + * \param event EVENT_TYPE_* + * \param event_flags Optional cache flush flags (TC) + * \param data_sel 1 = fence, 3 = timestamp + * \param buf Buffer + * \param va GPU address + * \param old_value Previous fence value (for a bug workaround) + * \param new_value Fence value to write for this event. 
+ */ +void r600_gfx_write_event_eop(struct r600_common_context *ctx, + unsigned event, unsigned event_flags, + unsigned data_sel, + struct r600_resource *buf, uint64_t va, + uint32_t old_fence, uint32_t new_fence) { struct radeon_winsys_cs *cs = ctx->gfx.cs; + unsigned op = EVENT_TYPE(event) | + EVENT_INDEX(5) | + event_flags; + + if (ctx->chip_class >= GFX9) { + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0)); + radeon_emit(cs, op); + radeon_emit(cs, EOP_DATA_SEL(data_sel)); + radeon_emit(cs, va); /* address lo */ + radeon_emit(cs, va >> 32); /* address hi */ + radeon_emit(cs, new_fence); /* immediate data lo */ + radeon_emit(cs, 0); /* immediate data hi */ + radeon_emit(cs, 0); /* unused */ + } else { + if (ctx->chip_class == CIK || + ctx->chip_class == VI) { + /* Two EOP events are required to make all engines go idle + * (and optional cache flushes executed) before the timestamp + * is written. + */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel)); + radeon_emit(cs, old_fence); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + } - if (ctx->chip_class == CIK || - ctx->chip_class == VI) { - /* Two EOP events are required to make all engines go idle - * (and optional cache flushes executed) before the timestamp - * is written. - */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); + radeon_emit(cs, op); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, old_value); /* immediate data */ + radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel)); + radeon_emit(cs, new_fence); /* immediate data */ radeon_emit(cs, 0); /* unused */ } - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, new_value); /* immediate data */ - radeon_emit(cs, 0); /* unused */ - - r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + if (buf) + r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); } unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen) @@ -172,7 +208,9 @@ void r600_draw_rectangle(struct blitter_context *blitter, /* Upload vertices. The hw rectangle has only 3 vertices, * I guess the 4th one is derived from the first 3. * The vertex specification should match u_blitter's vertex element state. */ - u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb); + u_upload_alloc(rctx->b.stream_uploader, 0, sizeof(float) * 24, + rctx->screen->info.tcc_cache_line_size, + &offset, &buf, (void**)&vb); if (!buf) return; @@ -203,10 +241,26 @@ void r600_draw_rectangle(struct blitter_context *blitter, pipe_resource_reference(&buf, NULL); } +static void r600_dma_emit_wait_idle(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->dma.cs; + + /* NOP waits for idle on Evergreen and later. */ + if (rctx->chip_class >= CIK) + radeon_emit(cs, 0x00000000); /* NOP */ + else if (rctx->chip_class >= EVERGREEN) + radeon_emit(cs, 0xf0000000); /* NOP */ + else { + /* TODO: R600-R700 should use the FENCE packet. + * CS checker support is required. 
*/ + } +} + void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { - uint64_t vram = 0, gtt = 0; + uint64_t vram = ctx->dma.cs->used_vram; + uint64_t gtt = ctx->dma.cs->used_gart; if (dst) { vram += dst->vram_usage; @@ -229,13 +283,35 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, /* Flush if there's not enough space, or if the memory usage per IB * is too large. + * + * IBs using too little memory are limited by the IB submission overhead. + * IBs using too much memory are limited by the kernel/TTM overhead. + * Too long IBs create CPU-GPU pipeline bubbles and add latency. + * + * This heuristic makes sure that DMA requests are executed + * very soon after the call is made and lowers memory usage. + * It improves texture upload performance by keeping the DMA + * engine busy while uploads are being submitted. */ + num_dw++; /* for emit_wait_idle below */ if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) || + ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 || !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) { ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); } + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. + */ + if ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf, + RADEON_USAGE_WRITE))) + r600_dma_emit_wait_idle(ctx); + /* If GPUVM is not supported, the CS checker needs 2 entries * in the buffer list per packet, which has to be done manually. */ @@ -249,44 +325,9 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); } -} - -/* This is required to prevent read-after-write hazards. */ -void r600_dma_emit_wait_idle(struct r600_common_context *rctx) -{ - struct radeon_winsys_cs *cs = rctx->dma.cs; - - /* done at the end of DMA calls, so increment this. */ - rctx->num_dma_calls++; - - /* IBs using too little memory are limited by the IB submission overhead. - * IBs using too much memory are limited by the kernel/TTM overhead. - * Too long IBs create CPU-GPU pipeline bubbles and add latency. - * - * This heuristic makes sure that DMA requests are executed - * very soon after the call is made and lowers memory usage. - * It improves texture upload performance by keeping the DMA - * engine busy while uploads are being submitted. - */ - if (cs->used_vram + cs->used_gart > 64 * 1024 * 1024) { - rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); - return; - } - - r600_need_dma_space(rctx, 1, NULL, NULL); - - if (!radeon_emitted(cs, 0)) /* empty queue */ - return; - /* NOP waits for idle on Evergreen and later. */ - if (rctx->chip_class >= CIK) - radeon_emit(cs, 0x00000000); /* NOP */ - else if (rctx->chip_class >= EVERGREEN) - radeon_emit(cs, 0xf0000000); /* NOP */ - else { - /* TODO: R600-R700 should use the FENCE packet. - * CS checker support is required. */ - } + /* this function is called before all DMA calls, so increment this. 
*/ + ctx->num_dma_calls++; } static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) @@ -325,24 +366,22 @@ static void r600_flush_from_st(struct pipe_context *ctx, struct pipe_screen *screen = ctx->screen; struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct radeon_winsys *ws = rctx->ws; - unsigned rflags = 0; struct pipe_fence_handle *gfx_fence = NULL; struct pipe_fence_handle *sdma_fence = NULL; bool deferred_fence = false; + unsigned rflags = RADEON_FLUSH_ASYNC; if (flags & PIPE_FLUSH_END_OF_FRAME) rflags |= RADEON_FLUSH_END_OF_FRAME; - if (flags & PIPE_FLUSH_DEFERRED) - rflags |= RADEON_FLUSH_ASYNC; - if (rctx->dma.cs) { + /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ + if (rctx->dma.cs) rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); - } if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) { if (fence) ws->fence_reference(&gfx_fence, rctx->last_gfx_fence); - if (!(rflags & RADEON_FLUSH_ASYNC)) + if (!(flags & PIPE_FLUSH_DEFERRED)) ws->cs_sync_flush(rctx->gfx.cs); } else { /* Instead of flushing, create a deferred fence. Constraints: @@ -378,6 +417,12 @@ static void r600_flush_from_st(struct pipe_context *ctx, screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle*)multi_fence; } + + if (!(flags & PIPE_FLUSH_DEFERRED)) { + if (rctx->dma.cs) + ws->cs_sync_flush(rctx->dma.cs); + ws->cs_sync_flush(rctx->gfx.cs); + } } static void r600_flush_dma_ring(void *ctx, unsigned flags, @@ -516,6 +561,50 @@ bool r600_check_device_reset(struct r600_common_context *rctx) return true; } +static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx, + struct pipe_resource *dst, + uint64_t offset, uint64_t size, + unsigned value) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE); +} + +static bool r600_resource_commit(struct pipe_context *pctx, + struct pipe_resource *resource, + unsigned level, struct pipe_box *box, + bool commit) +{ + struct r600_common_context *ctx = (struct r600_common_context *)pctx; + struct r600_resource *res = r600_resource(resource); + + /* + * Since buffer commitment changes cannot be pipelined, we need to + * (a) flush any pending commands that refer to the buffer we're about + * to change, and + * (b) wait for threaded submit to finish, including those that were + * triggered by some other, earlier operation. 
+ */ + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, + res->buf, RADEON_USAGE_READWRITE)) { + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + } + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, + res->buf, RADEON_USAGE_READWRITE)) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + } + + ctx->ws->cs_sync_flush(ctx->dma.cs); + ctx->ws->cs_sync_flush(ctx->gfx.cs); + + assert(resource->target == PIPE_BUFFER); + + return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); +} + bool r600_common_context_init(struct r600_common_context *rctx, struct r600_common_screen *rscreen, unsigned context_flags) @@ -527,14 +616,8 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->family = rscreen->family; rctx->chip_class = rscreen->chip_class; - if (rscreen->chip_class >= CIK) - rctx->max_db = MAX2(8, rscreen->info.num_render_backends); - else if (rscreen->chip_class >= EVERGREEN) - rctx->max_db = 8; - else - rctx->max_db = 4; - rctx->b.invalidate_resource = r600_invalidate_resource; + rctx->b.resource_commit = r600_resource_commit; rctx->b.transfer_map = u_transfer_map_vtbl; rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; rctx->b.transfer_unmap = u_transfer_unmap_vtbl; @@ -542,6 +625,7 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->b.memory_barrier = r600_memory_barrier; rctx->b.flush = r600_flush_from_st; rctx->b.set_debug_callback = r600_set_debug_callback; + rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback; /* evergreen_compute.c has a special codepath for global buffers. * Everything else can use the direct path. @@ -569,14 +653,18 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->allocator_zeroed_memory = u_suballocator_create(&rctx->b, rscreen->info.gart_page_size, - 0, PIPE_USAGE_DEFAULT, true); + 0, PIPE_USAGE_DEFAULT, 0, true); if (!rctx->allocator_zeroed_memory) return false; - rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, - PIPE_BIND_INDEX_BUFFER | - PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM); - if (!rctx->uploader) + rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024, + 0, PIPE_USAGE_STREAM); + if (!rctx->b.stream_uploader) + return false; + + rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024, + 0, PIPE_USAGE_DEFAULT); + if (!rctx->b.const_uploader) return false; rctx->ctx = rctx->ws->ctx_create(rctx->ws); @@ -619,9 +707,10 @@ void r600_common_context_cleanup(struct r600_common_context *rctx) if (rctx->ctx) rctx->ws->ctx_destroy(rctx->ctx); - if (rctx->uploader) { - u_upload_destroy(rctx->uploader); - } + if (rctx->b.stream_uploader) + u_upload_destroy(rctx->b.stream_uploader); + if (rctx->b.const_uploader) + u_upload_destroy(rctx->b.const_uploader); slab_destroy_child(&rctx->pool_transfers); @@ -656,8 +745,12 @@ static const struct debug_named_value common_debug_options[] = { { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"}, { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" }, { "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" }, + { "nooptvariant", DBG_NO_OPT_VARIANT, "Disable compiling optimized shader variants." }, { "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." }, + { "testvmfaultcp", DBG_TEST_VMFAULT_CP, "Invoke a CP VM fault test and exit." }, + { "testvmfaultsdma", DBG_TEST_VMFAULT_SDMA, "Invoke a SDMA VM fault test and exit." 
}, + { "testvmfaultshader", DBG_TEST_VMFAULT_SHADER, "Invoke a shader VM fault test and exit." }, /* features */ { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" }, @@ -673,7 +766,7 @@ static const struct debug_named_value common_debug_options[] = { { "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." }, { "nodcc", DBG_NO_DCC, "Disable DCC." }, { "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." }, - { "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." }, + { "norbplus", DBG_NO_RB_PLUS, "Disable RB+." }, { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." }, { "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" }, { "noce", DBG_NO_CE, "Disable the constant engine"}, @@ -737,11 +830,54 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen) case CHIP_FIJI: return "AMD FIJI"; case CHIP_POLARIS10: return "AMD POLARIS10"; case CHIP_POLARIS11: return "AMD POLARIS11"; + case CHIP_POLARIS12: return "AMD POLARIS12"; case CHIP_STONEY: return "AMD STONEY"; + case CHIP_VEGA10: return "AMD VEGA10"; + case CHIP_RAVEN: return "AMD RAVEN"; default: return "AMD unknown"; } } +static void r600_disk_cache_create(struct r600_common_screen *rscreen) +{ + /* Don't use the cache if shader dumping is enabled. */ + if (rscreen->debug_flags & + (DBG_FS | DBG_VS | DBG_TCS | DBG_TES | DBG_GS | DBG_PS | DBG_CS)) + return; + + uint32_t mesa_timestamp; + if (disk_cache_get_function_timestamp(r600_disk_cache_create, + &mesa_timestamp)) { + char *timestamp_str; + int res = -1; + if (rscreen->chip_class < SI) { + res = asprintf(×tamp_str, "%u",mesa_timestamp); + } +#if HAVE_LLVM + else { + uint32_t llvm_timestamp; + if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, + &llvm_timestamp)) { + res = asprintf(×tamp_str, "%u_%u", + mesa_timestamp, llvm_timestamp); + } + } +#endif + if (res != -1) { + rscreen->disk_shader_cache = + disk_cache_create(r600_get_chip_name(rscreen), + timestamp_str); + free(timestamp_str); + } + } +} + +static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; + return rscreen->disk_shader_cache; +} + static const char* r600_get_name(struct pipe_screen* pscreen) { struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; @@ -861,24 +997,45 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) case CHIP_TONGA: return "tonga"; case CHIP_ICELAND: return "iceland"; case CHIP_CARRIZO: return "carrizo"; -#if HAVE_LLVM <= 0x0307 - case CHIP_FIJI: return "tonga"; - case CHIP_STONEY: return "carrizo"; -#else - case CHIP_FIJI: return "fiji"; - case CHIP_STONEY: return "stoney"; -#endif -#if HAVE_LLVM <= 0x0308 - case CHIP_POLARIS10: return "tonga"; - case CHIP_POLARIS11: return "tonga"; -#else - case CHIP_POLARIS10: return "polaris10"; - case CHIP_POLARIS11: return "polaris11"; -#endif - default: return ""; + case CHIP_FIJI: + return "fiji"; + case CHIP_STONEY: + return "stoney"; + case CHIP_POLARIS10: + return HAVE_LLVM >= 0x0309 ? "polaris10" : "carrizo"; + case CHIP_POLARIS11: + case CHIP_POLARIS12: /* same as polaris11 */ + return HAVE_LLVM >= 0x0309 ? 
"polaris11" : "carrizo"; + case CHIP_VEGA10: + case CHIP_RAVEN: + return "gfx900"; + default: + return ""; } } +static unsigned get_max_threads_per_block(struct r600_common_screen *screen, + enum pipe_shader_ir ir_type) +{ + if (ir_type != PIPE_SHADER_IR_TGSI) + return 256; + + if (HAVE_LLVM < 0x309) + return 256; + + /* Only 16 waves per thread-group on gfx9. */ + if (screen->chip_class >= GFX9) + return 1024; + + /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice + * round number. + */ + if (screen->chip_class >= SI) + return 2048; + + return 256; +} + static int r600_get_compute_param(struct pipe_screen *screen, enum pipe_shader_ir ir_type, enum pipe_compute_cap param, @@ -933,27 +1090,17 @@ static int r600_get_compute_param(struct pipe_screen *screen, case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: if (ret) { uint64_t *block_size = ret; - if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && - ir_type == PIPE_SHADER_IR_TGSI) { - block_size[0] = 2048; - block_size[1] = 2048; - block_size[2] = 2048; - } else { - block_size[0] = 256; - block_size[1] = 256; - block_size[2] = 256; - } + unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type); + block_size[0] = threads_per_block; + block_size[1] = threads_per_block; + block_size[2] = threads_per_block; } return 3 * sizeof(uint64_t); case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: if (ret) { uint64_t *max_threads_per_block = ret; - if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && - ir_type == PIPE_SHADER_IR_TGSI) - *max_threads_per_block = 2048; - else - *max_threads_per_block = 256; + *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type); } return sizeof(uint64_t); case PIPE_COMPUTE_CAP_ADDRESS_BITS: @@ -1186,11 +1333,11 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, snprintf(kernel_version, sizeof(kernel_version), " / %s", uname_data.release); -#if HAVE_LLVM - snprintf(llvm_string, sizeof(llvm_string), - ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, - HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH); -#endif + if (HAVE_LLVM > 0) { + snprintf(llvm_string, sizeof(llvm_string), + ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, + HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH); + } snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), "%s (DRM %i.%i.%i%s%s)", @@ -1201,6 +1348,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, rscreen->b.get_name = r600_get_name; rscreen->b.get_vendor = r600_get_vendor; rscreen->b.get_device_vendor = r600_get_device_vendor; + rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache; rscreen->b.get_compute_param = r600_get_compute_param; rscreen->b.get_paramf = r600_get_paramf; rscreen->b.get_timestamp = r600_get_timestamp; @@ -1225,6 +1373,10 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, rscreen->family = rscreen->info.family; rscreen->chip_class = rscreen->info.chip_class; rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0); + rscreen->has_rbplus = false; + rscreen->rbplus_allowed = false; + + r600_disk_cache_create(rscreen); slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64); @@ -1236,8 +1388,8 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, } util_format_s3tc_init(); - pipe_mutex_init(rscreen->aux_context_lock); - pipe_mutex_init(rscreen->gpu_load_mutex); + (void) mtx_init(&rscreen->aux_context_lock, mtx_plain); + (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain); if (rscreen->debug_flags & DBG_INFO) { printf("pci_id = 
0x%x\n", rscreen->info.pci_id); @@ -1246,6 +1398,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, printf("chip_class = %i\n", rscreen->info.chip_class); printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024)); printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024)); + printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024)); printf("max_alloc_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024)); printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory); @@ -1274,6 +1427,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, printf("num_render_backends = %i\n", rscreen->info.num_render_backends); printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes); printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes); + printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask); } return true; } @@ -1283,12 +1437,13 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen) r600_perfcounters_destroy(rscreen); r600_gpu_load_kill_thread(rscreen); - pipe_mutex_destroy(rscreen->gpu_load_mutex); - pipe_mutex_destroy(rscreen->aux_context_lock); + mtx_destroy(&rscreen->gpu_load_mutex); + mtx_destroy(&rscreen->aux_context_lock); rscreen->aux_context->destroy(rscreen->aux_context); slab_destroy_parent(&rscreen->pool_transfers); + disk_cache_destroy(rscreen->disk_shader_cache); rscreen->ws->destroy(rscreen->ws); FREE(rscreen); } @@ -1321,13 +1476,12 @@ bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned proce } void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value, - enum r600_coherency coher) + uint64_t offset, uint64_t size, unsigned value) { struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context; - pipe_mutex_lock(rscreen->aux_context_lock); - rctx->clear_buffer(&rctx->b, dst, offset, size, value, coher); + mtx_lock(&rscreen->aux_context_lock); + rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value); rscreen->aux_context->flush(rscreen->aux_context, NULL, 0); - pipe_mutex_unlock(rscreen->aux_context_lock); + mtx_unlock(&rscreen->aux_context_lock); } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h index 86772c0af..bd542e500 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h @@ -34,8 +34,11 @@ #include <stdio.h> +#include "amd/common/ac_binary.h" + #include "radeon/radeon_winsys.h" +#include "util/disk_cache.h" #include "util/u_blitter.h" #include "util/list.h" #include "util/u_range.h" @@ -49,6 +52,7 @@ #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) #define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) +#define R600_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) /* Pipeline & streamout query controls. */ @@ -79,6 +83,7 @@ #define DBG_NO_ASM (1 << 14) #define DBG_PREOPT_IR (1 << 15) #define DBG_CHECK_IR (1 << 16) +#define DBG_NO_OPT_VARIANT (1 << 17) /* gaps */ #define DBG_TEST_DMA (1 << 20) /* Bits 21-31 are reserved for the r600g driver. 
*/ @@ -102,6 +107,9 @@ #define DBG_NO_CE (1llu << 48) #define DBG_UNSAFE_MATH (1llu << 49) #define DBG_NO_DCC_FB (1llu << 50) +#define DBG_TEST_VMFAULT_CP (1llu << 51) +#define DBG_TEST_VMFAULT_SDMA (1llu << 52) +#define DBG_TEST_VMFAULT_SHADER (1llu << 53) #define R600_MAP_BUFFER_ALIGNMENT 64 #define R600_MAX_VIEWPORTS 16 @@ -125,45 +133,8 @@ struct r600_perfcounters; struct tgsi_shader_info; struct r600_qbo_state; -struct radeon_shader_reloc { - char name[32]; - uint64_t offset; -}; - -struct radeon_shader_binary { - /** Shader code */ - unsigned char *code; - unsigned code_size; - - /** Config/Context register state that accompanies this shader. - * This is a stream of dword pairs. First dword contains the - * register address, the second dword contains the value.*/ - unsigned char *config; - unsigned config_size; - - /** The number of bytes of config information for each global symbol. - */ - unsigned config_size_per_symbol; - - /** Constant data accessed by the shader. This will be uploaded - * into a constant buffer. */ - unsigned char *rodata; - unsigned rodata_size; - - /** List of symbol offsets for the shader */ - uint64_t *global_symbol_offsets; - unsigned global_symbol_count; - - struct radeon_shader_reloc *relocs; - unsigned reloc_count; - - /** Disassembled shader in a string. */ - char *disasm_string; - char *llvm_ir_string; -}; - -void radeon_shader_binary_init(struct radeon_shader_binary *b); -void radeon_shader_binary_clean(struct radeon_shader_binary *b); +void radeon_shader_binary_init(struct ac_shader_binary *b); +void radeon_shader_binary_clean(struct ac_shader_binary *b); /* Only 32-bit buffer allocations are supported, gallium doesn't support more * at the moment. @@ -232,20 +203,8 @@ struct r600_cmask_info { uint64_t offset; uint64_t size; unsigned alignment; - unsigned pitch; - unsigned height; - unsigned xalign; - unsigned yalign; unsigned slice_tile_max; - unsigned base_address_reg; -}; - -struct r600_htile_info { - unsigned pitch; - unsigned height; - unsigned xalign; - unsigned yalign; - unsigned alignment; + uint64_t base_address_reg; }; struct r600_texture { @@ -273,7 +232,6 @@ struct r600_texture { unsigned last_msaa_resolve_target_micro_mode; /* Depth buffer compression and fast clear. */ - struct r600_htile_info htile; struct r600_resource *htile_buffer; bool tc_compatible_htile; bool depth_cleared; /* if it was cleared at least once */ @@ -319,7 +277,10 @@ struct r600_texture { struct r600_surface { struct pipe_surface base; - const struct radeon_surf_level *level_info; + + /* These can vary with block-compressed textures. */ + unsigned width0; + unsigned height0; bool color_initialized; bool depth_initialized; @@ -329,6 +290,7 @@ struct r600_surface { bool export_16bpc; bool color_is_int8; bool color_is_int10; + bool dcc_incompatible; /* Color registers. */ unsigned cb_color_info; @@ -339,6 +301,7 @@ struct r600_surface { unsigned cb_color_pitch; /* EG and later */ unsigned cb_color_slice; /* EG and later */ unsigned cb_color_attrib; /* EG and later */ + unsigned cb_color_attrib2; /* GFX9 and later */ unsigned cb_dcc_control; /* VI and later */ unsigned cb_color_fmask; /* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */ unsigned cb_color_fmask_slice; /* EG and later */ @@ -352,20 +315,63 @@ struct r600_surface { struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */ /* DB registers. 
*/ + uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE (EG and later) or DB_DEPTH_BASE (r600) */ + uint64_t db_stencil_base; /* EG and later */ + uint64_t db_htile_data_base; unsigned db_depth_info; /* R600 only, then SI and later */ unsigned db_z_info; /* EG and later */ - unsigned db_depth_base; /* DB_Z_READ/WRITE_BASE (EG and later) or DB_DEPTH_BASE (r600) */ + unsigned db_z_info2; /* GFX9+ */ unsigned db_depth_view; unsigned db_depth_size; unsigned db_depth_slice; /* EG and later */ - unsigned db_stencil_base; /* EG and later */ unsigned db_stencil_info; /* EG and later */ + unsigned db_stencil_info2; /* GFX9+ */ unsigned db_prefetch_limit; /* R600 only */ unsigned db_htile_surface; - unsigned db_htile_data_base; unsigned db_preload_control; /* EG and later */ }; +struct r600_mmio_counter { + unsigned busy; + unsigned idle; +}; + +union r600_mmio_counters { + struct { + /* For global GPU load including SDMA. */ + struct r600_mmio_counter gpu; + + /* GRBM_STATUS */ + struct r600_mmio_counter spi; + struct r600_mmio_counter gui; + struct r600_mmio_counter ta; + struct r600_mmio_counter gds; + struct r600_mmio_counter vgt; + struct r600_mmio_counter ia; + struct r600_mmio_counter sx; + struct r600_mmio_counter wd; + struct r600_mmio_counter bci; + struct r600_mmio_counter sc; + struct r600_mmio_counter pa; + struct r600_mmio_counter db; + struct r600_mmio_counter cp; + struct r600_mmio_counter cb; + + /* SRBM_STATUS2 */ + struct r600_mmio_counter sdma; + + /* CP_STAT */ + struct r600_mmio_counter pfp; + struct r600_mmio_counter meq; + struct r600_mmio_counter me; + struct r600_mmio_counter surf_sync; + struct r600_mmio_counter dma; + struct r600_mmio_counter scratch_ram; + struct r600_mmio_counter ce; + } named; + unsigned array[0]; +}; + struct r600_common_screen { struct pipe_screen b; struct radeon_winsys *ws; @@ -375,6 +381,10 @@ struct r600_common_screen { uint64_t debug_flags; bool has_cp_dma; bool has_streamout; + bool has_rbplus; /* if RB+ registers exist */ + bool rbplus_allowed; /* if RB+ is allowed */ + + struct disk_cache *disk_shader_cache; struct slab_parent_pool pool_transfers; @@ -384,7 +394,7 @@ struct r600_common_screen { /* Auxiliary context. Mainly used to initialize resources. * It must be locked prior to using and flushed before unlocking. */ struct pipe_context *aux_context; - pipe_mutex aux_context_lock; + mtx_t aux_context_lock; /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. @@ -394,12 +404,12 @@ struct r600_common_screen { * are loading shaders on demand. This is a monotonic counter. */ unsigned num_shaders_created; + unsigned num_shader_cache_hits; /* GPU load thread. */ - pipe_mutex gpu_load_mutex; - pipe_thread gpu_load_thread; - unsigned gpu_load_counter_busy; - unsigned gpu_load_counter_idle; + mtx_t gpu_load_mutex; + thrd_t gpu_load_thread; + union r600_mmio_counters mmio_counters; volatile unsigned gpu_load_stop_thread; /* bool */ char renderer_string[100]; @@ -407,12 +417,14 @@ struct r600_common_screen { /* Performance counters. */ struct r600_perfcounters *perfcounters; - /* If pipe_screen wants to re-emit the framebuffer state of all - * contexts, it should atomically increment this. Each context will - * compare this with its own last known value of the counter before - * drawing and re-emit the framebuffer state accordingly. + /* If pipe_screen wants to recompute and re-emit the framebuffer, + * sampler, and image states of all contexts, it should atomically + * increment this. 
+ * + * Each context will compare this with its own last known value of + * the counter before drawing and re-emit the states accordingly. */ - unsigned dirty_fb_counter; + unsigned dirty_tex_counter; /* Atomically increment this counter when an existing texture's * metadata is enabled or disabled in a way that requires changing @@ -420,12 +432,6 @@ struct r600_common_screen { */ unsigned compressed_colortex_counter; - /* Atomically increment this counter when an existing texture's - * backing buffer or tile mode parameters have changed that requires - * recomputation of shader descriptors. - */ - unsigned dirty_tex_descriptor_counter; - struct { /* Context flags to set so that all writes from earlier jobs * in the CP are seen by L2 clients. @@ -480,7 +486,7 @@ struct r600_streamout { /* External state which comes from the vertex shader, * it must be set explicitly when binding a shader. */ - unsigned *stride_in_dw; + uint16_t *stride_in_dw; unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ @@ -544,11 +550,9 @@ struct r600_common_context { unsigned num_gfx_cs_flushes; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; - unsigned last_dirty_fb_counter; + unsigned last_dirty_tex_counter; unsigned last_compressed_colortex_counter; - unsigned last_dirty_tex_descriptor_counter; - struct u_upload_mgr *uploader; struct u_suballocator *allocator_zeroed_memory; struct slab_child_pool pool_transfers; @@ -574,18 +578,19 @@ struct r600_common_context { int num_perfect_occlusion_queries; struct list_head active_queries; unsigned num_cs_dw_queries_suspend; - /* Additional hardware info. */ - unsigned backend_mask; - unsigned max_db; /* for OQ */ /* Misc stats. */ unsigned num_draw_calls; unsigned num_spill_draw_calls; unsigned num_compute_calls; unsigned num_spill_compute_calls; unsigned num_dma_calls; + unsigned num_cp_dma_calls; unsigned num_vs_flushes; unsigned num_ps_flushes; unsigned num_cs_flushes; + unsigned num_fb_cache_flushes; + unsigned num_L2_invalidates; + unsigned num_L2_writebacks; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ @@ -638,6 +643,9 @@ struct r600_common_context { unsigned src_level, const struct pipe_box *src_box); + void (*dma_clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value); + void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, enum r600_coherency coher); @@ -693,7 +701,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ, unsigned alignment); struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen, - unsigned bind, + unsigned flags, unsigned usage, unsigned size, unsigned alignment); @@ -706,8 +714,11 @@ r600_invalidate_resource(struct pipe_context *ctx, struct pipe_resource *resource); /* r600_common_pipe.c */ -void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf, - uint64_t va, uint32_t old_value, uint32_t new_value); +void r600_gfx_write_event_eop(struct r600_common_context *ctx, + unsigned event, unsigned event_flags, + unsigned data_sel, + struct r600_resource *buf, uint64_t va, + uint32_t old_fence, uint32_t new_fence); unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen); void r600_gfx_wait_fence(struct r600_common_context *ctx, uint64_t va, uint32_t ref, uint32_t mask); @@ -729,14 
+740,12 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen, bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value, - enum r600_coherency coher); + uint64_t offset, uint64_t size, unsigned value); struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ); const char *r600_get_llvm_processor_name(enum radeon_family family); void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); -void r600_dma_emit_wait_idle(struct r600_common_context *rctx); void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, struct radeon_saved_cs *saved); void radeon_clear_saved_cs(struct radeon_saved_cs *saved); @@ -744,8 +753,9 @@ bool r600_check_device_reset(struct r600_common_context *rctx); /* r600_gpu_load.c */ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen); -uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen); -unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); +uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type); +unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type, + uint64_t begin); /* r600_perfcounters.c */ void r600_perfcounters_destroy(struct r600_common_screen *rscreen); @@ -755,7 +765,7 @@ void r600_init_screen_query_functions(struct r600_common_screen *rscreen); void r600_query_init(struct r600_common_context *rctx); void r600_suspend_queries(struct r600_common_context *ctx); void r600_resume_queries(struct r600_common_context *ctx); -void r600_query_init_backend_mask(struct r600_common_context *ctx); +void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen); /* r600_streamout.c */ void r600_streamout_buffers_dirty(struct r600_common_context *rctx); @@ -789,18 +799,23 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, bool r600_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture, struct r600_texture **staging); -void r600_print_texture_info(struct r600_texture *rtex, FILE *f); +void r600_print_texture_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, FILE *f); struct pipe_resource *r600_texture_create(struct pipe_screen *screen, const struct pipe_resource *templ); bool vi_dcc_formats_compatible(enum pipe_format format1, enum pipe_format format2); -void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format); +void vi_disable_dcc_if_incompatible_format(struct r600_common_context *rctx, struct pipe_resource *tex, unsigned level, enum pipe_format view_format); struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, + unsigned width0, unsigned height0, unsigned width, unsigned height); unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap); void vi_separate_dcc_start_query(struct pipe_context *ctx, @@ -951,6 +966,12 @@ r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler) (!stencil_sampler && tex->can_sample_z); } +static inline bool +vi_dcc_enabled(struct r600_texture *tex, unsigned level) +{ + return tex->dcc_offset && level < 
tex->surface.num_dcc_levels; +} + #define COMPUTE_DBG(rscreen, fmt, args...) \ do { \ if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ @@ -966,4 +987,9 @@ r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler) (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) +static inline int S_FIXED(float value, unsigned frac_bits) +{ + return value * (1 << frac_bits); +} + #endif diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.c b/lib/mesa/src/gallium/drivers/radeon/r600_query.c index 4b6767dd3..7764871aa 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.c @@ -26,7 +26,7 @@ #include "r600_cs.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" - +#include "os/os_time.h" #include "tgsi/tgsi_text.h" struct r600_hw_query_params { @@ -43,17 +43,20 @@ struct r600_query_sw { uint64_t begin_result; uint64_t end_result; + + uint64_t begin_time; + uint64_t end_time; + /* Fence for GPU_FINISHED. */ struct pipe_fence_handle *fence; }; -static void r600_query_sw_destroy(struct r600_common_context *rctx, +static void r600_query_sw_destroy(struct r600_common_screen *rscreen, struct r600_query *rquery) { - struct pipe_screen *screen = rctx->b.screen; struct r600_query_sw *query = (struct r600_query_sw *)rquery; - screen->fence_reference(screen, &query->fence, NULL); + rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL); FREE(query); } @@ -65,14 +68,18 @@ static enum radeon_value_id winsys_id_from_type(unsigned type) case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; - case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES; + case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS; + case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS; + case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS; case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; + case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE; case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; + case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME; default: unreachable("query type does not correspond to winsys id"); } } @@ -81,6 +88,7 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, struct r600_query *rquery) { struct r600_query_sw *query = (struct r600_query_sw *)rquery; + enum radeon_value_id ws_id; switch(query->b.type) { case PIPE_QUERY_TIMESTAMP_DISJOINT: @@ -101,6 +109,9 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, case R600_QUERY_DMA_CALLS: query->begin_result = rctx->num_dma_calls; break; + case R600_QUERY_CP_DMA_CALLS: + query->begin_result = rctx->num_cp_dma_calls; + break; case R600_QUERY_NUM_VS_FLUSHES: query->begin_result = rctx->num_vs_flushes; break; @@ -110,28 +121,67 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, case R600_QUERY_NUM_CS_FLUSHES: query->begin_result = rctx->num_cs_flushes; break; + case R600_QUERY_NUM_FB_CACHE_FLUSHES: + query->begin_result = rctx->num_fb_cache_flushes; + 
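/*
 * Aside: the software-query hunks above all follow one pattern: snapshot a
 * monotonically increasing driver counter at begin, snapshot it again at
 * end, and report the delta; the new begin_time/end_time fields extend the
 * same idea so *_BUSY queries can divide the delta by elapsed wall-clock
 * time. A minimal standalone sketch of that pattern, where read_counter()
 * and now_ns() are stand-ins for ws->query_value() and os_time_get_nano(),
 * not driver API:
 */
#include <stdint.h>

struct sw_query {
    uint64_t begin_result, end_result; /* counter snapshots */
    uint64_t begin_time, end_time;     /* wall clock, in ns */
};

static void sw_query_begin(struct sw_query *q,
                           uint64_t (*read_counter)(void),
                           uint64_t (*now_ns)(void))
{
    q->begin_result = read_counter();
    q->begin_time = now_ns();
}

/* Same formula r600_query_sw_get_result() applies for CS_THREAD_BUSY;
 * the caller guarantees a nonzero time interval. */
static uint64_t sw_query_end_busy_percent(struct sw_query *q,
                                          uint64_t (*read_counter)(void),
                                          uint64_t (*now_ns)(void))
{
    q->end_result = read_counter();
    q->end_time = now_ns();
    return (q->end_result - q->begin_result) * 100 /
           (q->end_time - q->begin_time);
}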
break; + case R600_QUERY_NUM_L2_INVALIDATES: + query->begin_result = rctx->num_L2_invalidates; + break; + case R600_QUERY_NUM_L2_WRITEBACKS: + query->begin_result = rctx->num_L2_writebacks; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: case R600_QUERY_MAPPED_VRAM: case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: + case R600_QUERY_VRAM_VIS_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + case R600_QUERY_NUM_MAPPED_BUFFERS: query->begin_result = 0; break; case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CTX_FLUSHES: + case R600_QUERY_NUM_GFX_IBS: + case R600_QUERY_NUM_SDMA_IBS: case R600_QUERY_NUM_BYTES_MOVED: case R600_QUERY_NUM_EVICTIONS: { enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); break; } + case R600_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + query->begin_time = os_time_get_nano(); + break; case R600_QUERY_GPU_LOAD: - query->begin_result = r600_gpu_load_begin(rctx->screen); + case R600_QUERY_GPU_SHADERS_BUSY: + case R600_QUERY_GPU_TA_BUSY: + case R600_QUERY_GPU_GDS_BUSY: + case R600_QUERY_GPU_VGT_BUSY: + case R600_QUERY_GPU_IA_BUSY: + case R600_QUERY_GPU_SX_BUSY: + case R600_QUERY_GPU_WD_BUSY: + case R600_QUERY_GPU_BCI_BUSY: + case R600_QUERY_GPU_SC_BUSY: + case R600_QUERY_GPU_PA_BUSY: + case R600_QUERY_GPU_DB_BUSY: + case R600_QUERY_GPU_CP_BUSY: + case R600_QUERY_GPU_CB_BUSY: + case R600_QUERY_GPU_SDMA_BUSY: + case R600_QUERY_GPU_PFP_BUSY: + case R600_QUERY_GPU_MEQ_BUSY: + case R600_QUERY_GPU_ME_BUSY: + case R600_QUERY_GPU_SURF_SYNC_BUSY: + case R600_QUERY_GPU_DMA_BUSY: + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + case R600_QUERY_GPU_CE_BUSY: + query->begin_result = r600_begin_counter(rctx->screen, + query->b.type); break; case R600_QUERY_NUM_COMPILATIONS: query->begin_result = p_atomic_read(&rctx->screen->num_compilations); @@ -139,6 +189,10 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, case R600_QUERY_NUM_SHADERS_CREATED: query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); break; + case R600_QUERY_NUM_SHADER_CACHE_HITS: + query->begin_result = + p_atomic_read(&rctx->screen->num_shader_cache_hits); + break; case R600_QUERY_GPIN_ASIC_ID: case R600_QUERY_GPIN_NUM_SIMD: case R600_QUERY_GPIN_NUM_RB: @@ -156,6 +210,7 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, struct r600_query *rquery) { struct r600_query_sw *query = (struct r600_query_sw *)rquery; + enum radeon_value_id ws_id; switch(query->b.type) { case PIPE_QUERY_TIMESTAMP_DISJOINT: @@ -178,6 +233,9 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_DMA_CALLS: query->end_result = rctx->num_dma_calls; break; + case R600_QUERY_CP_DMA_CALLS: + query->end_result = rctx->num_cp_dma_calls; + break; case R600_QUERY_NUM_VS_FLUSHES: query->end_result = rctx->num_vs_flushes; break; @@ -187,26 +245,65 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_NUM_CS_FLUSHES: query->end_result = rctx->num_cs_flushes; break; + case R600_QUERY_NUM_FB_CACHE_FLUSHES: + query->end_result = rctx->num_fb_cache_flushes; + break; + case R600_QUERY_NUM_L2_INVALIDATES: + query->end_result = rctx->num_L2_invalidates; + break; + case R600_QUERY_NUM_L2_WRITEBACKS: + query->end_result = 
rctx->num_L2_writebacks; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: case R600_QUERY_MAPPED_VRAM: case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: + case R600_QUERY_VRAM_VIS_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CTX_FLUSHES: + case R600_QUERY_NUM_MAPPED_BUFFERS: + case R600_QUERY_NUM_GFX_IBS: + case R600_QUERY_NUM_SDMA_IBS: case R600_QUERY_NUM_BYTES_MOVED: case R600_QUERY_NUM_EVICTIONS: { enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); query->end_result = rctx->ws->query_value(rctx->ws, ws_id); break; } + case R600_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + query->end_time = os_time_get_nano(); + break; case R600_QUERY_GPU_LOAD: - query->end_result = r600_gpu_load_end(rctx->screen, - query->begin_result); + case R600_QUERY_GPU_SHADERS_BUSY: + case R600_QUERY_GPU_TA_BUSY: + case R600_QUERY_GPU_GDS_BUSY: + case R600_QUERY_GPU_VGT_BUSY: + case R600_QUERY_GPU_IA_BUSY: + case R600_QUERY_GPU_SX_BUSY: + case R600_QUERY_GPU_WD_BUSY: + case R600_QUERY_GPU_BCI_BUSY: + case R600_QUERY_GPU_SC_BUSY: + case R600_QUERY_GPU_PA_BUSY: + case R600_QUERY_GPU_DB_BUSY: + case R600_QUERY_GPU_CP_BUSY: + case R600_QUERY_GPU_CB_BUSY: + case R600_QUERY_GPU_SDMA_BUSY: + case R600_QUERY_GPU_PFP_BUSY: + case R600_QUERY_GPU_MEQ_BUSY: + case R600_QUERY_GPU_ME_BUSY: + case R600_QUERY_GPU_SURF_SYNC_BUSY: + case R600_QUERY_GPU_DMA_BUSY: + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + case R600_QUERY_GPU_CE_BUSY: + query->end_result = r600_end_counter(rctx->screen, + query->b.type, + query->begin_result); query->begin_result = 0; break; case R600_QUERY_NUM_COMPILATIONS: @@ -218,6 +315,10 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: query->end_result = rctx->last_tex_ps_draw_ratio; break; + case R600_QUERY_NUM_SHADER_CACHE_HITS: + query->end_result = + p_atomic_read(&rctx->screen->num_shader_cache_hits); + break; case R600_QUERY_GPIN_ASIC_ID: case R600_QUERY_GPIN_NUM_SIMD: case R600_QUERY_GPIN_NUM_RB: @@ -252,6 +353,10 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx, return result->b; } + case R600_QUERY_CS_THREAD_BUSY: + result->u64 = (query->end_result - query->begin_result) * 100 / + (query->end_time - query->begin_time); + return true; case R600_QUERY_GPIN_ASIC_ID: result->u32 = 0; return true; @@ -294,8 +399,7 @@ static struct r600_query_ops sw_query_ops = { .get_result_resource = NULL }; -static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx, - unsigned query_type) +static struct pipe_query *r600_query_sw_create(unsigned query_type) { struct r600_query_sw *query; @@ -309,7 +413,7 @@ static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx, return (struct pipe_query *)query; } -void r600_query_hw_destroy(struct r600_common_context *rctx, +void r600_query_hw_destroy(struct r600_common_screen *rscreen, struct r600_query *rquery) { struct r600_query_hw *query = (struct r600_query_hw *)rquery; @@ -327,23 +431,23 @@ void r600_query_hw_destroy(struct r600_common_context *rctx, FREE(rquery); } -static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, +static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen, struct r600_query_hw *query) { unsigned buf_size = 
MAX2(query->result_size, - ctx->screen->info.min_alloc_size); + rscreen->info.min_alloc_size); /* Queries are normally read by the CPU after * being written by the gpu, hence staging is probably a good * usage pattern. */ struct r600_resource *buf = (struct r600_resource*) - pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM, + pipe_buffer_create(&rscreen->b, 0, PIPE_USAGE_STAGING, buf_size); if (!buf) return NULL; - if (!query->ops->prepare_buffer(ctx, query, buf)) { + if (!query->ops->prepare_buffer(rscreen, query, buf)) { r600_resource_reference(&buf, NULL); return NULL; } @@ -351,14 +455,14 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c return buf; } -static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx, +static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen, struct r600_query_hw *query, struct r600_resource *buffer) { /* Callers ensure that the buffer is currently unused by the GPU. */ - uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); + uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); if (!results) return false; @@ -366,19 +470,21 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx, if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { + unsigned max_rbs = rscreen->info.num_render_backends; + unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask; unsigned num_results; unsigned i, j; /* Set top bits for unused backends. */ num_results = buffer->b.b.width0 / query->result_size; for (j = 0; j < num_results; j++) { - for (i = 0; i < ctx->max_db; i++) { - if (!(ctx->backend_mask & (1<<i))) { + for (i = 0; i < max_rbs; i++) { + if (!(enabled_rb_mask & (1<<i))) { results[(i * 4)+1] = 0x80000000; results[(i * 4)+3] = 0x80000000; } } - results += 4 * ctx->max_db; + results += 4 * max_rbs; } } @@ -409,7 +515,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, struct r600_query_hw *query, struct r600_resource *buffer, uint64_t va); -static void r600_query_hw_add_result(struct r600_common_context *ctx, +static void r600_query_hw_add_result(struct r600_common_screen *rscreen, struct r600_query_hw *, void *buffer, union pipe_query_result *result); static void r600_query_hw_clear_result(struct r600_query_hw *, @@ -423,17 +529,17 @@ static struct r600_query_hw_ops query_hw_default_hw_ops = { .add_result = r600_query_hw_add_result, }; -bool r600_query_hw_init(struct r600_common_context *rctx, +bool r600_query_hw_init(struct r600_common_screen *rscreen, struct r600_query_hw *query) { - query->buffer.buf = r600_new_query_buffer(rctx, query); + query->buffer.buf = r600_new_query_buffer(rscreen, query); if (!query->buffer.buf) return false; return true; } -static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, +static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen, unsigned query_type, unsigned index) { @@ -448,19 +554,19 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, switch (query_type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - query->result_size = 16 * rctx->max_db; + query->result_size = 16 * rscreen->info.num_render_backends; query->result_size += 16; /* for the fence + alignment */ query->num_cs_dw_begin = 6; - query->num_cs_dw_end = 6 + 
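/*
 * Aside: r600_query_hw_prepare_buffer() above pre-sets bit 31 of both the
 * begin and end high dwords for every render backend missing from
 * enabled_rb_mask, so result collection never waits on slots the hardware
 * will never write. The marking loop in isolation, with the 4-dwords-per-RB
 * layout taken from the hunk above:
 */
#include <stdint.h>

static void mark_unused_rbs(uint32_t *results, unsigned num_results,
                            unsigned max_rbs, unsigned enabled_rb_mask)
{
    for (unsigned j = 0; j < num_results; j++) {
        for (unsigned i = 0; i < max_rbs; i++) {
            if (!(enabled_rb_mask & (1u << i))) {
                results[i * 4 + 1] = 0x80000000; /* begin "ready" bit */
                results[i * 4 + 3] = 0x80000000; /* end "ready" bit */
            }
        }
        results += 4 * max_rbs; /* next result slot */
    }
}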
r600_gfx_write_fence_dwords(rctx->screen); + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); break; case PIPE_QUERY_TIME_ELAPSED: query->result_size = 24; query->num_cs_dw_begin = 8; - query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); break; case PIPE_QUERY_TIMESTAMP: query->result_size = 16; - query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); query->flags = R600_QUERY_HW_FLAG_NO_START; break; case PIPE_QUERY_PRIMITIVES_EMITTED: @@ -475,10 +581,10 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, break; case PIPE_QUERY_PIPELINE_STATISTICS: /* 11 values on EG, 8 on R600. */ - query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16; + query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16; query->result_size += 8; /* for the fence + alignment */ query->num_cs_dw_begin = 6; - query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen); + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); break; default: assert(0); @@ -486,7 +592,7 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, return NULL; } - if (!r600_query_hw_init(rctx, query)) { + if (!r600_query_hw_init(rscreen, query)) { FREE(query); return NULL; } @@ -545,7 +651,7 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -554,21 +660,17 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); break; case PIPE_QUERY_TIME_ELAPSED: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); - radeon_emit(cs, 0); - radeon_emit(cs, 0); + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, + 0, 3, NULL, va, 0, 0); break; case PIPE_QUERY_PIPELINE_STATISTICS: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); break; default: assert(0); @@ -597,7 +699,7 @@ static void r600_query_hw_emit_start(struct r600_common_context *ctx, *qbuf = query->buffer; query->buffer.results_end = 0; query->buffer.previous = qbuf; - query->buffer.buf = r600_new_query_buffer(ctx, query); + query->buffer.buf = r600_new_query_buffer(ctx->screen, query); if (!query->buffer.buf) return; } @@ -625,9 +727,9 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); - fence_va = va + ctx->max_db * 16 - 8; + fence_va = va + ctx->screen->info.num_render_backends * 16 - 8; break; case 
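/*
 * Aside: for occlusion queries, one result is a 16-byte begin/end pair per
 * render backend plus 16 bytes for the fence, so the fence dword lands at
 * offset max_rbs * 16, which is exactly what fence_offset becomes in
 * r600_get_hw_query_params() further down. A worked sizing check under
 * that layout (the 4-RB count is illustrative):
 */
#include <assert.h>

int main(void)
{
    unsigned max_rbs = 4;                     /* e.g. a 4-RB part */
    unsigned result_size = 16 * max_rbs + 16; /* RB pairs + fence/align */
    unsigned fence_offset = max_rbs * 16;

    assert(result_size == 80);
    assert(fence_offset == 64); /* fence right after the last RB pair */
    return 0;
}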
PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -637,19 +739,14 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); break; case PIPE_QUERY_TIME_ELAPSED: va += 8; /* fall through */ case PIPE_QUERY_TIMESTAMP: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, + 0, 3, NULL, va, 0, 0); fence_va = va + 8; break; case PIPE_QUERY_PIPELINE_STATISTICS: { @@ -659,7 +756,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) & 0xFFFF); + radeon_emit(cs, va >> 32); fence_va = va + sample_size; break; @@ -671,7 +768,8 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, RADEON_PRIO_QUERY); if (fence_va) - r600_gfx_write_fence(ctx, query->buffer.buf, fence_va, 0, 0x80000000); + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1, + query->buffer.buf, fence_va, 0, 0x80000000); } static void r600_query_hw_emit_stop(struct r600_common_context *ctx, @@ -743,12 +841,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, /* emit predicate packets for all data blocks */ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { unsigned results_base = 0; - uint64_t va = qbuf->buf->gpu_address; + uint64_t va_base = qbuf->buf->gpu_address; while (results_base < qbuf->results_end) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va + results_base); - radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); + uint64_t va = va_base + results_base; + + if (ctx->chip_class >= GFX9) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } else { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va); + radeon_emit(cs, op | ((va >> 32) & 0xFF)); + } r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY); results_base += query->result_size; @@ -761,14 +868,15 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_common_screen *rscreen = + (struct r600_common_screen *)ctx->screen; if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || query_type >= PIPE_QUERY_DRIVER_SPECIFIC) - return r600_query_sw_create(ctx, query_type); + return r600_query_sw_create(query_type); - return r600_query_hw_create(rctx, query_type, index); + return r600_query_hw_create(rscreen, query_type, index); } static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) @@ -776,7 +884,7 @@ static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *quer struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = 
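/*
 * Aside: the predication hunk above encodes the same operation two ways:
 * GFX9 takes a three-dword payload (op, then the full 64-bit VA split
 * low/high), while older ASICs fold the VA's high bits into the op dword
 * and only accept 8 of them. A sketch that packs the payload dwords into a
 * plain array; the PKT3_SET_PREDICATION header emission is omitted:
 */
#include <stdint.h>

static unsigned pack_set_predication(uint32_t *out, uint32_t op,
                                     uint64_t va, int is_gfx9)
{
    unsigned n = 0;
    if (is_gfx9) {
        out[n++] = op;                   /* full op dword */
        out[n++] = (uint32_t)va;         /* VA low 32 bits */
        out[n++] = (uint32_t)(va >> 32); /* VA high 32 bits */
    } else {
        out[n++] = (uint32_t)va;         /* VA low 32 bits */
        out[n++] = op | (uint32_t)((va >> 32) & 0xFF); /* 8 high bits */
    }
    return n;
}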
(struct r600_query *)query; - rquery->ops->destroy(rctx, rquery); + rquery->ops->destroy(rctx->screen, rquery); } static boolean r600_begin_query(struct pipe_context *ctx, @@ -808,9 +916,9 @@ void r600_query_hw_reset_buffers(struct r600_common_context *rctx, if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { r600_resource_reference(&query->buffer.buf, NULL); - query->buffer.buf = r600_new_query_buffer(rctx, query); + query->buffer.buf = r600_new_query_buffer(rctx->screen, query); } else { - if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf)) + if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf)) r600_resource_reference(&query->buffer.buf, NULL); } } @@ -867,6 +975,8 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx, struct r600_query_hw *rquery, int index, struct r600_hw_query_params *params) { + unsigned max_rbs = rctx->screen->info.num_render_backends; + params->pair_stride = 0; params->pair_count = 1; @@ -875,9 +985,9 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx, case PIPE_QUERY_OCCLUSION_PREDICATE: params->start_offset = 0; params->end_offset = 8; - params->fence_offset = rctx->max_db * 16; + params->fence_offset = max_rbs * 16; params->pair_stride = 16; - params->pair_count = rctx->max_db; + params->pair_count = max_rbs; break; case PIPE_QUERY_TIME_ELAPSED: params->start_offset = 0; @@ -936,14 +1046,16 @@ static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned return 0; } -static void r600_query_hw_add_result(struct r600_common_context *ctx, +static void r600_query_hw_add_result(struct r600_common_screen *rscreen, struct r600_query_hw *query, void *buffer, union pipe_query_result *result) { + unsigned max_rbs = rscreen->info.num_render_backends; + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: { - for (unsigned i = 0; i < ctx->max_db; ++i) { + for (unsigned i = 0; i < max_rbs; ++i) { unsigned results_base = i * 16; result->u64 += r600_query_read_result(buffer + results_base, 0, 2, true); @@ -951,7 +1063,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx, break; } case PIPE_QUERY_OCCLUSION_PREDICATE: { - for (unsigned i = 0; i < ctx->max_db; ++i) { + for (unsigned i = 0; i < max_rbs; ++i) { unsigned results_base = i * 16; result->b = result->b || r600_query_read_result(buffer + results_base, 0, 2, true) != 0; @@ -989,7 +1101,7 @@ static void r600_query_hw_add_result(struct r600_common_context *ctx, r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_PIPELINE_STATISTICS: - if (ctx->chip_class >= EVERGREEN) { + if (rscreen->chip_class >= EVERGREEN) { result->pipeline_statistics.ps_invocations += r600_query_read_result(buffer, 0, 22, false); result->pipeline_statistics.c_primitives += @@ -1087,6 +1199,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, struct r600_query *rquery, bool wait, union pipe_query_result *result) { + struct r600_common_screen *rscreen = rctx->screen; struct r600_query_hw *query = (struct r600_query_hw *)rquery; struct r600_query_buffer *qbuf; @@ -1103,7 +1216,7 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, return false; while (results_base != qbuf->results_end) { - query->ops->add_result(rctx, query, map + results_base, + query->ops->add_result(rscreen, query, map + results_base, result); results_base += query->result_size; } @@ -1112,7 +1225,7 @@ bool 
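/*
 * Aside: the add_result loops above lean on r600_query_read_result(), whose
 * body falls mostly outside this excerpt. Its contract, sketched here: each
 * index selects a (lo, hi) dword pair, bit 63 of the assembled value is the
 * "written" flag (preset for unused RBs by prepare_buffer), and the flag
 * bits cancel in the subtraction once both snapshots carry them.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t read_result_pair(const uint32_t *map, unsigned start_index,
                                 unsigned end_index, bool test_status_bit)
{
    uint64_t start = (uint64_t)map[start_index] |
                     (uint64_t)map[start_index + 1] << 32;
    uint64_t end = (uint64_t)map[end_index] |
                   (uint64_t)map[end_index + 1] << 32;

    if (!test_status_bit ||
        ((start & 0x8000000000000000ull) &&
         (end & 0x8000000000000000ull)))
        return end - start; /* flag bits cancel out */
    return 0; /* not ready yet */
}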
r600_query_hw_get_result(struct r600_common_context *rctx, /* Convert the time to expected units. */ if (rquery->type == PIPE_QUERY_TIME_ELAPSED || rquery->type == PIPE_QUERY_TIMESTAMP) { - result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq; + result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq; } return true; } @@ -1170,6 +1283,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) "IMM[1] UINT32 {1, 2, 4, 8}\n" "IMM[2] UINT32 {16, 32, 64, 128}\n" "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + "IMM[4] UINT32 {0, 0, 0, 0}\n" "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n" "UIF TEMP[5]\n" @@ -1269,7 +1383,7 @@ static void r600_create_query_result_shader(struct r600_common_context *rctx) /* Convert to boolean */ "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n" "UIF TEMP[4]\n" - "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" "MOV TEMP[0].y, IMM[0].xxxx\n" "ENDIF\n" @@ -1479,7 +1593,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, static void r600_render_condition(struct pipe_context *ctx, struct pipe_query *query, boolean condition, - uint mode) + enum pipe_render_cond_flag mode) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query_hw *rquery = (struct r600_query_hw *)query; @@ -1550,19 +1664,23 @@ void r600_resume_queries(struct r600_common_context *ctx) } } -/* Get backends mask */ -void r600_query_init_backend_mask(struct r600_common_context *ctx) +/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */ +void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen) { + struct r600_common_context *ctx = + (struct r600_common_context*)rscreen->aux_context; struct radeon_winsys_cs *cs = ctx->gfx.cs; struct r600_resource *buffer; uint32_t *results; - unsigned num_backends = ctx->screen->info.num_render_backends; unsigned i, mask = 0; + unsigned max_rbs = ctx->screen->info.num_render_backends; + + assert(rscreen->chip_class <= CAYMAN); /* if backend_map query is supported by the kernel */ - if (ctx->screen->info.r600_gb_backend_map_valid) { - unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes; - unsigned backend_map = ctx->screen->info.r600_gb_backend_map; + if (rscreen->info.r600_gb_backend_map_valid) { + unsigned num_tile_pipes = rscreen->info.num_tile_pipes; + unsigned backend_map = rscreen->info.r600_gb_backend_map; unsigned item_width, item_mask; if (ctx->chip_class >= EVERGREEN) { @@ -1579,7 +1697,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) backend_map >>= item_width; } if (mask != 0) { - ctx->backend_mask = mask; + rscreen->info.enabled_rb_mask = mask; return; } } @@ -1588,15 +1706,15 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) /* create buffer for event data */ buffer = (struct r600_resource*) - pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_STAGING, ctx->max_db*16); + pipe_buffer_create(ctx->b.screen, 0, + PIPE_USAGE_STAGING, max_rbs * 16); if (!buffer) - goto err; + return; /* initialize buffer with zeroes */ results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE); if (results) { - memset(results, 0, ctx->max_db * 4 * 4); + memset(results, 0, max_rbs * 4 * 4); /* emit EVENT_WRITE for ZPASS_DONE */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); @@ -1610,7 +1728,7 @@ void r600_query_init_backend_mask(struct 
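/*
 * Aside: the "convert the time to expected units" line above turns raw GPU
 * ticks into nanoseconds: clock_crystal_freq is reported in kHz, so
 * 1000000 * ticks / freq_khz is ticks / (freq_khz * 1000) seconds expressed
 * in ns, the unit GL timestamp queries expect. A worked check, assuming an
 * illustrative 27 MHz reference clock:
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t clock_crystal_freq = 27000; /* kHz; illustrative value */
    uint64_t ticks = 27000;              /* one millisecond of ticks */
    uint64_t ns = (1000000 * ticks) / clock_crystal_freq;

    assert(ns == 1000000); /* 1e6 ns == 1 ms */
    return 0;
}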
r600_common_context *ctx) /* analyze results */ results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ); if (results) { - for(i = 0; i < ctx->max_db; i++) { + for(i = 0; i < max_rbs; i++) { /* at least highest bit will be set if backend is used */ if (results[i*4 + 1]) mask |= (1<<i); @@ -1620,15 +1738,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) r600_resource_reference(&buffer, NULL); - if (mask != 0) { - ctx->backend_mask = mask; - return; - } - -err: - /* fallback to old method - set num_backends lower bits to 1 */ - ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends); - return; + if (mask) + rscreen->info.enabled_rb_mask = mask; } #define XFULL(name_, query_type_, type_, result_type_, group_id_) \ @@ -1649,23 +1760,32 @@ err: static struct pipe_driver_query_info r600_driver_query_list[] = { X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE), X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("num-fb-cache-flushes", NUM_FB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), + X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), + X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), - X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE), + X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), + X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), + X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), @@ -1680,12 +1800,34 @@ static struct pipe_driver_query_info r600_driver_query_list[] = { XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), - /* The following queries must be at the end of the list because their - * availability is adjusted dynamically based on the DRM version. */ - X("GPU-load", GPU_LOAD, UINT64, AVERAGE), X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. 
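/*
 * Aside: the decode whose tail appears above walks the kernel-reported
 * backend_map, one fixed-width field per tile pipe, and sets a mask bit per
 * decoded RB index. Standalone sketch; the field widths (4 bits on
 * Evergreen and later, 2 before) are an assumption, since that part of the
 * function sits outside the changed hunks:
 */
#include <stdint.h>

static unsigned decode_enabled_rb_mask(uint32_t backend_map,
                                       unsigned num_tile_pipes,
                                       unsigned item_width)
{
    unsigned item_mask = (1u << item_width) - 1;
    unsigned mask = 0;

    while (num_tile_pipes--) {
        unsigned i = backend_map & item_mask; /* RB index for this pipe */
        mask |= 1u << i;
        backend_map >>= item_width;
    }
    return mask;
}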
*/ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), + X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), + X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), + X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), + X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), + X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), + X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), + X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), + X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), + X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), + X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), + X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), + X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), + X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), + X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), + X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), + X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), + X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), + X("GPU-dma-busy", GPU_DMA_BUSY, UINT64, AVERAGE), + X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), + X("GPU-ce-busy", GPU_CE_BUSY, UINT64, AVERAGE), }; #undef X @@ -1696,10 +1838,14 @@ static unsigned r600_get_num_queries(struct r600_common_screen *rscreen) { if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) return ARRAY_SIZE(r600_driver_query_list); - else if (rscreen->info.drm_major == 3) - return ARRAY_SIZE(r600_driver_query_list) - 3; + else if (rscreen->info.drm_major == 3) { + if (rscreen->chip_class >= VI) + return ARRAY_SIZE(r600_driver_query_list); + else + return ARRAY_SIZE(r600_driver_query_list) - 7; + } else - return ARRAY_SIZE(r600_driver_query_list) - 4; + return ARRAY_SIZE(r600_driver_query_list) - 25; } static int r600_get_driver_query_info(struct pipe_screen *screen, @@ -1735,6 +1881,9 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, case R600_QUERY_GPU_TEMPERATURE: info->max_value.u64 = 125; break; + case R600_QUERY_VRAM_VIS_USAGE: + info->max_value.u64 = rscreen->info.vram_vis_size; + break; } if (info->group_id != ~(unsigned)0 && rscreen->perfcounters) diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.h b/lib/mesa/src/gallium/drivers/radeon/r600_query.h index 14c433d91..b9ab44ca3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.h @@ -48,26 +48,56 @@ enum { R600_QUERY_COMPUTE_CALLS, R600_QUERY_SPILL_COMPUTE_CALLS, R600_QUERY_DMA_CALLS, + R600_QUERY_CP_DMA_CALLS, R600_QUERY_NUM_VS_FLUSHES, R600_QUERY_NUM_PS_FLUSHES, R600_QUERY_NUM_CS_FLUSHES, + R600_QUERY_NUM_FB_CACHE_FLUSHES, + R600_QUERY_NUM_L2_INVALIDATES, + R600_QUERY_NUM_L2_WRITEBACKS, + R600_QUERY_CS_THREAD_BUSY, R600_QUERY_REQUESTED_VRAM, R600_QUERY_REQUESTED_GTT, R600_QUERY_MAPPED_VRAM, R600_QUERY_MAPPED_GTT, R600_QUERY_BUFFER_WAIT_TIME, - R600_QUERY_NUM_CTX_FLUSHES, + R600_QUERY_NUM_MAPPED_BUFFERS, + R600_QUERY_NUM_GFX_IBS, + R600_QUERY_NUM_SDMA_IBS, R600_QUERY_NUM_BYTES_MOVED, R600_QUERY_NUM_EVICTIONS, R600_QUERY_VRAM_USAGE, + R600_QUERY_VRAM_VIS_USAGE, R600_QUERY_GTT_USAGE, R600_QUERY_GPU_TEMPERATURE, R600_QUERY_CURRENT_GPU_SCLK, R600_QUERY_CURRENT_GPU_MCLK, R600_QUERY_GPU_LOAD, + R600_QUERY_GPU_SHADERS_BUSY, + R600_QUERY_GPU_TA_BUSY, + R600_QUERY_GPU_GDS_BUSY, + R600_QUERY_GPU_VGT_BUSY, + R600_QUERY_GPU_IA_BUSY, + R600_QUERY_GPU_SX_BUSY, + R600_QUERY_GPU_WD_BUSY, + R600_QUERY_GPU_BCI_BUSY, + R600_QUERY_GPU_SC_BUSY, + R600_QUERY_GPU_PA_BUSY, + R600_QUERY_GPU_DB_BUSY, + R600_QUERY_GPU_CP_BUSY, + 
R600_QUERY_GPU_CB_BUSY, + R600_QUERY_GPU_SDMA_BUSY, + R600_QUERY_GPU_PFP_BUSY, + R600_QUERY_GPU_MEQ_BUSY, + R600_QUERY_GPU_ME_BUSY, + R600_QUERY_GPU_SURF_SYNC_BUSY, + R600_QUERY_GPU_DMA_BUSY, + R600_QUERY_GPU_SCRATCH_RAM_BUSY, + R600_QUERY_GPU_CE_BUSY, R600_QUERY_NUM_COMPILATIONS, R600_QUERY_NUM_SHADERS_CREATED, R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO, + R600_QUERY_NUM_SHADER_CACHE_HITS, R600_QUERY_GPIN_ASIC_ID, R600_QUERY_GPIN_NUM_SIMD, R600_QUERY_GPIN_NUM_RB, @@ -83,7 +113,7 @@ enum { }; struct r600_query_ops { - void (*destroy)(struct r600_common_context *, struct r600_query *); + void (*destroy)(struct r600_common_screen *, struct r600_query *); bool (*begin)(struct r600_common_context *, struct r600_query *); bool (*end)(struct r600_common_context *, struct r600_query *); bool (*get_result)(struct r600_common_context *, @@ -112,7 +142,7 @@ enum { }; struct r600_query_hw_ops { - bool (*prepare_buffer)(struct r600_common_context *, + bool (*prepare_buffer)(struct r600_common_screen *, struct r600_query_hw *, struct r600_resource *); void (*emit_start)(struct r600_common_context *, @@ -122,7 +152,7 @@ struct r600_query_hw_ops { struct r600_query_hw *, struct r600_resource *buffer, uint64_t va); void (*clear_result)(struct r600_query_hw *, union pipe_query_result *); - void (*add_result)(struct r600_common_context *ctx, + void (*add_result)(struct r600_common_screen *screen, struct r600_query_hw *, void *buffer, union pipe_query_result *result); }; @@ -157,9 +187,9 @@ struct r600_query_hw { unsigned stream; }; -bool r600_query_hw_init(struct r600_common_context *rctx, +bool r600_query_hw_init(struct r600_common_screen *rscreen, struct r600_query_hw *query); -void r600_query_hw_destroy(struct r600_common_context *rctx, +void r600_query_hw_destroy(struct r600_common_screen *rscreen, struct r600_query *rquery); bool r600_query_hw_begin(struct r600_common_context *rctx, struct r600_query *rquery); diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c index b5296aa56..a18089a3b 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c @@ -187,7 +187,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r { struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; - unsigned *stride_in_dw = rctx->streamout.stride_in_dw; + uint16_t *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; r600_flush_vgt_streamout(rctx); diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c b/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c index 1e60f6aff..9e1ff9e5f 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_test_dma.c @@ -26,26 +26,10 @@ #include "r600_pipe_common.h" #include "util/u_surface.h" +#include "util/rand_xor.h" static uint64_t seed_xorshift128plus[2]; -/* Super fast random number generator. - * - * This rand_xorshift128plus function by Sebastiano Vigna belongs - * to the public domain. - */ -static uint64_t rand_xorshift128plus(void) -{ - uint64_t *s = seed_xorshift128plus; - - uint64_t s1 = s[0]; - const uint64_t s0 = s[1]; - s[0] = s0; - s1 ^= s1 << 23; - s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); - return s[1] + s0; -} - #define RAND_NUM_SIZE 8 /* The GPU blits are emulated on the CPU using these CPU textures. 
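/*
 * Aside: the hunk above drops the local copy of Sebastiano Vigna's
 * public-domain xorshift128+ generator in favor of util/rand_xor, whose
 * rand_xorshift128plus() takes the two-word state as a parameter instead of
 * using a file-scope seed. The algorithm itself, standalone, as removed:
 */
#include <stdint.h>

/* State must not be all zeroes. */
static uint64_t xorshift128plus(uint64_t s[2])
{
    uint64_t s1 = s[0];
    const uint64_t s0 = s[1];

    s[0] = s0;
    s1 ^= s1 << 23;
    s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
    return s[1] + s0;
}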
*/ @@ -91,8 +75,10 @@ static void set_random_pixels(struct pipe_context *ctx, assert(t->stride % RAND_NUM_SIZE == 0); assert(cpu->stride % RAND_NUM_SIZE == 0); - for (x = 0; x < size; x++) - *ptr++ = *ptr_cpu++ = rand_xorshift128plus(); + for (x = 0; x < size; x++) { + *ptr++ = *ptr_cpu++ = + rand_xorshift128plus(seed_xorshift128plus); + } } } @@ -149,18 +135,24 @@ static enum pipe_format get_format_from_bpp(int bpp) } } -static const char *array_mode_to_string(unsigned mode) +static const char *array_mode_to_string(struct r600_common_screen *rscreen, + struct radeon_surf *surf) { - switch (mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - return "LINEAR_ALIGNED"; - case RADEON_SURF_MODE_1D: - return "1D_TILED_THIN1"; - case RADEON_SURF_MODE_2D: - return "2D_TILED_THIN1"; - default: - assert(0); + if (rscreen->chip_class >= GFX9) { + /* TODO */ return " UNKNOWN"; + } else { + switch (surf->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + return "LINEAR_ALIGNED"; + case RADEON_SURF_MODE_1D: + return "1D_TILED_THIN1"; + case RADEON_SURF_MODE_2D: + return "2D_TILED_THIN1"; + default: + assert(0); + return " UNKNOWN"; + } } } @@ -197,8 +189,7 @@ void r600_test_dma(struct r600_common_screen *rscreen) /* the seed for random test parameters */ srand(0x9b47d95b); /* the seed for random pixel data */ - seed_xorshift128plus[0] = 0x3bffb83978e24f88; - seed_xorshift128plus[1] = 0x9238d5d56c71cd35; + s_rand_xorshift128plus(seed_xorshift128plus, false); iterations = 1000000000; /* just kill it when you are bored */ num_partial_copies = 30; @@ -292,16 +283,16 @@ void r600_test_dma(struct r600_common_screen *rscreen) printf("%4u: dst = (%5u x %5u x %u, %s), " " src = (%5u x %5u x %u, %s), bpp = %2u, ", i, tdst.width0, tdst.height0, tdst.array_size, - array_mode_to_string(rdst->surface.level[0].mode), + array_mode_to_string(rscreen, &rdst->surface), tsrc.width0, tsrc.height0, tsrc.array_size, - array_mode_to_string(rsrc->surface.level[0].mode), bpp); + array_mode_to_string(rscreen, &rsrc->surface), bpp); fflush(stdout); /* set src pixels */ set_random_pixels(ctx, src, &src_cpu); /* clear dst pixels */ - rctx->clear_buffer(ctx, dst, 0, rdst->surface.bo_size, 0, true); + rctx->clear_buffer(ctx, dst, 0, rdst->surface.surf_size, 0, true); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ @@ -331,8 +322,8 @@ void r600_test_dma(struct r600_common_screen *rscreen) dstz = rand() % (tdst.array_size - depth + 1); /* special code path to hit the tiled partial copies */ - if (rsrc->surface.level[0].mode >= RADEON_SURF_MODE_1D && - rdst->surface.level[0].mode >= RADEON_SURF_MODE_1D && + if (!rsrc->surface.is_linear && + !rdst->surface.is_linear && rand() & 1) { if (max_width < 8 || max_height < 8) continue; @@ -359,8 +350,8 @@ void r600_test_dma(struct r600_common_screen *rscreen) } /* special code path to hit out-of-bounds reads in L2T */ - if (rsrc->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED && - rdst->surface.level[0].mode >= RADEON_SURF_MODE_1D && + if (rsrc->surface.is_linear && + !rdst->surface.is_linear && rand() % 4 == 0) { srcx = 0; srcy = 0; diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c index 27035c0fa..4b2082523 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c @@ -37,8 +37,9 @@ static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, struct r600_texture *rtex); -static unsigned 
r600_choose_tiling(struct r600_common_screen *rscreen, - const struct pipe_resource *templ); +static enum radeon_surf_mode +r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ); bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, @@ -52,8 +53,7 @@ bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, if (!rctx->dma.cs) return false; - if (util_format_get_blocksizebits(rdst->resource.b.b.format) != - util_format_get_blocksizebits(rsrc->resource.b.b.format)) + if (rdst->surface.bpe != rsrc->surface.bpe) return false; /* MSAA: Blits don't exist in the real world. */ @@ -72,8 +72,8 @@ bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, * src: Use the 3D path. DCC decompression is expensive. * dst: Use the 3D path to compress the pixels with DCC. */ - if ((rsrc->dcc_offset && rsrc->surface.level[src_level].dcc_enabled) || - (rdst->dcc_offset && rdst->surface.level[dst_level].dcc_enabled)) + if (vi_dcc_enabled(rsrc, src_level) || + vi_dcc_enabled(rdst, dst_level)) return false; /* CMASK as: @@ -177,179 +177,170 @@ static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600 src, 0, &sbox); } -static unsigned r600_texture_get_offset(struct r600_texture *rtex, unsigned level, - const struct pipe_box *box) +static unsigned r600_texture_get_offset(struct r600_common_screen *rscreen, + struct r600_texture *rtex, unsigned level, + const struct pipe_box *box, + unsigned *stride, + unsigned *layer_stride) { - enum pipe_format format = rtex->resource.b.b.format; + if (rscreen->chip_class >= GFX9) { + *stride = rtex->surface.u.gfx9.surf_pitch * rtex->surface.bpe; + *layer_stride = rtex->surface.u.gfx9.surf_slice_size; + + if (!box) + return 0; + + /* Each texture is an array of slices. Each slice is an array + * of mipmap levels. */ + return box->z * rtex->surface.u.gfx9.surf_slice_size + + rtex->surface.u.gfx9.offset[level] + + (box->y / rtex->surface.blk_h * + rtex->surface.u.gfx9.surf_pitch + + box->x / rtex->surface.blk_w) * rtex->surface.bpe; + } else { + *stride = rtex->surface.u.legacy.level[level].nblk_x * + rtex->surface.bpe; + *layer_stride = rtex->surface.u.legacy.level[level].slice_size; - return rtex->surface.level[level].offset + - box->z * rtex->surface.level[level].slice_size + - box->y / util_format_get_blockheight(format) * rtex->surface.level[level].pitch_bytes + - box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format); + if (!box) + return rtex->surface.u.legacy.level[level].offset; + + /* Each texture is an array of mipmap levels. Each level is + * an array of slices. 
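/*
 * Aside: the GFX9 path above and the legacy path below reduce to the same
 * addressing scheme: a slice offset plus a block-granular (x, y) offset
 * scaled by bytes per element; they differ only in where the per-level
 * offset lives. The shared arithmetic, with illustrative parameter names:
 */
#include <stdint.h>

static uint64_t texel_offset(unsigned x, unsigned y, unsigned z,
                             unsigned blk_w, unsigned blk_h, unsigned bpe,
                             unsigned pitch_in_blocks,
                             uint64_t slice_size, uint64_t level_offset)
{
    /* Reduce coordinates to block units first (blk_w x blk_h pixels per
     * block for compressed formats, 1x1 otherwise). */
    return z * slice_size + level_offset +
           ((uint64_t)(y / blk_h) * pitch_in_blocks + x / blk_w) * bpe;
}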
*/ + return rtex->surface.u.legacy.level[level].offset + + box->z * rtex->surface.u.legacy.level[level].slice_size + + (box->y / rtex->surface.blk_h * + rtex->surface.u.legacy.level[level].nblk_x + + box->x / rtex->surface.blk_w) * rtex->surface.bpe; + } } static int r600_init_surface(struct r600_common_screen *rscreen, struct radeon_surf *surface, const struct pipe_resource *ptex, - unsigned array_mode, + enum radeon_surf_mode array_mode, + unsigned pitch_in_bytes_override, + unsigned offset, + bool is_imported, + bool is_scanout, bool is_flushed_depth, bool tc_compatible_htile) { const struct util_format_description *desc = util_format_description(ptex->format); bool is_depth, is_stencil; + int r; + unsigned i, bpe, flags = 0; is_depth = util_format_has_depth(desc); is_stencil = util_format_has_stencil(desc); - surface->npix_x = ptex->width0; - surface->npix_y = ptex->height0; - surface->npix_z = ptex->depth0; - surface->blk_w = util_format_get_blockwidth(ptex->format); - surface->blk_h = util_format_get_blockheight(ptex->format); - surface->blk_d = 1; - surface->array_size = 1; - surface->last_level = ptex->last_level; - if (rscreen->chip_class >= EVERGREEN && !is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - surface->bpe = 4; /* stencil is allocated separately on evergreen */ + bpe = 4; /* stencil is allocated separately on evergreen */ } else { - surface->bpe = util_format_get_blocksize(ptex->format); + bpe = util_format_get_blocksize(ptex->format); /* align byte per element on dword */ - if (surface->bpe == 3) { - surface->bpe = 4; + if (bpe == 3) { + bpe = 4; } } - surface->nsamples = ptex->nr_samples ? ptex->nr_samples : 1; - surface->flags = RADEON_SURF_SET(array_mode, MODE); - - switch (ptex->target) { - case PIPE_TEXTURE_1D: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D, TYPE); - break; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D, TYPE); - break; - case PIPE_TEXTURE_3D: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_3D, TYPE); - break; - case PIPE_TEXTURE_1D_ARRAY: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE); - surface->array_size = ptex->array_size; - break; - case PIPE_TEXTURE_CUBE_ARRAY: /* cube array layout like 2d array */ - assert(ptex->array_size % 6 == 0); - case PIPE_TEXTURE_2D_ARRAY: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE); - surface->array_size = ptex->array_size; - break; - case PIPE_TEXTURE_CUBE: - surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_CUBEMAP, TYPE); - break; - case PIPE_BUFFER: - default: - return -EINVAL; - } - if (!is_flushed_depth && is_depth) { - surface->flags |= RADEON_SURF_ZBUFFER; + flags |= RADEON_SURF_ZBUFFER; if (tc_compatible_htile && - array_mode == RADEON_SURF_MODE_2D) { + (rscreen->chip_class >= GFX9 || + array_mode == RADEON_SURF_MODE_2D)) { /* TC-compatible HTILE only supports Z32_FLOAT. - * Promote Z16 to Z32. DB->CB copies will convert + * GFX9 also supports Z16_UNORM. + * On VI, promote Z16 to Z32. DB->CB copies will convert * the format for transfers. 
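/*
 * Aside: per the comment above, TC-compatible HTILE constrains the depth
 * format: GFX9 also accepts Z16_UNORM, while VI must allocate Z16 as a
 * 32-bit surface (DB->CB copies convert on transfer). A condensed
 * restatement of the bpe choice the hunk below applies; the enum values
 * are stand-ins for the driver's chip_class:
 */
enum chip_class_sketch { SKETCH_VI = 1, SKETCH_GFX9 = 2 };

static unsigned depth_bpe_for_tc_htile(enum chip_class_sketch chip,
                                       unsigned bpe)
{
    /* Promote e.g. Z16 (bpe 2) to a 32-bit allocation on VI. */
    return chip == SKETCH_VI ? 4 : bpe;
}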
*/ - surface->bpe = 4; - surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; - } + if (rscreen->chip_class == VI) + bpe = 4; - if (is_stencil) { - surface->flags |= RADEON_SURF_SBUFFER | - RADEON_SURF_HAS_SBUFFER_MIPTREE; + flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; } - } - if (rscreen->chip_class >= SI) { - surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; + if (is_stencil) + flags |= RADEON_SURF_SBUFFER; } if (rscreen->chip_class >= VI && (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC || ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT)) - surface->flags |= RADEON_SURF_DISABLE_DCC; + flags |= RADEON_SURF_DISABLE_DCC; - if (ptex->bind & PIPE_BIND_SCANOUT) { + if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) { /* This should catch bugs in gallium users setting incorrect flags. */ - assert(surface->nsamples == 1 && - surface->array_size == 1 && - surface->npix_z == 1 && - surface->last_level == 0 && - !(surface->flags & RADEON_SURF_Z_OR_SBUFFER)); + assert(ptex->nr_samples <= 1 && + ptex->array_size == 1 && + ptex->depth0 == 1 && + ptex->last_level == 0 && + !(flags & RADEON_SURF_Z_OR_SBUFFER)); - surface->flags |= RADEON_SURF_SCANOUT; + flags |= RADEON_SURF_SCANOUT; } - return 0; -} -static int r600_setup_surface(struct pipe_screen *screen, - struct r600_texture *rtex, - unsigned pitch_in_bytes_override, - unsigned offset) -{ - struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; - unsigned i; - int r; + if (is_imported) + flags |= RADEON_SURF_IMPORTED; + if (!(ptex->flags & R600_RESOURCE_FLAG_FORCE_TILING)) + flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; - r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface); + r = rscreen->ws->surface_init(rscreen->ws, ptex, flags, bpe, + array_mode, surface); if (r) { return r; } - rtex->size = rtex->surface.bo_size; - - if (pitch_in_bytes_override && pitch_in_bytes_override != rtex->surface.level[0].pitch_bytes) { - /* old ddx on evergreen over estimate alignment for 1d, only 1 level - * for those - */ - rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe; - rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override; - rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y; - } + if (rscreen->chip_class >= GFX9) { + assert(!pitch_in_bytes_override || + pitch_in_bytes_override == surface->u.gfx9.surf_pitch * bpe); + surface->u.gfx9.surf_offset = offset; + } else { + if (pitch_in_bytes_override && + pitch_in_bytes_override != surface->u.legacy.level[0].nblk_x * bpe) { + /* old ddx on evergreen over estimate alignment for 1d, only 1 level + * for those + */ + surface->u.legacy.level[0].nblk_x = pitch_in_bytes_override / bpe; + surface->u.legacy.level[0].slice_size = pitch_in_bytes_override * + surface->u.legacy.level[0].nblk_y; + } - if (offset) { - for (i = 0; i < ARRAY_SIZE(rtex->surface.level); ++i) - rtex->surface.level[i].offset += offset; + if (offset) { + for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) + surface->u.legacy.level[i].offset += offset; + } } return 0; } -static void r600_texture_init_metadata(struct r600_texture *rtex, +static void r600_texture_init_metadata(struct r600_common_screen *rscreen, + struct r600_texture *rtex, struct radeon_bo_metadata *metadata) { struct radeon_surf *surface = &rtex->surface; memset(metadata, 0, sizeof(*metadata)); - metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? 
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - metadata->pipe_config = surface->pipe_config; - metadata->bankw = surface->bankw; - metadata->bankh = surface->bankh; - metadata->tile_split = surface->tile_split; - metadata->mtilea = surface->mtilea; - metadata->num_banks = surface->num_banks; - metadata->stride = surface->level[0].pitch_bytes; - metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; -} -static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen) -{ - p_atomic_inc(&rscreen->dirty_fb_counter); + if (rscreen->chip_class >= GFX9) { + metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; + } else { + metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config; + metadata->u.legacy.bankw = surface->u.legacy.bankw; + metadata->u.legacy.bankh = surface->u.legacy.bankh; + metadata->u.legacy.tile_split = surface->u.legacy.tile_split; + metadata->u.legacy.mtilea = surface->u.legacy.mtilea; + metadata->u.legacy.num_banks = surface->u.legacy.num_banks; + metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; + metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + } } static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx, @@ -359,13 +350,13 @@ static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx, struct pipe_context *ctx = &rctx->b; if (ctx == rscreen->aux_context) - pipe_mutex_lock(rscreen->aux_context_lock); + mtx_lock(&rscreen->aux_context_lock); ctx->flush_resource(ctx, &rtex->resource.b.b); ctx->flush(ctx, NULL, 0); if (ctx == rscreen->aux_context) - pipe_mutex_unlock(rscreen->aux_context_lock); + mtx_unlock(&rscreen->aux_context_lock); } static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, @@ -390,7 +381,7 @@ static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, r600_resource_reference(&rtex->cmask_buffer, NULL); /* Notify all contexts about the change. */ - r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->dirty_tex_counter); p_atomic_inc(&rscreen->compressed_colortex_counter); } @@ -414,7 +405,7 @@ static bool r600_texture_discard_dcc(struct r600_common_screen *rscreen, rtex->dcc_offset = 0; /* Notify all contexts about the change. */ - r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->dirty_tex_counter); return true; } @@ -448,14 +439,14 @@ bool r600_texture_disable_dcc(struct r600_common_context *rctx, return false; if (&rctx->b == rscreen->aux_context) - pipe_mutex_lock(rscreen->aux_context_lock); + mtx_lock(&rscreen->aux_context_lock); /* Decompress DCC. */ rctx->decompress_dcc(&rctx->b, rtex); rctx->b.flush(&rctx->b, NULL, 0); if (&rctx->b == rscreen->aux_context) - pipe_mutex_unlock(rscreen->aux_context_lock); + mtx_unlock(&rscreen->aux_context_lock); return r600_texture_discard_dcc(rscreen, rtex); } @@ -476,7 +467,7 @@ static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx, return; if (rtex->resource.is_shared || - rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED) + rtex->surface.is_linear) return; /* This fails with MSAA, depth, and compressed textures. 
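/*
 * Aside: several hunks in this file replace the dirty_fb_counter /
 * dirty_tex_descriptor_counter pair with a single dirty_tex_counter bump:
 * a producer increments it atomically, and each context compares it with
 * its last_dirty_tex_counter before drawing (per the header comment near
 * the top of this diff). A minimal sketch of the pattern, with C11 atomics
 * standing in for p_atomic_inc()/p_atomic_read():
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint dirty_tex_counter; /* screen-wide generation counter */

struct ctx_sketch { unsigned last_dirty_tex_counter; };

static void notify_all_contexts(void)
{
    atomic_fetch_add(&dirty_tex_counter, 1); /* producer side */
}

static bool needs_reemit(struct ctx_sketch *ctx)
{
    unsigned now = atomic_load(&dirty_tex_counter);

    if (now == ctx->last_dirty_tex_counter)
        return false;
    ctx->last_dirty_tex_counter = now; /* consumer catches up */
    return true;
}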
*/ @@ -529,8 +520,7 @@ static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx, r600_texture_reference(&new_tex, NULL); - r600_dirty_all_framebuffer_states(rctx->screen); - p_atomic_inc(&rctx->screen->dirty_tex_descriptor_counter); + p_atomic_inc(&rctx->screen->dirty_tex_counter); } static boolean r600_texture_get_handle(struct pipe_screen* screen, @@ -546,6 +536,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, struct r600_texture *rtex = (struct r600_texture*)resource; struct radeon_bo_metadata metadata; bool update_metadata = false; + unsigned stride, offset, slice_size; /* This is not supported now, but it might be required for OpenCL * interop in the future. @@ -578,7 +569,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, /* Set metadata. */ if (!res->is_shared || update_metadata) { - r600_texture_init_metadata(rtex, &metadata); + r600_texture_init_metadata(rscreen, rtex, &metadata); if (rscreen->query_opaque_metadata) rscreen->query_opaque_metadata(rscreen, rtex, &metadata); @@ -599,11 +590,25 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, res->external_usage = usage; } - return rscreen->ws->buffer_get_handle(res->buf, - rtex->surface.level[0].pitch_bytes, - rtex->surface.level[0].offset, - rtex->surface.level[0].slice_size, - whandle); + if (res->b.b.target == PIPE_BUFFER) { + offset = 0; + stride = 0; + slice_size = 0; + } else { + if (rscreen->chip_class >= GFX9) { + offset = rtex->surface.u.gfx9.surf_offset; + stride = rtex->surface.u.gfx9.surf_pitch * + rtex->surface.bpe; + slice_size = rtex->surface.u.gfx9.surf_slice_size; + } else { + offset = rtex->surface.u.legacy.level[0].offset; + stride = rtex->surface.u.legacy.level[0].nblk_x * + rtex->surface.bpe; + slice_size = rtex->surface.u.legacy.level[0].slice_size; + } + } + return rscreen->ws->buffer_get_handle(res->buf, stride, offset, + slice_size, whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -633,35 +638,39 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, struct r600_fmask_info *out) { /* FMASK is allocated like an ordinary texture. */ - struct radeon_surf fmask = rtex->surface; + struct pipe_resource templ = rtex->resource.b.b; + struct radeon_surf fmask = {}; + unsigned flags, bpe; memset(out, 0, sizeof(*out)); - fmask.bo_alignment = 0; - fmask.bo_size = 0; - fmask.nsamples = 1; - fmask.flags |= RADEON_SURF_FMASK; + if (rscreen->chip_class >= GFX9) { + out->alignment = rtex->surface.u.gfx9.fmask_alignment; + out->size = rtex->surface.u.gfx9.fmask_size; + return; + } - /* Force 2D tiling if it wasn't set. This may occur when creating - * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample - * destination buffer must have an FMASK too. */ - fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE); - fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE); + templ.nr_samples = 1; + flags = rtex->surface.flags | RADEON_SURF_FMASK; - if (rscreen->chip_class >= SI) { - fmask.flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; + if (rscreen->chip_class <= CAYMAN) { + /* Use the same parameters and tile mode. 
*/ + fmask.u.legacy.bankw = rtex->surface.u.legacy.bankw; + fmask.u.legacy.bankh = rtex->surface.u.legacy.bankh; + fmask.u.legacy.mtilea = rtex->surface.u.legacy.mtilea; + fmask.u.legacy.tile_split = rtex->surface.u.legacy.tile_split; + + if (nr_samples <= 4) + fmask.u.legacy.bankh = 4; } switch (nr_samples) { case 2: case 4: - fmask.bpe = 1; - if (rscreen->chip_class <= CAYMAN) { - fmask.bankh = 4; - } + bpe = 1; break; case 8: - fmask.bpe = 4; + bpe = 4; break; default: R600_ERR("Invalid sample count for FMASK allocation.\n"); @@ -672,25 +681,26 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, * This can be fixed by writing a separate FMASK allocator specifically * for R600-R700 asics. */ if (rscreen->chip_class <= R700) { - fmask.bpe *= 2; + bpe *= 2; } - if (rscreen->ws->surface_init(rscreen->ws, &fmask)) { + if (rscreen->ws->surface_init(rscreen->ws, &templ, flags, bpe, + RADEON_SURF_MODE_2D, &fmask)) { R600_ERR("Got error in surface_init while allocating FMASK.\n"); return; } - assert(fmask.level[0].mode == RADEON_SURF_MODE_2D); + assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); - out->slice_tile_max = (fmask.level[0].nblk_x * fmask.level[0].nblk_y) / 64; + out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64; if (out->slice_tile_max) out->slice_tile_max -= 1; - out->tile_mode_index = fmask.tiling_index[0]; - out->pitch_in_pixels = fmask.level[0].nblk_x; - out->bank_height = fmask.bankh; - out->alignment = MAX2(256, fmask.bo_alignment); - out->size = fmask.bo_size; + out->tile_mode_index = fmask.u.legacy.tiling_index[0]; + out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x; + out->bank_height = fmask.u.legacy.bankh; + out->alignment = MAX2(256, fmask.surf_alignment); + out->size = fmask.surf_size; } static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen, @@ -721,8 +731,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned macro_tile_width = util_next_power_of_two(sqrt_pixels_per_macro_tile); unsigned macro_tile_height = pixels_per_macro_tile / macro_tile_width; - unsigned pitch_elements = align(rtex->surface.npix_x, macro_tile_width); - unsigned height = align(rtex->surface.npix_y, macro_tile_height); + unsigned pitch_elements = align(rtex->resource.b.b.width0, macro_tile_width); + unsigned height = align(rtex->resource.b.b.height0, macro_tile_height); unsigned base_align = num_pipes * pipe_interleave_bytes; unsigned slice_bytes = @@ -731,10 +741,6 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, assert(macro_tile_width % 128 == 0); assert(macro_tile_height % 128 == 0); - out->pitch = pitch_elements; - out->height = height; - out->xalign = macro_tile_width; - out->yalign = macro_tile_height; out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1; out->alignment = MAX2(256, base_align); out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) * @@ -749,6 +755,12 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned num_pipes = rscreen->info.num_tile_pipes; unsigned cl_width, cl_height; + if (rscreen->chip_class >= GFX9) { + out->alignment = rtex->surface.u.gfx9.cmask_alignment; + out->size = rtex->surface.u.gfx9.cmask_size; + return; + } + switch (num_pipes) { case 2: cl_width = 32; @@ -773,17 +785,13 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned base_align = num_pipes * pipe_interleave_bytes; - unsigned width = align(rtex->surface.npix_x, cl_width*8); - 
unsigned height = align(rtex->surface.npix_y, cl_height*8); + unsigned width = align(rtex->resource.b.b.width0, cl_width*8); + unsigned height = align(rtex->resource.b.b.height0, cl_height*8); unsigned slice_elements = (width * height) / (8*8); /* Each element of CMASK is a nibble. */ unsigned slice_bytes = slice_elements / 2; - out->pitch = width; - out->height = height; - out->xalign = cl_width * 8; - out->yalign = cl_height * 8; out->slice_tile_max = (width * height) / (128*128); if (out->slice_tile_max) out->slice_tile_max -= 1; @@ -826,7 +834,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen } rtex->cmask_buffer = (struct r600_resource *) - r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT, + r600_aligned_buffer_create(&rscreen->b, + R600_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, rtex->cmask.size, rtex->cmask.alignment); if (rtex->cmask_buffer == NULL) { @@ -845,28 +855,32 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen p_atomic_inc(&rscreen->compressed_colortex_counter); } -static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, - struct r600_texture *rtex) +static void r600_texture_get_htile_size(struct r600_common_screen *rscreen, + struct r600_texture *rtex) { unsigned cl_width, cl_height, width, height; unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; unsigned num_pipes = rscreen->info.num_tile_pipes; + assert(rscreen->chip_class <= VI); + + rtex->surface.htile_size = 0; + if (rscreen->chip_class <= EVERGREEN && rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) - return 0; + return; /* HW bug on R6xx. */ if (rscreen->chip_class == R600 && - (rtex->surface.level[0].npix_x > 7680 || - rtex->surface.level[0].npix_y > 7680)) - return 0; + (rtex->resource.b.b.width0 > 7680 || + rtex->resource.b.b.height0 > 7680)) + return; /* HTILE is broken with 1D tiling on old kernels and CIK. */ if (rscreen->chip_class >= CIK && - rtex->surface.level[0].mode == RADEON_SURF_MODE_1D && + rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) - return 0; + return; /* Overalign HTILE on P2 configs to work around GPU hangs in * piglit/depthstencil-render-miplevels 585. 
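* A sketch of the sizing math in this function, assuming a 2-pipe
* config where cl_width = cl_height = 32: width and height are
* rounded up to cl_* x 8 pixels, each 8x8 pixel block then needs one
* 4-byte HTILE element, and the slice size is aligned to num_pipes *
* pipe_interleave_bytes.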
@@ -901,11 +915,11 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, break; default: assert(0); - return 0; + return; } - width = align(rtex->surface.npix_x, cl_width * 8); - height = align(rtex->surface.npix_y, cl_height * 8); + width = align(rtex->resource.b.b.width0, cl_width * 8); + height = align(rtex->resource.b.b.height0, cl_height * 8); slice_elements = (width * height) / (8 * 8); slice_bytes = slice_elements * 4; @@ -913,69 +927,122 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; - rtex->htile.pitch = width; - rtex->htile.height = height; - rtex->htile.xalign = cl_width * 8; - rtex->htile.yalign = cl_height * 8; - rtex->htile.alignment = base_align; - - return (util_max_layer(&rtex->resource.b.b, 0) + 1) * + rtex->surface.htile_alignment = base_align; + rtex->surface.htile_size = + (util_max_layer(&rtex->resource.b.b, 0) + 1) * align(slice_bytes, base_align); } static void r600_texture_allocate_htile(struct r600_common_screen *rscreen, struct r600_texture *rtex) { - uint64_t htile_size, alignment; uint32_t clear_value; - if (rtex->tc_compatible_htile) { - htile_size = rtex->surface.htile_size; - alignment = rtex->surface.htile_alignment; + if (rscreen->chip_class >= GFX9 || rtex->tc_compatible_htile) { clear_value = 0x0000030F; } else { - htile_size = r600_texture_get_htile_size(rscreen, rtex); - alignment = rtex->htile.alignment; + r600_texture_get_htile_size(rscreen, rtex); clear_value = 0; } - if (!htile_size) + if (!rtex->surface.htile_size) return; rtex->htile_buffer = (struct r600_resource*) - r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, - htile_size, alignment); + r600_aligned_buffer_create(&rscreen->b, + R600_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + rtex->surface.htile_size, + rtex->surface.htile_alignment); if (rtex->htile_buffer == NULL) { /* this is not a fatal error as we can still keep rendering * without htile buffer */ R600_ERR("Failed to create buffer object for htile buffer.\n"); } else { r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, - 0, htile_size, clear_value, - R600_COHERENCY_NONE); + 0, rtex->surface.htile_size, + clear_value); } } -void r600_print_texture_info(struct r600_texture *rtex, FILE *f) +void r600_print_texture_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, FILE *f) { int i; + /* Common parameters. 
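+ * (These are printed for every chip generation; the GFX9 and
+ * legacy layouts get their own dedicated blocks further down.)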
*/ fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, " + "blk_h=%u, array_size=%u, last_level=%u, " "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - rtex->surface.npix_x, rtex->surface.npix_y, - rtex->surface.npix_z, rtex->surface.blk_w, - rtex->surface.blk_h, rtex->surface.blk_d, - rtex->surface.array_size, rtex->surface.last_level, - rtex->surface.bpe, rtex->surface.nsamples, + rtex->resource.b.b.width0, rtex->resource.b.b.height0, + rtex->resource.b.b.depth0, rtex->surface.blk_w, + rtex->surface.blk_h, + rtex->resource.b.b.array_size, rtex->resource.b.b.last_level, + rtex->surface.bpe, rtex->resource.b.b.nr_samples, rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format)); - fprintf(f, " Layout: size=%"PRIu64", alignment=%"PRIu64", bankw=%u, " + if (rscreen->chip_class >= GFX9) { + fprintf(f, " Surf: size=%"PRIu64", slice_size=%"PRIu64", " + "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", + rtex->surface.surf_size, + rtex->surface.u.gfx9.surf_slice_size, + rtex->surface.surf_alignment, + rtex->surface.u.gfx9.surf.swizzle_mode, + rtex->surface.u.gfx9.surf.epitch, + rtex->surface.u.gfx9.surf_pitch); + + if (rtex->fmask.size) { + fprintf(f, " FMASK: offset=%"PRIu64", size=%"PRIu64", " + "alignment=%u, swmode=%u, epitch=%u\n", + rtex->fmask.offset, + rtex->surface.u.gfx9.fmask_size, + rtex->surface.u.gfx9.fmask_alignment, + rtex->surface.u.gfx9.fmask.swizzle_mode, + rtex->surface.u.gfx9.fmask.epitch); + } + + if (rtex->cmask.size) { + fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", " + "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", + rtex->cmask.offset, + rtex->surface.u.gfx9.cmask_size, + rtex->surface.u.gfx9.cmask_alignment, + rtex->surface.u.gfx9.cmask.rb_aligned, + rtex->surface.u.gfx9.cmask.pipe_aligned); + } + + if (rtex->htile_buffer) { + fprintf(f, " HTile: size=%u, alignment=%u, " + "rb_aligned=%u, pipe_aligned=%u\n", + rtex->htile_buffer->b.b.width0, + rtex->htile_buffer->buf->alignment, + rtex->surface.u.gfx9.htile.rb_aligned, + rtex->surface.u.gfx9.htile.pipe_aligned); + } + + if (rtex->dcc_offset) { + fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", " + "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", + rtex->dcc_offset, rtex->surface.dcc_size, + rtex->surface.dcc_alignment, + rtex->surface.u.gfx9.dcc_pitch_max, + rtex->surface.num_dcc_levels); + } + + if (rtex->surface.u.gfx9.stencil_offset) { + fprintf(f, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n", + rtex->surface.u.gfx9.stencil_offset, + rtex->surface.u.gfx9.stencil.swizzle_mode, + rtex->surface.u.gfx9.stencil.epitch); + } + return; + } + + fprintf(f, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", - rtex->surface.bo_size, rtex->surface.bo_alignment, rtex->surface.bankw, - rtex->surface.bankh, rtex->surface.num_banks, rtex->surface.mtilea, - rtex->surface.tile_split, rtex->surface.pipe_config, + rtex->surface.surf_size, rtex->surface.surf_alignment, rtex->surface.u.legacy.bankw, + rtex->surface.u.legacy.bankh, rtex->surface.u.legacy.num_banks, rtex->surface.u.legacy.mtilea, + rtex->surface.u.legacy.tile_split, rtex->surface.u.legacy.pipe_config, (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); if (rtex->fmask.size) @@ -986,65 +1053,60 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index); if (rtex->cmask.size) - fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", 
alignment=%u, pitch=%u, " - "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n", + fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, " + "slice_tile_max=%u\n", rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment, - rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign, - rtex->cmask.yalign, rtex->cmask.slice_tile_max); + rtex->cmask.slice_tile_max); if (rtex->htile_buffer) - fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, " - "xalign=%u, yalign=%u, TC_compatible = %u\n", + fprintf(f, " HTile: size=%u, alignment=%u, TC_compatible = %u\n", rtex->htile_buffer->b.b.width0, - rtex->htile_buffer->buf->alignment, rtex->htile.pitch, - rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign, + rtex->htile_buffer->buf->alignment, rtex->tc_compatible_htile); if (rtex->dcc_offset) { - fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n", + fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%u\n", rtex->dcc_offset, rtex->surface.dcc_size, rtex->surface.dcc_alignment); - for (i = 0; i <= rtex->surface.last_level; i++) + for (i = 0; i <= rtex->resource.b.b.last_level; i++) fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", " "fast_clear_size=%"PRIu64"\n", - i, rtex->surface.level[i].dcc_enabled, - rtex->surface.level[i].dcc_offset, - rtex->surface.level[i].dcc_fast_clear_size); + i, i < rtex->surface.num_dcc_levels, + rtex->surface.u.legacy.level[i].dcc_offset, + rtex->surface.u.legacy.level[i].dcc_fast_clear_size); } - for (i = 0; i <= rtex->surface.last_level; i++) + for (i = 0; i <= rtex->resource.b.b.last_level; i++) fprintf(f, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.level[i].offset, - rtex->surface.level[i].slice_size, + "mode=%u, tiling_index = %u\n", + i, rtex->surface.u.legacy.level[i].offset, + rtex->surface.u.legacy.level[i].slice_size, u_minify(rtex->resource.b.b.width0, i), u_minify(rtex->resource.b.b.height0, i), u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.level[i].nblk_x, - rtex->surface.level[i].nblk_y, - rtex->surface.level[i].nblk_z, - rtex->surface.level[i].pitch_bytes, - rtex->surface.level[i].mode); + rtex->surface.u.legacy.level[i].nblk_x, + rtex->surface.u.legacy.level[i].nblk_y, + rtex->surface.u.legacy.level[i].mode, + rtex->surface.u.legacy.tiling_index[i]); if (rtex->surface.flags & RADEON_SURF_SBUFFER) { fprintf(f, " StencilLayout: tilesplit=%u\n", - rtex->surface.stencil_tile_split); - for (i = 0; i <= rtex->surface.last_level; i++) { + rtex->surface.u.legacy.stencil_tile_split); + for (i = 0; i <= rtex->resource.b.b.last_level; i++) { fprintf(f, " StencilLevel[%i]: offset=%"PRIu64", " "slice_size=%"PRIu64", npix_x=%u, " "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.stencil_level[i].offset, - rtex->surface.stencil_level[i].slice_size, + "mode=%u, tiling_index = %u\n", + i, rtex->surface.u.legacy.stencil_level[i].offset, + rtex->surface.u.legacy.stencil_level[i].slice_size, u_minify(rtex->resource.b.b.width0, i), u_minify(rtex->resource.b.b.height0, i), u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.stencil_level[i].nblk_x, - rtex->surface.stencil_level[i].nblk_y, - rtex->surface.stencil_level[i].nblk_z, - rtex->surface.stencil_level[i].pitch_bytes, - rtex->surface.stencil_level[i].mode); + rtex->surface.u.legacy.stencil_level[i].nblk_x, + 
rtex->surface.u.legacy.stencil_level[i].nblk_y, + rtex->surface.u.legacy.stencil_level[i].mode, + rtex->surface.u.legacy.stencil_tiling_index[i]); } } } @@ -1053,8 +1115,6 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) static struct r600_texture * r600_texture_create_object(struct pipe_screen *screen, const struct pipe_resource *base, - unsigned pitch_in_bytes_override, - unsigned offset, struct pb_buffer *buf, struct radeon_surf *surface) { @@ -1077,25 +1137,29 @@ r600_texture_create_object(struct pipe_screen *screen, rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format)); rtex->surface = *surface; - if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) { - FREE(rtex); - return NULL; - } + rtex->size = rtex->surface.surf_size; - rtex->tc_compatible_htile = rtex->surface.htile_size != 0; - assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) == - rtex->tc_compatible_htile); + rtex->tc_compatible_htile = rtex->surface.htile_size != 0 && + (rtex->surface.flags & + RADEON_SURF_TC_COMPATIBLE_HTILE); - /* TC-compatible HTILE only supports Z32_FLOAT. */ - if (rtex->tc_compatible_htile) - rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT; - else + /* TC-compatible HTILE: + * - VI only supports Z32_FLOAT. + * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ + if (rtex->tc_compatible_htile) { + if (rscreen->chip_class >= GFX9 && + base->format == PIPE_FORMAT_Z16_UNORM) + rtex->db_render_format = base->format; + else + rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + } else { rtex->db_render_format = base->format; + } /* Tiled depth textures utilize the non-displayable tile order. * This must be done after r600_setup_surface. * Applies to R600-Cayman. */ - rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D; + rtex->non_disp_tiling = rtex->is_depth && rtex->surface.u.legacy.level[0].mode >= RADEON_SURF_MODE_1D; /* Applies to GCN. */ rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode; @@ -1109,8 +1173,13 @@ r600_texture_create_object(struct pipe_screen *screen, if (base->flags & (R600_RESOURCE_FLAG_TRANSFER | R600_RESOURCE_FLAG_FLUSHED_DEPTH) || rscreen->chip_class >= EVERGREEN) { - rtex->can_sample_z = !rtex->surface.depth_adjusted; - rtex->can_sample_s = !rtex->surface.stencil_adjusted; + if (rscreen->chip_class >= GFX9) { + rtex->can_sample_z = true; + rtex->can_sample_s = true; + } else { + rtex->can_sample_z = !rtex->surface.u.legacy.depth_adjusted; + rtex->can_sample_s = !rtex->surface.u.legacy.stencil_adjusted; + } } else { if (rtex->resource.b.b.nr_samples <= 1 && (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || @@ -1154,7 +1223,7 @@ r600_texture_create_object(struct pipe_screen *screen, /* Now create the backing buffer. */ if (!buf) { r600_init_resource_fields(rscreen, resource, rtex->size, - rtex->surface.bo_alignment); + rtex->surface.surf_alignment); resource->flags |= RADEON_FLAG_HANDLE; @@ -1178,7 +1247,7 @@ r600_texture_create_object(struct pipe_screen *screen, /* Initialize the cmask to 0xCC (= compressed state). */ r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b, rtex->cmask.offset, rtex->cmask.size, - 0xCCCCCCCC, R600_COHERENCY_NONE); + 0xCCCCCCCC); } /* Initialize DCC only if the texture is not being imported. 
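* (Imported textures must keep whatever DCC state the exporter left
* behind; fresh allocations are cleared to 0xFFFFFFFF below, i.e.
* every DCC byte set to the "uncompressed" encoding.)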
*/ @@ -1186,7 +1255,7 @@ r600_texture_create_object(struct pipe_screen *screen, r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, rtex->dcc_offset, rtex->surface.dcc_size, - 0xFFFFFFFF, R600_COHERENCY_NONE); + 0xFFFFFFFF); } /* Initialize the CMASK base register value. */ @@ -1203,15 +1272,16 @@ r600_texture_create_object(struct pipe_screen *screen, if (rscreen->debug_flags & DBG_TEX) { puts("Texture:"); - r600_print_texture_info(rtex, stdout); + r600_print_texture_info(rscreen, rtex, stdout); fflush(stdout); } return rtex; } -static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, - const struct pipe_resource *templ) +static enum radeon_surf_mode +r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ) { const struct util_format_description *desc = util_format_description(templ->format); bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING; @@ -1256,7 +1326,9 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, /* Textures with a very small height are recommended to be linear. */ if (templ->target == PIPE_TEXTURE_1D || templ->target == PIPE_TEXTURE_1D_ARRAY || - templ->height0 <= 4) + /* Only very thin and long 2D textures should benefit from + * linear_aligned. */ + (templ->width0 > 8 && templ->height0 <= 2)) return RADEON_SURF_MODE_LINEAR_ALIGNED; /* Textures likely to be mapped often. */ @@ -1291,17 +1363,15 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, int r; r = r600_init_surface(rscreen, &surface, templ, - r600_choose_tiling(rscreen, templ), - is_flushed_depth, tc_compatible_htile); + r600_choose_tiling(rscreen, templ), 0, 0, + false, false, is_flushed_depth, + tc_compatible_htile); if (r) { return NULL; } - r = rscreen->ws->surface_best(rscreen->ws, &surface); - if (r) { - return NULL; - } - return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0, - 0, NULL, &surface); + + return (struct pipe_resource *) + r600_texture_create_object(screen, templ, NULL, &surface); } static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, @@ -1317,6 +1387,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen int r; struct radeon_bo_metadata metadata = {}; struct r600_texture *rtex; + bool is_scanout; /* Support only 2D textures without mipmaps */ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || @@ -1329,31 +1400,39 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen rscreen->ws->buffer_get_metadata(buf, &metadata); - surface.pipe_config = metadata.pipe_config; - surface.bankw = metadata.bankw; - surface.bankh = metadata.bankh; - surface.tile_split = metadata.tile_split; - surface.mtilea = metadata.mtilea; - surface.num_banks = metadata.num_banks; - - if (metadata.macrotile == RADEON_LAYOUT_TILED) - array_mode = RADEON_SURF_MODE_2D; - else if (metadata.microtile == RADEON_LAYOUT_TILED) - array_mode = RADEON_SURF_MODE_1D; - else - array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + if (rscreen->chip_class >= GFX9) { + if (metadata.u.gfx9.swizzle_mode > 0) + array_mode = RADEON_SURF_MODE_2D; + else + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + is_scanout = metadata.u.gfx9.swizzle_mode == 0 || + metadata.u.gfx9.swizzle_mode % 4 == 2; + } else { + surface.u.legacy.pipe_config = metadata.u.legacy.pipe_config; + surface.u.legacy.bankw = metadata.u.legacy.bankw; + surface.u.legacy.bankh = metadata.u.legacy.bankh; + surface.u.legacy.tile_split = 
metadata.u.legacy.tile_split; + surface.u.legacy.mtilea = metadata.u.legacy.mtilea; + surface.u.legacy.num_banks = metadata.u.legacy.num_banks; + + if (metadata.u.legacy.macrotile == RADEON_LAYOUT_TILED) + array_mode = RADEON_SURF_MODE_2D; + else if (metadata.u.legacy.microtile == RADEON_LAYOUT_TILED) + array_mode = RADEON_SURF_MODE_1D; + else + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + is_scanout = metadata.u.legacy.scanout; + } - r = r600_init_surface(rscreen, &surface, templ, array_mode, - false, false); + r = r600_init_surface(rscreen, &surface, templ, array_mode, stride, + offset, true, is_scanout, false, false); if (r) { return NULL; } - if (metadata.scanout) - surface.flags |= RADEON_SURF_SCANOUT; - - rtex = r600_texture_create_object(screen, templ, stride, - offset, buf, &surface); + rtex = r600_texture_create_object(screen, templ, buf, &surface); if (!rtex) return NULL; @@ -1363,6 +1442,11 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen if (rscreen->apply_opaque_metadata) rscreen->apply_opaque_metadata(rscreen, rtex, &metadata); + /* Validate that addrlib arrived at the same surface parameters. */ + if (rscreen->chip_class >= GFX9) { + assert(metadata.u.gfx9.swizzle_mode == surface.u.gfx9.surf.swizzle_mode); + } + return &rtex->resource.b.b; } @@ -1486,7 +1570,7 @@ static void r600_texture_invalidate_storage(struct r600_common_context *rctx, /* There is no point in discarding depth and tiled buffers. */ assert(!rtex->is_depth); - assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED); + assert(rtex->surface.is_linear); /* Reallocate the buffer in the same pipe_resource. */ r600_alloc_resource(rscreen, &rtex->resource); @@ -1495,8 +1579,7 @@ static void r600_texture_invalidate_storage(struct r600_common_context *rctx, rtex->cmask.base_address_reg = (rtex->resource.gpu_address + rtex->cmask.offset) >> 8; - r600_dirty_all_framebuffer_states(rscreen); - p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); + p_atomic_inc(&rscreen->dirty_tex_counter); rctx->num_alloc_tex_transfer_bytes += rtex->size; } @@ -1517,6 +1600,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, bool use_staging_texture = false; assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER)); + assert(box->width && box->height && box->depth); /* Depth textures use staging unconditionally. */ if (!rtex->is_depth) { @@ -1539,17 +1623,18 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, /* Tiled textures need to be converted into a linear texture for CPU * access. The staging texture is always linear and is placed in GART. * - * Reading from VRAM is slow, always use the staging texture in - * this case. + * Reading from VRAM or GTT WC is slow, always use the staging + * texture in this case. * * Use the staging texture for uploads if the underlying BO * is busy. 
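* (Mapping busy VRAM or write-combined GTT directly would stall or
* read very slowly, so the checks below pick staging whenever the
* surface is tiled, a read touches VRAM/GTT-WC, or the buffer is
* still referenced by an in-flight ring.)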
*/ - if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) + if (!rtex->surface.is_linear) use_staging_texture = true; else if (usage & PIPE_TRANSFER_READ) - use_staging_texture = (rtex->resource.domains & - RADEON_DOMAIN_VRAM) != 0; + use_staging_texture = + rtex->resource.domains & RADEON_DOMAIN_VRAM || + rtex->resource.flags & RADEON_FLAG_GTT_WC; /* Write & linear only: */ else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, RADEON_USAGE_READWRITE) || @@ -1567,7 +1652,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, trans = CALLOC_STRUCT(r600_transfer); if (!trans) return NULL; - trans->transfer.resource = texture; + pipe_resource_reference(&trans->transfer.resource, texture); trans->transfer.level = level; trans->transfer.usage = usage; trans->transfer.box = *box; @@ -1609,8 +1694,12 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, 0, 0, 0, box->depth, 0, 0); pipe_resource_reference(&temp, NULL); } - } - else { + + /* Just get the strides. */ + r600_texture_get_offset(rctx->screen, staging_depth, level, NULL, + &trans->transfer.stride, + &trans->transfer.layer_stride); + } else { /* XXX: only readback the rectangle which is being mapped? */ /* XXX: when discard is true, no need to read back from depth texture */ if (!r600_init_flushed_depth_texture(ctx, texture, &staging_depth)) { @@ -1624,11 +1713,12 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, box->z, box->z + box->depth - 1, 0, 0); - offset = r600_texture_get_offset(staging_depth, level, box); + offset = r600_texture_get_offset(rctx->screen, staging_depth, + level, box, + &trans->transfer.stride, + &trans->transfer.layer_stride); } - trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes; - trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size; trans->staging = (struct r600_resource*)staging_depth; buf = trans->staging; } else if (use_staging_texture) { @@ -1648,8 +1738,11 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, return NULL; } trans->staging = &staging->resource; - trans->transfer.stride = staging->surface.level[0].pitch_bytes; - trans->transfer.layer_stride = staging->surface.level[0].slice_size; + + /* Just get the strides. 
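+ * (r600_texture_get_offset is called with a NULL box, so only
+ * transfer.stride and transfer.layer_stride are filled in here.)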
*/ + r600_texture_get_offset(rctx->screen, staging, 0, NULL, + &trans->transfer.stride, + &trans->transfer.layer_stride); if (usage & PIPE_TRANSFER_READ) r600_copy_to_staging_texture(ctx, trans); @@ -1659,9 +1752,9 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, buf = trans->staging; } else { /* the resource is mapped directly */ - trans->transfer.stride = rtex->surface.level[level].pitch_bytes; - trans->transfer.layer_stride = rtex->surface.level[level].slice_size; - offset = r600_texture_get_offset(rtex, level, box); + offset = r600_texture_get_offset(rctx->screen, rtex, level, box, + &trans->transfer.stride, + &trans->transfer.layer_stride); buf = &rtex->resource; } @@ -1717,6 +1810,7 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, rctx->num_alloc_tex_transfer_bytes = 0; } + pipe_resource_reference(&transfer->resource, NULL); FREE(transfer); } @@ -1813,15 +1907,26 @@ bool vi_dcc_formats_compatible(enum pipe_format format1, type1 == type2; } -void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format) +{ + struct r600_texture *rtex = (struct r600_texture *)tex; + + return vi_dcc_enabled(rtex, level) && + !vi_dcc_formats_compatible(tex->format, view_format); +} + +/* This can't be merged with the above function, because + * vi_dcc_formats_compatible should be called only when DCC is enabled. */ +void vi_disable_dcc_if_incompatible_format(struct r600_common_context *rctx, struct pipe_resource *tex, unsigned level, enum pipe_format view_format) { struct r600_texture *rtex = (struct r600_texture *)tex; - if (rtex->dcc_offset && - rtex->surface.level[level].dcc_enabled && + if (vi_dcc_enabled(rtex, level) && !vi_dcc_formats_compatible(tex->format, view_format)) if (!r600_texture_disable_dcc(rctx, (struct r600_texture*)tex)) rctx->decompress_dcc(&rctx->b, rtex); @@ -1830,10 +1935,9 @@ void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, + unsigned width0, unsigned height0, unsigned width, unsigned height) { - struct r600_common_context *rctx = (struct r600_common_context*)pipe; - struct r600_texture *rtex = (struct r600_texture*)texture; struct r600_surface *surface = CALLOC_STRUCT(r600_surface); if (!surface) @@ -1849,13 +1953,14 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, surface->base.width = width; surface->base.height = height; surface->base.u = templ->u; - surface->level_info = &rtex->surface.level[templ->u.tex.level]; - if (texture->target != PIPE_BUFFER) - vi_dcc_disable_if_incompatible_format(rctx, texture, - templ->u.tex.level, - templ->format); + surface->width0 = width0; + surface->height0 = height0; + surface->dcc_incompatible = + texture->target != PIPE_BUFFER && + vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, + templ->format); return &surface->base; } @@ -1866,6 +1971,8 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe, unsigned level = templ->u.tex.level; unsigned width = u_minify(tex->width0, level); unsigned height = u_minify(tex->height0, level); + unsigned width0 = tex->width0; + unsigned height0 = tex->height0; if (tex->target != PIPE_BUFFER && templ->format != tex->format) { const struct util_format_description *tex_desc @@ -1884,10 +1991,15 @@ static struct 
pipe_surface *r600_create_surface(struct pipe_context *pipe, width = nblks_x * templ_desc->block.width; height = nblks_y * templ_desc->block.height; + + width0 = util_format_get_nblocksx(tex->format, width0); + height0 = util_format_get_nblocksy(tex->format, height0); } } - return r600_create_surface_custom(pipe, tex, templ, width, height); + return r600_create_surface_custom(pipe, tex, templ, + width0, height0, + width, height); } static void r600_surface_destroy(struct pipe_context *pipe, @@ -2157,7 +2269,7 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, if (!tex->resource.is_shared || !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || tex->resource.b.b.target != PIPE_TEXTURE_2D || - tex->surface.last_level > 0 || + tex->resource.b.b.last_level > 0 || !tex->surface.dcc_size) return; @@ -2173,7 +2285,7 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, if (!vi_should_enable_separate_dcc(tex)) return; /* stats show that DCC decompression is too expensive */ - assert(tex->surface.level[0].dcc_enabled); + assert(tex->surface.num_dcc_levels); assert(!tex->dcc_separate_buffer); r600_texture_discard_cmask(rctx->screen, tex); @@ -2186,7 +2298,8 @@ static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, tex->last_dcc_separate_buffer = NULL; } else { tex->dcc_separate_buffer = (struct r600_resource*) - r600_aligned_buffer_create(rctx->b.screen, 0, + r600_aligned_buffer_create(rctx->b.screen, + R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, tex->surface.dcc_size, tex->surface.dcc_alignment); @@ -2272,7 +2385,7 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, memset(&uc, 0, sizeof(uc)); - if (util_format_get_blocksizebits(surface_format) == 128) { + if (rtex->surface.bpe == 16) { /* DCC fast clear only: * CLEAR_WORD0 = R = G = B * CLEAR_WORD1 = A @@ -2386,9 +2499,9 @@ void vi_dcc_clear_level(struct r600_common_context *rctx, unsigned level, unsigned clear_value) { struct pipe_resource *dcc_buffer; - uint64_t dcc_offset; + uint64_t dcc_offset, clear_size; - assert(rtex->dcc_offset && rtex->surface.level[level].dcc_enabled); + assert(vi_dcc_enabled(rtex, level)); if (rtex->dcc_separate_buffer) { dcc_buffer = &rtex->dcc_separate_buffer->b.b; @@ -2398,10 +2511,18 @@ void vi_dcc_clear_level(struct r600_common_context *rctx, dcc_offset = rtex->dcc_offset; } - dcc_offset += rtex->surface.level[level].dcc_offset; + if (rctx->chip_class >= GFX9) { + /* Mipmap level clears aren't implemented. */ + assert(rtex->resource.b.b.last_level == 0); + /* MSAA needs a different clear size. 
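+ * (Hence the single-sample assertion below; the whole
+ * surface.dcc_size is cleared rather than a per-level range.)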
*/ + assert(rtex->resource.b.b.nr_samples <= 1); + clear_size = rtex->surface.dcc_size; + } else { + dcc_offset += rtex->surface.u.legacy.level[level].dcc_offset; + clear_size = rtex->surface.u.legacy.level[level].dcc_fast_clear_size; + } - rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset, - rtex->surface.level[level].dcc_fast_clear_size, + rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset, clear_size, clear_value, R600_COHERENCY_CB_META); } @@ -2413,27 +2534,59 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen, struct r600_texture *rtex) { if (rtex->resource.is_shared || - rtex->surface.nsamples <= 1 || + rtex->resource.b.b.nr_samples <= 1 || rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode) return; - assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_2D); - assert(rtex->surface.last_level == 0); + assert(rscreen->chip_class >= GFX9 || + rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); + assert(rtex->resource.b.b.last_level == 0); + + if (rscreen->chip_class >= GFX9) { + /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */ + assert(rtex->surface.u.gfx9.surf.swizzle_mode >= 4); + + /* If you do swizzle_mode % 4, you'll get: + * 0 = Depth + * 1 = Standard, + * 2 = Displayable + * 3 = Rotated + * + * Depth-sample order isn't allowed: + */ + assert(rtex->surface.u.gfx9.surf.swizzle_mode % 4 != 0); - /* These magic numbers were copied from addrlib. It doesn't use any - * definitions for them either. They are all 2D_TILED_THIN1 modes with - * different bpp and micro tile mode. - */ - if (rscreen->chip_class >= CIK) { switch (rtex->last_msaa_resolve_target_micro_mode) { - case 0: /* displayable */ - rtex->surface.tiling_index[0] = 10; + case RADEON_MICRO_MODE_DISPLAY: + rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + rtex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */ break; - case 1: /* thin */ - rtex->surface.tiling_index[0] = 14; + case RADEON_MICRO_MODE_THIN: + rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + rtex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */ break; - case 3: /* rotated */ - rtex->surface.tiling_index[0] = 28; + case RADEON_MICRO_MODE_ROTATED: + rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + rtex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */ + break; + default: /* depth */ + assert(!"unexpected micro mode"); + return; + } + } else if (rscreen->chip_class >= CIK) { + /* These magic numbers were copied from addrlib. It doesn't use + * any definitions for them either. They are all 2D_TILED_THIN1 + * modes with different bpp and micro tile mode. 
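+ * (For reference: index 10 is the displayable, 14 the thin and 28
+ * the rotated 2D_TILED_THIN1 variant in the switch that follows.)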
+ */ + switch (rtex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + rtex->surface.u.legacy.tiling_index[0] = 10; + break; + case RADEON_MICRO_MODE_THIN: + rtex->surface.u.legacy.tiling_index[0] = 14; + break; + case RADEON_MICRO_MODE_ROTATED: + rtex->surface.u.legacy.tiling_index[0] = 28; break; default: /* depth, thick */ assert(!"unexpected micro mode"); @@ -2441,32 +2594,32 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen, } } else { /* SI */ switch (rtex->last_msaa_resolve_target_micro_mode) { - case 0: /* displayable */ + case RADEON_MICRO_MODE_DISPLAY: switch (rtex->surface.bpe) { case 1: - rtex->surface.tiling_index[0] = 10; + rtex->surface.u.legacy.tiling_index[0] = 10; break; case 2: - rtex->surface.tiling_index[0] = 11; + rtex->surface.u.legacy.tiling_index[0] = 11; break; default: /* 4, 8 */ - rtex->surface.tiling_index[0] = 12; + rtex->surface.u.legacy.tiling_index[0] = 12; break; } break; - case 1: /* thin */ + case RADEON_MICRO_MODE_THIN: switch (rtex->surface.bpe) { case 1: - rtex->surface.tiling_index[0] = 14; + rtex->surface.u.legacy.tiling_index[0] = 14; break; case 2: - rtex->surface.tiling_index[0] = 15; + rtex->surface.u.legacy.tiling_index[0] = 15; break; case 4: - rtex->surface.tiling_index[0] = 16; + rtex->surface.u.legacy.tiling_index[0] = 16; break; default: /* 8, 16 */ - rtex->surface.tiling_index[0] = 17; + rtex->surface.u.legacy.tiling_index[0] = 17; break; } break; @@ -2478,8 +2631,7 @@ static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen, rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode; - p_atomic_inc(&rscreen->dirty_fb_counter); - p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); + p_atomic_inc(&rscreen->dirty_tex_counter); } void evergreen_do_fast_color_clear(struct r600_common_context *rctx, @@ -2523,7 +2675,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, } /* only supported on tiled surfaces */ - if (tex->surface.level[0].mode < RADEON_SURF_MODE_1D) { + if (tex->surface.is_linear) { continue; } @@ -2536,8 +2688,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ - if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && - rctx->chip_class >= CIK && + if (rctx->chip_class == CIK && + tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && rctx->screen->info.drm_major == 2 && rctx->screen->info.drm_minor < 38) { continue; @@ -2550,9 +2702,10 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, !(rctx->screen->debug_flags & DBG_NO_DCC_FB)) { vi_separate_dcc_try_enable(rctx, tex); - /* Stoney can't do a CMASK-based clear, so all clears are - * considered to be hypothetically slow clears, which - * is weighed when determining to enable separate DCC. + /* RB+ isn't supported with a CMASK clear only on Stoney, + * so all clears are considered to be hypothetically slow + * clears, which is weighed when determining whether to + * enable separate DCC. */ if (tex->dcc_gather_statistics && rctx->family == CHIP_STONEY) @@ -2560,10 +2713,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, } /* Try to clear DCC first, otherwise try CMASK. 
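* (DCC is used when vi_dcc_enabled() reports it for level 0; the
* CMASK fast clear below is the fallback for everything else.)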
*/ - if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) { + if (vi_dcc_enabled(tex, 0)) { uint32_t reset_value; bool clear_words_needed; + /* TODO: fix DCC clear */ + if (rctx->chip_class >= GFX9) + continue; + if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR) continue; @@ -2574,16 +2731,23 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, vi_dcc_clear_level(rctx, tex, 0, reset_value); - if (clear_words_needed) - tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + unsigned level_bit = 1 << fb->cbufs[i]->u.tex.level; + if (clear_words_needed) { + bool need_compressed_update = !tex->dirty_level_mask; + + tex->dirty_level_mask |= level_bit; + + if (need_compressed_update) + p_atomic_inc(&rctx->screen->compressed_colortex_counter); + } tex->separate_dcc_dirty = true; } else { /* 128-bit formats are unusupported */ - if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) { + if (tex->surface.bpe > 8) { continue; } - /* Stoney/RB+ doesn't work with CMASK fast clear. */ + /* RB+ doesn't work with CMASK fast clear on Stoney. */ if (rctx->family == CHIP_STONEY) continue; @@ -2598,7 +2762,12 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, tex->cmask.offset, tex->cmask.size, 0, R600_COHERENCY_CB_META); + bool need_compressed_update = !tex->dirty_level_mask; + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + + if (need_compressed_update) + p_atomic_inc(&rctx->screen->compressed_colortex_counter); } /* We can change the micro tile mode before a full clear. */ diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c index fb1491a28..d5352d9de 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c @@ -91,6 +91,12 @@ struct ruvd_decoder { bool use_legacy; struct rvid_buffer ctx; struct rvid_buffer sessionctx; + struct { + unsigned data0; + unsigned data1; + unsigned cmd; + unsigned cntl; + } reg; }; /* flush IB to the hardware */ @@ -120,14 +126,14 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, uint64_t addr; addr = dec->ws->buffer_get_virtual_address(buf); addr = addr + off; - set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr); - set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32); + set_reg(dec, dec->reg.data0, addr); + set_reg(dec, dec->reg.data1, addr >> 32); } else { off += dec->ws->buffer_get_reloc_offset(buf); set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); } - set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1); + set_reg(dec, dec->reg.cmd, cmd << 1); } /* do the codec needs an IT buffer ?*/ @@ -151,6 +157,8 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec) /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; + memset(dec->msg, 0, sizeof(*dec->msg)); + dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); if (have_it(dec)) dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size); @@ -322,6 +330,14 @@ static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_ return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; } +static unsigned get_db_pitch_alignment(struct ruvd_decoder *dec) +{ + if (((struct r600_common_screen*)dec->screen)->family < CHIP_VEGA10) + return 16; + else + return 32; +} + /* calculate size of reference picture buffer */ static unsigned calc_dpb_size(struct ruvd_decoder *dec) { @@ -335,7 +351,7 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) unsigned max_references = 
dec->base.max_references + 1; // aligned size of a single frame - image_size = width * height; + image_size = align(width, get_db_pitch_alignment(dec)) * height; image_size += image_size / 2; image_size = align(image_size, 1024); @@ -410,9 +426,9 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - dpb_size = align((width * height * 9) / 4, 256) * max_references; + dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 9) / 4, 256) * max_references; else - dpb_size = align((width * height * 3) / 2, 256) * max_references; + dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -478,6 +494,7 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ memset(&result, 0, sizeof(result)); switch (pic->base.profile) { case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE: result.profile = RUVD_H264_PROFILE_BASELINE; break; @@ -703,13 +720,16 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video result.direct_reflist[i][j] = pic->RefPicList[i][j]; } - if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) && - (target->buffer_format == PIPE_FORMAT_NV12)) { - result.p010_mode = 0; - result.luma_10to8 = 5; - result.chroma_10to8 = 5; - result.sclr_luma10to8 = 4; - result.sclr_chroma10to8 = 4; + if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) { + if (target->buffer_format == PIPE_FORMAT_P016) { + result.p010_mode = 1; + result.msb_mode = 1; + } else { + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } } /* TODO @@ -931,7 +951,6 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) assert(decoder); map_msg_fb_it_buf(dec); - memset(dec->msg, 0, sizeof(*dec->msg)); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_DESTROY; dec->msg->stream_handle = dec->stream_handle; @@ -1074,7 +1093,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; - dec->msg->body.decode.db_pitch = align(dec->base.width, 16); + dec->msg->body.decode.db_pitch = align(dec->base.width, get_db_pitch_alignment(dec)); if (dec->stream_type == RUVD_CODEC_H264_PERF && ((struct r600_common_screen*)dec->screen)->family >= CHIP_POLARIS10) @@ -1146,7 +1165,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, if (have_it(dec)) send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf, FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); - set_reg(dec, RUVD_ENGINE_CNTL, 1); + set_reg(dec, dec->reg.cntl, 1); flush(dec, RADEON_FLUSH_ASYNC); next_buffer(dec); @@ -1280,6 +1299,18 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->sessionctx); } + if (info.family >= CHIP_VEGA10) { + dec->reg.data0 = RUVD_GPCOM_VCPU_DATA0_SOC15; + dec->reg.data1 = RUVD_GPCOM_VCPU_DATA1_SOC15; + dec->reg.cmd = RUVD_GPCOM_VCPU_CMD_SOC15; + dec->reg.cntl = RUVD_ENGINE_CNTL_SOC15; + } else { + dec->reg.data0 = RUVD_GPCOM_VCPU_DATA0; + dec->reg.data1 = RUVD_GPCOM_VCPU_DATA1; + dec->reg.cmd = RUVD_GPCOM_VCPU_CMD; + dec->reg.cntl = RUVD_ENGINE_CNTL; + } + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = 
RUVD_MSG_CREATE; @@ -1315,10 +1346,20 @@ error: } /* calculate top/bottom offset */ -static unsigned texture_offset(struct radeon_surf *surface, unsigned layer) +static unsigned texture_offset(struct radeon_surf *surface, unsigned layer, + enum ruvd_surface_type type) { - return surface->level[0].offset + - layer * surface->level[0].slice_size; + switch (type) { + default: + case RUVD_SURFACE_TYPE_LEGACY: + return surface->u.legacy.level[0].offset + + layer * surface->u.legacy.level[0].slice_size; + break; + case RUVD_SURFACE_TYPE_GFX9: + return surface->u.gfx9.surf_offset + + layer * surface->u.gfx9.surf_slice_size; + break; + } } /* hw encode the aspect of macro tiles */ @@ -1351,42 +1392,63 @@ static unsigned bank_wh(unsigned bankwh) * fill decoding target field from the luma and chroma surfaces */ void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma, - struct radeon_surf *chroma) + struct radeon_surf *chroma, enum ruvd_surface_type type) { - msg->body.decode.dt_pitch = luma->level[0].pitch_bytes; - switch (luma->level[0].mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR; - msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR; - break; - case RADEON_SURF_MODE_1D: - msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; - msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_1D_THIN; - break; - case RADEON_SURF_MODE_2D: - msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; - msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_2D_THIN; - break; + switch (type) { default: - assert(0); - break; - } + case RUVD_SURFACE_TYPE_LEGACY: + msg->body.decode.dt_pitch = luma->u.legacy.level[0].nblk_x; + switch (luma->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR; + break; + case RADEON_SURF_MODE_1D: + msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_1D_THIN; + break; + case RADEON_SURF_MODE_2D: + msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_2D_THIN; + break; + default: + assert(0); + break; + } - msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0); - msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0); - if (msg->body.decode.dt_field_mode) { - msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1); - msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1); - } else { - msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset; - msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset; - } + msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0, type); + msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0, type); + if (msg->body.decode.dt_field_mode) { + msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1, type); + msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1, type); + } else { + msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset; + msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset; + } - assert(luma->bankw == chroma->bankw); - assert(luma->bankh == chroma->bankh); - assert(luma->mtilea == chroma->mtilea); + assert(luma->u.legacy.bankw == chroma->u.legacy.bankw); + assert(luma->u.legacy.bankh == chroma->u.legacy.bankh); + assert(luma->u.legacy.mtilea == chroma->u.legacy.mtilea); - 
msg->body.decode.dt_surf_tile_config |= RUVD_BANK_WIDTH(bank_wh(luma->bankw)); - msg->body.decode.dt_surf_tile_config |= RUVD_BANK_HEIGHT(bank_wh(luma->bankh)); - msg->body.decode.dt_surf_tile_config |= RUVD_MACRO_TILE_ASPECT_RATIO(macro_tile_aspect(luma->mtilea)); + msg->body.decode.dt_surf_tile_config |= RUVD_BANK_WIDTH(bank_wh(luma->u.legacy.bankw)); + msg->body.decode.dt_surf_tile_config |= RUVD_BANK_HEIGHT(bank_wh(luma->u.legacy.bankh)); + msg->body.decode.dt_surf_tile_config |= RUVD_MACRO_TILE_ASPECT_RATIO(macro_tile_aspect(luma->u.legacy.mtilea)); + break; + case RUVD_SURFACE_TYPE_GFX9: + msg->body.decode.dt_pitch = luma->u.gfx9.surf_pitch * luma->bpe; + /* SWIZZLE LINEAR MODE */ + msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR; + msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0, type); + msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0, type); + if (msg->body.decode.dt_field_mode) { + msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1, type); + msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1, type); + } else { + msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset; + msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset; + } + msg->body.decode.dt_surf_tile_config = 0; + break; + } } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h index e3f8504d8..0c3797e22 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h @@ -56,6 +56,11 @@ #define RUVD_GPCOM_VCPU_DATA1 0xEF14 #define RUVD_ENGINE_CNTL 0xEF18 +#define RUVD_GPCOM_VCPU_CMD_SOC15 0x2070c +#define RUVD_GPCOM_VCPU_DATA0_SOC15 0x20710 +#define RUVD_GPCOM_VCPU_DATA1_SOC15 0x20714 +#define RUVD_ENGINE_CNTL_SOC15 0x20718 + /* UVD commands to VCPU */ #define RUVD_CMD_MSG_BUFFER 0x00000000 #define RUVD_CMD_DPB_BUFFER 0x00000001 @@ -111,6 +116,11 @@ #define RUVD_VC1_PROFILE_MAIN 0x00000001 #define RUVD_VC1_PROFILE_ADVANCED 0x00000002 +enum ruvd_surface_type { + RUVD_SURFACE_TYPE_LEGACY = 0, + RUVD_SURFACE_TYPE_GFX9 +}; + struct ruvd_mvc_element { uint16_t viewOrderIndex; uint16_t viewId; @@ -432,5 +442,5 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, /* fill decoding target field from the luma and chroma surfaces */ void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma, - struct radeon_surf *chroma); + struct radeon_surf *chroma, enum ruvd_surface_type type); #endif diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c index ef93e46c1..70c1e60f5 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c @@ -52,6 +52,7 @@ #define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8)) #define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8)) #define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) +#define FW_53_19_4 ((53 << 24) | (19 << 16) | (4 << 8)) /** * flush commands to the hardware @@ -178,14 +179,15 @@ static unsigned get_cpb_num(struct rvce_encoder *enc) case 41: dpb = 32768; break; - default: case 42: dpb = 34816; break; case 50: dpb = 110400; break; + default: case 51: + case 52: dpb = 184320; break; } @@ -223,9 +225,17 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, signed *luma_offset, signed 
*chroma_offset) { - unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128); - unsigned vpitch = align(enc->luma->npix_y, 16); - unsigned fsize = pitch * (vpitch + vpitch / 2); + struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen; + unsigned pitch, vpitch, fsize; + + if (rscreen->chip_class < GFX9) { + pitch = align(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe, 128); + vpitch = align(enc->luma->u.legacy.level[0].nblk_y, 16); + } else { + pitch = align(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe, 256); + vpitch = align(enc->luma->u.gfx9.surf_height, 16); + } + fsize = pitch * (vpitch + vpitch / 2); *luma_offset = slot->index * fsize; *chroma_offset = *luma_offset + pitch * vpitch; @@ -412,7 +422,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->use_vui = true; if (rscreen->info.family >= CHIP_TONGA && rscreen->info.family != CHIP_STONEY && - rscreen->info.family != CHIP_POLARIS11) + rscreen->info.family != CHIP_POLARIS11 && + rscreen->info.family != CHIP_POLARIS12) enc->dual_pipe = true; /* TODO enable B frame with dual instance */ if ((rscreen->info.family >= CHIP_TONGA) && @@ -454,8 +465,14 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, goto error; get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf); - cpb_size = align(tmp_surf->level[0].pitch_bytes, 128); - cpb_size = cpb_size * align(tmp_surf->npix_y, 32); + + cpb_size = (rscreen->chip_class < GFX9) ? + align(tmp_surf->u.legacy.level[0].nblk_x * tmp_surf->bpe, 128) * + align(tmp_surf->u.legacy.level[0].nblk_y, 32) : + + align(tmp_surf->u.gfx9.surf_pitch * tmp_surf->bpe, 256) * + align(tmp_surf->u.gfx9.surf_height, 32); + cpb_size = cpb_size * 3 / 2; cpb_size = cpb_size * enc->cpb_num; if (enc->dual_pipe) @@ -493,6 +510,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, radeon_vce_52_init(enc); get_pic_param = radeon_vce_52_get_param; break; + case FW_53_19_4: + radeon_vce_52_init(enc); + get_pic_param = radeon_vce_52_get_param; + break; default: goto error; @@ -525,6 +546,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) case FW_52_0_3: case FW_52_4_3: case FW_52_8_3: + case FW_53_19_4: return true; default: return false; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index fe15ded39..b9afd089a 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -94,9 +94,9 @@ static void create(struct rvce_encoder *enc) RVCE_CS(0x00000000); // encPicStructRestriction RVCE_CS(enc->base.width); // encImageWidth RVCE_CS(enc->base.height); // encImageHeight - RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch - RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch - RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw + RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encRefPicLumaPitch + RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encRefPicChromaPitch + RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16) / 8); // encRefYHeightInQw RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO RVCE_END(); } @@ -320,12 +320,12 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(0x00000000); // endOfSequence RVCE_CS(0x00000000); // endOfStream RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, - 
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index fe15ded39..b9afd089a 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -94,9 +94,9 @@ static void create(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // encPicStructRestriction
 	RVCE_CS(enc->base.width); // encImageWidth
 	RVCE_CS(enc->base.height); // encImageHeight
-	RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
-	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
-	RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
+	RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encRefPicLumaPitch
+	RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encRefPicChromaPitch
+	RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16) / 8); // encRefYHeightInQw
 	RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO
 	RVCE_END();
 }
@@ -320,12 +320,12 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
 	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+		  enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
 	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
-	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
-	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
-	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+		  enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+	RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+	RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+	RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
 	RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode
 	RVCE_CS(0x00000000); // encInputPicTileConfig
 	RVCE_CS(enc->pic.picture_type); // encPicType
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
index 262e13ba9..0d1181451 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -127,12 +127,12 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
 	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+		  enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
 	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
-	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
-	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
-	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+		  enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+	RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+	RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+	RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
 	if (enc->dual_pipe)
 		RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
 	else
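A recurring substitution in the encoder files above and below: the old level[0].pitch_bytes field is gone, and the byte pitch is recomputed from the block count, since the new surface layout stores nblk_x instead. Sketch (surf is an assumed struct radeon_surf pointer):

    /* Byte pitch of mip level 0, formerly surf->level[0].pitch_bytes. */
    unsigned pitch_bytes = surf->u.legacy.level[0].nblk_x * surf->bpe;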
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
index 5db01fe52..36cf48047 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
@@ -167,6 +167,7 @@ void radeon_vce_52_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_pict
 
 static void create(struct rvce_encoder *enc)
 {
+	struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen;
 	enc->task_info(enc, 0x00000000, 0, 0, 0);
 
 	RVCE_BEGIN(0x01000001); // create cmd
@@ -177,9 +178,17 @@ static void create(struct rvce_encoder *enc)
 	RVCE_CS(enc->enc_pic.ec.enc_pic_struct_restriction);
 	RVCE_CS(enc->base.width); // encImageWidth
 	RVCE_CS(enc->base.height); // encImageHeight
-	RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
-	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
-	RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
+
+	if (rscreen->chip_class < GFX9) {
+		RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encRefPicLumaPitch
+		RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encRefPicChromaPitch
+		RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16) / 8); // encRefYHeightInQw
+	} else {
+		RVCE_CS(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe); // encRefPicLumaPitch
+		RVCE_CS(enc->chroma->u.gfx9.surf_pitch * enc->chroma->bpe); // encRefPicChromaPitch
+		RVCE_CS(align(enc->luma->u.gfx9.surf_height, 16) / 8); // encRefYHeightInQw
+	}
+
 	RVCE_CS(enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants);
 
 	RVCE_CS(enc->enc_pic.ec.enc_pre_encode_context_buffer_offset);
@@ -191,6 +200,7 @@ static void create(struct rvce_encoder *enc)
 
 static void encode(struct rvce_encoder *enc)
 {
+	struct r600_common_screen *rscreen = (struct r600_common_screen *)enc->screen;
 	signed luma_offset, chroma_offset, bs_offset;
 	unsigned dep, bs_idx = enc->bs_idx++;
 	int i;
@@ -239,13 +249,25 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(enc->enc_pic.eo.insert_aud);
 	RVCE_CS(enc->enc_pic.eo.end_of_sequence);
 	RVCE_CS(enc->enc_pic.eo.end_of_stream);
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
-		  enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
-	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
-	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
-	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+
+	if (rscreen->chip_class < GFX9) {
+		RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+			  enc->luma->u.legacy.level[0].offset); // inputPictureLumaAddressHi/Lo
+		RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+			  enc->chroma->u.legacy.level[0].offset); // inputPictureChromaAddressHi/Lo
+		RVCE_CS(align(enc->luma->u.legacy.level[0].nblk_y, 16)); // encInputFrameYPitch
+		RVCE_CS(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe); // encInputPicLumaPitch
+		RVCE_CS(enc->chroma->u.legacy.level[0].nblk_x * enc->chroma->bpe); // encInputPicChromaPitch
+	} else {
+		RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+			  enc->luma->u.gfx9.surf_offset); // inputPictureLumaAddressHi/Lo
+		RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+			  enc->chroma->u.gfx9.surf_offset); // inputPictureChromaAddressHi/Lo
+		RVCE_CS(align(enc->luma->u.gfx9.surf_height, 16)); // encInputFrameYPitch
+		RVCE_CS(enc->luma->u.gfx9.surf_pitch * enc->luma->bpe); // encInputPicLumaPitch
+		RVCE_CS(enc->chroma->u.gfx9.surf_pitch * enc->chroma->bpe); // encInputPicChromaPitch
+	}
+
 	if (enc->dual_pipe)
 		enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00000000;
 	else
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
index de8e11cd8..c7ad7f7a3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
@@ -72,7 +72,7 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer,
 	 * non-sub-allocated buffer.
 	 */
 	buffer->res = (struct r600_resource *)
-		pipe_buffer_create(screen, PIPE_BIND_CUSTOM | PIPE_BIND_SHARED,
+		pipe_buffer_create(screen, PIPE_BIND_SHARED,
 				   usage, size);
 
 	return buffer->res != NULL;
@@ -129,8 +129,8 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
 {
 	struct r600_common_context *rctx = (struct r600_common_context*)context;
 
-	rctx->clear_buffer(context, &buffer->res->b.b, 0, buffer->res->buf->size,
-			   0, R600_COHERENCY_NONE);
+	rctx->dma_clear_buffer(context, &buffer->res->b.b, 0,
+			       buffer->res->buf->size, 0);
 	context->flush(context, NULL, 0);
 }
 
@@ -138,26 +138,31 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
  * join surfaces into the same buffer with identical tiling params
  * sumup their sizes and replace the backend buffers with a single bo
  */
-void rvid_join_surfaces(struct radeon_winsys* ws,
+void rvid_join_surfaces(struct r600_common_context *rctx,
 			struct pb_buffer** buffers[VL_NUM_COMPONENTS],
 			struct radeon_surf *surfaces[VL_NUM_COMPONENTS])
 {
+	struct radeon_winsys* ws;
 	unsigned best_tiling, best_wh, off;
 	unsigned size, alignment;
 	struct pb_buffer *pb;
 	unsigned i, j;
 
+	ws = rctx->ws;
+
 	for (i = 0, best_tiling = 0, best_wh = ~0; i < VL_NUM_COMPONENTS; ++i) {
 		unsigned wh;
 
 		if (!surfaces[i])
 			continue;
 
-		/* choose the smallest bank w/h for now */
-		wh = surfaces[i]->bankw * surfaces[i]->bankh;
-		if (wh < best_wh) {
-			best_wh = wh;
-			best_tiling = i;
+		if (rctx->chip_class < GFX9) {
+			/* choose the smallest bank w/h for now */
+			wh = surfaces[i]->u.legacy.bankw * surfaces[i]->u.legacy.bankh;
+			if (wh < best_wh) {
+				best_wh = wh;
+				best_tiling = i;
+			}
 		}
 	}
 
@@ -165,17 +170,22 @@ void rvid_join_surfaces(struct radeon_winsys* ws,
 		if (!surfaces[i])
 			continue;
 
-		/* copy the tiling parameters */
-		surfaces[i]->bankw = surfaces[best_tiling]->bankw;
-		surfaces[i]->bankh = surfaces[best_tiling]->bankh;
-		surfaces[i]->mtilea = surfaces[best_tiling]->mtilea;
-		surfaces[i]->tile_split = surfaces[best_tiling]->tile_split;
-
 		/* adjust the texture layer offsets */
-		off = align(off, surfaces[i]->bo_alignment);
-		for (j = 0; j < ARRAY_SIZE(surfaces[i]->level); ++j)
-			surfaces[i]->level[j].offset += off;
-		off += surfaces[i]->bo_size;
+		off = align(off, surfaces[i]->surf_alignment);
+
+		if (rctx->chip_class < GFX9) {
+			/* copy the tiling parameters */
+			surfaces[i]->u.legacy.bankw = surfaces[best_tiling]->u.legacy.bankw;
+			surfaces[i]->u.legacy.bankh = surfaces[best_tiling]->u.legacy.bankh;
+			surfaces[i]->u.legacy.mtilea = surfaces[best_tiling]->u.legacy.mtilea;
+			surfaces[i]->u.legacy.tile_split = surfaces[best_tiling]->u.legacy.tile_split;
+
+			for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j)
+				surfaces[i]->u.legacy.level[j].offset += off;
+		} else
+			surfaces[i]->u.gfx9.surf_offset += off;
+
+		off += surfaces[i]->surf_size;
 	}
 
 	for (i = 0, size = 0, alignment = 0; i < VL_NUM_COMPONENTS; ++i) {
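rvid_join_surfaces() above packs the per-component surfaces into one BO by walking them with an aligned running offset; the legacy path rewrites every mip level offset, while GFX9 shifts a single surf_offset. A standalone sketch of that accumulation under simplified, GFX9-like assumptions (fake_surf is illustrative; the real code operates on struct radeon_surf):

    #include <stddef.h>
    #include <stdint.h>

    struct fake_surf {            /* simplified stand-in for struct radeon_surf */
            uint64_t surf_alignment;
            uint64_t surf_size;
            uint64_t surf_offset; /* GFX9-style single offset */
    };

    static uint64_t align_up(uint64_t v, uint64_t a)
    {
            return (v + a - 1) & ~(a - 1);
    }

    static uint64_t pack_surfaces(struct fake_surf *surfs[], size_t n)
    {
            uint64_t off = 0;

            for (size_t i = 0; i < n; ++i) {
                    if (!surfs[i])
                            continue;
                    off = align_up(off, surfs[i]->surf_alignment);
                    surfs[i]->surf_offset += off; /* legacy: offset each level */
                    off += surfs[i]->surf_size;
            }
            return off; /* total size the shared BO must provide */
    }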
@@ -279,7 +289,11 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_MAX_HEIGHT:
 		return (rscreen->family < CHIP_TONGA) ? 1152 : 4096;
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
-		return PIPE_FORMAT_NV12;
+		if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+			return PIPE_FORMAT_P016;
+		else
+			return PIPE_FORMAT_NV12;
+
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
 	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
 		if (rscreen->family < CHIP_PALM) {
@@ -331,6 +345,11 @@ boolean rvid_is_format_supported(struct pipe_screen *screen,
 				 enum pipe_video_profile profile,
 				 enum pipe_video_entrypoint entrypoint)
 {
+	/* HEVC 10 bit decoding should use P016 instead of NV12 if possible */
+	if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+		return (format == PIPE_FORMAT_NV12) ||
+			(format == PIPE_FORMAT_P016);
+
 	/* we can only handle this one with UVD */
 	if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
 		return format == PIPE_FORMAT_NV12;
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
index 39305b4fd..3347c4ebc 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
@@ -66,7 +66,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
 
 /* join surfaces into the same buffer with identical tiling params
    sumup their sizes and replace the backend buffers with a single bo */
-void rvid_join_surfaces(struct radeon_winsys* ws,
+void rvid_join_surfaces(struct r600_common_context *rctx,
 			struct pb_buffer** buffers[VL_NUM_COMPONENTS],
 			struct radeon_surf *surfaces[VL_NUM_COMPONENTS]);
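The two radeon_video.c hunks above cooperate: get_video_param now advertises P016 as the preferred format for HEVC Main 10, and is_format_supported accepts either NV12 or P016 for that profile. A self-contained sketch of the resulting negotiation, with stub enums standing in for the real pipe_format/pipe_video_profile values:

    enum fmt { FMT_NV12, FMT_P016 };
    enum profile { PROFILE_H264_MAIN, PROFILE_HEVC_MAIN_10 };

    /* Mirrors the preference logic in the hunk above. */
    static enum fmt preferred_format(enum profile p)
    {
            return p == PROFILE_HEVC_MAIN_10 ? FMT_P016 : FMT_NV12;
    }

    /* Mirrors the acceptance logic: Main 10 may fall back to NV12. */
    static int format_supported(enum fmt f, enum profile p)
    {
            if (p == PROFILE_HEVC_MAIN_10)
                    return f == FMT_NV12 || f == FMT_P016;
            return f == FMT_NV12;
    }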
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
index 8946209d3..2e287c67e 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
@@ -52,7 +52,8 @@ enum radeon_bo_flag { /* bitfield */
 	RADEON_FLAG_GTT_WC = (1 << 0),
 	RADEON_FLAG_CPU_ACCESS = (1 << 1),
 	RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
-	RADEON_FLAG_HANDLE = (1 << 3), /* the buffer most not be suballocated */
+	RADEON_FLAG_HANDLE = (1 << 3), /* the buffer must not be suballocated */
+	RADEON_FLAG_SPARSE = (1 << 4),
 };
 
 enum radeon_bo_usage { /* bitfield */
@@ -66,6 +67,8 @@ enum radeon_bo_usage { /* bitfield */
 	RADEON_USAGE_SYNCHRONIZED = 8
 };
 
+#define RADEON_SPARSE_PAGE_SIZE (64 * 1024)
+
 enum ring_type {
 	RING_GFX = 0,
 	RING_COMPUTE,
@@ -81,16 +84,20 @@ enum radeon_value_id {
 	RADEON_MAPPED_VRAM,
 	RADEON_MAPPED_GTT,
 	RADEON_BUFFER_WAIT_TIME_NS,
+	RADEON_NUM_MAPPED_BUFFERS,
 	RADEON_TIMESTAMP,
-	RADEON_NUM_CS_FLUSHES,
+	RADEON_NUM_GFX_IBS,
+	RADEON_NUM_SDMA_IBS,
 	RADEON_NUM_BYTES_MOVED,
 	RADEON_NUM_EVICTIONS,
 	RADEON_VRAM_USAGE,
+	RADEON_VRAM_VIS_USAGE,
 	RADEON_GTT_USAGE,
 	RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
 	RADEON_CURRENT_SCLK,
 	RADEON_CURRENT_MCLK,
 	RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
+	RADEON_CS_THREAD_TIME,
 };
 
 /* Each group of four has the same priority. */
@@ -182,6 +189,7 @@ struct radeon_info {
 	uint32_t gart_page_size;
 	uint64_t gart_size;
 	uint64_t vram_size;
+	uint64_t vram_vis_size;
 	uint64_t max_alloc_size;
 	uint32_t min_alloc_size;
 	bool has_dedicated_vram;
@@ -196,6 +204,7 @@ struct radeon_info {
 	uint32_t ce_fw_version;
 	uint32_t vce_harvest_config;
 	uint32_t clock_crystal_freq;
+	uint32_t tcc_cache_line_size;
 
 	/* Kernel info. */
 	uint32_t drm_major; /* version */
@@ -231,16 +240,25 @@ struct radeon_bo_metadata {
 	/* Tiling flags describing the texture layout for display code
 	 * and DRI sharing.
 	 */
-	enum radeon_bo_layout microtile;
-	enum radeon_bo_layout macrotile;
-	unsigned pipe_config;
-	unsigned bankw;
-	unsigned bankh;
-	unsigned tile_split;
-	unsigned mtilea;
-	unsigned num_banks;
-	unsigned stride;
-	bool scanout;
+	union {
+		struct {
+			enum radeon_bo_layout microtile;
+			enum radeon_bo_layout macrotile;
+			unsigned pipe_config;
+			unsigned bankw;
+			unsigned bankh;
+			unsigned tile_split;
+			unsigned mtilea;
+			unsigned num_banks;
+			unsigned stride;
+			bool scanout;
+		} legacy;
+
+		struct {
+			/* surface flags */
+			unsigned swizzle_mode:5;
+		} gfx9;
+	} u;
 
 	/* Additional metadata associated with the buffer, in bytes.
 	 * The maximum size is 64 * 4. This is opaque for the winsys & kernel.
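With radeon_bo_metadata split into a legacy/gfx9 union, an exporter fills exactly one arm. A hedged sketch of such a producer; the surrounding variables (chip_class, swizzle, pitch_bytes) and the chosen enum values are assumptions about the caller, with field names per the hunk above:

    struct radeon_bo_metadata md = {0};

    if (chip_class >= GFX9) {
            md.u.gfx9.swizzle_mode = swizzle;       /* 5-bit swizzle mode */
    } else {
            md.u.legacy.microtile = RADEON_LAYOUT_TILED;
            md.u.legacy.macrotile = RADEON_LAYOUT_TILED;
            md.u.legacy.stride = pitch_bytes;
            md.u.legacy.scanout = true;
    }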
@@ -255,99 +273,151 @@ enum radeon_feature_id {
 	RADEON_FID_R300_CMASK_ACCESS,
 };
 
-#define RADEON_SURF_MAX_LEVEL 32
-
-#define RADEON_SURF_TYPE_MASK 0xFF
-#define RADEON_SURF_TYPE_SHIFT 0
-#define RADEON_SURF_TYPE_1D 0
-#define RADEON_SURF_TYPE_2D 1
-#define RADEON_SURF_TYPE_3D 2
-#define RADEON_SURF_TYPE_CUBEMAP 3
-#define RADEON_SURF_TYPE_1D_ARRAY 4
-#define RADEON_SURF_TYPE_2D_ARRAY 5
-#define RADEON_SURF_MODE_MASK 0xFF
-#define RADEON_SURF_MODE_SHIFT 8
-#define RADEON_SURF_MODE_LINEAR_ALIGNED 1
-#define RADEON_SURF_MODE_1D 2
-#define RADEON_SURF_MODE_2D 3
+#define RADEON_SURF_MAX_LEVELS 15
+
+enum radeon_surf_mode {
+	RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
+	RADEON_SURF_MODE_1D = 2,
+	RADEON_SURF_MODE_2D = 3,
+};
+
+/* These are defined exactly like GB_TILE_MODEn.MICRO_TILE_MODE_NEW. */
+enum radeon_micro_mode {
+	RADEON_MICRO_MODE_DISPLAY = 0,
+	RADEON_MICRO_MODE_THIN = 1,
+	RADEON_MICRO_MODE_DEPTH = 2,
+	RADEON_MICRO_MODE_ROTATED = 3,
+};
+
+/* the first 16 bits are reserved for libdrm_radeon, don't use them */
 #define RADEON_SURF_SCANOUT (1 << 16)
 #define RADEON_SURF_ZBUFFER (1 << 17)
 #define RADEON_SURF_SBUFFER (1 << 18)
 #define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
-#define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19)
-#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20)
+/* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
 #define RADEON_SURF_FMASK (1 << 21)
 #define RADEON_SURF_DISABLE_DCC (1 << 22)
 #define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23)
+#define RADEON_SURF_IMPORTED (1 << 24)
+#define RADEON_SURF_OPTIMIZE_FOR_SPACE (1 << 25)
 
-#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
-#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
-#define RADEON_SURF_CLR(v, field) ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT))
-
-struct radeon_surf_level {
+struct legacy_surf_level {
 	uint64_t offset;
 	uint64_t slice_size;
-	uint32_t npix_x;
-	uint32_t npix_y;
-	uint32_t npix_z;
-	uint32_t nblk_x;
-	uint32_t nblk_y;
-	uint32_t nblk_z;
-	uint32_t pitch_bytes;
-	uint32_t mode;
 	uint64_t dcc_offset;
 	uint64_t dcc_fast_clear_size;
-	bool dcc_enabled;
+	uint16_t nblk_x;
+	uint16_t nblk_y;
+	enum radeon_surf_mode mode;
 };
 
-struct radeon_surf {
-	/* These are inputs to the calculator. */
-	uint32_t npix_x;
-	uint32_t npix_y;
-	uint32_t npix_z;
-	uint32_t blk_w;
-	uint32_t blk_h;
-	uint32_t blk_d;
-	uint32_t array_size;
-	uint32_t last_level;
-	uint32_t bpe;
-	uint32_t nsamples;
-	uint32_t flags;
-
-	/* These are return values. Some of them can be set by the caller, but
-	 * they will be treated as hints (e.g. bankw, bankh) and might be
-	 * changed by the calculator.
-	 */
-	uint64_t bo_size;
-	uint64_t bo_alignment;
-	/* This applies to EG and later. */
-	uint32_t bankw;
-	uint32_t bankh;
-	uint32_t mtilea;
-	uint32_t tile_split;
-	uint32_t stencil_tile_split;
-	struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL];
-	struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL];
-	uint32_t tiling_index[RADEON_SURF_MAX_LEVEL];
-	uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
-	uint32_t pipe_config;
-	uint32_t num_banks;
-	uint32_t macro_tile_index;
-	uint32_t micro_tile_mode; /* displayable, thin, depth, rotated */
+struct legacy_surf_layout {
+	unsigned bankw:4; /* max 8 */
+	unsigned bankh:4; /* max 8 */
+	unsigned mtilea:4; /* max 8 */
+	unsigned tile_split:13; /* max 4K */
+	unsigned stencil_tile_split:13; /* max 4K */
+	unsigned pipe_config:5; /* max 17 */
+	unsigned num_banks:5; /* max 16 */
+	unsigned macro_tile_index:4; /* max 15 */
 
 	/* Whether the depth miptree or stencil miptree as used by the DB are
 	 * adjusted from their TC compatible form to ensure depth/stencil
 	 * compatibility. If either is true, the corresponding plane cannot be
 	 * sampled from.
 	 */
-	bool depth_adjusted;
-	bool stencil_adjusted;
+	unsigned depth_adjusted:1;
+	unsigned stencil_adjusted:1;
+
+	struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS];
+	struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS];
+	uint8_t tiling_index[RADEON_SURF_MAX_LEVELS];
+	uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+};
+
+/* Same as addrlib - AddrResourceType. */
+enum gfx9_resource_type {
+	RADEON_RESOURCE_1D = 0,
+	RADEON_RESOURCE_2D,
+	RADEON_RESOURCE_3D,
+};
+
+struct gfx9_surf_flags {
+	uint16_t swizzle_mode; /* tile mode */
+	uint16_t epitch; /* (pitch - 1) or (height - 1) */
+};
+
+struct gfx9_surf_meta_flags {
+	unsigned rb_aligned:1; /* optimal for RBs */
+	unsigned pipe_aligned:1; /* optimal for TC */
+};
+
+struct gfx9_surf_layout {
+	struct gfx9_surf_flags surf; /* color or depth surface */
+	struct gfx9_surf_flags fmask; /* not added to surf_size */
+	struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */
+
+	struct gfx9_surf_meta_flags dcc; /* metadata of color */
+	struct gfx9_surf_meta_flags htile; /* metadata of depth and stencil */
+	struct gfx9_surf_meta_flags cmask; /* metadata of fmask */
+
+	enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
+	uint64_t surf_offset; /* 0 unless imported with an offset */
+	/* The size of the 2D plane containing all mipmap levels. */
+	uint64_t surf_slice_size;
+	uint16_t surf_pitch; /* in blocks */
+	uint16_t surf_height;
+	/* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
+	uint32_t offset[RADEON_SURF_MAX_LEVELS];
+	uint16_t dcc_pitch_max; /* (mip chain pitch - 1) */
+
+	uint64_t stencil_offset; /* separate stencil */
+	uint64_t fmask_size;
+	uint64_t cmask_size;
+
+	uint32_t fmask_alignment;
+	uint32_t cmask_alignment;
+};
+
+struct radeon_surf {
+	/* Format properties. */
+	unsigned blk_w:4;
+	unsigned blk_h:4;
+	unsigned bpe:5;
+	/* Number of mipmap levels where DCC is enabled starting from level 0.
+	 * Non-zero levels may be disabled due to alignment constraints, but not
+	 * the first level.
+	 */
+	unsigned num_dcc_levels:4;
+	unsigned is_linear:1;
+	/* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
+	unsigned micro_tile_mode:3;
+	uint32_t flags;
+
+	/* These are return values. Some of them can be set by the caller, but
+	 * they will be treated as hints (e.g. bankw, bankh) and might be
+	 * changed by the calculator.
+	 */
+	uint64_t surf_size;
 	uint64_t dcc_size;
-	uint64_t dcc_alignment;
-	/* TC-compatible HTILE only. */
 	uint64_t htile_size;
-	uint64_t htile_alignment;
+
+	uint32_t surf_alignment;
+	uint32_t dcc_alignment;
+	uint32_t htile_alignment;
+
+	union {
+		/* R600-VI return values.
+		 *
+		 * Some of them can be set by the caller if certain parameters are
+		 * desirable. The allocator will try to obey them.
+		 */
+		struct legacy_surf_layout legacy;
+
+		/* GFX9+ return values. */
+		struct gfx9_surf_layout gfx9;
+	} u;
 };
 
 struct radeon_bo_list_item {
@@ -508,6 +578,20 @@ struct radeon_winsys {
 			    struct winsys_handle *whandle);
 
 	/**
+	 * Change the commitment of a (64KB-page aligned) region of the given
+	 * sparse buffer.
+	 *
+	 * \warning There is no automatic synchronization with command submission.
+	 *
+	 * \note Only implemented by the amdgpu winsys.
+	 *
+	 * \return false on out of memory or other failure, true on success.
+	 */
+	bool (*buffer_commit)(struct pb_buffer *buf,
+			      uint64_t offset, uint64_t size,
+			      bool commit);
+
+	/**
 	 * Return the virtual address of a buffer.
 	 *
 	 * When virtual memory is not in use, this is the offset relative to the
@@ -739,18 +823,16 @@ struct radeon_winsys {
 	 * Initialize surface
 	 *
 	 * \param ws The winsys this function is called from.
-	 * \param surf Surface structure ptr
+	 * \param tex Input texture description
+	 * \param flags Bitmask of RADEON_SURF_* flags
+	 * \param bpe Bytes per pixel, it can be different for Z buffers.
+	 * \param mode Preferred tile mode. (linear, 1D, or 2D)
+	 * \param surf Output structure
 	 */
 	int (*surface_init)(struct radeon_winsys *ws,
-			    struct radeon_surf *surf);
-
-	/**
-	 * Find best values for a surface
-	 *
-	 * \param ws The winsys this function is called from.
-	 * \param surf Surface structure ptr
-	 */
-	int (*surface_best)(struct radeon_winsys *ws,
+			    const struct pipe_resource *tex,
+			    unsigned flags, unsigned bpe,
+			    enum radeon_surf_mode mode,
 			    struct radeon_surf *surf);
 
 	uint64_t (*query_value)(struct radeon_winsys *ws,
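The buffer_commit hook introduced above works at RADEON_SPARSE_PAGE_SIZE (64 KiB) granularity and, per its \warning, does not synchronize with command submission. A minimal sketch of committing the pages that back a byte range, assuming a winsys and a buffer created with RADEON_FLAG_SPARSE; the helper itself is illustrative:

    /* Commit backing pages for [offset, offset + size) of a sparse buffer. */
    static bool commit_byte_range(struct radeon_winsys *ws, struct pb_buffer *buf,
                                  uint64_t offset, uint64_t size)
    {
            uint64_t page = RADEON_SPARSE_PAGE_SIZE;
            uint64_t start = offset & ~(page - 1);
            uint64_t end = (offset + size + page - 1) & ~(page - 1);

            /* Caller must fence against in-flight submissions first. */
            return ws->buffer_commit(buf, start, end - start, true);
    }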